Borrador: StyleTTS2 Local Open Source, Síntesis de Voz Natural en Inglés

Preparar entorno para síntesis de voz con StyleTTS2


# Para las pruebas se utilizó una tarjeta NVIDIA GeForce RTX 3050 Ti con 4 GB de VRAM
# Se puede adaptar a otras tarjetas gráficas o ejecutar en CPU (más lento)


# Crear un entorno aislado para mantener seguro y limpio el sistema
mkdir -p ~/ia-voz
cd ~/ia-voz
python3 -m venv venv
source venv/bin/activate


# Redirigir la caché y los archivos temporales a carpetas dentro del proyecto
mkdir -p ~/ia-voz/.cache ~/ia-voz/.tmp
export XDG_CACHE_HOME="$HOME/ia-voz/.cache"
export TMPDIR="$HOME/ia-voz/.tmp"


# Instalar dependencias del sistema
# ffmpeg: Herramienta para grabar, convertir y reproducir audio/video. (GPL)
# libsndfile1: Biblioteca para leer y escribir archivos de audio. (LGPL)
sudo apt install ffmpeg libsndfile1


# Instalar dependencias con pip
# torch: Cálculos con tensores y GPU (BSD-3-Clause)
# torchaudio: Procesamiento de audio con PyTorch (BSD-3-Clause)
# styletts2: Motor TTS con difusión y estilo (MIT)
# nltk.downloader punkt: Tokenizador de frases (Apache 2.0)
pip install torch==2.5.1+cu118 torchaudio==2.5.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install styletts2
# Recent NLTK releases load the sentence tokenizer from "punkt_tab" instead of
# "punkt"; download both so sent_tokenize works with either NLTK version
python -m nltk.downloader punkt punkt_tab


# Crear el archivo con el editor de texto (puede usar nvim, nano o cualquier otro)
nvim voz.py


#!/usr/bin/env python3
import os
import sys
import torch
import warnings
import subprocess
from contextlib import contextmanager
from nltk.tokenize import sent_tokenize
import nltk

#Silence warnings: install a filter that ignores every warning
warnings.filterwarnings("ignore")

#Silence stdout and stderr during model loading/inference
@contextmanager
def suppress_output():
	"""Temporarily point ``sys.stdout`` and ``sys.stderr`` at the null device.

	Both streams are restored to the objects they referenced on entry,
	even when the wrapped code raises.
	"""
	saved_streams = (sys.stdout, sys.stderr)
	with open(os.devnull, 'w') as sink:
		sys.stdout = sys.stderr = sink
		try:
			yield
		finally:
			sys.stdout, sys.stderr = saved_streams

#Silence nltk
def suppress_nltk_download():
	"""Replace ``nltk.download`` with a no-op that accepts any arguments and returns None."""
	def _skip_download(*args, **kwargs):
		return None

	nltk.download = _skip_download

suppress_nltk_download()

#Report which device is in use
if torch.cuda.is_available():
	device = "GPU (CUDA)"
else:
	device = "CPU"
print(f"🧠 Usando dispositivo: {device}")

#Load the model with its console output suppressed
from styletts2 import tts
with suppress_output():
	engine = tts.StyleTTS2()

#Every synthesized sentence is written to, and played back from, this file
output_file = "voz_salida.wav"
print("🗣️ Escribe texto en inglés y presiona [Enter] (Ctrl+C para salir)")

#Interactive loop: read a line, split it into sentences, then synthesize and
#play each sentence in turn until Ctrl+C or end-of-input
while True:
	try:
		text = input(">>> ").strip()
		if text == "":
			continue

		#Append a final period when the text has no closing punctuation (improves the audio)
		if not text.endswith((".", "!", "?")):
			text = text + "."

		with suppress_output():
			sentences = sent_tokenize(text)
		total = len(sentences)

		for position, sentence in enumerate(sentences, start=1):
			if sentence.strip() == "":
				continue

			print(f"🎤 Sintetizando frase {position}/{total}...")

			#Synthesis settings passed to the engine for every sentence
			synthesis_options = dict(
				output_wav_file=output_file,
				output_sample_rate=24000,
				alpha=0.3,
				beta=0.7,
				diffusion_steps=5,
				embedding_scale=1.0,
			)
			with suppress_output():
				engine.inference(sentence, **synthesis_options)

			#Play the file with ffplay (no window, exit when done), discarding its output
			play_command = ["ffplay", "-nodisp", "-autoexit", output_file]
			subprocess.run(
				play_command,
				stdout=subprocess.DEVNULL,
				stderr=subprocess.DEVNULL,
			)

	except (KeyboardInterrupt, EOFError):
		print("\n👋 Salida del sistema de voz.")
		break


# Ejecutar el script
python voz.py


# Desactivar el entorno
deactivate

Script para activar el entorno y ejecutar el sistema de voz


# Crear el script de activación automática
nvim iavoz.sh


#!/bin/bash
# Activate the project's virtual environment and launch the voice script.
# Abort if the project folder or the environment is missing, instead of
# running voz.py from the wrong directory or with the system Python.
cd ~/ia-voz || exit 1
source venv/bin/activate || exit 1
# Keep caches and temporary files inside the project folder
mkdir -p "$PWD/.cache" "$PWD/.tmp"
export XDG_CACHE_HOME="$PWD/.cache"
export TMPDIR="$PWD/.tmp"
python voz.py


# Dar permisos de ejecución y ejecutarlo
chmod +x iavoz.sh
./iavoz.sh

Utilizar una interfaz web interactiva


# Ingresar al entorno virtual antes de ejecutar el servidor
cd ~/ia-voz
source venv/bin/activate
export XDG_CACHE_HOME="$PWD/.cache"
export TMPDIR="$PWD/.tmp"


# Instalar dependencias:
# Flask: Microframework web en Python. (BSD-3-Clause)
# Flask-Cors: Permite CORS en apps Flask. (MIT)
# Gunicorn: Servidor WSGI para producción. (MIT)
# Scipy: Librería para cálculos científicos. (BSD-3-Clause)
pip install flask flask-cors gunicorn scipy


# Crear carpetas necesarias para audio generado y HTML
mkdir -p ~/ia-voz/static ~/ia-voz/templates


# Crear el nuevo script para la ejecución web
nvim voz0.py


#!/usr/bin/env python3
import os
import sys
import warnings
import torch
import nltk
from flask import Flask, request, render_template, jsonify, send_from_directory
from nltk.tokenize import sent_tokenize
from styletts2 import tts

#Silence nltk: replace nltk.download with a no-op that returns None
nltk.download = lambda *args, **kwargs: None
#Ignore every warning
warnings.filterwarnings("ignore")

#Configure Flask and absolute paths
app = Flask(__name__)
#Directory that contains this script
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
#Folder where the generated audio files are written; created if it does not exist
STATIC_DIR = os.path.join(BASE_DIR, "static")
os.makedirs(STATIC_DIR, exist_ok=True)

#Load the StyleTTS2 model; exit with status 1 if it cannot be created
print("🧠 Cargando modelo StyleTTS2...")
try:
    engine = tts.StyleTTS2()
except Exception as load_error:
    print(f"❌ Error cargando el modelo: {load_error}")
    sys.exit(1)
else:
    print("✅ Modelo cargado correctamente.")

#Main page
@app.route('/')
def index():
    """Render the ``index.html`` template."""
    page = render_template('index.html')
    return page

#Endpoint that synthesizes each sentence and returns JSON
@app.route('/sintetizar', methods=['POST'])
def sintetizar():
    """Synthesize the posted text sentence by sentence.

    Reads the ``texto`` form field, appends a final period when the text has
    no closing punctuation, deletes the ``temp_*.wav`` files found in
    ``STATIC_DIR`` and writes one ``temp_<i>.wav`` per sentence.

    Returns:
        A JSON response. On success it carries ``success`` (True), ``frases``
        (the sentences) and ``audios`` (their ``/static/...`` URLs); both
        lists have the same length and order. On failure it carries
        ``success`` (False) and ``error``, with status 400 when there is no
        text to synthesize and 500 when tokenizing or synthesis fails.
    """
    #A missing field is treated like an empty one so the reply is always JSON
    texto = request.form.get('texto', '').strip()
    print(f"\n📝 Texto recibido: {texto}")

    if not texto:
        return jsonify({"success": False, "error": "No se recibió texto."}), 400

    #Append a final period
    if texto[-1] not in ".!?":
        texto += "."

    #Delete the temporary files left in the static folder
    for name in os.listdir(STATIC_DIR):
        if name.startswith("temp_") and name.endswith(".wav"):
            try:
                os.remove(os.path.join(STATIC_DIR, name))
            except OSError as e:
                print(f"⚠️ No se pudo borrar {name}: {e}")
    print("🧹 Archivos anteriores eliminados.")

    file_list = []

    try:
        #Split the text. Blank fragments are dropped up front so that
        #"frases" and "audios" always line up one to one.
        sentences = [s.strip() for s in sent_tokenize(texto) if s.strip()]
        if not sentences:
            return jsonify({"success": False, "error": "No se recibió texto."}), 400

        for i, sentence in enumerate(sentences):
            filename = f"temp_{i}.wav"
            abs_path = os.path.join(STATIC_DIR, filename)
            print(f"🎤 Sintetizando frase {i+1}/{len(sentences)}: {sentence}")
            engine.inference(
                sentence,
                output_wav_file=abs_path,
                output_sample_rate=24000,
                alpha=0.3,
                beta=0.7,
                diffusion_steps=5,
                embedding_scale=1.0
            )
            file_list.append(f"/static/{filename}")

        print("✅ Archivos generados:", file_list)
        return jsonify({
            "success": True,
            "frases": sentences,
            "audios": file_list
        })

    except Exception as e:
        #Top-level boundary of the request: report any failure as JSON
        print(f"💥 Error durante la inferencia: {e}")
        return jsonify({"success": False, "error": str(e)}), 500

#Serve audio files
@app.route('/audio/<filename>')
def audio(filename):
    """Send the file called *filename* from ``STATIC_DIR``."""
    response = send_from_directory(STATIC_DIR, filename)
    return response

#Start the server
if __name__ == '__main__':
    print("🧠 Usando dispositivo:", "GPU (CUDA)" if torch.cuda.is_available() else "CPU")
    print("✅ Abre el navegador en: http://127.0.0.1:5000")
    #Debug mode is off on purpose: its auto-reloader re-runs this script in a
    #child process, which would load the model a second time, and it also
    #enables the interactive debugger
    app.run(host="127.0.0.1", port=5000, debug=False)

Crear el archivo HTML interactivo


nvim templates/index.html


<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Texto a Voz por Frases</title>
  <style>
    /* Alias for a locally installed Open Sans. The src list may only contain
       font sources, so the generic sans-serif fallback is given in the
       font-family declaration that uses this alias. */
    @font-face {
      font-family: 'OpenSans';
      src: local('Open Sans'), local('OpenSans');
    }

    html, body {
      margin: 0;
      padding: 0;
      background: #121212;
      color: #f1f1f1;
      font-family: 'OpenSans', sans-serif;
    }

    body {
      display: flex;
      flex-direction: column;
      align-items: center;
      padding: 2em;
    }

    h1 {
      color: #00e676;
      margin-bottom: 1em;
    }

    form {
      width: 100%;
      max-width: 1200px;
    }

    textarea {
      width: 100%;
      height: 300px;
      padding: 1em;
      font-size: 1.2em;
      border-radius: 10px;
      border: none;
      resize: both;
      background-color: #1e1e1e;
      color: #fff;
      box-shadow: 0 0 10px rgba(0, 255, 128, 0.2);
    }

    .controls {
      display: flex;
      flex-wrap: wrap;
      gap: 1em;
      margin-top: 1.5em;
      margin-bottom: 2.5em;
    }

    button {
      padding: 0.8em 1.6em;
      font-size: 1em;
      background-color: #00e676;
      border: none;
      border-radius: 5px;
      cursor: pointer;
      color: #121212;
      font-weight: bold;
    }

    button:hover {
      background-color: #00c853;
    }

    .velocidad {
      margin-top: 2em;
      display: flex;
      flex-wrap: wrap;
      justify-content: center;
      gap: 0.5em;
    }

    .velocidad button {
      padding: 0.3em 0.8em;
      font-size: 0.8em;
      background-color: #333;
      color: #00e676;
      border: 1px solid #00e676;
    }

    .velocidad button:hover {
      background-color: #00e676;
      color: #121212;
    }

    audio {
      margin-top: 2em;
      width: 100%;
      max-width: 600px;
    }

    #texto-destacado {
      margin-top: 2em;
      font-size: 1.3em;
      text-align: left;
      line-height: 2.0;
      max-width: 1200px;
      padding: 1em;
      word-wrap: break-word;
      white-space: pre-wrap;
    }

    #texto-destacado span {
      padding: 0.2em 0.4em;
      transition: background-color 0.3s;
    }

    .highlight {
      background-color: #33339988;
      border-radius: 5px;
    }

    .frase-audio {
      display: flex;
      align-items: center;
      gap: 1em;
      margin: 1em 0;
      background-color: #1e1e1e;
      padding: 1em;
      border-radius: 6px;
      box-shadow: 0 0 5px rgba(0, 255, 128, 0.1);
      max-width: 1200px;
    }

    .frase-texto {
      flex: 1;
      font-size: 1.1em;
    }
  </style>
</head>
<body>
  <h1>🗣️ Texto a Voz</h1>

  <form id="formulario">
    <textarea id="texto" placeholder="Escribe aquí tu texto en inglés..."></textarea>
    <div class="controls">
      <button type="submit">▶ Reproducir Texto</button>
      <button type="button" onclick="document.getElementById('texto').value = ''">🗑️ Borrar texto</button>
    </div>
  </form>

  <div class="velocidad">
    <button onclick="setSpeed(0.5)">0.5×</button>
    <button onclick="setSpeed(0.6)">0.6×</button>
    <button onclick="setSpeed(0.75)">0.75×</button>
    <button onclick="setSpeed(0.85)">0.85×</button>
    <button onclick="setSpeed(0.95)">0.95×</button>
    <button onclick="setSpeed(1.0)">Normal</button>
    <button onclick="setSpeed(1.1)">1.1×</button>
    <button onclick="setSpeed(1.25)">1.25×</button>
    <button onclick="setSpeed(1.5)">1.5×</button>
    <button onclick="setSpeed(1.75)">1.75×</button>
    <button onclick="setSpeed(2.0)">2×</button>
  </div>

  <audio id="audio" controls style="display:none;"></audio>
  <div id="texto-destacado"></div>
  <div id="lista-audios"></div>

  <script>
    const form = document.getElementById('formulario');
    const audio = document.getElementById('audio');
    const textoDestacado = document.getElementById('texto-destacado');
    const listaAudios = document.getElementById('lista-audios');
    let currentPlaybackRate = 1.0;

    // Apply the chosen rate to the player and remember it for the next clips
    function setSpeed(rate) {
      audio.playbackRate = rate;
      currentPlaybackRate = rate;
    }

    // Send the text to /sintetizar, render one highlightable span and one
    // play button per sentence, then play the clips in order.
    form.addEventListener('submit', async (e) => {
      e.preventDefault();
      const texto = document.getElementById('texto').value.trim();
      if (!texto) return alert("Por favor, escribe algo.");

      // Reset the UI and detach the handler left by a previous request
      audio.style.display = 'none';
      audio.pause();
      audio.onended = null;
      textoDestacado.textContent = '⏳ Procesando...';
      listaAudios.innerHTML = '';

      // A network failure or a non-JSON reply must not leave the
      // "processing" message on screen forever
      let data;
      try {
        const response = await fetch('/sintetizar', {
          method: 'POST',
          headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
          body: `texto=${encodeURIComponent(texto)}`
        });
        data = await response.json();
      } catch (err) {
        console.error(err);
        textoDestacado.textContent = '';
        return alert("❌ No se pudieron sincronizar las frases.");
      }

      const valido = data && data.success &&
        Array.isArray(data.frases) && Array.isArray(data.audios) &&
        data.frases.length > 0 &&
        data.frases.length === data.audios.length;
      if (!valido) {
        textoDestacado.textContent = '';
        return alert("❌ No se pudieron sincronizar las frases.");
      }

      // Build the spans with textContent so the sentences are shown as
      // plain text and never parsed as HTML
      textoDestacado.textContent = '';
      const spans = data.frases.map((frase, i) => {
        if (i > 0) textoDestacado.appendChild(document.createTextNode(' '));
        const span = document.createElement('span');
        span.textContent = frase.trim();
        textoDestacado.appendChild(span);
        return span;
      });

      // Index of the clip that plays when the current one ends
      let index = 0;

      // Highlight sentence i, play its clip and queue the following one.
      // The rate is set after src because loading a new source resets it.
      const playAt = (i) => {
        spans.forEach(s => s.classList.remove('highlight'));
        spans[i].classList.add('highlight');
        audio.src = data.audios[i];
        audio.playbackRate = currentPlaybackRate;
        audio.play();
        index = i + 1;
      };

      data.frases.forEach((frase, i) => {
        const contenedor = document.createElement('div');
        contenedor.classList.add('frase-audio');

        const textoEl = document.createElement('div');
        textoEl.classList.add('frase-texto');
        textoEl.textContent = frase;

        const boton = document.createElement('button');
        boton.textContent = '▶ Reproducir';
        // Going through playAt keeps the sequence in step with the clicked
        // sentence instead of resuming from a stale position
        boton.onclick = () => playAt(i);

        contenedor.appendChild(textoEl);
        contenedor.appendChild(boton);
        listaAudios.appendChild(contenedor);
      });

      audio.onended = () => {
        if (index < data.audios.length) playAt(index);
      };
      audio.style.display = 'block';
      playAt(0);
    });
  </script>
</body>
</html>


# Ejecutar el script Flask
python voz0.py

# Abrir el navegador en la siguiente dirección:
http://127.0.0.1:5000

Borrar todo el proyecto


# Desactivar el entorno
deactivate

# ⚠️ Este comando eliminará todo el proyecto y sus archivos
rm -rf ~/ia-voz