Borrador: StyleTTS2 Local Open Source, Síntesis de Voz Natural en Inglés

Preparar entorno para síntesis de voz con StyleTTS2
# Create the project directory and an isolated virtual environment inside it
mkdir -p ~/ia-voz
cd ~/ia-voz
python3 -m venv venv
source venv/bin/activate
# Point the cache and temp directories at folders inside the project
mkdir -p ~/ia-voz/.cache ~/ia-voz/.tmp
export XDG_CACHE_HOME="$HOME/ia-voz/.cache"
export TMPDIR="$HOME/ia-voz/.tmp"
# System packages; voz.py calls `ffplay` for playback (presumably provided by the ffmpeg package — confirm on your distro)
sudo apt install ffmpeg libsndfile1
# PyTorch and torchaudio 2.5.1, CUDA 11.8 wheels from the PyTorch index
pip install torch==2.5.1+cu118 torchaudio==2.5.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install styletts2
# Tokenizer data for nltk's sent_tokenize, which voz.py uses to split text into sentences
# NOTE(review): recent NLTK releases may look for `punkt_tab` instead of `punkt` — confirm against the installed version
python -m nltk.downloader punkt
# Create voz.py with the script that follows
nvim voz.py
#!/usr/bin/env python3
import os
import sys
import torch
import warnings
import subprocess
from contextlib import contextmanager
from nltk.tokenize import sent_tokenize
import nltk
# Silence every Python warning for the rest of the process
warnings.filterwarnings("ignore")
# Silence stdout and stderr during model loading / inference
@contextmanager
def suppress_output():
    """Temporarily point ``sys.stdout`` and ``sys.stderr`` at ``os.devnull``.

    Only the Python-level stream objects are swapped, so anything that writes
    through ``sys.stdout`` / ``sys.stderr`` (for example ``print``) is
    discarded while the ``with`` body runs. The previous stream objects are
    restored on exit, including when the body raises.
    """
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            # Always hand the real streams back, even on error.
            sys.stdout = old_stdout
            sys.stderr = old_stderr
# Silence nltk
def suppress_nltk_download():
    """Replace ``nltk.download`` with a no-op that accepts any arguments.

    After this call, any code that invokes ``nltk.download(...)`` gets
    ``None`` back and nothing is downloaded or printed.
    """
    nltk.download = lambda *args, **kwargs: None


suppress_nltk_download()
# Show which device is available (informational only; the value is not
# passed to the engine below)
device = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
print(f"🧠 Usando dispositivo: {device}")
# Load the model quietly. The import sits here, after nltk.download has been
# replaced above — presumably so the library cannot trigger a download on import.
from styletts2 import tts
with suppress_output():
    engine = tts.StyleTTS2()
# Every synthesized sentence is written to this same file, then played.
output_file = "voz_salida.wav"
# Interactive loop: read a line, split it into sentences, then synthesize and
# play each sentence in turn until the user presses Ctrl+C or sends EOF.
print("🗣️ Escribe texto en inglés y presiona [Enter] (Ctrl+C para salir)")
while True:
    try:
        text = input(">>> ").strip()
        if not text:
            continue
        # Add a final period so the last sentence is properly terminated
        if text[-1] not in ".!?":
            text += "."
        with suppress_output():
            sentences = sent_tokenize(text)
        for i, sentence in enumerate(sentences, 1):
            if not sentence.strip():
                continue
            print(f"🎤 Sintetizando frase {i}/{len(sentences)}...")
            with suppress_output():
                engine.inference(
                    sentence,
                    output_wav_file=output_file,
                    output_sample_rate=24000,
                    alpha=0.3,
                    beta=0.7,
                    diffusion_steps=5,
                    embedding_scale=1.0
                )
            # Play inside the loop: output_file is overwritten by the next
            # sentence, so each one must be played before moving on.
            subprocess.run(
                ["ffplay", "-nodisp", "-autoexit", output_file],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )
    except (KeyboardInterrupt, EOFError):
        print("\n👋 Salida del sistema de voz.")
        break
python voz.py
deactivate
Script para activar el entorno y ejecutar el sistema de voz
nvim iavoz.sh
#!/bin/bash
# Launcher: enter the project, activate its virtual environment, keep cache
# and temp files inside the project directory, then start voz.py.

# Stop if the project directory is missing; otherwise the $PWD-based paths
# below and `python voz.py` would run against whatever directory we are in.
cd ~/ia-voz || exit 1
source venv/bin/activate
export XDG_CACHE_HOME="$PWD/.cache"
export TMPDIR="$PWD/.tmp"
python voz.py
chmod +x iavoz.sh
./iavoz.sh
Utilizar una interfaz web interactiva
# Re-enter the project and its virtual environment
cd ~/ia-voz
source venv/bin/activate
# Keep cache and temp files inside the project directory
export XDG_CACHE_HOME="$PWD/.cache"
export TMPDIR="$PWD/.tmp"
# Web dependencies
# NOTE(review): voz0.py as shown imports only flask from this list; flask-cors, gunicorn and scipy are not imported there — confirm they are needed
pip install flask flask-cors gunicorn scipy
# static/ receives the generated WAV files; templates/ holds index.html
mkdir -p ~/ia-voz/static ~/ia-voz/templates
# Create voz0.py with the script that follows
nvim voz0.py
#!/usr/bin/env python3
import os
import sys
import warnings
import torch
import nltk
from flask import Flask, request, render_template, jsonify, send_from_directory
from nltk.tokenize import sent_tokenize
from styletts2 import tts
# Quiet mode: ignore Python warnings and turn nltk.download into a no-op
warnings.filterwarnings("ignore")
nltk.download = lambda *_args, **_kwargs: None

# Absolute paths derived from this file's location, then the Flask app itself
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
STATIC_DIR = os.path.join(BASE_DIR, "static")
os.makedirs(STATIC_DIR, exist_ok=True)
app = Flask(__name__)
# Load the StyleTTS2 model once at start-up; exit with status 1 if that fails,
# since no request can be served without it.
print("🧠 Cargando modelo StyleTTS2...")
try:
    engine = tts.StyleTTS2()
    print("✅ Modelo cargado correctamente.")
except Exception as e:
    print(f"❌ Error cargando el modelo: {e}")
    sys.exit(1)
# Main page
@app.route('/')
def index():
    """Render ``templates/index.html``."""
    return render_template('index.html')
# Endpoint that synthesizes the text sentence by sentence and returns JSON
@app.route('/sintetizar', methods=['POST'])
def sintetizar():
    """Synthesize the posted ``texto`` form field, one WAV file per sentence.

    The text is split with ``sent_tokenize``; sentence ``i`` is written to
    ``STATIC_DIR/temp_<i>.wav``. ``temp_*.wav`` files left by a previous
    request are deleted first.

    Returns:
        On success, JSON ``{"success": True, "frases": [...], "audios": [...]}``
        where both lists have the same length and order.
        ``{"success": False, "error": ...}`` with status 400 when no text was
        sent, or status 500 when inference raises.
    """
    # .get() keeps a missing field on the JSON error path instead of letting
    # Flask answer with its own non-JSON 400 page.
    texto = request.form.get('texto', '').strip()
    print(f"\n📝 Texto recibido: {texto}")
    if not texto:
        return jsonify({"success": False, "error": "No se recibió texto."}), 400

    # Add a final period so the last sentence is properly terminated
    if texto[-1] not in ".!?":
        texto += "."

    # Remove the temporary files of the previous request
    for name in os.listdir(STATIC_DIR):
        if name.startswith("temp_") and name.endswith(".wav"):
            try:
                os.remove(os.path.join(STATIC_DIR, name))
            except OSError as e:
                print(f"⚠️ No se pudo borrar {name}: {e}")
    print("🧹 Archivos anteriores eliminados.")

    # Split the text. Blank sentences are dropped up front so "frases" and
    # "audios" always line up index by index for the client.
    sentences = [s for s in sent_tokenize(texto) if s.strip()]
    file_list = []
    try:
        for i, sentence in enumerate(sentences):
            filename = f"temp_{i}.wav"
            abs_path = os.path.join(STATIC_DIR, filename)
            print(f"🎤 Sintetizando frase {i+1}/{len(sentences)}: {sentence}")
            engine.inference(
                sentence,
                output_wav_file=abs_path,
                output_sample_rate=24000,
                alpha=0.3,
                beta=0.7,
                diffusion_steps=5,
                embedding_scale=1.0
            )
            file_list.append(f"/static/{filename}")
        print("✅ Archivos generados:", file_list)
        return jsonify({
            "success": True,
            "frases": sentences,
            "audios": file_list
        })
    except Exception as e:
        # Endpoint boundary: report any inference failure to the client as JSON.
        print(f"💥 Error durante la inferencia: {e}")
        return jsonify({"success": False, "error": str(e)}), 500
# Serve audio files
@app.route('/audio/<filename>')
def audio(filename):
    """Return the file called ``filename`` from ``STATIC_DIR``."""
    return send_from_directory(STATIC_DIR, filename)
# Start the server
if __name__ == '__main__':
    print("🧠 Usando dispositivo:", "GPU (CUDA)" if torch.cuda.is_available() else "CPU")
    print("✅ Abre el navegador en: http://127.0.0.1:5000")
    # debug=True also turns on the auto-reloader by default, which re-executes
    # this script in a child process and so loads the model a second time.
    # Keep the debugger but switch the reloader off.
    app.run(debug=True, use_reloader=False)
Crear el archivo HTML interactivo
nvim templates/index.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Texto a Voz por Frases</title>
<style>
/* Alias a locally installed Open Sans as 'OpenSans'. A generic family keyword
   such as sans-serif is not a valid src entry; the fallback comes from the
   font-family declarations that use this alias. */
@font-face {
  font-family: 'OpenSans';
  src: local('Open Sans'), local('OpenSans');
}
/* Page base: dark background, light text, no default margins */
html, body {
margin: 0;
padding: 0;
background: #121212;
color: #f1f1f1;
font-family: 'OpenSans', sans-serif;
}
/* Stack the page sections vertically and centre them */
body {
display: flex;
flex-direction: column;
align-items: center;
padding: 2em;
}
/* Page title in the green accent colour */
h1 {
color: #00e676;
margin-bottom: 1em;
}
/* Input form: full width, capped at 1200px */
form {
width: 100%;
max-width: 1200px;
}
/* Text input area; resizable in both directions */
textarea {
width: 100%;
height: 300px;
padding: 1em;
font-size: 1.2em;
border-radius: 10px;
border: none;
resize: both;
background-color: #1e1e1e;
color: #fff;
box-shadow: 0 0 10px rgba(0, 255, 128, 0.2);
}
/* Row holding the submit and clear buttons */
.controls {
display: flex;
flex-wrap: wrap;
gap: 1em;
margin-top: 1.5em;
margin-bottom: 2.5em;
}
/* Default button look: solid green with dark text */
button {
padding: 0.8em 1.6em;
font-size: 1em;
background-color: #00e676;
border: none;
border-radius: 5px;
cursor: pointer;
color: #121212;
font-weight: bold;
}
button:hover {
background-color: #00c853;
}
/* Row of playback-speed buttons */
.velocidad {
margin-top: 2em;
display: flex;
flex-wrap: wrap;
justify-content: center;
gap: 0.5em;
}
/* Speed buttons: smaller, outlined variant of the default button */
.velocidad button {
padding: 0.3em 0.8em;
font-size: 0.8em;
background-color: #333;
color: #00e676;
border: 1px solid #00e676;
}
.velocidad button:hover {
background-color: #00e676;
color: #121212;
}
/* The shared audio player */
audio {
margin-top: 2em;
width: 100%;
max-width: 600px;
}
/* Area that shows the submitted text as one span per sentence */
#texto-destacado {
margin-top: 2em;
font-size: 1.3em;
text-align: left;
line-height: 2.0;
max-width: 1200px;
padding: 1em;
word-wrap: break-word;
white-space: pre-wrap;
}
/* Sentence spans; the transition animates the highlight background */
#texto-destacado span {
padding: 0.2em 0.4em;
transition: background-color 0.3s;
}
/* Applied by the page script to the sentence currently selected for playback */
.highlight {
background-color: #33339988;
border-radius: 5px;
}
/* One row per sentence in the list: sentence text plus its play button */
.frase-audio {
display: flex;
align-items: center;
gap: 1em;
margin: 1em 0;
background-color: #1e1e1e;
padding: 1em;
border-radius: 6px;
box-shadow: 0 0 5px rgba(0, 255, 128, 0.1);
max-width: 1200px;
}
/* Sentence text takes the remaining width of its row */
.frase-texto {
flex: 1;
font-size: 1.1em;
}
</style>
</head>
<body>
<h1>🗣️ Texto a Voz</h1>
<!-- Input form; its submit event is handled by the page script -->
<form id="formulario">
<textarea id="texto" placeholder="Escribe aquí tu texto en inglés..."></textarea>
<div class="controls">
<button type="submit">▶ Reproducir Texto</button>
<!-- type="button" so clearing the text does not submit the form -->
<button type="button" onclick="document.getElementById('texto').value = ''">🗑️ Borrar texto</button>
</div>
</form>
<!-- Playback-speed presets; each calls setSpeed() from the page script -->
<div class="velocidad">
<button onclick="setSpeed(0.5)">0.5×</button>
<button onclick="setSpeed(0.6)">0.6×</button>
<button onclick="setSpeed(0.75)">0.75×</button>
<button onclick="setSpeed(0.85)">0.85×</button>
<button onclick="setSpeed(0.95)">0.95×</button>
<button onclick="setSpeed(1.0)">Normal</button>
<button onclick="setSpeed(1.1)">1.1×</button>
<button onclick="setSpeed(1.25)">1.25×</button>
<button onclick="setSpeed(1.5)">1.5×</button>
<button onclick="setSpeed(1.75)">1.75×</button>
<button onclick="setSpeed(2.0)">2×</button>
</div>
<!-- Single shared player; hidden until the script has audio to play -->
<audio id="audio" controls style="display:none;"></audio>
<!-- Filled by the script: sentence spans, then one row per sentence -->
<div id="texto-destacado"></div>
<div id="lista-audios"></div>
<script>
const form = document.getElementById('formulario');
const audio = document.getElementById('audio');
const textoDestacado = document.getElementById('texto-destacado');
const listaAudios = document.getElementById('lista-audios');
let currentPlaybackRate = 1.0;

// Remember the chosen speed and apply it to the shared player right away.
function setSpeed(rate) {
  currentPlaybackRate = rate;
  audio.playbackRate = rate;
}

// Submit handler: send the text to /sintetizar, render one span and one play
// button per sentence, then play the returned audio files in order.
form.addEventListener('submit', async (e) => {
  e.preventDefault();
  const texto = document.getElementById('texto').value.trim();
  if (!texto) return alert("Por favor, escribe algo.");
  audio.style.display = 'none';
  audio.pause();
  textoDestacado.textContent = '⏳ Procesando...';
  listaAudios.innerHTML = '';

  // A network failure or a non-JSON reply must not leave the page stuck on
  // the "processing" message.
  let data;
  try {
    const response = await fetch('/sintetizar', {
      method: 'POST',
      headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
      body: `texto=${encodeURIComponent(texto)}`
    });
    data = await response.json();
  } catch (err) {
    textoDestacado.textContent = '';
    return alert("❌ No se pudo contactar con el servidor.");
  }

  const frases = data && Array.isArray(data.frases) ? data.frases : [];
  const audios = data && Array.isArray(data.audios) ? data.audios : [];
  // An empty result is rejected too: the code below needs at least one sentence.
  if (!data || !data.success || frases.length === 0 || frases.length !== audios.length) {
    textoDestacado.textContent = '';
    return alert("❌ No se pudieron sincronizar las frases.");
  }

  // Build the spans with textContent so characters such as "<" or "&" in the
  // user's text are shown literally instead of being parsed as HTML.
  textoDestacado.textContent = '';
  const spans = frases.map((frase, i) => {
    if (i > 0) textoDestacado.appendChild(document.createTextNode(' '));
    const span = document.createElement('span');
    span.textContent = frase.trim();
    textoDestacado.appendChild(span);
    return span;
  });

  // Highlight sentence i and start playing its audio at the current speed.
  const playSentence = (i) => {
    spans.forEach(s => s.classList.remove('highlight'));
    spans[i].classList.add('highlight');
    audio.src = audios[i];
    audio.playbackRate = currentPlaybackRate;
    audio.play();
  };

  // One row per sentence with its own play button.
  frases.forEach((frase, i) => {
    const contenedor = document.createElement('div');
    contenedor.classList.add('frase-audio');
    const textoEl = document.createElement('div');
    textoEl.classList.add('frase-texto');
    textoEl.textContent = frase;
    const boton = document.createElement('button');
    boton.textContent = '▶ Reproducir';
    boton.onclick = () => playSentence(i);
    contenedor.appendChild(textoEl);
    contenedor.appendChild(boton);
    listaAudios.appendChild(contenedor);
  });

  // Sequential playback: each time a clip ends, play the next one in the list.
  let index = 0;
  const playNext = () => {
    if (index >= audios.length) return;
    playSentence(index);
    index++;
  };
  audio.onended = playNext;
  audio.style.display = 'block';
  playNext();
});
</script>
</body>
</html>
python voz0.py
http://127.0.0.1:5000
Borrar todo el proyecto
deactivate
rm -rf ~/ia-voz