Luganda_TTS / app.py
damla921's picture
Update app.py
61aea24 verified
import time, os, shutil, subprocess, tempfile
import numpy as np
import gradio as gr
import soundfile as sf
import torch
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.utils.fetching import LocalStrategy
# Sample rate handed to soundfile when the generated waveform is written to
# disk (see _save_wav_np); the UI labels the result as 22.05 kHz.
SAMPLE_RATE = 22050
# ---- Load models once (on Space startup) ----
# Both models are created at import time, so every request handled by
# tts_luganda reuses the same instances instead of reloading them.
#
# Text -> mel spectrogram model (Luganda Tacotron2 checkpoint).
# NOTE(review): LocalStrategy.COPY presumably copies the fetched files into
# `savedir` rather than linking them -- confirm against the speechbrain docs.
taco = Tacotron2.from_hparams(
source="Sunbird/tts-tacotron2-lug",
savedir="pretrained/tts-tacotron2-lug",
local_strategy=LocalStrategy.COPY,
)
# Mel spectrogram -> waveform vocoder (HiFi-GAN checkpoint named "ljspeech").
# NOTE(review): assumes this vocoder's output rate matches SAMPLE_RATE -- verify.
vocoder = HIFIGAN.from_hparams(
source="speechbrain/tts-hifigan-ljspeech",
savedir="pretrained/tts-hifigan-ljspeech",
local_strategy=LocalStrategy.COPY,
)
def _ensure_mel_shape(mel, n_mels=80):
    """Return ``mel`` laid out as [B, n_mels, T].

    Args:
        mel: A tensor, or a tuple/list whose first element is the tensor
            (``tts_luganda`` passes the result of ``taco.encode_text``
            straight in).
        n_mels: Size of the mel-bin axis used to recognise the layout.
            Defaults to 80, the value that used to be hard-coded here.

    Returns:
        The tensor with axes 1 and 2 swapped when it is 3-D, axis 2 has
        ``n_mels`` entries and axis 1 does not; otherwise the tensor as given.
    """
    if isinstance(mel, (tuple, list)):
        mel = mel[0]

    # Only transpose when the layout is unambiguous: if both axes equal
    # n_mels we cannot tell them apart, so the tensor is left untouched.
    is_time_major = (
        mel.dim() == 3 and mel.shape[1] != n_mels and mel.shape[2] == n_mels
    )
    if is_time_major:
        mel = mel.transpose(1, 2)
    return mel
def _have_ffmpeg():
    """Report whether an ``ffmpeg`` executable can be located via ``shutil.which``."""
    ffmpeg_location = shutil.which("ffmpeg")
    if ffmpeg_location is None:
        return False
    return True
def _save_wav_np(path, wav_tensor):
    """Write ``wav_tensor`` to ``path`` as a 16-bit PCM WAV at ``SAMPLE_RATE``.

    The tensor is detached, moved to the CPU and converted to a float32 NumPy
    array, then written with soundfile, so no torchaudio backend is needed.
    The input is expected to be mono audio in [-1, 1] (not checked here).
    """
    cpu_tensor = wav_tensor.detach().cpu()
    samples = cpu_tensor.numpy().astype(np.float32)
    sf.write(file=path, data=samples, samplerate=SAMPLE_RATE, subtype="PCM_16")
def tts_luganda(text):
    """Synthesize Luganda speech for ``text`` and write it to temporary files.

    Args:
        text: The text to speak. ``None``, empty and whitespace-only input is
            rejected without running the models.

    Returns:
        A ``(wav_path, mp3_path, status)`` tuple. ``wav_path`` is the path of
        the generated WAV file, ``mp3_path`` is the path of the MP3 produced by
        ffmpeg or ``None`` when ffmpeg is missing or the conversion failed, and
        ``status`` is a short message for the UI. For rejected input the tuple
        is ``(None, None, "Please enter Luganda text.")``.
    """
    text = (text or "").strip()
    if not text:
        return None, None, "Please enter Luganda text."

    # Synthesize: text -> mel spectrogram -> waveform.
    mel = _ensure_mel_shape(taco.encode_text(text))
    wav = vocoder.decode_batch(mel)[0].squeeze(0)  # expected to be a 1D torch tensor

    # Save a temporary WAV. mkstemp yields a unique name per call; a name
    # derived from the current second would let two requests arriving in the
    # same second overwrite each other's audio.
    fd, wav_path = tempfile.mkstemp(prefix="luganda_tts_", suffix=".wav")
    os.close(fd)  # soundfile reopens the file by path
    _save_wav_np(wav_path, wav)

    # Optional MP3 via ffmpeg (best effort: any failure just means "no MP3").
    mp3_path = None
    if _have_ffmpeg():
        mp3_candidate = os.path.splitext(wav_path)[0] + ".mp3"
        try:
            subprocess.run(
                [
                    "ffmpeg", "-y", "-i", wav_path,
                    "-codec:a", "libmp3lame", "-q:a", "2",
                    mp3_candidate,
                ],
                check=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except (subprocess.SubprocessError, OSError):
            # ffmpeg exited non-zero or could not be started; keep WAV only.
            mp3_path = None
        else:
            mp3_path = mp3_candidate

    status = "✅ Done."
    status += " (WAV + MP3 ready)" if mp3_path else " (WAV ready)"
    return wav_path, mp3_path, status
# Build the UI: one text box and a button that runs tts_luganda, whose three
# return values fill the audio player, the MP3 download slot and the status line.
with gr.Blocks(title="Luganda TTS") as demo:
    gr.Markdown("# 🌍 Luganda Text-to-Speech\nType Luganda, click **Generate**, and listen/download the audio.")
    text = gr.Textbox(label="Luganda text", lines=6, value="Ngenda mu kibuga Kampala olunaku lwa leero.")
    btn = gr.Button("Generate", variant="primary")
    # type="filepath" matches the WAV path string returned by tts_luganda.
    out_wav = gr.Audio(label="WAV (22.05 kHz)", type="filepath")
    out_mp3 = gr.File(label="Download MP3", interactive=False)
    status = gr.Markdown("Ready.")
    btn.click(fn=tts_luganda, inputs=text, outputs=[out_wav, out_mp3, status])

# Just enable queue with defaults (no unsupported args)
demo.queue().launch()