import os import gradio as gr import numpy as np import soundfile as sf import tempfile import time import json from typing import List, Tuple # -------------------------- # Repo ensure (clone + LFS pull) # -------------------------- REPO_URL = "https://huggingface.co/Supertone/supertonic" TARGET_DIR = "supertonic" # folder name after clone def run_cmd(cmd: str) -> int: print(f"[CMD] {cmd}") return os.system(cmd) print("=== Checking Supertonic repo ===") if not os.path.exists(TARGET_DIR): print("[+] Cloning repo (LFS pointers only)...") run_cmd("git lfs install") ret = run_cmd(f"GIT_LFS_SKIP_SMUDGE=1 git clone {REPO_URL} {TARGET_DIR}") if ret != 0: raise RuntimeError("git clone failed") else: print("[✓] Repo already exists. Skipping clone.") # Pull LFS assets (real ONNX files). If user doesn't want this, remove this line. print("[+] Pulling LFS files (this will download ONNX models; skip if already pulled)...") run_cmd(f"cd {TARGET_DIR} && git lfs pull") # -------------------------- # Make repo importable # -------------------------- import sys sys.path.insert(0, os.path.abspath(TARGET_DIR)) # -------------------------- # Import your TTS code from repo # -------------------------- from tts_model import ( load_text_to_speech, load_voice_style, sanitize_filename, chunk_text, ) # -------------------------- # Discover available voice styles # -------------------------- VOICE_STYLES_DIR = os.path.join(TARGET_DIR, "voice_styles") def list_voice_styles(styles_dir: str = VOICE_STYLES_DIR) -> List[str]: if not os.path.exists(styles_dir): return [] files = sorted( [f for f in os.listdir(styles_dir) if f.lower().endswith(".json")] ) return files available_styles = list_voice_styles() if not available_styles: print("No voice styles found in", VOICE_STYLES_DIR) else: print("Found voice styles:", available_styles) # -------------------------- # Load TTS model once # -------------------------- ONNX_DIR = os.path.join(TARGET_DIR, "onnx") TOTAL_STEP = 15 print("Loading TTS model...") tts = load_text_to_speech(ONNX_DIR) # may take a while # -------------------------- # Helper: load a single style by filename (returns Style) # -------------------------- def load_style_by_name(filename: str): if not filename: raise ValueError("No style selected") path = os.path.join(VOICE_STYLES_DIR, filename) if not os.path.exists(path): raise FileNotFoundError(f"Style file not found: {path}") # load_voice_style expects list of paths return load_voice_style([path]) # -------------------------- # Voice style descriptions # -------------------------- VOICE_DESCRIPTIONS = { "F1.json": "Female Voice 1 - Clear and professional", "F2.json": "Female Voice 2 - Warm and expressive", "M1.json": "Male Voice 1 - Deep and authoritative", "M2.json": "Male Voice 2 - Casual and friendly" } # -------------------------- # Continuous Streaming TTS Generator # -------------------------- def run_tts_stream(text: str, speed: float, style_name: str): """ Generator that yields continuous audio stream as chunks are generated. """ try: if not text or not text.strip(): yield None, "❌ Text cannot be empty." return try: style = load_style_by_name(style_name) except Exception as e: yield None, f"❌ Failed to load voice style: {e}" return chunks = chunk_text(text) total_chunks = len(chunks) yield None, f"🟡 Starting generation: {total_chunks} chunk(s) to process..." # Create a temporary file for streaming temp_dir = tempfile.mkdtemp() stream_file = os.path.join(temp_dir, "stream_audio.wav") all_audio_chunks = [] for idx, chunk in enumerate(chunks, start=1): yield None, f"⏳ Generating chunk {idx}/{total_chunks}..." # Generate the audio chunk wav, dur = tts._infer([chunk], style, TOTAL_STEP, float(speed)) audio = wav.squeeze() all_audio_chunks.append(audio) # Extract duration as scalar if hasattr(dur, '__len__'): dur_scalar = float(dur[0]) if len(dur) > 0 else 0.0 else: dur_scalar = float(dur) # Concatenate all chunks so far current_audio = np.concatenate(all_audio_chunks) if len(all_audio_chunks) > 1 else all_audio_chunks[0] # Save current state to temporary file sf.write(stream_file, current_audio, tts.sample_rate) # Yield the file path - Gradio will stream this continuously yield stream_file, f"🔊 Playing... Chunk {idx}/{total_chunks} ready ({dur_scalar:.1f}s)" # Final update with complete audio total_duration = len(current_audio) / tts.sample_rate yield stream_file, f"🎉 Generation complete! Total duration: {total_duration:.1f}s" except Exception as e: yield None, f"❌ Error: {type(e).__name__}: {e}" # -------------------------- # Full generation endpoint # -------------------------- def run_tts_full(text: str, speed: float, style_name: str): if not text or not text.strip(): return None, "❌ Text cannot be empty." try: style = load_style_by_name(style_name) wav_cat, dur_cat = tts(text=text, style=style, total_step=TOTAL_STEP, speed=float(speed), silence_duration=0.2) audio = wav_cat.squeeze() if hasattr(dur_cat, '__len__'): dur_scalar = float(dur_cat[0]) if len(dur_cat) > 0 else 0.0 else: dur_scalar = float(dur_cat) tmp_path = tempfile.mktemp(suffix=".wav") sf.write(tmp_path, audio, tts.sample_rate) return tmp_path, f"✅ Generated successfully! Duration: {dur_scalar:.1f}s" except Exception as e: return None, f"❌ Error: {type(e).__name__}: {e}" # -------------------------- # Professional UI # -------------------------- def ui(): with gr.Blocks(title="Supertonic TTS Studio") as demo: # Custom CSS gr.HTML(""" """) # Header Section gr.HTML("""
Professional Text-to-Speech with Real-time Continuous Streaming