IFMedTechdemo committed on
Commit
43d1773
·
verified ·
1 Parent(s): d1aa924

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -174
app.py CHANGED
@@ -1,124 +1,29 @@
1
  """
2
- 🎙️ Multi-Engine TTS – Zero-GPU edition
3
- Kokoro │ Veena │ pyttsx3 (fallback)
4
  Routes every synthesis to an idle A100.
5
  """
6
 
7
  import os, tempfile, subprocess, numpy as np
8
  import gradio as gr
 
9
  import soundfile as sf
10
- import spaces # << Zero-GPU helper
11
 
12
  # ------------------------------------------------------------------
13
- # 1. Engine availability flags
14
- # ------------------------------------------------------------------
15
- KOKORO_OK = False
16
- VEENA_OK = False
17
- PYT_OK = False
18
-
19
- try:
20
- from kokoro import KPipeline
21
- KOKORO_OK = True
22
- except Exception as e:
23
- print("Kokoro unavailable:", e)
24
-
25
- try:
26
- import torch, transformers, snac
27
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
28
- from snac import SNAC
29
- VEENA_OK = True
30
- except Exception as e:
31
- print("Veena deps unavailable:", e)
32
-
33
- try:
34
- import pyttsx3
35
- PYT_OK = True
36
- except Exception as e:
37
- print("pyttsx3 unavailable:", e)
38
-
39
- # ------------------------------------------------------------------
40
- # 2. Lazy model loader (runs once per GPU worker)
41
  # ------------------------------------------------------------------
42
  kokoro_pipe = None
43
- veena_model = None
44
- veena_tok = None
45
- veena_snac = None
46
 
47
  def load_kokoro():
48
  global kokoro_pipe
49
- if kokoro_pipe is None and KOKORO_OK:
 
50
  kokoro_pipe = KPipeline(lang_code='a')
51
  return kokoro_pipe
52
 
53
- def load_veena():
54
- global veena_model, veena_tok, veena_snac
55
- if veena_model is None and VEENA_OK:
56
- bnb = BitsAndBytesConfig(load_in_4bit=True,
57
- bnb_4bit_quant_type="nf4",
58
- bnb_4bit_compute_dtype=torch.bfloat16)
59
- veena_model = AutoModelForCausalLM.from_pretrained(
60
- "maya-research/veena-tts",
61
- quantization_config=bnb,
62
- device_map="auto",
63
- trust_remote_code=True)
64
- veena_tok = AutoTokenizer.from_pretrained("maya-research/veena-tts",
65
- trust_remote_code=True)
66
- veena_snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
67
- if torch.cuda.is_available():
68
- veena_snac = veena_snac.cuda()
69
- return veena_model
70
-
71
  # ------------------------------------------------------------------
72
- # 3. Generation helpers (CPU→GPU off-load)
73
  # ------------------------------------------------------------------
74
- AUDIO_CODE_BASE_OFFSET = 128266
75
- START_OF_SPEECH_TOKEN = 128257
76
- END_OF_SPEECH_TOKEN = 128258
77
- START_OF_HUMAN_TOKEN = 128259
78
- END_OF_HUMAN_TOKEN = 128260
79
- START_OF_AI_TOKEN = 128261
80
- END_OF_AI_TOKEN = 128262
81
-
82
- def decode_snac(tokens):
83
- if len(tokens) % 7:
84
- return None
85
- codes = [[] for _ in range(3)]
86
- offsets = [AUDIO_CODE_BASE_OFFSET + i*4096 for i in range(7)]
87
- for i in range(0, len(tokens), 7):
88
- codes[0].append(tokens[i] - offsets[0])
89
- codes[1].extend([tokens[i+1]-offsets[1], tokens[i+4]-offsets[4]])
90
- codes[2].extend([tokens[i+2]-offsets[2], tokens[i+3]-offsets[3],
91
- tokens[i+5]-offsets[5], tokens[i+6]-offsets[6]])
92
- device = veena_snac.device
93
- hierarchical = [torch.tensor(c, dtype=torch.int32, device=device).unsqueeze(0)
94
- for c in codes]
95
- with torch.no_grad():
96
- wav = veena_snac.decode(hierarchical).squeeze().clamp(-1,1).cpu().numpy()
97
- return wav
98
-
99
- def tts_veena(text, speaker, temperature, top_p):
100
- load_veena()
101
- prompt = f"<spk_{speaker}> {text}"
102
- tok = veena_tok.encode(prompt, add_special_tokens=False)
103
- input_ids = [START_OF_HUMAN_TOKEN] + tok + [END_OF_HUMAN_TOKEN,
104
- START_OF_AI_TOKEN, START_OF_SPEECH_TOKEN]
105
- input_ids = torch.tensor([input_ids], device=veena_model.device)
106
- max_new = min(int(len(text)*1.3)*7 + 21, 700)
107
- out = veena_model.generate(
108
- input_ids,
109
- max_new_tokens=max_new,
110
- do_sample=True,
111
- temperature=temperature,
112
- top_p=top_p,
113
- repetition_penalty=1.05,
114
- pad_token_id=veena_tok.pad_token_id,
115
- eos_token_id=[END_OF_SPEECH_TOKEN, END_OF_AI_TOKEN])
116
- gen = out[0, len(input_ids[0]):].tolist()
117
- snac_toks = [t for t in gen if AUDIO_CODE_BASE_OFFSET <= t < AUDIO_CODE_BASE_OFFSET+7*4096]
118
- if not snac_toks:
119
- raise RuntimeError("No audio tokens produced")
120
- return decode_snac(snac_toks)
121
-
122
  def tts_kokoro(text, voice, speed):
123
  pipe = load_kokoro()
124
  generator = pipe(text, voice=voice, speed=speed)
@@ -126,99 +31,49 @@ def tts_kokoro(text, voice, speed):
126
  return audio
127
  raise RuntimeError("Kokoro generation failed")
128
 
129
- def tts_pyttsx3(text, rate, volume):
130
- engine = pyttsx3.init()
131
- engine.setProperty('rate', rate)
132
- engine.setProperty('volume', volume)
133
- fd, path = tempfile.mkstemp(suffix='.wav')
134
- os.close(fd)
135
- engine.save_to_file(text, path)
136
- engine.runAndWait()
137
- wav, sr = sf.read(path)
138
- os.remove(path)
139
- return wav
140
-
141
  # ------------------------------------------------------------------
142
- # 4. ZERO-GPU ENTRY POINT (decorated)
143
  # ------------------------------------------------------------------
144
  @spaces.GPU
145
- def synthesise(text, engine, voice, speed, speaker, temperature, top_p, rate, vol):
146
  if not text.strip():
147
  raise gr.Error("Please enter some text.")
148
- if engine == "kokoro" and KOKORO_OK:
149
- wav = tts_kokoro(text, voice=voice, speed=speed)
150
- elif engine == "veena" and VEENA_OK:
151
- wav = tts_veena(text, speaker=speaker, temperature=temperature, top_p=top_p)
152
- elif engine == "pyttsx3" and PYT_OK:
153
- wav = tts_pyttsx3(text, rate=rate, volume=vol)
154
- else:
155
- raise gr.Error(f"{engine} is not available on this Space.")
156
  fd, tmp = tempfile.mkstemp(suffix='.wav')
157
  os.close(fd)
158
  sf.write(tmp, wav, 24000)
159
  return tmp
160
 
161
  # ------------------------------------------------------------------
162
- # 5. Gradio UI (unchanged visuals)
163
  # ------------------------------------------------------------------
164
- css = """footer {visibility: hidden} #col-left {max-width: 320px}"""
165
 
166
- with gr.Blocks(css=css, title="Multi-Engine TTS – Zero-GPU") as demo:
167
- gr.Markdown("## 🎙️ Multi-Engine TTS Demo – Zero-GPU \n*Kokoro ‑ Veena ‑ pyttsx3*")
168
 
169
  with gr.Row():
170
- with gr.Column(elem_id="col-left"):
171
- engine = gr.Radio(label="Engine",
172
- choices=[e for e in ["kokoro","veena","pyttsx3"]
173
- if globals().get({"pyttsx3":"PYT_OK"}.get(e,e.upper()+"_OK"), False)],
174
- value="kokoro" if KOKORO_OK else
175
- "veena" if VEENA_OK else "pyttsx3")
176
-
177
- with gr.Group(visible=KOKORO_OK) as kokoro_box:
178
- voice = gr.Dropdown(label="Voice",
179
- choices=['af_heart','af_sky','af_mist','af_dusk'],
180
- value='af_heart')
181
- speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed")
182
-
183
- with gr.Group(visible=VEENA_OK) as veena_box:
184
- speaker = gr.Dropdown(label="Speaker",
185
- choices=['kavya','agastya','maitri','vinaya'],
186
- value='kavya')
187
- temperature = gr.Slider(0.1, 1.0, 0.4, step=0.05, label="Temperature")
188
- top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top-p")
189
-
190
- with gr.Group(visible=PYT_OK) as pyttsx3_box:
191
- rate = gr.Slider(50, 300, 180, step=5, label="Words / min")
192
- vol = gr.Slider(0.0, 1.0, 1.0, step=0.05, label="Volume")
193
 
194
  with gr.Column(scale=3):
195
- text = gr.Textbox(label="Text to speak",
196
- placeholder="Type or paste text here …",
197
- lines=6, max_lines=12)
198
- btn = gr.Button("🎧 Synthesise", variant="primary")
 
 
199
  audio_out = gr.Audio(label="Generated speech", type="filepath")
200
 
201
- # show/hide panels
202
- def switch_panel(e):
203
- return (gr.update(visible=e=="kokoro"),
204
- gr.update(visible=e=="veena"),
205
- gr.update(visible=e=="pyttsx3"))
206
- engine.change(switch_panel, inputs=engine,
207
- outputs=[kokoro_box, veena_box, pyttsx3_box])
208
-
209
- # binding
210
- btn.click(synthesise,
211
- inputs=[text, engine, voice, speed, speaker,
212
- temperature, top_p, rate, vol],
213
- outputs=audio_out)
214
 
215
  gr.Markdown("### Tips \n"
216
- "- **Kokoro** – fastest, good quality English \n"
217
- "- **Veena** – multilingual, GPU-friendly (4-bit) \n"
218
- "- **pyttsx3** – offline fallback, any language \n"
219
  "Audio is returned as 24 kHz WAV.")
220
 
221
- # ------------------------------------------------------------------
222
- # 6. Launch
223
- # ------------------------------------------------------------------
224
  demo.launch()
 
1
  """
2
+ 🎙️ Kokoro-TTS-only demo – Zero-GPU edition
 
3
  Routes every synthesis to an idle A100.
4
  """
5
 
6
  import os, tempfile, subprocess, numpy as np
7
  import gradio as gr
8
+ import spaces # Zero-GPU decorator
9
  import soundfile as sf
 
10
 
11
  # ------------------------------------------------------------------
12
+ # 1. Lazy Kokoro loader (runs once per GPU worker)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # ------------------------------------------------------------------
14
  kokoro_pipe = None
 
 
 
15
 
16
  def load_kokoro():
17
  global kokoro_pipe
18
+ if kokoro_pipe is None:
19
+ from kokoro import KPipeline
20
  kokoro_pipe = KPipeline(lang_code='a')
21
  return kokoro_pipe
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # ------------------------------------------------------------------
24
+ # 2. Generation helper
25
  # ------------------------------------------------------------------
26
+ @spaces.GPU
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def tts_kokoro(text, voice, speed):
28
  pipe = load_kokoro()
29
  generator = pipe(text, voice=voice, speed=speed)
 
31
  return audio
32
  raise RuntimeError("Kokoro generation failed")
33
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # ------------------------------------------------------------------
35
+ # 3. Zero-GPU entry point
36
  # ------------------------------------------------------------------
37
  @spaces.GPU
38
+ def synthesise(text, voice, speed):
39
  if not text.strip():
40
  raise gr.Error("Please enter some text.")
41
+ wav = tts_kokoro(text, voice=voice, speed=speed)
 
 
 
 
 
 
 
42
  fd, tmp = tempfile.mkstemp(suffix='.wav')
43
  os.close(fd)
44
  sf.write(tmp, wav, 24000)
45
  return tmp
46
 
47
  # ------------------------------------------------------------------
48
+ # 4. Gradio UI
49
  # ------------------------------------------------------------------
50
+ css = """footer {visibility: hidden}"""
51
 
52
+ with gr.Blocks(css=css, title="Kokoro TTS – Zero-GPU") as demo:
53
+ gr.Markdown("## 🎙️ Kokoro TTS – Zero-GPU Demo")
54
 
55
  with gr.Row():
56
+ with gr.Column():
57
+ voice = gr.Dropdown(
58
+ label="Voice",
59
+ choices=['af_heart', 'af_sky', 'af_mist', 'af_dusk'],
60
+ value='af_heart'
61
+ )
62
+ speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  with gr.Column(scale=3):
65
+ text = gr.Textbox(
66
+ label="Text to speak",
67
+ placeholder="Type or paste text here …",
68
+ lines=6, max_lines=12
69
+ )
70
+ btn = gr.Button("🎧 Synthesise", variant="primary")
71
  audio_out = gr.Audio(label="Generated speech", type="filepath")
72
 
73
+ btn.click(synthesise, inputs=[text, voice, speed], outputs=audio_out)
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  gr.Markdown("### Tips \n"
76
+ "- **Kokoro** – fast, high-quality English TTS \n"
 
 
77
  "Audio is returned as 24 kHz WAV.")
78
 
 
 
 
79
  demo.launch()