ibrahimabdelaal committed on
Commit
6d0fe97
·
1 Parent(s): 7307ef3

Add torchaudio dependency and improve UI layout

Browse files
Files changed (2) hide show
  1. app.py +53 -95
  2. requirements.txt +1 -0
app.py CHANGED
@@ -286,117 +286,75 @@ DEFAULT_REFERENCE_AUDIO = "reference.wav"
286
  # Create Gradio interface
287
  with gr.Blocks(title="Arabic TTS - Spark", theme=gr.themes.Soft()) as demo:
288
  gr.Markdown("""
289
- # 🎙️ Arabic Text-to-Speech (Spark Model)
290
 
291
- Generate high-quality Arabic speech from text using the Spark TTS model with voice cloning capabilities.
292
 
293
  **Model:** [IbrahimSalah/Arabic-TTS-Spark](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
294
-
295
- ### ⚡ Quick Start:
296
- 1. Enter **diacritized Arabic text** to synthesize (تشكيل required)
297
- 2. Use the default reference audio or upload your own (5-30 seconds, clear speech)
298
- 3. Provide the **diacritized transcript** of your reference audio
299
- 4. Click "Generate Speech"
300
-
301
- ### ⚠️ Important Notes:
302
- - **Diacritized text (تشكيل) is required** for both input text and reference transcript
303
- - You can use any LLM (GPT, Claude, Gemini) to add diacritics to your text
304
- - Example prompt for LLM: "أضف التشكيل الكامل للنص التالي: [your text]"
305
- - Default reference audio is provided for quick testing
306
-
307
- ### 💡 Tips:
308
- - Use high-quality reference audio with minimal background noise
309
- - Reference audio should be 5-30 seconds long
310
- - Longer texts are automatically split into chunks with smooth transitions
311
- - First generation may take 30-60 seconds due to model loading
312
  """)
313
 
314
  with gr.Row():
315
- with gr.Column():
316
  text_input = gr.Textbox(
317
- label="📝 Text to Synthesize (Diacritized Arabic / نص عربي مُشكّل)",
318
- placeholder="Enter diacritized Arabic text here... مثال: تُسَاهِمُ التِّقْنِيَّاتُ الْحَدِيثَةُ فِي تَسْهِيلِ حَيَاةِ الْإِنْسَانِ",
319
- lines=5,
320
- value=DEFAULT_TEXT,
321
- info="⚠️ Text must include diacritics (تشكيل). Use GPT/Claude to add them."
322
- )
323
-
324
- gr.Markdown("**🎵 Reference Audio (Default Provided)**")
325
- gr.Markdown("*Upload custom reference audio or use the default (WAV format, 5-30 seconds)*")
326
- reference_audio = gr.Audio(
327
- label="Reference Audio",
328
- type="filepath",
329
- value=DEFAULT_REFERENCE_AUDIO
330
  )
331
 
332
- reference_transcript = gr.Textbox(
333
- label="📄 Reference Transcript (Diacritized / نص مُشكّل)",
334
- placeholder="Enter the diacritized transcript of your reference audio...",
335
- lines=2,
336
- value=DEFAULT_REFERENCE_TEXT,
337
- info="⚠️ Must match the reference audio exactly with full diacritics"
338
- )
 
 
 
 
 
 
 
 
 
339
 
340
  with gr.Accordion("⚙️ Advanced Settings", open=False):
341
- temperature = gr.Slider(0.1, 1.5, value=0.8, step=0.1, label="Temperature",
342
- info="Higher = more variation (0.6-1.0 recommended)")
343
- top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P",
344
- info="Nucleus sampling threshold")
345
- max_chunk = gr.Slider(100, 500, value=300, step=50, label="Max Chunk Length",
346
- info="Characters per chunk for long texts")
347
- crossfade = gr.Slider(0.01, 0.2, value=0.08, step=0.01, label="Crossfade Duration (s)",
348
- info="Smooth transitions between chunks")
349
 
350
  generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
351
 
352
- with gr.Column():
353
  output_audio = gr.Audio(label="🔊 Generated Speech", type="filepath")
354
- status_text = gr.Textbox(label="Status", interactive=False, lines=3)
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
  # Examples
357
- gr.Markdown("### 📚 Examples (All with Full Diacritics)")
358
- gr.Examples(
359
- examples=[
360
- [DEFAULT_TEXT, DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT],
361
- ["السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ، كَيْفَ حَالُكَ الْيَوْمَ؟", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT],
362
- ["الذَّكَاءُ الِاصْطِنَاعِيُّ يُغَيِّرُ الْعَالَمَ بِسُرْعَةٍ كَبِيرَةٍ وَيُسَاهِمُ فِي تَطْوِيرِ حُلُولٍ مُبْتَكَرَةٍ لِلْمُشْكِلَاتِ الْمُعَقَّدَةِ.", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT]
363
- ],
364
- inputs=[text_input, reference_audio, reference_transcript],
365
- label="Click an example to try it out"
366
- )
367
-
368
- gr.Markdown("""
369
- ### 📖 About
370
- This Space uses the **Arabic-TTS-Spark** model for high-quality Arabic text-to-speech synthesis with voice cloning.
371
-
372
- ### 🔧 How to Add Diacritics (التشكيل):
373
-
374
- **Option 1: Use AI (Recommended)**
375
- - Ask ChatGPT, Claude, or Gemini: "أضف التشكيل الكامل للنص التالي: [paste your text]"
376
- - Or in English: "Add full Arabic diacritics to the following text: [paste your text]"
377
-
378
- **Option 2: Online Tools**
379
- - [Tashkeel Tool](https://tahadz.com/mishkal)
380
- - [Harakat.ai](https://harakat.ai)
381
-
382
- **Option 3: Microsoft Word**
383
- - Type Arabic text → Select text → Review tab → Arabic Diacritics
384
-
385
- ### 📊 Model Info
386
- - **Architecture**: Transformer-based TTS with voice cloning
387
- - **Sample Rate**: 24kHz
388
- - **Languages**: Modern Standard Arabic (MSA) and dialects
389
- - **Max Input**: Unlimited (automatic chunking)
390
-
391
- ### 🔗 Links
392
- - **Model Card**: [IbrahimSalah/Arabic-TTS-Spark](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
393
- - **F5-TTS Arabic**: [IbrahimSalah/Arabic-F5-TTS-v2](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
394
- - **Report Issues**: [Discussions](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark/discussions)
395
-
396
- ---
397
-
398
- Made with ❤️ by **Ibrahim Salah** | [HuggingFace Profile](https://huggingface.co/IbrahimSalah)
399
- """)
400
 
401
  generate_btn.click(
402
  fn=generate_speech,
@@ -405,5 +363,5 @@ with gr.Blocks(title="Arabic TTS - Spark", theme=gr.themes.Soft()) as demo:
405
  )
406
 
407
  if __name__ == "__main__":
408
- demo.queue(max_size=20) # Enable queue for better handling
409
  demo.launch()
 
286
  # Create Gradio interface
287
  with gr.Blocks(title="Arabic TTS - Spark", theme=gr.themes.Soft()) as demo:
288
  gr.Markdown("""
289
+ # 🎙️ Arabic Text-to-Speech | Spark Model
290
 
291
+ High-quality Arabic TTS with voice cloning. **Diacritized text (تشكيل) required.**
292
 
293
  **Model:** [IbrahimSalah/Arabic-TTS-Spark](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  """)
295
 
296
  with gr.Row():
297
+ with gr.Column(scale=1):
298
  text_input = gr.Textbox(
299
+ label="📝 Text to Synthesize (Arabic with Tashkeel)",
300
+ placeholder="أَدْخِلْ نَصًّا عَرَبِيًّا مُشَكَّلًا هُنَا...",
301
+ lines=6,
302
+ value=DEFAULT_TEXT
 
 
 
 
 
 
 
 
 
303
  )
304
 
305
+ with gr.Row():
306
+ with gr.Column():
307
+ gr.Markdown("**🎵 Reference Audio**")
308
+ reference_audio = gr.Audio(
309
+ label="",
310
+ type="filepath",
311
+ value=DEFAULT_REFERENCE_AUDIO
312
+ )
313
+
314
+ with gr.Column():
315
+ reference_transcript = gr.Textbox(
316
+ label="📄 Reference Transcript (with Tashkeel)",
317
+ placeholder="النص المقابل للصوت المرجعي...",
318
+ lines=4,
319
+ value=DEFAULT_REFERENCE_TEXT
320
+ )
321
 
322
  with gr.Accordion("⚙️ Advanced Settings", open=False):
323
+ with gr.Row():
324
+ temperature = gr.Slider(0.1, 1.5, value=0.8, step=0.1, label="Temperature")
325
+ top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P")
326
+ with gr.Row():
327
+ max_chunk = gr.Slider(100, 500, value=300, step=50, label="Max Chunk Length")
328
+ crossfade = gr.Slider(0.01, 0.2, value=0.08, step=0.01, label="Crossfade (s)")
 
 
329
 
330
  generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
331
 
332
+ with gr.Column(scale=1):
333
  output_audio = gr.Audio(label="🔊 Generated Speech", type="filepath")
334
+ status_text = gr.Textbox(label="Status", interactive=False, lines=2)
335
+
336
+ gr.Markdown("""
337
+ ### ℹ️ Requirements
338
+ - **Diacritized text is required** (تشكيل)
339
+ - Reference audio: 5-30 seconds, clear speech
340
+ - Use AI (ChatGPT/Claude) or [online tools](https://tahadz.com/mishkal) to add diacritics
341
+
342
+ ### 🔗 Resources
343
+ - [Model Card](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
344
+ - [F5-TTS Arabic](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
345
+ - [Report Issues](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark/discussions)
346
+ """)
347
 
348
  # Examples
349
+ with gr.Accordion("📚 Examples", open=False):
350
+ gr.Examples(
351
+ examples=[
352
+ [DEFAULT_TEXT, DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT],
353
+ ["السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ، كَيْفَ حَالُكَ الْيَوْمَ؟", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT],
354
+ ["الذَّكَاءُ الِاصْطِنَاعِيُّ يُغَيِّرُ الْعَالَمَ بِسُرْعَةٍ كَبِيرَةٍ وَيُسَاهِمُ فِي تَطْوِيرِ حُلُولٍ مُبْتَكَرَةٍ.", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT]
355
+ ],
356
+ inputs=[text_input, reference_audio, reference_transcript]
357
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
  generate_btn.click(
360
  fn=generate_speech,
 
363
  )
364
 
365
  if __name__ == "__main__":
366
+ demo.queue(max_size=20)
367
  demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  gradio==4.44.0
2
  torch==2.1.0
 
3
  transformers==4.46.2
4
  soundfile==0.12.1
5
  numpy==1.24.3
 
1
  gradio==4.44.0
2
  torch==2.1.0
3
+ torchaudio==2.1.0
4
  transformers==4.46.2
5
  soundfile==0.12.1
6
  numpy==1.24.3