Spaces:

hon9kon9ize
/

Cantonese-TTS-playground

Running

App Files Files Community

Cheng Jed committed on Mar 26

Commit

c005bf8

1 Parent(s): ad04301

initial commit

Browse files

Files changed (5) hide show

app.py +118 -0
tts.py +140 -0
voices/doraemon3.wav +0 -0
voices/mk_girl.wav +0 -0
voices/sing.mp3 +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import gradio as gr
+import asyncio
+import base64
+import os
+from tts import voices, tts, get_task_result, Voice
+import tempfile
+def generate_speech(text, voice_name, custom_audio=None, custom_prompt_text=None):
+    """Generate speech from text using the selected voice or custom voice"""
+    if not text.strip():
+        return None, "Please enter some text"
+    output_file = "temp_output.wav"
+    # Handle custom voice upload
+    if custom_audio is not None and custom_prompt_text and custom_prompt_text.strip():
+        # Create a temporary Voice object with the uploaded audio
+        temp_audio_path = custom_audio
+        voice = {
+            "name": "Custom Voice",
+            "promptText": custom_prompt_text,
+            "promptAudio": temp_audio_path
+        }
+    else:
+        # Use predefined voice
+        voice = voices[voice_name]
+    async def process_tts():
+        try:
+            task_id = await tts(text, voice)
+            while True:
+                result = await get_task_result(task_id)
+                if result['status'] != 'PENDING':
+                    break
+                await asyncio.sleep(1)
+            if result['status'] == 'SUCCESS':
+                audio_data = result['audio_url']
+                if ',' in audio_data:
+                    audio_data = audio_data.split(',')[1]
+                with open(output_file, 'wb') as f:
+                    f.write(base64.b64decode(audio_data))
+                return output_file, f"Successfully generated audio using {voice['name']}"
+            else:
+                return None, f"TTS generation failed: {result['message']}"
+        except Exception as e:
+            return None, f"Error: {str(e)}"
+    return asyncio.run(process_tts())
+# Create a dictionary of voice names for the dropdown
+# (maps each key of `voices` to that voice's display name)
+voice_options = {k: v["name"] for k, v in voices.items()}
+# Create the Gradio interface
+with gr.Blocks(title="Cantonese Text-to-Speech") as demo:
+    gr.Markdown("# Cantonese Text-to-Speech Demo")
+    gr.Markdown("Enter text in Cantonese and select a voice to generate speech.")
+    with gr.Row():
+        # First column: text to synthesize and the predefined-voice selector
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(
+                placeholder="輸入廣東話文字...",
+                label="Text to convert",
+                lines=5
+            )
+            with gr.Group():
+                gr.Markdown("### Choose a voice option")
+                # The dropdown lists the voice keys; the first key is preselected
+                voice_dropdown = gr.Dropdown(
+                    choices=list(voice_options.keys()),
+                    value=list(voice_options.keys())[0],
+                    label="Select Predefined Voice",
+                    info="Choose a voice for synthesis"
+                )
+                # Display the actual voice name based on the selection
+                voice_name_display = gr.Markdown(value=f"Selected Voice: {voice_options[list(voice_options.keys())[0]]}")
+        # Second column: optional custom voice sample plus the generate button
+        with gr.Column(scale=2):
+            with gr.Group():
+                gr.Markdown("### Or upload your own voice (optional)")
+                # type="filepath": generate_speech receives the upload as a path
+                custom_audio = gr.Audio(
+                    label="Upload Voice Sample (WAV format)",
+                    type="filepath",
+                    format="wav"
+                )
+                # generate_speech only uses the custom voice when this text is non-blank
+                custom_prompt_text = gr.Textbox(
+                    placeholder="Enter the exact transcription of the uploaded audio...",
+                    label="Transcription of Uploaded Audio (required if using custom voice)",
+                    lines=2
+                )
+                gr.Markdown("*Note: The custom voice sample should be clear with minimal background noise.*")
+            generate_btn = gr.Button("Generate Speech", variant="primary")
+        # Third column: generated audio and the status message
+        with gr.Column(scale=3):
+            audio_output = gr.Audio(label="Generated Speech", type="filepath")
+            status_text = gr.Markdown("Ready to generate speech")
+    # Update the voice name display when dropdown changes
+    voice_dropdown.change(
+        fn=lambda x: f"Selected Voice: {voice_options[x]}",
+        inputs=voice_dropdown,
+        outputs=voice_name_display
+    )
+    # Generate speech when button is clicked
+    # (inputs are passed positionally in generate_speech's parameter order;
+    # its two return values fill audio_output and status_text)
+    generate_btn.click(
+        fn=generate_speech,
+        inputs=[text_input, voice_dropdown, custom_audio, custom_prompt_text],
+        outputs=[audio_output, status_text],
+        concurrency_limit=1
+    )
+# Launch the app only when this file is run as a script
+if __name__ == "__main__":
+    demo.launch()

tts.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import requests
+import os
+import io
+from os import path
+from typing import Dict, Literal, TypedDict, Optional
+import argparse
+import asyncio
+import base64
+# Environment variables
+TTS_CLIENT_ID = os.environ.get('TTS_CLIENT_ID')
+TTS_CLIENT_SECRET = os.environ.get('TTS_CLIENT_SECRET')
+TTS_API_URL = os.environ.get('TTS_API_URL')
+if not TTS_CLIENT_ID or not TTS_CLIENT_SECRET or not TTS_API_URL:
+    raise ValueError('Missing environment variables')
+class TaskResult(TypedDict):
+    """Task status record; the declared return type of ``get_task_result``."""
+    task_id: str
+    # Read by the callers in this file when status is not 'SUCCESS'
+    message: str
+    # Callers in this file keep polling while this is 'PENDING'
+    status: Literal['PENDING', 'SUCCESS', 'FAILED']
+    audio_url: str  # base64 encoded wav audio
+class Voice(TypedDict):
+    """Voice configuration passed to ``tts``."""
+    # Display name shown to the user
+    name: str
+    # Sent as the 'prompt_text' form field by tts()
+    promptText: str
+    # Filesystem path of the prompt audio; tts() opens it in binary mode
+    promptAudio: str
+voices: Dict[str, Voice] = {
+    "mk_girl": {
+        "name": "👧 凱婷",
+        "promptText": "我決定咗啦，我要做一件到目前為止又或者永遠都唔會再見到我做嘅事。",
+        "promptAudio": path.join(path.dirname(__file__), "./voices/mk_girl.wav")
+    },
+    "doraemon": {
+        "name": "🥸 全叔",
+        "promptText": "各位觀眾大家好，我叮噹呢又同你哋見面啦。好多謝咁多年嚟各位嘅捧場同支持。",
+        "promptAudio": path.join(path.dirname(__file__), "./voices/doraemon3.wav")
+    },
+    "周星馳": {
+        "name": "😈 星爺",
+        "promptText": "大家好啊，想唔想同我做好朋友啊。",
+        "promptAudio": path.join(path.dirname(__file__), "./voices/sing.mp3")
+    }
+}
+async def tts(input_text: str, voice: Voice) -> str:
+    """
+    Send TTS request with voice information
+    Args:
+        input_text: Text to be converted to speech
+        voice: Voice configuration
+    Returns:
+        task_id: ID of the TTS task
+    """
+    files = {
+        'input_text': (None, input_text),
+        'prompt_text': (None, voice['promptText']),
+        'audio': ('prompt.wav', open(voice['promptAudio'], 'rb')),
+        'speed': (None, '1.0')
+    }
+    headers = {
+        'CF-Access-Client-Id': TTS_CLIENT_ID,
+        'CF-Access-Client-Secret': TTS_CLIENT_SECRET
+    }
+    response = requests.post(f"{TTS_API_URL}/api/tts",
+                             files=files,
+                             headers=headers)
+    response.raise_for_status()
+    return response.json()['task_id']
+async def get_task_result(task_id: str) -> TaskResult:
+    """
+    Get result of TTS task
+    Args:
+        task_id: ID of the TTS task
+    Returns:
+        Task result information
+    """
+    headers = {
+        'Content-Type': 'application/json',
+        'CF-Access-Client-Id': TTS_CLIENT_ID,
+        'CF-Access-Client-Secret': TTS_CLIENT_SECRET
+    }
+    response = requests.get(f"{TTS_API_URL}/api/tts/{task_id}",
+                           headers=headers)
+    response.raise_for_status()
+    return response.json()
+async def main():
+    parser = argparse.ArgumentParser(description='Text-to-Speech with CosyVoice')
+    parser.add_argument('--text', help='Text to convert to speech')
+    parser.add_argument('--voice', '-v', choices=list(voices.keys()), default='mk_girl',
+                        help='Voice to use for synthesis')
+    parser.add_argument('--output', '-o', default='output.wav',
+                        help='Output audio file path')
+    args = parser.parse_args()
+    voice = voices[args.voice]
+    print(f"Converting text to speech using voice: {voice['name']}")
+    print(f"Text: {args.text}")
+    try:
+        task_id = await tts(args.text, voice)
+        print(f"TTS request submitted. Task ID: {task_id}")
+        while True:
+            result = await get_task_result(task_id)
+            if result['status'] != 'PENDING':
+                break
+            print("Waiting for TTS processing...")
+            await asyncio.sleep(1)
+        if result['status'] == 'SUCCESS':
+            audio_data = result['audio_url']
+            if ',' in audio_data:
+                audio_data = audio_data.split(',')[1]
+            with open(args.output, 'wb') as f:
+                f.write(base64.b64decode(audio_data))
+            print(f"Audio saved to {args.output}")
+        else:
+            print(f"TTS generation failed: {result['message']}")
+    except Exception as e:
+        print(f"Error: {str(e)}")
+if __name__ == "__main__":
+    asyncio.run(main())

voices/doraemon3.wav ADDED Viewed

Binary file (624 kB). View file

voices/mk_girl.wav ADDED Viewed

Binary file (182 kB). View file

voices/sing.mp3 ADDED Viewed

Binary file (125 kB). View file