Spaces:
Running
Running
| import gradio as gr | |
| import soundfile as sf | |
| import numpy as np | |
| from voxcpm import VoxCPM | |
| import tempfile | |
| import os | |
| import spaces | |
# Instantiate the VoxCPM model a single time at import so that every
# request handled by this process reuses the same loaded instance.
MODEL_REPO_ID = "openbmb/VoxCPM-0.5B"
model = VoxCPM.from_pretrained(MODEL_REPO_ID)
def generate_speech(
    text,
    prompt_audio,
    prompt_text,
    cfg_value,
    inference_timesteps,
    normalize,
    denoise,
    retry_badcase,
    retry_badcase_max_times,
    retry_badcase_ratio_threshold
):
    """Synthesize ``text`` with the module-level ``model`` and save it as a WAV file.

    Args:
        text: Text to synthesize. Empty or whitespace-only input shows a
            Gradio warning and produces no audio.
        prompt_audio: File path of the reference audio, or ``None`` when no
            reference was supplied. Forwarded as ``prompt_wav_path``.
        prompt_text: Text accompanying the reference audio. ``None``, ``""``
            and whitespace-only values are all forwarded as ``None``.
        cfg_value: Forwarded unchanged to ``model.generate``.
        inference_timesteps: Forwarded to ``model.generate`` after an
            ``int`` cast (UI widgets may deliver floats).
        normalize: Forwarded unchanged to ``model.generate``.
        denoise: Forwarded unchanged to ``model.generate``.
        retry_badcase: Forwarded unchanged to ``model.generate``.
        retry_badcase_max_times: Forwarded to ``model.generate`` after an
            ``int`` cast.
        retry_badcase_ratio_threshold: Forwarded unchanged to
            ``model.generate``.

    Returns:
        Path of a temporary ``.wav`` file holding the generated audio, or
        ``None`` when ``text`` is blank. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If generation or writing the audio file fails. Raising
            (rather than merely constructing) the error is what makes Gradio
            display it to the user.
    """
    if not text or not text.strip():
        gr.Warning("Please enter text to generate speech")
        return None

    # With type="filepath" the audio value is already a path (or None).
    prompt_wav_path = prompt_audio

    # An untouched text box typically arrives as "" rather than None; treat
    # every blank variant as "no reference text".
    if not (prompt_text and prompt_text.strip()):
        prompt_text = None

    try:
        wav = model.generate(
            text=text,
            prompt_wav_path=prompt_wav_path,
            prompt_text=prompt_text,
            cfg_value=cfg_value,
            inference_timesteps=int(inference_timesteps),
            normalize=normalize,
            denoise=denoise,
            retry_badcase=retry_badcase,
            retry_badcase_max_times=int(retry_badcase_max_times),
            retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
        )

        # Reserve a unique file name, then close the handle before writing:
        # writing by path while the handle is still open fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # NOTE(review): 16000 Hz is the rate the code has always written;
        # confirm it matches the model's actual output sample rate.
        sf.write(output_path, wav, 16000)
        return output_path
    except Exception as e:
        # gr.Error is an exception class; it must be raised to be shown.
        raise gr.Error(f"Error generating speech: {e}") from e
# Assemble the Gradio UI: inputs and settings in the left column, the
# generated audio and usage tips in the right column.
with gr.Blocks(theme=gr.themes.Soft(), title="VoxCPM Text-to-Speech") as demo:
    gr.Markdown(
        """
        # 🎙️ VoxCPM Text-to-Speech
        Generate highly expressive speech using VoxCPM-0.5B model. Optionally clone voices by providing reference audio.
        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        # ---- Left column: everything the user provides ----
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
                lines=3,
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
            )

            # Optional reference audio and its accompanying text
            with gr.Accordion("Voice Cloning", open=False):
                prompt_audio = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Reference Audio (Upload a reference audio file for voice cloning)",
                )
                prompt_text = gr.Textbox(
                    lines=2,
                    label="Reference Text",
                    placeholder="Text corresponding to the reference audio",
                )

            # Generation parameters, collapsed by default
            with gr.Accordion("Advanced Settings", open=False):
                cfg_value = gr.Slider(
                    label="CFG Value",
                    info="LM guidance on LocDiT, higher for better adherence to prompt",
                    minimum=0.5,
                    maximum=5.0,
                    step=0.1,
                    value=2.0,
                )
                inference_timesteps = gr.Slider(
                    label="Inference Timesteps",
                    info="Higher for better quality, lower for faster speed",
                    minimum=5,
                    maximum=50,
                    step=1,
                    value=10,
                )

                with gr.Row():
                    normalize = gr.Checkbox(
                        label="Normalize",
                        info="Enable external TN tool",
                        value=True,
                    )
                    denoise = gr.Checkbox(
                        label="Denoise",
                        info="Enable external Denoise tool",
                        value=True,
                    )
                    retry_badcase = gr.Checkbox(
                        label="Retry Bad Cases",
                        info="Enable retrying for bad cases",
                        value=True,
                    )

                with gr.Row():
                    retry_badcase_max_times = gr.Number(
                        label="Max Retry Times",
                        minimum=1,
                        maximum=10,
                        step=1,
                        value=3,
                    )
                    retry_badcase_ratio_threshold = gr.Number(
                        label="Retry Ratio Threshold",
                        minimum=1.0,
                        maximum=10.0,
                        step=0.5,
                        value=6.0,
                    )

            generate_btn = gr.Button("🎵 Generate Speech", size="lg", variant="primary")

        # ---- Right column: result player and usage hints ----
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                autoplay=False,
                type="filepath",
                label="Generated Speech",
            )
            gr.Markdown(
                """
                ### Tips:
                - For voice cloning, upload a clear reference audio (3-10 seconds recommended)
                - Higher CFG values provide better prompt adherence but may affect naturalness
                - Increase inference timesteps for better quality at the cost of speed
                - The retry mechanism helps handle edge cases automatically
                """
            )

    # Component order here must match generate_speech's positional parameters.
    generation_inputs = [
        text_input,
        prompt_audio,
        prompt_text,
        cfg_value,
        inference_timesteps,
        normalize,
        denoise,
        retry_badcase,
        retry_badcase_max_times,
        retry_badcase_ratio_threshold,
    ]

    # Run synthesis when the button is pressed and route the file to the player.
    generate_btn.click(
        fn=generate_speech,
        inputs=generation_inputs,
        outputs=audio_output,
        show_progress="full",
    )
# Start the web server only when this file is executed as a script, so that
# importing the module (for reuse of `demo` or for tooling) does not block
# inside launch().
if __name__ == "__main__":
    demo.launch()