VoxCPM-0.5B

Running

App Files Files Community

VoxCPM-0.5B / app.py

akhaliq HF Staff

Update app.py

2ce4e07 verified 3 months ago

raw

history blame contribute delete

6.27 kB

	import gradio as gr
	import soundfile as sf
	import numpy as np
	from voxcpm import VoxCPM
	import tempfile
	import os
	import spaces

	# Load the model once at startup
	model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")

	@spaces.GPU(duration=120)
	def generate_speech(
	text,
	prompt_audio,
	prompt_text,
	cfg_value,
	inference_timesteps,
	normalize,
	denoise,
	retry_badcase,
	retry_badcase_max_times,
	retry_badcase_ratio_threshold
	):
	if not text:
	gr.Warning("Please enter text to generate speech")
	return None

	# Handle prompt audio if provided
	prompt_wav_path = None
	if prompt_audio is not None:
	prompt_wav_path = prompt_audio

	# Handle empty prompt text
	if prompt_text and prompt_text.strip() == "":
	prompt_text = None

	try:
	# Generate speech
	wav = model.generate(
	text=text,
	prompt_wav_path=prompt_wav_path,
	prompt_text=prompt_text,
	cfg_value=cfg_value,
	inference_timesteps=int(inference_timesteps),
	normalize=normalize,
	denoise=denoise,
	retry_badcase=retry_badcase,
	retry_badcase_max_times=int(retry_badcase_max_times),
	retry_badcase_ratio_threshold=retry_badcase_ratio_threshold
	)

	# Create temporary file for audio output
	with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
	sf.write(tmp_file.name, wav, 16000)
	return tmp_file.name

	except Exception as e:
	gr.Error(f"Error generating speech: {str(e)}")
	return None

	# Create Gradio interface
	with gr.Blocks(title="VoxCPM Text-to-Speech", theme=gr.themes.Soft()) as demo:
	gr.Markdown(
	"""
	# 🎙️ VoxCPM Text-to-Speech

	Generate highly expressive speech using VoxCPM-0.5B model. Optionally clone voices by providing reference audio.

	[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
	"""
	)

	with gr.Row():
	with gr.Column(scale=1):
	# Input section
	text_input = gr.Textbox(
	label="Text to Synthesize",
	placeholder="Enter the text you want to convert to speech...",
	lines=3,
	value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech."
	)

	with gr.Accordion("Voice Cloning", open=False):
	prompt_audio = gr.Audio(
	label="Reference Audio (Upload a reference audio file for voice cloning)",
	type="filepath",
	sources=["upload"]
	)
	prompt_text = gr.Textbox(
	label="Reference Text",
	placeholder="Text corresponding to the reference audio",
	lines=2
	)

	with gr.Accordion("Advanced Settings", open=False):
	cfg_value = gr.Slider(
	minimum=0.5,
	maximum=5.0,
	value=2.0,
	step=0.1,
	label="CFG Value",
	info="LM guidance on LocDiT, higher for better adherence to prompt"
	)

	inference_timesteps = gr.Slider(
	minimum=5,
	maximum=50,
	value=10,
	step=1,
	label="Inference Timesteps",
	info="Higher for better quality, lower for faster speed"
	)

	with gr.Row():
	normalize = gr.Checkbox(
	value=True,
	label="Normalize",
	info="Enable external TN tool"
	)
	denoise = gr.Checkbox(
	value=True,
	label="Denoise",
	info="Enable external Denoise tool"
	)
	retry_badcase = gr.Checkbox(
	value=True,
	label="Retry Bad Cases",
	info="Enable retrying for bad cases"
	)

	with gr.Row():
	retry_badcase_max_times = gr.Number(
	value=3,
	minimum=1,
	maximum=10,
	step=1,
	label="Max Retry Times"
	)
	retry_badcase_ratio_threshold = gr.Number(
	value=6.0,
	minimum=1.0,
	maximum=10.0,
	step=0.5,
	label="Retry Ratio Threshold"
	)

	generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

	with gr.Column(scale=1):
	# Output section
	audio_output = gr.Audio(
	label="Generated Speech",
	type="filepath",
	autoplay=False
	)

	gr.Markdown(
	"""
	### Tips:
	- For voice cloning, upload a clear reference audio (3-10 seconds recommended)
	- Higher CFG values provide better prompt adherence but may affect naturalness
	- Increase inference timesteps for better quality at the cost of speed
	- The retry mechanism helps handle edge cases automatically
	"""
	)

	# Connect the generate button
	generate_btn.click(
	fn=generate_speech,
	inputs=[
	text_input,
	prompt_audio,
	prompt_text,
	cfg_value,
	inference_timesteps,
	normalize,
	denoise,
	retry_badcase,
	retry_badcase_max_times,
	retry_badcase_ratio_threshold
	],
	outputs=audio_output,
	show_progress="full"
	)

	demo.launch()