Borio047's picture
Update app.py
bf9b81d verified
import gradio as gr
import numpy as np
from f5_tts.api import F5TTS
# -----------------------
# Shared model instance (created once at import time)
# -----------------------
# Only `device` is passed to the constructor -- no vocoder_name argument --
# so this stays compatible with the older F5TTS API. The base model and the
# vocoder are therefore whatever the library picks by default.
_DEVICE = "cpu"

# Per the original author's note, a bare F5TTS() call is an alternative.
f5 = F5TTS(device=_DEVICE)
def clone_voice(
    ref_audio,
    ref_text,
    gen_text,
    nfe_step,
    speed,
    target_rms,
):
    """Generate speech for ``gen_text`` using the uploaded reference voice.

    Args:
        ref_audio: Path of the reference recording (the Audio component is
            configured with ``type="filepath"``), or ``None`` when nothing
            was uploaded.
        ref_text: Transcript of the reference recording. Must be non-blank.
        gen_text: Text to synthesize. Must be non-blank.
        nfe_step: Number of NFE steps; converted to ``int`` before use.
        speed: Speaking speed; converted to ``float`` before use.
        target_rms: Target RMS volume; converted to ``float`` before use.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is the waveform
        returned by ``f5.infer`` converted to a ``float32`` NumPy array.

    Raises:
        gr.Error: If the reference audio is missing, or if either text field
            is missing or contains only whitespace.
    """
    if ref_audio is None:
        raise gr.Error("Please upload a 5–15 second reference audio.")

    # A text value may arrive as None (e.g. from an API call); treat it like
    # an empty box so the caller gets the friendly gr.Error below instead of
    # an AttributeError from None.strip().
    if not (ref_text or "").strip():
        raise gr.Error(
            "Please enter the EXACT transcript of your reference audio.\n"
            "This avoids using a slow ASR model on CPU."
        )
    if not (gen_text or "").strip():
        raise gr.Error("Please enter the text you want to generate.")

    # With type='filepath' the Audio component hands us a path string.
    ref_path = ref_audio

    # The third element of the result is not needed here. Parameters that are
    # not passed (cfg_strength, sway_sampling_coef, etc.) keep their defaults.
    wav, sr, _ = f5.infer(
        ref_file=ref_path,
        ref_text=ref_text,
        gen_text=gen_text,
        nfe_step=int(nfe_step),  # lower -> faster
        speed=float(speed),  # speaking speed
        target_rms=float(target_rms),
    )
    return sr, np.array(wav, dtype=np.float32)
# -----------------------
# Gradio UI
# -----------------------
# Intro text shown at the top of the page. It is kept flush-left at module
# level so that re-indenting the UI code below can never add leading spaces
# to the Markdown source. The dash and arrow characters were previously
# mis-encoded ("β€”", "β†’") and are restored here.
_INTRO_MARKDOWN = """
# F5-TTS Voice Cloner — Optimized for Free CPU
**Tips for best speed:**
- Upload **5–15 seconds** of clean speech.
- ALWAYS fill the **Reference Text** (do NOT rely on ASR).
- Generate **1–2 sentences** at a time.
- Lower **NFE Steps** → faster (start with 12–16).
"""

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: every input plus the trigger button.
        with gr.Column():
            ref_audio = gr.Audio(
                label="Reference audio (5–15 seconds)",
                sources=["upload"],
                # clone_voice expects a file path, not raw samples.
                type="filepath",
            )
            ref_text = gr.Textbox(
                label="Reference text (transcription of the reference audio)",
                placeholder="Type EXACTLY what you said in the audio...",
                lines=2,
            )
            gen_text = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter 1–2 sentences...",
                lines=3,
            )
            nfe_step = gr.Slider(
                minimum=4,
                maximum=32,
                value=16,
                step=2,
                label="NFE steps (Lower = faster)",
            )
            speed = gr.Slider(
                minimum=0.7,
                maximum=1.4,
                value=1.0,
                step=0.05,
                label="Speaking speed",
            )
            target_rms = gr.Slider(
                minimum=0.05,
                maximum=0.3,
                value=0.1,
                step=0.01,
                label="Volume (RMS)",
            )
            generate_btn = gr.Button("Generate")

        # Right column: the synthesized result.
        with gr.Column():
            output_audio = gr.Audio(label="Output audio")

    # The order of `inputs` must match the positional parameters of
    # clone_voice.
    generate_btn.click(
        fn=clone_voice,
        inputs=[ref_audio, ref_text, gen_text, nfe_step, speed, target_rms],
        outputs=output_audio,
    )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()