File size: 3,319 Bytes
f1e8e3e 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb bf9b81d 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b bf9b81d 88d4acb ab49f1b bf9b81d 88d4acb ab49f1b 88d4acb bf9b81d ab49f1b bf9b81d ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b 88d4acb ab49f1b f1e8e3e ab49f1b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import gradio as gr
import numpy as np
from f5_tts.api import F5TTS
# -----------------------
# Load model ONCE globally
# -----------------------
# The model object is created at import time and stored in the module-level
# name `f5`, so clone_voice() reuses this single instance on every call
# instead of constructing a new one per request.
# Works with the older F5TTS API (no vocoder_name argument).
# Only `device` is passed; per the original author's note the library then
# falls back to its default base model + default vocoder.
f5 = F5TTS(device="cpu")  # pinned to CPU; plain F5TTS() is the author's suggested alternative
def clone_voice(
    ref_audio,
    ref_text,
    gen_text,
    nfe_step,
    speed,
    target_rms,
):
    """Generate speech for ``gen_text`` using the voice in ``ref_audio``.

    Validates the three required inputs, then delegates to the module-level
    ``f5`` model's ``infer`` method.

    Args:
        ref_audio: Path to the reference recording. The UI's Audio component
            is configured with ``type="filepath"``, so this is a path string,
            or ``None`` when nothing was uploaded.
        ref_text: Transcript of the reference recording. Required so that
            no ASR model has to run on CPU.
        gen_text: Text to synthesize.
        nfe_step: NFE step count; coerced to ``int`` (lower is faster).
        speed: Speaking speed; coerced to ``float``.
        target_rms: Target RMS level; coerced to ``float``.

    Returns:
        ``(sr, wav)`` where ``sr`` is the second value returned by
        ``f5.infer`` (presumably the sample rate - confirm against the
        F5TTS API) and ``wav`` is the generated waveform as a float32
        NumPy array.

    Raises:
        gr.Error: If the reference audio, the reference text, or the text
            to generate is missing or blank.
    """
    # Reject both "nothing uploaded" (None) and an empty path string.
    if not ref_audio:
        raise gr.Error("Please upload a 5–15 second reference audio.")

    # `or ""` guards against None (e.g. from a programmatic/API call), which
    # would otherwise raise AttributeError instead of a user-facing error.
    if not (ref_text or "").strip():
        raise gr.Error(
            "Please enter the EXACT transcript of your reference audio.\n"
            "This avoids using a slow ASR model on CPU."
        )

    if not (gen_text or "").strip():
        raise gr.Error("Please enter the text you want to generate.")

    wav, sr, _ = f5.infer(
        ref_file=ref_audio,  # already a path: the Audio component uses type="filepath"
        ref_text=ref_text,
        gen_text=gen_text,
        nfe_step=int(nfe_step),  # lower → faster
        speed=float(speed),  # speaking speed
        target_rms=float(target_rms),
        # other params use defaults (cfg_strength, sway_sampling_coef, etc.)
    )

    # asarray avoids a redundant copy when the model already returns float32.
    return sr, np.asarray(wav, dtype=np.float32)
# -----------------------
# Gradio UI
# -----------------------
# Layout: one row with two columns. The left column holds every input
# (reference audio, both text boxes, three sliders, the Generate button);
# the right column holds the generated audio player.
with gr.Blocks() as demo:
    # The Markdown text is assembled from explicit line literals so its
    # content does not depend on how this source file is indented.
    gr.Markdown(
        "\n"
        "# F5-TTS Voice Cloner – Optimized for Free CPU\n"
        "**Tips for best speed:**\n"
        "- Upload **5–15 seconds** of clean speech.\n"
        "- ALWAYS fill the **Reference Text** (do NOT rely on ASR).\n"
        "- Generate **1–2 sentences** at a time.\n"
        "- Lower **NFE Steps** → faster (start with 12–16).\n"
    )

    with gr.Row():
        with gr.Column():
            # type="filepath" means clone_voice receives a path string.
            ref_audio = gr.Audio(
                label="Reference audio (5–15 seconds)",
                sources=["upload"],
                type="filepath",
            )
            ref_text = gr.Textbox(
                label="Reference text (transcription of the reference audio)",
                placeholder="Type EXACTLY what you said in the audio...",
                lines=2,
            )
            gen_text = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter 1–2 sentences...",
                lines=3,
            )
            nfe_step = gr.Slider(
                minimum=4,
                maximum=32,
                value=16,
                step=2,
                label="NFE steps (Lower = faster)",
            )
            speed = gr.Slider(
                minimum=0.7,
                maximum=1.4,
                value=1.0,
                step=0.05,
                label="Speaking speed",
            )
            target_rms = gr.Slider(
                minimum=0.05,
                maximum=0.3,
                value=0.1,
                step=0.01,
                label="Volume (RMS)",
            )
            generate_btn = gr.Button("Generate")

        with gr.Column():
            output_audio = gr.Audio(label="Output audio")

    # Registered inside the Blocks context; the input order must match
    # clone_voice's positional parameters.
    generate_btn.click(
        fn=clone_voice,
        inputs=[ref_audio, ref_text, gen_text, nfe_step, speed, target_rms],
        outputs=output_audio,
    )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|