|
|
import gradio as gr |
|
|
import numpy as np |
|
|
from f5_tts.api import F5TTS |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the F5-TTS model once at import time; clone_voice() reuses this single
# instance for every request instead of constructing a model per call.
# device="cpu" is passed explicitly so inference runs on the CPU.
f5 = F5TTS(device="cpu")
|
|
|
|
|
|
|
|
def clone_voice(
    ref_audio,
    ref_text,
    gen_text,
    nfe_step,
    speed,
    target_rms,
):
    """Synthesize ``gen_text`` using the voice from a reference recording.

    Validates the three user inputs, forwards them to the module-level
    ``f5`` model and returns the result in the ``(sample_rate, samples)``
    form that is handed to the output audio component.

    Args:
        ref_audio: Path of the uploaded reference clip, or ``None`` when
            nothing was uploaded (the input component uses
            ``type="filepath"``).
        ref_text: Transcript of the reference clip. Must not be blank.
        gen_text: Text to synthesize. Must not be blank.
        nfe_step: Slider value, passed to ``f5.infer`` as ``int``.
        speed: Slider value, passed to ``f5.infer`` as ``float``.
        target_rms: Slider value, passed to ``f5.infer`` as ``float``.

    Returns:
        A ``(sr, wav)`` tuple: the sample rate reported by ``f5.infer`` and
        the waveform converted to a ``float32`` NumPy array.

    Raises:
        gr.Error: If the reference audio is missing, or if either text
            field is empty / whitespace-only.
    """
    if ref_audio is None:
        raise gr.Error("Please upload a 5–15 second reference audio.")

    # `or ""` guards against a None value so the user still gets the
    # friendly gr.Error rather than an AttributeError from None.strip().
    if not (ref_text or "").strip():
        raise gr.Error(
            "Please enter the EXACT transcript of your reference audio.\n"
            "This avoids using a slow ASR model on CPU."
        )

    if not (gen_text or "").strip():
        raise gr.Error("Please enter the text you want to generate.")

    # Slider values are coerced explicitly before being forwarded.
    # The third value returned by infer() is not needed here.
    wav, sr, _ = f5.infer(
        ref_file=ref_audio,
        ref_text=ref_text,
        gen_text=gen_text,
        nfe_step=int(nfe_step),
        speed=float(speed),
        target_rms=float(target_rms),
    )

    return sr, np.array(wav, dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UI definition: a single row with the input controls on the left and the
# generated audio on the right. The Generate button calls clone_voice().
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # F5-TTS Voice Cloner – Optimized for Free CPU

        **Tips for best speed:**
        - Upload **5–15 seconds** of clean speech.
        - ALWAYS fill the **Reference Text** (do NOT rely on ASR).
        - Generate **1–2 sentences** at a time.
        - Lower **NFE Steps** → faster (start with 12–16).
        """
    )

    with gr.Row():
        # Left column: every input of clone_voice(), in argument order.
        with gr.Column():
            ref_audio = gr.Audio(
                label="Reference audio (5–15 seconds)",
                sources=["upload"],
                # clone_voice() passes this value straight through as a file path.
                type="filepath",
            )

            ref_text = gr.Textbox(
                label="Reference text (transcription of the reference audio)",
                placeholder="Type EXACTLY what you said in the audio...",
                lines=2,
            )

            gen_text = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter 1–2 sentences...",
                lines=3,
            )

            nfe_step = gr.Slider(
                minimum=4,
                maximum=32,
                value=16,
                step=2,
                label="NFE steps (Lower = faster)",
            )

            speed = gr.Slider(
                minimum=0.7,
                maximum=1.4,
                value=1.0,
                step=0.05,
                label="Speaking speed",
            )

            target_rms = gr.Slider(
                minimum=0.05,
                maximum=0.3,
                value=0.1,
                step=0.01,
                label="Volume (RMS)",
            )

            generate_btn = gr.Button("Generate")

        # Right column: receives the (sample_rate, samples) tuple returned
        # by clone_voice().
        with gr.Column():
            output_audio = gr.Audio(label="Output audio")

    # Event listeners are registered while the Blocks context is still open.
    # The inputs list order must match clone_voice()'s parameter order.
    generate_btn.click(
        fn=clone_voice,
        inputs=[ref_audio, ref_text, gen_text, nfe_step, speed, target_rms],
        outputs=output_audio,
    )
|
|
|
|
|
|
|
|
# Start the Gradio app only when this file is executed as a script, so that
# importing the module does not launch a server.
if __name__ == "__main__":
    demo.launch()
|
|
|