File size: 3,319 Bytes
f1e8e3e
88d4acb
ab49f1b
88d4acb
 
ab49f1b
88d4acb
bf9b81d
 
 
88d4acb
 
 
 
 
 
 
 
 
 
 
ab49f1b
88d4acb
 
 
ab49f1b
 
88d4acb
 
ab49f1b
 
 
bf9b81d
88d4acb
 
 
 
 
ab49f1b
 
 
bf9b81d
88d4acb
 
 
 
 
 
 
 
 
 
 
ab49f1b
88d4acb
bf9b81d
ab49f1b
bf9b81d
ab49f1b
 
88d4acb
 
 
 
 
 
ab49f1b
88d4acb
ab49f1b
88d4acb
ab49f1b
88d4acb
ab49f1b
 
 
88d4acb
ab49f1b
88d4acb
ab49f1b
 
 
88d4acb
 
 
 
 
 
 
ab49f1b
88d4acb
 
 
 
 
 
 
ab49f1b
88d4acb
 
 
 
 
 
 
ab49f1b
88d4acb
 
 
 
 
ab49f1b
88d4acb
 
 
 
ab49f1b
88d4acb
 
ab49f1b
f1e8e3e
ab49f1b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import gradio as gr
import numpy as np
from f5_tts.api import F5TTS

# -----------------------
# Load model ONCE globally
# -----------------------
# The model is instantiated at import time and bound to the module-level
# name `f5`, so every call to clone_voice() reuses this single instance
# instead of constructing a new one per request.
# Works with the older F5TTS API (no vocoder_name argument).
# It will use default base model + default vocoder.
f5 = F5TTS(device="cpu")  # or simply F5TTS() if you like


def clone_voice(
    ref_audio,
    ref_text,
    gen_text,
    nfe_step,
    speed,
    target_rms,
):
    """Synthesize ``gen_text`` in the voice of the reference recording.

    Args:
        ref_audio: Path to the reference audio file (the Audio component is
            configured with ``type="filepath"``), or ``None`` when nothing
            was uploaded.
        ref_text: Transcript of the reference audio. Must be non-blank.
        gen_text: Text to synthesize. Must be non-blank.
        nfe_step: Value forwarded to ``f5.infer`` as ``nfe_step`` after
            conversion to ``int``.
        speed: Value forwarded to ``f5.infer`` as ``speed`` after conversion
            to ``float``.
        target_rms: Value forwarded to ``f5.infer`` as ``target_rms`` after
            conversion to ``float``.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``sample_rate`` is the
        second value returned by ``f5.infer`` and ``samples`` is the
        generated waveform as a ``float32`` NumPy array.

    Raises:
        gr.Error: If the reference audio is missing, or if either text
            input is missing or contains only whitespace.
    """
    if ref_audio is None:
        raise gr.Error("Please upload a 5–15 second reference audio.")

    # Text inputs may arrive as None (e.g. programmatic/API calls). Treat
    # that as empty so the caller gets the friendly gr.Error below rather
    # than an AttributeError from calling .strip() on None.
    ref_text = ref_text or ""
    gen_text = gen_text or ""

    if not ref_text.strip():
        raise gr.Error(
            "Please enter the EXACT transcript of your reference audio.\n"
            "This avoids using a slow ASR model on CPU."
        )

    if not gen_text.strip():
        raise gr.Error("Please enter the text you want to generate.")

    ref_path = ref_audio  # because we use type='filepath' in the Audio component

    # The third value returned by infer() is not needed here.
    wav, sr, _ = f5.infer(
        ref_file=ref_path,
        ref_text=ref_text,
        gen_text=gen_text,
        nfe_step=int(nfe_step),     # lower -> faster
        speed=float(speed),         # speaking speed
        target_rms=float(target_rms)
        # other params use defaults (cfg_strength, sway_sampling_coef, etc.)
    )

    return sr, np.array(wav, dtype=np.float32)


# -----------------------
# Gradio UI
# -----------------------
# Two-column layout: inputs and the Generate button on the left, the
# synthesized audio on the right.
with gr.Blocks() as demo:
    # The heading and tips use real "—" and "→" characters; the previous
    # text contained mis-decoded UTF-8 byte sequences in their place.
    gr.Markdown(
        """
        # F5-TTS Voice Cloner — Optimized for Free CPU

        **Tips for best speed:**
        - Upload **5–15 seconds** of clean speech.
        - ALWAYS fill the **Reference Text** (do NOT rely on ASR).
        - Generate **1–2 sentences** at a time.
        - Lower **NFE Steps** → faster (start with 12–16).
        """
    )

    with gr.Row():
        with gr.Column():
            # type="filepath" means clone_voice receives a path string.
            ref_audio = gr.Audio(
                label="Reference audio (5–15 seconds)",
                sources=["upload"],
                type="filepath"
            )

            ref_text = gr.Textbox(
                label="Reference text (transcription of the reference audio)",
                placeholder="Type EXACTLY what you said in the audio...",
                lines=2
            )

            gen_text = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter 1–2 sentences...",
                lines=3
            )

            nfe_step = gr.Slider(
                minimum=4,
                maximum=32,
                value=16,
                step=2,
                label="NFE steps (Lower = faster)",
            )

            speed = gr.Slider(
                minimum=0.7,
                maximum=1.4,
                value=1.0,
                step=0.05,
                label="Speaking speed"
            )

            target_rms = gr.Slider(
                minimum=0.05,
                maximum=0.3,
                value=0.1,
                step=0.01,
                label="Volume (RMS)"
            )

            generate_btn = gr.Button("Generate")

        with gr.Column():
            output_audio = gr.Audio(label="Output audio")

    # The input order must match clone_voice's positional parameters.
    generate_btn.click(
        fn=clone_voice,
        inputs=[ref_audio, ref_text, gen_text, nfe_step, speed, target_rms],
        outputs=output_audio
    )


# Start the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()