Borio047 committed on
Commit
ab49f1b
·
verified ·
1 Parent(s): a6968e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -51
app.py CHANGED
@@ -1,17 +1,15 @@
1
  import gradio as gr
2
  import numpy as np
3
-
4
- from f5_tts.api import F5TTS # Official high-level API
5
 
6
 
7
  # -----------------------
8
- # Load model ONCE (global)
9
  # -----------------------
10
- # Use only one TTS model (F5-TTS) + vocoder, on CPU
11
  f5 = F5TTS(
12
- model_type="F5-TTS", # or "E2-TTS" if you prefer that model
13
- vocoder_name="vocos", # default vocoder used in examples
14
- device="cpu", # force CPU (free Space has no GPU)
15
  )
16
 
17
 
@@ -24,37 +22,28 @@ def clone_voice(
24
  target_rms,
25
  ):
26
  if ref_audio is None:
27
- raise gr.Error("Please upload a short reference audio (5–15 seconds).")
28
-
29
- if not gen_text.strip():
30
- raise gr.Error("Please enter the text to generate.")
31
 
32
- # Very important on CPU: avoid ASR.
33
- # If ref_text is empty, F5-TTS may call a transcription model (slow + heavy).
34
  if not ref_text.strip():
35
  raise gr.Error(
36
- "Please type the transcript of the reference audio (ref_text). "
37
- "This avoids loading a heavy ASR model and keeps it faster on CPU."
38
  )
39
 
40
- ref_path = ref_audio # because we will use type='filepath' for Audio
 
 
 
41
 
42
- # Call the F5TTS API.
43
- # Key speed knobs:
44
- # - nfe_step: fewer steps = faster, slightly lower quality
45
- # - speed: >1.0 = faster speaking
46
  wav, sr, _ = f5.infer(
47
  ref_file=ref_path,
48
  ref_text=ref_text,
49
  gen_text=gen_text,
50
- nfe_step=int(nfe_step), # e.g. 12–24 is reasonable on CPU
51
- speed=float(speed), # speaking rate
52
- target_rms=float(target_rms), # audio loudness
53
- sway_sampling_coef=-1, # default
54
- cfg_strength=2.0, # default
55
  )
56
 
57
- # Gradio expects (sample_rate, np.array)
58
  return sr, np.array(wav, dtype=np.float32)
59
 
60
 
@@ -64,32 +53,34 @@ def clone_voice(
64
  with gr.Blocks() as demo:
65
  gr.Markdown(
66
  """
67
- # F5-TTS Voice Cloner – CPU Optimized
68
 
69
- ⚠️ **Free CPU tip:**
70
- - Use **5–15 seconds** of clean reference audio.
71
- - **Always fill in the reference text** (what you said in that clip) to avoid slow ASR.
72
- - Generate only **1–2 sentences** at a time.
73
- - Lower **NFE steps** → faster, slightly lower quality.
74
  """
75
  )
76
 
77
  with gr.Row():
78
  with gr.Column():
79
  ref_audio = gr.Audio(
 
80
  sources=["upload"],
81
- type="filepath",
82
- label="Reference audio (5–15s of your voice)",
83
  )
 
84
  ref_text = gr.Textbox(
85
- label="Reference text (exact words in the reference audio)",
86
- lines=2,
87
- placeholder="Type exactly what you said in the reference clip...",
88
  )
 
89
  gen_text = gr.Textbox(
90
- label="Text to generate in the same voice",
91
- lines=3,
92
- placeholder="Write 1–2 short sentences...",
93
  )
94
 
95
  nfe_step = gr.Slider(
@@ -97,8 +88,7 @@ with gr.Blocks() as demo:
97
  maximum=32,
98
  value=16,
99
  step=2,
100
- label="Quality vs Speed (NFE steps – lower = faster)",
101
- info="Try 12–16 on CPU. Higher gives better quality but is slower.",
102
  )
103
 
104
  speed = gr.Slider(
@@ -106,7 +96,7 @@ with gr.Blocks() as demo:
106
  maximum=1.4,
107
  value=1.0,
108
  step=0.05,
109
- label="Speaking speed",
110
  )
111
 
112
  target_rms = gr.Slider(
@@ -114,23 +104,20 @@ with gr.Blocks() as demo:
114
  maximum=0.3,
115
  value=0.1,
116
  step=0.01,
117
- label="Volume target (RMS)",
118
  )
119
 
120
  generate_btn = gr.Button("Generate")
121
 
122
  with gr.Column():
123
- output_audio = gr.Audio(
124
- label="Cloned output",
125
- autoplay=False,
126
- )
127
 
128
  generate_btn.click(
129
  fn=clone_voice,
130
  inputs=[ref_audio, ref_text, gen_text, nfe_step, speed, target_rms],
131
- outputs=output_audio,
132
  )
133
 
134
- # Entry point for Spaces
135
  if __name__ == "__main__":
136
- demo.launch()
 
1
  import gradio as gr
2
  import numpy as np
3
+ from f5_tts.api import F5TTS
 
4
 
5
 
6
  # -----------------------
7
+ # Load model ONCE globally
8
  # -----------------------
9
+ # This constructor works for f5-tts==1.1.10 (NO model_type argument).
10
  f5 = F5TTS(
11
+ vocoder_name="vocos", # default vocoder
12
+ device="cpu" # CPU only
 
13
  )
14
 
15
 
 
22
  target_rms,
23
  ):
24
  if ref_audio is None:
25
+ raise gr.Error("Please upload a 5–15 second reference audio.")
 
 
 
26
 
 
 
27
  if not ref_text.strip():
28
  raise gr.Error(
29
+ "Please enter the EXACT transcript of your reference audio.\n"
30
+ "This avoids using a slow ASR model on CPU."
31
  )
32
 
33
+ if not gen_text.strip():
34
+ raise gr.Error("Please enter the text you want to generate.")
35
+
36
+ ref_path = ref_audio # because type='filepath'
37
 
 
 
 
 
38
  wav, sr, _ = f5.infer(
39
  ref_file=ref_path,
40
  ref_text=ref_text,
41
  gen_text=gen_text,
42
+ nfe_step=int(nfe_step), # lower → faster
43
+ speed=float(speed), # speaking speed
44
+ target_rms=float(target_rms)
 
 
45
  )
46
 
 
47
  return sr, np.array(wav, dtype=np.float32)
48
 
49
 
 
53
  with gr.Blocks() as demo:
54
  gr.Markdown(
55
  """
56
+ # F5-TTS Voice Cloner — Optimized for Free CPU
57
 
58
+ **Tips for best speed on CPU Spaces:**
59
+ - Upload **5–15 seconds** of clean speech.
60
+ - ALWAYS fill the **Reference Text** (do NOT let ASR run).
61
+ - Generate **1–2 sentences** at a time.
62
+ - Lower **NFE Steps** → faster (start with 12–16).
63
  """
64
  )
65
 
66
  with gr.Row():
67
  with gr.Column():
68
  ref_audio = gr.Audio(
69
+ label="Reference audio (5–15 seconds)",
70
  sources=["upload"],
71
+ type="filepath"
 
72
  )
73
+
74
  ref_text = gr.Textbox(
75
+ label="Reference text (transcription of the reference audio)",
76
+ placeholder="Type EXACTLY what you said in the audio...",
77
+ lines=2
78
  )
79
+
80
  gen_text = gr.Textbox(
81
+ label="Text to synthesize",
82
+ placeholder="Enter 1–2 sentences...",
83
+ lines=3
84
  )
85
 
86
  nfe_step = gr.Slider(
 
88
  maximum=32,
89
  value=16,
90
  step=2,
91
+ label="NFE steps (Lower = faster)",
 
92
  )
93
 
94
  speed = gr.Slider(
 
96
  maximum=1.4,
97
  value=1.0,
98
  step=0.05,
99
+ label="Speaking speed"
100
  )
101
 
102
  target_rms = gr.Slider(
 
104
  maximum=0.3,
105
  value=0.1,
106
  step=0.01,
107
+ label="Volume (RMS)"
108
  )
109
 
110
  generate_btn = gr.Button("Generate")
111
 
112
  with gr.Column():
113
+ output_audio = gr.Audio(label="Output audio")
 
 
 
114
 
115
  generate_btn.click(
116
  fn=clone_voice,
117
  inputs=[ref_audio, ref_text, gen_text, nfe_step, speed, target_rms],
118
+ outputs=output_audio
119
  )
120
 
121
+
122
  if __name__ == "__main__":
123
+ demo.launch()