Willing to explore Chatterbox

#10
Files changed (48) hide show
  1. .gitignore +0 -1
  2. app.py +18 -34
  3. chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc +0 -0
  4. chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc +0 -0
  5. chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc +0 -0
  6. chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc +0 -0
  7. chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc +0 -0
  8. chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc +0 -0
  9. chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc +0 -0
  10. chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc +0 -0
  11. chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc +0 -0
  12. chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc +0 -0
  13. chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc +0 -0
  14. chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc +0 -0
  15. chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc +0 -0
  16. chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc +0 -0
  17. chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc +0 -0
  18. chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc +0 -0
  19. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc +0 -0
  20. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc +0 -0
  21. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc +0 -0
  22. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc +0 -0
  23. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc +0 -0
  24. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc +0 -0
  25. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc +0 -0
  26. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc +0 -0
  27. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc +0 -0
  28. chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc +0 -0
  29. chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc +0 -0
  30. chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc +0 -0
  31. chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc +0 -0
  32. chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc +0 -0
  33. chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc +0 -0
  34. chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc +0 -0
  35. chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc +0 -0
  36. chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc +0 -0
  37. chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc +0 -0
  38. chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc +0 -0
  39. chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc +0 -0
  40. chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc +0 -0
  41. chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc +0 -0
  42. chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc +0 -0
  43. chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc +0 -0
  44. chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc +0 -0
  45. chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc +0 -0
  46. chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc +0 -0
  47. chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc +0 -0
  48. chatterbox/src/chatterbox/tts.py +5 -31
.gitignore DELETED
@@ -1 +0,0 @@
1
- __pycache__
 
 
app.py CHANGED
@@ -45,30 +45,25 @@ def set_seed(seed: int):
45
  @spaces.GPU
46
  def generate_tts_audio(
47
  text_input: str,
48
- audio_prompt_path_input: str = None,
49
- exaggeration_input: float = 0.5,
50
- temperature_input: float = 0.8,
51
- seed_num_input: int = 0,
52
- cfgw_input: float = 0.5,
53
- vad_trim_input: bool = False,
54
  ) -> tuple[int, np.ndarray]:
55
  """
56
- Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
57
-
58
- This tool synthesizes natural-sounding speech from input text. When a reference audio file
59
- is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
60
- maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
61
 
62
  Args:
63
- text_input (str): The text to synthesize into speech (maximum 300 characters)
64
- audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
65
- exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
66
- temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
67
- seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
68
- cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
69
 
70
  Returns:
71
- tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
72
  """
73
  current_model = get_or_load_model()
74
 
@@ -79,21 +74,12 @@ def generate_tts_audio(
79
  set_seed(int(seed_num_input))
80
 
81
  print(f"Generating audio for text: '{text_input[:50]}...'")
82
-
83
- # Handle optional audio prompt
84
- generate_kwargs = {
85
- "exaggeration": exaggeration_input,
86
- "temperature": temperature_input,
87
- "cfg_weight": cfgw_input,
88
- "vad_trim": vad_trim_input,
89
- }
90
-
91
- if audio_prompt_path_input:
92
- generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
93
-
94
  wav = current_model.generate(
95
  text_input[:300], # Truncate text to max chars
96
- **generate_kwargs
 
 
 
97
  )
98
  print("Audio generation complete.")
99
  return (current_model.sr, wav.squeeze(0).numpy())
@@ -128,7 +114,6 @@ with gr.Blocks() as demo:
128
  with gr.Accordion("More options", open=False):
129
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
130
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
131
- vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
132
 
133
  run_btn = gr.Button("Generate", variant="primary")
134
 
@@ -144,9 +129,8 @@ with gr.Blocks() as demo:
144
  temp,
145
  seed_num,
146
  cfg_weight,
147
- vad_trim,
148
  ],
149
  outputs=[audio_output],
150
  )
151
 
152
- demo.launch(mcp_server=True)
 
45
  @spaces.GPU
46
  def generate_tts_audio(
47
  text_input: str,
48
+ audio_prompt_path_input: str,
49
+ exaggeration_input: float,
50
+ temperature_input: float,
51
+ seed_num_input: int,
52
+ cfgw_input: float
 
53
  ) -> tuple[int, np.ndarray]:
54
  """
55
+ Generates TTS audio using the ChatterboxTTS model.
 
 
 
 
56
 
57
  Args:
58
+ text_input: The text to synthesize (max 300 characters).
59
+ audio_prompt_path_input: Path to the reference audio file.
60
+ exaggeration_input: Exaggeration parameter for the model.
61
+ temperature_input: Temperature parameter for the model.
62
+ seed_num_input: Random seed (0 for random).
63
+ cfgw_input: CFG/Pace weight.
64
 
65
  Returns:
66
+ A tuple containing the sample rate (int) and the audio waveform (numpy.ndarray).
67
  """
68
  current_model = get_or_load_model()
69
 
 
74
  set_seed(int(seed_num_input))
75
 
76
  print(f"Generating audio for text: '{text_input[:50]}...'")
 
 
 
 
 
 
 
 
 
 
 
 
77
  wav = current_model.generate(
78
  text_input[:300], # Truncate text to max chars
79
+ audio_prompt_path=audio_prompt_path_input,
80
+ exaggeration=exaggeration_input,
81
+ temperature=temperature_input,
82
+ cfg_weight=cfgw_input,
83
  )
84
  print("Audio generation complete.")
85
  return (current_model.sr, wav.squeeze(0).numpy())
 
114
  with gr.Accordion("More options", open=False):
115
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
116
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
 
117
 
118
  run_btn = gr.Button("Generate", variant="primary")
119
 
 
129
  temp,
130
  seed_num,
131
  cfg_weight,
 
132
  ],
133
  outputs=[audio_output],
134
  )
135
 
136
+ demo.launch()
chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (275 Bytes). View file
 
chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc ADDED
Binary file (13.3 kB). View file
 
chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc ADDED
Binary file (858 Bytes). View file
 
chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc ADDED
Binary file (5.44 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (294 Bytes). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc ADDED
Binary file (16.9 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc ADDED
Binary file (2.7 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc ADDED
Binary file (13.7 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc ADDED
Binary file (13.3 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc ADDED
Binary file (26.3 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc ADDED
Binary file (13.7 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc ADDED
Binary file (24 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc ADDED
Binary file (21.3 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc ADDED
Binary file (6.46 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc ADDED
Binary file (14.7 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc ADDED
Binary file (3.58 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc ADDED
Binary file (15.7 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc ADDED
Binary file (5.54 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc ADDED
Binary file (17.3 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc ADDED
Binary file (11.2 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc ADDED
Binary file (6.24 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc ADDED
Binary file (18.9 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc ADDED
Binary file (15.6 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc ADDED
Binary file (1.93 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc ADDED
Binary file (6.25 kB). View file
 
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc ADDED
Binary file (4.05 kB). View file
 
chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.37 kB). View file
 
chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc ADDED
Binary file (7.94 kB). View file
 
chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc ADDED
Binary file (1.34 kB). View file
 
chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc ADDED
Binary file (15.8 kB). View file
 
chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc ADDED
Binary file (7.08 kB). View file
 
chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc ADDED
Binary file (4.65 kB). View file
 
chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc ADDED
Binary file (5.37 kB). View file
 
chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc ADDED
Binary file (2.54 kB). View file
 
chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc ADDED
Binary file (12.6 kB). View file
 
chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc ADDED
Binary file (1.27 kB). View file
 
chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (242 Bytes). View file
 
chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc ADDED
Binary file (3.1 kB). View file
 
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (281 Bytes). View file
 
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc ADDED
Binary file (859 Bytes). View file
 
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc ADDED
Binary file (3.59 kB). View file
 
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc ADDED
Binary file (18.7 kB). View file
 
chatterbox/src/chatterbox/tts.py CHANGED
@@ -2,12 +2,10 @@ from dataclasses import dataclass
2
  from pathlib import Path
3
 
4
  import librosa
5
- import numpy as np
6
  import torch
7
  import perth
8
  import torch.nn.functional as F
9
  from huggingface_hub import hf_hub_download
10
- from silero_vad import load_silero_vad, get_speech_timestamps
11
 
12
  from .models.t3 import T3
13
  from .models.s3tokenizer import S3_SR, drop_invalid_tokens
@@ -123,7 +121,6 @@ class ChatterboxTTS:
123
  self.device = device
124
  self.conds = conds
125
  self.watermarker = perth.PerthImplicitWatermarker()
126
- self.silero_vad = load_silero_vad()
127
 
128
  @classmethod
129
  def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
@@ -165,33 +162,11 @@ class ChatterboxTTS:
165
 
166
  return cls.from_local(Path(local_path).parent, device)
167
 
168
- def trim_excess_silence(self, wav, sr):
169
- "Trim excess silence from speech. Input must be a multiple of 16kHz."
170
- assert sr % 16_000 == 0, "Silero requires an integer multiple of 16kHz"
171
 
172
- # Get VAD as sample-level bool array
173
- silero_regions = get_speech_timestamps(wav, self.silero_vad, sampling_rate=sr)
174
- vad = np.zeros_like(wav)
175
- for region in silero_regions:
176
- vad[region["start"]:region["end"]] = 1
177
-
178
- # Dilate VAD
179
- max_silence_ms = 400
180
- cfilter = np.ones(int(sr * max_silence_ms / (2 * 1000)))
181
- dilated_vad = np.convolve(vad, cfilter, mode="same") > 0
182
-
183
- # Trim out silence
184
- return wav[dilated_vad]
185
-
186
- def prepare_conditionals(self, wav_fpath, exaggeration=0.5, vad_trim=False):
187
- # Load reference wav at high SR and trim silence
188
- ref_wav, highres_sr = librosa.load(wav_fpath, sr=48_000)
189
- if vad_trim:
190
- ref_wav = self.trim_excess_silence(ref_wav, highres_sr)
191
-
192
- # Resample down
193
- s3gen_ref_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3GEN_SR)
194
- ref_16k_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3_SR)
195
 
196
  s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
197
  s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
@@ -220,10 +195,9 @@ class ChatterboxTTS:
220
  exaggeration=0.5,
221
  cfg_weight=0.5,
222
  temperature=0.8,
223
- vad_trim=False,
224
  ):
225
  if audio_prompt_path:
226
- self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, vad_trim=vad_trim)
227
  else:
228
  assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"
229
 
 
2
  from pathlib import Path
3
 
4
  import librosa
 
5
  import torch
6
  import perth
7
  import torch.nn.functional as F
8
  from huggingface_hub import hf_hub_download
 
9
 
10
  from .models.t3 import T3
11
  from .models.s3tokenizer import S3_SR, drop_invalid_tokens
 
121
  self.device = device
122
  self.conds = conds
123
  self.watermarker = perth.PerthImplicitWatermarker()
 
124
 
125
  @classmethod
126
  def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
 
162
 
163
  return cls.from_local(Path(local_path).parent, device)
164
 
165
+ def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
166
+ ## Load reference wav
167
+ s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)
168
 
169
+ ref_16k_wav = librosa.resample(s3gen_ref_wav, orig_sr=S3GEN_SR, target_sr=S3_SR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
172
  s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
 
195
  exaggeration=0.5,
196
  cfg_weight=0.5,
197
  temperature=0.8,
 
198
  ):
199
  if audio_prompt_path:
200
+ self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
201
  else:
202
  assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"
203