Spaces:

ResembleAI
/

Chatterbox

Running on Zero

@@ -45,30 +45,25 @@ def set_seed(seed: int):
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
-    audio_prompt_path_input: str = None,
-    exaggeration_input: float = 0.5,
-    temperature_input: float = 0.8,
-    seed_num_input: int = 0,
-    cfgw_input: float = 0.5,
-    vad_trim_input: bool = False,
 ) -> tuple[int, np.ndarray]:
     """
-    Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
-    This tool synthesizes natural-sounding speech from input text. When a reference audio file
-    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
-    maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
     Args:
-        text_input (str): The text to synthesize into speech (maximum 300 characters)
-        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
-        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
-        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
-        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
-        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5.
     Returns:
-        tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
     """
     current_model = get_or_load_model()
@@ -79,21 +74,12 @@ def generate_tts_audio(
         set_seed(int(seed_num_input))
     print(f"Generating audio for text: '{text_input[:50]}...'")
-    # Handle optional audio prompt
-    generate_kwargs = {
-        "exaggeration": exaggeration_input,
-        "temperature": temperature_input,
-        "cfg_weight": cfgw_input,
-        "vad_trim": vad_trim_input,
-    }
-    if audio_prompt_path_input:
-        generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
     wav = current_model.generate(
         text_input[:300],  # Truncate text to max chars
-        **generate_kwargs
     )
     print("Audio generation complete.")
     return (current_model.sr, wav.squeeze(0).numpy())
@@ -128,7 +114,6 @@ with gr.Blocks() as demo:
             with gr.Accordion("More options", open=False):
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
-                vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
             run_btn = gr.Button("Generate", variant="primary")
@@ -144,9 +129,8 @@ with gr.Blocks() as demo:
             temp,
             seed_num,
             cfg_weight,
-            vad_trim,
         ],
         outputs=[audio_output],
     )
-demo.launch(mcp_server=True)

 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
+    audio_prompt_path_input: str,
+    exaggeration_input: float,
+    temperature_input: float,
+    seed_num_input: int,
+    cfgw_input: float
 ) -> tuple[int, np.ndarray]:
     """
+    Generates TTS audio using the ChatterboxTTS model.
     Args:
+        text_input: The text to synthesize (max 300 characters).
+        audio_prompt_path_input: Path to the reference audio file.
+        exaggeration_input: Exaggeration parameter for the model.
+        temperature_input: Temperature parameter for the model.
+        seed_num_input: Random seed (0 for random).
+        cfgw_input: CFG/Pace weight.
     Returns:
+        A tuple containing the sample rate (int) and the audio waveform (numpy.ndarray).
     """
     current_model = get_or_load_model()
         set_seed(int(seed_num_input))
     print(f"Generating audio for text: '{text_input[:50]}...'")
     wav = current_model.generate(
         text_input[:300],  # Truncate text to max chars
+        audio_prompt_path=audio_prompt_path_input,
+        exaggeration=exaggeration_input,
+        temperature=temperature_input,
+        cfg_weight=cfgw_input,
     )
     print("Audio generation complete.")
     return (current_model.sr, wav.squeeze(0).numpy())
             with gr.Accordion("More options", open=False):
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
             run_btn = gr.Button("Generate", variant="primary")
             temp,
             seed_num,
             cfg_weight,
         ],
         outputs=[audio_output],
     )
+demo.launch()

chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (275 Bytes). View file

chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc ADDED Viewed

Binary file (13.3 kB). View file

chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (858 Bytes). View file

chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc ADDED Viewed

Binary file (5.44 kB). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (294 Bytes). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc ADDED Viewed

Binary file (190 Bytes). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc ADDED Viewed

Binary file (16.9 kB). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc ADDED Viewed

Binary file (2.7 kB). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc ADDED Viewed

Binary file (13.7 kB). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc ADDED Viewed

Binary file (13.3 kB). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc ADDED Viewed

Binary file (26.3 kB). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc ADDED Viewed

Binary file (13.7 kB). View file

chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc ADDED Viewed

Binary file (24 kB). View file

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc ADDED Viewed

Binary file (21.3 kB). View file

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc ADDED Viewed

Binary file (6.46 kB). View file

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc ADDED Viewed

Binary file (14.7 kB). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (190 Bytes). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc ADDED Viewed

Binary file (3.58 kB). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc ADDED Viewed

Binary file (15.7 kB). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc ADDED Viewed

Binary file (5.54 kB). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc ADDED Viewed

Binary file (17.3 kB). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc ADDED Viewed

Binary file (11.2 kB). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc ADDED Viewed

Binary file (6.24 kB). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc ADDED Viewed

Binary file (18.9 kB). View file

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc ADDED Viewed

Binary file (15.6 kB). View file

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc ADDED Viewed

Binary file (1.93 kB). View file

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc ADDED Viewed

Binary file (6.25 kB). View file

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc ADDED Viewed

Binary file (4.05 kB). View file

chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (1.37 kB). View file

chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc ADDED Viewed

Binary file (7.94 kB). View file

chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (218 Bytes). View file

chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc ADDED Viewed

Binary file (1.34 kB). View file

chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc ADDED Viewed

Binary file (15.8 kB). View file

chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc ADDED Viewed

Binary file (7.08 kB). View file

chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc ADDED Viewed

Binary file (4.65 kB). View file

chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc ADDED Viewed

Binary file (5.37 kB). View file

chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc ADDED Viewed

Binary file (2.54 kB). View file

chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc ADDED Viewed

Binary file (12.6 kB). View file

chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc ADDED Viewed

Binary file (1.27 kB). View file

chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (242 Bytes). View file

chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc ADDED Viewed

Binary file (3.1 kB). View file

chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (281 Bytes). View file

chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (859 Bytes). View file

chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc ADDED Viewed

Binary file (3.59 kB). View file

chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc ADDED Viewed

Binary file (18.7 kB). View file

chatterbox/src/chatterbox/tts.py CHANGED Viewed

@@ -2,12 +2,10 @@ from dataclasses import dataclass
 from pathlib import Path
 import librosa
-import numpy as np
 import torch
 import perth
 import torch.nn.functional as F
 from huggingface_hub import hf_hub_download
-from silero_vad import load_silero_vad, get_speech_timestamps
 from .models.t3 import T3
 from .models.s3tokenizer import S3_SR, drop_invalid_tokens
@@ -123,7 +121,6 @@ class ChatterboxTTS:
         self.device = device
         self.conds = conds
         self.watermarker = perth.PerthImplicitWatermarker()
-        self.silero_vad = load_silero_vad()
     @classmethod
     def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
@@ -165,33 +162,11 @@ class ChatterboxTTS:
         return cls.from_local(Path(local_path).parent, device)
-    def trim_excess_silence(self, wav, sr):
-        "Trim excess silence from speech. Input must be a multiple of 16kHz."
-        assert sr % 16_000 == 0, "Silero requires an integer multiple of 16kHz"
-        # Get VAD as sample-level bool array
-        silero_regions = get_speech_timestamps(wav, self.silero_vad, sampling_rate=sr)
-        vad = np.zeros_like(wav)
-        for region in silero_regions:
-            vad[region["start"]:region["end"]] = 1
-        # Dilate VAD
-        max_silence_ms = 400
-        cfilter = np.ones(int(sr * max_silence_ms / (2 * 1000)))
-        dilated_vad = np.convolve(vad, cfilter, mode="same") > 0
-        # Trim out silence
-        return wav[dilated_vad]
-    def prepare_conditionals(self, wav_fpath, exaggeration=0.5, vad_trim=False):
-        # Load reference wav at high SR and trim silence
-        ref_wav, highres_sr = librosa.load(wav_fpath, sr=48_000)
-        if vad_trim:
-            ref_wav = self.trim_excess_silence(ref_wav, highres_sr)
-        # Resample down
-        s3gen_ref_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3GEN_SR)
-        ref_16k_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3_SR)
         s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
         s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
@@ -220,10 +195,9 @@ class ChatterboxTTS:
         exaggeration=0.5,
         cfg_weight=0.5,
         temperature=0.8,
-        vad_trim=False,
     ):
         if audio_prompt_path:
-            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, vad_trim=vad_trim)
         else:
             assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"

 from pathlib import Path
 import librosa
 import torch
 import perth
 import torch.nn.functional as F
 from huggingface_hub import hf_hub_download
 from .models.t3 import T3
 from .models.s3tokenizer import S3_SR, drop_invalid_tokens
         self.device = device
         self.conds = conds
         self.watermarker = perth.PerthImplicitWatermarker()
     @classmethod
     def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
         return cls.from_local(Path(local_path).parent, device)
+    def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
+        ## Load reference wav
+        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)
+        ref_16k_wav = librosa.resample(s3gen_ref_wav, orig_sr=S3GEN_SR, target_sr=S3_SR)
         s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
         s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
         exaggeration=0.5,
         cfg_weight=0.5,
         temperature=0.8,
     ):
         if audio_prompt_path:
+            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
         else:
             assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"