RASMUS committed
Commit 308155b · verified · Parent: 0725138

Upload Finnish Chatterbox model

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. .devcontainer/devcontainer.json +30 -0
  2. .gitattributes +8 -0
  3. README.md +114 -3
  4. attribution.csv +0 -0
  5. dataset_filtering_lineage.csv +0 -0
  6. generalization_comparison.png +0 -0
  7. generalization_comparison_filtered.png +3 -0
  8. inference_example.py +57 -0
  9. install_dependencies.sh +65 -0
  10. models/best_accuracy_cp795.safetensors +3 -0
  11. models/best_naturalness_cp1060.safetensors +3 -0
  12. requirements.txt +36 -0
  13. samples/comparison/cv15_11_baseline.wav +3 -0
  14. samples/comparison/cv15_11_finetuned.wav +3 -0
  15. samples/comparison/cv15_16_baseline.wav +3 -0
  16. samples/comparison/cv15_16_finetuned.wav +3 -0
  17. samples/comparison/cv15_2_baseline.wav +3 -0
  18. samples/comparison/cv15_2_finetuned.wav +3 -0
  19. samples/reference_finnish.wav +3 -0
  20. setup.py +201 -0
  21. src/__init__.py +0 -0
  22. src/__pycache__/__init__.cpython-311.pyc +0 -0
  23. src/__pycache__/config.cpython-311.pyc +0 -0
  24. src/__pycache__/dataset.cpython-311.pyc +0 -0
  25. src/__pycache__/model.cpython-311.pyc +0 -0
  26. src/__pycache__/preprocess_file_based.cpython-311.pyc +0 -0
  27. src/__pycache__/preprocess_ljspeech.cpython-311.pyc +0 -0
  28. src/__pycache__/utils.cpython-311.pyc +0 -0
  29. src/chatterbox_/__init__.py +3 -0
  30. src/chatterbox_/__pycache__/__init__.cpython-311.pyc +0 -0
  31. src/chatterbox_/__pycache__/mtl_tts.cpython-311.pyc +0 -0
  32. src/chatterbox_/__pycache__/tts.cpython-311.pyc +0 -0
  33. src/chatterbox_/__pycache__/tts_turbo.cpython-311.pyc +0 -0
  34. src/chatterbox_/__pycache__/vc.cpython-311.pyc +0 -0
  35. src/chatterbox_/models/__init__.py +0 -0
  36. src/chatterbox_/models/__pycache__/__init__.cpython-311.pyc +0 -0
  37. src/chatterbox_/models/__pycache__/utils.cpython-311.pyc +0 -0
  38. src/chatterbox_/models/s3gen/__init__.py +2 -0
  39. src/chatterbox_/models/s3gen/__pycache__/__init__.cpython-311.pyc +0 -0
  40. src/chatterbox_/models/s3gen/__pycache__/configs.cpython-311.pyc +0 -0
  41. src/chatterbox_/models/s3gen/__pycache__/const.cpython-311.pyc +0 -0
  42. src/chatterbox_/models/s3gen/__pycache__/decoder.cpython-311.pyc +0 -0
  43. src/chatterbox_/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc +0 -0
  44. src/chatterbox_/models/s3gen/__pycache__/flow.cpython-311.pyc +0 -0
  45. src/chatterbox_/models/s3gen/__pycache__/flow_matching.cpython-311.pyc +0 -0
  46. src/chatterbox_/models/s3gen/__pycache__/hifigan.cpython-311.pyc +0 -0
  47. src/chatterbox_/models/s3gen/__pycache__/s3gen.cpython-311.pyc +0 -0
  48. src/chatterbox_/models/s3gen/__pycache__/xvector.cpython-311.pyc +0 -0
  49. src/chatterbox_/models/s3gen/configs.py +10 -0
  50. src/chatterbox_/models/s3gen/const.py +2 -0
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "name": "Chatterbox A100 Optimized",
+   "image": "unsloth/unsloth:2025.10.1-pt2.8.0-cu12.8-llamacpp-integration",
+
+   "forwardPorts": [8888],
+
+   "containerEnv": {
+     "JUPYTER_PASSWORD": "MASKED_PASSWORD",
+     "USER_PASSWORD": "unsloth"
+   },
+
+   "runArgs": [
+     "--gpus=all",
+     "--shm-size=64gb"
+   ],
+
+   "remoteUser": "root",
+
+   "customizations": {
+     "vscode": {
+       "extensions": [
+         "ms-python.python",
+         "ms-python.vscode-pylance",
+         "ms-toolsai.jupyter"
+       ]
+     }
+   },
+
+   "postCreateCommand": "apt-get update && apt-get install -y git ffmpeg libsndfile1 && chmod -R 777 /workspaces && cd /workspaces/work/chatterbox-finetuning"
+ }
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ generalization_comparison_filtered.png filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_11_baseline.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_11_finetuned.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_16_baseline.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_16_finetuned.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_2_baseline.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_2_finetuned.wav filter=lfs diff=lfs merge=lfs -text
+ samples/reference_finnish.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,114 @@
- ---
- license: mit
- ---
+ # Chatterbox Finnish Fine-Tuning: High-Fidelity Zero-Shot TTS
+
+ This project fine-tunes the Chatterbox TTS model (based on the Llama architecture) for the Finnish language. By leveraging a multilingual base and applying rigorous data-quality filtering, we achieved near-perfect zero-shot generalization to unseen Finnish speakers.
+
+ ## 🚀 Performance Comparison (Zero-Shot OOD)
+
+ The following metrics were calculated on **Out-of-Distribution (OOD)** speakers who were strictly excluded from the training and validation sets. This measures how well the model speaks Finnish in voices it has never heard before.
+
+ | Metric | Baseline (Original Multilingual) | Fine-Tuned (Best Step: 795) | Improvement |
+ | :--- | :---: | :---: | :---: |
+ | **Avg Word Error Rate (WER)** | 28.94% | **1.36%** | **~21x Accuracy Increase** |
+ | **Mean Opinion Score (MOS)** | 2.29 / 5.0 | **4.16 / 5.0** | **+1.87 Quality Points** |
+
+ *Note: MOS was evaluated using the Gemini 3 Flash API, and WER was calculated using Faster-Whisper Finnish Large v3.*
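+
+ A minimal sketch of how the WER side of this evaluation can be reproduced, assuming the `faster-whisper` and `jiwer` packages; the plain `large-v3` identifier below is a stand-in for the Finnish checkpoint and is an assumption:
+
+ ```python
+ # Illustrative WER check: transcribe the generated audio and compare it to the prompt text.
+ from faster_whisper import WhisperModel
+ import jiwer
+
+ asr = WhisperModel("large-v3", device="cuda")                 # swap in the Finnish-tuned checkpoint
+ segments, _ = asr.transcribe("inference_output.wav", language="fi")
+ hypothesis = " ".join(seg.text.strip() for seg in segments)
+
+ reference = "Suomen kieli on poikkeuksellisen kaunista kuunneltavaa."
+ print(f"WER: {jiwer.wer(reference.lower(), hypothesis.lower()):.2%}")
+ ```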
+
+ ---
+
+ ## 🎧 Audio Comparison (OOD Speakers)
+
+ Listen to the difference between the generic multilingual baseline and our high-fidelity Finnish fine-tuned model. These samples are from **zero-shot** speakers (never seen during training).
+
+ | Speaker ID | Baseline (Generic Multilingual) | Fine-Tuned (Finnish Golden) |
+ | :--- | :--- | :--- |
+ | **cv-15_11** | [Baseline Audio](samples/comparison/cv15_11_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_11_finetuned.wav) |
+ | **cv-15_16** | [Baseline Audio](samples/comparison/cv15_16_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_16_finetuned.wav) |
+ | **cv-15_2** | [Baseline Audio](samples/comparison/cv15_2_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_2_finetuned.wav) |
+
+ *The samples above use the same text and reference audio for a fair comparison.*
+
+ ---
+
+ ## 🛠 Data Processing & Transparency
+
+ We implemented a "Golden Data" strategy to ensure the model learned high-quality Finnish prosody without acoustic artifacts. After strict filtering, the final training set consists of **8,655 high-quality samples**.
+
+ ### 1. Multi-Source Dataset Breakdown
+ The final dataset is a diverse mix of Finnish speech from the following sources:
+ - **Mozilla Common Voice (cv-15)**: 4,348 samples (diverse crowdsourced voices)
+ - **Filmot**: 2,605 samples (media-based Finnish)
+ - **YouTube**: 982 samples (conversational modern Finnish)
+ - **Parliament**: 720 samples (formal Finnish speech)
+
+ ### 2. "Golden" Filtering Logic
+ To prevent the model from cloning background noise or learning from single-word clips, we applied the following strict filters in `src/dataset.py` (see the sketch after this list):
+ - **Min Duration**: 4.0 seconds (ensures enough context for prosody).
+ - **Min SNR**: 35.0 dB (removes low-quality/noisy recordings).
+ - **Max SNR**: 100.0 dB (removes sterile, digitally noise-gated artifacts).
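+
+ A minimal sketch of this filtering pass (illustrative only; the actual logic lives in `src/dataset.py`, and the SNR estimator is an assumed helper):
+
+ ```python
+ # Hypothetical "golden data" gate: keep only clips that satisfy the duration and SNR limits above.
+ import soundfile as sf
+
+ MIN_DURATION_S, MIN_SNR_DB, MAX_SNR_DB = 4.0, 35.0, 100.0
+
+ def is_golden(path: str, estimate_snr_db) -> bool:
+     audio, sr = sf.read(path)
+     if len(audio) / sr < MIN_DURATION_S:
+         return False                            # too short to carry useful prosody
+     snr = estimate_snr_db(audio, sr)            # assumed callable returning SNR in dB
+     return MIN_SNR_DB <= snr <= MAX_SNR_DB      # reject both noisy and over-gated clips
+ ```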
+
+ ### 3. Traceability & Lineage
+ Full lineage is maintained for every training run. The script automatically generates a `dataset_filtering_lineage.csv` in the output directory, detailing exactly which files were excluded and for what reason (`LOW_SNR`, `LOW_DURATION`, or `OOD_SPEAKER`).
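+
+ For a quick summary of why clips were dropped, the lineage file can be inspected directly (a sketch; the exact column name is an assumption):
+
+ ```python
+ # Count exclusions per reason in the generated lineage file.
+ import pandas as pd
+
+ lineage = pd.read_csv("dataset_filtering_lineage.csv")
+ print(lineage["reason"].value_counts())   # e.g. LOW_SNR / LOW_DURATION / OOD_SPEAKER
+ ```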
+
+ ## 💻 Hardware & Infrastructure
+
+ Training was performed on the **Verda platform** using an **NVIDIA A100 80GB** instance. The high-VRAM GPU allowed us to use a larger batch size and 850ms speech sequences without hitting memory limits.
+
+ ### .devcontainer Configuration
+ We have included the `.devcontainer` directory to ensure a reproducible environment. It pre-installs all necessary CUDA-optimized libraries and sets up the Jupyter environment for immediate experimentation.
+
+ ---
+
+ ## 🔧 Installation & Setup
+
+ 1. **Environment**: Ensure you have Python 3.10+ and CUDA-capable hardware.
+ 2. **Setup**:
+ ```bash
+ bash install_dependencies.sh
+ python setup.py  # Downloads the multilingual base weights
+ ```
+
+ ---
+
+ ## 🏃 Running Inference
+
+ To generate Finnish speech using the fine-tuned model:
+
+ ```python
+ from src.chatterbox_.tts import ChatterboxTTS
+
+ # 1. Load the engine
+ engine = ChatterboxTTS.from_local("./pretrained_models", device="cuda")
+
+ # 2. Inject your best finetuned weights
+ # (Assuming your best weights are in chatterbox_output/checkpoint-795)
+ # engine.t3.load_state_dict(...)
+
+ # 3. Generate with Finnish-optimized parameters
+ wav = engine.generate(
+     text="Suomen kieli on poikkeuksellisen kaunista kuunneltavaa.",
+     audio_prompt_path="path/to/reference_voice.wav",
+     repetition_penalty=1.2,
+     temperature=0.8,
+     exaggeration=0.6
+ )
+ ```
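+
+ The commented-out weight-injection step above is shown in full in `inference_example.py` (included in this commit); the essential lines are:
+
+ ```python
+ from safetensors.torch import load_file
+
+ state = load_file("./models/best_accuracy_cp795.safetensors")
+ # Strip the "t3." prefix added by the trainer wrapper before loading into the T3 module.
+ state = {k[3:] if k.startswith("t3.") else k: v for k, v in state.items()}
+ engine.t3.load_state_dict(state, strict=False)
+ engine.t3.eval()
+ ```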
+
+ ### Optimized Parameters for Finnish
+ Based on our research, we identified the following settings as the most stable for Finnish phonetics:
+ - `repetition_penalty`: 1.2
+ - `temperature`: 0.8
+ - `Repetition Guard`: Increased to **10 tokens** in `AlignmentStreamAnalyzer` to allow for long Finnish vowels without premature cutoffs.
+
+ ---
+
+ ## 🛡 Repetition Guard Improvements
+ A critical fix was applied to `src/chatterbox_/models/t3/inference/alignment_stream_analyzer.py`. The original threshold for token repetition was too sensitive for Finnish (which relies on long vowels). It has been increased from 3 to **10 tokens (~160ms)**, allowing for natural linguistic duration while still preventing infinite generation loops.
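+
+ Conceptually, the guard behaves like the sketch below (illustrative pseudologic, not the actual `AlignmentStreamAnalyzer` implementation):
+
+ ```python
+ # Stop generation only after N consecutive occurrences of the same token.
+ # The old threshold (3) tripped on long Finnish vowels; 10 tokens (~160 ms) does not.
+ REPETITION_GUARD_TOKENS = 10
+
+ def should_stop(generated_tokens: list[int]) -> bool:
+     if len(generated_tokens) < REPETITION_GUARD_TOKENS:
+         return False
+     tail = generated_tokens[-REPETITION_GUARD_TOKENS:]
+     return len(set(tail)) == 1   # same token repeated REPETITION_GUARD_TOKENS times in a row
+ ```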
+
+ ---
+
+ ## 🙏 Acknowledgments & Credits
+
+ - **Exploration Foundation**: Initial fine-tuning exploration was based on the [chatterbox-finetuning](https://github.com/gokhaneraslan/chatterbox-finetuning) toolkit by gokhaneraslan.
+ - **Model Authors**: Deep thanks to the team at **ResembleAI** for releasing the [Chatterbox TTS model](https://huggingface.co/ResembleAI/chatterbox).
+ - **Data Sourcing**: Special thanks to **#Jobik** on the **Nordic AI** Discord for introducing [Filmot](https://filmot.com/), which was instrumental in sourcing high-quality media-based Finnish data.
+
attribution.csv ADDED
The diff for this file is too large to render. See raw diff
 
dataset_filtering_lineage.csv ADDED
The diff for this file is too large to render. See raw diff
 
generalization_comparison.png ADDED
generalization_comparison_filtered.png ADDED

Git LFS Details

  • SHA256: d7d5de6814697cd42ea23a58f3168fc86b04fb67d49d154cf770c20ce744e1ab
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
inference_example.py ADDED
@@ -0,0 +1,57 @@
+ import torch
+ import soundfile as sf
+ from src.chatterbox_.tts import ChatterboxTTS
+ from safetensors.torch import load_file
+
+ # ==============================================================================
+ # CONFIGURATION
+ # ==============================================================================
+ # Path to your preferred checkpoint (e.g., CP 795 for best accuracy)
+ FINE_TUNED_WEIGHTS = "./models/best_accuracy_cp795.safetensors"
+
+ # Text to synthesize
+ TEXT = "Suomen kieli on poikkeuksellisen kaunista kuunneltavaa varsinkin hienosti lausuttuna."
+
+ # Reference audio for voice cloning (3-10s recommended)
+ REFERENCE_AUDIO = "./samples/reference_finnish.wav"
+
+ # Output filename
+ OUTPUT_FILE = "inference_output.wav"
+ # ==============================================================================
+
+ def main():
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # 1. Load the base engine
+     # Ensure you have run 'python setup.py' to download the base models first
+     print("Loading base engine...")
+     engine = ChatterboxTTS.from_local("./pretrained_models", device=device)
+
+     # 2. Inject the fine-tuned weights
+     print(f"Injecting fine-tuned weights from {FINE_TUNED_WEIGHTS}...")
+     checkpoint_state = load_file(FINE_TUNED_WEIGHTS)
+
+     # Strip "t3." prefix if present (added by the trainer wrapper)
+     t3_state_dict = {k[3:] if k.startswith("t3.") else k: v for k, v in checkpoint_state.items()}
+
+     engine.t3.load_state_dict(t3_state_dict, strict=False)
+     engine.t3.eval()
+
+     # 3. Generate Finnish audio
+     print(f"Generating audio for text: '{TEXT[:50]}...'")
+     wav_tensor = engine.generate(
+         text=TEXT,
+         audio_prompt_path=REFERENCE_AUDIO,
+         repetition_penalty=1.2,
+         temperature=0.8,
+         exaggeration=0.6
+     )
+
+     # 4. Save result
+     wav_np = wav_tensor.squeeze().cpu().numpy()
+     sf.write(OUTPUT_FILE, wav_np, engine.sr)
+     print(f"✓ Audio saved to {OUTPUT_FILE}")
+
+ if __name__ == "__main__":
+     main()
+
install_dependencies.sh ADDED
@@ -0,0 +1,65 @@
+ #!/bin/bash
+ # Chatterbox Finetuning - Dependency Installation Script
+ # This script ensures correct PyTorch and dependency versions are installed
+
+ set -e  # Exit on error
+
+ echo "===================================="
+ echo "Chatterbox Finetuning Setup"
+ echo "===================================="
+
+ # Check Python version
+ PYTHON_VERSION=$(python --version 2>&1 | grep -oP '(?<=Python )\d+\.\d+')
+ echo "Python version: $PYTHON_VERSION"
+
+ # Uninstall conflicting packages if they exist
+ echo ""
+ echo "Step 1: Removing conflicting packages..."
+ pip uninstall -y torch torchvision torchaudio xformers flash-attn 2>/dev/null || true
+
+ # Install correct PyTorch version
+ echo ""
+ echo "Step 2: Installing PyTorch 2.5.1 with CUDA 12.4..."
+ pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
+
+ # Install xformers
+ echo ""
+ echo "Step 3: Installing xformers..."
+ pip install xformers==0.0.28.post3 --index-url https://download.pytorch.org/whl/cu124
+
+ # Install torchao (compatible version)
+ echo ""
+ echo "Step 4: Installing torchao..."
+ pip install torchao==0.6.1
+
+ # Install all other dependencies
+ echo ""
+ echo "Step 5: Installing remaining dependencies..."
+ pip install -r requirements.txt
+
+ # Verify installation
+ echo ""
+ echo "===================================="
+ echo "Verifying installation..."
+ echo "===================================="
+ python -c "
+ import torch
+ import xformers
+ import transformers
+ print(f'✓ PyTorch: {torch.__version__}')
+ print(f'✓ xformers: {xformers.__version__}')
+ print(f'✓ Transformers: {transformers.__version__}')
+ print(f'✓ CUDA available: {torch.cuda.is_available()}')
+ if torch.cuda.is_available():
+     print(f'✓ CUDA version: {torch.version.cuda}')
+ "
+
+ echo ""
+ echo "===================================="
+ echo "Installation complete!"
+ echo "===================================="
+ echo ""
+ echo "Next steps:"
+ echo "1. Run: python setup.py (to download pretrained models)"
+ echo "2. Run: python train.py (to start training)"
+ echo ""
models/best_accuracy_cp795.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c0dd9023aba3e952c76435ce1e5e8f028dc7ad3ec1242f4df8dda377c058d0b
+ size 2143990656
models/best_naturalness_cp1060.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f086086d30eed54c4f42160b2f2b0c192aa0a8251c10b5a67e0eea2935ca985d
+ size 2143990656
requirements.txt ADDED
@@ -0,0 +1,36 @@
+ # Core PyTorch - Using 2.5.1 for stable xformers/flash-attn support
+ --extra-index-url https://download.pytorch.org/whl/cu124
+ torch==2.5.1
+ torchaudio==2.5.1
+ torchvision==0.20.1
+
+ # Core dependencies with pinned versions for stability
+ transformers==4.46.3
+ xformers==0.0.28.post3
+ torchao==0.6.1
+ diffusers==0.29.0
+ peft==0.17.1
+
+ # Chatterbox TTS dependencies
+ # Note: chatterbox-tts itself is installed via install_dependencies.sh with --no-deps
+ # to avoid a strict torch==2.6.0 conflict
+ resemble-perth==1.0.1
+ conformer==0.3.2
+ s3tokenizer==0.3.0
+
+ # Audio processing
+ silero-vad==6.2.0
+ librosa==0.11.0
+ soundfile==0.13.1
+ pyloudnorm
+
+ # Utilities
+ num2words
+ ffmpeg-python
+ tqdm
+ pandas
+ safetensors
+ tensorboard
+ omegaconf
+ hf_transfer
+ gdown
samples/comparison/cv15_11_baseline.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c35b6067b981f318e0bd02c04e034c7ec0cb290f07ce117b7e7d6e49125d4c2
+ size 527084
samples/comparison/cv15_11_finetuned.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c095d7a386a0430e8c105cca160e35a5321536b95ed6d3336456f80d5d28695
+ size 431084
samples/comparison/cv15_16_baseline.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c76889754e1387567df26bb63faf355434990be9190e92cf2b2ec670c4abc9a
+ size 469484
samples/comparison/cv15_16_finetuned.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:049e7435b69864d1f27df2a8a98f1b95d40eee7645fd3d03190512e9380d67b6
+ size 358124
samples/comparison/cv15_2_baseline.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4cfd64fbc57efddba7cd9e5e13dd41f2a7843796112b59e36f6a3c5999109fc8
+ size 434924
samples/comparison/cv15_2_finetuned.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb8142ec3157d3d945e4896c215fca0e1031c520aaa03f8533f53b96f564eb8e
+ size 423404
samples/reference_finnish.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb5f8e26a7a88276e23bf73375be147269ba2cd6fe6fb38ac68e7b330d1fc03c
+ size 266156
setup.py ADDED
@@ -0,0 +1,201 @@
+ import os
+ import requests
+ import sys
+ import json
+ from tqdm import tqdm
+ from transformers import AutoTokenizer
+ from src.config import TrainConfig
+
+
+ DEST_DIR = "pretrained_models"
+
+ CHATTERBOX_TURBO_FILES = {
+     "ve.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/ve.safetensors?download=true",
+     "t3_turbo_v1.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/t3_turbo_v1.safetensors?download=true",
+     "s3gen_meanflow.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/s3gen_meanflow.safetensors?download=true",
+     "conds.pt": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/conds.pt?download=true",
+     "vocab.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/vocab.json?download=true",
+     "added_tokens.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/added_tokens.json?download=true",
+     "special_tokens_map.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/special_tokens_map.json?download=true",
+     "tokenizer_config.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/tokenizer_config.json?download=true",
+     "merges.txt": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/merges.txt?download=true",
+     "grapheme_mtl_merged_expanded_v1.json": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/grapheme_mtl_merged_expanded_v1.json?download=true"
+ }
+
+
+ CHATTERBOX_FILES = {
+     "ve.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/ve.safetensors?download=true",
+     "t3_cfg.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/t3_mtl23ls_v2.safetensors?download=true",
+     "s3gen.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/s3gen.safetensors?download=true",
+     "conds.pt": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/conds.pt?download=true",
+     "tokenizer.json": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/grapheme_mtl_merged_expanded_v1.json?download=true"
+ }
+
+ def download_file(url, dest_path):
+     """Downloads a file from a URL to a specific destination with a progress bar."""
+
+     if os.path.exists(dest_path):
+         print(f"File already exists: {dest_path}")
+         return
+
+     print(f"Downloading: {os.path.basename(dest_path)}...")
+
+     try:
+
+         response = requests.get(url, stream=True)
+         response.raise_for_status()
+
+         total_size = int(response.headers.get('content-length', 0))
+         block_size = 1024
+
+         with open(dest_path, 'wb') as file, tqdm(
+             desc=os.path.basename(dest_path),
+             total=total_size,
+             unit='iB',
+             unit_scale=True,
+             unit_divisor=1024,
+         ) as bar:
+
+             for data in response.iter_content(block_size):
+
+                 size = file.write(data)
+                 bar.update(size)
+
+         print(f"Download complete: {dest_path}\n")
+
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading {url}: {e}")
+         sys.exit(1)
+
+
+
+ def merge_and_save_turbo_tokenizer():
+     """
+     Combines the downloaded original GPT-2 tokenizer with our custom vocab
+     and overwrites the original files.
+     """
+     print("\n--- Turbo Vocab Merging Begins ---")
+
+     try:
+         base_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
+     except Exception as e:
+         print(f"ERROR: The original tokenizer could not be loaded. Did you download the files correctly? -> {e}")
+         return 0
+
+
+     initial_len = len(base_tokenizer)
+     print(f" Original Size: {initial_len}")
+
+
+     custom_vocab_path = os.path.join(DEST_DIR, "grapheme_mtl_merged_expanded_v1.json")
+
+     print(f"Loading: Custom Vocab ({custom_vocab_path})")
+
+     with open(custom_vocab_path, 'r', encoding='utf-8') as f:
+         custom_data = json.load(f)
+
+
+     if "model" in custom_data and "vocab" in custom_data["model"]:
+         vocab_dict = custom_data["model"]["vocab"]
+
+     else:
+         print("Warning: The custom vocab format does not match the expected structure.")
+         return 0
+
+     unique_tokens_to_add = list(vocab_dict.keys())
+     added_count = base_tokenizer.add_tokens(unique_tokens_to_add)
+     final_len = len(base_tokenizer)
+
+     print(f"Merging: {added_count} new tokens added.")
+     print(f" New Size: {final_len}")
+
+
+     print(f"Saving: Writing the combined tokenizer to the '{DEST_DIR}' folder...")
+     base_tokenizer.save_pretrained(DEST_DIR)
+
+     print("MERGE SUCCESSFUL!")
+
+     return final_len
+
+
+
+ def test_merge_tokenizer_process(tokenizer_path):
+
+     try:
+
+         tok = AutoTokenizer.from_pretrained(tokenizer_path)
+
+         print(f"--- RESULTS ---")
+         print(f"Folder: {tokenizer_path}")
+         print(f"Actual Vocab Size (len): {len(tok)}")
+
+         test_token = "[ta]"
+         test_id = tok.encode(test_token, add_special_tokens=False)
+
+         print(f"Test Token '{test_token}' ID: {test_id}")
+
+         if len(tok) > 50276:
+             print("SUCCESS! New tokens have been added.")
+
+         else:
+             print("ERROR: The vocabulary size still looks unchanged.")
+
+
+     except Exception as e:
+         print(f"Error: {e}")
+
+
+
+
+ def main():
+
+     print("--- Chatterbox Pretrained Model Setup ---\n")
+
+     # 1. Create the directory if it doesn't exist
+     if not os.path.exists(DEST_DIR):
+
+         print(f"Creating directory: {DEST_DIR}")
+         os.makedirs(DEST_DIR, exist_ok=True)
+
+     else:
+         print(f"Directory found: {DEST_DIR}")
+
+
+     cfg = TrainConfig()
+
+     if cfg.is_turbo:
+         print(f"Mode: CHATTERBOX-TURBO (Checking {len(CHATTERBOX_TURBO_FILES)} files)")
+         FILES_TO_DOWNLOAD = CHATTERBOX_TURBO_FILES
+
+     else:
+         print(f"Mode: CHATTERBOX-TTS (Checking {len(CHATTERBOX_FILES)} files)")
+         FILES_TO_DOWNLOAD = CHATTERBOX_FILES
+
+     # 2. Download files
+     for filename, url in FILES_TO_DOWNLOAD.items():
+         dest_path = os.path.join(DEST_DIR, filename)
+         download_file(url, dest_path)
+
+     if cfg.is_turbo:
+         new_vocab_size = merge_and_save_turbo_tokenizer()
+         if new_vocab_size > 0:
+
+             #test_merge_tokenizer_process(DEST_DIR)
+
+             print("\n" + "="*60)
+             print("INSTALLATION COMPLETE (CHATTERBOX-TURBO MODE)")
+             print("All models are set up in the 'pretrained_models/' folder.")
+             print(f"Please update the 'new_vocab_size' value in 'src/config.py'")
+             print(f"to: {new_vocab_size}")
+             print("="*60 + "\n")
+
+     else:
+         print("\nINSTALLATION COMPLETE (CHATTERBOX-TTS MODE)")
+         print("All models are set up in the 'pretrained_models/' folder.")
+         print(f"Note: 'grapheme_mtl_merged_expanded_v1.json' was saved as 'tokenizer.json' for the new vocabulary.")
+
+
+
+ if __name__ == "__main__":
+     main()
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (141 Bytes). View file
 
src/__pycache__/config.cpython-311.pyc ADDED
Binary file (2.07 kB). View file
 
src/__pycache__/dataset.cpython-311.pyc ADDED
Binary file (12.2 kB). View file
 
src/__pycache__/model.cpython-311.pyc ADDED
Binary file (8.21 kB). View file
 
src/__pycache__/preprocess_file_based.cpython-311.pyc ADDED
Binary file (6.65 kB). View file
 
src/__pycache__/preprocess_ljspeech.cpython-311.pyc ADDED
Binary file (6.65 kB). View file
 
src/__pycache__/utils.cpython-311.pyc ADDED
Binary file (5.41 kB). View file
 
src/chatterbox_/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .tts import ChatterboxTTS
+ from .vc import ChatterboxVC
+ from .mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES
src/chatterbox_/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (372 Bytes). View file
 
src/chatterbox_/__pycache__/mtl_tts.cpython-311.pyc ADDED
Binary file (14.9 kB). View file
 
src/chatterbox_/__pycache__/tts.cpython-311.pyc ADDED
Binary file (14.5 kB). View file
 
src/chatterbox_/__pycache__/tts_turbo.cpython-311.pyc ADDED
Binary file (15.5 kB). View file
 
src/chatterbox_/__pycache__/vc.cpython-311.pyc ADDED
Binary file (6.27 kB). View file
 
src/chatterbox_/models/__init__.py ADDED
File without changes
src/chatterbox_/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (160 Bytes). View file
 
src/chatterbox_/models/__pycache__/utils.cpython-311.pyc ADDED
Binary file (746 Bytes). View file
 
src/chatterbox_/models/s3gen/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .s3gen import S3Token2Wav as S3Gen
+ from .const import S3GEN_SR
src/chatterbox_/models/s3gen/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (282 Bytes). View file
 
src/chatterbox_/models/s3gen/__pycache__/configs.cpython-311.pyc ADDED
Binary file (442 Bytes). View file
 
src/chatterbox_/models/s3gen/__pycache__/const.cpython-311.pyc ADDED
Binary file (203 Bytes). View file
 
src/chatterbox_/models/s3gen/__pycache__/decoder.cpython-311.pyc ADDED
Binary file (17.6 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc ADDED
Binary file (2.69 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/flow.cpython-311.pyc ADDED
Binary file (10.1 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/flow_matching.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/hifigan.cpython-311.pyc ADDED
Binary file (26.3 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/s3gen.cpython-311.pyc ADDED
Binary file (15.4 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/xvector.cpython-311.pyc ADDED
Binary file (24 kB). View file
 
src/chatterbox_/models/s3gen/configs.py ADDED
@@ -0,0 +1,10 @@
+ from ..utils import AttrDict
+
+ CFM_PARAMS = AttrDict({
+     "sigma_min": 1e-06,
+     "solver": "euler",
+     "t_scheduler": "cosine",
+     "training_cfg_rate": 0.2,
+     "inference_cfg_rate": 0.7,
+     "reg_loss_type": "l1"
+ })
src/chatterbox_/models/s3gen/const.py ADDED
@@ -0,0 +1,2 @@
+ S3GEN_SR = 24000
+ S3GEN_SIL = 4299