RASMUS committed
Commit 308155b · verified · Parent: 0725138

Upload Finnish Chatterbox model

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. .devcontainer/devcontainer.json +30 -0
  2. .gitattributes +8 -0
  3. README.md +114 -3
  4. attribution.csv +0 -0
  5. dataset_filtering_lineage.csv +0 -0
  6. generalization_comparison.png +0 -0
  7. generalization_comparison_filtered.png +3 -0
  8. inference_example.py +57 -0
  9. install_dependencies.sh +65 -0
  10. models/best_accuracy_cp795.safetensors +3 -0
  11. models/best_naturalness_cp1060.safetensors +3 -0
  12. requirements.txt +36 -0
  13. samples/comparison/cv15_11_baseline.wav +3 -0
  14. samples/comparison/cv15_11_finetuned.wav +3 -0
  15. samples/comparison/cv15_16_baseline.wav +3 -0
  16. samples/comparison/cv15_16_finetuned.wav +3 -0
  17. samples/comparison/cv15_2_baseline.wav +3 -0
  18. samples/comparison/cv15_2_finetuned.wav +3 -0
  19. samples/reference_finnish.wav +3 -0
  20. setup.py +201 -0
  21. src/__init__.py +0 -0
  22. src/__pycache__/__init__.cpython-311.pyc +0 -0
  23. src/__pycache__/config.cpython-311.pyc +0 -0
  24. src/__pycache__/dataset.cpython-311.pyc +0 -0
  25. src/__pycache__/model.cpython-311.pyc +0 -0
  26. src/__pycache__/preprocess_file_based.cpython-311.pyc +0 -0
  27. src/__pycache__/preprocess_ljspeech.cpython-311.pyc +0 -0
  28. src/__pycache__/utils.cpython-311.pyc +0 -0
  29. src/chatterbox_/__init__.py +3 -0
  30. src/chatterbox_/__pycache__/__init__.cpython-311.pyc +0 -0
  31. src/chatterbox_/__pycache__/mtl_tts.cpython-311.pyc +0 -0
  32. src/chatterbox_/__pycache__/tts.cpython-311.pyc +0 -0
  33. src/chatterbox_/__pycache__/tts_turbo.cpython-311.pyc +0 -0
  34. src/chatterbox_/__pycache__/vc.cpython-311.pyc +0 -0
  35. src/chatterbox_/models/__init__.py +0 -0
  36. src/chatterbox_/models/__pycache__/__init__.cpython-311.pyc +0 -0
  37. src/chatterbox_/models/__pycache__/utils.cpython-311.pyc +0 -0
  38. src/chatterbox_/models/s3gen/__init__.py +2 -0
  39. src/chatterbox_/models/s3gen/__pycache__/__init__.cpython-311.pyc +0 -0
  40. src/chatterbox_/models/s3gen/__pycache__/configs.cpython-311.pyc +0 -0
  41. src/chatterbox_/models/s3gen/__pycache__/const.cpython-311.pyc +0 -0
  42. src/chatterbox_/models/s3gen/__pycache__/decoder.cpython-311.pyc +0 -0
  43. src/chatterbox_/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc +0 -0
  44. src/chatterbox_/models/s3gen/__pycache__/flow.cpython-311.pyc +0 -0
  45. src/chatterbox_/models/s3gen/__pycache__/flow_matching.cpython-311.pyc +0 -0
  46. src/chatterbox_/models/s3gen/__pycache__/hifigan.cpython-311.pyc +0 -0
  47. src/chatterbox_/models/s3gen/__pycache__/s3gen.cpython-311.pyc +0 -0
  48. src/chatterbox_/models/s3gen/__pycache__/xvector.cpython-311.pyc +0 -0
  49. src/chatterbox_/models/s3gen/configs.py +10 -0
  50. src/chatterbox_/models/s3gen/const.py +2 -0
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "name": "Chatterbox A100 Optimized",
+   "image": "unsloth/unsloth:2025.10.1-pt2.8.0-cu12.8-llamacpp-integration",
+
+   "forwardPorts": [8888],
+
+   "containerEnv": {
+     "JUPYTER_PASSWORD": "MASKED_PASSWORD",
+     "USER_PASSWORD": "unsloth"
+   },
+
+   "runArgs": [
+     "--gpus=all",
+     "--shm-size=64gb"
+   ],
+
+   "remoteUser": "root",
+
+   "customizations": {
+     "vscode": {
+       "extensions": [
+         "ms-python.python",
+         "ms-python.vscode-pylance",
+         "ms-toolsai.jupyter"
+       ]
+     }
+   },
+
+   "postCreateCommand": "apt-get update && apt-get install -y git ffmpeg libsndfile1 && chmod -R 777 /workspaces && cd /workspaces/work/chatterbox-finetuning"
+ }
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ generalization_comparison_filtered.png filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_11_baseline.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_11_finetuned.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_16_baseline.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_16_finetuned.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_2_baseline.wav filter=lfs diff=lfs merge=lfs -text
+ samples/comparison/cv15_2_finetuned.wav filter=lfs diff=lfs merge=lfs -text
+ samples/reference_finnish.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,114 @@
- ---
- license: mit
- ---
+ # Chatterbox Finnish Fine-Tuning: High-Fidelity Zero-Shot TTS
+
+ This project fine-tunes the Chatterbox TTS model (based on the Llama architecture) for the Finnish language. By leveraging a multilingual base and applying rigorous data-quality filtering, we achieved near-perfect zero-shot generalization to unseen Finnish speakers.
+
+ ## 🚀 Performance Comparison (Zero-Shot OOD)
+
+ The following metrics were calculated on **Out-of-Distribution (OOD)** speakers who were strictly excluded from the training and validation sets. This measures how well the model speaks Finnish in voices it has never heard before.
+
+ | Metric | Baseline (Original Multilingual) | Fine-Tuned (Best Step: 795) | Improvement |
+ | :--- | :---: | :---: | :---: |
+ | **Avg Word Error Rate (WER)** | 28.94% | **1.36%** | **~21x Accuracy Increase** |
+ | **Mean Opinion Score (MOS)** | 2.29 / 5.0 | **4.16 / 5.0** | **+1.87 Quality Points** |
+
+ *Note: MOS was evaluated using the Gemini 3 Flash API, and WER was calculated using Faster-Whisper Finnish Large v3.*
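+
+ A minimal sketch of how the WER side of this evaluation can be reproduced, assuming the `faster-whisper` and `jiwer` packages; the plain `large-v3` identifier below is a stand-in for the Finnish checkpoint and is an assumption:
+
+ ```python
+ # Illustrative WER check: transcribe the generated audio and compare it to the prompt text.
+ from faster_whisper import WhisperModel
+ import jiwer
+
+ asr = WhisperModel("large-v3", device="cuda")                 # swap in the Finnish-tuned checkpoint
+ segments, _ = asr.transcribe("inference_output.wav", language="fi")
+ hypothesis = " ".join(seg.text.strip() for seg in segments)
+
+ reference = "Suomen kieli on poikkeuksellisen kaunista kuunneltavaa."
+ print(f"WER: {jiwer.wer(reference.lower(), hypothesis.lower()):.2%}")
+ ```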
+
+ ---
+
+ ## 🎧 Audio Comparison (OOD Speakers)
+
+ Listen to the difference between the generic multilingual baseline and our high-fidelity Finnish fine-tuned model. These samples are from **zero-shot** speakers (never seen during training).
+
+ | Speaker ID | Baseline (Generic Multilingual) | Fine-Tuned (Finnish Golden) |
+ | :--- | :--- | :--- |
+ | **cv-15_11** | [Baseline Audio](samples/comparison/cv15_11_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_11_finetuned.wav) |
+ | **cv-15_16** | [Baseline Audio](samples/comparison/cv15_16_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_16_finetuned.wav) |
+ | **cv-15_2** | [Baseline Audio](samples/comparison/cv15_2_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_2_finetuned.wav) |
+
+ *The samples above use the same text and reference audio for a fair comparison.*
+
+ ---
+
+ ## 🛠 Data Processing & Transparency
+
+ We implemented a "Golden Data" strategy to ensure the model learned high-quality Finnish prosody without acoustic artifacts. After strict filtering, the final training set consists of **8,655 high-quality samples**.
+
+ ### 1. Multi-Source Dataset Breakdown
+ The final dataset is a diverse mix of Finnish speech from the following sources:
+ - **Mozilla Common Voice (cv-15)**: 4,348 samples (diverse crowdsourced voices)
+ - **Filmot**: 2,605 samples (media-based Finnish)
+ - **YouTube**: 982 samples (conversational modern Finnish)
+ - **Parliament**: 720 samples (formal Finnish speech)
+
+ ### 2. "Golden" Filtering Logic
+ To prevent the model from cloning background noise or learning from single-word clips, we applied the following strict filters in `src/dataset.py` (see the sketch after this list):
+ - **Min Duration**: 4.0 seconds (ensures enough context for prosody).
+ - **Min SNR**: 35.0 dB (removes low-quality/noisy recordings).
+ - **Max SNR**: 100.0 dB (removes sterile, digitally noise-gated artifacts).
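+
+ A minimal sketch of this filtering pass (illustrative only; the actual logic lives in `src/dataset.py`, and the SNR estimator is an assumed helper):
+
+ ```python
+ # Hypothetical "golden data" gate: keep only clips that satisfy the duration and SNR limits above.
+ import soundfile as sf
+
+ MIN_DURATION_S, MIN_SNR_DB, MAX_SNR_DB = 4.0, 35.0, 100.0
+
+ def is_golden(path: str, estimate_snr_db) -> bool:
+     audio, sr = sf.read(path)
+     if len(audio) / sr < MIN_DURATION_S:
+         return False                            # too short to carry useful prosody
+     snr = estimate_snr_db(audio, sr)            # assumed callable returning SNR in dB
+     return MIN_SNR_DB <= snr <= MAX_SNR_DB      # reject both noisy and over-gated clips
+ ```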
+
+ ### 3. Traceability & Lineage
+ Full lineage is maintained for every training run. The script automatically generates a `dataset_filtering_lineage.csv` in the output directory, detailing exactly which files were excluded and for what reason (`LOW_SNR`, `LOW_DURATION`, or `OOD_SPEAKER`).
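+
+ For a quick summary of why clips were dropped, the lineage file can be inspected directly (a sketch; the exact column name is an assumption):
+
+ ```python
+ # Count exclusions per reason in the generated lineage file.
+ import pandas as pd
+
+ lineage = pd.read_csv("dataset_filtering_lineage.csv")
+ print(lineage["reason"].value_counts())   # e.g. LOW_SNR / LOW_DURATION / OOD_SPEAKER
+ ```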
+
+ ## 💻 Hardware & Infrastructure
+
+ Training was performed on the **Verda platform** using an **NVIDIA A100 80GB** instance. The high-VRAM GPU allowed us to use a larger batch size and 850ms speech sequences without hitting memory limits.
+
+ ### .devcontainer Configuration
+ We have included the `.devcontainer` directory to ensure a reproducible environment. It pre-installs all necessary CUDA-optimized libraries and sets up the Jupyter environment for immediate experimentation.
+
+ ---
+
+ ## 🔧 Installation & Setup
+
+ 1. **Environment**: Ensure you have Python 3.10+ and CUDA-capable hardware.
+ 2. **Setup**:
+ ```bash
+ bash install_dependencies.sh
+ python setup.py  # Downloads the multilingual base weights
+ ```
+
+ ---
+
+ ## 🏃 Running Inference
+
+ To generate Finnish speech using the fine-tuned model:
+
+ ```python
+ from src.chatterbox_.tts import ChatterboxTTS
+
+ # 1. Load the engine
+ engine = ChatterboxTTS.from_local("./pretrained_models", device="cuda")
+
+ # 2. Inject your best finetuned weights
+ # (Assuming your best weights are in chatterbox_output/checkpoint-795)
+ # engine.t3.load_state_dict(...)
+
+ # 3. Generate with Finnish-optimized parameters
+ wav = engine.generate(
+     text="Suomen kieli on poikkeuksellisen kaunista kuunneltavaa.",
+     audio_prompt_path="path/to/reference_voice.wav",
+     repetition_penalty=1.2,
+     temperature=0.8,
+     exaggeration=0.6
+ )
+ ```
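+
+ The commented-out weight-injection step above is shown in full in `inference_example.py` (included in this commit); the essential lines are:
+
+ ```python
+ from safetensors.torch import load_file
+
+ state = load_file("./models/best_accuracy_cp795.safetensors")
+ # Strip the "t3." prefix added by the trainer wrapper before loading into the T3 module.
+ state = {k[3:] if k.startswith("t3.") else k: v for k, v in state.items()}
+ engine.t3.load_state_dict(state, strict=False)
+ engine.t3.eval()
+ ```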
+
+ ### Optimized Parameters for Finnish
+ Based on our research, we identified the following settings as the most stable for Finnish phonetics:
+ - `repetition_penalty`: 1.2
+ - `temperature`: 0.8
+ - `Repetition Guard`: Increased to **10 tokens** in `AlignmentStreamAnalyzer` to allow for long Finnish vowels without premature cutoffs.
+
+ ---
+
+ ## 🛡 Repetition Guard Improvements
+ A critical fix was applied to `src/chatterbox_/models/t3/inference/alignment_stream_analyzer.py`. The original threshold for token repetition was too sensitive for Finnish (which relies on long vowels). It has been increased from 3 to **10 tokens (~160ms)**, allowing for natural linguistic duration while still preventing infinite generation loops.
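+
+ Conceptually, the guard behaves like the sketch below (illustrative pseudologic, not the actual `AlignmentStreamAnalyzer` implementation):
+
+ ```python
+ # Stop generation only after N consecutive occurrences of the same token.
+ # The old threshold (3) tripped on long Finnish vowels; 10 tokens (~160 ms) does not.
+ REPETITION_GUARD_TOKENS = 10
+
+ def should_stop(generated_tokens: list[int]) -> bool:
+     if len(generated_tokens) < REPETITION_GUARD_TOKENS:
+         return False
+     tail = generated_tokens[-REPETITION_GUARD_TOKENS:]
+     return len(set(tail)) == 1   # same token repeated REPETITION_GUARD_TOKENS times in a row
+ ```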
+
+ ---
+
+ ## 🙏 Acknowledgments & Credits
+
+ - **Exploration Foundation**: Initial fine-tuning exploration was based on the [chatterbox-finetuning](https://github.com/gokhaneraslan/chatterbox-finetuning) toolkit by gokhaneraslan.
+ - **Model Authors**: Deep thanks to the team at **ResembleAI** for releasing the [Chatterbox TTS model](https://huggingface.co/ResembleAI/chatterbox).
+ - **Data Sourcing**: Special thanks to **#Jobik** on the **Nordic AI** Discord for introducing [Filmot](https://filmot.com/), which was instrumental in sourcing high-quality media-based Finnish data.
+
attribution.csv ADDED
The diff for this file is too large to render. See raw diff
 
dataset_filtering_lineage.csv ADDED
The diff for this file is too large to render. See raw diff
 
generalization_comparison.png ADDED
generalization_comparison_filtered.png ADDED

Git LFS Details

  • SHA256: d7d5de6814697cd42ea23a58f3168fc86b04fb67d49d154cf770c20ce744e1ab
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
inference_example.py ADDED
@@ -0,0 +1,57 @@
+ import torch
+ import soundfile as sf
+ from src.chatterbox_.tts import ChatterboxTTS
+ from safetensors.torch import load_file
+
+ # ==============================================================================
+ # CONFIGURATION
+ # ==============================================================================
+ # Path to your preferred checkpoint (e.g., CP 795 for best accuracy)
+ FINE_TUNED_WEIGHTS = "./models/best_accuracy_cp795.safetensors"
+
+ # Text to synthesize
+ TEXT = "Suomen kieli on poikkeuksellisen kaunista kuunneltavaa varsinkin hienosti lausuttuna."
+
+ # Reference audio for voice cloning (3-10s recommended)
+ REFERENCE_AUDIO = "./samples/reference_finnish.wav"
+
+ # Output filename
+ OUTPUT_FILE = "inference_output.wav"
+ # ==============================================================================
+
+ def main():
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # 1. Load the base engine
+     # Ensure you have run 'python setup.py' to download the base models first
+     print("Loading base engine...")
+     engine = ChatterboxTTS.from_local("./pretrained_models", device=device)
+
+     # 2. Inject the fine-tuned weights
+     print(f"Injecting fine-tuned weights from {FINE_TUNED_WEIGHTS}...")
+     checkpoint_state = load_file(FINE_TUNED_WEIGHTS)
+
+     # Strip "t3." prefix if present (added by the trainer wrapper)
+     t3_state_dict = {k[3:] if k.startswith("t3.") else k: v for k, v in checkpoint_state.items()}
+
+     engine.t3.load_state_dict(t3_state_dict, strict=False)
+     engine.t3.eval()
+
+     # 3. Generate Finnish audio
+     print(f"Generating audio for text: '{TEXT[:50]}...'")
+     wav_tensor = engine.generate(
+         text=TEXT,
+         audio_prompt_path=REFERENCE_AUDIO,
+         repetition_penalty=1.2,
+         temperature=0.8,
+         exaggeration=0.6
+     )
+
+     # 4. Save result
+     wav_np = wav_tensor.squeeze().cpu().numpy()
+     sf.write(OUTPUT_FILE, wav_np, engine.sr)
+     print(f"✓ Audio saved to {OUTPUT_FILE}")
+
+ if __name__ == "__main__":
+     main()
+
install_dependencies.sh ADDED
@@ -0,0 +1,65 @@
+ #!/bin/bash
+ # Chatterbox Finetuning - Dependency Installation Script
+ # This script ensures correct PyTorch and dependency versions are installed
+
+ set -e  # Exit on error
+
+ echo "===================================="
+ echo "Chatterbox Finetuning Setup"
+ echo "===================================="
+
+ # Check Python version
+ PYTHON_VERSION=$(python --version 2>&1 | grep -oP '(?<=Python )\d+\.\d+')
+ echo "Python version: $PYTHON_VERSION"
+
+ # Uninstall conflicting packages if they exist
+ echo ""
+ echo "Step 1: Removing conflicting packages..."
+ pip uninstall -y torch torchvision torchaudio xformers flash-attn 2>/dev/null || true
+
+ # Install correct PyTorch version
+ echo ""
+ echo "Step 2: Installing PyTorch 2.5.1 with CUDA 12.4..."
+ pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
+
+ # Install xformers
+ echo ""
+ echo "Step 3: Installing xformers..."
+ pip install xformers==0.0.28.post3 --index-url https://download.pytorch.org/whl/cu124
+
+ # Install torchao (compatible version)
+ echo ""
+ echo "Step 4: Installing torchao..."
+ pip install torchao==0.6.1
+
+ # Install all other dependencies
+ echo ""
+ echo "Step 5: Installing remaining dependencies..."
+ pip install -r requirements.txt
+
+ # Verify installation
+ echo ""
+ echo "===================================="
+ echo "Verifying installation..."
+ echo "===================================="
+ python -c "
+ import torch
+ import xformers
+ import transformers
+ print(f'✓ PyTorch: {torch.__version__}')
+ print(f'✓ xformers: {xformers.__version__}')
+ print(f'✓ Transformers: {transformers.__version__}')
+ print(f'✓ CUDA available: {torch.cuda.is_available()}')
+ if torch.cuda.is_available():
+     print(f'✓ CUDA version: {torch.version.cuda}')
+ "
+
+ echo ""
+ echo "===================================="
+ echo "Installation complete!"
+ echo "===================================="
+ echo ""
+ echo "Next steps:"
+ echo "1. Run: python setup.py (to download pretrained models)"
+ echo "2. Run: python train.py (to start training)"
+ echo ""
models/best_accuracy_cp795.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c0dd9023aba3e952c76435ce1e5e8f028dc7ad3ec1242f4df8dda377c058d0b
+ size 2143990656
models/best_naturalness_cp1060.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f086086d30eed54c4f42160b2f2b0c192aa0a8251c10b5a67e0eea2935ca985d
+ size 2143990656
requirements.txt ADDED
@@ -0,0 +1,36 @@
+ # Core PyTorch - Using 2.5.1 for stable xformers/flash-attn support
+ --extra-index-url https://download.pytorch.org/whl/cu124
+ torch==2.5.1
+ torchaudio==2.5.1
+ torchvision==0.20.1
+
+ # Core dependencies with pinned versions for stability
+ transformers==4.46.3
+ xformers==0.0.28.post3
+ torchao==0.6.1
+ diffusers==0.29.0
+ peft==0.17.1
+
+ # Chatterbox TTS dependencies
+ # Note: chatterbox-tts itself is installed via install_dependencies.sh with --no-deps
+ # to avoid a strict torch==2.6.0 conflict
+ resemble-perth==1.0.1
+ conformer==0.3.2
+ s3tokenizer==0.3.0
+
+ # Audio processing
+ silero-vad==6.2.0
+ librosa==0.11.0
+ soundfile==0.13.1
+ pyloudnorm
+
+ # Utilities
+ num2words
+ ffmpeg-python
+ tqdm
+ pandas
+ safetensors
+ tensorboard
+ omegaconf
+ hf_transfer
+ gdown
samples/comparison/cv15_11_baseline.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c35b6067b981f318e0bd02c04e034c7ec0cb290f07ce117b7e7d6e49125d4c2
+ size 527084
samples/comparison/cv15_11_finetuned.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c095d7a386a0430e8c105cca160e35a5321536b95ed6d3336456f80d5d28695
+ size 431084
samples/comparison/cv15_16_baseline.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c76889754e1387567df26bb63faf355434990be9190e92cf2b2ec670c4abc9a
+ size 469484
samples/comparison/cv15_16_finetuned.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:049e7435b69864d1f27df2a8a98f1b95d40eee7645fd3d03190512e9380d67b6
+ size 358124
samples/comparison/cv15_2_baseline.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4cfd64fbc57efddba7cd9e5e13dd41f2a7843796112b59e36f6a3c5999109fc8
+ size 434924
samples/comparison/cv15_2_finetuned.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb8142ec3157d3d945e4896c215fca0e1031c520aaa03f8533f53b96f564eb8e
+ size 423404
samples/reference_finnish.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb5f8e26a7a88276e23bf73375be147269ba2cd6fe6fb38ac68e7b330d1fc03c
+ size 266156
setup.py ADDED
@@ -0,0 +1,201 @@
+ import os
+ import requests
+ import sys
+ import json
+ from tqdm import tqdm
+ from transformers import AutoTokenizer
+ from src.config import TrainConfig
+
+
+ DEST_DIR = "pretrained_models"
+
+ CHATTERBOX_TURBO_FILES = {
+     "ve.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/ve.safetensors?download=true",
+     "t3_turbo_v1.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/t3_turbo_v1.safetensors?download=true",
+     "s3gen_meanflow.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/s3gen_meanflow.safetensors?download=true",
+     "conds.pt": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/conds.pt?download=true",
+     "vocab.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/vocab.json?download=true",
+     "added_tokens.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/added_tokens.json?download=true",
+     "special_tokens_map.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/special_tokens_map.json?download=true",
+     "tokenizer_config.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/tokenizer_config.json?download=true",
+     "merges.txt": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/merges.txt?download=true",
+     "grapheme_mtl_merged_expanded_v1.json": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/grapheme_mtl_merged_expanded_v1.json?download=true"
+ }
+
+
+ CHATTERBOX_FILES = {
+     "ve.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/ve.safetensors?download=true",
+     "t3_cfg.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/t3_mtl23ls_v2.safetensors?download=true",
+     "s3gen.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/s3gen.safetensors?download=true",
+     "conds.pt": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/conds.pt?download=true",
+     "tokenizer.json": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/grapheme_mtl_merged_expanded_v1.json?download=true"
+ }
+
+ def download_file(url, dest_path):
+     """Downloads a file from a URL to a specific destination with a progress bar."""
+
+     if os.path.exists(dest_path):
+         print(f"File already exists: {dest_path}")
+         return
+
+     print(f"Downloading: {os.path.basename(dest_path)}...")
+
+     try:
+
+         response = requests.get(url, stream=True)
+         response.raise_for_status()
+
+         total_size = int(response.headers.get('content-length', 0))
+         block_size = 1024
+
+         with open(dest_path, 'wb') as file, tqdm(
+             desc=os.path.basename(dest_path),
+             total=total_size,
+             unit='iB',
+             unit_scale=True,
+             unit_divisor=1024,
+         ) as bar:
+
+             for data in response.iter_content(block_size):
+
+                 size = file.write(data)
+                 bar.update(size)
+
+         print(f"Download complete: {dest_path}\n")
+
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading {url}: {e}")
+         sys.exit(1)
+
+
+
+ def merge_and_save_turbo_tokenizer():
+     """
+     Combines the downloaded original GPT-2 tokenizer with our custom vocab
+     and overwrites the original files.
+     """
+     print("\n--- Turbo Vocab Merging Begins ---")
+
+     try:
+         base_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
+     except Exception as e:
+         print(f"ERROR: The original tokenizer could not be loaded. Did you download the files correctly? -> {e}")
+         return 0
+
+
+     initial_len = len(base_tokenizer)
+     print(f" Original Size: {initial_len}")
+
+
+     custom_vocab_path = os.path.join(DEST_DIR, "grapheme_mtl_merged_expanded_v1.json")
+
+     print(f"Loading: Custom Vocab ({custom_vocab_path})")
+
+     with open(custom_vocab_path, 'r', encoding='utf-8') as f:
+         custom_data = json.load(f)
+
+
+     if "model" in custom_data and "vocab" in custom_data["model"]:
+         vocab_dict = custom_data["model"]["vocab"]
+
+     else:
+         print("Warning: The custom vocab format does not match the expected structure.")
+         return 0
+
+     unique_tokens_to_add = list(vocab_dict.keys())
+     added_count = base_tokenizer.add_tokens(unique_tokens_to_add)
+     final_len = len(base_tokenizer)
+
+     print(f"Merging: {added_count} new tokens added.")
+     print(f" New Size: {final_len}")
+
+
+     print(f"Saving: Writing the combined tokenizer to the '{DEST_DIR}' folder...")
+     base_tokenizer.save_pretrained(DEST_DIR)
+
+     print("MERGE SUCCESSFUL!")
+
+     return final_len
+
+
+
+ def test_merge_tokenizer_process(tokenizer_path):
+
+     try:
+
+         tok = AutoTokenizer.from_pretrained(tokenizer_path)
+
+         print(f"--- RESULTS ---")
+         print(f"Folder: {tokenizer_path}")
+         print(f"Actual Vocab Size (len): {len(tok)}")
+
+         test_token = "[ta]"
+         test_id = tok.encode(test_token, add_special_tokens=False)
+
+         print(f"Test Token '{test_token}' ID: {test_id}")
+
+         if len(tok) > 50276:
+             print("SUCCESS! New tokens have been added.")
+
+         else:
+             print("ERROR: The vocabulary size still looks unchanged.")
+
+
+     except Exception as e:
+         print(f"Error: {e}")
+
+
+
+
+ def main():
+
+     print("--- Chatterbox Pretrained Model Setup ---\n")
+
+     # 1. Create the directory if it doesn't exist
+     if not os.path.exists(DEST_DIR):
+
+         print(f"Creating directory: {DEST_DIR}")
+         os.makedirs(DEST_DIR, exist_ok=True)
+
+     else:
+         print(f"Directory found: {DEST_DIR}")
+
+
+     cfg = TrainConfig()
+
+     if cfg.is_turbo:
+         print(f"Mode: CHATTERBOX-TURBO (Checking {len(CHATTERBOX_TURBO_FILES)} files)")
+         FILES_TO_DOWNLOAD = CHATTERBOX_TURBO_FILES
+
+     else:
+         print(f"Mode: CHATTERBOX-TTS (Checking {len(CHATTERBOX_FILES)} files)")
+         FILES_TO_DOWNLOAD = CHATTERBOX_FILES
+
+     # 2. Download files
+     for filename, url in FILES_TO_DOWNLOAD.items():
+         dest_path = os.path.join(DEST_DIR, filename)
+         download_file(url, dest_path)
+
+     if cfg.is_turbo:
+         new_vocab_size = merge_and_save_turbo_tokenizer()
+         if new_vocab_size > 0:
+
+             #test_merge_tokenizer_process(DEST_DIR)
+
+             print("\n" + "="*60)
+             print("INSTALLATION COMPLETE (CHATTERBOX-TURBO MODE)")
+             print("All models are set up in the 'pretrained_models/' folder.")
+             print(f"Please update the 'new_vocab_size' value in 'src/config.py'")
+             print(f"to: {new_vocab_size}")
+             print("="*60 + "\n")
+
+     else:
+         print("\nINSTALLATION COMPLETE (CHATTERBOX-TTS MODE)")
+         print("All models are set up in the 'pretrained_models/' folder.")
+         print(f"Note: 'grapheme_mtl_merged_expanded_v1.json' was saved as 'tokenizer.json' for the new vocabulary.")
+
+
+
+ if __name__ == "__main__":
+     main()
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (141 Bytes). View file
 
src/__pycache__/config.cpython-311.pyc ADDED
Binary file (2.07 kB). View file
 
src/__pycache__/dataset.cpython-311.pyc ADDED
Binary file (12.2 kB). View file
 
src/__pycache__/model.cpython-311.pyc ADDED
Binary file (8.21 kB). View file
 
src/__pycache__/preprocess_file_based.cpython-311.pyc ADDED
Binary file (6.65 kB). View file
 
src/__pycache__/preprocess_ljspeech.cpython-311.pyc ADDED
Binary file (6.65 kB). View file
 
src/__pycache__/utils.cpython-311.pyc ADDED
Binary file (5.41 kB). View file
 
src/chatterbox_/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .tts import ChatterboxTTS
+ from .vc import ChatterboxVC
+ from .mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES
src/chatterbox_/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (372 Bytes). View file
 
src/chatterbox_/__pycache__/mtl_tts.cpython-311.pyc ADDED
Binary file (14.9 kB). View file
 
src/chatterbox_/__pycache__/tts.cpython-311.pyc ADDED
Binary file (14.5 kB). View file
 
src/chatterbox_/__pycache__/tts_turbo.cpython-311.pyc ADDED
Binary file (15.5 kB). View file
 
src/chatterbox_/__pycache__/vc.cpython-311.pyc ADDED
Binary file (6.27 kB). View file
 
src/chatterbox_/models/__init__.py ADDED
File without changes
src/chatterbox_/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (160 Bytes). View file
 
src/chatterbox_/models/__pycache__/utils.cpython-311.pyc ADDED
Binary file (746 Bytes). View file
 
src/chatterbox_/models/s3gen/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .s3gen import S3Token2Wav as S3Gen
+ from .const import S3GEN_SR
src/chatterbox_/models/s3gen/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (282 Bytes). View file
 
src/chatterbox_/models/s3gen/__pycache__/configs.cpython-311.pyc ADDED
Binary file (442 Bytes). View file
 
src/chatterbox_/models/s3gen/__pycache__/const.cpython-311.pyc ADDED
Binary file (203 Bytes). View file
 
src/chatterbox_/models/s3gen/__pycache__/decoder.cpython-311.pyc ADDED
Binary file (17.6 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc ADDED
Binary file (2.69 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/flow.cpython-311.pyc ADDED
Binary file (10.1 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/flow_matching.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/hifigan.cpython-311.pyc ADDED
Binary file (26.3 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/s3gen.cpython-311.pyc ADDED
Binary file (15.4 kB). View file
 
src/chatterbox_/models/s3gen/__pycache__/xvector.cpython-311.pyc ADDED
Binary file (24 kB). View file
 
src/chatterbox_/models/s3gen/configs.py ADDED
@@ -0,0 +1,10 @@
+ from ..utils import AttrDict
+
+ CFM_PARAMS = AttrDict({
+     "sigma_min": 1e-06,
+     "solver": "euler",
+     "t_scheduler": "cosine",
+     "training_cfg_rate": 0.2,
+     "inference_cfg_rate": 0.7,
+     "reg_loss_type": "l1"
+ })
src/chatterbox_/models/s3gen/const.py ADDED
@@ -0,0 +1,2 @@
+ S3GEN_SR = 24000
+ S3GEN_SIL = 4299