Upload Finnish Chatterbox model
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .devcontainer/devcontainer.json +30 -0
- .gitattributes +8 -0
- README.md +114 -3
- attribution.csv +0 -0
- dataset_filtering_lineage.csv +0 -0
- generalization_comparison.png +0 -0
- generalization_comparison_filtered.png +3 -0
- inference_example.py +57 -0
- install_dependencies.sh +65 -0
- models/best_accuracy_cp795.safetensors +3 -0
- models/best_naturalness_cp1060.safetensors +3 -0
- requirements.txt +36 -0
- samples/comparison/cv15_11_baseline.wav +3 -0
- samples/comparison/cv15_11_finetuned.wav +3 -0
- samples/comparison/cv15_16_baseline.wav +3 -0
- samples/comparison/cv15_16_finetuned.wav +3 -0
- samples/comparison/cv15_2_baseline.wav +3 -0
- samples/comparison/cv15_2_finetuned.wav +3 -0
- samples/reference_finnish.wav +3 -0
- setup.py +201 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/config.cpython-311.pyc +0 -0
- src/__pycache__/dataset.cpython-311.pyc +0 -0
- src/__pycache__/model.cpython-311.pyc +0 -0
- src/__pycache__/preprocess_file_based.cpython-311.pyc +0 -0
- src/__pycache__/preprocess_ljspeech.cpython-311.pyc +0 -0
- src/__pycache__/utils.cpython-311.pyc +0 -0
- src/chatterbox_/__init__.py +3 -0
- src/chatterbox_/__pycache__/__init__.cpython-311.pyc +0 -0
- src/chatterbox_/__pycache__/mtl_tts.cpython-311.pyc +0 -0
- src/chatterbox_/__pycache__/tts.cpython-311.pyc +0 -0
- src/chatterbox_/__pycache__/tts_turbo.cpython-311.pyc +0 -0
- src/chatterbox_/__pycache__/vc.cpython-311.pyc +0 -0
- src/chatterbox_/models/__init__.py +0 -0
- src/chatterbox_/models/__pycache__/__init__.cpython-311.pyc +0 -0
- src/chatterbox_/models/__pycache__/utils.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__init__.py +2 -0
- src/chatterbox_/models/s3gen/__pycache__/__init__.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/configs.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/const.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/decoder.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/flow.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/flow_matching.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/hifigan.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/s3gen.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/__pycache__/xvector.cpython-311.pyc +0 -0
- src/chatterbox_/models/s3gen/configs.py +10 -0
- src/chatterbox_/models/s3gen/const.py +2 -0
.devcontainer/devcontainer.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Chatterbox A100 Optimized",
|
| 3 |
+
"image": "unsloth/unsloth:2025.10.1-pt2.8.0-cu12.8-llamacpp-integration",
|
| 4 |
+
|
| 5 |
+
"forwardPorts": [8888],
|
| 6 |
+
|
| 7 |
+
"containerEnv": {
|
| 8 |
+
"JUPYTER_PASSWORD": "MASKED_PASSWORD",
|
| 9 |
+
"USER_PASSWORD": "unsloth"
|
| 10 |
+
},
|
| 11 |
+
|
| 12 |
+
"runArgs": [
|
| 13 |
+
"--gpus=all",
|
| 14 |
+
"--shm-size=64gb"
|
| 15 |
+
],
|
| 16 |
+
|
| 17 |
+
"remoteUser": "root",
|
| 18 |
+
|
| 19 |
+
"customizations": {
|
| 20 |
+
"vscode": {
|
| 21 |
+
"extensions": [
|
| 22 |
+
"ms-python.python",
|
| 23 |
+
"ms-python.vscode-pylance",
|
| 24 |
+
"ms-toolsai.jupyter"
|
| 25 |
+
]
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
|
| 29 |
+
"postCreateCommand": "apt-get update && apt-get install -y git ffmpeg libsndfile1 && chmod -R 777 /workspaces && cd /workspaces/work/chatterbox-finetuning"
|
| 30 |
+
}
|
.gitattributes
CHANGED
|
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
generalization_comparison_filtered.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
samples/comparison/cv15_11_baseline.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
samples/comparison/cv15_11_finetuned.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
samples/comparison/cv15_16_baseline.wav filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
samples/comparison/cv15_16_finetuned.wav filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
samples/comparison/cv15_2_baseline.wav filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
samples/comparison/cv15_2_finetuned.wav filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
samples/reference_finnish.wav filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,114 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Chatterbox Finnish Fine-Tuning: High-Fidelity Zero-Shot TTS
|
| 2 |
+
|
| 3 |
+
This project focuses on fine-tuning the Chatterbox TTS model (based on the Llama architecture) specifically for the Finnish language. By leveraging a multilingual base and applying rigorous data quality filtering, we achieved near-perfect zero-shot generalization to unseen Finnish speakers.
|
| 4 |
+
|
| 5 |
+
## 🚀 Performance Comparison (Zero-Shot OOD)
|
| 6 |
+
|
| 7 |
+
The following metrics were calculated on **Out-of-Distribution (OOD)** speakers who were strictly excluded from the training and validation sets. This measures how well the model can speak Finnish in voices it has never heard before.
|
| 8 |
+
|
| 9 |
+
| Metric | Baseline (Original Multilingual) | Fine-Tuned (Best Step: 795) | Improvement |
|
| 10 |
+
| :--- | :---: | :---: | :---: |
|
| 11 |
+
| **Avg Word Error Rate (WER)** | 28.94% | **1.36%** | **~21x Lower Error Rate** |
|
| 12 |
+
| **Mean Opinion Score (MOS)** | 2.29 / 5.0 | **4.16 / 5.0** | **+1.87 Quality Points** |
|
| 13 |
+
|
| 14 |
+
*Note: MOS was evaluated using the Gemini 3 Flash API, and WER was calculated using Faster-Whisper Finnish Large v3.*
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## 🎧 Audio Comparison (OOD Speakers)
|
| 19 |
+
|
| 20 |
+
Listen to the difference between the generic multilingual baseline and our high-fidelity Finnish fine-tuning. These samples are from **Zero-Shot** speakers (never seen during training).
|
| 21 |
+
|
| 22 |
+
| Speaker ID | Baseline (Generic Multilingual) | Fine-Tuned (Finnish Golden) |
|
| 23 |
+
| :--- | :--- | :--- |
|
| 24 |
+
| **cv-15_11** | [Baseline Audio](samples/comparison/cv15_11_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_11_finetuned.wav) |
|
| 25 |
+
| **cv-15_16** | [Baseline Audio](samples/comparison/cv15_16_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_16_finetuned.wav) |
|
| 26 |
+
| **cv-15_2** | [Baseline Audio](samples/comparison/cv15_2_baseline.wav) | [Fine-Tuned Audio](samples/comparison/cv15_2_finetuned.wav) |
|
| 27 |
+
|
| 28 |
+
*The samples above use the same text and reference audio for a fair comparison.*
|
| 29 |
+
|
| 30 |
+
---
|
| 31 |
+
|
| 32 |
+
## 🛠 Data Processing & Transparency
|
| 33 |
+
|
| 34 |
+
We implemented a "Golden Data" strategy to ensure the model learned high-quality Finnish prosody without acoustic artifacts. After strict filtering, the final training set consists of **8,655 high-quality samples**.
|
| 35 |
+
|
| 36 |
+
### 1. Multi-Source Dataset Breakdown
|
| 37 |
+
The final dataset is a diverse mix of Finnish speech from the following sources:
|
| 38 |
+
- **Mozilla Common Voice (cv-15)**: 4,348 samples (Diverse crowdsourced voices)
|
| 39 |
+
- **Filmot**: 2,605 samples (Media-based Finnish)
|
| 40 |
+
- **YouTube**: 982 samples (Conversational modern Finnish)
|
| 41 |
+
- **Parliament**: 720 samples (Formal Finnish speech)
|
| 42 |
+
|
| 43 |
+
### 2. "Golden" Filtering Logic
|
| 44 |
+
To prevent the model from cloning background noise or learning from single-word clips, we applied the following strict filters in `src/dataset.py`:
|
| 45 |
+
- **Min Duration**: 4.0 seconds (ensures enough context for prosody).
|
| 46 |
+
- **Min SNR**: 35.0 dB (removes low-quality/noisy recordings).
|
| 47 |
+
- **Max SNR**: 100.0 dB (removes sterile/digital noise-gated artifacts).
|
| 48 |
+
|
| 49 |
+
### 3. Traceability & Lineage
|
| 50 |
+
Full lineage is maintained for every training run. The script automatically generates a `dataset_filtering_lineage.csv` in the output directory, detailing exactly which files were excluded and for what reason (`LOW_SNR`, `LOW_DURATION`, or `OOD_SPEAKER`).
|
| 51 |
+
|
| 52 |
+
## 💻 Hardware & Infrastructure
|
| 53 |
+
|
| 54 |
+
This training was performed on the **Verda platform** using an **NVIDIA A100 80GB** instance. This high-VRAM instance allowed us to use a larger batch size and 850ms speech sequences without hitting memory limits.
|
| 55 |
+
|
| 56 |
+
### .devcontainer Configuration
|
| 57 |
+
We have included the `.devcontainer` directory to ensure a reproducible environment. It pre-installs all necessary CUDA-optimized libraries and sets up the Jupyter environment for immediate experimentation.
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## 🔧 Installation & Setup
|
| 62 |
+
|
| 63 |
+
1. **Environment**: Ensure you have Python 3.10+ and CUDA-capable hardware.
|
| 64 |
+
2. **Setup**:
|
| 65 |
+
```bash
|
| 66 |
+
bash install_dependencies.sh
|
| 67 |
+
python setup.py # Downloads the multilingual base weights
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
## 🏃 Running Inference
|
| 73 |
+
|
| 74 |
+
To generate Finnish speech using the fine-tuned model:
|
| 75 |
+
|
| 76 |
+
```python
|
| 77 |
+
from src.chatterbox_.tts import ChatterboxTTS
|
| 78 |
+
|
| 79 |
+
# 1. Load the engine
|
| 80 |
+
engine = ChatterboxTTS.from_local("./pretrained_models", device="cuda")
|
| 81 |
+
|
| 82 |
+
# 2. Inject your best finetuned weights
|
| 83 |
+
# (Assuming your best weights are in chatterbox_output/checkpoint-795)
|
| 84 |
+
# engine.t3.load_state_dict(...)
|
| 85 |
+
|
| 86 |
+
# 3. Generate with Finnish-optimized parameters
|
| 87 |
+
wav = engine.generate(
|
| 88 |
+
text="Suomen kieli on poikkeuksellisen kaunista kuunneltavaa.",
|
| 89 |
+
audio_prompt_path="path/to/reference_voice.wav",
|
| 90 |
+
repetition_penalty=1.2,
|
| 91 |
+
temperature=0.8,
|
| 92 |
+
exaggeration=0.6
|
| 93 |
+
)
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
### Optimized Parameters for Finnish
|
| 97 |
+
Based on our research, we identified the following settings as the most stable for Finnish phonetics:
|
| 98 |
+
- `repetition_penalty`: 1.2
|
| 99 |
+
- `temperature`: 0.8
|
| 100 |
+
- `Repetition Guard`: Increased to **10 tokens** in `AlignmentStreamAnalyzer` to allow for long Finnish vowels without premature cutoffs.
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## 🛡 Repetition Guard Improvements
|
| 105 |
+
A critical fix was applied to `src/chatterbox_/models/t3/inference/alignment_stream_analyzer.py`. The original threshold for token repetition was too sensitive for Finnish (which relies on long vowels). It has been increased from 3 to **10 tokens (~160ms)**, allowing for natural linguistic duration while still preventing infinite generation loops.
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
## 🙏 Acknowledgments & Credits
|
| 110 |
+
|
| 111 |
+
- **Exploration Foundation**: Initial fine-tuning exploration was based on the [chatterbox-finetuning](https://github.com/gokhaneraslan/chatterbox-finetuning) toolkit by gokhaneraslan.
|
| 112 |
+
- **Model Authors**: Deep thanks to the team at **ResembleAI** for releasing the [Chatterbox TTS model](https://huggingface.co/ResembleAI/chatterbox).
|
| 113 |
+
- **Data Sourcing**: Special thanks to **#Jobik** at **Nordic AI** Discord for introducing [Filmot](https://filmot.com/), which was instrumental in sourcing high-quality media-based Finnish data.
|
| 114 |
+
|
attribution.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
dataset_filtering_lineage.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
generalization_comparison.png
ADDED
|
generalization_comparison_filtered.png
ADDED
|
Git LFS Details
|
inference_example.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import soundfile as sf
|
| 3 |
+
from src.chatterbox_.tts import ChatterboxTTS
|
| 4 |
+
from safetensors.torch import load_file
|
| 5 |
+
|
| 6 |
+
# ==============================================================================
|
| 7 |
+
# CONFIGURATION
|
| 8 |
+
# ==============================================================================
|
| 9 |
+
# Path to your preferred checkpoint (e.g., CP 795 for best accuracy)
|
| 10 |
+
FINE_TUNED_WEIGHTS = "./models/best_accuracy_cp795.safetensors"
|
| 11 |
+
|
| 12 |
+
# Text to synthesize
|
| 13 |
+
TEXT = "Suomen kieli on poikkeuksellisen kaunista kuunneltavaa varsinkin hienosti lausuttuna."
|
| 14 |
+
|
| 15 |
+
# Reference audio for voice cloning (3-10s recommended)
|
| 16 |
+
REFERENCE_AUDIO = "./samples/reference_finnish.wav"
|
| 17 |
+
|
| 18 |
+
# Output filename
|
| 19 |
+
OUTPUT_FILE = "inference_output.wav"
|
| 20 |
+
# ==============================================================================
|
| 21 |
+
|
| 22 |
+
def main():
    """Synthesize TEXT in the voice of REFERENCE_AUDIO with the fine-tuned T3 weights.

    Steps:
      1. Load the base engine from ``./pretrained_models``.
      2. Load FINE_TUNED_WEIGHTS and copy them into ``engine.t3``.
      3. Generate audio for TEXT, conditioned on REFERENCE_AUDIO.
      4. Write the waveform to OUTPUT_FILE at ``engine.sr``.

    Raises:
        RuntimeError: if none of the checkpoint keys match ``engine.t3``, in
            which case the output would silently be the base model's.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # 1. Load the base engine
    # Ensure you have run 'python setup.py' to download the base models first
    print("Loading base engine...")
    engine = ChatterboxTTS.from_local("./pretrained_models", device=device)

    # 2. Inject the fine-tuned weights
    print(f"Injecting fine-tuned weights from {FINE_TUNED_WEIGHTS}...")
    checkpoint_state = load_file(FINE_TUNED_WEIGHTS)

    # Strip "t3." prefix if present (added by the trainer wrapper)
    t3_state_dict = {k.removeprefix("t3."): v for k, v in checkpoint_state.items()}

    # strict=False tolerates partial checkpoints, but it also hides a checkpoint
    # whose keys do not line up with the model, so inspect what was applied.
    # NOTE(review): assumes engine.t3 is a standard torch.nn.Module whose
    # load_state_dict returns (missing_keys, unexpected_keys) - confirm.
    load_result = engine.t3.load_state_dict(t3_state_dict, strict=False)
    unexpected = list(load_result.unexpected_keys)
    missing = list(load_result.missing_keys)

    if t3_state_dict and len(unexpected) == len(t3_state_dict):
        raise RuntimeError(
            f"No tensor in {FINE_TUNED_WEIGHTS} matched the T3 model "
            f"(first key: {unexpected[0]!r}); the fine-tuned weights were not applied."
        )
    if unexpected:
        print(f"Warning: ignored {len(unexpected)} unexpected checkpoint key(s), e.g. {unexpected[0]!r}")
    if missing:
        print(f"Warning: {len(missing)} T3 parameter(s) kept their base values, e.g. {missing[0]!r}")

    engine.t3.eval()

    # 3. Generate Finnish audio
    print(f"Generating audio for text: '{TEXT[:50]}...'")
    wav_tensor = engine.generate(
        text=TEXT,
        audio_prompt_path=REFERENCE_AUDIO,
        repetition_penalty=1.2,
        temperature=0.8,
        exaggeration=0.6,
    )

    # 4. Save result
    # detach() keeps .numpy() valid even if the tensor is attached to a graph.
    wav_np = wav_tensor.detach().squeeze().cpu().numpy()
    sf.write(OUTPUT_FILE, wav_np, engine.sr)
    print(f"✓ Audio saved to {OUTPUT_FILE}")
|
| 54 |
+
|
| 55 |
+
if __name__ == "__main__":
|
| 56 |
+
main()
|
| 57 |
+
|
install_dependencies.sh
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Chatterbox Finetuning - Dependency Installation Script
# Installs the pinned PyTorch, xformers and torchao builds first, then the rest
# of requirements.txt, and finally prints the versions that were installed.

# Exit on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

echo "===================================="
echo "Chatterbox Finetuning Setup"
echo "===================================="

# Report the Python version. Asking the interpreter directly avoids `grep -P`,
# which is a GNU-only option.
PYTHON_VERSION=$(python -c 'import sys; print("{}.{}".format(*sys.version_info[:2]))')
echo "Python version: $PYTHON_VERSION"

# All installs go through `python -m pip` so packages land in the same
# interpreter that is reported above and verified at the end.

# Uninstall conflicting packages if they exist
echo ""
echo "Step 1: Removing conflicting packages..."
python -m pip uninstall -y torch torchvision torchaudio xformers flash-attn 2>/dev/null || true

# Install correct PyTorch version
echo ""
echo "Step 2: Installing PyTorch 2.5.1 with CUDA 12.4..."
python -m pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124

# Install xformers
echo ""
echo "Step 3: Installing xformers..."
python -m pip install xformers==0.0.28.post3 --index-url https://download.pytorch.org/whl/cu124

# Install torchao (compatible version)
echo ""
echo "Step 4: Installing torchao..."
python -m pip install torchao==0.6.1

# Install all other dependencies
echo ""
echo "Step 5: Installing remaining dependencies..."
python -m pip install -r requirements.txt

# Verify installation: importing each package fails the script if one is missing.
echo ""
echo "===================================="
echo "Verifying installation..."
echo "===================================="
python -c "
import torch
import xformers
import transformers
print(f'✓ PyTorch: {torch.__version__}')
print(f'✓ xformers: {xformers.__version__}')
print(f'✓ Transformers: {transformers.__version__}')
print(f'✓ CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'✓ CUDA version: {torch.version.cuda}')
"

echo ""
echo "===================================="
echo "Installation complete!"
echo "===================================="
echo ""
echo "Next steps:"
echo "1. Run: python setup.py (to download pretrained models)"
echo "2. Run: python train.py (to start training)"
echo ""
|
models/best_accuracy_cp795.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c0dd9023aba3e952c76435ce1e5e8f028dc7ad3ec1242f4df8dda377c058d0b
|
| 3 |
+
size 2143990656
|
models/best_naturalness_cp1060.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f086086d30eed54c4f42160b2f2b0c192aa0a8251c10b5a67e0eea2935ca985d
|
| 3 |
+
size 2143990656
|
requirements.txt
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core PyTorch - Using 2.5.1 for stable xformers/flash-attn support
|
| 2 |
+
--extra-index-url https://download.pytorch.org/whl/cu124
|
| 3 |
+
torch==2.5.1
|
| 4 |
+
torchaudio==2.5.1
|
| 5 |
+
torchvision==0.20.1
|
| 6 |
+
|
| 7 |
+
# Core dependencies with pinned versions for stability
|
| 8 |
+
transformers==4.46.3
|
| 9 |
+
xformers==0.0.28.post3
|
| 10 |
+
torchao==0.6.1
|
| 11 |
+
diffusers==0.29.0
|
| 12 |
+
peft==0.17.1
|
| 13 |
+
|
| 14 |
+
# Chatterbox TTS dependencies
|
| 15 |
+
# Note: chatterbox-tts itself is not installed from PyPI; the vendored copy under src/chatterbox_ is used
|
| 16 |
+
# to avoid strict torch==2.6.0 conflict
|
| 17 |
+
resemble-perth==1.0.1
|
| 18 |
+
conformer==0.3.2
|
| 19 |
+
s3tokenizer==0.3.0
|
| 20 |
+
|
| 21 |
+
# Audio processing
|
| 22 |
+
silero-vad==6.2.0
|
| 23 |
+
librosa==0.11.0
|
| 24 |
+
soundfile==0.13.1
|
| 25 |
+
pyloudnorm
|
| 26 |
+
|
| 27 |
+
# Utilities
|
| 28 |
+
num2words
|
| 29 |
+
ffmpeg-python
|
| 30 |
+
tqdm
|
| 31 |
+
pandas
|
| 32 |
+
safetensors
|
| 33 |
+
tensorboard
|
| 34 |
+
omegaconf
|
| 35 |
+
hf_transfer
|
| 36 |
+
gdown
|
samples/comparison/cv15_11_baseline.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c35b6067b981f318e0bd02c04e034c7ec0cb290f07ce117b7e7d6e49125d4c2
|
| 3 |
+
size 527084
|
samples/comparison/cv15_11_finetuned.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c095d7a386a0430e8c105cca160e35a5321536b95ed6d3336456f80d5d28695
|
| 3 |
+
size 431084
|
samples/comparison/cv15_16_baseline.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c76889754e1387567df26bb63faf355434990be9190e92cf2b2ec670c4abc9a
|
| 3 |
+
size 469484
|
samples/comparison/cv15_16_finetuned.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:049e7435b69864d1f27df2a8a98f1b95d40eee7645fd3d03190512e9380d67b6
|
| 3 |
+
size 358124
|
samples/comparison/cv15_2_baseline.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4cfd64fbc57efddba7cd9e5e13dd41f2a7843796112b59e36f6a3c5999109fc8
|
| 3 |
+
size 434924
|
samples/comparison/cv15_2_finetuned.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb8142ec3157d3d945e4896c215fca0e1031c520aaa03f8533f53b96f564eb8e
|
| 3 |
+
size 423404
|
samples/reference_finnish.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb5f8e26a7a88276e23bf73375be147269ba2cd6fe6fb38ac68e7b330d1fc03c
|
| 3 |
+
size 266156
|
setup.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import sys
|
| 4 |
+
import json
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
from transformers import AutoTokenizer
|
| 7 |
+
from src.config import TrainConfig
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
DEST_DIR = "pretrained_models"
|
| 11 |
+
|
| 12 |
+
CHATTERBOX_TURBO_FILES = {
|
| 13 |
+
"ve.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/ve.safetensors?download=true",
|
| 14 |
+
"t3_turbo_v1.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/t3_turbo_v1.safetensors?download=true",
|
| 15 |
+
"s3gen_meanflow.safetensors": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/s3gen_meanflow.safetensors?download=true",
|
| 16 |
+
"conds.pt": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/conds.pt?download=true",
|
| 17 |
+
"vocab.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/vocab.json?download=true",
|
| 18 |
+
"added_tokens.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/added_tokens.json?download=true",
|
| 19 |
+
"special_tokens_map.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/special_tokens_map.json?download=true",
|
| 20 |
+
"tokenizer_config.json": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/tokenizer_config.json?download=true",
|
| 21 |
+
"merges.txt": "https://huggingface.co/ResembleAI/chatterbox-turbo/resolve/main/merges.txt?download=true",
|
| 22 |
+
"grapheme_mtl_merged_expanded_v1.json": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/grapheme_mtl_merged_expanded_v1.json?download=true"
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
CHATTERBOX_FILES = {
|
| 27 |
+
"ve.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/ve.safetensors?download=true",
|
| 28 |
+
"t3_cfg.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/t3_mtl23ls_v2.safetensors?download=true",
|
| 29 |
+
"s3gen.safetensors": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/s3gen.safetensors?download=true",
|
| 30 |
+
"conds.pt": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/conds.pt?download=true",
|
| 31 |
+
"tokenizer.json": "https://huggingface.co/ResembleAI/chatterbox/resolve/main/grapheme_mtl_merged_expanded_v1.json?download=true"
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
def download_file(url, dest_path):
    """Download ``url`` to ``dest_path`` with a progress bar.

    The download is skipped when ``dest_path`` already exists. Data is streamed
    into ``dest_path + ".part"`` and only renamed to ``dest_path`` once the
    whole body has been written. An interrupted download therefore never
    leaves a truncated file that a later run would skip as "already exists".

    Args:
        url: HTTP(S) address of the file.
        dest_path: Final location of the downloaded file.

    Exits the process with status 1 when the request fails.
    """
    if os.path.exists(dest_path):
        print(f"File already exists: {dest_path}")
        return

    file_name = os.path.basename(dest_path)
    print(f"Downloading: {file_name}...")

    partial_path = dest_path + ".part"
    try:
        # (connect timeout, read timeout) so a stalled server cannot hang setup.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            # 1 MiB chunks: the files are multi-GB, 1 KiB chunks waste CPU.
            block_size = 1024 * 1024

            with open(partial_path, 'wb') as file, tqdm(
                desc=file_name,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in response.iter_content(block_size):
                    bar.update(file.write(data))

        # Promote the finished download to its final name in one step.
        os.replace(partial_path, dest_path)
        print(f"Download complete: {dest_path}\n")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        sys.exit(1)

    finally:
        # Only present if the download did not complete; remove the leftovers.
        if os.path.exists(partial_path):
            os.remove(partial_path)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def merge_and_save_turbo_tokenizer():
    """
    Extend the "gpt2-medium" tokenizer with the custom grapheme vocab and save it.

    The tokenizer is loaded with ``AutoTokenizer.from_pretrained("gpt2-medium")``.
    Every key of ``model.vocab`` in DEST_DIR/grapheme_mtl_merged_expanded_v1.json
    is added as a token, and the result is written into DEST_DIR with
    ``save_pretrained``.

    Returns the tokenizer length after merging, or 0 when the base tokenizer
    cannot be loaded or the JSON has no ``model.vocab`` section.
    """
    print("\n--- Turbo Vocab Merging Begins ---")

    try:
        tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
    except Exception as e:
        print(f"ERROR: The original tokenizer could not be loaded. Did you download the files correctly? -> {e}")
        return 0

    size_before = len(tokenizer)
    print(f" Original Size: {size_before}")

    vocab_file = os.path.join(DEST_DIR, "grapheme_mtl_merged_expanded_v1.json")
    print(f"Loading: Custom Vocab ({vocab_file})")

    with open(vocab_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Bail out early when the JSON does not have the expected model/vocab layout.
    if "model" not in payload or "vocab" not in payload["model"]:
        print("Warning: The custom VOCAB format may differ from what is expected.")
        return 0

    new_tokens = list(payload["model"]["vocab"].keys())
    num_added = tokenizer.add_tokens(new_tokens)
    size_after = len(tokenizer)

    print(f"Merging: {num_added} new token added.")
    print(f" New Dimension: {size_after}")

    print(f"Saving: Writing the combined tokenizer to the '{DEST_DIR}' folder...")
    tokenizer.save_pretrained(DEST_DIR)

    print("MERGER SUCCESSFUL!")

    return size_after
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def test_merge_tokenizer_process(tokenizer_path):
    """Load the tokenizer at ``tokenizer_path`` and print a short sanity report.

    Prints the folder, the tokenizer length, the ids produced for the probe
    token "[ta]", and whether the length exceeds 50276. Any exception is
    caught and printed instead of being raised.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        vocab_len = len(tokenizer)

        print("--- RESULTS ---")
        print(f"Folder: {tokenizer_path}")
        print(f"Actual Vocab Size (len): {vocab_len}")

        probe = "[ta]"
        probe_ids = tokenizer.encode(probe, add_special_tokens=False)
        print(f"Test Token '{probe}' ID: {probe_ids}")

        # A length above 50276 is taken as proof that new tokens are present.
        verdict = (
            "SUCCESS! New tokens have been added."
            if len(tokenizer) > 50276
            else "ERROR: The size still appears old."
        )
        print(verdict)

    except Exception as e:
        print(f"Error: {e}")
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def main():
    """Download the pretrained Chatterbox files selected by ``TrainConfig``.

    Creates DEST_DIR if needed, then downloads either CHATTERBOX_TURBO_FILES or
    CHATTERBOX_FILES depending on ``TrainConfig().is_turbo``. In turbo mode it
    also merges the custom vocab into the tokenizer and prints the resulting
    vocabulary size, which must be copied into ``src/config.py``.

    Exits with status 1 when the turbo tokenizer merge fails, so a broken setup
    is not mistaken for a successful one.
    """
    print("--- Chatterbox Pretrained Model Setup ---\n")

    # 1. Create the directory if it doesn't exist
    if not os.path.exists(DEST_DIR):
        print(f"Creating directory: {DEST_DIR}")
        os.makedirs(DEST_DIR, exist_ok=True)
    else:
        print(f"Directory found: {DEST_DIR}")

    cfg = TrainConfig()

    if cfg.is_turbo:
        print(f"Mode: CHATTERBOX-TURBO (Checking {len(CHATTERBOX_TURBO_FILES)} files)")
        files_to_download = CHATTERBOX_TURBO_FILES
    else:
        print(f"Mode: CHATTERBOX-TTS (Checking {len(CHATTERBOX_FILES)} files)")
        files_to_download = CHATTERBOX_FILES

    # 2. Download files
    for filename, url in files_to_download.items():
        download_file(url, os.path.join(DEST_DIR, filename))

    if not cfg.is_turbo:
        print("\nINSTALLATION COMPLETE (CHATTERBOX-TTS MODE)")
        print("All models are set up in 'pretrained_models/' folder.")
        print("Note: 'grapheme_mtl_merged_expanded_v1.json' was saved as 'tokenizer.json' for the new vocabulary.")
        return

    # 3. Turbo only: merge the custom vocab into the tokenizer.
    new_vocab_size = merge_and_save_turbo_tokenizer()
    if new_vocab_size <= 0:
        # The merge helper signals failure with 0 and has already printed why.
        print("ERROR: Turbo tokenizer merge failed; setup is incomplete.")
        sys.exit(1)

    print("\n" + "=" * 60)
    print("INSTALLATION COMPLETE (CHATTERBOX-TURBO MODE)")
    print("All models are set up in 'pretrained_models/' folder.")
    print("Please update the 'new_vocab_size' value in the 'src/config.py' file")
    print(f"to: {new_vocab_size}")
    print("=" * 60 + "\n")
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
if __name__ == "__main__":
|
| 201 |
+
main()
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (141 Bytes). View file
|
|
|
src/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (2.07 kB). View file
|
|
|
src/__pycache__/dataset.cpython-311.pyc
ADDED
|
Binary file (12.2 kB). View file
|
|
|
src/__pycache__/model.cpython-311.pyc
ADDED
|
Binary file (8.21 kB). View file
|
|
|
src/__pycache__/preprocess_file_based.cpython-311.pyc
ADDED
|
Binary file (6.65 kB). View file
|
|
|
src/__pycache__/preprocess_ljspeech.cpython-311.pyc
ADDED
|
Binary file (6.65 kB). View file
|
|
|
src/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (5.41 kB). View file
|
|
|
src/chatterbox_/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .tts import ChatterboxTTS
|
| 2 |
+
from .vc import ChatterboxVC
|
| 3 |
+
from .mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES
|
src/chatterbox_/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (372 Bytes). View file
|
|
|
src/chatterbox_/__pycache__/mtl_tts.cpython-311.pyc
ADDED
|
Binary file (14.9 kB). View file
|
|
|
src/chatterbox_/__pycache__/tts.cpython-311.pyc
ADDED
|
Binary file (14.5 kB). View file
|
|
|
src/chatterbox_/__pycache__/tts_turbo.cpython-311.pyc
ADDED
|
Binary file (15.5 kB). View file
|
|
|
src/chatterbox_/__pycache__/vc.cpython-311.pyc
ADDED
|
Binary file (6.27 kB). View file
|
|
|
src/chatterbox_/models/__init__.py
ADDED
|
File without changes
|
src/chatterbox_/models/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (160 Bytes). View file
|
|
|
src/chatterbox_/models/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (746 Bytes). View file
|
|
|
src/chatterbox_/models/s3gen/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .s3gen import S3Token2Wav as S3Gen
|
| 2 |
+
from .const import S3GEN_SR
|
src/chatterbox_/models/s3gen/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (282 Bytes). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/configs.cpython-311.pyc
ADDED
|
Binary file (442 Bytes). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/const.cpython-311.pyc
ADDED
|
Binary file (203 Bytes). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/decoder.cpython-311.pyc
ADDED
|
Binary file (17.6 kB). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc
ADDED
|
Binary file (2.69 kB). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/flow.cpython-311.pyc
ADDED
|
Binary file (10.1 kB). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/flow_matching.cpython-311.pyc
ADDED
|
Binary file (12.3 kB). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/hifigan.cpython-311.pyc
ADDED
|
Binary file (26.3 kB). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/s3gen.cpython-311.pyc
ADDED
|
Binary file (15.4 kB). View file
|
|
|
src/chatterbox_/models/s3gen/__pycache__/xvector.cpython-311.pyc
ADDED
|
Binary file (24 kB). View file
|
|
|
src/chatterbox_/models/s3gen/configs.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ..utils import AttrDict
|
| 2 |
+
|
| 3 |
+
CFM_PARAMS = AttrDict({
|
| 4 |
+
"sigma_min": 1e-06,
|
| 5 |
+
"solver": "euler",
|
| 6 |
+
"t_scheduler": "cosine",
|
| 7 |
+
"training_cfg_rate": 0.2,
|
| 8 |
+
"inference_cfg_rate": 0.7,
|
| 9 |
+
"reg_loss_type": "l1"
|
| 10 |
+
})
|
src/chatterbox_/models/s3gen/const.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
S3GEN_SR = 24000
|
| 2 |
+
S3GEN_SIL = 4299
|