atalink commited on
Commit
be0ba66
·
1 Parent(s): ce0b7dc
Files changed (41) hide show
  1. src/f5_tts/api.py +174 -0
  2. src/f5_tts/configs/E2TTS_Base_train.yaml +45 -0
  3. src/f5_tts/configs/E2TTS_Small_train.yaml +45 -0
  4. src/f5_tts/configs/F5TTS_Base_train.yaml +48 -0
  5. src/f5_tts/configs/F5TTS_Small_train.yaml +48 -0
  6. src/f5_tts/eval/README.md +52 -0
  7. src/f5_tts/eval/ecapa_tdnn.py +330 -0
  8. src/f5_tts/eval/eval_infer_batch.py +207 -0
  9. src/f5_tts/eval/eval_infer_batch.sh +13 -0
  10. src/f5_tts/eval/eval_librispeech_test_clean.py +96 -0
  11. src/f5_tts/eval/eval_seedtts_testset.py +95 -0
  12. src/f5_tts/eval/eval_utmos.py +44 -0
  13. src/f5_tts/eval/utils_eval.py +413 -0
  14. src/f5_tts/infer/README.md +199 -0
  15. src/f5_tts/infer/SHARED.md +164 -0
  16. src/f5_tts/infer/infer_cli.py +361 -0
  17. src/f5_tts/infer/infer_gradio.py +888 -0
  18. src/f5_tts/infer/speech_edit.py +201 -0
  19. src/f5_tts/infer/utils_infer.py +583 -0
  20. src/f5_tts/model/__init__.py +10 -0
  21. src/f5_tts/model/backbones/README.md +20 -0
  22. src/f5_tts/model/backbones/dit.py +177 -0
  23. src/f5_tts/model/backbones/mmdit.py +146 -0
  24. src/f5_tts/model/backbones/unett.py +219 -0
  25. src/f5_tts/model/cfm.py +282 -0
  26. src/f5_tts/model/dataset.py +327 -0
  27. src/f5_tts/model/modules.py +658 -0
  28. src/f5_tts/model/trainer.py +426 -0
  29. src/f5_tts/model/utils.py +200 -0
  30. src/f5_tts/scripts/count_max_epoch.py +33 -0
  31. src/f5_tts/scripts/count_params_gflops.py +39 -0
  32. src/f5_tts/socket_server.py +196 -0
  33. src/f5_tts/train/README.md +82 -0
  34. src/f5_tts/train/datasets/prepare_csv_wavs.py +284 -0
  35. src/f5_tts/train/datasets/prepare_emilia.py +230 -0
  36. src/f5_tts/train/datasets/prepare_libritts.py +97 -0
  37. src/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  38. src/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  39. src/f5_tts/train/finetune_cli.py +182 -0
  40. src/f5_tts/train/finetune_gradio.py +1889 -0
  41. src/f5_tts/train/train.py +76 -0
src/f5_tts/api.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import sys
3
+ from importlib.resources import files
4
+
5
+ import soundfile as sf
6
+ import tqdm
7
+ from cached_path import cached_path
8
+
9
+ from f5_tts.infer.utils_infer import (
10
+ hop_length,
11
+ infer_process,
12
+ load_model,
13
+ load_vocoder,
14
+ preprocess_ref_audio_text,
15
+ remove_silence_for_generated_wav,
16
+ save_spectrogram,
17
+ transcribe,
18
+ target_sample_rate,
19
+ )
20
+ from f5_tts.model import DiT, UNetT
21
+ from f5_tts.model.utils import seed_everything
22
+
23
+
24
class F5TTS:
    """Convenience wrapper that loads a TTS backbone plus a vocoder and synthesizes speech.

    Construction selects a device, loads the vocoder and then the acoustic model.
    `infer` runs reference-conditioned generation and can optionally write the
    waveform and the spectrogram to disk.
    """

    def __init__(
        self,
        model_type="F5-TTS",
        ckpt_file="",
        vocab_file="",
        ode_method="euler",
        use_ema=True,
        vocoder_name="vocos",
        local_path=None,
        device=None,
        hf_cache_dir=None,
    ):
        """Load the vocoder and the acoustic model.

        Args:
            model_type: "F5-TTS" (DiT backbone) or "E2-TTS" (UNetT backbone).
            ckpt_file: Checkpoint path; when empty, a default checkpoint is fetched via `cached_path`.
            vocab_file: Forwarded to `load_model`.
            ode_method: Forwarded to `load_model`.
            use_ema: Forwarded to `load_model`.
            vocoder_name: Vocoder to load; also stored as `mel_spec_type`.
            local_path: Local vocoder path; when not None the vocoder is loaded in local mode.
            device: Device string; auto-detected (cuda > xpu > mps > cpu) when None.
            hf_cache_dir: Cache directory forwarded to `cached_path` and `load_vocoder`.

        Raises:
            ValueError: If `model_type` is unknown, or no default checkpoint exists
                for the requested vocoder type.
        """
        # Initialize parameters
        self.final_wave = None
        self.target_sample_rate = target_sample_rate
        self.hop_length = hop_length
        self.seed = -1
        self.mel_spec_type = vocoder_name

        # Set device
        if device is not None:
            self.device = device
        else:
            import torch

            # `torch.xpu` / `torch.backends.mps` are missing on some PyTorch builds;
            # probe with getattr so auto-detection degrades to the next backend
            # instead of raising AttributeError.
            xpu = getattr(torch, "xpu", None)
            mps = getattr(torch.backends, "mps", None)
            if torch.cuda.is_available():
                self.device = "cuda"
            elif xpu is not None and xpu.is_available():
                self.device = "xpu"
            elif mps is not None and mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"

        # Load models
        self.load_vocoder_model(vocoder_name, local_path=local_path, hf_cache_dir=hf_cache_dir)
        self.load_ema_model(
            model_type, ckpt_file, vocoder_name, vocab_file, ode_method, use_ema, hf_cache_dir=hf_cache_dir
        )

    def load_vocoder_model(self, vocoder_name, local_path=None, hf_cache_dir=None):
        """Load the vocoder onto `self.device` and store it as `self.vocoder`.

        The vocoder is loaded in local mode exactly when `local_path` is not None.
        """
        self.vocoder = load_vocoder(vocoder_name, local_path is not None, local_path, self.device, hf_cache_dir)

    def load_ema_model(self, model_type, ckpt_file, mel_spec_type, vocab_file, ode_method, use_ema, hf_cache_dir=None):
        """Resolve backbone class, architecture and checkpoint, then store the model as `self.ema_model`.

        When `ckpt_file` is empty a default checkpoint is downloaded (or read from
        cache) through `cached_path`.

        Raises:
            ValueError: If `model_type` is unknown, or if "F5-TTS" is requested without
                a checkpoint and `mel_spec_type` has no default checkpoint.
        """
        if model_type == "F5-TTS":
            if not ckpt_file:
                if mel_spec_type == "vocos":
                    ckpt_file = str(
                        cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors", cache_dir=hf_cache_dir)
                    )
                elif mel_spec_type == "bigvgan":
                    ckpt_file = str(
                        cached_path("hf://SWivid/F5-TTS/F5TTS_Base_bigvgan/model_1250000.pt", cache_dir=hf_cache_dir)
                    )
                else:
                    # Previously the empty path fell through to `load_model` and failed
                    # there with an unrelated error; fail early and explicitly instead.
                    raise ValueError(f"No default F5-TTS checkpoint for mel_spec_type: {mel_spec_type}")
            model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
            model_cls = DiT
        elif model_type == "E2-TTS":
            if not ckpt_file:
                ckpt_file = str(
                    cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors", cache_dir=hf_cache_dir)
                )
            model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
            model_cls = UNetT
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        self.ema_model = load_model(
            model_cls, model_cfg, ckpt_file, mel_spec_type, vocab_file, ode_method, use_ema, self.device
        )

    def transcribe(self, ref_audio, language=None):
        """Return the result of the module-level `transcribe` helper for `ref_audio`."""
        return transcribe(ref_audio, language)

    def export_wav(self, wav, file_wave, remove_silence=False):
        """Write `wav` to `file_wave` at `self.target_sample_rate`, optionally stripping silence afterwards."""
        sf.write(file_wave, wav, self.target_sample_rate)

        if remove_silence:
            # Operates on the file that was just written.
            remove_silence_for_generated_wav(file_wave)

    def export_spectrogram(self, spect, file_spect):
        """Save `spect` to `file_spect` via `save_spectrogram`."""
        save_spectrogram(spect, file_spect)

    def infer(
        self,
        ref_file,
        ref_text,
        gen_text,
        show_info=print,
        progress=tqdm,
        target_rms=0.1,
        cross_fade_duration=0.15,
        sway_sampling_coef=-1,
        cfg_strength=2,
        nfe_step=32,
        speed=1.0,
        fix_duration=None,
        remove_silence=False,
        file_wave=None,
        file_spect=None,
        seed=-1,
    ):
        """Generate speech for `gen_text` conditioned on a reference recording.

        Args:
            ref_file: Reference audio, preprocessed by `preprocess_ref_audio_text`.
            ref_text: Reference transcript, preprocessed together with `ref_file`.
            gen_text: Text to synthesize.
            show_info, progress: Reporting hooks forwarded to `infer_process`.
            target_rms, cross_fade_duration, sway_sampling_coef, cfg_strength,
            nfe_step, speed, fix_duration: Forwarded unchanged to `infer_process`.
            remove_silence: Strip silence from the written wave file (only used when
                `file_wave` is given).
            file_wave: If not None, path the waveform is written to.
            file_spect: If not None, path the spectrogram is written to.
            seed: Random seed; -1 draws a fresh seed in [0, sys.maxsize].

        Returns:
            The `(wav, sr, spect)` tuple produced by `infer_process`. The seed that
            was actually used is stored in `self.seed`.
        """
        if seed == -1:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        self.seed = seed

        ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text, device=self.device)

        wav, sr, spect = infer_process(
            ref_file,
            ref_text,
            gen_text,
            self.ema_model,
            self.vocoder,
            self.mel_spec_type,
            show_info=show_info,
            progress=progress,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
            device=self.device,
        )

        if file_wave is not None:
            self.export_wav(wav, file_wave, remove_silence)

        if file_spect is not None:
            self.export_spectrogram(spect, file_spect)

        return wav, sr, spect
160
+
161
+
162
if __name__ == "__main__":
    # Demo: synthesize one English sample with the default model and write the
    # waveform and spectrogram to the repository's tests/ directory.
    f5tts = F5TTS()

    package_dir = files("f5_tts")
    reference_wav = str(package_dir.joinpath("infer/examples/basic/basic_ref_en.wav"))
    output_wav = str(package_dir.joinpath("../../tests/api_out.wav"))
    output_png = str(package_dir.joinpath("../../tests/api_out.png"))

    wav, sr, spect = f5tts.infer(
        ref_file=reference_wav,
        ref_text="some call me nature, others call me mother nature.",
        gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
        file_wave=output_wav,
        file_spect=output_png,
        seed=-1,  # random seed = -1
    )

    print("seed :", f5tts.seed)
src/f5_tts/configs/E2TTS_Base_train.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN # dataset name
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # "frame" or "sample"
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 15
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: E2TTS_Base
22
+ tokenizer: pinyin
23
+ tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
24
+ arch:
25
+ dim: 1024
26
+ depth: 24
27
+ heads: 16
28
+ ff_mult: 4
29
+ mel_spec:
30
+ target_sample_rate: 24000
31
+ n_mel_channels: 100
32
+ hop_length: 256
33
+ win_length: 1024
34
+ n_fft: 1024
35
+ mel_spec_type: vocos # 'vocos' or 'bigvgan'
36
+ vocoder:
37
+ is_local: False # use local offline ckpt or not
38
+ local_path: None # local vocoder path
39
+
40
+ ckpts:
41
+ logger: wandb # wandb | tensorboard | None
42
+ save_per_updates: 50000 # save checkpoint per updates
43
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
44
+ last_per_updates: 5000 # save last checkpoint per updates
45
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
src/f5_tts/configs/E2TTS_Small_train.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # "frame" or "sample"
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 15
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0
18
+ bnb_optimizer: False
19
+
20
+ model:
21
+ name: E2TTS_Small
22
+ tokenizer: pinyin
23
+ tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
24
+ arch:
25
+ dim: 768
26
+ depth: 20
27
+ heads: 12
28
+ ff_mult: 4
29
+ mel_spec:
30
+ target_sample_rate: 24000
31
+ n_mel_channels: 100
32
+ hop_length: 256
33
+ win_length: 1024
34
+ n_fft: 1024
35
+ mel_spec_type: vocos # 'vocos' or 'bigvgan'
36
+ vocoder:
37
+ is_local: False # use local offline ckpt or not
38
+ local_path: None # local vocoder path
39
+
40
+ ckpts:
41
+ logger: wandb # wandb | tensorboard | None
42
+ save_per_updates: 50000 # save checkpoint per updates
43
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
44
+ last_per_updates: 5000 # save last checkpoint per updates
45
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
src/f5_tts/configs/F5TTS_Base_train.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: vn_1000h # dataset name
7
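+ batch_size_per_gpu: 2000 # frames per GPU per batch (batch_size_type is "frame"); the other configs in this commit use 38400 (8 GPUs, 8 * 38400 = 307200)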
8
+ batch_size_type: frame # "frame" or "sample"
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 200
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: F5TTS_Base # model name
22
+ tokenizer: char # tokenizer type
23
+ tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
24
+ arch:
25
+ dim: 1024
26
+ depth: 22
27
+ heads: 16
28
+ ff_mult: 2
29
+ text_dim: 512
30
+ conv_layers: 4
31
+ checkpoint_activations: False # recompute activations and save memory for extra compute
32
+ mel_spec:
33
+ target_sample_rate: 24000
34
+ n_mel_channels: 100
35
+ hop_length: 256
36
+ win_length: 1024
37
+ n_fft: 1024
38
+ mel_spec_type: vocos # 'vocos' or 'bigvgan'
39
+ vocoder:
40
+ is_local: True # use local offline ckpt or not
41
+ local_path: /mnt/i/Project/F5-TTS/ckpts/vocos # local vocoder path
42
+
43
+ ckpts:
44
+ logger: tensorboard # wandb | tensorboard | None
45
+ save_per_updates: 30000 # save checkpoint per updates
46
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
47
+ last_per_updates: 5000 # save last checkpoint per updates
48
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
src/f5_tts/configs/F5TTS_Small_train.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # "frame" or "sample"
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 15
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: F5TTS_Small
22
+ tokenizer: pinyin
23
+ tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
24
+ arch:
25
+ dim: 768
26
+ depth: 18
27
+ heads: 12
28
+ ff_mult: 2
29
+ text_dim: 512
30
+ conv_layers: 4
31
+ checkpoint_activations: False # recompute activations and save memory for extra compute
32
+ mel_spec:
33
+ target_sample_rate: 24000
34
+ n_mel_channels: 100
35
+ hop_length: 256
36
+ win_length: 1024
37
+ n_fft: 1024
38
+ mel_spec_type: vocos # 'vocos' or 'bigvgan'
39
+ vocoder:
40
+ is_local: False # use local offline ckpt or not
41
+ local_path: None # local vocoder path
42
+
43
+ ckpts:
44
+ logger: wandb # wandb | tensorboard | None
45
+ save_per_updates: 50000 # save checkpoint per updates
46
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
47
+ last_per_updates: 5000 # save last checkpoint per updates
48
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
src/f5_tts/eval/README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Evaluation
3
+
4
+ Install packages for evaluation:
5
+
6
+ ```bash
7
+ pip install -e .[eval]
8
+ ```
9
+
10
+ ## Generating Samples for Evaluation
11
+
12
+ ### Prepare Test Datasets
13
+
14
+ 1. *Seed-TTS testset*: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
15
+ 2. *LibriSpeech test-clean*: Download from [OpenSLR](http://www.openslr.org/12/).
16
+ 3. Unzip the downloaded datasets and place them in the `data/` directory.
17
+ 4. Update the path for *LibriSpeech test-clean* data in `src/f5_tts/eval/eval_infer_batch.py`
18
+ 5. Our filtered LibriSpeech-PC 4-10s subset: `data/librispeech_pc_test_clean_cross_sentence.lst`
19
+
20
+ ### Batch Inference for Test Set
21
+
22
+ To run batch inference for evaluations, execute the following commands:
23
+
24
+ ```bash
25
+ # batch inference for evaluations
26
+ accelerate config # if not set before
27
+ bash src/f5_tts/eval/eval_infer_batch.sh
28
+ ```
29
+
30
+ ## Objective Evaluation on Generated Results
31
+
32
+ ### Download Evaluation Model Checkpoints
33
+
34
+ 1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
35
+ 2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
36
+ 3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).
37
+
38
+ Then update the following evaluation scripts with the paths where you placed the evaluation model checkpoints.
39
+
40
+ ### Objective Evaluation
41
+
42
+ Update the path with your batch-inferenced results, and carry out WER / SIM / UTMOS evaluations:
43
+ ```bash
44
+ # Evaluation [WER] for Seed-TTS test [ZH] set
45
+ python src/f5_tts/eval/eval_seedtts_testset.py --eval_task wer --lang zh --gen_wav_dir <GEN_WAV_DIR> --gpu_nums 8
46
+
47
+ # Evaluation [SIM] for LibriSpeech-PC test-clean (cross-sentence)
48
+ python src/f5_tts/eval/eval_librispeech_test_clean.py --eval_task sim --gen_wav_dir <GEN_WAV_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
49
+
50
+ # Evaluation [UTMOS]. --ext: Audio extension
51
+ python src/f5_tts/eval/eval_utmos.py --audio_dir <WAV_DIR> --ext wav
52
+ ```
src/f5_tts/eval/ecapa_tdnn.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # just for speaker similarity evaluation, third-party code
2
+
3
+ # From https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/
4
+ # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
5
+
6
+ import os
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+
12
+ """ Res2Conv1d + BatchNorm1d + ReLU
13
+ """
14
+
15
+
16
class Res2Conv1dReluBn(nn.Module):
    """
    Res2Net-style convolution stage with in_channels == out_channels == channels.

    The channel axis is cut into groups of `channels // scale` channels. The first
    `nums` groups each go through conv -> relu -> bn, every group after the first
    being added to the previous group's output beforehand. When `scale != 1` the
    last group is passed through untouched.
    """

    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale if scale == 1 else scale - 1

        # Build conv/bn pairs in lockstep, then register them as module lists.
        conv_layers, norm_layers = [], []
        for _ in range(self.nums):
            conv_layers.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
            norm_layers.append(nn.BatchNorm1d(self.width))
        self.convs = nn.ModuleList(conv_layers)
        self.bns = nn.ModuleList(norm_layers)

    def forward(self, x):
        groups = torch.split(x, self.width, 1)
        processed = []
        carry = None
        for idx, (conv, norm) in enumerate(zip(self.convs, self.bns)):
            # The first group is used as-is; later groups accumulate the previous output.
            carry = groups[idx] if carry is None else carry + groups[idx]
            # Order: conv -> relu -> bn
            carry = norm(F.relu(conv(carry)))
            processed.append(carry)
        if self.scale != 1:
            # Last group bypasses the convolutions.
            processed.append(groups[self.nums])
        return torch.cat(processed, dim=1)
53
+
54
+
55
+ """ Conv1d + BatchNorm1d + ReLU
56
+ """
57
+
58
+
59
class Conv1dReluBn(nn.Module):
    """Conv1d followed by ReLU and then BatchNorm1d (applied in that order)."""

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        activated = F.relu(self.conv(x))
        return self.bn(activated)
67
+
68
+
69
+ """ The SE connection of 1D case.
70
+ """
71
+
72
+
73
class SE_Connect(nn.Module):
    """Squeeze-and-excitation gate: rescales each channel by a sigmoid weight
    computed from the input's mean over the last axis."""

    def __init__(self, channels, se_bottleneck_dim=128):
        super().__init__()
        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
        self.linear2 = nn.Linear(se_bottleneck_dim, channels)

    def forward(self, x):
        squeezed = x.mean(dim=2)
        hidden = F.relu(self.linear1(squeezed))
        gate = torch.sigmoid(self.linear2(hidden))
        # Broadcast the per-channel gate back over the last axis.
        return x * gate.unsqueeze(2)
86
+
87
+
88
+ """ SE-Res2Block of the ECAPA-TDNN architecture.
89
+ """
90
+
91
+ # def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
92
+ # return nn.Sequential(
93
+ # Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
94
+ # Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
95
+ # Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
96
+ # SE_Connect(channels)
97
+ # )
98
+
99
+
100
class SE_Res2Block(nn.Module):
    """Residual block: Conv1dReluBn -> Res2Conv1dReluBn -> Conv1dReluBn -> SE_Connect.

    The block input is added to the output; when the channel counts differ the
    input is first projected with a kernel-size-1 convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
        super().__init__()
        self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
        self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)

        # A projection is needed only when the residual cannot be added directly.
        needs_projection = in_channels != out_channels
        self.shortcut = (
            nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1)
            if needs_projection
            else None
        )

    def forward(self, x):
        residual = x if self.shortcut is None else self.shortcut(x)

        out = x
        for stage in (self.Conv1dReluBn1, self.Res2Conv1dReluBn, self.Conv1dReluBn2, self.SE_Connect):
            out = stage(out)

        return out + residual
127
+
128
+
129
+ """ Attentive weighted mean and standard deviation pooling.
130
+ """
131
+
132
+
133
class AttentiveStatsPool(nn.Module):
    """Attention-weighted mean and standard deviation over the last axis.

    Returns the weighted mean and weighted std concatenated along dim 1, so the
    output has twice as many channels as the input.
    """

    def __init__(self, in_dim, attention_channels=128, global_context_att=False):
        super().__init__()
        self.global_context_att = global_context_att

        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
        # With global context the attention input is [x, mean, std], i.e. three times as wide.
        attention_in = in_dim * 3 if global_context_att else in_dim
        self.linear1 = nn.Conv1d(attention_in, attention_channels, kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        attention_input = x
        if self.global_context_att:
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
            attention_input = torch.cat((x, context_mean, context_std), dim=1)

        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        scores = self.linear2(torch.tanh(self.linear1(attention_input)))
        alpha = torch.softmax(scores, dim=2)

        mean = torch.sum(alpha * x, dim=2)
        second_moment = torch.sum(alpha * (x**2), dim=2)
        # Clamp before the square root so a tiny negative variance cannot produce NaN.
        std = torch.sqrt((second_moment - mean**2).clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)
161
+
162
+
163
class ECAPA_TDNN(nn.Module):
    """ECAPA-TDNN embedding network on top of a torch.hub (s3prl) feature extractor.

    Pipeline as implemented below: upstream features -> softmax-weighted sum over
    the selected feature layers -> instance norm -> Conv1dReluBn -> three
    SE_Res2Blocks -> concatenation of the three block outputs -> 1x1 conv + ReLU ->
    attentive statistics pooling -> batch norm -> linear projection to `emb_dim`.
    """

    def __init__(
        self,
        feat_dim=80,
        channels=512,
        emb_dim=192,
        global_context_att=False,
        feat_type="wavlm_large",
        sr=16000,
        feature_selection="hidden_states",
        update_extract=False,
        config_path=None,
    ):
        """Build the feature extractor and the ECAPA-TDNN layers.

        Args:
            feat_dim: Channel count of the features fed to the first conv layer.
            channels: Width of the first four stages.
            emb_dim: Size of the returned embedding.
            global_context_att: Forwarded to `AttentiveStatsPool`.
            feat_type: torch.hub entry point name of the upstream feature extractor.
            sr: Length of the dummy waveform used to probe the extractor.
            feature_selection: Key used to pick features from the extractor output.
            update_extract: When False, all extractor parameters are frozen.
            config_path: Forwarded to the local torch.hub load only.
        """
        super().__init__()

        self.feat_type = feat_type
        self.feature_selection = feature_selection
        self.update_extract = update_extract
        self.sr = sr

        # Disable torch.hub's fork validation (private hook, patched process-wide).
        torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
        try:
            local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
            self.feature_extract = torch.hub.load(local_s3prl_path, feat_type, source="local", config_path=config_path)
        except Exception:
            # Best-effort fallback to the remote repo. `Exception` (not a bare except)
            # so Ctrl-C / SystemExit are not swallowed and turned into a download.
            self.feature_extract = torch.hub.load("s3prl/s3prl", feat_type)

        # For 24-layer encoders, switch off fp32 attention on layers 23 and 11 when the flag exists.
        encoder_layers = self.feature_extract.model.encoder.layers
        if len(encoder_layers) == 24:
            for layer_idx in (23, 11):
                attn = encoder_layers[layer_idx].self_attn
                if hasattr(attn, "fp32_attention"):
                    attn.fp32_attention = False

        self.feat_num = self.get_feat_num()
        # One learnable weight per feature layer; softmax-normalized in `get_feat`.
        self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))

        if feat_type not in ("fbank", "mfcc"):
            # Always freeze these extractor parameters, regardless of `update_extract`.
            freeze_list = ["final_proj", "label_embs_concat", "mask_emb", "project_q", "quantizer"]
            for name, param in self.feature_extract.named_parameters():
                if any(freeze_val in name for freeze_val in freeze_list):
                    param.requires_grad = False

        if not self.update_extract:
            for param in self.feature_extract.parameters():
                param.requires_grad = False

        self.instance_norm = nn.InstanceNorm1d(feat_dim)
        # self.channels = [channels] * 4 + [channels * 3]
        self.channels = [channels] * 4 + [1536]

        self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
        self.layer2 = SE_Res2Block(
            self.channels[0],
            self.channels[1],
            kernel_size=3,
            stride=1,
            padding=2,
            dilation=2,
            scale=8,
            se_bottleneck_dim=128,
        )
        self.layer3 = SE_Res2Block(
            self.channels[1],
            self.channels[2],
            kernel_size=3,
            stride=1,
            padding=3,
            dilation=3,
            scale=8,
            se_bottleneck_dim=128,
        )
        self.layer4 = SE_Res2Block(
            self.channels[2],
            self.channels[3],
            kernel_size=3,
            stride=1,
            padding=4,
            dilation=4,
            scale=8,
            se_bottleneck_dim=128,
        )

        # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
        # layer2..layer4 outputs are concatenated in `forward`, hence three times `channels`.
        cat_channels = channels * 3
        self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
        self.pooling = AttentiveStatsPool(
            self.channels[-1], attention_channels=128, global_context_att=global_context_att
        )
        # Pooling returns mean and std concatenated, so twice the channel count.
        self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
        self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)

    def get_feat_num(self):
        """Probe the extractor with a random waveform and return how many feature layers it yields.

        Puts the extractor in eval mode. Returns 1 when the selected feature is not
        a list or tuple.
        """
        self.feature_extract.eval()
        wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
        with torch.no_grad():
            features = self.feature_extract(wav)
        select_feature = features[self.feature_selection]
        if isinstance(select_feature, (list, tuple)):
            return len(select_feature)
        else:
            return 1

    def get_feat(self, x):
        """Run the upstream extractor on `x` and return instance-normalized features."""
        if self.update_extract:
            x = self.feature_extract(list(x))
        else:
            # Extractor is frozen: skip autograd bookkeeping.
            with torch.no_grad():
                if self.feat_type == "fbank" or self.feat_type == "mfcc":
                    x = self.feature_extract(x) + 1e-6  # B x feat_dim x time_len
                else:
                    x = self.feature_extract(list(x))

        if self.feat_type == "fbank":
            x = x.log()

        if self.feat_type != "fbank" and self.feat_type != "mfcc":
            x = x[self.feature_selection]
            if isinstance(x, (list, tuple)):
                x = torch.stack(x, dim=0)
            else:
                x = x.unsqueeze(0)
            # Softmax-normalized layer weights, broadcast over the three trailing axes.
            norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
            x = (norm_weights * x).sum(dim=0)
            x = torch.transpose(x, 1, 2) + 1e-6

        x = self.instance_norm(x)
        return x

    def forward(self, x):
        """Return the `emb_dim`-sized embedding computed from input `x`."""
        x = self.get_feat(x)

        out1 = self.layer1(x)
        out2 = self.layer2(out1)
        out3 = self.layer3(out2)
        out4 = self.layer4(out3)

        # Aggregate the outputs of the three SE-Res2 blocks along the channel axis.
        out = torch.cat([out2, out3, out4], dim=1)
        out = F.relu(self.conv(out))
        out = self.bn(self.pooling(out))
        out = self.linear(out)

        return out
310
+
311
+
312
def ECAPA_TDNN_SMALL(
    feat_dim,
    emb_dim=256,
    feat_type="wavlm_large",
    sr=16000,
    feature_selection="hidden_states",
    update_extract=False,
    config_path=None,
):
    """Build an `ECAPA_TDNN` with `channels` fixed at 512; all other arguments pass through."""
    model_kwargs = dict(
        feat_dim=feat_dim,
        channels=512,
        emb_dim=emb_dim,
        feat_type=feat_type,
        sr=sr,
        feature_selection=feature_selection,
        update_extract=update_extract,
        config_path=config_path,
    )
    return ECAPA_TDNN(**model_kwargs)
src/f5_tts/eval/eval_infer_batch.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.getcwd())
5
+
6
+ import argparse
7
+ import time
8
+ from importlib.resources import files
9
+
10
+ import torch
11
+ import torchaudio
12
+ from accelerate import Accelerator
13
+ from tqdm import tqdm
14
+
15
+ from f5_tts.eval.utils_eval import (
16
+ get_inference_prompt,
17
+ get_librispeech_test_clean_metainfo,
18
+ get_seedtts_testset_metainfo,
19
+ )
20
+ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
21
+ from f5_tts.model import CFM, DiT, UNetT
22
+ from f5_tts.model.utils import get_tokenizer
23
+
24
+ accelerator = Accelerator()
25
+ device = f"cuda:{accelerator.process_index}"  # CUDA device index taken from this process's accelerate index
26
+
27
+
28
+ # --------------------- Dataset Settings -------------------- #
29
+
30
+ target_sample_rate = 24000  # presumably Hz - TODO confirm
31
+ n_mel_channels = 100
32
+ hop_length = 256
33
+ win_length = 1024
34
+ n_fft = 1024
35
+ target_rms = 0.1
36
+
37
+ rel_path = str(files("f5_tts").joinpath("../../"))  # two levels above the f5_tts package dir; base for ckpts/, data/ and results/ paths in main()
38
+
39
+
40
def main():
    """Generate wavs for an evaluation test set with a trained checkpoint.

    Parses the command-line options, builds the inference prompts for the chosen
    test set, loads the vocoder and the CFM model, samples every prompt batch
    (split between the accelerate processes) and saves one wav per utterance
    under ``results/`` relative to the repository root.

    Raises:
        ValueError: if ``--expname`` or ``--testset`` is not a supported value.
    """
    # ---------------------- infer setting ---------------------- #

    parser = argparse.ArgumentParser(description="batch inference")

    parser.add_argument("-s", "--seed", default=None, type=int)
    parser.add_argument("-d", "--dataset", default="Emilia_ZH_EN")
    parser.add_argument("-n", "--expname", required=True)
    parser.add_argument("-c", "--ckptstep", default=1200000, type=int)
    parser.add_argument("-m", "--mel_spec_type", default="vocos", type=str, choices=["bigvgan", "vocos"])
    parser.add_argument("-to", "--tokenizer", default="pinyin", type=str, choices=["pinyin", "char"])

    parser.add_argument("-nfe", "--nfestep", default=32, type=int)
    parser.add_argument("-o", "--odemethod", default="euler")
    parser.add_argument("-ss", "--swaysampling", default=-1, type=float)

    parser.add_argument("-t", "--testset", required=True)

    args = parser.parse_args()

    seed = args.seed
    dataset_name = args.dataset
    exp_name = args.expname
    ckpt_step = args.ckptstep
    ckpt_path = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}.pt"
    mel_spec_type = args.mel_spec_type
    tokenizer = args.tokenizer

    nfe_step = args.nfestep
    ode_method = args.odemethod
    sway_sampling_coef = args.swaysampling

    testset = args.testset

    infer_batch_size = 1  # max frames. 1 for ddp single inference (recommended)
    cfg_strength = 2.0
    speed = 1.0
    use_truth_duration = False
    no_ref_audio = False

    if exp_name == "F5TTS_Base":
        model_cls = DiT
        model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)

    elif exp_name == "E2TTS_Base":
        model_cls = UNetT
        model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)

    else:
        # Fail early instead of hitting a NameError on model_cls further down.
        raise ValueError(f"Unsupported expname '{exp_name}', expected 'F5TTS_Base' or 'E2TTS_Base'.")

    if testset == "ls_pc_test_clean":
        metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
        librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
        metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)

    elif testset == "seedtts_test_zh":
        metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
        metainfo = get_seedtts_testset_metainfo(metalst)

    elif testset == "seedtts_test_en":
        metalst = rel_path + "/data/seedtts_testset/en/meta.lst"
        metainfo = get_seedtts_testset_metainfo(metalst)

    else:
        # Fail early instead of hitting a NameError on metainfo further down.
        raise ValueError(
            f"Unsupported testset '{testset}', expected 'ls_pc_test_clean', 'seedtts_test_zh' or 'seedtts_test_en'."
        )

    # path to save generated wavs
    output_dir = (
        f"{rel_path}/"
        f"results/{exp_name}_{ckpt_step}/{testset}/"
        f"seed{seed}_{ode_method}_nfe{nfe_step}_{mel_spec_type}"
        f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"
        f"_cfg{cfg_strength}_speed{speed}"
        f"{'_gt-dur' if use_truth_duration else ''}"
        f"{'_no-ref-audio' if no_ref_audio else ''}"
    )

    # -------------------------------------------------#

    use_ema = True

    prompts_all = get_inference_prompt(
        metainfo,
        speed=speed,
        tokenizer=tokenizer,
        target_sample_rate=target_sample_rate,
        n_mel_channels=n_mel_channels,
        hop_length=hop_length,
        mel_spec_type=mel_spec_type,
        target_rms=target_rms,
        use_truth_duration=use_truth_duration,
        infer_batch_size=infer_batch_size,
    )

    # Vocoder model
    local = False
    if mel_spec_type == "vocos":
        vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
    elif mel_spec_type == "bigvgan":
        vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
    vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)

    # Tokenizer
    vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)

    # Model
    model = CFM(
        transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
        mel_spec_kwargs=dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=n_mel_channels,
            target_sample_rate=target_sample_rate,
            mel_spec_type=mel_spec_type,
        ),
        odeint_kwargs=dict(
            method=ode_method,
        ),
        vocab_char_map=vocab_char_map,
    ).to(device)

    dtype = torch.float32 if mel_spec_type == "bigvgan" else None
    model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)

    # Only the main process creates the directory; exist_ok avoids a check-then-create race.
    if accelerator.is_main_process:
        os.makedirs(output_dir, exist_ok=True)

    # start batch inference
    accelerator.wait_for_everyone()
    start = time.time()

    with accelerator.split_between_processes(prompts_all) as prompts:
        for prompt in tqdm(prompts, disable=not accelerator.is_local_main_process):
            utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = prompt
            ref_mels = ref_mels.to(device)
            ref_mel_lens = torch.tensor(ref_mel_lens, dtype=torch.long).to(device)
            total_mel_lens = torch.tensor(total_mel_lens, dtype=torch.long).to(device)

            # Inference
            with torch.inference_mode():
                generated, _ = model.sample(
                    cond=ref_mels,
                    text=final_text_list,
                    duration=total_mel_lens,
                    lens=ref_mel_lens,
                    steps=nfe_step,
                    cfg_strength=cfg_strength,
                    sway_sampling_coef=sway_sampling_coef,
                    no_ref_audio=no_ref_audio,
                    seed=seed,
                )
                # Final result
                for i, gen in enumerate(generated):
                    # Keep only the frames after the reference prompt, up to the target length.
                    gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
                    gen_mel_spec = gen.permute(0, 2, 1).to(torch.float32)
                    if mel_spec_type == "vocos":
                        generated_wave = vocoder.decode(gen_mel_spec).cpu()
                    elif mel_spec_type == "bigvgan":
                        generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()

                    # Prompts quieter than target_rms were amplified upstream; scale the output back.
                    if ref_rms_list[i] < target_rms:
                        generated_wave = generated_wave * ref_rms_list[i] / target_rms
                    torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave, target_sample_rate)

    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        timediff = time.time() - start
        print(f"Done batch inference in {timediff / 60 :.2f} minutes.")
204
+
205
+
206
+ if __name__ == "__main__":
207
+ main()
src/f5_tts/eval/eval_infer_batch.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # e.g. F5-TTS, 16 NFE
4
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_zh" -nfe 16
5
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_en" -nfe 16
6
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "ls_pc_test_clean" -nfe 16
7
+
8
+ # e.g. Vanilla E2 TTS, 32 NFE
9
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_zh" -o "midpoint" -ss 0
10
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_en" -o "midpoint" -ss 0
11
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "ls_pc_test_clean" -o "midpoint" -ss 0
12
+
13
+ # etc.
src/f5_tts/eval/eval_librispeech_test_clean.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluate with Librispeech test-clean, ~3s prompt to generate 4-10s audio (the way of valle/voicebox evaluation)
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ sys.path.append(os.getcwd())
9
+
10
+ import multiprocessing as mp
11
+ from importlib.resources import files
12
+
13
+ import numpy as np
14
+ from f5_tts.eval.utils_eval import (
15
+ get_librispeech_test,
16
+ run_asr_wer,
17
+ run_sim,
18
+ )
19
+
20
+ rel_path = str(files("f5_tts").joinpath("../../"))
21
+
22
+
23
def get_args(argv=None):
    """Parse the command-line options of the LibriSpeech test-clean evaluation.

    Args:
        argv: optional list of argument strings; when ``None`` (the default)
            the arguments are taken from ``sys.argv`` as before.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
    parser.add_argument("-l", "--lang", type=str, default="en")
    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
    parser.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
    return parser.parse_args(argv)
32
+
33
+
34
def main():
    """Evaluate generated wavs on LibriSpeech test-clean with WER or speaker similarity.

    Splits the test set across ``--gpu_nums`` worker processes, runs the selected
    task (``wer`` or ``sim``) and prints the averaged score. For WER the per-sample
    results are also written as JSON lines into the generated-wav directory.

    Raises:
        RuntimeError: if the test set contains no samples to evaluate.
    """
    args = get_args()
    eval_task = args.eval_task
    lang = args.lang
    librispeech_test_clean_path = args.librispeech_test_clean_path  # test-clean path
    gen_wav_dir = args.gen_wav_dir
    metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"

    gpus = list(range(args.gpu_nums))
    test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)

    ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
    ## leading to a low similarity for the ground truth in some cases.
    # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth

    # Averaging over zero samples would yield nan (WER) or a ZeroDivisionError (SIM).
    if not any(sub_test_set for _, sub_test_set in test_set):
        raise RuntimeError(f"No samples to evaluate for {gen_wav_dir}")

    local = args.local
    if local:  # use local custom checkpoint dir
        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
    else:
        asr_ckpt_dir = ""  # auto download to cache dir
    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"

    # --------------------------- WER ---------------------------

    if eval_task == "wer":
        wer_results = []
        wers = []

        with mp.Pool(processes=len(gpus)) as pool:
            # Named job_args so the parsed CLI namespace `args` is not overwritten.
            job_args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_asr_wer, job_args)
            for r in results:
                wer_results.extend(r)

        wer_result_path = f"{gen_wav_dir}/{lang}_wer_results.jsonl"
        with open(wer_result_path, "w") as f:
            for line in wer_results:
                wers.append(line["wer"])
                json_line = json.dumps(line, ensure_ascii=False)
                f.write(json_line + "\n")

        wer = round(np.mean(wers) * 100, 3)
        print(f"\nTotal {len(wers)} samples")
        print(f"WER      : {wer}%")
        print(f"Results have been saved to {wer_result_path}")

    # --------------------------- SIM ---------------------------

    if eval_task == "sim":
        sims = []
        with mp.Pool(processes=len(gpus)) as pool:
            job_args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_sim, job_args)
            for r in results:
                sims.extend(r)

        sim = round(sum(sims) / len(sims), 3)
        print(f"\nTotal {len(sims)} samples")
        print(f"SIM      : {sim}")
93
+
94
+
95
+ if __name__ == "__main__":
96
+ main()
src/f5_tts/eval/eval_seedtts_testset.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluate with Seed-TTS testset
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ sys.path.append(os.getcwd())
9
+
10
+ import multiprocessing as mp
11
+ from importlib.resources import files
12
+
13
+ import numpy as np
14
+ from f5_tts.eval.utils_eval import (
15
+ get_seed_tts_test,
16
+ run_asr_wer,
17
+ run_sim,
18
+ )
19
+
20
+ rel_path = str(files("f5_tts").joinpath("../../"))
21
+
22
+
23
def get_args(argv=None):
    """Parse the command-line options of the Seed-TTS test set evaluation.

    Args:
        argv: optional list of argument strings; when ``None`` (the default)
            the arguments are taken from ``sys.argv`` as before.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
    parser.add_argument("-l", "--lang", type=str, default="en", choices=["zh", "en"])
    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
    return parser.parse_args(argv)
31
+
32
+
33
def main():
    """Evaluate generated wavs on the Seed-TTS test set with WER or speaker similarity.

    Splits the test set across ``--gpu_nums`` worker processes, runs the selected
    task (``wer`` or ``sim``) and prints the averaged score. For WER the per-sample
    results are also written as JSON lines into the generated-wav directory.

    Raises:
        RuntimeError: if no generated wav of the test set was found.
    """
    args = get_args()
    eval_task = args.eval_task
    lang = args.lang
    gen_wav_dir = args.gen_wav_dir
    metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset

    # NOTE. paraformer-zh result will be slightly different according to the number of gpus, cuz batchsize is different
    #       zh 1.254 seems a result of 4 workers wer_seed_tts
    gpus = list(range(args.gpu_nums))
    test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)

    # Missing wavs are skipped when building the test set, so it can end up empty;
    # averaging over zero samples would yield nan (WER) or a ZeroDivisionError (SIM).
    if not any(sub_test_set for _, sub_test_set in test_set):
        raise RuntimeError(f"No samples to evaluate for {gen_wav_dir}")

    local = args.local
    if local:  # use local custom checkpoint dir
        if lang == "zh":
            asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
        elif lang == "en":
            asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
    else:
        asr_ckpt_dir = ""  # auto download to cache dir
    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"

    # --------------------------- WER ---------------------------

    if eval_task == "wer":
        wer_results = []
        wers = []

        with mp.Pool(processes=len(gpus)) as pool:
            # Named job_args so the parsed CLI namespace `args` is not overwritten.
            job_args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_asr_wer, job_args)
            for r in results:
                wer_results.extend(r)

        wer_result_path = f"{gen_wav_dir}/{lang}_wer_results.jsonl"
        with open(wer_result_path, "w") as f:
            for line in wer_results:
                wers.append(line["wer"])
                json_line = json.dumps(line, ensure_ascii=False)
                f.write(json_line + "\n")

        wer = round(np.mean(wers) * 100, 3)
        print(f"\nTotal {len(wers)} samples")
        print(f"WER      : {wer}%")
        print(f"Results have been saved to {wer_result_path}")

    # --------------------------- SIM ---------------------------

    if eval_task == "sim":
        sims = []
        with mp.Pool(processes=len(gpus)) as pool:
            job_args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_sim, job_args)
            for r in results:
                sims.extend(r)

        sim = round(sum(sims) / len(sims), 3)
        print(f"\nTotal {len(sims)} samples")
        print(f"SIM      : {sim}")
92
+
93
+
94
+ if __name__ == "__main__":
95
+ main()
src/f5_tts/eval/eval_utmos.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import librosa
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+
10
def main():
    """Score every audio file under a directory with UTMOS and save the results.

    Loads the ``utmos22_strong`` predictor through ``torch.hub``, scores each
    file matching ``*.<ext>`` (searched recursively), prints the average score
    and writes the per-file scores to ``utmos_results.json`` in the audio directory.
    """
    parser = argparse.ArgumentParser(description="UTMOS Evaluation")
    parser.add_argument("--audio_dir", type=str, required=True, help="Audio file path.")
    parser.add_argument("--ext", type=str, default="wav", help="Audio extension.")
    args = parser.parse_args()

    # torch.xpu does not exist on every torch build, so probe for it before calling into it.
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        device = "xpu"
    else:
        device = "cpu"

    predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    predictor = predictor.to(device)

    audio_paths = list(Path(args.audio_dir).rglob(f"*.{args.ext}"))
    utmos_results = {}
    utmos_score = 0

    for audio_path in tqdm(audio_paths, desc="Processing"):
        wav_name = audio_path.stem
        wav, sr = librosa.load(audio_path, sr=None, mono=True)
        wav_tensor = torch.from_numpy(wav).to(device).unsqueeze(0)
        # Pure scoring: no gradients are needed.
        with torch.no_grad():
            score = predictor(wav_tensor, sr).item()
        utmos_results[str(wav_name)] = score
        utmos_score += score

    avg_score = utmos_score / len(audio_paths) if len(audio_paths) > 0 else 0
    print(f"UTMOS: {avg_score}")

    utmos_result_path = Path(args.audio_dir) / "utmos_results.json"
    with open(utmos_result_path, "w", encoding="utf-8") as f:
        json.dump(utmos_results, f, ensure_ascii=False, indent=4)

    print(f"Results have been saved to {utmos_result_path}")
41
+
42
+
43
+ if __name__ == "__main__":
44
+ main()
src/f5_tts/eval/utils_eval.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import random
4
+ import string
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torchaudio
10
+ from tqdm import tqdm
11
+
12
+ from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
13
+ from f5_tts.model.modules import MelSpec
14
+ from f5_tts.model.utils import convert_char_to_pinyin
15
+
16
+
17
+ # seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
18
def get_seedtts_testset_metainfo(metalst):
    """Read a Seed-TTS ``meta.lst`` file into a list of metainfo tuples.

    Each non-blank line has 4 or 5 ``|``-separated fields:
    ``utt|prompt_text|prompt_wav|gt_text[|gt_wav]``. With 4 fields the ground-truth
    wav defaults to ``<dir of metalst>/wavs/<utt>.wav``. A relative ``prompt_wav``
    is resolved against the directory of ``metalst``.

    Args:
        metalst: path to the meta list file.

    Returns:
        A list of ``(utt, prompt_text, prompt_wav, gt_text, gt_wav)`` tuples.

    Raises:
        ValueError: if a non-blank line has neither 4 nor 5 fields.
    """
    meta_dir = os.path.dirname(metalst)
    metainfo = []
    with open(metalst, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines used to re-append the previous entry; skip them instead.
                continue
            fields = line.split("|")
            if len(fields) == 5:
                utt, prompt_text, prompt_wav, gt_text, gt_wav = fields
            elif len(fields) == 4:
                utt, prompt_text, prompt_wav, gt_text = fields
                gt_wav = os.path.join(meta_dir, "wavs", utt + ".wav")
            else:
                raise ValueError(f"{metalst}:{line_no}: expected 4 or 5 '|'-separated fields, got {len(fields)}")
            if not os.path.isabs(prompt_wav):
                prompt_wav = os.path.join(meta_dir, prompt_wav)
            metainfo.append((utt, prompt_text, prompt_wav, gt_text, gt_wav))
    return metainfo
33
+
34
+
35
+ # librispeech test-clean metainfo: gen_utt, ref_txt, ref_wav, gen_txt, gen_wav
36
def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):
    """Read the LibriSpeech cross-sentence list into a list of metainfo tuples.

    Each non-blank line has six tab-separated fields:
    ``ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt``. Utterance ids have
    the form ``<speaker>-<chapter>-<index>`` and map to
    ``<root>/<speaker>/<chapter>/<utt>.flac``.

    Args:
        metalst: path to the tab-separated list file.
        librispeech_test_clean_path: root directory of LibriSpeech test-clean.

    Returns:
        A list of ``(gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav)`` tuples.
    """
    metainfo = []
    with open(metalst, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would otherwise crash the unpacking below.
                continue
            ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.split("\t")

            # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
            ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
            ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")

            # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
            gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
            gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")

            # The generated text is prefixed with a space so it joins cleanly after the prompt text.
            metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))

    return metainfo
55
+
56
+
57
+ # padded to max length mel batch
58
def padded_mel_batch(ref_mels):
    """Zero-pad mels along their last axis to a common length and stack them.

    Every mel is right-padded with zeros up to the longest last-axis length in
    ``ref_mels``; the stacked batch then has its last two axes swapped.
    """
    widths = torch.LongTensor([m.shape[-1] for m in ref_mels])
    longest = widths.amax()
    padded = [F.pad(m, (0, longest - m.shape[-1]), value=0) for m in ref_mels]
    batch = torch.stack(padded)
    return batch.permute(0, 2, 1)
67
+
68
+
69
+ # get prompts from metainfo containing: utt, prompt_text, prompt_wav, gt_text, gt_wav
70
+
71
+
72
def get_inference_prompt(
    metainfo,
    speed=1.0,
    tokenizer="pinyin",
    polyphone=True,
    target_sample_rate=24000,
    n_fft=1024,
    win_length=1024,
    n_mel_channels=100,
    hop_length=256,
    mel_spec_type="vocos",
    target_rms=0.1,
    use_truth_duration=False,
    infer_batch_size=1,
    num_buckets=200,
    min_secs=3,
    max_secs=40,
):
    """Build bucketed inference batches from test-set metainfo.

    For every ``(utt, prompt_text, prompt_wav, gt_text, gt_wav)`` entry the prompt
    audio is loaded, amplified to ``target_rms`` when quieter, resampled to
    ``target_sample_rate`` and converted to a mel spectrogram. The total mel length
    is either taken from the ground-truth audio (``use_truth_duration``) or
    estimated from the ratio of UTF-8 byte lengths of the generated and prompt
    texts. Entries are grouped into ``num_buckets`` buckets by total length; a
    bucket is emitted as a batch once its accumulated frames reach
    ``infer_batch_size``. Leftover buckets are emitted at the end and the batch
    list is shuffled with a fixed seed.

    Returns:
        A list of tuples ``(utts, ref_rms_list, padded_ref_mels, ref_mel_lens,
        total_mel_lens, final_text_list)``, one per batch.

    Raises:
        AssertionError: if ``infer_batch_size`` is not positive, a prompt wav is
            (nearly) empty, or a total duration is outside ``[min_secs, max_secs]``.
        ValueError: if a prompt text is empty.
    """
    # Loop-invariant argument check, done once up front.
    assert infer_batch_size > 0, "infer_batch_size should be greater than 0."

    prompts_all = []

    min_tokens = min_secs * target_sample_rate // hop_length
    max_tokens = max_secs * target_sample_rate // hop_length

    batch_accum = [0] * num_buckets
    utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = (
        [[] for _ in range(num_buckets)] for _ in range(6)
    )

    mel_spectrogram = MelSpec(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    )

    for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
        # The last character and the byte length of the prompt text are both used below.
        if not prompt_text:
            raise ValueError(f"Empty prompt text for utterance {utt}.")

        # Audio
        ref_audio, ref_sr = torchaudio.load(prompt_wav)
        ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio)))
        if ref_rms < target_rms:
            ref_audio = ref_audio * target_rms / ref_rms
        assert ref_audio.shape[-1] > 5000, f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
        if ref_sr != target_sample_rate:
            resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
            ref_audio = resampler(ref_audio)

        # Text
        # A single-byte (ASCII) last character gets a separating space before the generated text.
        if len(prompt_text[-1].encode("utf-8")) == 1:
            prompt_text = prompt_text + " "
        text = [prompt_text + gt_text]
        if tokenizer == "pinyin":
            text_list = convert_char_to_pinyin(text, polyphone=polyphone)
        else:
            text_list = text

        # Duration, mel frame length
        ref_mel_len = ref_audio.shape[-1] // hop_length
        if use_truth_duration:
            gt_audio, gt_sr = torchaudio.load(gt_wav)
            if gt_sr != target_sample_rate:
                resampler = torchaudio.transforms.Resample(gt_sr, target_sample_rate)
                gt_audio = resampler(gt_audio)
            total_mel_len = ref_mel_len + int(gt_audio.shape[-1] / hop_length / speed)

            # # test vocoder resynthesis
            # ref_audio = gt_audio
        else:
            ref_text_len = len(prompt_text.encode("utf-8"))
            gen_text_len = len(gt_text.encode("utf-8"))
            total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)

        # to mel spectrogram
        ref_mel = mel_spectrogram(ref_audio)
        ref_mel = ref_mel.squeeze(0)

        # deal with batch
        assert (
            min_tokens <= total_mel_len <= max_tokens
        ), f"Audio {utt} has duration {total_mel_len*hop_length//target_sample_rate}s out of range [{min_secs}, {max_secs}]."
        bucket_i = math.floor((total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets)

        utts[bucket_i].append(utt)
        ref_rms_list[bucket_i].append(ref_rms)
        ref_mels[bucket_i].append(ref_mel)
        ref_mel_lens[bucket_i].append(ref_mel_len)
        total_mel_lens[bucket_i].append(total_mel_len)
        final_text_list[bucket_i].extend(text_list)

        batch_accum[bucket_i] += total_mel_len

        if batch_accum[bucket_i] >= infer_batch_size:
            # print(f"\n{len(ref_mels[bucket_i][0][0])}\n{ref_mel_lens[bucket_i]}\n{total_mel_lens[bucket_i]}")
            prompts_all.append(
                (
                    utts[bucket_i],
                    ref_rms_list[bucket_i],
                    padded_mel_batch(ref_mels[bucket_i]),
                    ref_mel_lens[bucket_i],
                    total_mel_lens[bucket_i],
                    final_text_list[bucket_i],
                )
            )
            # Start a fresh batch for this bucket.
            batch_accum[bucket_i] = 0
            (
                utts[bucket_i],
                ref_rms_list[bucket_i],
                ref_mels[bucket_i],
                ref_mel_lens[bucket_i],
                total_mel_lens[bucket_i],
                final_text_list[bucket_i],
            ) = [], [], [], [], [], []

    # add residual
    for bucket_i, bucket_frames in enumerate(batch_accum):
        if bucket_frames > 0:
            prompts_all.append(
                (
                    utts[bucket_i],
                    ref_rms_list[bucket_i],
                    padded_mel_batch(ref_mels[bucket_i]),
                    ref_mel_lens[bucket_i],
                    total_mel_lens[bucket_i],
                    final_text_list[bucket_i],
                )
            )
    # not only leave easy work for last workers
    random.seed(666)
    random.shuffle(prompts_all)

    return prompts_all
205
+
206
+
207
+ # get wav_res_ref_text of seed-tts test metalst
208
+ # https://github.com/BytedanceSpeech/seed-tts-eval
209
+
210
+
211
def get_seed_tts_test(metalst, gen_wav_dir, gpus):
    """Collect the Seed-TTS evaluation triples and split them across GPUs.

    Reads ``metalst`` (4 or 5 ``|``-separated fields per line), keeps the entries
    whose generated wav ``<gen_wav_dir>/<utt>.wav`` exists and resolves a relative
    prompt wav against the directory of ``metalst``.

    Args:
        metalst: path to the Seed-TTS meta list.
        gen_wav_dir: directory holding the generated wavs.
        gpus: list of GPU ids, one job per id.

    Returns:
        A list of ``(gpu_id, [(gen_wav, prompt_wav, gt_text), ...])`` pairs.

    Raises:
        ValueError: if a non-blank line has neither 4 nor 5 fields.
    """
    with open(metalst, encoding="utf-8") as f:
        lines = f.readlines()

    meta_dir = os.path.dirname(metalst)
    test_set_ = []
    for line_no, line in enumerate(tqdm(lines), start=1):
        line = line.strip()
        if not line:
            # Blank lines used to re-append the previous entry; skip them instead.
            continue
        fields = line.split("|")
        if len(fields) == 5:
            utt, prompt_text, prompt_wav, gt_text, gt_wav = fields
        elif len(fields) == 4:
            utt, prompt_text, prompt_wav, gt_text = fields
        else:
            raise ValueError(f"{metalst}:{line_no}: expected 4 or 5 '|'-separated fields, got {len(fields)}")

        gen_wav = os.path.join(gen_wav_dir, utt + ".wav")
        if not os.path.exists(gen_wav):
            # Utterances without a generated wav are left out of the evaluation.
            continue
        if not os.path.isabs(prompt_wav):
            prompt_wav = os.path.join(meta_dir, prompt_wav)

        test_set_.append((gen_wav, prompt_wav, gt_text))

    num_jobs = len(gpus)
    if num_jobs == 1:
        return [(gpus[0], test_set_)]

    # Contiguous chunks of equal size; trailing jobs may receive fewer (or no) samples.
    wav_per_job = len(test_set_) // num_jobs + 1
    test_set = []
    for i in range(num_jobs):
        test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))

    return test_set
241
+
242
+
243
+ # get librispeech test-clean cross sentence test
244
+
245
+
246
def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=False):
    """Collect the LibriSpeech cross-sentence evaluation triples and split them across GPUs.

    Args:
        metalst: path to the tab-separated list
            (``ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt`` per line).
        gen_wav_dir: directory holding the generated wavs ``<gen_utt>.wav``.
        gpus: list of GPU ids, one job per id.
        librispeech_test_clean_path: root directory of LibriSpeech test-clean.
        eval_ground_truth: when true, evaluate the original ``.flac`` of
            ``gen_utt`` instead of the generated wav.

    Returns:
        A list of ``(gpu_id, [(gen_wav, ref_wav, gen_txt), ...])`` pairs.

    Raises:
        FileNotFoundError: if a generated wav is missing (and ``eval_ground_truth`` is false).
    """
    with open(metalst, encoding="utf-8") as f:
        lines = f.readlines()

    test_set_ = []
    for line in tqdm(lines):
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise crash the unpacking below.
            continue
        ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.split("\t")

        if eval_ground_truth:
            gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
            gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
        else:
            gen_wav = os.path.join(gen_wav_dir, gen_utt + ".wav")
            if not os.path.exists(gen_wav):
                raise FileNotFoundError(f"Generated wav not found: {gen_utt}")

        ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
        ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")

        test_set_.append((gen_wav, ref_wav, gen_txt))

    num_jobs = len(gpus)
    if num_jobs == 1:
        return [(gpus[0], test_set_)]

    # Contiguous chunks of equal size; trailing jobs may receive fewer (or no) samples.
    wav_per_job = len(test_set_) // num_jobs + 1
    test_set = []
    for i in range(num_jobs):
        test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))

    return test_set
278
+
279
+
280
+ # load asr model
281
+
282
+
283
def load_asr_model(lang, ckpt_dir=""):
    """Load the ASR model used for WER evaluation.

    Args:
        lang: ``"zh"`` loads funasr ``paraformer-zh``; ``"en"`` loads faster-whisper.
        ckpt_dir: for ``"zh"`` the directory containing ``paraformer-zh``; for
            ``"en"`` a local model path, or ``""`` to use ``"large-v3"``.

    Returns:
        The loaded ASR model object.

    Raises:
        NotImplementedError: if ``lang`` is neither ``"zh"`` nor ``"en"``.
    """
    if lang == "zh":
        from funasr import AutoModel

        model = AutoModel(
            model=os.path.join(ckpt_dir, "paraformer-zh"),
            # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
            # punc_model = os.path.join(ckpt_dir, "ct-punc"),
            # spk_model = os.path.join(ckpt_dir, "cam++"),
            disable_update=True,
        )  # following seed-tts setting
    elif lang == "en":
        from faster_whisper import WhisperModel

        model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
    else:
        # Previously this fell through to an UnboundLocalError on `return model`.
        raise NotImplementedError(
            "lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now."
        )
    return model
300
+
301
+
302
+ # WER Evaluation, the way Seed-TTS does
303
+
304
+
305
def run_asr_wer(args):
    """Transcribe generated wavs on one GPU and compute the per-sample WER.

    Args:
        args: tuple ``(rank, lang, test_set, ckpt_dir)`` where ``rank`` is the GPU
            id, ``lang`` is ``"zh"`` or ``"en"``, ``test_set`` is a list of
            ``(gen_wav, prompt_wav, truth)`` triples and ``ckpt_dir`` is passed to
            ``load_asr_model``.

    Returns:
        A list of dicts with keys ``"wav"``, ``"truth"``, ``"hypo"`` and ``"wer"``;
        ``truth``/``hypo`` are the raw strings before normalization.

    Raises:
        NotImplementedError: if ``lang`` is neither ``"zh"`` nor ``"en"``.
    """
    rank, lang, test_set, ckpt_dir = args

    if lang == "zh":
        import zhconv

        torch.cuda.set_device(rank)
    elif lang == "en":
        os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
    else:
        raise NotImplementedError(
            "lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now."
        )

    asr_model = load_asr_model(lang, ckpt_dir=ckpt_dir)

    from zhon.hanzi import punctuation

    # One translation table removes all Chinese and ASCII punctuation in a single pass.
    punctuation_all = punctuation + string.punctuation
    strip_punctuation = str.maketrans("", "", punctuation_all)
    wer_results = []

    from jiwer import compute_measures

    for gen_wav, prompt_wav, truth in tqdm(test_set):
        if lang == "zh":
            res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
            hypo = res[0]["text"]
            hypo = zhconv.convert(hypo, "zh-cn")
        elif lang == "en":
            segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="en")
            # Every segment is prefixed with a space, as the incremental concatenation did.
            hypo = "".join(" " + segment.text for segment in segments)

        raw_truth = truth
        raw_hypo = hypo

        truth = truth.translate(strip_punctuation)
        hypo = hypo.translate(strip_punctuation)

        # Collapse the double spaces left behind by removed punctuation.
        truth = truth.replace("  ", " ")
        hypo = hypo.replace("  ", " ")

        if lang == "zh":
            # Chinese is scored per character: separate every character by a space.
            truth = " ".join([x for x in truth])
            hypo = " ".join([x for x in hypo])
        elif lang == "en":
            truth = truth.lower()
            hypo = hypo.lower()

        measures = compute_measures(truth, hypo)
        wer = measures["wer"]

        # ref_list = truth.split(" ")
        # subs = measures["substitutions"] / len(ref_list)
        # dele = measures["deletions"] / len(ref_list)
        # inse = measures["insertions"] / len(ref_list)

        wer_results.append(
            {
                "wav": Path(gen_wav).stem,
                "truth": raw_truth,
                "hypo": raw_hypo,
                "wer": wer,
            }
        )

    return wer_results
374
+
375
+
376
+ # SIM Evaluation
377
+
378
+
379
def run_sim(args):
    """Compute speaker-embedding cosine similarity for wav pairs on one GPU.

    Args:
        args: tuple ``(rank, test_set, ckpt_dir)`` where ``rank`` is the GPU id,
            ``test_set`` is a list of ``(wav1, wav2, truth)`` triples and
            ``ckpt_dir`` is the path of the WavLM-large ECAPA-TDNN checkpoint.

    Returns:
        A list with one cosine similarity (float) per pair.
    """
    rank, test_set, ckpt_dir = args
    device = f"cuda:{rank}"

    model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
    # Load onto CPU first; the model is moved to the GPU below when one is available.
    state_dict = torch.load(ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage)
    model.load_state_dict(state_dict["model"], strict=False)

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        model = model.cuda(device)
    model.eval()

    # Resample transforms are reused per source sample rate instead of rebuilt for every pair.
    resamplers = {}

    def _load_16k(path):
        """Load a wav and resample it to 16 kHz."""
        wav, sr = torchaudio.load(path)
        if sr not in resamplers:
            resamplers[sr] = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        return resamplers[sr](wav)

    sims = []
    for wav1, wav2, truth in tqdm(test_set):
        wav1 = _load_16k(wav1)
        wav2 = _load_16k(wav2)

        if use_gpu:
            wav1 = wav1.cuda(device)
            wav2 = wav2.cuda(device)
        with torch.no_grad():
            emb1 = model(wav1)
            emb2 = model(wav2)

        sim = F.cosine_similarity(emb1, emb2)[0].item()
        # print(f"VSim score between two audios: {sim:.4f} (-1.0, 1.0).")
        sims.append(sim)

    return sims
src/f5_tts/infer/README.md ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Inference
2
+
3
+ The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
4
+
5
+ **More checkpoints with whole community efforts can be found in [SHARED.md](SHARED.md), supporting more languages.**
6
+
7
+ Currently a single generation supports up to **30s**, which is the **total length** including both prompt and output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text; they will automatically do chunked generation. Long reference audio will be **clipped to ~15s**.
8
+
9
+ To avoid possible inference failures, make sure you have read through the following instructions.
10
+
11
+ - Use reference audio <15s and leave some silence (e.g. 1s) at the end. Otherwise there is a risk of truncating in the middle of word, leading to suboptimal generation.
12
+ - Uppercased letters will be uttered letter by letter, so use lowercased letters for normal words.
13
+ - Add some spaces (blank: " ") or punctuations (e.g. "," ".") to explicitly introduce some pauses.
14
+ - Preprocess numbers to Chinese letters if you want to have them read in Chinese, otherwise in English.
15
+ - If the generation output is blank (pure silence), check for ffmpeg installation (various tutorials online, blogs, videos, etc.).
16
+ - Try turning off `use_ema` if using an early-stage finetuned checkpoint (one that has gone through only a few updates).
17
+
18
+
19
+ ## Gradio App
20
+
21
+ Currently supported features:
22
+
23
+ - Basic TTS with Chunk Inference
24
+ - Multi-Style / Multi-Speaker Generation
25
+ - Voice Chat powered by Qwen2.5-3B-Instruct
26
+
27
+ The cli command `f5-tts_infer-gradio` is equivalent to `python src/f5_tts/infer/infer_gradio.py`, which launches a Gradio APP (web interface) for inference.
28
+
29
+ The script will load model checkpoints from Huggingface. You can also manually download files and update the path to `load_model()` in `infer_gradio.py`. Currently only the TTS models are loaded at startup; the ASR model is loaded to do transcription if `ref_text` is not provided, and the LLM model is loaded if Voice Chat is used.
30
+
31
+ It can also be used as a component of a larger application.
32
+ ```python
33
+ import gradio as gr
34
+ from f5_tts.infer.infer_gradio import app
35
+
36
+ with gr.Blocks() as main_app:
37
+ gr.Markdown("# This is an example of using F5-TTS within a bigger Gradio app")
38
+
39
+ # ... other Gradio components
40
+
41
+ app.render()
42
+
43
+ main_app.launch()
44
+ ```
45
+
46
+
47
+ ## CLI Inference
48
+
49
+ The cli command `f5-tts_infer-cli` is equivalent to `python src/f5_tts/infer/infer_cli.py`, which is a command line tool for inference.
50
+
51
+ The script will load model checkpoints from Huggingface. You can also manually download files and use `--ckpt_file` to specify the model you want to load, or directly update in `infer_cli.py`.
52
+
53
+ To change the vocabulary, use `--vocab_file` to provide your own `vocab.txt` file.
54
+
55
+ Basically, you can run inference with flags:
56
+ ```bash
57
+ # Leave --ref_text "" will have ASR model transcribe (extra GPU memory usage)
58
+ f5-tts_infer-cli \
59
+ --model "F5-TTS" \
60
+ --ref_audio "ref_audio.wav" \
61
+ --ref_text "hình ảnh cực đoan trong em_vi của sơn tùng mờ thành phố bị khán giả chỉ trích" \
62
+ --gen_text "tôi yêu em đến nay chừng có thể, ngọn lửa tình chưa hẳn đã tàn phai." \
63
+ --vocoder_name vocos \
64
+ --load_vocoder_from_local \
65
+ --ckpt_file ckpts/F5TTS_Base_vocos_char_vnTTS/model_last.pt
66
+
67
+ # Choose Vocoder
68
+ f5-tts_infer-cli --vocoder_name bigvgan --load_vocoder_from_local --ckpt_file <YOUR_CKPT_PATH, eg:ckpts/F5TTS_Base_bigvgan/model_1250000.pt>
69
+ f5-tts_infer-cli --vocoder_name vocos --load_vocoder_from_local --ckpt_file <YOUR_CKPT_PATH, eg:ckpts/F5TTS_Base/model_1200000.safetensors>
70
+
71
+ # More instructions
72
+ f5-tts_infer-cli --help
73
+ ```
74
+
75
+ And a `.toml` file would help with more flexible usage.
76
+
77
+ ```bash
78
+ f5-tts_infer-cli -c custom.toml
79
+ ```
80
+
81
+ For example, you can use `.toml` to pass in variables, refer to `src/f5_tts/infer/examples/basic/basic.toml`:
82
+
83
+ ```toml
84
+ # F5-TTS | E2-TTS
85
+ model = "F5-TTS"
86
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
87
+ # If an empty "", transcribes the reference audio automatically.
88
+ ref_text = "Some call me nature, others call me mother nature."
89
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
90
+ # File with text to generate. Ignores the text above.
91
+ gen_file = ""
92
+ remove_silence = false
93
+ output_dir = "tests"
94
+ ```
95
+
96
+ You can also leverage `.toml` file to do multi-style generation, refer to `src/f5_tts/infer/examples/multi/story.toml`.
97
+
98
+ ```toml
99
+ # F5-TTS | E2-TTS
100
+ model = "F5-TTS"
101
+ ref_audio = "infer/examples/multi/main.flac"
102
+ # If an empty "", transcribes the reference audio automatically.
103
+ ref_text = ""
104
+ gen_text = ""
105
+ # File with text to generate. Ignores the text above.
106
+ gen_file = "infer/examples/multi/story.txt"
107
+ remove_silence = true
108
+ output_dir = "tests"
109
+
110
+ [voices.town]
111
+ ref_audio = "infer/examples/multi/town.flac"
112
+ ref_text = ""
113
+
114
+ [voices.country]
115
+ ref_audio = "infer/examples/multi/country.flac"
116
+ ref_text = ""
117
+ ```
118
+ You should mark the voice with `[main]` `[town]` `[country]` whenever you want to change voice, refer to `src/f5_tts/infer/examples/multi/story.txt`.
119
+
120
+ ## Speech Editing
121
+
122
+ To test speech editing capabilities, use the following command:
123
+
124
+ ```bash
125
+ python src/f5_tts/infer/speech_edit.py
126
+ ```
127
+
128
+ ## Socket Realtime Client
129
+
130
+ To communicate with the socket server, you first need to run
131
+ ```bash
132
+ python src/f5_tts/socket_server.py
133
+ ```
134
+
135
+ <details>
136
+ <summary>Then create client to communicate</summary>
137
+
138
+ ``` python
139
+ import socket
140
+ import numpy as np
141
+ import asyncio
142
+ import pyaudio
143
+
144
+ async def listen_to_voice(text, server_ip='localhost', server_port=9999):
145
+ client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
146
+ client_socket.connect((server_ip, server_port))
147
+
148
+ async def play_audio_stream():
149
+ buffer = b''
150
+ p = pyaudio.PyAudio()
151
+ stream = p.open(format=pyaudio.paFloat32,
152
+ channels=1,
153
+ rate=24000, # Ensure this matches the server's sampling rate
154
+ output=True,
155
+ frames_per_buffer=2048)
156
+
157
+ try:
158
+ while True:
159
+ chunk = await asyncio.get_event_loop().run_in_executor(None, client_socket.recv, 1024)
160
+ if not chunk: # End of stream
161
+ break
162
+ if b"END_OF_AUDIO" in chunk:
163
+ buffer += chunk.replace(b"END_OF_AUDIO", b"")
164
+ if buffer:
165
+ audio_array = np.frombuffer(buffer, dtype=np.float32).copy() # Make a writable copy
166
+ stream.write(audio_array.tobytes())
167
+ break
168
+ buffer += chunk
169
+ if len(buffer) >= 4096:
170
+ audio_array = np.frombuffer(buffer[:4096], dtype=np.float32).copy() # Make a writable copy
171
+ stream.write(audio_array.tobytes())
172
+ buffer = buffer[4096:]
173
+ finally:
174
+ stream.stop_stream()
175
+ stream.close()
176
+ p.terminate()
177
+
178
+ try:
179
+ # Send only the text to the server
180
+ await asyncio.get_event_loop().run_in_executor(None, client_socket.sendall, text.encode('utf-8'))
181
+ await play_audio_stream()
182
+ print("Audio playback finished.")
183
+
184
+ except Exception as e:
185
+ print(f"Error in listen_to_voice: {e}")
186
+
187
+ finally:
188
+ client_socket.close()
189
+
190
+ # Example usage: Replace this with your actual server IP and port
191
+ async def main():
192
+ await listen_to_voice("my name is jenny..", server_ip='localhost', server_port=9998)
193
+
194
+ # Run the main async function
195
+ asyncio.run(main())
196
+ ```
197
+
198
+ </details>
199
+
src/f5_tts/infer/SHARED.md ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- omit in toc -->
2
+ # Shared Model Cards
3
+
4
+ <!-- omit in toc -->
5
+ ### **Prerequisites of using**
6
+ - This document serves as a quick lookup table for community training/finetuning results, with support for various languages.
7
+ - The models in this repository are open source and are based on voluntary contributions from contributors.
8
+ - The use of models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.
9
+
10
+ <!-- omit in toc -->
11
+ ### **Welcome to share here**
12
+ - Have a pretrained/finetuned result: model checkpoint (pruned best to facilitate inference, i.e. leave only `ema_model_state_dict`) and corresponding vocab file (for tokenization).
13
+ - Host a public [huggingface model repository](https://huggingface.co/new) and upload the model related files.
14
+ - Make a pull request adding a model card to the current page, i.e. `src/f5_tts/infer/SHARED.md`.
15
+
16
+ <!-- omit in toc -->
17
+ ### Supported Languages
18
+ - [Multilingual](#multilingual)
19
+ - [F5-TTS Base @ zh \& en @ F5-TTS](#f5-tts-base--zh--en--f5-tts)
20
+ - [English](#english)
21
+ - [Finnish](#finnish)
22
+ - [F5-TTS Base @ fi @ AsmoKoskinen](#f5-tts-base--fi--asmokoskinen)
23
+ - [French](#french)
24
+ - [F5-TTS Base @ fr @ RASPIAUDIO](#f5-tts-base--fr--raspiaudio)
25
+ - [Hindi](#hindi)
26
+ - [F5-TTS Small @ hi @ SPRINGLab](#f5-tts-small--hi--springlab)
27
+ - [Italian](#italian)
28
+ - [F5-TTS Base @ it @ alien79](#f5-tts-base--it--alien79)
29
+ - [Japanese](#japanese)
30
+ - [F5-TTS Base @ ja @ Jmica](#f5-tts-base--ja--jmica)
31
+ - [Mandarin](#mandarin)
32
+ - [Russian](#russian)
33
+ - [F5-TTS Base @ ru @ HotDro4illa](#f5-tts-base--ru--hotdro4illa)
34
+ - [Spanish](#spanish)
35
+ - [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)
36
+
37
+
38
+ ## Multilingual
39
+
40
+ #### F5-TTS Base @ zh & en @ F5-TTS
41
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
42
+ |:---:|:------------:|:-----------:|:-------------:|
43
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
44
+
45
+ ```bash
46
+ Model: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
47
+ Vocab: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
48
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
49
+ ```
50
+
51
+ *Other infos, e.g. Author info, Github repo, Link to some sampled results, Usage instruction, Tutorial (Blog, Video, etc.) ...*
52
+
53
+
54
+ ## English
55
+
56
+
57
+ ## Finnish
58
+
59
+ #### F5-TTS Base @ fi @ AsmoKoskinen
60
+ |Model|🤗Hugging Face|Data|Model License|
61
+ |:---:|:------------:|:-----------:|:-------------:|
62
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/AsmoKoskinen/F5-TTS_Finnish_Model)|[Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0), [Vox Populi](https://huggingface.co/datasets/facebook/voxpopuli)|cc-by-nc-4.0|
63
+
64
+ ```bash
65
+ Model: hf://AsmoKoskinen/F5-TTS_Finnish_Model/model_common_voice_fi_vox_populi_fi_20241206.safetensors
66
+ Vocab: hf://AsmoKoskinen/F5-TTS_Finnish_Model/vocab.txt
67
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
68
+ ```
69
+
70
+
71
+ ## French
72
+
73
+ #### F5-TTS Base @ fr @ RASPIAUDIO
74
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
75
+ |:---:|:------------:|:-----------:|:-------------:|
76
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced)|[LibriVox](https://librivox.org/)|cc-by-nc-4.0|
77
+
78
+ ```bash
79
+ Model: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/model_last_reduced.pt
80
+ Vocab: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
81
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
82
+ ```
83
+
84
+ - [Online Inference with Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
85
+ - [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
86
+ - [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).
87
+
88
+
89
+ ## Hindi
90
+
91
+ #### F5-TTS Small @ hi @ SPRINGLab
92
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
93
+ |:---:|:------------:|:-----------:|:-------------:|
94
+ |F5-TTS Small|[ckpt & vocab](https://huggingface.co/SPRINGLab/F5-Hindi-24KHz)|[IndicTTS Hi](https://huggingface.co/datasets/SPRINGLab/IndicTTS-Hindi) & [IndicVoices-R Hi](https://huggingface.co/datasets/SPRINGLab/IndicVoices-R_Hindi) |cc-by-4.0|
95
+
96
+ ```bash
97
+ Model: hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors
98
+ Vocab: hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt
99
+ Config: {"dim": 768, "depth": 18, "heads": 12, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
100
+ ```
101
+
102
+ - Authors: SPRING Lab, Indian Institute of Technology, Madras
103
+ - Website: https://asr.iitm.ac.in/
104
+
105
+
106
+ ## Italian
107
+
108
+ #### F5-TTS Base @ it @ alien79
109
+ |Model|🤗Hugging Face|Data|Model License|
110
+ |:---:|:------------:|:-----------:|:-------------:|
111
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/alien79/F5-TTS-italian)|[ylacombe/cml-tts](https://huggingface.co/datasets/ylacombe/cml-tts) |cc-by-nc-4.0|
112
+
113
+ ```bash
114
+ Model: hf://alien79/F5-TTS-italian/model_159600.safetensors
115
+ Vocab: hf://alien79/F5-TTS-italian/vocab.txt
116
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
117
+ ```
118
+
119
+ - Trained by [Mithril Man](https://github.com/MithrilMan)
120
+ - Model details on [hf project home](https://huggingface.co/alien79/F5-TTS-italian)
121
+ - Open to collaborations to further improve the model
122
+
123
+
124
+ ## Japanese
125
+
126
+ #### F5-TTS Base @ ja @ Jmica
127
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
128
+ |:---:|:------------:|:-----------:|:-------------:|
129
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/Jmica/F5TTS/tree/main/JA_25498980)|[Emilia 1.7k JA](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07) & [Galgame Dataset 5.4k](https://huggingface.co/datasets/OOPPEENN/Galgame_Dataset)|cc-by-nc-4.0|
130
+
131
+ ```bash
132
+ Model: hf://Jmica/F5TTS/JA_25498980/model_25498980.pt
133
+ Vocab: hf://Jmica/F5TTS/JA_25498980/vocab_updated.txt
134
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
135
+ ```
136
+
137
+
138
+ ## Mandarin
139
+
140
+
141
+ ## Russian
142
+
143
+ #### F5-TTS Base @ ru @ HotDro4illa
144
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
145
+ |:---:|:------------:|:-----------:|:-------------:|
146
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/hotstone228/F5-TTS-Russian)|[Common voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0)|cc-by-nc-4.0|
147
+
148
+ ```bash
149
+ Model: hf://hotstone228/F5-TTS-Russian/model_last.safetensors
150
+ Vocab: hf://hotstone228/F5-TTS-Russian/vocab.txt
151
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
152
+ ```
153
+ - Finetuned by [HotDro4illa](https://github.com/HotDro4illa)
154
+ - Any improvements are welcome
155
+
156
+
157
+ ## Spanish
158
+
159
+ #### F5-TTS Base @ es @ jpgallegoar
160
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
161
+ |:---:|:------------:|:-----------:|:-------------:|
162
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|
163
+
164
+ - @jpgallegoar [GitHub repo](https://github.com/jpgallegoar/Spanish-F5), Jupyter Notebook and Gradio usage for Spanish model.
src/f5_tts/infer/infer_cli.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import codecs
3
+ import os
4
+ import re
5
+ from datetime import datetime
6
+ from importlib.resources import files
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import tomli
12
+ from cached_path import cached_path
13
+ from omegaconf import OmegaConf
14
+
15
+ from f5_tts.infer.utils_infer import (
16
+ mel_spec_type,
17
+ target_rms,
18
+ cross_fade_duration,
19
+ nfe_step,
20
+ cfg_strength,
21
+ sway_sampling_coef,
22
+ speed,
23
+ fix_duration,
24
+ infer_process,
25
+ load_model,
26
+ load_vocoder,
27
+ preprocess_ref_audio_text,
28
+ remove_silence_for_generated_wav,
29
+ )
30
+ from f5_tts.model import DiT, UNetT
31
+
32
+
33
# Command-line interface definition.
#
# Apart from --config, no option declares a `default=`: an omitted option parses
# to None (False for the store_true switches) so the script can fall back to the
# value from the TOML config file.
parser = argparse.ArgumentParser(
    prog="python3 infer-cli.py",
    description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
    epilog="Specify options above to override one or more settings from config.",
)
parser.add_argument(
    "-c",
    "--config",
    type=str,
    default=os.path.join(files("f5_tts").joinpath("infer/examples/basic"), "basic.toml"),
    help="The configuration file, default see infer/examples/basic/basic.toml",
)

# Note. Not to provide default value here in order to read default from config file

# String-valued options, registered in the order they appear in --help.
for short_flag, long_flag, help_text in (
    ("-m", "--model", "The model name: F5-TTS | E2-TTS"),
    ("-mc", "--model_cfg", "The path to F5-TTS model config file .yaml"),
    ("-p", "--ckpt_file", "The path to model checkpoint .pt, leave blank to use default"),
    ("-v", "--vocab_file", "The path to vocab file .txt, leave blank to use default"),
    ("-r", "--ref_audio", "The reference audio file."),
    ("-s", "--ref_text", "The transcript/subtitle for the reference audio"),
    ("-t", "--gen_text", "The text to make model synthesize a speech"),
    ("-f", "--gen_file", "The file with text to generate, will ignore --gen_text"),
    ("-o", "--output_dir", "The path to output folder"),
    ("-w", "--output_file", "The name of output file"),
):
    parser.add_argument(short_flag, long_flag, type=str, help=help_text)

# Boolean switches.
for flag, help_text in (
    ("--save_chunk", "To save each audio chunks during inference"),
    # Fixed typo in the help text ("ouput" -> "output").
    ("--remove_silence", "To remove long silence found in output"),
    # The help text now names the directories this script actually passes to
    # load_vocoder (it previously advertised ../checkpoints/vocos-mel-24khz).
    (
        "--load_vocoder_from_local",
        "To load vocoder from local dir: ckpts/vocos (vocos) or "
        "../checkpoints/bigvgan_v2_24khz_100band_256x (bigvgan)",
    ),
):
    parser.add_argument(flag, action="store_true", help=help_text)

parser.add_argument(
    "--vocoder_name",
    type=str,
    choices=["vocos", "bigvgan"],
    help=f"Used vocoder name: vocos | bigvgan, default {mel_spec_type}",
)

# Numeric options; the defaults shown in the help come from f5_tts.infer.utils_infer.
for flag, value_type, help_text in (
    ("--target_rms", float, f"Target output speech loudness normalization value, default {target_rms}"),
    (
        "--cross_fade_duration",
        float,
        f"Duration of cross-fade between audio segments in seconds, default {cross_fade_duration}",
    ),
    ("--nfe_step", int, f"The number of function evaluation (denoising steps), default {nfe_step}"),
    ("--cfg_strength", float, f"Classifier-free guidance strength, default {cfg_strength}"),
    ("--sway_sampling_coef", float, f"Sway Sampling coefficient, default {sway_sampling_coef}"),
    ("--speed", float, f"The speed of the generated audio, default {speed}"),
    ("--fix_duration", float, f"Fix the total duration (ref and gen audios) in seconds, default {fix_duration}"),
):
    parser.add_argument(flag, type=value_type, help=help_text)

args = parser.parse_args()
166
+
167
+
168
+ # config file
169
+
170
# Read the TOML config inside a context manager so the file handle is closed.
with open(args.config, "rb") as config_fp:
    config = tomli.load(config_fp)


def _cli_or_config(cli_value, key, default):
    """Return the CLI value if the flag was given, else the config entry, else `default`.

    Numeric options must use this rather than ``cli_value or ...``: an explicit
    ``0`` / ``0.0`` on the command line is falsy and would otherwise be replaced
    by the config/default value.
    """
    if cli_value is not None:
        return cli_value
    return config.get(key, default)


# command-line interface parameters

# String options keep the `or` fallback on purpose: an empty string on the command
# line means "use the config/default value" (see the --ckpt_file/--vocab_file help).
model = args.model or config.get("model", "F5-TTS")
model_cfg = args.model_cfg or config.get("model_cfg", str(files("f5_tts").joinpath("configs/F5TTS_Base_train.yaml")))
ckpt_file = args.ckpt_file or config.get("ckpt_file", "")
vocab_file = args.vocab_file or config.get("vocab_file", "")

ref_audio = args.ref_audio or config.get("ref_audio", "infer/examples/basic/basic_ref_en.wav")
# ref_text is the exception: an explicit "" is kept rather than replaced by the config value.
ref_text = _cli_or_config(args.ref_text, "ref_text", "Some call me nature, others call me mother nature.")
gen_text = args.gen_text or config.get("gen_text", "Here we generate something just for test.")
gen_file = args.gen_file or config.get("gen_file", "")

output_dir = args.output_dir or config.get("output_dir", "tests")
output_file = args.output_file or config.get(
    "output_file", f"infer_cli_{datetime.now().strftime(r'%Y%m%d_%H%M%S')}.wav"
)

# store_true switches parse to False when omitted, so `or` is the correct fallback.
save_chunk = args.save_chunk or config.get("save_chunk", False)
remove_silence = args.remove_silence or config.get("remove_silence", False)
load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)

vocoder_name = args.vocoder_name or config.get("vocoder_name", mel_spec_type)
target_rms = _cli_or_config(args.target_rms, "target_rms", target_rms)
cross_fade_duration = _cli_or_config(args.cross_fade_duration, "cross_fade_duration", cross_fade_duration)
nfe_step = _cli_or_config(args.nfe_step, "nfe_step", nfe_step)
cfg_strength = _cli_or_config(args.cfg_strength, "cfg_strength", cfg_strength)
sway_sampling_coef = _cli_or_config(args.sway_sampling_coef, "sway_sampling_coef", sway_sampling_coef)
speed = _cli_or_config(args.speed, "speed", speed)
fix_duration = _cli_or_config(args.fix_duration, "fix_duration", fix_duration)


# patches for pip pkg user
# Example paths are given relative to the installed f5_tts package; resolve them
# to real locations so they work regardless of the current working directory.
if "infer/examples/" in ref_audio:
    ref_audio = str(files("f5_tts").joinpath(f"{ref_audio}"))
if "infer/examples/" in gen_file:
    gen_file = str(files("f5_tts").joinpath(f"{gen_file}"))
if "voices" in config:
    for voice in config["voices"]:
        voice_ref_audio = config["voices"][voice]["ref_audio"]
        if "infer/examples/" in voice_ref_audio:
            config["voices"][voice]["ref_audio"] = str(files("f5_tts").joinpath(f"{voice_ref_audio}"))


# ignore gen_text if gen_file provided

if gen_file:
    # Context manager closes the file once its text has been read.
    with codecs.open(gen_file, "r", "utf-8") as gen_fp:
        gen_text = gen_fp.read()


# output path

wave_path = Path(output_dir) / output_file
# spectrogram_path = Path(output_dir) / "infer_cli_out.png"
if save_chunk:
    output_chunk_dir = os.path.join(output_dir, f"{Path(output_file).stem}_chunks")
    os.makedirs(output_chunk_dir, exist_ok=True)
234
+
235
+
236
+ # load vocoder
237
+
238
# Pick the local vocoder directory; it is only used when load_vocoder_from_local is set.
if vocoder_name == "vocos":
    vocoder_local_path = "ckpts/vocos"
elif vocoder_name == "bigvgan":
    vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
else:
    # argparse `choices` only validates the CLI flag; a value coming from the TOML
    # config would otherwise surface as a NameError on vocoder_local_path below.
    raise ValueError(f"Unsupported vocoder_name {vocoder_name!r}; expected 'vocos' or 'bigvgan'")

vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=load_vocoder_from_local, local_path=vocoder_local_path)
244
+
245
+
246
+ # load TTS model
247
+
248
# Select the model class, architecture config and checkpoint for the requested model.
if model == "F5-TTS":
    model_cls = DiT
    model_cfg = OmegaConf.load(model_cfg).model.arch
    if not ckpt_file:  # path not specified, download from repo
        repo_name = "F5-TTS"
        if vocoder_name == "vocos":
            exp_name = "F5TTS_Base"
            ckpt_step = 1200000
            ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
            # Local alternatives (.pt | .safetensors):
            #   f"ckpts/{exp_name}/model_{ckpt_step}.pt" or f"ckpts/{exp_name}/model_last.pt"
        elif vocoder_name == "bigvgan":
            exp_name = "F5TTS_Base_bigvgan"
            ckpt_step = 1250000
            ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
        else:
            # Without this branch an empty checkpoint path would be passed on to load_model.
            raise ValueError(f"No default F5-TTS checkpoint for vocoder {vocoder_name!r}; pass --ckpt_file")

elif model == "E2-TTS":
    # Explicit raises instead of `assert`, which is stripped when Python runs with -O.
    if args.model_cfg is not None:
        raise ValueError("E2-TTS does not support custom model_cfg yet")
    if vocoder_name != "vocos":
        raise ValueError("E2-TTS only supports vocoder vocos yet")
    model_cls = UNetT
    model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
    if not ckpt_file:  # path not specified, download from repo
        repo_name = "E2-TTS"
        exp_name = "E2TTS_Base"
        ckpt_step = 1200000
        ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
        # Local alternative: f"ckpts/{exp_name}/model_{ckpt_step}.pt"  (.pt | .safetensors)

else:
    # Previously an unknown name fell through and failed later with a NameError on model_cls.
    raise ValueError(f"Unknown model {model!r}; expected 'F5-TTS' or 'E2-TTS'")

print(f"Using {model}...")
ema_model = load_model(model_cls, model_cfg, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file)
279
+
280
+
281
+ # inference process
282
+
283
+
284
def main():
    """Synthesize `gen_text` with the configured voices and write the result to `wave_path`.

    The text is split at ``[name]`` tags. Each chunk is generated with the voice
    named by its leading tag; chunks with no tag, or with a tag that is not a
    configured voice, use the ``main`` voice. All generated segments are
    concatenated into one waveform. Individual chunks are also written to
    `output_chunk_dir` when `save_chunk` is set, and `remove_silence` runs the
    silence remover on the final file.
    """
    main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
    if "voices" not in config:
        voices = {"main": main_voice}
    else:
        voices = config["voices"]
        voices["main"] = main_voice

    # Preprocess every reference clip/transcript once, before generation starts.
    for voice in voices:
        print("Voice:", voice)
        print("ref_audio ", voices[voice]["ref_audio"])
        voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
            voices[voice]["ref_audio"], voices[voice]["ref_text"]
        )
        print("ref_audio_", voices[voice]["ref_audio"], "\n\n")

    generated_audio_segments = []
    # The lookahead splits *before* each tag, so the tag stays attached to its chunk.
    split_before_tag = re.compile(r"(?=\[\w+\])")
    voice_tag = re.compile(r"\[(\w+)\]")
    for text in split_before_tag.split(gen_text):
        if not text.strip():
            continue
        match = voice_tag.match(text)
        if match:
            voice = match[1]
        else:
            print("No voice tag found, using main.")
            voice = "main"
        if voice not in voices:
            print(f"Voice {voice} not found, using main.")
            voice = "main"
        text = voice_tag.sub("", text)
        ref_audio_ = voices[voice]["ref_audio"]
        ref_text_ = voices[voice]["ref_text"]
        gen_text_ = text.strip()
        print(f"Voice: {voice}")
        # The third return value (spectrogram) is not used by the CLI.
        audio_segment, final_sample_rate, _ = infer_process(
            ref_audio_,
            ref_text_,
            gen_text_,
            ema_model,
            vocoder,
            mel_spec_type=vocoder_name,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
        )
        generated_audio_segments.append(audio_segment)

        if save_chunk:
            # Cap the text used in the chunk file name at 200 characters.
            if len(gen_text_) > 200:
                gen_text_ = gen_text_[:200] + " ... "
            sf.write(
                os.path.join(output_chunk_dir, f"{len(generated_audio_segments)-1}_{gen_text_}.wav"),
                audio_segment,
                final_sample_rate,
            )

    if generated_audio_segments:
        final_wave = np.concatenate(generated_audio_segments)

        os.makedirs(output_dir, exist_ok=True)

        # Write by path directly. The previous code opened the file with open(..., "wb")
        # only to hand f.name to soundfile, keeping a second, unused handle open while
        # the file was written (and rewritten by the silence remover).
        sf.write(wave_path, final_wave, final_sample_rate)
        # Remove silence
        if remove_silence:
            remove_silence_for_generated_wav(wave_path)
        print(wave_path)


if __name__ == "__main__":
    main()
src/f5_tts/infer/infer_gradio.py ADDED
@@ -0,0 +1,888 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E402
2
+ # Above allows ruff to ignore E402: module level import not at top of file
3
+
4
+ import json
5
+ import re
6
+ import tempfile
7
+ from collections import OrderedDict
8
+ from importlib.resources import files
9
+
10
+ import click
11
+ import gradio as gr
12
+ import numpy as np
13
+ import soundfile as sf
14
+ import torchaudio
15
+ from cached_path import cached_path
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer
17
+
18
+ try:
19
+ import spaces
20
+
21
+ USING_SPACES = True
22
+ except ImportError:
23
+ USING_SPACES = False
24
+
25
+
26
def gpu_decorator(func):
    """Return `func` wrapped by `spaces.GPU` when the `spaces` import succeeded, else `func` unchanged."""
    return spaces.GPU(func) if USING_SPACES else func
31
+
32
+
33
+ from f5_tts.model import DiT, UNetT
34
+ from f5_tts.infer.utils_infer import (
35
+ load_vocoder,
36
+ load_model,
37
+ preprocess_ref_audio_text,
38
+ infer_process,
39
+ remove_silence_for_generated_wav,
40
+ save_spectrogram,
41
+ )
42
+
43
+
44
+ DEFAULT_TTS_MODEL = "F5-TTS"
45
+ tts_model_choice = DEFAULT_TTS_MODEL
46
+
47
+ DEFAULT_TTS_MODEL_CFG = [
48
+ "hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
49
+ "hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt",
50
+ json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
51
+ ]
52
+
53
+
54
+ # load models
55
+
56
+ vocoder = load_vocoder()
57
+
58
+
59
def load_f5tts(ckpt_path=str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))):
    """Load the F5-TTS Base model (DiT backbone) from ``ckpt_path``.

    The default checkpoint path is resolved through ``cached_path`` when this
    function is defined.
    """
    return load_model(
        DiT,
        dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
        ckpt_path,
    )
62
+
63
+
64
def load_e2tts(ckpt_path=None):
    """Load the E2-TTS Base model (UNetT backbone).

    Args:
        ckpt_path: Path to the checkpoint. When ``None`` (the default) the
            official E2-TTS Base checkpoint is resolved through ``cached_path``.

    Returns:
        Whatever ``load_model`` returns for the UNetT configuration.

    The default checkpoint is resolved lazily, at call time, rather than in the
    signature: a ``cached_path(...)`` call in a default argument runs when the
    module is imported, even when E2-TTS is never selected.
    """
    if ckpt_path is None:
        ckpt_path = str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
    E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
    return load_model(UNetT, E2TTS_model_cfg, ckpt_path)
67
+
68
+
69
def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
    """Load a user-supplied DiT checkpoint.

    Both paths are stripped of surrounding whitespace; any path starting with
    ``hf://`` is resolved through ``cached_path``. When ``model_cfg`` is
    ``None`` the F5-TTS Base configuration is used.
    """
    ckpt_path = ckpt_path.strip()
    vocab_path = vocab_path.strip()

    ckpt_path, vocab_path = [
        str(cached_path(location)) if location.startswith("hf://") else location
        for location in (ckpt_path, vocab_path)
    ]

    if model_cfg is None:
        model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
    return load_model(DiT, model_cfg, ckpt_path, vocab_file=vocab_path)
78
+
79
+
80
+ F5TTS_ema_model = load_f5tts()
81
+ E2TTS_ema_model = load_e2tts() if USING_SPACES else None
82
+ custom_ema_model, pre_custom_path = None, ""
83
+
84
+ chat_model_state = None
85
+ chat_tokenizer_state = None
86
+
87
+
88
@gpu_decorator
def generate_response(messages, model, tokenizer):
    """Generate response using Qwen"""
    # Render the chat history into a single prompt string ending with the
    # generation prompt.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    output_batch = model.generate(**model_inputs, max_new_tokens=512, temperature=0.7, top_p=0.95)

    # Drop the prompt tokens from each returned sequence so only the newly
    # generated continuation is decoded.
    completions = []
    for prompt_ids, full_ids in zip(model_inputs.input_ids, output_batch):
        completions.append(full_ids[len(prompt_ids) :])

    return tokenizer.batch_decode(completions, skip_special_tokens=True)[0]
109
+
110
+
111
@gpu_decorator
def infer(
    ref_audio_orig,
    ref_text,
    gen_text,
    model,
    remove_silence,
    cross_fade_duration=0.15,
    nfe_step=32,
    speed=1,
    show_info=gr.Info,
):
    """Synthesize ``gen_text`` in the voice of the reference audio.

    Args:
        ref_audio_orig: Reference audio (the UI passes a file path).
        ref_text: Reference text; passed to ``preprocess_ref_audio_text``.
        gen_text: Text to synthesize.
        model: ``"F5-TTS"``, ``"E2-TTS"`` or
            ``["Custom", ckpt_path, vocab_path, model_cfg]``.
        remove_silence: When truthy, run ``remove_silence_for_generated_wav``
            on the generated audio.
        cross_fade_duration, nfe_step, speed: Forwarded to ``infer_process``.
        show_info: Callable used for progress messages.

    Returns:
        ``((sample_rate, waveform), spectrogram_path, ref_text)``. When the
        reference audio or the text is missing, a warning is shown and
        ``(gr.update(), gr.update(), ref_text)`` is returned instead.

    Raises:
        ValueError: If ``model`` is none of the supported choices.
    """
    import os  # local import: only needed to clean up the temporary WAV below

    global E2TTS_ema_model, custom_ema_model, pre_custom_path

    if not ref_audio_orig:
        gr.Warning("Please provide reference audio.")
        return gr.update(), gr.update(), ref_text

    if not gen_text.strip():
        gr.Warning("Please enter text to generate.")
        return gr.update(), gr.update(), ref_text

    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

    if model == "F5-TTS":
        ema_model = F5TTS_ema_model
    elif model == "E2-TTS":
        # E2-TTS is loaded on first use when it was not loaded at start-up.
        if E2TTS_ema_model is None:
            show_info("Loading E2-TTS model...")
            E2TTS_ema_model = load_e2tts()
        ema_model = E2TTS_ema_model
    elif isinstance(model, list) and model[0] == "Custom":
        assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
        # Reload only when the checkpoint path differs from the last one used.
        if pre_custom_path != model[1]:
            show_info("Loading Custom TTS model...")
            custom_ema_model = load_custom(model[1], vocab_path=model[2], model_cfg=model[3])
            pre_custom_path = model[1]
        ema_model = custom_ema_model
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError(f"Unsupported TTS model choice: {model!r}")

    final_wave, final_sample_rate, combined_spectrogram = infer_process(
        ref_audio,
        ref_text,
        gen_text,
        ema_model,
        vocoder,
        cross_fade_duration=cross_fade_duration,
        nfe_step=nfe_step,
        speed=speed,
        show_info=show_info,
        progress=gr.Progress(),
    )

    # Remove silence
    if remove_silence:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            tmp_wav_path = f.name
        try:
            sf.write(tmp_wav_path, final_wave, final_sample_rate)
            remove_silence_for_generated_wav(tmp_wav_path)
            final_wave, _ = torchaudio.load(tmp_wav_path)
        finally:
            # The scratch file is no longer needed once the audio is in memory.
            os.remove(tmp_wav_path)
        final_wave = final_wave.squeeze().cpu().numpy()

    # Save the spectrogram; the file is kept because its path is returned.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
    save_spectrogram(combined_spectrogram, spectrogram_path)

    return (final_sample_rate, final_wave), spectrogram_path, ref_text
177
+
178
+
179
+ with gr.Blocks() as app_credits:
180
+ gr.Markdown("""
181
+ # Credits
182
+
183
+ * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
184
+ * [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
185
+ * [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
186
+ """)
187
+ with gr.Blocks() as app_tts:
188
+ gr.Markdown("# Batched TTS")
189
+ ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
190
+ gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
191
+ generate_btn = gr.Button("Synthesize", variant="primary")
192
+ with gr.Accordion("Advanced Settings", open=False):
193
+ ref_text_input = gr.Textbox(
194
+ label="Reference Text",
195
+ info="Leave blank to automatically transcribe the reference audio. If you enter text it will override automatic transcription.",
196
+ lines=2,
197
+ )
198
+ remove_silence = gr.Checkbox(
199
+ label="Remove Silences",
200
+ info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
201
+ value=False,
202
+ )
203
+ speed_slider = gr.Slider(
204
+ label="Speed",
205
+ minimum=0.3,
206
+ maximum=2.0,
207
+ value=1.0,
208
+ step=0.1,
209
+ info="Adjust the speed of the audio.",
210
+ )
211
+ nfe_slider = gr.Slider(
212
+ label="NFE Steps",
213
+ minimum=4,
214
+ maximum=64,
215
+ value=32,
216
+ step=2,
217
+ info="Set the number of denoising steps.",
218
+ )
219
+ cross_fade_duration_slider = gr.Slider(
220
+ label="Cross-Fade Duration (s)",
221
+ minimum=0.0,
222
+ maximum=1.0,
223
+ value=0.15,
224
+ step=0.01,
225
+ info="Set the duration of the cross-fade between audio clips.",
226
+ )
227
+
228
+ audio_output = gr.Audio(label="Synthesized Audio")
229
+ spectrogram_output = gr.Image(label="Spectrogram")
230
+
231
@gpu_decorator
def basic_tts(
    ref_audio_input,
    ref_text_input,
    gen_text_input,
    remove_silence,
    cross_fade_duration_slider,
    nfe_slider,
    speed_slider,
):
    """Run ``infer`` with the currently selected TTS model and the Basic-TTS tab's settings.

    Returns the audio, the spectrogram path and the reference text produced by ``infer``.
    """
    synthesis_options = dict(
        cross_fade_duration=cross_fade_duration_slider,
        nfe_step=nfe_slider,
        speed=speed_slider,
    )
    audio_out, spectrogram_path, ref_text_out = infer(
        ref_audio_input, ref_text_input, gen_text_input, tts_model_choice, remove_silence, **synthesis_options
    )
    return audio_out, spectrogram_path, ref_text_out
252
+
253
+ generate_btn.click(
254
+ basic_tts,
255
+ inputs=[
256
+ ref_audio_input,
257
+ ref_text_input,
258
+ gen_text_input,
259
+ remove_silence,
260
+ cross_fade_duration_slider,
261
+ nfe_slider,
262
+ speed_slider,
263
+ ],
264
+ outputs=[audio_output, spectrogram_output, ref_text_input],
265
+ )
266
+
267
+
268
def parse_speechtypes_text(gen_text):
    """Split a ``{style} text {style} text ...`` script into styled segments.

    Returns a list of ``{"style": ..., "text": ...}`` dicts in script order.
    Text before the first ``{...}`` marker gets the style ``"Regular"``; a
    style stays active until the next marker. Empty text pieces are dropped.
    """
    # With one capture group, re.split alternates: text, style, text, style, ...
    pieces = re.split(r"\{(.*?)\}", gen_text)

    segments = []
    active_style = "Regular"
    for position, piece in enumerate(pieces):
        piece = piece.strip()
        if position % 2:
            # Odd positions hold the captured style names.
            active_style = piece
        elif piece:
            segments.append({"style": active_style, "text": piece})

    return segments
291
+
292
+
293
+ with gr.Blocks() as app_multistyle:
294
+ # New section for multistyle generation
295
+ gr.Markdown(
296
+ """
297
+ # Multiple Speech-Type Generation
298
+
299
+ This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, and the system will generate speech using the appropriate type. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
300
+ """
301
+ )
302
+
303
+ with gr.Row():
304
+ gr.Markdown(
305
+ """
306
+ **Example Input:**
307
+ {Regular} Hello, I'd like to order a sandwich please.
308
+ {Surprised} What do you mean you're out of bread?
309
+ {Sad} I really wanted a sandwich though...
310
+ {Angry} You know what, darn you and your little shop!
311
+ {Whisper} I'll just go back home and cry now.
312
+ {Shouting} Why me?!
313
+ """
314
+ )
315
+
316
+ gr.Markdown(
317
+ """
318
+ **Example Input 2:**
319
+ {Speaker1_Happy} Hello, I'd like to order a sandwich please.
320
+ {Speaker2_Regular} Sorry, we're out of bread.
321
+ {Speaker1_Sad} I really wanted a sandwich though...
322
+ {Speaker2_Whisper} I'll give you the last one I was hiding.
323
+ """
324
+ )
325
+
326
+ gr.Markdown(
327
+ "Upload different audio clips for each speech type. The first speech type is mandatory. You can add additional speech types by clicking the 'Add Speech Type' button."
328
+ )
329
+
330
+ # Regular speech type (mandatory)
331
+ with gr.Row() as regular_row:
332
+ with gr.Column():
333
+ regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
334
+ regular_insert = gr.Button("Insert Label", variant="secondary")
335
+ regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
336
+ regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
337
+
338
+ # Bookkeeping lists for all speech types (at most 100, including Regular)
339
+ max_speech_types = 100
340
+ speech_type_rows = [regular_row]
341
+ speech_type_names = [regular_name]
342
+ speech_type_audios = [regular_audio]
343
+ speech_type_ref_texts = [regular_ref_text]
344
+ speech_type_delete_btns = [None]
345
+ speech_type_insert_btns = [regular_insert]
346
+
347
+ # Additional speech types (99 more)
348
+ for i in range(max_speech_types - 1):
349
+ with gr.Row(visible=False) as row:
350
+ with gr.Column():
351
+ name_input = gr.Textbox(label="Speech Type Name")
352
+ delete_btn = gr.Button("Delete Type", variant="secondary")
353
+ insert_btn = gr.Button("Insert Label", variant="secondary")
354
+ audio_input = gr.Audio(label="Reference Audio", type="filepath")
355
+ ref_text_input = gr.Textbox(label="Reference Text", lines=2)
356
+ speech_type_rows.append(row)
357
+ speech_type_names.append(name_input)
358
+ speech_type_audios.append(audio_input)
359
+ speech_type_ref_texts.append(ref_text_input)
360
+ speech_type_delete_btns.append(delete_btn)
361
+ speech_type_insert_btns.append(insert_btn)
362
+
363
+ # Button to add speech type
364
+ add_speech_type_btn = gr.Button("Add Speech Type")
365
+
366
+ # Keep track of autoincrement of speech types, no roll back
367
+ speech_type_count = 1
368
+
369
+ # Function to add a speech type
370
def add_speech_type_fn():
    """Reveal the next hidden speech-type row.

    Returns one update per row; only the row at the current counter position
    is made visible. When the counter has reached ``max_speech_types`` a
    warning is shown and no row changes.
    """
    global speech_type_count

    updates = [gr.update() for _ in range(max_speech_types)]
    if speech_type_count >= max_speech_types:
        gr.Warning("Exhausted maximum number of speech types. Consider restart the app.")
        return updates

    updates[speech_type_count] = gr.update(visible=True)
    speech_type_count += 1
    return updates
379
+
380
+ add_speech_type_btn.click(add_speech_type_fn, outputs=speech_type_rows)
381
+
382
+ # Function to delete a speech type
383
def delete_speech_type_fn():
    """Hide a speech-type row and clear its name, audio and reference text."""
    hidden_row = gr.update(visible=False)
    return hidden_row, None, None, None
385
+
386
+ # Update delete button clicks
387
+ for i in range(1, len(speech_type_delete_btns)):
388
+ speech_type_delete_btns[i].click(
389
+ delete_speech_type_fn,
390
+ outputs=[speech_type_rows[i], speech_type_names[i], speech_type_audios[i], speech_type_ref_texts[i]],
391
+ )
392
+
393
+ # Text input for the prompt
394
+ gen_text_input_multistyle = gr.Textbox(
395
+ label="Text to Generate",
396
+ lines=10,
397
+ placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
398
+ )
399
+
400
def make_insert_speech_type_fn(index):
    """Build the click handler that appends a ``{name} `` label to the script.

    ``index`` is accepted but not used by the returned handler.
    """

    def insert_speech_type_fn(current_text, speech_type_name):
        # Empty or missing values fall back to "" for the text and "None" for the label.
        label = speech_type_name or "None"
        return (current_text or "") + f"{{{label}}} "

    return insert_speech_type_fn
408
+
409
+ for i, insert_btn in enumerate(speech_type_insert_btns):
410
+ insert_fn = make_insert_speech_type_fn(i)
411
+ insert_btn.click(
412
+ insert_fn,
413
+ inputs=[gen_text_input_multistyle, speech_type_names[i]],
414
+ outputs=gen_text_input_multistyle,
415
+ )
416
+
417
+ with gr.Accordion("Advanced Settings", open=False):
418
+ remove_silence_multistyle = gr.Checkbox(
419
+ label="Remove Silences",
420
+ value=True,
421
+ )
422
+
423
+ # Generate button
424
+ generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
425
+
426
+ # Output audio
427
+ audio_output_multistyle = gr.Audio(label="Synthesized Audio")
428
+
429
@gpu_decorator
def generate_multistyle_speech(
    gen_text,
    *args,
):
    """Synthesize a multi-style script and concatenate the per-segment audio.

    ``args`` holds, in order: ``max_speech_types`` names, ``max_speech_types``
    reference audios, ``max_speech_types`` reference texts, then the
    remove-silence flag.

    Returns a list whose first item is ``(sample_rate, audio)`` (or ``None``
    when nothing was generated) followed by the reference text of every
    collected speech type.
    """
    slots = max_speech_types
    names = args[:slots]
    audios = args[slots : 2 * slots]
    ref_texts = args[2 * slots : 3 * slots]
    remove_silence = args[3 * slots]

    # Collect the speech types; rows missing a name or audio get a placeholder key.
    speech_types = OrderedDict()
    for slot, (type_name, type_audio, type_ref_text) in enumerate(zip(names, audios, ref_texts)):
        if type_name and type_audio:
            speech_types[type_name] = {"audio": type_audio, "ref_text": type_ref_text}
        else:
            speech_types[f"@{slot}@"] = {"audio": "", "ref_text": ""}

    def _collected_ref_texts():
        return [entry["ref_text"] for entry in speech_types.values()]

    generated_audio_segments = []
    for segment in parse_speechtypes_text(gen_text):
        style, text = segment["style"], segment["text"]

        if style in speech_types:
            current_style = style
        else:
            gr.Warning(f"Type {style} is not available, will use Regular as default.")
            current_style = "Regular"

        if current_style not in speech_types:
            gr.Warning(f"Please provide reference audio for type {current_style}.")
            return [None] + _collected_ref_texts()

        chosen = speech_types[current_style]
        ref_audio = chosen["audio"]
        ref_text = chosen.get("ref_text", "")

        # show_info=print no pull to top when generating
        audio_out, _, ref_text_out = infer(
            ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
        )
        sr, audio_data = audio_out

        generated_audio_segments.append(audio_data)
        # Store the reference text returned by infer for this style.
        chosen["ref_text"] = ref_text_out

    if not generated_audio_segments:
        gr.Warning("No audio generated.")
        return [None] + _collected_ref_texts()

    final_audio_data = np.concatenate(generated_audio_segments)
    return [(sr, final_audio_data)] + _collected_ref_texts()
491
+
492
+ generate_multistyle_btn.click(
493
+ generate_multistyle_speech,
494
+ inputs=[
495
+ gen_text_input_multistyle,
496
+ ]
497
+ + speech_type_names
498
+ + speech_type_audios
499
+ + speech_type_ref_texts
500
+ + [
501
+ remove_silence_multistyle,
502
+ ],
503
+ outputs=[audio_output_multistyle] + speech_type_ref_texts,
504
+ )
505
+
506
+ # Validation function to disable Generate button if speech types are missing
507
def validate_speech_types(gen_text, regular_name, *args):
    """Enable the generate button only when every style used in the script is defined.

    ``regular_name`` and ``args`` are the speech-type names; empty ones are ignored.
    """
    available = {name for name in (regular_name, *args) if name}
    requested = {segment["style"] for segment in parse_speechtypes_text(gen_text)}

    # Any requested style without a matching name disables the button.
    all_defined = not (requested - available)
    return gr.update(interactive=all_defined)
531
+
532
+ gen_text_input_multistyle.change(
533
+ validate_speech_types,
534
+ inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
535
+ outputs=generate_multistyle_btn,
536
+ )
537
+
538
+
539
+ with gr.Blocks() as app_chat:
540
+ gr.Markdown(
541
+ """
542
+ # Voice Chat
543
+ Have a conversation with an AI using your reference voice!
544
+ 1. Upload a reference audio clip and optionally its transcript.
545
+ 2. Load the chat model.
546
+ 3. Record your message through your microphone.
547
+ 4. The AI will respond using the reference voice.
548
+ """
549
+ )
550
+
551
+ if not USING_SPACES:
552
+ load_chat_model_btn = gr.Button("Load Chat Model", variant="primary")
553
+
554
+ chat_interface_container = gr.Column(visible=False)
555
+
556
@gpu_decorator
def load_chat_model():
    """Load the chat model and tokenizer once, then swap the button for the chat UI.

    Returns two updates: hide the load button, show the chat container.
    """
    global chat_model_state, chat_tokenizer_state

    if chat_model_state is None:
        notify = gr.Info
        notify("Loading chat model...")
        checkpoint = "Qwen/Qwen2.5-3B-Instruct"
        chat_model_state = AutoModelForCausalLM.from_pretrained(
            checkpoint, torch_dtype="auto", device_map="auto"
        )
        chat_tokenizer_state = AutoTokenizer.from_pretrained(checkpoint)
        notify("Chat model loaded.")

    hide_button = gr.update(visible=False)
    reveal_chat = gr.update(visible=True)
    return hide_button, reveal_chat
570
+
571
+ load_chat_model_btn.click(load_chat_model, outputs=[load_chat_model_btn, chat_interface_container])
572
+
573
+ else:
574
+ chat_interface_container = gr.Column()
575
+
576
+ if chat_model_state is None:
577
+ model_name = "Qwen/Qwen2.5-3B-Instruct"
578
+ chat_model_state = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
579
+ chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
580
+
581
+ with chat_interface_container:
582
+ with gr.Row():
583
+ with gr.Column():
584
+ ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
585
+ with gr.Column():
586
+ with gr.Accordion("Advanced Settings", open=False):
587
+ remove_silence_chat = gr.Checkbox(
588
+ label="Remove Silences",
589
+ value=True,
590
+ )
591
+ ref_text_chat = gr.Textbox(
592
+ label="Reference Text",
593
+ info="Optional: Leave blank to auto-transcribe",
594
+ lines=2,
595
+ )
596
+ system_prompt_chat = gr.Textbox(
597
+ label="System Prompt",
598
+ value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
599
+ lines=2,
600
+ )
601
+
602
+ chatbot_interface = gr.Chatbot(label="Conversation")
603
+
604
+ with gr.Row():
605
+ with gr.Column():
606
+ audio_input_chat = gr.Microphone(
607
+ label="Speak your message",
608
+ type="filepath",
609
+ )
610
+ audio_output_chat = gr.Audio(autoplay=True)
611
+ with gr.Column():
612
+ text_input_chat = gr.Textbox(
613
+ label="Type your message",
614
+ lines=1,
615
+ )
616
+ send_btn_chat = gr.Button("Send Message")
617
+ clear_btn_chat = gr.Button("Clear Conversation")
618
+
619
+ conversation_state = gr.State(
620
+ value=[
621
+ {
622
+ "role": "system",
623
+ "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
624
+ }
625
+ ]
626
+ )
627
+
628
+ # Modify process_audio_input to use model and tokenizer from state
629
@gpu_decorator
def process_audio_input(audio_path, text, history, conv_state):
    """Handle audio or text input from user"""
    # With a recording present, the message text comes from the second item
    # returned by preprocess_ref_audio_text.
    if audio_path:
        processed = preprocess_ref_audio_text(audio_path, text)
        text = processed[1]

    # Nothing to send: leave the conversation untouched.
    if not text.strip():
        return history, conv_state, ""

    conv_state.append({"role": "user", "content": text})
    history.append((text, None))

    reply = generate_response(conv_state, chat_model_state, chat_tokenizer_state)

    conv_state.append({"role": "assistant", "content": reply})
    history[-1] = (text, reply)

    return history, conv_state, ""
651
+
652
@gpu_decorator
def generate_audio_response(history, ref_audio, ref_text, remove_silence):
    """Generate TTS audio for AI response

    Args:
        history: Chat history as ``(user_message, ai_response)`` pairs.
        ref_audio: Reference audio for the voice.
        ref_text: Reference text for ``ref_audio``.
        remove_silence: Forwarded to ``infer``.

    Returns:
        ``(audio, ref_text)``. The handler is wired to two outputs (the audio
        player and the reference-text box), so the early exits return
        ``(None, ref_text)``: no audio, reference text left as it was.
    """
    if not history or not ref_audio:
        return None, ref_text

    last_user_message, last_ai_response = history[-1]
    if not last_ai_response:
        return None, ref_text

    audio_result, _, ref_text_out = infer(
        ref_audio,
        ref_text,
        last_ai_response,
        tts_model_choice,
        remove_silence,
        cross_fade_duration=0.15,
        speed=1.0,
        show_info=print,  # show_info=print no pull to top when generating
    )
    return audio_result, ref_text_out
673
+
674
def clear_conversation():
    """Reset the conversation"""
    # A fresh system message is built on every call.
    system_message = {
        "role": "system",
        "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
    }
    empty_history = []
    return empty_history, [system_message]
682
+
683
def update_system_prompt(new_prompt):
    """Update the system prompt and reset the conversation"""
    system_message = {"role": "system", "content": new_prompt}
    return [], [system_message]
687
+
688
+ # Handle audio input
689
+ audio_input_chat.stop_recording(
690
+ process_audio_input,
691
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
692
+ outputs=[chatbot_interface, conversation_state],
693
+ ).then(
694
+ generate_audio_response,
695
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
696
+ outputs=[audio_output_chat, ref_text_chat],
697
+ ).then(
698
+ lambda: None,
699
+ None,
700
+ audio_input_chat,
701
+ )
702
+
703
+ # Handle text input
704
+ text_input_chat.submit(
705
+ process_audio_input,
706
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
707
+ outputs=[chatbot_interface, conversation_state],
708
+ ).then(
709
+ generate_audio_response,
710
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
711
+ outputs=[audio_output_chat, ref_text_chat],
712
+ ).then(
713
+ lambda: None,
714
+ None,
715
+ text_input_chat,
716
+ )
717
+
718
+ # Handle send button
719
+ send_btn_chat.click(
720
+ process_audio_input,
721
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
722
+ outputs=[chatbot_interface, conversation_state],
723
+ ).then(
724
+ generate_audio_response,
725
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
726
+ outputs=[audio_output_chat, ref_text_chat],
727
+ ).then(
728
+ lambda: None,
729
+ None,
730
+ text_input_chat,
731
+ )
732
+
733
+ # Handle clear button
734
+ clear_btn_chat.click(
735
+ clear_conversation,
736
+ outputs=[chatbot_interface, conversation_state],
737
+ )
738
+
739
+ # Handle system prompt change and reset conversation
740
+ system_prompt_chat.change(
741
+ update_system_prompt,
742
+ inputs=system_prompt_chat,
743
+ outputs=[chatbot_interface, conversation_state],
744
+ )
745
+
746
+
747
+ with gr.Blocks() as app:
748
+ gr.Markdown(
749
+ f"""
750
+ # E2/F5 TTS
751
+
752
+ This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not USING_SPACES else "an online demo for [F5-TTS](https://github.com/SWivid/F5-TTS)"} with advanced batch processing support. This app supports the following TTS models:
753
+
754
+ * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
755
+ * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
756
+
757
+ The checkpoints currently support English and Chinese.
758
+
759
+ If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
760
+
761
+ **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
762
+ """
763
+ )
764
+
765
+ last_used_custom = files("f5_tts").joinpath("infer/.cache/last_used_custom_model_info.txt")
766
+
767
def load_last_used_custom():
    """Return the stripped lines of the last-used custom model file.

    When the file does not exist, its parent directory is created and
    ``DEFAULT_TTS_MODEL_CFG`` is returned.
    """
    try:
        with open(last_used_custom, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]
    except FileNotFoundError:
        last_used_custom.parent.mkdir(parents=True, exist_ok=True)
        return DEFAULT_TTS_MODEL_CFG
777
+
778
def switch_tts_model(new_choice):
    """Record the selected TTS model and show or hide the three custom-model inputs.

    For ``"Custom"`` the last-used checkpoint path, vocab path and config are
    loaded into the inputs; for any other choice the inputs are hidden.
    """
    global tts_model_choice

    if new_choice != "Custom":
        tts_model_choice = new_choice
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

    # override in case webpage is refreshed
    custom_ckpt_path, custom_vocab_path, custom_model_cfg = load_last_used_custom()
    tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path, json.loads(custom_model_cfg)]
    return tuple(
        gr.update(visible=True, value=stored)
        for stored in (custom_ckpt_path, custom_vocab_path, custom_model_cfg)
    )
791
+
792
def set_custom_model(custom_ckpt_path, custom_vocab_path, custom_model_cfg):
    """Select the custom model and write its three settings to the last-used file.

    ``custom_model_cfg`` is parsed as JSON for the in-memory choice and written
    to the file as the original string, one setting per line.
    """
    global tts_model_choice

    parsed_cfg = json.loads(custom_model_cfg)
    tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path, parsed_cfg]

    with open(last_used_custom, "w", encoding="utf-8") as f:
        f.write("\n".join((custom_ckpt_path, custom_vocab_path, custom_model_cfg)) + "\n")
797
+
798
+ with gr.Row():
799
+ if not USING_SPACES:
800
+ choose_tts_model = gr.Radio(
801
+ choices=[DEFAULT_TTS_MODEL, "E2-TTS", "Custom"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
802
+ )
803
+ else:
804
+ choose_tts_model = gr.Radio(
805
+ choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
806
+ )
807
+ custom_ckpt_path = gr.Dropdown(
808
+ choices=[DEFAULT_TTS_MODEL_CFG[0]],
809
+ value=load_last_used_custom()[0],
810
+ allow_custom_value=True,
811
+ label="Model: local_path | hf://user_id/repo_id/model_ckpt",
812
+ visible=False,
813
+ )
814
+ custom_vocab_path = gr.Dropdown(
815
+ choices=[DEFAULT_TTS_MODEL_CFG[1]],
816
+ value=load_last_used_custom()[1],
817
+ allow_custom_value=True,
818
+ label="Vocab: local_path | hf://user_id/repo_id/vocab_file",
819
+ visible=False,
820
+ )
821
+ custom_model_cfg = gr.Dropdown(
822
+ choices=[
823
+ DEFAULT_TTS_MODEL_CFG[2],
824
+ json.dumps(dict(dim=768, depth=18, heads=12, ff_mult=2, text_dim=512, conv_layers=4)),
825
+ ],
826
+ value=load_last_used_custom()[2],
827
+ allow_custom_value=True,
828
+ label="Config: in a dictionary form",
829
+ visible=False,
830
+ )
831
+
832
+ choose_tts_model.change(
833
+ switch_tts_model,
834
+ inputs=[choose_tts_model],
835
+ outputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
836
+ show_progress="hidden",
837
+ )
838
+ custom_ckpt_path.change(
839
+ set_custom_model,
840
+ inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
841
+ show_progress="hidden",
842
+ )
843
+ custom_vocab_path.change(
844
+ set_custom_model,
845
+ inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
846
+ show_progress="hidden",
847
+ )
848
+ custom_model_cfg.change(
849
+ set_custom_model,
850
+ inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
851
+ show_progress="hidden",
852
+ )
853
+
854
+ gr.TabbedInterface(
855
+ [app_tts, app_multistyle, app_chat, app_credits],
856
+ ["Basic-TTS", "Multi-Speech", "Voice-Chat", "Credits"],
857
+ )
858
+
859
+
860
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
@click.option(
    "--root_path",
    "-r",
    default=None,
    type=str,
    help='The root path (or "mount point") of the application, if it\'s not served from the root ("/") of the domain. Often used when the application is behind a reverse proxy that forwards requests to the application, e.g. set "/myapp" or full URL for application served at "https://example.com/myapp".',
)
def main(port, host, share, api, root_path):
    """Command-line entry point: enable the queue on the app and launch it."""
    print("Starting app...")
    queued_app = app.queue(api_open=api)
    queued_app.launch(
        server_name=host,
        server_port=port,
        share=share,
        show_api=api,
        root_path=root_path,
    )
882
+
883
+
884
+ if __name__ == "__main__":
885
+ if not USING_SPACES:
886
+ main()
887
+ else:
888
+ app.queue().launch()
src/f5_tts/infer/speech_edit.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
# Must be set before torch is imported. The variable name was previously
# misspelled ("PYTOCH_..."), so the fallback was never actually enabled.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torchaudio
8
+
9
+ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder, save_spectrogram
10
+ from f5_tts.model import CFM, DiT, UNetT
11
+ from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
12
+
13
# Pick the first available backend in the order cuda > xpu > mps > cpu.
# `torch.xpu` is missing from some PyTorch builds, so probe for the attribute
# first instead of failing with AttributeError at import time.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = "xpu"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
22
+
23
+
24
+ # --------------------- Dataset Settings -------------------- #
25
+
26
+ target_sample_rate = 24000
27
+ n_mel_channels = 100
28
+ hop_length = 256
29
+ win_length = 1024
30
+ n_fft = 1024
31
+ mel_spec_type = "vocos" # 'vocos' or 'bigvgan'
32
+ target_rms = 0.1
33
+
34
+ tokenizer = "pinyin"
35
+ dataset_name = "Emilia_ZH_EN"
36
+
37
+
38
+ # ---------------------- infer setting ---------------------- #
39
+
40
+ seed = None # int | None
41
+
42
+ exp_name = "F5TTS_Base" # F5TTS_Base | E2TTS_Base
43
+ ckpt_step = 1200000
44
+
45
+ nfe_step = 32 # 16, 32
46
+ cfg_strength = 2.0
47
+ ode_method = "euler" # euler | midpoint
48
+ sway_sampling_coef = -1.0
49
+ speed = 1.0
50
+
51
# Map the experiment name to its backbone class and constructor arguments.
if exp_name == "F5TTS_Base":
    model_cls = DiT
    model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)

elif exp_name == "E2TTS_Base":
    model_cls = UNetT
    model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)

else:
    # Fail here with a clear message rather than with a NameError on
    # `model_cls` further down the script.
    raise ValueError(f"Unsupported exp_name {exp_name!r}; expected 'F5TTS_Base' or 'E2TTS_Base'")
58
+
59
+ ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.safetensors"
60
+ output_dir = "tests"
61
+
62
+ # [leverage https://github.com/MahmoudAshraf97/ctc-forced-aligner to get char level alignment]
63
+ # pip install git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
64
+ # [write the origin_text into a file, e.g. tests/test_edit.txt]
65
+ # ctc-forced-aligner --audio_path "src/f5_tts/infer/examples/basic/basic_ref_en.wav" --text_path "tests/test_edit.txt" --language "zho" --romanize --split_size "char"
66
+ # [result will be saved at same path of audio file]
67
+ # [--language "zho" for Chinese, "eng" for English]
68
+ # [if local ckpt, set --alignment_model "../checkpoints/mms-300m-1130-forced-aligner"]
69
+
70
+ audio_to_edit = "src/f5_tts/infer/examples/basic/basic_ref_en.wav"
71
+ origin_text = "Some call me nature, others call me mother nature."
72
+ target_text = "Some call me optimist, others call me realist."
73
+ parts_to_edit = [
74
+ [1.42, 2.44],
75
+ [4.04, 4.9],
76
+ ] # start/end times of "nature" & "mother nature", in seconds
77
+ fix_duration = [
78
+ 1.2,
79
+ 1,
80
+ ] # fix duration for "optimist" & "realist", in seconds
81
+
82
+ # audio_to_edit = "src/f5_tts/infer/examples/basic/basic_ref_zh.wav"
83
+ # origin_text = "对,这就是我,万人敬仰的太乙真人。"
84
+ # target_text = "对,那就是你,万人敬仰的太白金星。"
85
+ # parts_to_edit = [[0.84, 1.4], [1.92, 2.4], [4.26, 6.26], ]
86
+ # fix_duration = None # use origin text duration
87
+
88
+
89
+ # -------------------------------------------------#
90
+
91
use_ema = True

# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Vocoder model
local = False  # passed to load_vocoder as is_local
if mel_spec_type == "vocos":
    vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
elif mel_spec_type == "bigvgan":
    vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
else:
    # Previously this fell through and failed with a NameError on vocoder_local_path.
    raise ValueError(f"Unsupported mel_spec_type {mel_spec_type!r}; expected 'vocos' or 'bigvgan'")
vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)

# Tokenizer
vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)

# Model
model = CFM(
    transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
    mel_spec_kwargs=dict(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    ),
    odeint_kwargs=dict(
        method=ode_method,
    ),
    vocab_char_map=vocab_char_map,
).to(device)

# bigvgan is always run in float32; otherwise load_checkpoint picks the dtype.
dtype = torch.float32 if mel_spec_type == "bigvgan" else None
model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
126
+
127
+ # Audio
128
audio, sr = torchaudio.load(audio_to_edit)
if audio.shape[0] > 1:
    # down-mix multi-channel audio to mono
    audio = torch.mean(audio, dim=0, keepdim=True)
rms = torch.sqrt(torch.mean(torch.square(audio)))
if rms < target_rms:
    # boost quiet input up to the target loudness (undone on the output later)
    audio = audio * target_rms / rms
if sr != target_sample_rate:
    resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
    audio = resampler(audio)

# Build the edit mask span by span. True is appended for the audio kept between
# edits, False for each span to be replaced.
# NOTE(review): presumably False marks the frames the model regenerates - confirm against CFM.sample.
offset = 0
audio_ = torch.zeros(1, 0)
edit_mask = torch.zeros(1, 0, dtype=torch.bool)
for part_idx, (start, end) in enumerate(parts_to_edit):
    # Index instead of pop(0) so the fix_duration setting is not consumed.
    part_dur = end - start if fix_duration is None else fix_duration[part_idx]
    part_dur = part_dur * target_sample_rate
    start = start * target_sample_rate
    audio_ = torch.cat((audio_, audio[:, round(offset) : round(start)], torch.zeros(1, round(part_dur))), dim=-1)
    edit_mask = torch.cat(
        (
            edit_mask,
            torch.ones(1, round((start - offset) / hop_length), dtype=torch.bool),
            torch.zeros(1, round(part_dur / hop_length), dtype=torch.bool),
        ),
        dim=-1,
    )
    offset = end * target_sample_rate
# NOTE(review): audio_ is assembled but never used; the line below is disabled in
# the original, so the model is conditioned on the unmodified `audio`.
# audio = torch.cat((audio_, audio[:, round(offset):]), dim = -1)
edit_mask = F.pad(edit_mask, (0, audio.shape[-1] // hop_length - edit_mask.shape[-1] + 1), value=True)
audio = audio.to(device)
edit_mask = edit_mask.to(device)
159
+
160
+ # Text
161
text_list = [target_text]
if tokenizer == "pinyin":
    final_text_list = convert_char_to_pinyin(text_list)
else:
    # text_list is already a one-item batch; wrapping it again would nest the
    # whole sentence as a single token.
    # NOTE(review): assumes model.sample takes one text entry per batch item - confirm.
    final_text_list = text_list
print(f"text  : {text_list}")
print(f"pinyin: {final_text_list}")
168
+
169
+ # Duration
170
ref_audio_len = 0  # nothing is cut from the front of the generated mel
duration = audio.shape[-1] // hop_length

# Inference
sample_kwargs = dict(
    cond=audio,
    text=final_text_list,
    duration=duration,
    steps=nfe_step,
    cfg_strength=cfg_strength,
    sway_sampling_coef=sway_sampling_coef,
    seed=seed,
    edit_mask=edit_mask,
)
with torch.inference_mode():
    generated, trajectory = model.sample(**sample_kwargs)
    print(f"Generated mel: {generated.shape}")

    # Final result: mel -> waveform through the selected vocoder
    generated = generated.to(torch.float32)[:, ref_audio_len:, :]
    gen_mel_spec = generated.permute(0, 2, 1)
    if mel_spec_type == "vocos":
        generated_wave = vocoder.decode(gen_mel_spec).cpu()
    elif mel_spec_type == "bigvgan":
        generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()

    # undo the loudness boost applied to a quiet input
    if rms < target_rms:
        generated_wave = generated_wave * rms / target_rms

    png_path = f"{output_dir}/speech_edit_out.png"
    wav_path = f"{output_dir}/speech_edit_out.wav"
    save_spectrogram(gen_mel_spec[0].cpu().numpy(), png_path)
    torchaudio.save(wav_path, generated_wave, target_sample_rate)
    print(f"Generated wav: {generated_wave.shape}")
src/f5_tts/infer/utils_infer.py ADDED
@@ -0,0 +1,583 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unified script for inference process
2
+ # Make adjustments inside functions, and consider both gradio and cli scripts if need to change func output format
3
+ import os
4
+ import sys
5
+
6
# Must be set before torch is imported. The variable name was previously
# misspelled ("PYTOCH_..."), so the fallback was never actually enabled.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility
7
+ sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../third_party/BigVGAN/")
8
+
9
+ import hashlib
10
+ import re
11
+ import tempfile
12
+ from importlib.resources import files
13
+
14
+ import matplotlib
15
+
16
+ matplotlib.use("Agg")
17
+
18
+ import matplotlib.pylab as plt
19
+ import numpy as np
20
+ import torch
21
+ import torchaudio
22
+ import tqdm
23
+ from huggingface_hub import snapshot_download, hf_hub_download
24
+ from pydub import AudioSegment, silence
25
+ from transformers import pipeline
26
+ from vocos import Vocos
27
+
28
+ from f5_tts.model import CFM
29
+ from f5_tts.model.utils import (
30
+ get_tokenizer,
31
+ convert_char_to_pinyin,
32
+ )
33
+
34
+ _ref_audio_cache = {}
35
+
36
# Pick the first available backend in the order cuda > xpu > mps > cpu.
# `torch.xpu` is missing from some PyTorch builds, so probe for the attribute
# first instead of failing with AttributeError at import time.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = "xpu"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
45
+
46
+ # -----------------------------------------
47
+
48
+ target_sample_rate = 24000
49
+ n_mel_channels = 100
50
+ hop_length = 256
51
+ win_length = 1024
52
+ n_fft = 1024
53
+ mel_spec_type = "vocos"
54
+ target_rms = 0.1
55
+ cross_fade_duration = 0.15
56
+ ode_method = "euler"
57
+ nfe_step = 32 # 16, 32
58
+ cfg_strength = 2.0
59
+ sway_sampling_coef = -1.0
60
+ speed = 1.0
61
+ fix_duration = None
62
+
63
+ # -----------------------------------------
64
+
65
+
66
+ # chunk text into smaller pieces
67
+
68
+
69
def chunk_text(text, max_chars=135):
    """Split text into sentence-like chunks for batch-wise synthesis.

    The split works on word counts:
      1. split on ". " and merge any sentence shorter than 4 words into a
         neighbouring sentence;
      2. split each sentence on ", " and close a chunk as soon as it holds
         more than 20 words;
      3. fold a trailing chunk shorter than 4 words into the previous chunk.

    Args:
        text: Text to split.
        max_chars: Not used by the word-based splitting; kept so existing
            callers that pass it keep working.

    Returns:
        List of chunk strings; empty when ``text`` contains no sentence.
    """
    min_words = 4
    max_words = 20

    # Step 1: split into sentences on ". "
    sentences = [s.strip() for s in text.split(". ") if s.strip()]

    # Merge sentences shorter than `min_words` words into a neighbour.
    i = 0
    while i < len(sentences):
        if len(sentences[i].split()) >= min_words:
            i += 1
        elif i == 0 and len(sentences) > 1:
            # first sentence: merge into the following one
            sentences[1] = sentences[0] + ", " + sentences[1]
            del sentences[0]
        elif i > 0:
            # merge into the previous one, then re-check the merged sentence
            sentences[i - 1] = sentences[i - 1] + ", " + sentences[i]
            del sentences[i]
            i -= 1
        else:
            # A lone short sentence has no neighbour; keep it and move on
            # (without this step the loop never terminated).
            i += 1

    # Step 2: split over-long sentences on ", "
    final_sentences = []
    for sentence in sentences:
        buffer = []
        for part in (p.strip() for p in sentence.split(", ")):
            buffer.append(part)
            if sum(len(p.split()) for p in buffer) > max_words:
                final_sentences.append(", ".join(buffer))
                buffer = []
        if buffer:
            final_sentences.append(", ".join(buffer))

    # Step 3: avoid ending on a very short chunk. The length check comes first
    # so empty input no longer raises IndexError.
    if len(final_sentences) >= 2 and len(final_sentences[-1].split()) < min_words:
        final_sentences[-2] = final_sentences[-2] + ", " + final_sentences[-1]
        final_sentences = final_sentences[:-1]

    return final_sentences
120
+
121
+
122
+ # load vocoder
123
def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device, hf_cache_dir=None):
    """Load a vocoder in eval mode on ``device``.

    Args:
        vocoder_name: ``"vocos"`` or ``"bigvgan"``.
        is_local: Load from ``local_path`` instead of downloading from the Hugging Face hub.
        local_path: Directory holding the vocoder files (used when ``is_local``).
        device: Device the vocoder is moved to.
        hf_cache_dir: Cache directory forwarded to the hub download helpers.

    Returns:
        The loaded vocoder module.

    Raises:
        ImportError: If the BigVGAN submodule cannot be imported.
        ValueError: If ``vocoder_name`` is not a supported vocoder.
    """
    if vocoder_name == "vocos":
        if is_local:
            print(f"Load vocos from local path {local_path}")
            config_path = f"{local_path}/config.yaml"
            model_path = f"{local_path}/pytorch_model.bin"
        else:
            print("Download Vocos from huggingface charactr/vocos-mel-24khz")
            repo_id = "charactr/vocos-mel-24khz"
            config_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml")
            model_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin")
        vocoder = Vocos.from_hparams(config_path)
        state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
        from vocos.feature_extractors import EncodecFeatures

        if isinstance(vocoder.feature_extractor, EncodecFeatures):
            # Copy the extractor's own encodec weights into the state dict
            # under their "feature_extractor.encodec." keys before loading.
            encodec_parameters = {
                "feature_extractor.encodec." + key: value
                for key, value in vocoder.feature_extractor.encodec.state_dict().items()
            }
            state_dict.update(encodec_parameters)
        vocoder.load_state_dict(state_dict)
        vocoder = vocoder.eval().to(device)
    elif vocoder_name == "bigvgan":
        try:
            from third_party.BigVGAN import bigvgan
        except ImportError as err:
            # Previously only printed, after which the code failed with a NameError.
            raise ImportError(
                "You need to follow the README to init submodule and change the BigVGAN source code."
            ) from err
        if not is_local:
            # For local use, download from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main
            local_path = snapshot_download(repo_id="nvidia/bigvgan_v2_24khz_100band_256x", cache_dir=hf_cache_dir)
        vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)

        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to(device)
    else:
        # Previously fell through to `return vocoder` and raised UnboundLocalError.
        raise ValueError(f"Unsupported vocoder_name {vocoder_name!r}; expected 'vocos' or 'bigvgan'")
    return vocoder
167
+
168
+
169
+ # load asr pipeline
170
+
171
+ asr_pipe = None
172
+
173
+
174
def initialize_asr_pipeline(device: str = device, dtype=None):
    """Create the speech-recognition pipeline and store it in the module-level ``asr_pipe``.

    Args:
        device: Device string handed to the pipeline.
        dtype: Torch dtype for the model. When ``None``, float16 is chosen on
            CUDA devices with major compute capability >= 6 that are not
            ZLUDA, and float32 otherwise.
    """
    global asr_pipe

    if dtype is None:
        half_precision_ok = (
            "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 6
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
        )
        dtype = torch.float16 if half_precision_ok else torch.float32

    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model="vinai/PhoWhisper-medium",
        torch_dtype=dtype,
        device=device,
    )
190
+
191
+
192
+ # transcribe
193
+
194
+
195
def transcribe(ref_audio, language=None):
    """Transcribe ``ref_audio`` with the ASR pipeline, creating the pipeline on first use.

    Args:
        ref_audio: Audio input handed to the pipeline.
        language: Optional language forwarded in ``generate_kwargs``; omitted when falsy.

    Returns:
        The recognised text with surrounding whitespace stripped.
    """
    global asr_pipe
    if asr_pipe is None:
        initialize_asr_pipeline(device=device)

    generate_kwargs = {"task": "transcribe"}
    if language:
        generate_kwargs["language"] = language

    result = asr_pipe(
        ref_audio,
        chunk_length_s=30,
        batch_size=128,
        generate_kwargs=generate_kwargs,
        return_timestamps=False,
    )
    return result["text"].strip()
206
+
207
+
208
+ # load model checkpoint for inference
209
+
210
+
211
def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
    """Load weights from ``ckpt_path`` into ``model`` and return it on ``device``.

    Args:
        model: Module to receive the weights.
        ckpt_path: Checkpoint file; a ``.safetensors`` suffix selects the
            safetensors loader, anything else goes through ``torch.load``.
        device: Device string used as load target and final model device.
        dtype: Dtype the model is cast to. When ``None``, float16 is chosen on
            CUDA devices with major compute capability >= 6 that are not
            ZLUDA, and float32 otherwise.
        use_ema: Load the EMA weights (keys prefixed ``ema_model.``) instead
            of the plain model weights.

    Returns:
        The model with weights loaded, moved to ``device``.
    """
    if dtype is None:
        half_precision_ok = (
            "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 6
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
        )
        dtype = torch.float16 if half_precision_ok else torch.float32
    model = model.to(dtype)

    ckpt_type = ckpt_path.split(".")[-1]
    is_safetensors = ckpt_type == "safetensors"
    if is_safetensors:
        from safetensors.torch import load_file

        checkpoint = load_file(ckpt_path, device=device)
    else:
        checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)

    if use_ema:
        # A safetensors file is itself the EMA state dict; other formats nest it.
        source = checkpoint if is_safetensors else checkpoint["ema_model_state_dict"]
        state_dict = {
            name.replace("ema_model.", ""): tensor
            for name, tensor in source.items()
            if name not in ["initted", "step"]
        }

        # patch for backward compatibility, 305e3ea
        for stale_key in ("mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"):
            state_dict.pop(stale_key, None)
    else:
        source = checkpoint if is_safetensors else checkpoint["model_state_dict"]
        state_dict = source

    model.load_state_dict(state_dict)

    # Drop every reference to the loaded tensors before emptying the CUDA cache.
    del checkpoint, source, state_dict
    torch.cuda.empty_cache()

    return model.to(device)
254
+
255
+
256
+ # load model for inference
257
+
258
+
259
def load_model(
    model_cls,
    model_cfg,
    ckpt_path,
    mel_spec_type=mel_spec_type,
    vocab_file="",
    ode_method=ode_method,
    use_ema=True,
    device=device,
):
    """Build a CFM model around ``model_cls`` and load its checkpoint.

    Args:
        model_cls: Backbone class, instantiated with ``model_cfg``.
        model_cfg: Keyword arguments for ``model_cls``.
        ckpt_path: Checkpoint file to load.
        mel_spec_type: Mel-spectrogram flavour; ``"bigvgan"`` forces float32 loading.
        vocab_file: Vocabulary file; when empty, the packaged
            ``infer/examples/vocab.txt`` is used.
        ode_method: ODE solver method forwarded to CFM.
        use_ema: Whether to load EMA weights.
        device: Target device.

    Returns:
        The model with weights loaded.
    """
    if vocab_file == "":
        vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
    tokenizer = "custom"

    print("\nvocab : ", vocab_file)
    print("token : ", tokenizer)
    print("model : ", ckpt_path, "\n")

    vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)

    backbone = model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels)
    mel_spec_kwargs = dict(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    )
    model = CFM(
        transformer=backbone,
        mel_spec_kwargs=mel_spec_kwargs,
        odeint_kwargs=dict(method=ode_method),
        vocab_char_map=vocab_char_map,
    ).to(device)

    # bigvgan is always loaded in float32; otherwise load_checkpoint decides.
    dtype = torch.float32 if mel_spec_type == "bigvgan" else None
    return load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
298
+
299
+
300
def remove_silence_edges(audio, silence_threshold=-42):
    """Trim leading and trailing silence from an audio segment.

    Args:
        audio: Segment to trim (sliced and iterated in millisecond steps).
        silence_threshold: dBFS level at or below which audio counts as silent.

    Returns:
        The trimmed segment.
    """
    # Leading edge: skip up to the first non-silent position.
    lead_ms = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[lead_ms:]

    # Trailing edge: walk backwards one millisecond at a time until sound is found.
    keep_seconds = audio.duration_seconds
    for ms_slice in reversed(audio):
        if ms_slice.dBFS > silence_threshold:
            break
        keep_seconds -= 0.001

    return audio[: int(keep_seconds * 1000)]
314
+
315
+
316
+ # preprocess reference audio and text
317
+
318
+
319
def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print, device=device):
    """Convert the reference audio to a clipped, trimmed wav and finalise its transcript.

    Args:
        ref_audio_orig: Path (or file) of the original reference audio.
        ref_text: Reference transcript; when blank, the audio is transcribed
            and the result cached by the audio's md5 hash.
        clip_short: Clip the reference to roughly 15 seconds, preferring cuts at silences.
        show_info: Callable used for progress messages.
        device: Not used in this function.

    Returns:
        Tuple ``(ref_audio, ref_text)``: path of the temporary wav file and the
        transcript, which is made to end with ``". "`` unless it ends with ``"。"``.
    """

    def _join_until_limit(segments, notice):
        # Concatenate segments; stop once more than 6s is collected and the
        # next segment would push the total beyond 15s.
        joined = AudioSegment.silent(duration=0)
        for seg in segments:
            if len(joined) > 6000 and len(joined + seg) > 15000:
                show_info(notice)
                break
            joined += seg
        return joined

    show_info("Converting audio...")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        aseg = AudioSegment.from_file(ref_audio_orig)

        if clip_short:
            # 1. try to find long silence for clipping
            clipped = _join_until_limit(
                silence.split_on_silence(
                    aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
                ),
                "Audio is over 15s, clipping short. (1)",
            )

            # 2. try to find short silence for clipping if 1. failed
            if len(clipped) > 15000:
                clipped = _join_until_limit(
                    silence.split_on_silence(
                        aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
                    ),
                    "Audio is over 15s, clipping short. (2)",
                )

            aseg = clipped

            # 3. if no proper silence found for clipping
            if len(aseg) > 15000:
                aseg = aseg[:15000]
                show_info("Audio is over 15s, clipping short. (3)")

        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
        aseg.export(f.name, format="wav")
        ref_audio = f.name

    # Hash the converted file; the hash is the key of the transcription cache.
    with open(ref_audio, "rb") as audio_file:
        audio_hash = hashlib.md5(audio_file.read()).hexdigest()

    if ref_text.strip():
        show_info("Using custom reference text...")
    else:
        global _ref_audio_cache
        if audio_hash in _ref_audio_cache:
            show_info("Using cached reference text...")
            ref_text = _ref_audio_cache[audio_hash]
        else:
            show_info("No reference text provided, transcribing reference audio...")
            ref_text = transcribe(ref_audio)
            # Only transcriptions are cached, so users can still tweak a custom ref_text.
            _ref_audio_cache[audio_hash] = ref_text

    # Ensure ref_text ends with sentence-ending punctuation.
    if not ref_text.endswith((". ", "。")):
        ref_text += " " if ref_text.endswith(".") else ". "

    print("\nref_text  ", ref_text)

    return ref_audio, ref_text
388
+
389
+
390
+ # infer process: chunk text -> infer batches [i.e. infer_batch_process()]
391
+
392
+
393
def infer_process(
    ref_audio,
    ref_text,
    gen_text,
    model_obj,
    vocoder,
    mel_spec_type=mel_spec_type,
    show_info=print,
    progress=tqdm,
    target_rms=target_rms,
    cross_fade_duration=cross_fade_duration,
    nfe_step=nfe_step,
    cfg_strength=cfg_strength,
    sway_sampling_coef=sway_sampling_coef,
    speed=speed,
    fix_duration=fix_duration,
    device=device,
):
    """Chunk ``gen_text`` and synthesise it batch by batch.

    Loads the reference audio from ``ref_audio``, splits ``gen_text`` with
    ``chunk_text`` and delegates to ``infer_batch_process``; the remaining
    arguments are forwarded unchanged.

    Returns:
        Whatever ``infer_batch_process`` returns.
    """
    audio, sr = torchaudio.load(ref_audio)

    # Character budget derived from the reference speaking rate (bytes per
    # second) and the time left in a 25-second window.
    ref_seconds = audio.shape[-1] / sr
    max_chars = int(len(ref_text.encode("utf-8")) / ref_seconds * (25 - ref_seconds))
    gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
    for idx, batch_text in enumerate(gen_text_batches):
        print(f"gen_text {idx}", batch_text)
    print("\n")

    show_info(f"Generating audio in {len(gen_text_batches)} batches...")
    forwarded = dict(
        mel_spec_type=mel_spec_type,
        progress=progress,
        target_rms=target_rms,
        cross_fade_duration=cross_fade_duration,
        nfe_step=nfe_step,
        cfg_strength=cfg_strength,
        sway_sampling_coef=sway_sampling_coef,
        speed=speed,
        fix_duration=fix_duration,
        device=device,
    )
    return infer_batch_process((audio, sr), ref_text, gen_text_batches, model_obj, vocoder, **forwarded)
437
+
438
+
439
+ # infer batches
440
+
441
+
442
def _cross_fade_join(waves, cross_fade_duration, sample_rate):
    """Concatenate 1-D waveforms, linearly cross-fading every seam."""
    if cross_fade_duration <= 0:
        # Simply concatenate
        return np.concatenate(waves)

    joined = waves[0]
    for next_wave in waves[1:]:
        # The overlap may not exceed either neighbour's length.
        n_fade = min(int(cross_fade_duration * sample_rate), len(joined), len(next_wave))
        if n_fade <= 0:
            # No overlap possible, concatenate
            joined = np.concatenate([joined, next_wave])
            continue

        fade_out = np.linspace(1, 0, n_fade)
        fade_in = np.linspace(0, 1, n_fade)
        seam = joined[-n_fade:] * fade_out + next_wave[:n_fade] * fade_in
        joined = np.concatenate([joined[:-n_fade], seam, next_wave[n_fade:]])
    return joined


def infer_batch_process(
    ref_audio,
    ref_text,
    gen_text_batches,
    model_obj,
    vocoder,
    mel_spec_type="vocos",
    progress=tqdm,
    target_rms=0.1,
    cross_fade_duration=0.15,
    nfe_step=32,
    cfg_strength=2.0,
    sway_sampling_coef=-1,
    speed=1,
    fix_duration=None,
    device=None,
):
    """Synthesise every text batch with the reference voice and stitch the results.

    Args:
        ref_audio: Tuple ``(audio, sr)`` of reference waveform tensor and sample rate.
        ref_text: Transcript of the reference audio.
        gen_text_batches: Texts to synthesise, one model call each.
        model_obj: Model exposing ``sample(...)``.
        vocoder: Vocoder turning mel spectrograms into waveforms.
        mel_spec_type: ``"vocos"`` or ``"bigvgan"``; selects how the vocoder is called.
        progress: Object whose ``tqdm`` wraps the batch iterable.
        target_rms: Quiet reference audio is boosted to this RMS (undone on the output).
        cross_fade_duration: Seconds of cross-fade between batches; ``<= 0`` concatenates.
        nfe_step, cfg_strength, sway_sampling_coef: Forwarded to ``model_obj.sample``.
        speed: Divides the estimated generated duration.
        fix_duration: Total duration in seconds; overrides the estimate when given.
        device: Device the reference audio is moved to.

    Returns:
        Tuple ``(final_wave, target_sample_rate, combined_spectrogram)``.

    Raises:
        ValueError: If no batch was generated (empty ``gen_text_batches``).
    """
    audio, sr = ref_audio
    if audio.shape[0] > 1:
        # down-mix multi-channel audio to mono
        audio = torch.mean(audio, dim=0, keepdim=True)

    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        audio = audio * target_rms / rms
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        audio = resampler(audio)
    audio = audio.to(device)

    generated_waves = []
    spectrograms = []

    # A final single-byte character gets a trailing space appended.
    if len(ref_text[-1].encode("utf-8")) == 1:
        ref_text = ref_text + " "
    for gen_text in progress.tqdm(gen_text_batches):
        # Prepare the text
        text_list = [ref_text + gen_text]
        final_text_list = convert_char_to_pinyin(text_list)

        ref_audio_len = audio.shape[-1] // hop_length
        if fix_duration is not None:
            duration = int(fix_duration * target_sample_rate / hop_length)
        else:
            # Scale the reference frame count by the byte-length ratio of the texts.
            ref_text_len = len(ref_text.encode("utf-8"))
            gen_text_len = len(gen_text.encode("utf-8"))
            duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)

        # inference
        with torch.inference_mode():
            generated, _ = model_obj.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=nfe_step,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
            )

            generated = generated.to(torch.float32)
            # keep only the frames after the reference prompt
            generated = generated[:, ref_audio_len:, :]
            generated_mel_spec = generated.permute(0, 2, 1)
            if mel_spec_type == "vocos":
                generated_wave = vocoder.decode(generated_mel_spec)
            elif mel_spec_type == "bigvgan":
                generated_wave = vocoder(generated_mel_spec)
            if rms < target_rms:
                generated_wave = generated_wave * rms / target_rms

            # wav -> numpy
            generated_wave = generated_wave.squeeze().cpu().numpy()

            generated_waves.append(generated_wave)
            spectrograms.append(generated_mel_spec[0].cpu().numpy())

    if not generated_waves:
        # Previously surfaced as an IndexError / opaque numpy error below.
        raise ValueError("gen_text_batches is empty: nothing to synthesize")

    # Combine all generated waves with cross-fading
    final_wave = _cross_fade_join(generated_waves, cross_fade_duration, target_sample_rate)

    # Create a combined spectrogram
    combined_spectrogram = np.concatenate(spectrograms, axis=1)

    return final_wave, target_sample_rate, combined_spectrogram
558
+
559
+
560
+ # remove silence from generated wav
561
+
562
+
563
def remove_silence_for_generated_wav(filename):
    """Rewrite the audio file at ``filename`` with long silences shortened.

    The audio is split at silences of at least 1000 ms below -50 dBFS, 500 ms
    of silence is kept around each piece, and the pieces are joined and
    exported back to ``filename`` as wav.
    """
    pieces = silence.split_on_silence(
        AudioSegment.from_file(filename),
        min_silence_len=1000,
        silence_thresh=-50,
        keep_silence=500,
        seek_step=10,
    )
    stitched = AudioSegment.silent(duration=0)
    for piece in pieces:
        stitched += piece
    stitched.export(filename, format="wav")
573
+
574
+
575
+ # save spectrogram
576
+
577
+
578
def save_spectrogram(spectrogram, path):
    """Render ``spectrogram`` as an image with a colour bar and save it to ``path``.

    Args:
        spectrogram: 2-D array-like drawn with ``imshow`` (origin at the bottom).
        path: Output image path.
    """
    fig = plt.figure(figsize=(12, 4))
    try:
        plt.imshow(spectrogram, origin="lower", aspect="auto")
        plt.colorbar()
        plt.savefig(path)
    finally:
        # Close the figure even when drawing or saving fails, so it does not leak.
        plt.close(fig)
src/f5_tts/model/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from f5_tts.model.cfm import CFM
2
+
3
+ from f5_tts.model.backbones.unett import UNetT
4
+ from f5_tts.model.backbones.dit import DiT
5
+ from f5_tts.model.backbones.mmdit import MMDiT
6
+
7
+ from f5_tts.model.trainer import Trainer
8
+
9
+
10
+ __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
src/f5_tts/model/backbones/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Backbones quick introduction
2
+
3
+
4
+ ### unett.py
5
+ - flat unet transformer
6
+ - structure same as in e2-tts & voicebox paper except using rotary pos emb
7
+ - update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat
8
+
9
+ ### dit.py
10
+ - adaln-zero dit
11
+ - embedded timestep as condition
12
+ - concatted noised_input + masked_cond + embedded_text, linear proj in
13
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
14
+ - possible long skip connection (first layer to last layer)
15
+
16
+ ### mmdit.py
17
+ - sd3 structure
18
+ - timestep as condition
19
+ - left stream: text embedded and applied an abs pos emb
20
+ - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett
src/f5_tts/model/backbones/dit.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+ import torch.nn.functional as F
15
+
16
+ from x_transformers.x_transformers import RotaryEmbedding
17
+
18
+ from f5_tts.model.modules import (
19
+ TimestepEmbedding,
20
+ ConvNeXtV2Block,
21
+ ConvPositionEmbedding,
22
+ DiTBlock,
23
+ AdaLayerNormZero_Final,
24
+ precompute_freqs_cis,
25
+ get_pos_embed_indices,
26
+ )
27
+
28
+
29
+ # Text embedding
30
+
31
+
32
class TextEmbedding(nn.Module):
    """Embed text tokens and fit them to the audio sequence length.

    Index 0 of the embedding table is reserved as filler, so incoming token
    ids are shifted by one. With ``conv_layers > 0`` a precomputed positional
    embedding is added and the result is refined by ConvNeXtV2 blocks.
    """

    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token

        self.extra_modeling = conv_layers > 0
        if self.extra_modeling:
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            blocks = [ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            self.text_blocks = nn.Sequential(*blocks)

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        """Return text embeddings padded or truncated to ``seq_len`` positions.

        Args:
            text: Token ids; batch padding of -1 becomes the filler index 0 after the shift.
            seq_len: Target sequence length.
            drop_text: Replace every token by the filler (cfg for text).
        """
        # shift so 0 is free as filler; curtail if there are more tokens than mel frames
        tokens = (text + 1)[:, :seq_len]
        batch, text_len = tokens.shape[0], tokens.shape[1]
        tokens = F.pad(tokens, (0, seq_len - text_len), value=0)

        if drop_text:  # cfg for text
            tokens = torch.zeros_like(tokens)

        embedded = self.text_embed(tokens)  # b n -> b n d

        if not self.extra_modeling:
            return embedded

        # sinus pos emb
        batch_start = torch.zeros((batch,), dtype=torch.long)
        pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
        embedded = embedded + self.freqs_cis[pos_idx]

        # convnextv2 blocks
        return self.text_blocks(embedded)
70
+
71
+
72
+ # noised input audio and context mixing embedding
73
+
74
+
75
class InputEmbedding(nn.Module):
    """Fuse noised audio, conditioning audio and text embeddings into the model width."""

    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        # input is [x | cond | text_embed] concatenated on the feature axis
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        """Project the concatenated inputs and add the convolutional position embedding.

        Args:
            x: Noised input audio features.
            cond: Conditioning audio features; zeroed when ``drop_audio_cond``.
            text_embed: Text embeddings.
            drop_audio_cond: Zero out ``cond`` (cfg for cond audio).
        """
        if drop_audio_cond:  # cfg for cond audio
            cond = torch.zeros_like(cond)

        fused = torch.cat((x, cond, text_embed), dim=-1)
        hidden = self.proj(fused)
        return self.conv_pos_embed(hidden) + hidden
88
+
89
+
90
+ # Transformer backbone using DiT blocks
91
+
92
+
93
class DiT(nn.Module):
    """Transformer backbone built from DiT blocks conditioned on the time embedding.

    Text is embedded, fused with the noised input and the conditioning audio,
    passed through ``depth`` DiT blocks with rotary embeddings, and projected
    back to ``mel_dim`` features.
    """

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        conv_layers=0,
        long_skip_connection=False,
        checkpoint_activations=False,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        if text_dim is None:
            # default the text width to the mel width
            text_dim = mel_dim
        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        self.transformer_blocks = nn.ModuleList(
            [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
        )
        # optional skip from the block-stack input to its output (concat + linear)
        self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

        self.checkpoint_activations = checkpoint_activations

    def ckpt_wrapper(self, module):
        """Wrap ``module`` in a plain positional-args callable for activation checkpointing."""

        # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
        def ckpt_forward(*inputs):
            outputs = module(*inputs)
            return outputs

        return ckpt_forward

    def forward(
        self,
        x: float["b n d"],  # nosied input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
    ):
        """Run the backbone and return the ``mel_dim``-sized output for every position."""
        batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            # broadcast a scalar time step over the batch
            time = time.repeat(batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

        rope = self.rotary_embed.forward_from_seq_len(seq_len)

        if self.long_skip_connection is not None:
            residual = x

        for block in self.transformer_blocks:
            if self.checkpoint_activations:
                # Pass use_reentrant explicitly: omitting it warns on current
                # PyTorch, and the non-reentrant variant is the recommended one.
                x = torch.utils.checkpoint.checkpoint(
                    self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False
                )
            else:
                x = block(x, t, mask=mask, rope=rope)

        if self.long_skip_connection is not None:
            x = self.long_skip_connection(torch.cat((x, residual), dim=-1))

        x = self.norm_out(x, t)
        output = self.proj_out(x)

        return output
src/f5_tts/model/backbones/mmdit.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from x_transformers.x_transformers import RotaryEmbedding
16
+
17
+ from f5_tts.model.modules import (
18
+ TimestepEmbedding,
19
+ ConvPositionEmbedding,
20
+ MMDiTBlock,
21
+ AdaLayerNormZero_Final,
22
+ precompute_freqs_cis,
23
+ get_pos_embed_indices,
24
+ )
25
+
26
+
27
+ # text embedding
28
+
29
+
30
class TextEmbedding(nn.Module):
    """Token embedding for text with an added precomputed position table (MM-DiT variant)."""

    def __init__(self, out_dim, text_num_embeds):
        super().__init__()
        # one extra row so that index 0 can serve as the filler token
        self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim)

        self.precompute_max_pos = 1024
        pos_table = precompute_freqs_cis(out_dim, self.precompute_max_pos)
        # non-persistent: the table is rebuilt on construction, never stored in checkpoints
        self.register_buffer("freqs_cis", pos_table, persistent=False)

    def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]:  # noqa: F722
        """Embed ``text`` and add position embeddings; ``drop_text`` replaces every token by the filler."""
        tokens = text + 1  # shift ids so that 0 is free for the filler token
        if drop_text:
            tokens = torch.zeros_like(tokens)
        embedded = self.text_embed(tokens)

        # sinus pos emb, every sequence starts at position 0
        start = torch.zeros((embedded.shape[0],), dtype=torch.long)
        n_text = embedded.shape[1]
        pos_idx = get_pos_embed_indices(start, n_text, max_pos=self.precompute_max_pos)

        return embedded + self.freqs_cis[pos_idx]
53
+
54
+
55
+ # noised input & masked cond audio embedding
56
+
57
+
58
class AudioEmbedding(nn.Module):
    """Project the concatenated noised/conditioning audio and add a convolutional position embedding."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        # input is x and cond concatenated on the feature axis, hence 2 * in_dim
        self.linear = nn.Linear(2 * in_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False):  # noqa: F722
        """Return the embedded audio; ``drop_audio_cond`` zeroes the conditioning audio first."""
        audio_cond = torch.zeros_like(cond) if drop_audio_cond else cond
        hidden = self.linear(torch.cat((x, audio_cond), dim=-1))
        return self.conv_pos_embed(hidden) + hidden
71
+
72
+
73
+ # Transformer backbone using MM-DiT blocks
74
+
75
+
76
class MMDiT(nn.Module):
    """Transformer backbone built from MM-DiT blocks with separate text and audio streams."""

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        text_num_embeds=256,
        mel_dim=100,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        self.text_embed = TextEmbedding(dim, text_num_embeds)
        self.audio_embed = AudioEmbedding(mel_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        blocks = []
        for i in range(depth):
            # only the last block is built with context_pre_only=True
            is_last = i == depth - 1
            blocks.append(
                MMDiTBlock(
                    dim=dim,
                    heads=heads,
                    dim_head=dim_head,
                    dropout=dropout,
                    ff_mult=ff_mult,
                    context_pre_only=is_last,
                )
            )
        self.transformer_blocks = nn.ModuleList(blocks)

        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

    def forward(
        self,
        x: float["b n d"],  # nosied input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
    ):
        """Embed time, text and audio, run every block, and project to the mel dimension."""
        if time.ndim == 0:
            # a 0-dim time step is shared by the whole batch
            time = time.repeat(x.shape[0])

        # t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        c = self.text_embed(text, drop_text=drop_text)
        x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)

        # separate rotary embeddings for the audio stream and the text stream
        rope_audio = self.rotary_embed.forward_from_seq_len(x.shape[1])
        rope_text = self.rotary_embed.forward_from_seq_len(text.shape[1])

        for block in self.transformer_blocks:
            c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)

        return self.proj_out(self.norm_out(x, t))
src/f5_tts/model/backbones/unett.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+ from typing import Literal
12
+
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+
17
+ from x_transformers import RMSNorm
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from f5_tts.model.modules import (
21
+ TimestepEmbedding,
22
+ ConvNeXtV2Block,
23
+ ConvPositionEmbedding,
24
+ Attention,
25
+ AttnProcessor,
26
+ FeedForward,
27
+ precompute_freqs_cis,
28
+ get_pos_embed_indices,
29
+ )
30
+
31
+
32
+ # Text embedding
33
+
34
+
35
class TextEmbedding(nn.Module):
    """Token embedding for text, padded or cut to the audio length, with optional ConvNeXtV2 refinement."""

    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        # one extra row so that index 0 can serve as the filler token
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)

        # position table and conv blocks only exist when conv_layers > 0
        self.extra_modeling = conv_layers > 0
        if self.extra_modeling:
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            pos_table = precompute_freqs_cis(text_dim, self.precompute_max_pos)
            self.register_buffer("freqs_cis", pos_table, persistent=False)
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        """Return text embeddings of length ``seq_len``; ``drop_text`` replaces every token by the filler."""
        # shift ids so 0 becomes the filler token (batch padding is -1, see list_str_to_idx()),
        # then curtail if there are more character tokens than mel spec tokens
        tokens = (text + 1)[:, :seq_len]
        n_batch, n_text = tokens.shape[0], tokens.shape[1]
        tokens = F.pad(tokens, (0, seq_len - n_text), value=0)

        if drop_text:  # cfg for text
            tokens = torch.zeros_like(tokens)

        embedded = self.text_embed(tokens)  # b n -> b n d

        if not self.extra_modeling:
            return embedded

        # sinus pos emb, every sequence starts at position 0
        start = torch.zeros((n_batch,), dtype=torch.long)
        pos_idx = get_pos_embed_indices(start, seq_len, max_pos=self.precompute_max_pos)
        embedded = embedded + self.freqs_cis[pos_idx]

        # convnextv2 blocks
        return self.text_blocks(embedded)
73
+
74
+
75
+ # noised input audio and context mixing embedding
76
+
77
+
78
class InputEmbedding(nn.Module):
    """Mix noised audio, conditioning audio and text embeddings into one sequence of model width."""

    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        # input is x, cond and the text embedding concatenated on the feature axis
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        """Return the fused embedding; ``drop_audio_cond`` zeroes the conditioning audio first."""
        audio_cond = torch.zeros_like(cond) if drop_audio_cond else cond  # cfg for cond audio
        fused = self.proj(torch.cat((x, audio_cond, text_embed), dim=-1))
        return self.conv_pos_embed(fused) + fused
91
+
92
+
93
+ # Flat UNet Transformer backbone
94
+
95
+
96
class UNetT(nn.Module):
    """Flat UNet-style transformer: first-half layer inputs are fed back as skips into the second half."""

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        conv_layers=0,
        skip_connect_type: Literal["add", "concat", "none"] = "concat",
    ):
        super().__init__()
        assert depth % 2 == 0, "UNet-Transformer's depth should be even."

        self.time_embed = TimestepEmbedding(dim)
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        # transformer layers & skip connections

        self.dim = dim
        self.skip_connect_type = skip_connect_type
        self.depth = depth
        self.layers = nn.ModuleList([])

        half = depth // 2
        for idx in range(depth):
            attn_norm = RMSNorm(dim)
            attn = Attention(
                processor=AttnProcessor(),
                dim=dim,
                heads=heads,
                dim_head=dim_head,
                dropout=dropout,
            )

            ff_norm = RMSNorm(dim)
            ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

            # a projection back to `dim` is only needed where concatenated skips are consumed
            if skip_connect_type == "concat" and idx >= half:
                skip_proj = nn.Linear(dim * 2, dim, bias=False)
            else:
                skip_proj = None

            self.layers.append(nn.ModuleList([skip_proj, attn_norm, attn, ff_norm, ff]))

        self.norm_out = RMSNorm(dim)
        self.proj_out = nn.Linear(dim, mel_dim)

    def forward(
        self,
        x: float["b n d"],  # nosied input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
    ):
        """Embed the inputs, run the skip-connected layers, and project to the mel dimension."""
        batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            time = time.repeat(batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

        # prepend the time embedding as an extra token, [b n d] -> [b n+1 d]
        x = torch.cat([t.unsqueeze(1), x], dim=1)
        if mask is not None:
            # the extra time token is always attended to
            mask = F.pad(mask, (1, 0), value=1)

        rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)

        # flat unet transformer
        mode = self.skip_connect_type
        half = self.depth // 2
        skips = []
        for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
            if idx < half:
                # first half: remember the layer input
                skips.append(x)
            else:
                # second half: consume the remembered inputs in reverse order
                skip = skips.pop()
                if mode == "concat":
                    x = maybe_skip_proj(torch.cat((x, skip), dim=-1))
                elif mode == "add":
                    x = x + skip

            # attention and feedforward blocks
            x = attn(attn_norm(x), rope=rope, mask=mask) + x
            x = ff(ff_norm(x)) + x

        assert len(skips) == 0

        # drop the time token again before projecting
        x = self.norm_out(x)[:, 1:, :]

        return self.proj_out(x)
src/f5_tts/model/cfm.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from random import random
13
+ from typing import Callable
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from torch import nn
18
+ from torch.nn.utils.rnn import pad_sequence
19
+ from torchdiffeq import odeint
20
+
21
+ from f5_tts.model.modules import MelSpec
22
+ from f5_tts.model.utils import (
23
+ default,
24
+ exists,
25
+ lens_to_mask,
26
+ list_str_to_idx,
27
+ list_str_to_tensor,
28
+ mask_from_frac_lengths,
29
+ )
30
+
31
+
32
class CFM(nn.Module):
    """Conditional flow matching model wrapping a transformer backbone.

    ``forward`` computes the flow-matching training loss on a randomly masked span;
    ``sample`` integrates the predicted flow with ``odeint`` to generate mel frames.
    """

    def __init__(
        self,
        transformer: nn.Module,
        sigma=0.0,
        odeint_kwargs: dict = dict(
            # atol = 1e-5,
            # rtol = 1e-5,
            method="euler"  # 'midpoint'
        ),
        audio_drop_prob=0.3,
        cond_drop_prob=0.2,
        num_channels=None,
        mel_spec_module: nn.Module | None = None,
        mel_spec_kwargs: dict = dict(),
        frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
        vocab_char_map: dict[str, int] | None = None,
    ):
        super().__init__()

        self.frac_lengths_mask = frac_lengths_mask

        # mel spec
        self.mel_spec = default(mel_spec_module, MelSpec(**mel_spec_kwargs))
        num_channels = default(num_channels, self.mel_spec.n_mel_channels)
        self.num_channels = num_channels

        # classifier-free guidance
        self.audio_drop_prob = audio_drop_prob
        self.cond_drop_prob = cond_drop_prob

        # transformer
        self.transformer = transformer
        dim = transformer.dim
        self.dim = dim

        # conditional flow related
        self.sigma = sigma

        # sampling related
        # copy so that instances never share (and cannot mutate) the default dict object
        self.odeint_kwargs = dict(odeint_kwargs)

        # vocab map for tokenization
        self.vocab_char_map = vocab_char_map

    @property
    def device(self):
        """Device of the first parameter of the module."""
        return next(self.parameters()).device

    @torch.no_grad()
    def sample(
        self,
        cond: float["b n d"] | float["b nw"],  # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        duration: int | int["b"],  # noqa: F821
        *,
        lens: int["b"] | None = None,  # noqa: F821
        steps=32,
        cfg_strength=1.0,
        sway_sampling_coef=None,
        seed: int | None = None,
        max_duration=4096,
        vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None,  # noqa: F722
        no_ref_audio=False,
        duplicate_test=False,
        t_inter=0.1,
        edit_mask=None,
    ):
        """Generate audio features by integrating the predicted flow from noise.

        Puts the module in eval mode as a side effect.

        Args:
            cond: conditioning audio, either mel frames or a raw wave (2-dim input is
                converted with ``self.mel_spec``).
            text: token ids or a list of strings to be tokenized.
            duration: target length in frames, one int for all items or one per item.
            lens: length of the conditioning audio per item; defaults to the full length.
            steps: number of integration intervals.
            cfg_strength: classifier-free guidance strength; values below 1e-5 skip the
                unconditional pass.
            sway_sampling_coef: if given, warps the time grid.
            seed: if given, the torch RNG is re-seeded before each item's noise is drawn.
            max_duration: upper clamp for ``duration``.
            vocoder: optional callable applied to the generated mel (as ``b d n``).
            no_ref_audio: if true, the conditioning audio is replaced by zeros.
            duplicate_test: if true, start from a mix of noise and shifted conditioning at ``t_inter``.
            t_inter: start time used when ``duplicate_test`` is set.
            edit_mask: optional mask AND-ed with the conditioning mask.

        Returns:
            ``(out, trajectory)``: the final sample (vocoded if a vocoder is given) and
            the full ODE trajectory.
        """
        self.eval()
        # raw wave

        if cond.ndim == 2:
            cond = self.mel_spec(cond)
            cond = cond.permute(0, 2, 1)
            assert cond.shape[-1] == self.num_channels

        cond = cond.to(next(self.parameters()).dtype)

        batch, cond_seq_len, device = *cond.shape[:2], cond.device
        if not exists(lens):
            lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)

        # text

        if isinstance(text, list):
            if exists(self.vocab_char_map):
                text = list_str_to_idx(text, self.vocab_char_map).to(device)
            else:
                text = list_str_to_tensor(text).to(device)
            assert text.shape[0] == batch

        # duration

        cond_mask = lens_to_mask(lens)
        if edit_mask is not None:
            cond_mask = cond_mask & edit_mask

        if isinstance(duration, int):
            duration = torch.full((batch,), duration, device=device, dtype=torch.long)

        duration = torch.maximum(
            torch.maximum((text != -1).sum(dim=-1), lens) + 1, duration
        )  # duration at least text/audio prompt length plus one token, so something is generated
        duration = duration.clamp(max=max_duration)
        max_duration = duration.amax()

        # duplicate test corner for inner time step observation
        if duplicate_test:
            test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)

        cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
        if no_ref_audio:
            cond = torch.zeros_like(cond)

        cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False)
        cond_mask = cond_mask.unsqueeze(-1)
        step_cond = torch.where(
            cond_mask, cond, torch.zeros_like(cond)
        )  # allow direct control (cut cond audio) with lens passed in

        if batch > 1:
            mask = lens_to_mask(duration)
        else:  # save memory and speed up, as single inference need no mask currently
            mask = None

        # neural ode

        def fn(t, x):
            # at each step, conditioning is fixed
            # step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))

            # predict flow
            pred = self.transformer(
                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=False, drop_text=False
            )
            if cfg_strength < 1e-5:
                return pred

            null_pred = self.transformer(
                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=True, drop_text=True
            )
            return pred + (pred - null_pred) * cfg_strength

        # noise input
        # to make sure batch inference result is same with different batch size, and for sure single inference
        # still some difference maybe due to convolutional layers
        y0 = []
        for dur in duration:
            if exists(seed):
                torch.manual_seed(seed)
            y0.append(torch.randn(dur, self.num_channels, device=self.device, dtype=step_cond.dtype))
        y0 = pad_sequence(y0, padding_value=0, batch_first=True)

        t_start = 0

        # duplicate test corner for inner time step observation
        if duplicate_test:
            t_start = t_inter
            y0 = (1 - t_start) * y0 + t_start * test_cond
            steps = int(steps * (1 - t_start))

        t = torch.linspace(t_start, 1, steps + 1, device=self.device, dtype=step_cond.dtype)
        if sway_sampling_coef is not None:
            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)

        trajectory = odeint(fn, y0, t, **self.odeint_kwargs)

        sampled = trajectory[-1]
        out = sampled
        # keep the given conditioning frames, use generated frames everywhere else
        out = torch.where(cond_mask, cond, out)

        if exists(vocoder):
            out = out.permute(0, 2, 1)
            out = vocoder(out)

        return out, trajectory

    def forward(
        self,
        inp: float["b n d"] | float["b nw"],  # mel or raw wave  # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        *,
        lens: int["b"] | None = None,  # noqa: F821
        noise_scheduler: str | None = None,
    ):
        """Compute the flow-matching loss for one batch.

        Args:
            inp: mel frames or a raw wave (2-dim input is converted with ``self.mel_spec``).
            text: token ids or a list of strings to be tokenized.
            lens: valid length per item; defaults to the full sequence length.
            noise_scheduler: currently unused (see TODO below).

        Returns:
            ``(loss, cond, pred)``: the mean squared error over the masked span, the
            conditioning passed to the transformer, and the transformer prediction.
        """
        # handle raw wave
        if inp.ndim == 2:
            inp = self.mel_spec(inp)
            inp = inp.permute(0, 2, 1)
            assert inp.shape[-1] == self.num_channels

        batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, self.device

        # handle text as string
        if isinstance(text, list):
            if exists(self.vocab_char_map):
                text = list_str_to_idx(text, self.vocab_char_map).to(device)
            else:
                text = list_str_to_tensor(text).to(device)
            assert text.shape[0] == batch

        # lens and mask
        if not exists(lens):
            lens = torch.full((batch,), seq_len, device=device)

        mask = lens_to_mask(lens, length=seq_len)  # useless here, as collate_fn will pad to max length in batch

        # get a random span to mask out for training conditionally
        frac_lengths = torch.zeros((batch,), device=self.device).float().uniform_(*self.frac_lengths_mask)
        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

        if exists(mask):
            rand_span_mask &= mask

        # mel is x1
        x1 = inp

        # x0 is gaussian noise
        x0 = torch.randn_like(x1)

        # time step
        time = torch.rand((batch,), dtype=dtype, device=self.device)
        # TODO. noise_scheduler

        # sample xt (φ_t(x) in the paper)
        t = time.unsqueeze(-1).unsqueeze(-1)
        φ = (1 - t) * x0 + t * x1
        flow = x1 - x0

        # only predict what is within the random mask span for infilling
        cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

        # transformer and cfg training with a drop rate
        drop_audio_cond = random() < self.audio_drop_prob  # p_drop in voicebox paper
        if random() < self.cond_drop_prob:  # p_uncond in voicebox paper
            drop_audio_cond = True
            drop_text = True
        else:
            drop_text = False

        # if want rigourously mask out padding, record in collate_fn in dataset.py, and pass in here
        # adding mask will use more memory, thus also need to adjust batchsampler with scaled down threshold for long sequences
        pred = self.transformer(
            x=φ, cond=cond, text=text, time=time, drop_audio_cond=drop_audio_cond, drop_text=drop_text
        )

        # flow matching loss, restricted to the masked span
        loss = F.mse_loss(pred, flow, reduction="none")
        loss = loss[rand_span_mask]

        return loss.mean(), cond, pred
src/f5_tts/model/dataset.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from importlib.resources import files
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import torchaudio
7
+ from datasets import Dataset as Dataset_
8
+ from datasets import load_from_disk
9
+ from torch import nn
10
+ from torch.utils.data import Dataset, Sampler
11
+ from tqdm import tqdm
12
+
13
+ from f5_tts.model.modules import MelSpec
14
+ from f5_tts.model.utils import default
15
+
16
+
17
class HFDataset(Dataset):
    """Dataset over rows that carry decoded audio (``row["audio"]``) and a transcript (``row["text"]``)."""

    def __init__(
        self,
        hf_dataset: Dataset,
        target_sample_rate=24_000,
        n_mel_channels=100,
        hop_length=256,
        n_fft=1024,
        win_length=1024,
        mel_spec_type="vocos",
    ):
        self.data = hf_dataset
        self.target_sample_rate = target_sample_rate
        self.hop_length = hop_length

        self.mel_spectrogram = MelSpec(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=n_mel_channels,
            target_sample_rate=target_sample_rate,
            mel_spec_type=mel_spec_type,
        )

    def get_frame_len(self, index):
        """Return the (fractional) number of mel frames of row ``index`` at the target sample rate."""
        row = self.data[index]
        audio = row["audio"]["array"]
        sample_rate = row["audio"]["sampling_rate"]
        return audio.shape[-1] / sample_rate * self.target_sample_rate / self.hop_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"mel_spec", "text"}`` for row ``index`` or the next row with a usable duration.

        Rows longer than 30 s or shorter than 0.3 s are skipped, wrapping around the dataset.

        Raises:
            RuntimeError: if no row in the dataset has a usable duration.
        """
        total = len(self.data)
        # Iterative, bounded search: the former recursive skip hit the recursion limit on
        # long runs of unusable clips and could not terminate cleanly if none was usable.
        for _ in range(total):
            row = self.data[index]
            audio = row["audio"]["array"]
            sample_rate = row["audio"]["sampling_rate"]
            duration = audio.shape[-1] / sample_rate
            if not (duration > 30 or duration < 0.3):
                break
            index = (index + 1) % total
        else:
            raise RuntimeError("no audio clip with a duration between 0.3 and 30 seconds found in dataset")

        audio_tensor = torch.from_numpy(audio).float()

        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            audio_tensor = resampler(audio_tensor)

        audio_tensor = audio_tensor.unsqueeze(0)  # 't -> 1 t'

        mel_spec = self.mel_spectrogram(audio_tensor)

        mel_spec = mel_spec.squeeze(0)  # '1 d t -> d t'

        text = row["text"]

        return dict(
            mel_spec=mel_spec,
            text=text,
        )
80
+
81
+
82
+ class CustomDataset(Dataset):
83
+ def __init__(
84
+ self,
85
+ custom_dataset: Dataset,
86
+ durations=None,
87
+ target_sample_rate=24_000,
88
+ hop_length=256,
89
+ n_mel_channels=100,
90
+ n_fft=1024,
91
+ win_length=1024,
92
+ mel_spec_type="vocos",
93
+ preprocessed_mel=False,
94
+ mel_spec_module: nn.Module | None = None,
95
+ ):
96
+ self.data = custom_dataset
97
+ self.durations = durations
98
+ self.target_sample_rate = target_sample_rate
99
+ self.hop_length = hop_length
100
+ self.n_fft = n_fft
101
+ self.win_length = win_length
102
+ self.mel_spec_type = mel_spec_type
103
+ self.preprocessed_mel = preprocessed_mel
104
+
105
+ if not preprocessed_mel:
106
+ self.mel_spectrogram = default(
107
+ mel_spec_module,
108
+ MelSpec(
109
+ n_fft=n_fft,
110
+ hop_length=hop_length,
111
+ win_length=win_length,
112
+ n_mel_channels=n_mel_channels,
113
+ target_sample_rate=target_sample_rate,
114
+ mel_spec_type=mel_spec_type,
115
+ ),
116
+ )
117
+
118
+ def get_frame_len(self, index):
119
+ if (
120
+ self.durations is not None
121
+ ): # Please make sure the separately provided durations are correct, otherwise 99.99% OOM
122
+ return self.durations[index] * self.target_sample_rate / self.hop_length
123
+ return self.data[index]["duration"] * self.target_sample_rate / self.hop_length
124
+
125
+ def __len__(self):
126
+ return len(self.data)
127
+
128
+ def __getitem__(self, index):
129
+ while True:
130
+ row = self.data[index]
131
+ audio_path = row["audio_path"]
132
+ text = row["text"]
133
+ duration = row["duration"]
134
+
135
+ # filter by given length
136
+ if 0.3 <= duration <= 30:
137
+ break # valid
138
+
139
+ index = (index + 1) % len(self.data)
140
+
141
+ if self.preprocessed_mel:
142
+ mel_spec = torch.tensor(row["mel_spec"])
143
+ else:
144
+ audio, source_sample_rate = torchaudio.load(audio_path)
145
+
146
+ # make sure mono input
147
+ if audio.shape[0] > 1:
148
+ audio = torch.mean(audio, dim=0, keepdim=True)
149
+
150
+ # resample if necessary
151
+ if source_sample_rate != self.target_sample_rate:
152
+ resampler = torchaudio.transforms.Resample(source_sample_rate, self.target_sample_rate)
153
+ audio = resampler(audio)
154
+
155
+ # to mel spectrogram
156
+ mel_spec = self.mel_spectrogram(audio)
157
+ mel_spec = mel_spec.squeeze(0) # '1 d t -> d t'
158
+
159
+ return {
160
+ "mel_spec": mel_spec,
161
+ "text": text,
162
+ }
163
+
164
+
165
+ # Dynamic Batch Sampler
166
class DynamicBatchSampler(Sampler[list[int]]):
    """Batch sampler that groups indices by frame count instead of a fixed batch size.

    1. The number of sequences per batch varies so that the total number of
       frames in a batch stays within ``frames_threshold``.
    2. Indices are sorted by frame length first, which keeps padding low.
    3. Batches are shuffled per epoch, reproducibly when ``random_seed`` is set.
    """

    def __init__(
        self, sampler: Sampler[int], frames_threshold: int, max_samples=0, random_seed=None, drop_last: bool = False
    ):
        self.sampler = sampler
        self.frames_threshold = frames_threshold
        self.max_samples = max_samples
        self.random_seed = random_seed
        self.epoch = 0

        data_source = self.sampler.data_source

        # pair every index with its frame length, then order by length
        indices = []
        for idx in tqdm(
            self.sampler, desc="Sorting with sampler... if slow, check whether dataset is provided with duration"
        ):
            indices.append((idx, data_source.get_frame_len(idx)))
        indices.sort(key=lambda elem: elem[1])

        batches = []
        current, current_frames = [], 0
        for idx, frame_len in tqdm(
            indices, desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu"
        ):
            fits_frames = current_frames + frame_len <= self.frames_threshold
            has_room = max_samples == 0 or len(current) < max_samples
            if fits_frames and has_room:
                current.append(idx)
                current_frames += frame_len
                continue

            # the running batch is full: close it and start over
            if len(current) > 0:
                batches.append(current)
            if frame_len <= self.frames_threshold:
                current, current_frames = [idx], frame_len
            else:
                # a single sample above the threshold is left out entirely
                current, current_frames = [], 0

        if len(current) > 0 and not drop_last:
            batches.append(current)

        del indices
        self.batches = batches

    def set_epoch(self, epoch: int) -> None:
        """Sets the epoch for this sampler."""
        self.epoch = epoch

    def __iter__(self):
        # without a seed the batches are yielded in their stored order
        if self.random_seed is None:
            return iter(self.batches)

        # seed with random_seed + epoch: deterministic, but a different order per epoch
        generator = torch.Generator()
        generator.manual_seed(self.random_seed + self.epoch)
        # torch.randperm keeps the permutation reproducible across PyTorch versions
        order = torch.randperm(len(self.batches), generator=generator).tolist()
        return iter([self.batches[i] for i in order])

    def __len__(self):
        return len(self.batches)
235
+
236
+
237
+ # Load dataset
238
+
239
+
240
def load_dataset(
    dataset_name: str,
    tokenizer: str = "pinyin",
    dataset_type: str = "CustomDataset",
    audio_type: str = "raw",
    mel_spec_module: nn.Module | None = None,
    mel_spec_kwargs: dict = dict(),
) -> CustomDataset | HFDataset:
    """Load a training dataset and wrap it in ``CustomDataset`` or ``HFDataset``.

    dataset_type    - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
                    - "CustomDatasetPath" if you just want to pass the full path to a preprocessed dataset without relying on tokenizer
                    - "HFDataset" to load ``{pre}/{pre}`` (split ``train.{post}``) with the Hugging Face loader,
                      where ``dataset_name`` is ``"{pre}_{post}"``

    Args:
        dataset_name: dataset name, or the dataset directory for "CustomDatasetPath".
        tokenizer: tokenizer name, part of the default data path for "CustomDataset".
        dataset_type: one of the three values above.
        audio_type: "raw" or "mel"; only used by "CustomDataset".
        mel_spec_module: optional mel extractor forwarded to ``CustomDataset``.
        mel_spec_kwargs: extra keyword arguments forwarded to ``CustomDataset``.

    Raises:
        ValueError: if ``dataset_type`` or ``audio_type`` is not a supported value.
    """

    print("Loading dataset ...")

    if dataset_type == "CustomDataset":
        rel_data_path = str(files("f5_tts").joinpath(f"../../data/{dataset_name}_{tokenizer}"))
        if audio_type == "raw":
            try:
                train_dataset = load_from_disk(f"{rel_data_path}/raw")
            except Exception:
                # fall back to a single arrow file when no saved dataset directory can be loaded
                train_dataset = Dataset_.from_file(f"{rel_data_path}/raw.arrow")
            preprocessed_mel = False
        elif audio_type == "mel":
            train_dataset = Dataset_.from_file(f"{rel_data_path}/mel.arrow")
            preprocessed_mel = True
        else:
            raise ValueError(f"unsupported audio_type: {audio_type!r} (expected 'raw' or 'mel')")
        with open(f"{rel_data_path}/duration.json", "r", encoding="utf-8") as f:
            data_dict = json.load(f)
        durations = data_dict["duration"]
        train_dataset = CustomDataset(
            train_dataset,
            durations=durations,
            preprocessed_mel=preprocessed_mel,
            mel_spec_module=mel_spec_module,
            **mel_spec_kwargs,
        )

    elif dataset_type == "CustomDatasetPath":
        try:
            train_dataset = load_from_disk(f"{dataset_name}/raw")
        except Exception:
            train_dataset = Dataset_.from_file(f"{dataset_name}/raw.arrow")

        with open(f"{dataset_name}/duration.json", "r", encoding="utf-8") as f:
            data_dict = json.load(f)
        durations = data_dict["duration"]
        # This branch only ever loads raw audio, so mels are never preprocessed here
        # (the flag used to be read before it was assigned).
        train_dataset = CustomDataset(
            train_dataset,
            durations=durations,
            preprocessed_mel=False,
            mel_spec_module=mel_spec_module,
            **mel_spec_kwargs,
        )

    elif dataset_type == "HFDataset":
        # Imported under an alias: this function shadows the Hugging Face loader of the same
        # name, so an unqualified call would recurse into this function.
        from datasets import load_dataset as hf_load_dataset

        print(
            "Should manually modify the path of huggingface dataset to your need.\n"
            + "May also the corresponding script cuz different dataset may have different format."
        )
        pre, post = dataset_name.split("_")
        train_dataset = HFDataset(
            hf_load_dataset(
                f"{pre}/{pre}", split=f"train.{post}", cache_dir=str(files("f5_tts").joinpath("../../data"))
            ),
        )

    else:
        raise ValueError(
            f"unsupported dataset_type: {dataset_type!r} "
            "(expected 'CustomDataset', 'CustomDatasetPath' or 'HFDataset')"
        )

    return train_dataset
301
+
302
+
303
+ # collation
304
+
305
+
306
def collate_fn(batch):
    """Collate dataset items into one batch, right-padding mel spectrograms with zeros.

    Each item is a mapping with ``"mel_spec"`` and ``"text"``. Returns a dict with the
    stacked, padded ``mel``, the original ``mel_lengths`` (size of the last axis), the
    list of ``text`` entries and their ``text_lengths``.
    """
    specs = [sample["mel_spec"].squeeze(0) for sample in batch]
    mel_lengths = torch.LongTensor([spec.shape[-1] for spec in specs])
    longest = mel_lengths.amax()

    # TODO. maybe records mask for attention here
    padded = [F.pad(spec, (0, longest - spec.size(-1)), value=0) for spec in specs]
    mel_specs = torch.stack(padded)

    text = [sample["text"] for sample in batch]
    text_lengths = torch.LongTensor([len(entry) for entry in text])

    return {
        "mel": mel_specs,
        "mel_lengths": mel_lengths,
        "text": text,
        "text_lengths": text_lengths,
    }
src/f5_tts/model/modules.py ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ from typing import Optional
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torchaudio
18
+ from librosa.filters import mel as librosa_mel_fn
19
+ from torch import nn
20
+ from x_transformers.x_transformers import apply_rotary_pos_emb
21
+
22
+
23
+ # raw wav to mel spec
24
+
25
+
26
# Module-level caches shared by every call of get_bigvgan_mel_spectrogram;
# both are keyed by the same string built from the STFT/mel settings and device.
mel_basis_cache = {}
hann_window_cache = {}


def get_bigvgan_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
    fmin=0,
    fmax=None,
    center=False,
):  # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
    """Compute a log-mel spectrogram the way BigVGAN does.

    The mel filterbank (from librosa) and the Hann window are built once per
    distinct parameter/device combination and then reused from the module-level
    caches. The waveform is reflect-padded by ``(n_fft - hop_length) // 2`` on
    both sides, transformed with ``torch.stft``, reduced to a magnitude
    spectrogram, projected onto the mel basis and log-compressed after clamping
    at ``1e-5``.

    Args:
        waveform: audio tensor; presumably ``(batch, samples)`` - TODO confirm.
        n_fft, hop_length, win_length, center: forwarded to ``torch.stft``.
        n_mel_channels, target_sample_rate, fmin, fmax: filterbank settings.

    Returns:
        The log-mel spectrogram tensor.
    """
    device = waveform.device
    key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"

    if key not in mel_basis_cache:
        filterbank = librosa_mel_fn(
            sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax
        )
        mel_basis_cache[key] = torch.from_numpy(filterbank).float().to(device)  # TODO: why they need .float()?
        hann_window_cache[key] = torch.hann_window(win_length).to(device)

    # Manual reflect padding; the STFT itself runs with the caller's `center`.
    pad = (n_fft - hop_length) // 2
    padded = torch.nn.functional.pad(waveform.unsqueeze(1), (pad, pad), mode="reflect").squeeze(1)

    stft = torch.stft(
        padded,
        n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=hann_window_cache[key],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    # Magnitude with a small epsilon inside the square root.
    magnitude = torch.sqrt(torch.view_as_real(stft).pow(2).sum(-1) + 1e-9)

    projected = torch.matmul(mel_basis_cache[key], magnitude)
    return torch.log(torch.clamp(projected, min=1e-5))
73
+
74
+
75
def get_vocos_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
):
    """Compute a log-mel spectrogram with ``torchaudio.transforms.MelSpectrogram``.

    A fresh transform (``power=1``, ``center=True``, no normalisation) is built
    on the waveform's device for every call. A 3-D input has its dimension 1
    squeezed; after that the input must be 2-D. The result is clamped at
    ``1e-5`` and log-compressed.

    Args:
        waveform: audio tensor, 2-D or 3-D ('b nw' or 'b 1 nw').
        n_fft, win_length, hop_length: STFT settings.
        n_mel_channels: number of mel bins.
        target_sample_rate: sample rate passed to the transform.

    Returns:
        The log-mel spectrogram tensor.

    Raises:
        AssertionError: if the (possibly squeezed) waveform is not 2-D.
    """
    transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=target_sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        n_mels=n_mel_channels,
        power=1,
        center=True,
        normalized=False,
        norm=None,
    )
    transform = transform.to(waveform.device)

    if waveform.dim() == 3:
        waveform = waveform.squeeze(1)  # 'b 1 nw -> b nw'

    assert waveform.dim() == 2

    return torch.log(torch.clamp(transform(waveform), min=1e-5))
102
+
103
+
104
class MelSpec(nn.Module):
    """Module wrapper that turns raw waveforms into log-mel spectrograms.

    The actual extraction is delegated to ``get_vocos_mel_spectrogram`` or
    ``get_bigvgan_mel_spectrogram`` depending on ``mel_spec_type``.

    Args:
        n_fft, hop_length, win_length: STFT settings handed to the extractor.
        n_mel_channels: number of mel bins.
        target_sample_rate: sample rate handed to the extractor.
        mel_spec_type: ``"vocos"`` or ``"bigvgan"``.

    Raises:
        AssertionError: if ``mel_spec_type`` is not one of the two backends.
    """

    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=100,
        target_sample_rate=24_000,
        mel_spec_type="vocos",
    ):
        super().__init__()
        # The message is a plain string (not ``print(...)``, which evaluates to
        # None) so that it actually shows up in the raised AssertionError.
        assert mel_spec_type in ["vocos", "bigvgan"], "We only support two extract mel backend: vocos or bigvgan"

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_mel_channels = n_mel_channels
        self.target_sample_rate = target_sample_rate

        if mel_spec_type == "vocos":
            self.extractor = get_vocos_mel_spectrogram
        elif mel_spec_type == "bigvgan":
            self.extractor = get_bigvgan_mel_spectrogram

        # Non-persistent buffer used only to track which device the module is on.
        self.register_buffer("dummy", torch.tensor(0), persistent=False)

    def forward(self, wav):
        """Return the mel spectrogram of ``wav``, moving the module to its device first."""
        if self.dummy.device != wav.device:
            self.to(wav.device)

        mel = self.extractor(
            waveform=wav,
            n_fft=self.n_fft,
            n_mel_channels=self.n_mel_channels,
            target_sample_rate=self.target_sample_rate,
            hop_length=self.hop_length,
            win_length=self.win_length,
        )

        return mel
144
+
145
+
146
+ # sinusoidal position embedding
147
+
148
+
149
class SinusPositionEmbedding(nn.Module):
    """Parameter-free sinusoidal embedding of scalar values.

    Args:
        dim: target embedding size; ``dim // 2`` frequencies are used, so the
            output's last axis has ``2 * (dim // 2)`` entries.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x, scale=1000):
        """Embed ``x`` (scaled by ``scale``) as concatenated sines then cosines.

        A new axis is inserted at position 1 of ``x`` and multiplied by a
        geometric series of frequencies running from 1 down to 1/10000.
        """
        n_freqs = self.dim // 2
        log_step = math.log(10000) / (n_freqs - 1)
        freqs = torch.exp(torch.arange(n_freqs, device=x.device).float() * -log_step)
        angles = scale * x[:, None] * freqs[None]
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
162
+
163
+
164
+ # convolutional position embedding
165
+
166
+
167
class ConvPositionEmbedding(nn.Module):
    """Two stacked grouped 1-D convolutions (each followed by Mish) over the sequence axis.

    Args:
        dim: channel count of input and output.
        kernel_size: odd kernel width; padding of ``kernel_size // 2`` keeps
            the sequence length unchanged.
        groups: number of convolution groups.

    Raises:
        AssertionError: if ``kernel_size`` is even.
    """

    def __init__(self, dim, kernel_size=31, groups=16):
        super().__init__()
        assert kernel_size % 2 != 0
        half_kernel = kernel_size // 2
        self.conv1d = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=half_kernel),
            nn.Mish(),
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=half_kernel),
            nn.Mish(),
        )

    def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):  # noqa: F722
        """Apply the convolution stack; positions where ``mask`` is False are zeroed before and after."""
        keep = None if mask is None else mask[..., None]

        if keep is not None:
            x = x.masked_fill(~keep, 0.0)

        # b n d -> b d n for Conv1d, then back again.
        out = self.conv1d(x.permute(0, 2, 1)).permute(0, 2, 1)

        if keep is not None:
            out = out.masked_fill(~keep, 0.0)
        return out
191
+
192
+
193
+ # rotary positional embedding related
194
+
195
+
196
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
    """Precompute rotary-embedding cosine/sine tables for positions ``0 .. end - 1``.

    Args:
        dim: rotary dimension; ``dim // 2`` frequencies are generated.
        end: number of positions to tabulate.
        theta: base of the frequency series.
        theta_rescale_factor: NTK-style rescaling of ``theta`` (1.0 = none).

    Returns:
        Float tensor of shape ``(end, 2 * (dim // 2))``: cosines followed by sines.
    """
    # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
    # has some connection to NTK literature
    # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
    # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
    theta = theta * theta_rescale_factor ** (dim / (dim - 2))

    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta**exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore

    real_part = torch.cos(angles)
    imag_part = torch.sin(angles)
    return torch.cat([real_part, imag_part], dim=-1)
208
+
209
+
210
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
    """Build per-row position indices ``start + floor(arange(length) * scale)``.

    Args:
        start: tensor of starting positions, one per row.
        length: number of positions generated for each row.
        max_pos: exclusive upper bound; any index ``>= max_pos`` becomes ``max_pos - 1``.
        scale: scalar or per-row tensor step multiplier.

    Returns:
        Integer tensor with one row of ``length`` indices per entry of ``start``.
    """
    # length = length if isinstance(length, int) else length.max()
    row_scale = scale * torch.ones_like(start, dtype=torch.float32)  # in case scale is a scalar
    steps = torch.arange(length, device=start.device, dtype=torch.float32)
    offsets = (steps[None] * row_scale[:, None]).long()
    pos = start[:, None] + offsets
    # avoid extra long error.
    return torch.where(pos < max_pos, pos, max_pos - 1)
220
+
221
+
222
+ # Global Response Normalization layer (Instance Normalization ?)
223
+
224
+
225
class GRN(nn.Module):
    """Global Response Normalization with a residual connection.

    ``gamma`` and ``beta`` are learnable ``(1, 1, dim)`` parameters initialised
    to zero, so a freshly constructed layer returns its input unchanged.
    """

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, dim))

    def forward(self, x):
        """Scale ``x`` by its L2 norm over dim 1 relative to the mean norm over the last dim."""
        energy = x.norm(p=2, dim=1, keepdim=True)
        relative = energy / (energy.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * relative) + self.beta + x
235
+
236
+
237
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
238
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
239
+
240
+
241
class ConvNeXtV2Block(nn.Module):
    """ConvNeXt-V2 style residual block operating on ``b n d`` tensors.

    Args:
        dim: channel count of input and output.
        intermediate_dim: width of the hidden point-wise projection.
        dilation: dilation of the kernel-size-7 depthwise convolution; padding
            is chosen so the sequence length is preserved.
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        dilation: int = 1,
    ):
        super().__init__()
        kernel_size = 7
        self.dwconv = nn.Conv1d(
            dim,
            dim,
            kernel_size=kernel_size,
            padding=(dilation * (kernel_size - 1)) // 2,
            groups=dim,
            dilation=dilation,
        )  # depthwise conv
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.grn = GRN(intermediate_dim)
        self.pwconv2 = nn.Linear(intermediate_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Depthwise conv -> LayerNorm -> Linear -> GELU -> GRN -> Linear, added to the input."""
        # b n d -> b d n for the convolution, then back to b n d
        hidden = self.dwconv(x.transpose(1, 2)).transpose(1, 2)
        hidden = self.pwconv1(self.norm(hidden))
        hidden = self.pwconv2(self.grn(self.act(hidden)))
        return x + hidden
270
+
271
+
272
+ # AdaLayerNormZero
273
+ # return with modulated x for attn input, and params for later mlp modulation
274
+
275
+
276
class AdaLayerNormZero(nn.Module):
    """Adaptive LayerNorm producing attention-input modulation plus MLP modulation params.

    A SiLU + Linear maps the conditioning embedding to six ``dim``-sized chunks
    (shift/scale/gate for attention, shift/scale/gate for the MLP).
    """

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 6)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb=None):
        """Return ``(modulated_x, gate_msa, shift_mlp, scale_mlp, gate_mlp)``.

        ``x`` is layer-normalised, then scaled by ``1 + scale_msa`` and shifted
        by ``shift_msa``, both broadcast over dimension 1.
        """
        modulation = self.linear(self.silu(emb))
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)

        modulated = self.norm(x) * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
        return modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp
291
+
292
+
293
+ # AdaLayerNormZero for final layer
294
+ # return only with modulated x for attn input, cuz no more mlp modulation
295
+
296
+
297
class AdaLayerNormZero_Final(nn.Module):
    """Adaptive LayerNorm for the final layer: only returns the modulated input.

    A SiLU + Linear maps the conditioning embedding to a ``(scale, shift)`` pair.
    """

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 2)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb):
        """Layer-normalise ``x``, then scale by ``1 + scale`` and add ``shift`` (broadcast over dim 1)."""
        modulation = self.linear(self.silu(emb))
        scale, shift = modulation.chunk(2, dim=1)

        return self.norm(x) * (1 + scale).unsqueeze(1) + shift.unsqueeze(1)
312
+
313
+
314
+ # FeedForward
315
+
316
+
317
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear.

    Args:
        dim: input width.
        dim_out: output width; falls back to ``dim`` when None.
        mult: hidden width multiplier (hidden width is ``int(dim * mult)``).
        dropout: dropout probability applied after the activation.
        approximate: forwarded to ``nn.GELU``.
    """

    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
        super().__init__()
        hidden = int(dim * mult)
        if dim_out is None:
            dim_out = dim

        # The nested Sequential is kept so parameter names stay `ff.0.0.*` / `ff.2.*`.
        self.ff = nn.Sequential(
            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(approximate=approximate)),
            nn.Dropout(dropout),
            nn.Linear(hidden, dim_out),
        )

    def forward(self, x):
        """Run ``x`` through the MLP."""
        return self.ff(x)
329
+
330
+
331
+ # Attention with possible joint part
332
+ # modified from diffusers/src/diffusers/models/attention_processor.py
333
+
334
+
335
class Attention(nn.Module):
    """Container for attention projections; the computation lives in ``processor``.

    Args:
        processor: callable invoked as ``processor(self, x, ...)`` in ``forward``.
        dim: width of the input ``x`` and of the output projection.
        heads: number of attention heads.
        dim_head: width per head; the inner width is ``dim_head * heads``.
        dropout: probability of the dropout applied after the output projection.
        context_dim: width of the context stream; when not None, key/value
            projections for the context are created (joint attention).
        context_pre_only: when not None, a context query projection is also
            created; when additionally falsy, a context output projection too.

    Raises:
        ImportError: if ``F.scaled_dot_product_attention`` is unavailable.
    """

    def __init__(
        self,
        processor: JointAttnProcessor | AttnProcessor,
        dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        context_dim: Optional[int] = None,  # if not None -> joint attention
        context_pre_only=None,
    ):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("Attention requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.processor = processor

        self.dim = dim
        self.heads = heads
        self.inner_dim = dim_head * heads
        self.dropout = dropout

        self.context_dim = context_dim
        self.context_pre_only = context_pre_only

        # Projections for the noised input stream.
        self.to_q = nn.Linear(dim, self.inner_dim)
        self.to_k = nn.Linear(dim, self.inner_dim)
        self.to_v = nn.Linear(dim, self.inner_dim)

        # Projections for the context stream (only for joint attention).
        if self.context_dim is not None:
            self.to_k_c = nn.Linear(context_dim, self.inner_dim)
            self.to_v_c = nn.Linear(context_dim, self.inner_dim)
            if self.context_pre_only is not None:
                self.to_q_c = nn.Linear(context_dim, self.inner_dim)

        # to_out[0] is the output projection, to_out[1] the dropout.
        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(self.inner_dim, dim))
        self.to_out.append(nn.Dropout(dropout))

        if self.context_pre_only is not None and not self.context_pre_only:
            self.to_out_c = nn.Linear(self.inner_dim, dim)

    def forward(
        self,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b n d"] = None,  # context c  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.Tensor:
        """Delegate to the processor; ``c``/``c_rope`` are forwarded only when ``c`` is given."""
        if c is not None:
            return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
        else:
            return self.processor(self, x, mask=mask, rope=rope)
390
+
391
+
392
+ # Attention processor
393
+
394
+
395
class AttnProcessor:
    """Multi-head self-attention over a single stream, using the projections stored on ``attn``."""

    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding
    ) -> torch.FloatTensor:
        """Project, optionally rotate, attend, project out and zero masked positions.

        Args:
            attn: object providing ``heads``, ``to_q``, ``to_k``, ``to_v`` and ``to_out``.
            x: input sequence.
            mask: boolean mask; False marks positions excluded as keys and
                zeroed in the output.
            rope: ``(freqs, xpos_scale)`` pair or None.
        """
        bsz = x.shape[0]
        n_heads = attn.heads

        # `sample` projections.
        query, key, value = attn.to_q(x), attn.to_k(x), attn.to_v(x)

        # apply rotary position embedding
        if rope is not None:
            freqs, xpos_scale = rope
            if xpos_scale is None:
                q_scale, k_scale = 1.0, 1.0
            else:
                q_scale, k_scale = xpos_scale, xpos_scale**-1.0
            query = apply_rotary_pos_emb(query, freqs, q_scale)
            key = apply_rotary_pos_emb(key, freqs, k_scale)

        # attention
        head_dim = key.shape[-1] // n_heads

        def split_heads(t):
            return t.view(bsz, -1, n_heads, head_dim).transpose(1, 2)

        query, key, value = split_heads(query), split_heads(key), split_heads(value)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        attn_mask = None
        if mask is not None:
            # 'b n -> b 1 1 n', then broadcast over heads and query positions
            attn_mask = mask[:, None, None].expand(bsz, n_heads, query.shape[-2], key.shape[-2])

        out = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        out = out.transpose(1, 2).reshape(bsz, -1, n_heads * head_dim)
        out = out.to(query.dtype)

        # linear proj, then dropout
        out = attn.to_out[1](attn.to_out[0](out))

        if mask is not None:
            out = out.masked_fill(~mask.unsqueeze(-1), 0.0)

        return out
450
+
451
+
452
+ # Joint Attention processor for MM-DiT
453
+ # modified from diffusers/src/diffusers/models/attention_processor.py
454
+
455
+
456
class JointAttnProcessor:
    """Joint attention over the noised input and the context, concatenated along the sequence axis."""

    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b nt d"] = None,  # context c, here text  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.FloatTensor:
        """Attend over ``[x; c]`` and return the two output streams ``(x_out, c_out)``.

        Each stream gets its own q/k/v projections and, optionally, its own
        rotary embedding. ``mask`` applies to ``x`` only; context positions are
        never masked. The context output is passed through ``attn.to_out_c``
        unless ``attn.context_pre_only`` is truthy.
        """
        x_len = x.shape[1]
        bsz = c.shape[0]
        n_heads = attn.heads

        def xpos_pair(xpos_scale):
            if xpos_scale is None:
                return 1.0, 1.0
            return xpos_scale, xpos_scale**-1.0

        # `sample` projections.
        query, key, value = attn.to_q(x), attn.to_k(x), attn.to_v(x)

        # `context` projections.
        c_query, c_key, c_value = attn.to_q_c(c), attn.to_k_c(c), attn.to_v_c(c)

        # apply rope for context and noised input independently
        if rope is not None:
            freqs, xpos_scale = rope
            q_scale, k_scale = xpos_pair(xpos_scale)
            query = apply_rotary_pos_emb(query, freqs, q_scale)
            key = apply_rotary_pos_emb(key, freqs, k_scale)
        if c_rope is not None:
            freqs, xpos_scale = c_rope
            q_scale, k_scale = xpos_pair(xpos_scale)
            c_query = apply_rotary_pos_emb(c_query, freqs, q_scale)
            c_key = apply_rotary_pos_emb(c_key, freqs, k_scale)

        # attention
        query = torch.cat([query, c_query], dim=1)
        key = torch.cat([key, c_key], dim=1)
        value = torch.cat([value, c_value], dim=1)

        head_dim = key.shape[-1] // n_heads

        def split_heads(t):
            return t.view(bsz, -1, n_heads, head_dim).transpose(1, 2)

        query, key, value = split_heads(query), split_heads(key), split_heads(value)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        attn_mask = None
        if mask is not None:
            joint_mask = F.pad(mask, (0, c.shape[1]), value=True)  # no mask for c (text)
            # 'b n -> b 1 1 n', then broadcast over heads and query positions
            attn_mask = joint_mask[:, None, None].expand(bsz, n_heads, query.shape[-2], key.shape[-2])

        joint = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        joint = joint.transpose(1, 2).reshape(bsz, -1, n_heads * head_dim)
        joint = joint.to(query.dtype)

        # Split the attention outputs.
        x_out, c_out = joint[:, :x_len], joint[:, x_len:]

        # linear proj, then dropout
        x_out = attn.to_out[1](attn.to_out[0](x_out))
        if not attn.context_pre_only:
            c_out = attn.to_out_c(c_out)

        if mask is not None:
            x_out = x_out.masked_fill(~mask.unsqueeze(-1), 0.0)
            # c = c.masked_fill(~mask, 0.)  # no mask for c (text)

        return x_out, c_out
537
+
538
+
539
+ # DiT Block
540
+
541
+
542
class DiTBlock(nn.Module):
    """Transformer block with adaptive-LayerNorm modulation and gated residual branches.

    Args:
        dim: model width.
        heads: number of attention heads.
        dim_head: width per attention head.
        ff_mult: hidden width multiplier of the feed-forward branch.
        dropout: dropout probability handed to the attention and feed-forward modules.
    """

    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
        super().__init__()

        self.attn_norm = AdaLayerNormZero(dim)
        self.attn = Attention(
            processor=AttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
        )

        self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, t, mask=None, rope=None):  # x: noised input, t: time embedding
        """Apply the gated attention branch followed by the gated feed-forward branch."""
        # pre-norm & modulation for attention input
        attn_in, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)

        # attention branch, gated and added back onto x
        x = x + gate_msa.unsqueeze(1) * self.attn(x=attn_in, mask=mask, rope=rope)

        # feed-forward branch on the modulated, normalised x
        ff_in = self.ff_norm(x) * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
        return x + gate_mlp.unsqueeze(1) * self.ff(ff_in)
573
+
574
+
575
+ # MMDiT Block https://arxiv.org/abs/2403.03206
576
+
577
+
578
class MMDiTBlock(nn.Module):
    r"""
    modified from diffusers/src/diffusers/models/attention.py

    notes.
    _c: context related. text, cond, etc. (left part in sd3 fig2.b)
    _x: noised input related. (right part)
    context_pre_only: last layer only do prenorm + modulation cuz no more ffn
    """

    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
        super().__init__()

        self.context_pre_only = context_pre_only

        if context_pre_only:
            self.attn_norm_c = AdaLayerNormZero_Final(dim)
        else:
            self.attn_norm_c = AdaLayerNormZero(dim)
        self.attn_norm_x = AdaLayerNormZero(dim)
        self.attn = Attention(
            processor=JointAttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
            context_dim=dim,
            context_pre_only=context_pre_only,
        )

        # The context stream only has a feed-forward branch when it continues past this layer.
        if context_pre_only:
            self.ff_norm_c = None
            self.ff_c = None
        else:
            self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
            self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
        self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, c, t, mask=None, rope=None, c_rope=None):  # x: noised input, c: context, t: time embedding
        """Run joint attention and the per-stream feed-forward branches.

        Returns:
            ``(c, x)`` - note the order; ``c`` is None when ``context_pre_only`` is set.
        """
        last_layer = self.context_pre_only

        # pre-norm & modulation for attention input
        if last_layer:
            norm_c = self.attn_norm_c(c, t)
        else:
            norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
        norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)

        # attention
        x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)

        # process attention output for context c
        if last_layer:
            c = None
        else:  # if not last layer
            c = c + c_gate_msa.unsqueeze(1) * c_attn_output

            c_ff_in = self.ff_norm_c(c) * (1 + c_scale_mlp.unsqueeze(1)) + c_shift_mlp.unsqueeze(1)
            c = c + c_gate_mlp.unsqueeze(1) * self.ff_c(c_ff_in)

        # process attention output for input x
        x = x + x_gate_msa.unsqueeze(1) * x_attn_output

        x_ff_in = self.ff_norm_x(x) * (1 + x_scale_mlp.unsqueeze(1)) + x_shift_mlp.unsqueeze(1)
        x = x + x_gate_mlp.unsqueeze(1) * self.ff_x(x_ff_in)

        return c, x
643
+
644
+
645
+ # time step conditioning embedding
646
+
647
+
648
class TimestepEmbedding(nn.Module):
    """Embed a batch of timesteps: sinusoidal features followed by a two-layer SiLU MLP.

    Args:
        dim: output width.
        freq_embed_dim: width of the sinusoidal features fed to the MLP.
    """

    def __init__(self, dim, freq_embed_dim=256):
        super().__init__()
        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
        self.time_mlp = nn.Sequential(
            nn.Linear(freq_embed_dim, dim),
            nn.SiLU(),
            nn.Linear(dim, dim),
        )

    def forward(self, timestep: float["b"]):  # noqa: F821
        """Return the conditioning embedding for ``timestep`` ('b' -> 'b d')."""
        # Cast the sinusoidal features back to the dtype of the incoming timesteps.
        features = self.time_embed(timestep).to(timestep.dtype)
        return self.time_mlp(features)  # b d
src/f5_tts/model/trainer.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import gc
4
+ import math
5
+ import os
6
+
7
+ import torch
8
+ import torchaudio
9
+ import wandb
10
+ from accelerate import Accelerator
11
+ from accelerate.utils import DistributedDataParallelKwargs
12
+ from ema_pytorch import EMA
13
+ from torch.optim import AdamW
14
+ from torch.optim.lr_scheduler import LinearLR, SequentialLR
15
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler
16
+ from tqdm import tqdm
17
+
18
+ from f5_tts.model import CFM
19
+ from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
20
+ from f5_tts.model.utils import default, exists
21
+
22
+ # trainer
23
+
24
+
25
+ class Trainer:
26
def __init__(
    self,
    model: CFM,
    epochs,
    learning_rate,
    num_warmup_updates=20000,
    save_per_updates=1000,
    keep_last_n_checkpoints: int = -1,  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
    checkpoint_path=None,
    batch_size=32,
    batch_size_type: str = "sample",
    max_samples=32,
    grad_accumulation_steps=1,
    max_grad_norm=1.0,
    noise_scheduler: str | None = None,
    duration_predictor: torch.nn.Module | None = None,
    logger: str | None = "wandb",  # "wandb" | "tensorboard" | None
    wandb_project="test_e2-tts",
    wandb_run_name="test_run",
    wandb_resume_id: str = None,
    log_samples: bool = False,
    last_per_updates=None,
    accelerate_kwargs: dict = dict(),
    ema_kwargs: dict = dict(),
    bnb_optimizer: bool = False,
    mel_spec_type: str = "vocos",  # "vocos" | "bigvgan"
    is_local_vocoder: bool = False,  # use local path vocoder
    local_vocoder_path: str = "",  # local vocoder path
):
    """Set up the accelerator, experiment logging, EMA model and optimizer.

    The logger silently falls back to None when "wandb" is requested but no
    wandb API key is available. The EMA copy of the model is created on the
    main process only. ``last_per_updates`` defaults to ``save_per_updates``
    and ``checkpoint_path`` to ``"ckpts/test_e2-tts"``. With ``bnb_optimizer``
    the 8-bit AdamW from bitsandbytes is used instead of ``torch.optim.AdamW``.
    Model and optimizer are finally wrapped by ``accelerator.prepare``.
    """
    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)

    # wandb cannot be used without an API key.
    if logger == "wandb" and not wandb.api.api_key:
        logger = None
    self.log_samples = log_samples

    self.accelerator = Accelerator(
        log_with=logger if logger == "wandb" else None,
        kwargs_handlers=[ddp_kwargs],
        gradient_accumulation_steps=grad_accumulation_steps,
        **accelerate_kwargs,
    )

    self.logger = logger
    if self.logger == "wandb":
        wandb_init = {"resume": "allow", "name": wandb_run_name}
        if exists(wandb_resume_id):
            wandb_init["id"] = wandb_resume_id

        self.accelerator.init_trackers(
            project_name=wandb_project,
            init_kwargs={"wandb": wandb_init},
            config={
                "epochs": epochs,
                "learning_rate": learning_rate,
                "num_warmup_updates": num_warmup_updates,
                "batch_size": batch_size,
                "batch_size_type": batch_size_type,
                "max_samples": max_samples,
                "grad_accumulation_steps": grad_accumulation_steps,
                "max_grad_norm": max_grad_norm,
                "gpus": self.accelerator.num_processes,
                "noise_scheduler": noise_scheduler,
            },
        )

    elif self.logger == "tensorboard":
        from torch.utils.tensorboard import SummaryWriter

        self.writer = SummaryWriter(log_dir=f"runs/{wandb_run_name}")

    self.model = model

    if self.is_main:
        self.ema_model = EMA(model, include_online_model=False, **ema_kwargs)
        self.ema_model.to(self.accelerator.device)

        print(f"Using logger: {logger}")
        if grad_accumulation_steps > 1:
            print(
                "Gradient accumulation checkpointing with per_updates now, old logic per_steps used with before f992c4e"
            )

    # schedule / checkpointing settings
    self.epochs = epochs
    self.num_warmup_updates = num_warmup_updates
    self.save_per_updates = save_per_updates
    self.keep_last_n_checkpoints = keep_last_n_checkpoints
    self.last_per_updates = default(last_per_updates, save_per_updates)
    self.checkpoint_path = default(checkpoint_path, "ckpts/test_e2-tts")

    # batching / optimisation settings
    self.batch_size = batch_size
    self.batch_size_type = batch_size_type
    self.max_samples = max_samples
    self.grad_accumulation_steps = grad_accumulation_steps
    self.max_grad_norm = max_grad_norm

    # mel vocoder config
    self.vocoder_name = mel_spec_type
    self.is_local_vocoder = is_local_vocoder
    self.local_vocoder_path = local_vocoder_path

    self.noise_scheduler = noise_scheduler

    self.duration_predictor = duration_predictor

    if bnb_optimizer:
        import bitsandbytes as bnb

        self.optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)
    else:
        self.optimizer = AdamW(model.parameters(), lr=learning_rate)
    self.model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
138
+
139
@property
def is_main(self):
    """Whether this process is the accelerator's main process."""
    accelerator = self.accelerator
    return accelerator.is_main_process
142
+
143
def save_checkpoint(self, update, last=False):
    """Write a training checkpoint from the main process and rotate old ones.

    All processes synchronise first; only the main process writes. The
    checkpoint holds the model, optimizer, EMA-model and scheduler state dicts
    plus ``update``.

    Args:
        update: update counter stored in the checkpoint and used in the file name.
        last: when True, write ``model_last.pt``; otherwise write
            ``model_{update}.pt`` (skipped entirely when
            ``keep_last_n_checkpoints == 0``) and, when
            ``keep_last_n_checkpoints > 0``, delete the oldest numbered
            checkpoints beyond that count.
    """
    self.accelerator.wait_for_everyone()
    if not self.is_main:
        return

    checkpoint = dict(
        model_state_dict=self.accelerator.unwrap_model(self.model).state_dict(),
        optimizer_state_dict=self.accelerator.unwrap_model(self.optimizer).state_dict(),
        ema_model_state_dict=self.ema_model.state_dict(),
        scheduler_state_dict=self.scheduler.state_dict(),
        update=update,
    )
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(self.checkpoint_path, exist_ok=True)

    if last:
        self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_last.pt")
        print(f"Saved last checkpoint at update {update}")
        return

    if self.keep_last_n_checkpoints == 0:
        return
    self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_{update}.pt")
    if self.keep_last_n_checkpoints < 0:
        return

    # Rotation only considers `model_<int>...pt` files. Names such as
    # `model_best.pt` carry no update number; they are left alone instead of
    # crashing the run with a ValueError after the checkpoint was written.
    numbered = []
    for fname in os.listdir(self.checkpoint_path):
        if fname == "model_last.pt" or not (fname.startswith("model_") and fname.endswith(".pt")):
            continue
        try:
            numbered.append((int(fname.split("_")[1].split(".")[0]), fname))
        except ValueError:
            continue
    numbered.sort(key=lambda item: item[0])

    excess = len(numbered) - self.keep_last_n_checkpoints
    for _, oldest_checkpoint in numbered[: max(excess, 0)]:
        os.remove(os.path.join(self.checkpoint_path, oldest_checkpoint))
        print(f"Removed old checkpoint: {oldest_checkpoint}")
177
+
178
def load_checkpoint(self):
    """Restore training state from ``self.checkpoint_path`` and return the update to resume from.

    Selection order: ``model_last.pt``, else the ``model_*.pt`` file with the
    largest number in its name, else the first ``pretrained_*.pt`` file.

    A checkpoint that contains ``update`` (or the legacy ``step``) restores the
    model, optimizer and, if set, scheduler. Any other checkpoint is treated as
    EMA-only weights: they are loaded into the model and training starts at 0.

    Returns:
        The stored update count, or 0 when there is no directory, no ``.pt``
        file, no recognised checkpoint file, or only EMA weights.
    """
    if (
        not exists(self.checkpoint_path)
        or not os.path.exists(self.checkpoint_path)
        or not any(filename.endswith(".pt") for filename in os.listdir(self.checkpoint_path))
    ):
        return 0

    self.accelerator.wait_for_everyone()
    filenames = os.listdir(self.checkpoint_path)
    if "model_last.pt" in filenames:
        latest_checkpoint = "model_last.pt"
    else:
        # Regular training checkpoints take priority over pretrained models.
        training_checkpoints = [f for f in filenames if f.startswith("model_") and f.endswith(".pt")]

        def _number_in(name):
            # Names without any digit (e.g. model_best.pt) sort first instead of raising ValueError.
            digits = "".join(filter(str.isdigit, name))
            return int(digits) if digits else -1

        if training_checkpoints:
            latest_checkpoint = sorted(training_checkpoints, key=_number_in)[-1]
        else:
            # If no training checkpoints, use pretrained model
            latest_checkpoint = next(
                (f for f in filenames if f.startswith("pretrained_") and f.endswith(".pt")), None
            )
            if latest_checkpoint is None:
                # Only unrecognised .pt files are present: behave like "no checkpoint"
                # rather than failing with StopIteration.
                if self.is_main:
                    print(f"No usable checkpoint found in {self.checkpoint_path}, starting from scratch.")
                return 0

    # Loaded onto CPU first (accelerator.load_state might be preferable here).
    checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", weights_only=True, map_location="cpu")

    # patch for backward compatibility, 305e3ea
    for key in ["ema_model.mel_spec.mel_stft.mel_scale.fb", "ema_model.mel_spec.mel_stft.spectrogram.window"]:
        if key in checkpoint["ema_model_state_dict"]:
            del checkpoint["ema_model_state_dict"][key]

    # The EMA model only exists on the main process.
    if self.is_main:
        self.ema_model.load_state_dict(checkpoint["ema_model_state_dict"])

    if "update" in checkpoint or "step" in checkpoint:
        # patch for backward compatibility, with before f992c4e
        if "step" in checkpoint:
            checkpoint["update"] = checkpoint["step"] // self.grad_accumulation_steps
            if self.grad_accumulation_steps > 1 and self.is_main:
                print(
                    "F5-TTS WARNING: Loading checkpoint saved with per_steps logic (before f992c4e), will convert to per_updates according to grad_accumulation_steps setting, may have unexpected behaviour."
                )
        # patch for backward compatibility, 305e3ea
        for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"]:
            if key in checkpoint["model_state_dict"]:
                del checkpoint["model_state_dict"][key]

        self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint["model_state_dict"])
        self.accelerator.unwrap_model(self.optimizer).load_state_dict(checkpoint["optimizer_state_dict"])
        if self.scheduler:
            self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        update = checkpoint["update"]
    else:
        # EMA-only checkpoint: strip the EMA prefix and bookkeeping entries, start from update 0.
        checkpoint["model_state_dict"] = {
            k.replace("ema_model.", ""): v
            for k, v in checkpoint["ema_model_state_dict"].items()
            if k not in ["initted", "update", "step"]
        }
        self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint["model_state_dict"])
        update = 0

    # Free the (potentially large) checkpoint dict before training continues.
    del checkpoint
    gc.collect()
    return update
249
+
250
def train(self, train_dataset: Dataset, num_workers=16, resumable_with_seed: int = None):
    """Run the training loop over ``train_dataset`` for ``self.epochs`` epochs.

    Builds the dataloader and a linear warmup + linear decay LR schedule, restores
    state through ``self.load_checkpoint()``, then iterates batches with gradient
    accumulation, logging and periodic checkpointing.

    Args:
        train_dataset: Dataset passed to the ``DataLoader``.
        num_workers: Number of dataloader worker processes.
        resumable_with_seed: When not ``None``, seeds the shuffling (``torch.Generator``
            for "sample" batching, ``random_seed`` for "frame" batching) and makes the
            loop skip the epochs/batches already covered by the loaded checkpoint.

    Raises:
        ValueError: If ``self.batch_size_type`` is neither "sample" nor "frame".
    """
    if self.log_samples:
        # Imported lazily: only needed when audio samples are written during training.
        from f5_tts.infer.utils_infer import cfg_strength, load_vocoder, nfe_step, sway_sampling_coef

        vocoder = load_vocoder(
            vocoder_name=self.vocoder_name, is_local=self.is_local_vocoder, local_path=self.local_vocoder_path
        )
        target_sample_rate = self.accelerator.unwrap_model(self.model).mel_spec.target_sample_rate
        log_samples_path = f"{self.checkpoint_path}/samples"
        os.makedirs(log_samples_path, exist_ok=True)

    if exists(resumable_with_seed):
        generator = torch.Generator()
        generator.manual_seed(resumable_with_seed)
    else:
        generator = None

    if self.batch_size_type == "sample":
        # Fixed number of samples per batch, shuffled by the (optionally seeded) generator.
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=collate_fn,
            num_workers=num_workers,
            pin_memory=True,
            persistent_workers=True,
            batch_size=self.batch_size,
            shuffle=True,
            generator=generator,
        )
    elif self.batch_size_type == "frame":
        # Batches are assembled by DynamicBatchSampler from self.batch_size and self.max_samples.
        self.accelerator.even_batches = False
        sampler = SequentialSampler(train_dataset)
        batch_sampler = DynamicBatchSampler(
            sampler,
            self.batch_size,
            max_samples=self.max_samples,
            random_seed=resumable_with_seed,  # This enables reproducible shuffling
            drop_last=False,
        )
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=collate_fn,
            num_workers=num_workers,
            pin_memory=True,
            persistent_workers=True,
            batch_sampler=batch_sampler,
        )
    else:
        raise ValueError(f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}")

    #  accelerator.prepare() dispatches batches to devices;
    #  which means the length of dataloader calculated before, should consider the number of devices
    warmup_updates = (
        self.num_warmup_updates * self.accelerator.num_processes
    )  # consider a fixed warmup steps while using accelerate multi-gpu ddp
    # otherwise by default with split_batches=False, warmup steps change with num_processes
    total_updates = math.ceil(len(train_dataloader) / self.grad_accumulation_steps) * self.epochs
    decay_updates = total_updates - warmup_updates
    # LR ramps linearly from ~0 to the base LR, then linearly back down to ~0.
    warmup_scheduler = LinearLR(self.optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_updates)
    decay_scheduler = LinearLR(self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_updates)
    self.scheduler = SequentialLR(
        self.optimizer, schedulers=[warmup_scheduler, decay_scheduler], milestones=[warmup_updates]
    )
    train_dataloader, self.scheduler = self.accelerator.prepare(
        train_dataloader, self.scheduler
    )  # actual multi_gpu updates = single_gpu updates / gpu nums
    # The scheduler must exist before this call: load_checkpoint() restores its state.
    start_update = self.load_checkpoint()
    global_update = start_update

    if exists(resumable_with_seed):
        # Translate the restored update counter back into (epoch, batch) coordinates.
        orig_epoch_step = len(train_dataloader)
        start_step = start_update * self.grad_accumulation_steps
        skipped_epoch = int(start_step // orig_epoch_step)
        skipped_batch = start_step % orig_epoch_step
        skipped_dataloader = self.accelerator.skip_first_batches(train_dataloader, num_batches=skipped_batch)
    else:
        skipped_epoch = 0

    for epoch in range(skipped_epoch, self.epochs):
        self.model.train()
        # Only the first resumed epoch uses the dataloader with already-seen batches skipped.
        if exists(resumable_with_seed) and epoch == skipped_epoch:
            progress_bar_initial = math.ceil(skipped_batch / self.grad_accumulation_steps)
            current_dataloader = skipped_dataloader
        else:
            progress_bar_initial = 0
            current_dataloader = train_dataloader

        # Set epoch for the batch sampler if it exists
        if hasattr(train_dataloader, "batch_sampler") and hasattr(train_dataloader.batch_sampler, "set_epoch"):
            train_dataloader.batch_sampler.set_epoch(epoch)

        progress_bar = tqdm(
            range(math.ceil(len(train_dataloader) / self.grad_accumulation_steps)),
            desc=f"Epoch {epoch+1}/{self.epochs}",
            unit="update",
            disable=not self.accelerator.is_local_main_process,
            initial=progress_bar_initial,
        )

        for batch in current_dataloader:
            with self.accelerator.accumulate(self.model):
                text_inputs = batch["text"]
                mel_spec = batch["mel"].permute(0, 2, 1)
                mel_lengths = batch["mel_lengths"]

                # TODO. add duration predictor training
                if self.duration_predictor is not None and self.accelerator.is_local_main_process:
                    dur_loss = self.duration_predictor(mel_spec, lens=batch.get("durations"))
                    self.accelerator.log({"duration loss": dur_loss.item()}, step=global_update)

                loss, cond, pred = self.model(
                    mel_spec, text=text_inputs, lens=mel_lengths, noise_scheduler=self.noise_scheduler
                )
                self.accelerator.backward(loss)

                # Clip only on steps where gradients are actually synchronised.
                if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
                    self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()

            # sync_gradients marks the end of an accumulation window, i.e. one real update.
            if self.accelerator.sync_gradients:
                if self.is_main:
                    self.ema_model.update()

                global_update += 1
                progress_bar.update(1)
                progress_bar.set_postfix(update=str(global_update), loss=loss.item())

            if self.accelerator.is_local_main_process:
                self.accelerator.log(
                    {"loss": loss.item(), "lr": self.scheduler.get_last_lr()[0]}, step=global_update
                )
                if self.logger == "tensorboard":
                    self.writer.add_scalar("loss", loss.item(), global_update)
                    self.writer.add_scalar("lr", self.scheduler.get_last_lr()[0], global_update)

            if global_update % self.save_per_updates == 0 and self.accelerator.sync_gradients:
                self.save_checkpoint(global_update)

                if self.log_samples and self.accelerator.is_local_main_process:
                    # Use the first item of the batch as reference; the prompt text is repeated
                    # so the requested duration is twice the reference length.
                    ref_audio_len = mel_lengths[0]
                    infer_text = [
                        text_inputs[0] + ([" "] if isinstance(text_inputs[0], list) else " ") + text_inputs[0]
                    ]
                    with torch.inference_mode():
                        generated, _ = self.accelerator.unwrap_model(self.model).sample(
                            cond=mel_spec[0][:ref_audio_len].unsqueeze(0),
                            text=infer_text,
                            duration=ref_audio_len * 2,
                            steps=nfe_step,
                            cfg_strength=cfg_strength,
                            sway_sampling_coef=sway_sampling_coef,
                        )
                        generated = generated.to(torch.float32)
                        # Keep only the frames generated after the reference prompt.
                        gen_mel_spec = generated[:, ref_audio_len:, :].permute(0, 2, 1).to(self.accelerator.device)
                        ref_mel_spec = batch["mel"][0].unsqueeze(0)
                        # NOTE(review): gen_audio/ref_audio stay undefined for any other vocoder_name.
                        if self.vocoder_name == "vocos":
                            gen_audio = vocoder.decode(gen_mel_spec).cpu()
                            ref_audio = vocoder.decode(ref_mel_spec).cpu()
                        elif self.vocoder_name == "bigvgan":
                            gen_audio = vocoder(gen_mel_spec).squeeze(0).cpu()
                            ref_audio = vocoder(ref_mel_spec).squeeze(0).cpu()

                    torchaudio.save(
                        f"{log_samples_path}/update_{global_update}_gen.wav", gen_audio, target_sample_rate
                    )
                    torchaudio.save(
                        f"{log_samples_path}/update_{global_update}_ref.wav", ref_audio, target_sample_rate
                    )

            if global_update % self.last_per_updates == 0 and self.accelerator.sync_gradients:
                self.save_checkpoint(global_update, last=True)

    # Always leave a final "last" checkpoint once all epochs are done.
    self.save_checkpoint(global_update, last=True)

    self.accelerator.end_training()
src/f5_tts/model/utils.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ from collections import defaultdict
6
+ from importlib.resources import files
7
+
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ import jieba
12
+ from pypinyin import lazy_pinyin, Style
13
+
14
+
15
+ # seed everything
16
+
17
+
18
def seed_everything(seed=0):
    """Seed Python's and PyTorch's RNGs with ``seed`` and force deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` to the process environment.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # CPU generator first, then the current CUDA device, then every CUDA device.
    for apply_seed in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
26
+
27
+
28
+ # helpers
29
+
30
+
31
def exists(v):
    """Return ``True`` for any value other than ``None`` (falsy values included)."""
    return not (v is None)
33
+
34
+
35
def default(v, d):
    """Return ``v`` unless it is ``None``, in which case return the fallback ``d``."""
    if exists(v):
        return v
    return d
37
+
38
+
39
+ # tensor helpers
40
+
41
+
42
def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]:  # noqa: F722 F821
    """Turn a batch of lengths into a boolean mask that is True before each length.

    When ``length`` is not given, the mask width is the largest value in ``t``.
    """
    width = length if exists(length) else t.amax()
    positions = torch.arange(width, device=t.device)
    # Row i is True at every position strictly below t[i].
    return positions.unsqueeze(0) < t.unsqueeze(1)
48
+
49
+
50
def mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"]):  # noqa: F722 F821
    """Build a boolean mask that is True on ``[start, end)`` for every row.

    The mask width is the largest value in ``seq_len``.
    """
    widest = seq_len.max().item()
    positions = torch.arange(widest, device=start.device).long().unsqueeze(0)
    at_or_after_start = positions >= start.unsqueeze(1)
    before_end = positions < end.unsqueeze(1)
    return at_or_after_start & before_end
56
+
57
+
58
def mask_from_frac_lengths(seq_len: int["b"], frac_lengths: float["b"]):  # noqa: F722 F821
    """Mask a randomly placed span covering ``frac_lengths`` of each sequence.

    The span length is ``frac_lengths * seq_len`` truncated to an integer, and its
    start is drawn uniformly so that the span fits inside the sequence.
    """
    span = (frac_lengths * seq_len).long()
    latest_start = seq_len - span

    jitter = torch.rand_like(frac_lengths)
    begin = (latest_start * jitter).long().clamp(min=0)

    return mask_from_start_end_indices(seq_len, begin, begin + span)
67
+
68
+
69
def maybe_masked_mean(t: float["b n d"], mask: bool["b n"] = None) -> float["b d"]:  # noqa: F722
    """Average ``t`` over its second dimension, optionally ignoring masked-out positions.

    Args:
        t: Tensor indexed as (batch, position, feature).
        mask: Optional boolean tensor indexed as (batch, position); only positions
            where it is True contribute to the mean.

    Returns:
        Tensor indexed as (batch, feature). Rows whose mask is entirely False yield zeros.
    """
    if mask is None:
        return t.mean(dim=1)

    t = torch.where(mask[:, :, None], t, torch.tensor(0.0, device=t.device))
    num = t.sum(dim=1)
    # keepdim keeps the count as (batch, 1) so it divides each feature of the matching row;
    # a bare (batch,) count would be broadcast against the feature dimension instead.
    den = mask.float().sum(dim=1, keepdim=True)

    return num / den.clamp(min=1.0)
78
+
79
+
80
+ # simple utf-8 tokenizer, since paper went character based
81
def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]:  # noqa: F722
    """Encode each string as its UTF-8 byte values and pad them into one batch tensor.

    Args:
        text: Strings to encode.
        padding_value: Fill value for positions past the end of shorter strings.

    Returns:
        Integer tensor with one row per input string.
    """
    # dtype is pinned because torch.tensor([]) defaults to float32; an empty string in
    # first position would otherwise turn the whole padded batch into floats.
    list_tensors = [torch.tensor(list(bytes(t, "UTF-8")), dtype=torch.long) for t in text]  # ByT5 style
    return pad_sequence(list_tensors, padding_value=padding_value, batch_first=True)
85
+
86
+
87
+ # char tokenizer, based on custom dataset's extracted .txt file
88
def list_str_to_idx(
    text: list[str] | list[list[str]],
    vocab_char_map: dict[str, int],  # {char: idx}
    padding_value=-1,
) -> int["b nt"]:  # noqa: F722
    """Map every token of every item to its vocab index and pad the result into a batch.

    Args:
        text: Items to encode; each item is iterated token by token (characters of a
            string, or elements of a list).
        vocab_char_map: Token-to-index mapping; tokens missing from it map to index 0.
        padding_value: Fill value for positions past the end of shorter items.

    Returns:
        Integer tensor with one row per input item.
    """
    # dtype is pinned because torch.tensor([]) defaults to float32; an empty item in
    # first position would otherwise turn the whole padded batch into floats.
    list_idx_tensors = [
        torch.tensor([vocab_char_map.get(c, 0) for c in t], dtype=torch.long) for t in text
    ]  # pinyin or char style
    return pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
96
+
97
+
98
+ # Get tokenizer
99
+
100
+
101
def _load_vocab_char_map(vocab_path):
    """Read a vocab file holding one token per line and map each token to its line index."""
    vocab_char_map = {}
    with open(vocab_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            # Strip only the line terminator: the final line may have none, and slicing
            # off the last character unconditionally would truncate that token.
            token = line[:-1] if line.endswith("\n") else line
            vocab_char_map[token] = i
    return vocab_char_map


def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
    """
    tokenizer   - "pinyin" do g2p for only chinese characters, need .txt vocab_file
                - "char" for char-wise tokenizer, need .txt vocab_file
                - "byte" for utf-8 tokenizer
                - "custom" if you're directly passing in a path to the vocab.txt you want to use
    vocab_size  - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
                - if use "char", derived from unfiltered character & symbol counts of custom dataset
                - if use "byte", set to 256 (unicode byte range)

    Returns a ``(vocab_char_map, vocab_size)`` tuple; ``vocab_char_map`` is ``None`` for "byte".
    For "custom", ``dataset_name`` is the path of the vocab file itself.
    Raises ``ValueError`` for any other tokenizer name.
    """
    if tokenizer in ["pinyin", "char"]:
        tokenizer_path = os.path.join(files("f5_tts").joinpath("../../data"), f"{dataset_name}_{tokenizer}/vocab.txt")
        vocab_char_map = _load_vocab_char_map(tokenizer_path)
        vocab_size = len(vocab_char_map)
        assert vocab_char_map[" "] == 0, "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"

    elif tokenizer == "byte":
        vocab_char_map = None
        vocab_size = 256

    elif tokenizer == "custom":
        vocab_char_map = _load_vocab_char_map(dataset_name)
        vocab_size = len(vocab_char_map)

    else:
        # Fail with a clear message instead of an UnboundLocalError at the return below.
        raise ValueError(
            f"tokenizer must be one of 'pinyin', 'char', 'byte' or 'custom', but received {tokenizer!r}"
        )

    return vocab_char_map, vocab_size
132
+
133
+
134
+ # convert char to pinyin
135
+
136
+ jieba.initialize()
137
+ print("Word segmentation module jieba initialized.\n")
138
+
139
+ # def convert_char_to_pinyin(text_list, polyphone=True):
140
+ # final_text_list = []
141
+ # for text in text_list:
142
+ # char_list = [char for char in text if char not in "。,、;:?!《》【】—…:;\"()[]{}"]
143
+ # final_text_list.append(char_list)
144
+ # # print(final_text_list)
145
+ # return final_text_list
146
+
147
+ # def convert_char_to_pinyin(text_list, polyphone=True):
148
+ # final_text_list = [char for char in text_list if char not in "。,、;:?!《》【】—…:;?!\"()[]{}"]
149
+ # return final_text_list
150
+
151
def convert_char_to_pinyin(text_list, polyphone=True):
    """Tokenise each text into a list of characters and pinyin syllables.

    Every text is segmented with jieba. Single-byte-only segments are emitted character
    by character, segments made only of three-byte characters are converted with
    ``lazy_pinyin`` when ``polyphone`` is true, and everything else is handled one
    character at a time. Returns one token list per input text.
    """
    # add custom trans here, to address oov
    oov_table = str.maketrans({";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"})

    def is_chinese(c):
        # common chinese characters
        return "\u3100" <= c <= "\u9fff"

    def tokenize(text):
        tokens = []
        for seg in jieba.cut(text.translate(oov_table)):
            n_bytes = len(bytes(seg, "UTF-8"))
            if n_bytes == len(seg):  # if pure alphabets and symbols
                # Separate multi-character segments from the previous token with a space,
                # unless that token is already a space, colon or quote.
                if tokens and n_bytes > 1 and tokens[-1] not in " :'\"":
                    tokens.append(" ")
                tokens.extend(seg)
            elif polyphone and n_bytes == 3 * len(seg):  # if pure east asian characters
                readings = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for pos, ch in enumerate(seg):
                    if is_chinese(ch):
                        tokens.append(" ")
                    tokens.append(readings[pos])
            else:  # if mixed characters, alphabets and symbols
                for ch in seg:
                    if ord(ch) < 256:
                        tokens.append(ch)
                    elif is_chinese(ch):
                        tokens.append(" ")
                        tokens.extend(lazy_pinyin(ch, style=Style.TONE3, tone_sandhi=True))
                    else:
                        tokens.append(ch)
        return tokens

    return [tokenize(text) for text in text_list]
189
+
190
+ # filter func for dirty data with many repetitions
191
+
192
def repetition_found(text, length=2, tolerance=10):
    """Tell whether any substring of ``length`` characters occurs more than ``tolerance`` times.

    Overlapping occurrences are counted. Used as a filter for dirty data with many repetitions.
    """
    occurrences = defaultdict(int)
    last_start = len(text) - length
    for start in range(last_start + 1):
        occurrences[text[start : start + length]] += 1
    return any(hits > tolerance for hits in occurrences.values())
src/f5_tts/scripts/count_max_epoch.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ADAPTIVE BATCH SIZE"""
2
+
3
# Stand-alone calculator: edit the constants below and run the script to get the
# number of epochs that corresponds to a target number of updates.
print("Adaptive batch size: using grouping batch sampler, frames_per_gpu fixed fed in")
print(" -> least padding, gather wavs with accumulated frames in a batch\n")

# data
total_hours = 95282
mel_hop_length = 256
mel_sampling_rate = 24000

# target
wanted_max_updates = 1000000

# train params
gpus = 8
frames_per_gpu = 38400  # 8 * 38400 = 307200
grad_accum = 1

# intermediate
# Frames consumed by one optimizer update across all GPUs and accumulation steps.
mini_batch_frames = frames_per_gpu * grad_accum * gpus
# frames * hop_length / sampling_rate gives seconds; / 3600 converts to hours.
mini_batch_hours = mini_batch_frames * mel_hop_length / mel_sampling_rate / 3600
updates_per_epoch = total_hours / mini_batch_hours
# steps_per_epoch = updates_per_epoch * grad_accum

# result
epochs = wanted_max_updates / updates_per_epoch
print(f"epochs should be set to: {epochs:.0f} ({epochs/grad_accum:.1f} x gd_acum {grad_accum})")
print(f"progress_bar should show approx. 0/{updates_per_epoch:.0f} updates")
# print(f" or approx. 0/{steps_per_epoch:.0f} steps")

# others
print(f"total {total_hours:.0f} hours")
print(f"mini-batch of {mini_batch_frames:.0f} frames, {mini_batch_hours:.2f} hours per mini-batch")
src/f5_tts/scripts/count_params_gflops.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ sys.path.append(os.getcwd())
5
+
6
+ from f5_tts.model import CFM, DiT
7
+
8
+ import torch
9
+ import thop
10
+
11
+
12
+ """ ~155M """
13
+ # transformer = UNetT(dim = 768, depth = 20, heads = 12, ff_mult = 4)
14
+ # transformer = UNetT(dim = 768, depth = 20, heads = 12, ff_mult = 4, text_dim = 512, conv_layers = 4)
15
+ # transformer = DiT(dim = 768, depth = 18, heads = 12, ff_mult = 2)
16
+ # transformer = DiT(dim = 768, depth = 18, heads = 12, ff_mult = 2, text_dim = 512, conv_layers = 4)
17
+ # transformer = DiT(dim = 768, depth = 18, heads = 12, ff_mult = 2, text_dim = 512, conv_layers = 4, long_skip_connection = True)
18
+ # transformer = MMDiT(dim = 512, depth = 16, heads = 16, ff_mult = 2)
19
+
20
+ """ ~335M """
21
+ # FLOPs: 622.1 G, Params: 333.2 M
22
+ # transformer = UNetT(dim = 1024, depth = 24, heads = 16, ff_mult = 4)
23
+ # FLOPs: 363.4 G, Params: 335.8 M
24
# Backbone under measurement; swap in one of the commented alternatives above to profile it.
transformer = DiT(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)


model = CFM(transformer=transformer)
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
duration = 20
# Number of mel frames for `duration`, given the sample rate and hop length above.
frame_length = int(duration * target_sample_rate / hop_length)
text_length = 150

# Profile one forward pass on a dummy batch of size 1: random mel input and all-zero text ids.
flops, params = thop.profile(
    model, inputs=(torch.randn(1, frame_length, n_mel_channels), torch.zeros(1, text_length, dtype=torch.long))
)
print(f"FLOPs: {flops / 1e9} G")
print(f"Params: {params / 1e6} M")
src/f5_tts/socket_server.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import gc
3
+ import socket
4
+ import struct
5
+ import torch
6
+ import torchaudio
7
+ import traceback
8
+ from importlib.resources import files
9
+ from threading import Thread
10
+
11
+ from cached_path import cached_path
12
+
13
+ from infer.utils_infer import infer_batch_process, preprocess_ref_audio_text, load_vocoder, load_model
14
+ from model.backbones.dit import DiT
15
+
16
+
17
class TTSStreamingProcessor:
    """Holds a loaded TTS model, vocoder and reference prompt, and yields audio as packed chunks."""

    def __init__(self, ckpt_file, vocab_file, ref_audio, ref_text, device=None, dtype=torch.float32):
        # Auto-detect a device only when the caller gave none: cuda, then xpu, then mps, then cpu.
        if not device:
            if torch.cuda.is_available():
                device = "cuda"
            elif torch.xpu.is_available():
                device = "xpu"
            elif torch.backends.mps.is_available():
                device = "mps"
            else:
                device = "cpu"
        self.device = device

        # Load the model using the provided checkpoint and vocab files
        backbone_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
        loaded_model = load_model(
            model_cls=DiT,
            model_cfg=backbone_cfg,
            ckpt_path=ckpt_file,
            mel_spec_type="vocos",  # or "bigvgan" depending on vocoder
            vocab_file=vocab_file,
            ode_method="euler",
            use_ema=True,
            device=self.device,
        )
        self.model = loaded_model.to(self.device, dtype=dtype)

        # Load the vocoder
        self.vocoder = load_vocoder(is_local=False)

        # Set sampling rate for streaming
        self.sampling_rate = 24000  # Consistency with client

        # Set reference audio and text
        self.ref_audio = ref_audio
        self.ref_text = ref_text

        # Warm up the model
        self._warm_up()

    def _load_reference(self):
        """Preprocess the stored reference prompt and load its audio.

        Returns ``((waveform, sample_rate), ref_text)``.
        """
        ref_audio, ref_text = preprocess_ref_audio_text(self.ref_audio, self.ref_text)
        waveform, sample_rate = torchaudio.load(ref_audio)
        return (waveform, sample_rate), ref_text

    def _warm_up(self):
        """Warm up the model with a dummy input to ensure it's ready for real-time processing."""
        print("Warming up the model...")
        reference, ref_text = self._load_reference()
        gen_text = "Warm-up text for the model."

        # The result is discarded; only the side effect of running inference once matters.
        infer_batch_process(reference, ref_text, [gen_text], self.model, self.vocoder, device=self.device)
        print("Warm-up completed.")

    def generate_stream(self, text, play_steps_in_s=0.5):
        """Generate audio for ``text`` and yield it as packed float chunks.

        Each chunk holds at most ``final_sample_rate * play_steps_in_s`` samples, packed
        with ``struct`` format ``"<count>f"``.
        """
        reference, ref_text = self._load_reference()

        # Run inference for the input text
        audio_chunk, final_sample_rate, _ = infer_batch_process(
            reference,
            ref_text,
            [text],
            self.model,
            self.vocoder,
            device=self.device,
        )

        chunk_size = int(final_sample_rate * play_steps_in_s)
        total = len(audio_chunk)

        # Audio shorter than one chunk is sent in a single piece.
        if total < chunk_size:
            yield struct.pack(f"{total}f", *audio_chunk)
            return

        # Slicing clamps at the end of the audio, so the last chunk is simply shorter.
        for begin in range(0, total, chunk_size):
            piece = audio_chunk[begin : begin + chunk_size]
            yield struct.pack(f"{len(piece)}f", *piece)
102
+
103
+
104
def handle_client(client_socket, processor):
    """Serve one connected client until it disconnects or a request fails.

    Each received message (up to 1024 bytes, decoded as UTF-8 and stripped) is treated as
    text to synthesise; the audio chunks from ``processor.generate_stream`` are sent back,
    followed by the ``END_OF_AUDIO`` marker. The socket is always closed on exit.
    """

    def reply_to(request):
        # Generate and stream audio chunks, then send the end-of-audio signal
        for audio_chunk in processor.generate_stream(request.strip()):
            client_socket.sendall(audio_chunk)
        client_socket.sendall(b"END_OF_AUDIO")

    try:
        while True:
            # An empty read means the client closed the connection.
            data = client_socket.recv(1024).decode("utf-8")
            if not data:
                break

            try:
                reply_to(data)
            except Exception as inner_e:
                print(f"Error during processing: {inner_e}")
                traceback.print_exc()  # Print the full traceback to diagnose the issue
                break

    except Exception as e:
        print(f"Error handling client: {e}")
        traceback.print_exc()
    finally:
        client_socket.close()
133
+
134
+
135
def start_server(host, port, processor):
    """Listen on ``host:port`` and serve every accepted client on its own thread.

    Args:
        host: Address to bind to.
        port: TCP port to bind to.
        processor: Object handed to ``handle_client`` for each connection.

    The accept loop runs until an exception (for example ``KeyboardInterrupt``)
    propagates out of it.
    """
    # The context manager closes the listening socket when the loop is left by an
    # exception; without it the socket stayed open until garbage collection.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server:
        server.bind((host, port))
        server.listen(5)
        print(f"Server listening on {host}:{port}")

        while True:
            client_socket, addr = server.accept()
            print(f"Accepted connection from {addr}")
            client_handler = Thread(target=handle_client, args=(client_socket, processor))
            client_handler.start()
146
+
147
+
148
if __name__ == "__main__":

    def _parse_dtype(value):
        """Convert a command-line string such as ``float16`` or ``torch.float16`` to a torch dtype."""
        dtype = getattr(torch, value.split(".")[-1], None)
        if not isinstance(dtype, torch.dtype):
            raise argparse.ArgumentTypeError(f"unknown torch dtype: {value!r}")
        return dtype

    parser = argparse.ArgumentParser()

    parser.add_argument("--host", default="0.0.0.0")
    # argparse yields strings for values given on the command line; the port must be an
    # int by the time it reaches socket.bind().
    parser.add_argument("--port", type=int, default=9998)

    parser.add_argument(
        "--ckpt_file",
        default=str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors")),
        help="Path to the model checkpoint file",
    )
    parser.add_argument(
        "--vocab_file",
        default="",
        help="Path to the vocab file if customized",
    )

    parser.add_argument(
        "--ref_audio",
        default=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
        help="Reference audio to provide model with speaker characteristics",
    )
    parser.add_argument(
        "--ref_text",
        default="",
        help="Reference audio subtitle, leave empty to auto-transcribe",
    )

    parser.add_argument("--device", default=None, help="Device to run the model on")
    # The non-string default is used as-is; only values typed on the command line are converted.
    parser.add_argument(
        "--dtype", type=_parse_dtype, default=torch.float32, help="Data type to use for model inference"
    )

    args = parser.parse_args()

    try:
        # Initialize the processor with the model and vocoder
        processor = TTSStreamingProcessor(
            ckpt_file=args.ckpt_file,
            vocab_file=args.vocab_file,
            ref_audio=args.ref_audio,
            ref_text=args.ref_text,
            device=args.device,
            dtype=args.dtype,
        )

        # Start the server
        start_server(args.host, args.port, processor)

    except KeyboardInterrupt:
        gc.collect()
src/f5_tts/train/README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training
2
+
3
+ ## Prepare Dataset
4
+
5
+ Example data processing scripts are listed below; you may also tailor your own, along with a Dataset class in `src/f5_tts/model/dataset.py`.
6
+
7
+ ### 1. Some specific Datasets preparing scripts
8
+ Download corresponding dataset first, and fill in the path in scripts.
9
+
10
+ ```bash
11
+ # Prepare the Emilia dataset
12
+ python src/f5_tts/train/datasets/prepare_emilia.py
13
+
14
+ # Prepare the Wenetspeech4TTS dataset
15
+ python src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
16
+
17
+ # Prepare the LibriTTS dataset
18
+ python src/f5_tts/train/datasets/prepare_libritts.py
19
+
20
+ # Prepare the LJSpeech dataset
21
+ python src/f5_tts/train/datasets/prepare_ljspeech.py
22
+ ```
23
+
24
+ ### 2. Create custom dataset with metadata.csv
25
+ For usage guidance, see [#57 here](https://github.com/SWivid/F5-TTS/discussions/57#discussioncomment-10959029).
26
+
27
+ ```bash
28
+ python src/f5_tts/train/datasets/prepare_csv_wavs.py
29
+ ```
30
+
31
+ ## Training & Finetuning
32
+
33
+ Once your datasets are prepared, you can start the training process.
34
+
35
+ ### 1. Training script used for pretrained model
36
+
37
+ ```bash
38
+ # setup accelerate config, e.g. use multi-gpu ddp, fp16
39
+ # the config will be saved to: ~/.cache/huggingface/accelerate/default_config.yaml
40
+ accelerate config
41
+
42
+ # .yaml files are under src/f5_tts/configs directory
43
+ accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml
44
+
45
+ # possible to overwrite accelerate and hydra config
46
+ accelerate launch --mixed_precision=fp16 src/f5_tts/train/train.py --config-name F5TTS_Small_train.yaml ++datasets.batch_size_per_gpu=19200
47
+ ```
48
+
49
+ ### 2. Finetuning practice
50
+ Discussion board for Finetuning [#57](https://github.com/SWivid/F5-TTS/discussions/57).
51
+
52
+ Gradio UI training/finetuning with `src/f5_tts/train/finetune_gradio.py` see [#143](https://github.com/SWivid/F5-TTS/discussions/143).
53
+
54
+ Setting `use_ema = True` is harmful for early-stage finetuned checkpoints (which have gone through only a few updates, so the EMA weights are still dominated by the pretrained ones); try turning it off and see if it provides better results.
55
+
56
+ ### 3. Wandb Logging
57
+
58
+ The `wandb/` dir will be created under the path where you run the training/finetuning scripts.
59
+
60
+ By default, the training script does NOT use logging (assuming you didn't manually log in using `wandb login`).
61
+
62
+ To turn on wandb logging, you can either:
63
+
64
+ 1. Manually login with `wandb login`: Learn more [here](https://docs.wandb.ai/ref/cli/wandb-login)
65
+ 2. Automatically login programmatically by setting an environment variable: Get an API KEY at https://wandb.ai/site/ and set the environment variable as follows:
66
+
67
+ On Mac & Linux:
68
+
69
+ ```
70
+ export WANDB_API_KEY=<YOUR WANDB API KEY>
71
+ ```
72
+
73
+ On Windows:
74
+
75
+ ```
76
+ set WANDB_API_KEY=<YOUR WANDB API KEY>
77
+ ```
78
+ Moreover, if you cannot access Wandb and want to log metrics offline, you can set the environment variable as follows:
79
+
80
+ ```
81
+ export WANDB_MODE=offline
82
+ ```
src/f5_tts/train/datasets/prepare_csv_wavs.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import signal
4
+ import subprocess # For invoking ffprobe
5
+ import shutil
6
+ import concurrent.futures
7
+ import multiprocessing
8
+ from contextlib import contextmanager
9
+
10
+ sys.path.append(os.getcwd())
11
+
12
+ import argparse
13
+ import csv
14
+ import json
15
+ from importlib.resources import files
16
+ from pathlib import Path
17
+
18
+ import torchaudio
19
+ from tqdm import tqdm
20
+ from datasets.arrow_writer import ArrowWriter
21
+
22
+ from f5_tts.model.utils import (
23
+ convert_char_to_pinyin,
24
+ )
25
+
26
+
27
+ PRETRAINED_VOCAB_PATH = files("f5_tts").joinpath("../../data/Emilia_ZH_EN_pinyin/vocab.txt")
28
+
29
+
30
def is_csv_wavs_format(input_dataset_dir):
    """Return True when the directory has a ``metadata.csv`` file and a ``wavs`` sub-directory."""
    root = Path(input_dataset_dir)
    # is_file() / is_dir() already return False for paths that do not exist.
    has_metadata = (root / "metadata.csv").is_file()
    has_wavs = (root / "wavs").is_dir()
    return has_metadata and has_wavs
35
+
36
+
37
# Tunables for the preparation pipeline
BATCH_SIZE = 100  # number of texts handed to the pinyin converter per call
CHUNK_SIZE = 100  # number of audio files submitted to the pool per progress-bar chunk
THREAD_NAME_PREFIX = "AudioProcessor"  # prefix given to the worker threads' names
MAX_WORKERS = max(1, multiprocessing.cpu_count() - 1)  # leave one CPU free, never below one

# Executor currently in use; kept module-wide so the signal handlers can shut it down
executor = None
44
+
45
+
46
@contextmanager
def graceful_exit():
    """Context manager for graceful shutdown on SIGINT / SIGTERM.

    While the block is active, either signal cancels the pending futures of
    the module-level ``executor`` (if one is registered) and exits the process
    with status 1. On leaving the block the handlers that were installed
    before are put back, so the temporary handlers do not leak into the rest
    of the program, and any executor still registered is shut down without
    waiting.
    """

    def signal_handler(signum, frame):
        print("\nReceived signal to terminate. Cleaning up...")
        if executor is not None:
            print("Shutting down executor...")
            executor.shutdown(wait=False, cancel_futures=True)
        sys.exit(1)

    previous_handlers = {}
    try:
        # Set up signal handlers, remembering what they replace
        for sig in (signal.SIGINT, signal.SIGTERM):
            previous_handlers[sig] = signal.signal(sig, signal_handler)
        yield
    finally:
        for sig, handler in previous_handlers.items():
            # signal.signal() reports None for handlers not installed from Python;
            # those cannot be re-installed, so leave them alone.
            if handler is not None:
                signal.signal(sig, handler)
        if executor is not None:
            executor.shutdown(wait=False)
66
+
67
+
68
def process_audio_file(audio_path, text, polyphone):
    """Check that one audio file exists and measure its duration.

    Returns ``(audio_path, text, duration)`` on success, or ``None`` (after
    printing a message) when the file is missing, its duration cannot be read,
    or the duration is not positive. ``polyphone`` is accepted but not used here.
    """
    if not Path(audio_path).exists():
        print(f"audio {audio_path} not found, skipping")
        return None

    try:
        seconds = get_audio_duration(audio_path)
        if seconds <= 0:
            raise ValueError(f"Duration {seconds} is non-positive.")
    except Exception as e:
        print(f"Warning: Failed to process {audio_path} due to error: {e}. Skipping corrupt file.")
        return None

    return (audio_path, text, seconds)
81
+
82
+
83
def batch_convert_texts(texts, polyphone, batch_size=BATCH_SIZE):
    """Run ``convert_char_to_pinyin`` over ``texts`` in slices of ``batch_size`` and concatenate the results."""
    converted = []
    for start in range(0, len(texts), batch_size):
        piece = texts[start : start + batch_size]
        converted += convert_char_to_pinyin(piece, polyphone=polyphone)
    return converted
91
+
92
+
93
def prepare_csv_wavs_dir(input_dir, num_workers=None):
    """Scan a csv_wavs dataset and build the records written to disk later.

    Args:
        input_dir: Directory containing ``metadata.csv`` and a ``wavs`` folder.
        num_workers: Number of threads used to probe audio durations. When
            ``None`` it is ``min(MAX_WORKERS, number_of_files)``. The value is
            always clamped to at least 1.

    Returns:
        ``(records, durations, vocab_set)``: ``records`` is a list of dicts with
        ``audio_path``, ``text`` (output of ``convert_char_to_pinyin``) and
        ``duration``; ``durations`` is the parallel list of durations;
        ``vocab_set`` collects every element of the converted texts.

    Raises:
        AssertionError: ``input_dir`` is not in csv_wavs layout.
        RuntimeError: no audio file could be processed.
    """
    global executor
    assert is_csv_wavs_format(input_dir), f"not csv_wavs format: {input_dir}"
    input_dir = Path(input_dir)
    metadata_path = input_dir / "metadata.csv"
    audio_path_text_pairs = read_audio_text_pairs(metadata_path.as_posix())

    polyphone = True
    total_files = len(audio_path_text_pairs)
    total_chunks = (total_files + CHUNK_SIZE - 1) // CHUNK_SIZE

    # Use provided worker count or calculate optimal number. ThreadPoolExecutor
    # rejects max_workers <= 0, so an empty metadata file must not yield 0 here;
    # clamping lets the run reach the explicit RuntimeError below instead.
    requested = num_workers if num_workers is not None else min(MAX_WORKERS, total_files)
    worker_count = max(1, requested)
    print(f"\nProcessing {total_files} audio files using {worker_count} workers...")

    results = []
    with graceful_exit():
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=worker_count, thread_name_prefix=THREAD_NAME_PREFIX
        ) as pool:
            executor = pool  # expose to the signal handlers
            try:
                # Process files in chunks so each chunk gets its own progress bar
                for i in range(0, total_files, CHUNK_SIZE):
                    chunk = audio_path_text_pairs[i : i + CHUNK_SIZE]
                    # Submit futures in order
                    chunk_futures = [pool.submit(process_audio_file, pair[0], pair[1], polyphone) for pair in chunk]

                    # Iterate over futures in the original submission order to preserve ordering
                    for future in tqdm(
                        chunk_futures,
                        total=len(chunk),
                        desc=f"Processing chunk {i//CHUNK_SIZE + 1}/{total_chunks}",
                    ):
                        try:
                            result = future.result()
                        except Exception as e:
                            print(f"Error processing file: {e}")
                            continue
                        # process_audio_file returns None for files it skipped
                        if result is not None:
                            results.append(result)
            finally:
                # Never leave a finished pool registered as the global executor
                executor = None

    if not results:
        raise RuntimeError("No valid audio files were processed!")

    # Batch process text conversion
    raw_texts = [item[1] for item in results]
    converted_texts = batch_convert_texts(raw_texts, polyphone, batch_size=BATCH_SIZE)

    # Prepare final results
    sub_result = []
    durations = []
    vocab_set = set()

    for (audio_path, _, duration), conv_text in zip(results, converted_texts):
        sub_result.append({"audio_path": audio_path, "text": conv_text, "duration": duration})
        durations.append(duration)
        vocab_set.update(list(conv_text))

    return sub_result, durations, vocab_set
156
+
157
+
158
def get_audio_duration(audio_path, timeout=5):
    """
    Get the duration of an audio file in seconds using ffmpeg's ffprobe.
    Falls back to torchaudio.load() if ffprobe fails.

    Args:
        audio_path: Path of the audio file to inspect.
        timeout: Seconds to wait for ffprobe before giving up on it.

    Returns:
        The duration in seconds as a float.

    Raises:
        RuntimeError: both ffprobe and torchaudio failed for this file.
    """
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        audio_path,
    ]
    try:
        result = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True, timeout=timeout
        )
        duration_str = result.stdout.strip()
        if duration_str:
            return float(duration_str)
        raise ValueError("Empty duration string from ffprobe.")
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError covers a missing/unrunnable ffprobe binary (FileNotFoundError);
        # without it the fallback below was never reached when ffprobe is not installed.
        # SubprocessError covers both CalledProcessError and TimeoutExpired.
        print(f"Warning: ffprobe failed for {audio_path} with error: {e}. Falling back to torchaudio.")

    try:
        audio, sample_rate = torchaudio.load(audio_path)
        return audio.shape[1] / sample_rate
    except Exception as e:
        raise RuntimeError(f"Both ffprobe and torchaudio failed for {audio_path}: {e}") from e
188
+
189
+
190
def read_audio_text_pairs(csv_file_path):
    """Read ``audio_file|text`` rows from a pipe-delimited metadata file.

    The first row is treated as a header and skipped. Rows with fewer than two
    columns are ignored. Audio paths are resolved relative to the directory
    containing the metadata file and returned in POSIX form.

    Returns:
        A list of ``(audio_path, text)`` tuples; empty when the file has no data rows.
    """
    parent = Path(csv_file_path).parent
    with open(csv_file_path, mode="r", newline="", encoding="utf-8-sig") as csvfile:
        reader = csv.reader(csvfile, delimiter="|")
        # Skip the header row; the default keeps an empty file from raising StopIteration
        next(reader, None)
        return [
            ((parent / row[0].strip()).as_posix(), row[1].strip())  # (audio file path, text)
            for row in reader
            if len(row) >= 2
        ]
205
+
206
+
207
def save_prepped_dataset(out_dir, result, duration_list, text_vocab_set, is_finetune):
    """Write the prepared dataset files into ``out_dir``.

    Creates ``raw.arrow`` (one record per entry of ``result``),
    ``duration.json`` (``{"duration": duration_list}``) and ``vocab.txt``.
    For fine-tuning, ``vocab.txt`` is a copy of the pretrained vocab file;
    otherwise it lists the sorted entries of ``text_vocab_set``, one per line.
    A short summary is printed at the end.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True, parents=True)
    print(f"\nSaving to {out_dir} ...")

    # Save dataset records
    raw_arrow_path = out_dir / "raw.arrow"
    with ArrowWriter(path=raw_arrow_path.as_posix(), writer_batch_size=100) as writer:
        for line in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(line)

    # Save durations to JSON
    dur_json_path = out_dir / "duration.json"
    with open(dur_json_path.as_posix(), "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # Handle vocab file - write only once based on finetune flag
    voca_out_path = out_dir / "vocab.txt"
    if is_finetune:
        file_vocab_finetune = PRETRAINED_VOCAB_PATH.as_posix()
        shutil.copy2(file_vocab_finetune, voca_out_path)
    else:
        # Explicit UTF-8: vocab entries may be non-ASCII and the platform default
        # encoding is not guaranteed to be able to represent them.
        with open(voca_out_path.as_posix(), "w", encoding="utf-8") as f:
            f.writelines(vocab + "\n" for vocab in sorted(text_vocab_set))

    dataset_name = out_dir.stem
    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
237
+
238
+
239
def prepare_and_save_set(inp_dir, out_dir, is_finetune: bool = True, num_workers: int = None):
    """Prepare the csv_wavs dataset in ``inp_dir`` and save the result to ``out_dir``."""
    # Fine-tuning copies the pretrained vocab, so check for it before the scan starts.
    assert (
        not is_finetune or PRETRAINED_VOCAB_PATH.exists()
    ), f"pretrained vocab.txt not found: {PRETRAINED_VOCAB_PATH}"
    records, durations, vocab_set = prepare_csv_wavs_dir(inp_dir, num_workers=num_workers)
    save_prepped_dataset(out_dir, records, durations, vocab_set, is_finetune)
244
+
245
+
246
def cli():
    """Command-line entry point: parse arguments and prepare/save the dataset.

    Exits with status 1 when interrupted by the user.
    """
    try:
        # Before processing, check if ffprobe is available.
        if shutil.which("ffprobe") is None:
            print(
                "Warning: ffprobe is not available. Duration extraction will rely on torchaudio (which may be slower)."
            )

        # Usage examples in help text
        examples = "\n".join(
            [
                "Examples:",
                "    # For fine-tuning (default):",
                "    python prepare_csv_wavs.py /input/dataset/path /output/dataset/path",
                "",
                "    # For pre-training:",
                "    python prepare_csv_wavs.py /input/dataset/path /output/dataset/path --pretrain",
                "",
                "    # With custom worker count:",
                "    python prepare_csv_wavs.py /input/dataset/path /output/dataset/path --workers 4",
            ]
        )
        parser = argparse.ArgumentParser(
            description="Prepare and save dataset.",
            epilog=examples,
            # The default formatter re-wraps the epilog into one paragraph,
            # which makes the multi-line examples unreadable.
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        parser.add_argument("inp_dir", type=str, help="Input directory containing the data.")
        parser.add_argument("out_dir", type=str, help="Output directory to save the prepared data.")
        parser.add_argument("--pretrain", action="store_true", help="Enable for new pretrain, otherwise is a fine-tune")
        parser.add_argument(
            "--workers",
            type=int,
            help=f"Number of worker threads (default: number of files, capped at {MAX_WORKERS})",
        )
        args = parser.parse_args()

        prepare_and_save_set(args.inp_dir, args.out_dir, is_finetune=not args.pretrain, num_workers=args.workers)
    except KeyboardInterrupt:
        print("\nOperation cancelled by user. Cleaning up...")
        if executor is not None:
            executor.shutdown(wait=False, cancel_futures=True)
        sys.exit(1)
281
+
282
+
283
+ if __name__ == "__main__":
284
+ cli()
src/f5_tts/train/datasets/prepare_emilia.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Emilia Dataset: https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07
2
+ # if use updated new version, i.e. WebDataset, feel free to modify / draft your own script
3
+
4
+ # generate audio text map for Emilia ZH & EN
5
+ # evaluate for vocab size
6
+
7
+ import os
8
+ import sys
9
+
10
+ sys.path.append(os.getcwd())
11
+
12
+ import json
13
+ from concurrent.futures import ProcessPoolExecutor
14
+ from importlib.resources import files
15
+ from pathlib import Path
16
+ from tqdm import tqdm
17
+
18
+ from datasets.arrow_writer import ArrowWriter
19
+
20
+ from f5_tts.model.utils import (
21
+ repetition_found,
22
+ convert_char_to_pinyin,
23
+ )
24
+
25
+
26
# Utterance ids excluded from the ZH split
out_zh = {
    "ZH_B00041_S06226",
    "ZH_B00042_S09204",
    "ZH_B00065_S09430",
    "ZH_B00065_S09431",
    "ZH_B00066_S09327",
    "ZH_B00066_S09328",
}
# ZH transcripts containing any of these characters are dropped
zh_filters = ["い", "て"]
# seems synthesized audios, or heavily code-switched
# (a duplicated "EN_B00071_S07665" entry was removed; set membership is unchanged)
out_en = {
    "EN_B00013_S00913",
    "EN_B00042_S00120",
    "EN_B00055_S04111",
    "EN_B00061_S00693",
    "EN_B00061_S01494",
    "EN_B00061_S03375",
    "EN_B00059_S00092",
    "EN_B00111_S04300",
    "EN_B00100_S03759",
    "EN_B00087_S03811",
    "EN_B00059_S00950",
    "EN_B00089_S00946",
    "EN_B00078_S05127",
    "EN_B00070_S04089",
    "EN_B00074_S09659",
    "EN_B00061_S06983",
    "EN_B00061_S07060",
    "EN_B00059_S08397",
    "EN_B00082_S06192",
    "EN_B00091_S01238",
    "EN_B00089_S07349",
    "EN_B00070_S04343",
    "EN_B00061_S02400",
    "EN_B00076_S01262",
    "EN_B00068_S06467",
    "EN_B00076_S02943",
    "EN_B00064_S05954",
    "EN_B00061_S05386",
    "EN_B00066_S06544",
    "EN_B00076_S06944",
    "EN_B00072_S08620",
    "EN_B00076_S07135",
    "EN_B00076_S09127",
    "EN_B00065_S00497",
    "EN_B00059_S06227",
    "EN_B00063_S02859",
    "EN_B00075_S01547",
    "EN_B00061_S08286",
    "EN_B00079_S02901",
    "EN_B00092_S03643",
    "EN_B00096_S08653",
    "EN_B00063_S04297",
    "EN_B00063_S04614",
    "EN_B00079_S04698",
    "EN_B00104_S01666",
    "EN_B00061_S09504",
    "EN_B00061_S09694",
    "EN_B00065_S05444",
    "EN_B00063_S06860",
    "EN_B00065_S05725",
    "EN_B00069_S07628",
    "EN_B00083_S03875",
    "EN_B00071_S07665",
    "EN_B00062_S04187",
    "EN_B00065_S09873",
    "EN_B00065_S09922",
    "EN_B00084_S02463",
    "EN_B00067_S05066",
    "EN_B00106_S08060",
    "EN_B00073_S06399",
    "EN_B00073_S09236",
    "EN_B00087_S00432",
    "EN_B00085_S05618",
    "EN_B00064_S01262",
    "EN_B00072_S01739",
    "EN_B00059_S03913",
    "EN_B00069_S04036",
    "EN_B00067_S05623",
    "EN_B00060_S05389",
    "EN_B00060_S07290",
    "EN_B00062_S08995",
}
# EN transcripts containing any of these characters are dropped
en_filters = ["ا", "い", "て"]
111
+
112
+
113
def deal_with_audio_dir(audio_dir):
    """Build dataset records from the ``.jsonl`` manifest that sits next to ``audio_dir``.

    Each manifest line is a JSON object; the keys read here are ``text``,
    ``language``, ``wav`` and ``duration``. ZH / EN entries are dropped when
    their id is in the exclusion sets, their text contains a filtered
    character, or ``repetition_found`` flags the text.

    Returns:
        ``(records, durations, vocab_set, bad_case_zh, bad_case_en)``.

    NOTE(review): reads the module globals ``tokenizer`` and ``polyphone``,
    which are only assigned under ``if __name__ == "__main__"``. Worker
    processes started with the "spawn" method re-import the module and would
    presumably not see them; confirm on platforms where spawn is the default.
    """
    audio_jsonl = audio_dir.with_suffix(".jsonl")
    sub_result, durations = [], []
    vocab_set = set()
    bad_case_zh = 0
    bad_case_en = 0

    # Full-width Chinese punctuation -> ASCII, written as escapes so the keys
    # cannot be silently normalized into a no-op ASCII -> ASCII mapping.
    # Built once rather than per line. Not "。" cuz much code-switched.
    zh_punct_table = str.maketrans({"\uff0c": ",", "\uff01": "!", "\uff1f": "?"})

    # Manifests contain non-ASCII transcripts; do not depend on the locale encoding
    with open(audio_jsonl, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for line in tqdm(lines, desc=f"{audio_jsonl.stem}"):
        obj = json.loads(line)
        text = obj["text"]
        if obj["language"] == "zh":
            if obj["wav"].split("/")[1] in out_zh or any(f in text for f in zh_filters) or repetition_found(text):
                bad_case_zh += 1
                continue
            text = text.translate(zh_punct_table)
        if obj["language"] == "en":
            if (
                obj["wav"].split("/")[1] in out_en
                or any(f in text for f in en_filters)
                or repetition_found(text, length=4)
            ):
                bad_case_en += 1
                continue
        if tokenizer == "pinyin":
            text = convert_char_to_pinyin([text], polyphone=polyphone)[0]
        duration = obj["duration"]
        sub_result.append({"audio_path": str(audio_dir.parent / obj["wav"]), "text": text, "duration": duration})
        durations.append(duration)
        vocab_set.update(list(text))
    return sub_result, durations, vocab_set, bad_case_zh, bad_case_en
147
+
148
+
149
def main():
    """Process every audio directory of the configured languages and save the dataset.

    Reads the module globals set in the ``__main__`` section (``tokenizer``,
    ``max_workers``, ``langs``, ``dataset_dir``, ``save_dir``,
    ``dataset_name``) and writes ``raw.arrow``, ``duration.json`` and
    ``vocab.txt`` into ``save_dir``.
    """
    assert tokenizer in ["pinyin", "char"]
    result = []
    duration_list = []
    text_vocab_set = set()
    total_bad_case_zh = 0
    total_bad_case_en = 0

    # process raw data; the context manager shuts the pool down even on errors
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for lang in langs:
            dataset_path = Path(os.path.join(dataset_dir, lang))
            futures.extend(
                executor.submit(deal_with_audio_dir, audio_dir)
                for audio_dir in dataset_path.iterdir()
                if audio_dir.is_dir()
            )
        # distinct loop name: the original rebound `futures` while iterating it
        for future in tqdm(futures, total=len(futures)):
            sub_result, durations, vocab_set, bad_case_zh, bad_case_en = future.result()
            result.extend(sub_result)
            duration_list.extend(durations)
            text_vocab_set.update(vocab_set)
            total_bad_case_zh += bad_case_zh
            total_bad_case_en += bad_case_en

    # save preprocessed dataset to disk
    os.makedirs(f"{save_dir}", exist_ok=True)
    print(f"\nSaving to {save_dir} ...")

    # dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})  # oom
    # dataset.save_to_disk(f"{save_dir}/raw", max_shard_size="2GB")
    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
        for line in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(line)

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # vocab map, i.e. tokenizer
    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
    # if tokenizer == "pinyin":
    #     text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])
    # explicit UTF-8: vocab entries can be non-ASCII
    with open(f"{save_dir}/vocab.txt", "w", encoding="utf-8") as f:
        f.writelines(vocab + "\n" for vocab in sorted(text_vocab_set))

    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
    if "ZH" in langs:
        print(f"Bad zh transcription case: {total_bad_case_zh}")
    if "EN" in langs:
        print(f"Bad en transcription case: {total_bad_case_en}\n")
206
+
207
+
208
if __name__ == "__main__":
    # Run configuration, read as module globals by main() and deal_with_audio_dir()
    max_workers = 16  # size of the process pool

    tokenizer = "pinyin"  # "pinyin" | "char"
    polyphone = True

    langs = ["EN"]  # sub-directories of dataset_dir to process
    dataset_dir = "data/datasetVN"
    dataset_name = "vnTTS_" + "_".join(langs) + "_" + tokenizer
    # output goes to <two levels above the f5_tts package>/data/<dataset_name>
    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")

    main()
221
+
222
+ # Emilia ZH & EN
223
+ # samples count 37837916 (after removal)
224
+ # pinyin vocab size 2543 (polyphone)
225
+ # total duration 95281.87 (hours)
226
+ # bad zh asr cnt 230435 (samples)
227
+ # bad en asr cnt 37217 (samples)
228
+
229
+ # vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
230
+ # please be careful if using pretrained model, make sure the vocab.txt is same
src/f5_tts/train/datasets/prepare_libritts.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.getcwd())
5
+
6
+ import json
7
+ from concurrent.futures import ProcessPoolExecutor
8
+ from importlib.resources import files
9
+ from pathlib import Path
10
+ from tqdm import tqdm
11
+ import soundfile as sf
12
+ from datasets.arrow_writer import ArrowWriter
13
+
14
+ from f5_tts.model.utils import (
15
+ repetition_found,
16
+ convert_char_to_pinyin,
17
+ )
18
+
19
+
20
def deal_with_audio_dir(audio_dir):
    """Collect records for every ``*.wav`` under ``audio_dir`` (searched recursively).

    The transcript is read from the ``.lab`` file that shares the wav's stem.
    Clips shorter than 0.4 s or longer than 30 s are skipped.

    Returns:
        ``(records, durations, vocab_set)`` where each record has
        ``audio_path``, ``text`` and ``duration``, and ``vocab_set`` holds
        every character seen in the kept transcripts.
    """
    sub_result, durations = [], []
    vocab_set = set()

    for wav_path in audio_dir.rglob("*.wav"):
        text_path = wav_path.with_suffix(".lab")
        # read_text closes the file (the bare open().read() leaked the handle);
        # explicit UTF-8 keeps non-ASCII transcripts independent of the locale.
        text = text_path.read_text(encoding="utf-8").strip()
        duration = sf.info(wav_path).duration
        if duration < 0.4 or duration > 30:
            continue
        sub_result.append({"audio_path": str(wav_path), "text": text, "duration": duration})
        durations.append(duration)
        vocab_set.update(list(text))
    return sub_result, durations, vocab_set
35
+
36
+
37
def main():
    """Process every sub-directory of each configured subset and save the dataset.

    Reads the module globals set in the ``__main__`` section (``max_workers``,
    ``SUB_SET``, ``dataset_dir``, ``save_dir``, ``dataset_name``) and writes
    ``raw.arrow``, ``duration.json`` and ``vocab.txt`` into ``save_dir``.
    """
    result = []
    duration_list = []
    text_vocab_set = set()

    # process raw data; the context manager shuts the pool down even on errors
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for subset in tqdm(SUB_SET):
            dataset_path = Path(os.path.join(dataset_dir, subset))
            futures.extend(
                executor.submit(deal_with_audio_dir, audio_dir)
                for audio_dir in dataset_path.iterdir()
                if audio_dir.is_dir()
            )
        for future in tqdm(futures, total=len(futures)):
            sub_result, durations, vocab_set = future.result()
            result.extend(sub_result)
            duration_list.extend(durations)
            text_vocab_set.update(vocab_set)

    # save preprocessed dataset to disk
    os.makedirs(f"{save_dir}", exist_ok=True)
    print(f"\nSaving to {save_dir} ...")

    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
        for line in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(line)

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # vocab map, i.e. tokenizer (explicit UTF-8: entries can be non-ASCII)
    with open(f"{save_dir}/vocab.txt", "w", encoding="utf-8") as f:
        f.writelines(vocab + "\n" for vocab in sorted(text_vocab_set))

    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
81
+
82
+
83
if __name__ == "__main__":
    # Run configuration, read as module globals by main()
    max_workers = 16  # size of the process pool

    tokenizer = "char"  # "pinyin" | "char"

    SUB_SET = ["mc"]  # sub-directories of dataset_dir to process
    dataset_dir = "data/datasetVN"
    dataset_name = "vnTTS_" + "_".join(SUB_SET) + "_" + tokenizer
    # output goes to <two levels above the f5_tts package>/data/<dataset_name>
    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")
    main()
94
+
95
+ # For LibriTTS_100_360_500_char, sample count: 354218
96
+ # For LibriTTS_100_360_500_char, vocab size is: 78
97
+ # For LibriTTS_100_360_500_char, total 554.09 hours
src/f5_tts/train/datasets/prepare_ljspeech.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.getcwd())
5
+
6
+ import json
7
+ from importlib.resources import files
8
+ from pathlib import Path
9
+ from tqdm import tqdm
10
+ import soundfile as sf
11
+ from datasets.arrow_writer import ArrowWriter
12
+
13
+
14
def main():
    """Build the dataset from the ``uttr|text|norm_text`` metadata file and save it.

    Reads the module globals set in the ``__main__`` section (``meta_info``,
    ``dataset_dir``, ``save_dir``, ``dataset_name``). Clips shorter than 0.4 s
    or longer than 30 s are skipped. Writes ``raw.arrow``, ``duration.json``
    and ``vocab.txt`` into ``save_dir``.
    """
    result = []
    duration_list = []
    text_vocab_set = set()

    # explicit UTF-8 so non-ASCII transcripts do not depend on the locale encoding
    with open(meta_info, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for line in tqdm(lines):
        if not line.strip():
            continue  # a blank line would otherwise break the 3-way unpack
        uttr, _, norm_text = line.split("|")
        norm_text = norm_text.strip()
        wav_path = Path(dataset_dir) / "wavs" / f"{uttr}.wav"
        duration = sf.info(wav_path).duration
        if duration < 0.4 or duration > 30:
            continue
        result.append({"audio_path": str(wav_path), "text": norm_text, "duration": duration})
        duration_list.append(duration)
        text_vocab_set.update(list(norm_text))

    # save preprocessed dataset to disk
    os.makedirs(f"{save_dir}", exist_ok=True)
    print(f"\nSaving to {save_dir} ...")

    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
        for line in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(line)

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # vocab map, i.e. tokenizer
    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
    with open(f"{save_dir}/vocab.txt", "w", encoding="utf-8") as f:
        f.writelines(vocab + "\n" for vocab in sorted(text_vocab_set))

    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
54
+
55
+
56
if __name__ == "__main__":
    # Run configuration, read as module globals by main()
    tokenizer = "char"  # "pinyin" | "char"

    dataset_dir = "<SOME_PATH>/LJSpeech-1.1"  # placeholder: point this at the extracted dataset
    meta_info = os.path.join(dataset_dir, "metadata.csv")
    dataset_name = "LJSpeech_" + tokenizer
    # output goes to <two levels above the f5_tts package>/data/<dataset_name>
    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")

    main()
src/f5_tts/train/datasets/prepare_wenetspeech4tts.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generate audio text map for WenetSpeech4TTS
2
+ # evaluate for vocab size
3
+
4
+ import os
5
+ import sys
6
+
7
+ sys.path.append(os.getcwd())
8
+
9
+ import json
10
+ from concurrent.futures import ProcessPoolExecutor
11
+ from importlib.resources import files
12
+ from tqdm import tqdm
13
+
14
+ import torchaudio
15
+ from datasets import Dataset
16
+
17
+ from f5_tts.model.utils import convert_char_to_pinyin
18
+
19
+
20
def deal_with_sub_path_files(dataset_path, sub_path):
    """Collect audio paths, texts and durations for one sub-directory.

    For every file in ``<dataset_path>/<sub_path>/txts`` the first line is
    split on tabs: field 0 names the wav under ``wavs`` and field 1 is the
    transcript. The text is converted with ``convert_char_to_pinyin`` when the
    global ``tokenizer`` is "pinyin" and kept as-is when it is "char". The
    duration comes from loading the audio with torchaudio.

    Returns:
        ``(audio_paths, texts, durations)`` as three parallel lists.
    """
    print(f"Dealing with: {sub_path}")

    base_dir = os.path.join(dataset_path, sub_path)
    text_dir = os.path.join(base_dir, "txts")
    audio_dir = os.path.join(base_dir, "wavs")

    audio_paths, texts, durations = [], [], []
    for file_name in tqdm(os.listdir(text_dir)):
        with open(os.path.join(text_dir, file_name), "r", encoding="utf-8") as handle:
            fields = handle.readline().split("\t")
        audio_path = os.path.join(audio_dir, fields[0] + ".wav")
        text = fields[1].strip()

        audio_paths.append(audio_path)

        if tokenizer == "pinyin":
            texts.extend(convert_char_to_pinyin([text], polyphone=polyphone))
        elif tokenizer == "char":
            texts.append(text)

        waveform, rate = torchaudio.load(audio_path)
        durations.append(waveform.shape[-1] / rate)

    return audio_paths, texts, durations
46
+
47
+
48
def main():
    """Process every sub-directory of the chosen dataset paths and save the dataset.

    Reads the module globals set in the ``__main__`` section (``tokenizer``,
    ``max_workers``, ``dataset_paths``, ``save_dir``, ``dataset_name``) and
    writes ``raw`` (via ``Dataset.save_to_disk``), ``duration.json`` and
    ``vocab.txt`` under ``save_dir``.
    """
    assert tokenizer in ["pinyin", "char"]

    audio_path_list, text_list, duration_list = [], [], []

    # the context manager shuts the pool down even on errors
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for dataset_path in dataset_paths:
            sub_items = os.listdir(dataset_path)
            sub_paths = [item for item in sub_items if os.path.isdir(os.path.join(dataset_path, item))]
            for sub_path in sub_paths:
                futures.append(executor.submit(deal_with_sub_path_files, dataset_path, sub_path))
        for future in tqdm(futures, total=len(futures)):
            audio_paths, texts, durations = future.result()
            audio_path_list.extend(audio_paths)
            text_list.extend(texts)
            duration_list.extend(durations)

    # Create the directory that is actually written to. The original created a
    # "data" folder relative to the current working directory, which is not
    # where save_dir points.
    os.makedirs(save_dir, exist_ok=True)

    print(f"\nSaving to {save_dir} ...")
    dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})
    dataset.save_to_disk(f"{save_dir}/raw", max_shard_size="2GB")  # arrow format

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    print("\nEvaluating vocab size (all characters and symbols / all phonemes) ...")
    text_vocab_set = set()
    for text in tqdm(text_list):
        text_vocab_set.update(list(text))

    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
    if tokenizer == "pinyin":
        text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])

    # explicit UTF-8: the vocab contains non-ASCII entries (e.g. chr(192)..chr(255))
    with open(f"{save_dir}/vocab.txt", "w", encoding="utf-8") as f:
        f.writelines(vocab + "\n" for vocab in sorted(text_vocab_set))
    print(f"\nFor {dataset_name}, sample count: {len(text_list)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}\n")
93
+
94
+
95
if __name__ == "__main__":
    # Run configuration, read as module globals by main() and deal_with_sub_path_files()
    max_workers = 32  # size of the process pool

    tokenizer = "pinyin"  # "pinyin" | "char"
    polyphone = True
    dataset_choice = 1  # 1: Premium, 2: Standard, 3: Basic

    dataset_name = "{}_{}".format(
        ["WenetSpeech4TTS_Premium", "WenetSpeech4TTS_Standard", "WenetSpeech4TTS_Basic"][dataset_choice - 1],
        tokenizer,
    )
    # The negative slice keeps the last `dataset_choice` entries,
    # i.e. 1 -> Premium only, 2 -> Standard + Premium, 3 -> all three.
    dataset_paths = [
        "<SOME_PATH>/WenetSpeech4TTS/Basic",
        "<SOME_PATH>/WenetSpeech4TTS/Standard",
        "<SOME_PATH>/WenetSpeech4TTS/Premium",
    ][-dataset_choice:]
    # output goes to <two levels above the f5_tts package>/data/<dataset_name>
    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
    print(f"\nChoose Dataset: {dataset_name}, will save to {save_dir}\n")

    main()
116
+
117
+ # Results (if adding alphabets with accents and symbols):
118
+ # WenetSpeech4TTS Basic Standard Premium
119
+ # samples count 3932473 1941220 407494
120
+ # pinyin vocab size 1349 1348 1344 (no polyphone)
121
+ # - - 1459 (polyphone)
122
+ # char vocab size 5264 5219 5042
123
+
124
+ # vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
125
+ # please be careful if using pretrained model, make sure the vocab.txt is same
src/f5_tts/train/finetune_cli.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import shutil
4
+
5
+ from cached_path import cached_path
6
+ from f5_tts.model import CFM, UNetT, DiT, Trainer
7
+ from f5_tts.model.utils import get_tokenizer
8
+ from f5_tts.model.dataset import load_dataset
9
+ from importlib.resources import files
10
+
11
+
12
# -------------------------- Dataset Settings --------------------------- #
# Mel-spectrogram parameters. main() bundles all six values into
# `mel_spec_kwargs`, which is handed to both the CFM model and load_dataset();
# `n_mel_channels` is additionally passed to the backbone as `mel_dim`.
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
win_length = 1024
n_fft = 1024
mel_spec_type = "vocos"  # 'vocos' or 'bigvgan'
19
+
20
+
21
+ # -------------------------- Argument Parsing --------------------------- #
22
def parse_args():
    """Build the fine-tuning command-line parser and parse ``sys.argv``.

    Tuning notes carried over from the original author:

    * ``--batch_size_per_gpu``: 1000 for an 8GB GPU, 1600 for 12GB,
      2000 for 16GB, 3200 for 24GB.
    * ``--num_warmup_updates``: 300 for 5000 samples (about 10 hours).
    * ``--save_per_updates`` / ``--last_per_updates``: set these to whatever
      checkpoint cadence you need.

    Returns:
        argparse.Namespace: the parsed options.
    """
    parser = argparse.ArgumentParser(description="Train CFM Model")
    add = parser.add_argument

    # Experiment and data selection
    add("--exp_name", type=str, default="F5TTS_Base", choices=["F5TTS_Base", "E2TTS_Base"], help="Experiment name")
    add("--dataset_name", type=str, default="vnTTS_mc", help="Name of the dataset to use")

    # Optimisation and batching
    add("--learning_rate", type=float, default=1e-5, help="Learning rate for training")
    add("--batch_size_per_gpu", type=int, default=3200, help="Batch size per GPU")
    add("--batch_size_type", type=str, default="frame", choices=["frame", "sample"], help="Batch size type")
    add("--max_samples", type=int, default=64, help="Max sequences per batch")
    add("--grad_accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
    add("--max_grad_norm", type=float, default=1.0, help="Max gradient norm for clipping")
    add("--epochs", type=int, default=100, help="Number of training epochs")
    add("--num_warmup_updates", type=int, default=300, help="Warmup updates")

    # Checkpointing
    add("--save_per_updates", type=int, default=10000, help="Save checkpoint every X updates")
    add(
        "--keep_last_n_checkpoints",
        type=int,
        default=-1,
        help="-1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints",
    )
    add("--last_per_updates", type=int, default=50000, help="Save last checkpoint every X updates")

    # Fine-tuning source
    add("--finetune", action="store_true", help="Use Finetune")
    add(
        "--pretrain",
        type=str,
        default="/mnt/d/ckpts/vn_tts_mc_vlog/pretrained_model_1200000.pt",
        help="the path to the checkpoint",
    )

    # Tokenizer
    add("--tokenizer", type=str, default="char", choices=["pinyin", "char", "custom"], help="Tokenizer type")
    add(
        "--tokenizer_path",
        type=str,
        default=None,
        help="Path to custom tokenizer vocab file (only used if tokenizer = 'custom')",
    )

    # Logging and optimizer backend
    add("--log_samples", action="store_true", help="Log inferenced samples per ckpt save updates")
    add("--logger", type=str, default=None, choices=["wandb", "tensorboard"], help="logger")
    add("--bnb_optimizer", action="store_true", help="Use 8-bit Adam optimizer from bitsandbytes")

    return parser.parse_args()
80
+
81
+
82
+ # -------------------------- Training Settings -------------------------- #
83
+
84
+
85
def main():
    """Train or fine-tune a CFM model as configured on the command line.

    Steps: parse the options, choose the backbone for the experiment, (when
    fine-tuning) copy the pretrained checkpoint into the run's checkpoint
    folder, build the tokenizer vocabulary, then create the model, the
    trainer and the dataset and start training.
    """
    args = parse_args()

    checkpoint_path = str(files("f5_tts").joinpath(f"../../ckpts/{args.dataset_name}"))

    # Backbone class, its hyper-parameters and the default pretrained weights
    # for each supported experiment name.
    if args.exp_name == "F5TTS_Base":
        wandb_resume_id = None
        model_cls = DiT
        model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
        default_ckpt = "hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.pt"
    elif args.exp_name == "E2TTS_Base":
        wandb_resume_id = None
        model_cls = UNetT
        model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
        default_ckpt = "hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.pt"

    if args.finetune:
        # An explicit --pretrain wins; the hub checkpoint is only fetched when none is given.
        ckpt_path = args.pretrain if args.pretrain is not None else str(cached_path(default_ckpt))

        os.makedirs(checkpoint_path, exist_ok=True)

        # Change: Add 'pretrained_' prefix to copied model
        copied_name = os.path.basename(ckpt_path)
        if not copied_name.startswith("pretrained_"):
            copied_name = "pretrained_" + copied_name
        file_checkpoint = os.path.join(checkpoint_path, copied_name)
        if not os.path.isfile(file_checkpoint):
            shutil.copy2(ckpt_path, file_checkpoint)
            print("copy checkpoint for finetune")

    # Use the tokenizer and tokenizer_path provided in the command line arguments
    tokenizer = args.tokenizer
    if tokenizer != "custom":
        tokenizer_path = args.dataset_name
    elif args.tokenizer_path:
        tokenizer_path = args.tokenizer_path
    else:
        raise ValueError("Custom tokenizer selected, but no tokenizer_path provided.")

    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

    print("\nvocab : ", vocab_size)
    print("\nvocoder : ", mel_spec_type)

    mel_spec_kwargs = dict(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    )

    backbone = model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels)
    model = CFM(
        transformer=backbone,
        mel_spec_kwargs=mel_spec_kwargs,
        vocab_char_map=vocab_char_map,
    )

    trainer = Trainer(
        model,
        args.epochs,
        args.learning_rate,
        num_warmup_updates=args.num_warmup_updates,
        save_per_updates=args.save_per_updates,
        keep_last_n_checkpoints=args.keep_last_n_checkpoints,
        checkpoint_path=checkpoint_path,
        batch_size=args.batch_size_per_gpu,
        batch_size_type=args.batch_size_type,
        max_samples=args.max_samples,
        grad_accumulation_steps=args.grad_accumulation_steps,
        max_grad_norm=args.max_grad_norm,
        logger=args.logger,
        wandb_project=args.dataset_name,
        wandb_run_name=args.exp_name,
        wandb_resume_id=wandb_resume_id,
        log_samples=args.log_samples,
        last_per_updates=args.last_per_updates,
        bnb_optimizer=args.bnb_optimizer,
    )

    train_dataset = load_dataset(args.dataset_name, tokenizer, mel_spec_kwargs=mel_spec_kwargs)

    trainer.train(
        train_dataset,
        resumable_with_seed=666,  # seed for shuffling dataset
    )
179
+
180
+
181
# Run training only when executed as a script (e.g. via `accelerate launch`),
# not when the module is imported.
if __name__ == "__main__":
    main()
src/f5_tts/train/finetune_gradio.py ADDED
@@ -0,0 +1,1889 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ import queue
3
+ import re
4
+
5
+ import gc
6
+ import json
7
+ import os
8
+ import platform
9
+ import psutil
10
+ import random
11
+ import signal
12
+ import shutil
13
+ import subprocess
14
+ import sys
15
+ import tempfile
16
+ import time
17
+ from glob import glob
18
+
19
+ import click
20
+ import gradio as gr
21
+ import librosa
22
+ import numpy as np
23
+ import torch
24
+ import torchaudio
25
+ from datasets import Dataset as Dataset_
26
+ from datasets.arrow_writer import ArrowWriter
27
+ from safetensors.torch import save_file
28
+ from scipy.io import wavfile
29
+ from cached_path import cached_path
30
+ from f5_tts.api import F5TTS
31
+ from f5_tts.model.utils import convert_char_to_pinyin
32
+ from f5_tts.infer.utils_infer import transcribe
33
+ from importlib.resources import files
34
+
35
+
36
# Handle of the training subprocess started by start_training(); None when no
# training is running.
training_process = None
# OS name; terminate_process() uses it to pick the Windows `taskkill` path.
system = platform.system()
python_executable = sys.executable or "python"
# Inference API instance; start_training() drops it before launching a run.
tts_api = None
# NOTE(review): the three `last_*` values look like a cache key for the loaded
# inference model - their use is outside this part of the file, confirm there.
last_checkpoint = ""
last_device = ""
last_ema = None


# Folder that holds one sub-folder per dataset project.
path_data = str(files("f5_tts").joinpath("../../data"))
# Folder where per-project `setting.json` files are written and read.
# NOTE(review): hard-coded absolute path, while finetune_cli.py writes
# checkpoints under <package dir>/../../ckpts - confirm this is intended.
path_project_ckpts = "/mnt/d/ckpts"
# Training script launched by start_training() through `accelerate launch`.
file_train = str(files("f5_tts").joinpath("train/finetune_cli.py"))

# Pick the first available backend in the order cuda, xpu, mps, then cpu.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "xpu"
    if torch.xpu.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
58
+
59
+
60
# Save settings to a JSON file
def save_settings(
    project_name,
    exp_name,
    learning_rate,
    batch_size_per_gpu,
    batch_size_type,
    max_samples,
    grad_accumulation_steps,
    max_grad_norm,
    epochs,
    num_warmup_updates,
    save_per_updates,
    keep_last_n_checkpoints,
    last_per_updates,
    finetune,
    file_checkpoint_train,
    tokenizer_type,
    tokenizer_file,
    mixed_precision,
    logger,
    ch_8bit_adam,
):
    """Write the training options of a project to ``setting.json``.

    The file is stored in ``<path_project_ckpts>/<project_name>/``; the folder
    is created when missing. ``ch_8bit_adam`` is saved under the key
    ``"bnb_optimizer"``; every other option is saved under its own name.

    Returns:
        str: the confirmation message ``"Settings saved!"``.
    """
    project_dir = os.path.join(path_project_ckpts, project_name)
    os.makedirs(project_dir, exist_ok=True)

    # Keyword order defines the key order inside the JSON file.
    payload = dict(
        exp_name=exp_name,
        learning_rate=learning_rate,
        batch_size_per_gpu=batch_size_per_gpu,
        batch_size_type=batch_size_type,
        max_samples=max_samples,
        grad_accumulation_steps=grad_accumulation_steps,
        max_grad_norm=max_grad_norm,
        epochs=epochs,
        num_warmup_updates=num_warmup_updates,
        save_per_updates=save_per_updates,
        keep_last_n_checkpoints=keep_last_n_checkpoints,
        last_per_updates=last_per_updates,
        finetune=finetune,
        file_checkpoint_train=file_checkpoint_train,
        tokenizer_type=tokenizer_type,
        tokenizer_file=tokenizer_file,
        mixed_precision=mixed_precision,
        logger=logger,
        bnb_optimizer=ch_8bit_adam,
    )

    with open(os.path.join(project_dir, "setting.json"), "w") as handle:
        json.dump(payload, handle, indent=4)
    return "Settings saved!"
111
+
112
+
113
+ # Load settings from a JSON file
114
def load_settings(project_name):
    """Return a project's saved training options, falling back to defaults.

    Every ``"_pinyin"`` and ``"_char"`` occurrence is removed from
    ``project_name`` before ``<path_project_ckpts>/<name>/setting.json`` is
    looked up. Values found in that file override the built-in defaults.

    Returns:
        tuple: the 19 option values in a fixed order (exp_name first,
        bnb_optimizer last).
    """
    bare_name = project_name.replace("_pinyin", "").replace("_char", "")
    settings_path = os.path.join(path_project_ckpts, bare_name, "setting.json")

    # Built-in defaults; the key order below is also the order of the result.
    merged = {
        "exp_name": "F5TTS_Base",
        "learning_rate": 1e-05,
        "batch_size_per_gpu": 1000,
        "batch_size_type": "frame",
        "max_samples": 64,
        "grad_accumulation_steps": 1,
        "max_grad_norm": 1,
        "epochs": 100,
        "num_warmup_updates": 2,
        "save_per_updates": 300,
        "keep_last_n_checkpoints": -1,
        "last_per_updates": 100,
        "finetune": True,
        "file_checkpoint_train": "",
        "tokenizer_type": "pinyin",
        "tokenizer_file": "",
        "mixed_precision": "none",
        "logger": "wandb",
        "bnb_optimizer": False,
    }
    ordered_keys = tuple(merged)

    # Overlay whatever was saved for this project, if anything.
    if os.path.isfile(settings_path):
        with open(settings_path, "r") as handle:
            merged.update(json.load(handle))

    return tuple(merged[key] for key in ordered_keys)
170
+
171
+
172
+ # Load metadata
173
def get_audio_duration(audio_path):
    """Return the duration of an audio file in seconds.

    The file is loaded with ``torchaudio.load`` and the size of the second
    tensor axis is divided by the sample rate.
    """
    waveform, rate = torchaudio.load(audio_path)
    frame_count = waveform.shape[1]
    return frame_count / rate
177
+
178
+
179
def clear_text(text):
    """Return ``text`` lower-cased, with surrounding whitespace removed."""
    lowered = text.lower()
    return lowered.strip()
182
+
183
+
184
def get_rms(
    y,
    frame_length=2048,
    hop_length=512,
    pad_mode="constant",
):  # https://github.com/RVC-Boss/GPT-SoVITS/blob/main/tools/slicer2.py
    """Compute the frame-wise root-mean-square of a signal.

    ``y`` is padded by ``frame_length // 2`` on both sides, cut into windows
    of ``frame_length`` values taken every ``hop_length`` positions along the
    last axis, and the RMS of each window is returned. For a 1-D input the
    result has shape ``(1, n_frames)``.
    """
    half_window = int(frame_length // 2)
    padded = np.pad(y, (half_window, half_window), mode=pad_mode)

    # Zero-copy view with one extra trailing axis that walks inside each window.
    window_starts = padded.shape[-1] - (frame_length - 1)
    view_shape = tuple(padded.shape[:-1]) + (window_starts, frame_length)
    view_strides = padded.strides + (padded.strides[-1],)
    windows = np.lib.stride_tricks.as_strided(padded, shape=view_shape, strides=view_strides)

    # Put the within-window axis second to last, then keep every hop-th window.
    windows = np.moveaxis(windows, -1, -2)
    frames = windows[..., 0::hop_length]

    # Mean power per window, then its square root.
    mean_power = np.mean(np.abs(frames) ** 2, axis=-2, keepdims=True)
    return np.sqrt(mean_power)
215
+
216
+
217
class Slicer:  # https://github.com/RVC-Boss/GPT-SoVITS/blob/main/tools/slicer2.py
    """Cut a waveform into chunks at silent regions found from frame RMS.

    Durations are given in milliseconds and converted with ``sr``; after
    construction ``min_length``, ``min_interval`` and ``max_sil_kept`` are
    stored in units of hops (frames), ``hop_size``/``win_size`` in samples.

    Args:
        sr: sample rate of the waveforms that will be sliced.
        threshold: silence threshold in dB; stored as linear amplitude
            ``10 ** (threshold / 20)``.
        min_length: minimum chunk length, in ms.
        min_interval: minimum silence length that allows a cut, in ms.
        hop_size: RMS frame hop, in ms.
        max_sil_kept: maximum silence kept around a chunk, in ms.

    Raises:
        ValueError: unless ``min_length >= min_interval >= hop_size`` and
            ``max_sil_kept >= hop_size``.
    """

    def __init__(
        self,
        sr: int,
        threshold: float = -40.0,
        min_length: int = 2000,
        min_interval: int = 300,
        hop_size: int = 20,
        max_sil_kept: int = 2000,
    ):
        if not min_length >= min_interval >= hop_size:
            raise ValueError("The following condition must be satisfied: min_length >= min_interval >= hop_size")
        if not max_sil_kept >= hop_size:
            raise ValueError("The following condition must be satisfied: max_sil_kept >= hop_size")
        min_interval = sr * min_interval / 1000  # ms -> samples
        self.threshold = 10 ** (threshold / 20.0)  # dB -> linear amplitude
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        """Return the samples of ``waveform`` between frames ``begin`` and ``end``."""
        if len(waveform.shape) > 1:
            return waveform[:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)]
        else:
            return waveform[begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)]

    # @timeit
    def slice(self, waveform):
        """Split ``waveform`` at silences.

        Returns:
            list: ``[chunk, start, end]`` entries, where ``start``/``end`` are
            sample offsets into ``waveform``. Input that is too short to be
            sliced comes back as a single entry covering the whole waveform.
        """
        if len(waveform.shape) > 1:
            samples = waveform.mean(axis=0)
        else:
            samples = waveform
        # NOTE(review): this compares a sample count with `min_length`, which
        # is stored in frames - kept as in the original.
        if samples.shape[0] <= self.min_length:
            # Same [chunk, start, end] shape as every other return path, so
            # callers that unpack three values also work for very short input.
            return [[waveform, 0, samples.shape[0]]]
        rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        # Each entry: audio + start offset + end offset
        if len(sil_tags) == 0:
            return [[waveform, 0, int(total_frames * self.hop_size)]]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)])
            for i in range(len(sil_tags) - 1):
                chunks.append(
                    [
                        self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),
                        int(sil_tags[i][1] * self.hop_size),
                        int(sil_tags[i + 1][0] * self.hop_size),
                    ]
                )
            if sil_tags[-1][1] < total_frames:
                chunks.append(
                    [
                        self._apply_slice(waveform, sil_tags[-1][1], total_frames),
                        int(sil_tags[-1][1] * self.hop_size),
                        int(total_frames * self.hop_size),
                    ]
                )
            return chunks
332
+
333
+
334
+ # terminal
335
def terminate_process_tree(pid, including_parent=True):
    """Send SIGTERM to every descendant of ``pid`` and optionally to ``pid``.

    Does nothing when the process no longer exists. Descendants are signalled
    first, the parent last; a failing ``os.kill`` is ignored so the remaining
    processes are still signalled.
    """
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        # Process already terminated
        return

    targets = list(root.children(recursive=True))
    if including_parent:
        targets.append(root)

    for proc in targets:
        try:
            os.kill(proc.pid, signal.SIGTERM)  # or signal.SIGKILL
        except OSError:
            pass
353
+
354
+
355
def terminate_process(pid):
    """Kill the process ``pid`` together with its children.

    On Windows this runs ``taskkill /t /f``; on every other system it
    delegates to ``terminate_process_tree``.
    """
    if system != "Windows":
        terminate_process_tree(pid)
        return
    os.system(f"taskkill /t /f /pid {pid}")
361
+
362
+
363
def start_training(
    dataset_name="",
    exp_name="F5TTS_Base",
    learning_rate=1e-4,
    batch_size_per_gpu=400,
    batch_size_type="frame",
    max_samples=64,
    grad_accumulation_steps=1,
    max_grad_norm=1.0,
    epochs=11,
    num_warmup_updates=200,
    save_per_updates=400,
    keep_last_n_checkpoints=-1,
    last_per_updates=800,
    finetune=True,
    file_checkpoint_train="",
    tokenizer_type="pinyin",
    tokenizer_file="",
    mixed_precision="fp16",
    stream=False,
    logger="wandb",
    ch_8bit_adam=False,
):
    """Launch the fine-tuning script in a subprocess and report its progress.

    This is a generator. Every yielded item is a 3-tuple
    ``(status text, gr.update(...), gr.update(...))``; the two updates toggle
    the ``interactive`` flag of two UI components (presumably the start and
    stop buttons - confirm against the UI wiring).

    The command line is built from the arguments, the settings are persisted
    with ``save_settings`` and the script ``file_train`` is started through
    ``accelerate launch``. With ``stream=False`` the call blocks until the
    process exits; with ``stream=True`` stdout/stderr are read by background
    threads and forwarded line by line, with progress-bar lines condensed
    into a short summary.

    Args:
        dataset_name: project folder name under ``path_data``; a ``_pinyin``
            or ``_char`` suffix selects the tokenizer when no
            ``tokenizer_file`` is given.
        stream: forward the subprocess output while it runs.
        ch_8bit_adam: pass ``--bnb_optimizer`` to the training script.
        (All remaining arguments are forwarded as the CLI options of the
        same name.)
    """
    global training_process, tts_api, stop_signal

    # Drop a loaded inference model first so its memory is released before training.
    if tts_api is not None:
        del tts_api

        gc.collect()
        torch.cuda.empty_cache()
        tts_api = None

    path_project = os.path.join(path_data, dataset_name)

    if not os.path.isdir(path_project):
        yield (
            f"There is not project with name {dataset_name}",
            gr.update(interactive=True),
            gr.update(interactive=False),
        )
        return

    file_raw = os.path.join(path_project, "raw.arrow")
    if not os.path.isfile(file_raw):
        yield f"There is no file {file_raw}", gr.update(interactive=True), gr.update(interactive=False)
        return

    # Check if a training process is already running
    if training_process is not None:
        # Inside a generator the message has to be yielded: a plain
        # `return value` only ends the iteration and never reaches the caller.
        yield "Train run already!", gr.update(interactive=False), gr.update(interactive=True)
        return

    yield "start train", gr.update(interactive=False), gr.update(interactive=False)

    # Command to run the training script with the specified arguments

    if tokenizer_file == "":
        if dataset_name.endswith("_pinyin"):
            tokenizer_type = "pinyin"
        elif dataset_name.endswith("_char"):
            tokenizer_type = "char"
    else:
        tokenizer_type = "custom"

    dataset_name = dataset_name.replace("_pinyin", "").replace("_char", "")

    if mixed_precision != "none":
        fp16 = f"--mixed_precision={mixed_precision}"
    else:
        fp16 = ""

    cmd = (
        f"accelerate launch {fp16} {file_train} --exp_name {exp_name}"
        f" --learning_rate {learning_rate}"
        f" --batch_size_per_gpu {batch_size_per_gpu}"
        f" --batch_size_type {batch_size_type}"
        f" --max_samples {max_samples}"
        f" --grad_accumulation_steps {grad_accumulation_steps}"
        f" --max_grad_norm {max_grad_norm}"
        f" --epochs {epochs}"
        f" --num_warmup_updates {num_warmup_updates}"
        f" --save_per_updates {save_per_updates}"
        f" --keep_last_n_checkpoints {keep_last_n_checkpoints}"
        f" --last_per_updates {last_per_updates}"
        f" --dataset_name {dataset_name}"
    )

    if finetune:
        cmd += " --finetune"

    if file_checkpoint_train != "":
        cmd += f" --pretrain {file_checkpoint_train}"

    if tokenizer_file != "":
        cmd += f" --tokenizer_path {tokenizer_file}"

    cmd += f" --tokenizer {tokenizer_type}"

    cmd += f" --log_samples --logger {logger}"

    if ch_8bit_adam:
        cmd += " --bnb_optimizer"

    print("run command : \n" + cmd + "\n")

    save_settings(
        dataset_name,
        exp_name,
        learning_rate,
        batch_size_per_gpu,
        batch_size_type,
        max_samples,
        grad_accumulation_steps,
        max_grad_norm,
        epochs,
        num_warmup_updates,
        save_per_updates,
        keep_last_n_checkpoints,
        last_per_updates,
        finetune,
        file_checkpoint_train,
        tokenizer_type,
        tokenizer_file,
        mixed_precision,
        logger,
        ch_8bit_adam,
    )

    try:
        if not stream:
            # Start the training process
            # NOTE(review): `cmd` is a shell string built from UI values; paths
            # containing spaces or shell metacharacters are not quoted.
            training_process = subprocess.Popen(cmd, shell=True)

            time.sleep(5)
            yield "train start", gr.update(interactive=False), gr.update(interactive=True)

            # Wait for the training process to finish
            training_process.wait()
        else:

            def stream_output(pipe, output_queue):
                """Copy every line read from ``pipe`` into ``output_queue``."""
                try:
                    for line in iter(pipe.readline, ""):
                        output_queue.put(line)
                except Exception as e:
                    output_queue.put(f"Error reading pipe: {str(e)}")
                finally:
                    pipe.close()

            env = os.environ.copy()
            env["PYTHONUNBUFFERED"] = "1"  # make the child flush its output line by line

            training_process = subprocess.Popen(
                cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1, env=env
            )
            yield "Training started...", gr.update(interactive=False), gr.update(interactive=True)

            stdout_queue = queue.Queue()
            stderr_queue = queue.Queue()

            # Reader threads keep the pipes drained so this loop never blocks on them.
            stdout_thread = threading.Thread(target=stream_output, args=(training_process.stdout, stdout_queue))
            stderr_thread = threading.Thread(target=stream_output, args=(training_process.stderr, stderr_queue))
            stdout_thread.daemon = True
            stderr_thread.daemon = True
            stdout_thread.start()
            stderr_thread.start()
            stop_signal = False
            while True:
                # stop_training() sets stop_signal to request a shutdown.
                if stop_signal:
                    training_process.terminate()
                    time.sleep(0.5)
                    if training_process.poll() is None:
                        training_process.kill()
                    yield "Training stopped by user.", gr.update(interactive=True), gr.update(interactive=False)
                    break

                process_status = training_process.poll()

                # Handle stdout
                try:
                    while True:
                        output = stdout_queue.get_nowait()
                        print(output, end="")
                        # Condense a progress-bar line into a short status message.
                        match = re.search(
                            r"Epoch (\d+)/(\d+):\s+(\d+)%\|.*\[(\d+:\d+)<.*?loss=(\d+\.\d+), update=(\d+)", output
                        )
                        if match:
                            current_epoch = match.group(1)
                            total_epochs = match.group(2)
                            percent_complete = match.group(3)
                            elapsed_time = match.group(4)
                            loss = match.group(5)
                            current_update = match.group(6)
                            message = (
                                f"Epoch: {current_epoch}/{total_epochs}, "
                                f"Progress: {percent_complete}%, "
                                f"Elapsed Time: {elapsed_time}, "
                                f"Loss: {loss}, "
                                f"Update: {current_update}"
                            )
                            yield message, gr.update(interactive=False), gr.update(interactive=True)
                        elif output.strip():
                            yield output, gr.update(interactive=False), gr.update(interactive=True)
                except queue.Empty:
                    pass

                # Handle stderr
                try:
                    while True:
                        error_output = stderr_queue.get_nowait()
                        print(error_output, end="")
                        if error_output.strip():
                            yield f"{error_output.strip()}", gr.update(interactive=False), gr.update(interactive=True)
                except queue.Empty:
                    pass

                # Finish only once the process has exited AND all queued output was forwarded.
                if process_status is not None and stdout_queue.empty() and stderr_queue.empty():
                    if process_status != 0:
                        yield (
                            f"Process crashed with exit code {process_status}!",
                            gr.update(interactive=False),
                            gr.update(interactive=True),
                        )
                    else:
                        yield "Training complete!", gr.update(interactive=False), gr.update(interactive=True)
                    break

                # Small sleep to prevent CPU thrashing
                time.sleep(0.1)

            # Clean up
            training_process.stdout.close()
            training_process.stderr.close()
            training_process.wait()

        time.sleep(1)

        if training_process is None:
            text_info = "train stop"
        else:
            text_info = "train complete !"

    except Exception as e:  # Catch all exceptions
        # Ensure that we reset the training process variable in case of an error
        text_info = f"An error occurred: {str(e)}"

    training_process = None

    yield text_info, gr.update(interactive=True), gr.update(interactive=False)
612
+
613
+
614
def stop_training():
    """Ask the running training subprocess to stop.

    When a process is recorded in ``training_process`` its whole process tree
    is signalled and ``stop_signal`` is raised for the streaming loop of
    ``start_training``; ``training_process`` itself is left untouched.

    Returns:
        tuple: status text plus two ``gr.update`` objects (first interactive,
        second not).
    """
    global training_process, stop_signal

    running = training_process
    if running is not None:
        terminate_process_tree(running.pid)
        # training_process = None
        stop_signal = True
        status = "train stop"
    else:
        status = "Train not run !"
    return status, gr.update(interactive=True), gr.update(interactive=False)
623
+
624
+
625
def get_list_projects():
    """List the project folders found in ``path_data``.

    Only directories are considered and the folder named
    ``emilia_zh_en_pinyin`` (compared case-insensitively) is skipped.

    Returns:
        tuple: ``(project_list, selected)`` where ``selected`` is the last
        listed project, or ``None`` when there is none.
    """
    project_list = []
    for folder in os.listdir(path_data):
        if not os.path.isdir(os.path.join(path_data, folder)):
            continue
        # Lower-case only for the comparison. The name is returned exactly as
        # it is on disk, because callers join it with `path_data` again and a
        # lower-cased name would not resolve on case-sensitive filesystems.
        if folder.lower() == "emilia_zh_en_pinyin":
            continue
        project_list.append(folder)

    projects_selelect = project_list[-1] if project_list else None

    return project_list, projects_selelect
639
+
640
+
641
def create_data_project(name, tokenizer_type):
    """Create ``<name>_<tokenizer_type>`` and its ``dataset`` subfolder under ``path_data``.

    Returns:
        A Gradio update whose choices are the refreshed project list and
        whose value is the newly created project name.
    """
    project = name + "_" + tokenizer_type
    project_dir = os.path.join(path_data, project)
    for folder in (project_dir, os.path.join(project_dir, "dataset")):
        os.makedirs(folder, exist_ok=True)

    available_projects = get_list_projects()[0]
    return gr.update(choices=available_projects, value=project)
647
+
648
+
649
def transcribe_all(name_project, audio_files, language, user=False, progress=gr.Progress()):
    """Slice the project's audio into segments, transcribe them and write ``metadata.csv``.

    Any existing ``wavs`` folder and ``metadata.csv`` of the project are
    removed first.  Each source file is loaded at 24 kHz mono, cut by
    ``Slicer`` and every chunk is written to ``wavs/segment_<n>.wav`` before
    being passed to ``transcribe``.

    Args:
        name_project: Project folder name under ``path_data``.
        audio_files: Paths of the uploaded audio files (used when ``user`` is false).
        language: Language value forwarded to ``transcribe``.
        user: When true, audio is taken from the project's ``dataset`` folder
            instead of ``audio_files``.
        progress: Gradio progress tracker.

    Returns:
        A human-readable status string.
    """
    path_project = os.path.join(path_data, name_project)
    path_dataset = os.path.join(path_project, "dataset")
    path_project_wavs = os.path.join(path_project, "wavs")
    file_metadata = os.path.join(path_project, "metadata.csv")

    if not user and audio_files is None:
        return "You need to load an audio file."

    # Start from a clean slate so stale segments never mix with new ones.
    if os.path.isdir(path_project_wavs):
        shutil.rmtree(path_project_wavs)

    if os.path.isfile(file_metadata):
        os.remove(file_metadata)

    os.makedirs(path_project_wavs, exist_ok=True)

    if user:
        file_audios = [
            file
            for pattern in ("*.wav", "*.ogg", "*.opus", "*.mp3", "*.flac")
            for file in glob(os.path.join(path_dataset, pattern))
        ]
        if not file_audios:
            return "No audio file was found in the dataset."
    else:
        file_audios = audio_files

    alpha = 0.5
    _max = 1.0
    slicer = Slicer(24000)

    num = 0
    error_num = 0
    metadata_lines = []
    for file_audio in progress.tqdm(file_audios, desc="transcribe files", total=len(file_audios)):
        audio, _ = librosa.load(file_audio, sr=24000, mono=True)

        list_slicer = slicer.slice(audio)
        for chunk, _start, _end in progress.tqdm(list_slicer, total=len(list_slicer), desc="slicer files"):
            name_segment = f"segment_{num}"
            file_segment = os.path.join(path_project_wavs, f"{name_segment}.wav")

            # NOTE(review): when tmp_max > 1 the chunk is divided by tmp_max
            # in place and the blend below divides by the same tmp_max again;
            # kept as in the original, confirm this is intended.
            tmp_max = np.abs(chunk).max()
            if tmp_max > 1:
                chunk /= tmp_max
            chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
            wavfile.write(file_segment, 24000, (chunk * 32767).astype(np.int16))

            try:
                text = transcribe(file_segment, language)
                text = text.lower().strip().replace('"', "")
            except Exception as e:
                # Best effort: count the failure and keep going.  ``num`` is
                # not advanced, so the next chunk reuses this segment name.
                print(f"Error processing {file_segment}: {e}")
                error_num += 1
                continue

            metadata_lines.append(f"{name_segment}|{text}\n")
            num += 1

    with open(file_metadata, "w", encoding="utf-8-sig") as f:
        f.write("".join(metadata_lines))

    # ``error_num`` is an int; only mention errors when there were any.
    error_text = f"\nerror files : {error_num}" if error_num != 0 else ""

    return f"transcribe complete samples : {num}\npath : {path_project_wavs}{error_text}"
718
+
719
+
720
def format_seconds_to_hms(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated; hours are not wrapped at 24.
    """
    whole_hours = int(seconds / 3600)
    whole_minutes = int((seconds % 3600) / 60)
    leftover = int(seconds % 60)
    return f"{whole_hours:02d}:{whole_minutes:02d}:{leftover:02d}"
725
+
726
+
727
def get_correct_audio_path(
    audio_input,
    base_path="wavs",
    supported_formats=("wav", "mp3", "aac", "flac", "m4a", "alac", "ogg", "aiff", "wma", "amr"),
):
    """Resolve a metadata audio entry to a file path.

    Args:
        audio_input: Absolute path, relative file name, or bare name without
            extension.
        base_path: Folder that relative entries are resolved against.
        supported_formats: Extensions (without dot) that are recognised and,
            for extension-less entries, probed in order.

    Returns:
        The resolved path.  Entries that already carry a supported extension
        are returned as-is (absolute) or joined onto ``base_path`` (relative).
        For entries without one, the first existing ``<entry>.<ext>`` is
        returned, falling back to the first supported extension when none
        exists.  Absolute entries without an extension are probed the same
        way instead of yielding ``None``.
    """
    suffixes = tuple(f".{ext}" for ext in supported_formats)
    is_absolute = os.path.isabs(audio_input)

    # Entry already names a supported audio file.
    if audio_input.endswith(suffixes):
        return audio_input if is_absolute else os.path.join(base_path, audio_input)

    # Bare name: probe each supported extension next to the entry.
    stem = audio_input if is_absolute else os.path.join(base_path, audio_input)
    for ext in supported_formats:
        potential_file = f"{stem}.{ext}"
        if os.path.exists(potential_file):
            return potential_file

    return f"{stem}.{supported_formats[0]}"
756
+
757
+
758
def create_metadata(name_project, ch_tokenizer, progress=gr.Progress()):
    """Build ``raw.arrow``, ``duration.json`` and ``vocab.txt`` from ``metadata.csv``.

    Each ``name|text`` line of the metadata file is resolved to an audio
    file; entries with a missing file, an unreadable duration, a duration
    outside 1-25 seconds or a text shorter than 3 characters are skipped and
    reported.

    Args:
        name_project: Project folder name under ``path_data``.
        ch_tokenizer: When true, ``vocab.txt`` is written from the characters
            found in the processed texts; otherwise an existing ``vocab.txt``
            is used, or copied from ``Emilia_ZH_EN_pinyin/vocab.txt``.
        progress: Gradio progress tracker.

    Returns:
        ``(info_text, new_vocab_text)``; the second item is empty unless a
        vocabulary was generated.
    """
    path_project = os.path.join(path_data, name_project)
    path_project_wavs = os.path.join(path_project, "wavs")
    file_metadata = os.path.join(path_project, "metadata.csv")
    file_raw = os.path.join(path_project, "raw.arrow")
    file_duration = os.path.join(path_project, "duration.json")
    file_vocab = os.path.join(path_project, "vocab.txt")

    if not os.path.isfile(file_metadata):
        return "The file was not found in " + file_metadata, ""

    with open(file_metadata, "r", encoding="utf-8-sig") as f:
        data = f.read()

    duration_list = []
    total_duration = 0
    result = []
    error_files = []
    text_vocab_set = set()

    metadata_lines = data.split("\n")
    # ``total`` must be the number of lines, not the list itself.
    for line in progress.tqdm(metadata_lines, total=len(metadata_lines)):
        sp_line = line.split("|")
        if len(sp_line) != 2:
            continue
        name_audio, text = sp_line

        file_audio = get_correct_audio_path(name_audio, path_project_wavs)

        if not os.path.isfile(file_audio):
            error_files.append([file_audio, "error path"])
            continue

        try:
            duration = get_audio_duration(file_audio)
        except Exception as e:
            error_files.append([file_audio, "duration"])
            print(f"Error processing {file_audio}: {e}")
            continue

        if duration > 25:
            error_files.append([file_audio, "duration > 25 sec"])
            continue
        if duration < 1:
            error_files.append([file_audio, "duration < 1 sec "])
            continue
        if len(text) < 3:
            error_files.append([file_audio, "very small text len 3"])
            continue

        text = clear_text(text)
        text = convert_char_to_pinyin([text], polyphone=True)[0]

        duration_list.append(duration)
        result.append({"audio_path": file_audio, "text": text, "duration": duration})
        if ch_tokenizer:
            text_vocab_set.update(list(text))

        total_duration += duration

    if not duration_list:
        return f"Error: No audio files found in the specified path : {path_project_wavs}", ""

    min_second = round(min(duration_list), 2)
    max_second = round(max(duration_list), 2)

    with ArrowWriter(path=file_raw, writer_batch_size=1) as writer:
        for line in progress.tqdm(result, total=len(result), desc="prepare data"):
            writer.write(line)

    with open(file_duration, "w") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    new_vocal = ""
    if not ch_tokenizer:
        if not os.path.isfile(file_vocab):
            file_vocab_finetune = os.path.join(path_data, "Emilia_ZH_EN_pinyin/vocab.txt")
            if not os.path.isfile(file_vocab_finetune):
                return "Error: Vocabulary file 'Emilia_ZH_EN_pinyin' not found!", ""
            shutil.copy2(file_vocab_finetune, file_vocab)

        with open(file_vocab, "r", encoding="utf-8-sig") as f:
            # One entry per line; the last character of each line is dropped
            # before it is used as a key.
            vocab_char_map = {char[:-1]: i for i, char in enumerate(f)}
        vocab_size = len(vocab_char_map)

    else:
        new_vocal = "".join(vocab + "\n" for vocab in sorted(text_vocab_set))
        with open(file_vocab, "w", encoding="utf-8-sig") as f:
            f.write(new_vocal)
        vocab_size = len(text_vocab_set)

    error_text = "\n".join(" = ".join(item) for item in error_files) if error_files else ""

    return (
        f"prepare complete \nsamples : {len(result)}\ntime data : {format_seconds_to_hms(total_duration)}\nmin sec : {min_second}\nmax sec : {max_second}\nfile_arrow : {file_raw}\nvocab : {vocab_size}\n{error_text}",
        new_vocal,
    )
866
+
867
+
868
def check_user(value):
    """Return two Gradio visibility updates: the first is the inverse of ``value``, the second is ``value``."""
    hide_first = not value
    return gr.update(visible=hide_first), gr.update(visible=value)
870
+
871
+
872
def calculate_train(
    name_project,
    batch_size_type,
    max_samples,
    learning_rate,
    num_warmup_updates,
    save_per_updates,
    last_per_updates,
    finetune,
):
    """Suggest training settings from the project's ``duration.json`` and available memory.

    Args:
        name_project: Project folder name under ``path_data``.
        batch_size_type: ``"frame"`` or anything else (treated as sample-based).
        max_samples, learning_rate, num_warmup_updates, save_per_updates,
        last_per_updates: Current UI values; echoed back unchanged when the
            project has no ``duration.json``.
        finetune: Selects the suggested learning rate (1e-5 if true, else 7.5e-5).

    Returns:
        An 8-tuple: batch size per GPU, max samples, warmup updates, save
        interval, last-checkpoint interval, sample count (or an error text),
        learning rate, epochs.
    """

    def _round_up_to_even(num):
        return num + 1 if num % 2 != 0 else num

    path_project = os.path.join(path_data, name_project)
    file_duraction = os.path.join(path_project, "duration.json")

    if not os.path.isfile(file_duraction):
        # The success path returns eight values, so this one must too; the
        # epochs slot gets a no-op update.
        # NOTE(review): confirm the eighth output of the click handler is the
        # epochs component.
        return (
            1000,
            max_samples,
            num_warmup_updates,
            save_per_updates,
            last_per_updates,
            "project not found !",
            learning_rate,
            gr.update(),
        )

    with open(file_duraction, "r") as file:
        data = json.load(file)

    duration_list = data["duration"]
    samples = len(duration_list)
    total_hours = sum(duration_list) / 3600

    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        total_memory = 0
        for i in range(gpu_count):
            total_memory += torch.cuda.get_device_properties(i).total_memory / (1024**3)  # in GB

    elif torch.xpu.is_available():
        gpu_count = torch.xpu.device_count()
        total_memory = 0
        for i in range(gpu_count):
            total_memory += torch.xpu.get_device_properties(i).total_memory / (1024**3)

    else:
        # MPS, or no accelerator at all: size from available system memory so
        # the values below are always defined.
        gpu_count = 1
        total_memory = psutil.virtual_memory().available / (1024**3)

    if batch_size_type == "frame":
        batch = _round_up_to_even(int(total_memory * 0.5))
        batch_size_per_gpu = int(38400 / batch)
    else:
        batch_size_per_gpu = _round_up_to_even(int(total_memory / 8))

    if batch_size_per_gpu <= 0:
        batch_size_per_gpu = 1

    max_samples = int(samples * 0.25) if samples < 64 else 64

    num_warmup_updates = int(samples * 0.05)
    save_per_updates = int(samples * 0.10)
    last_per_updates = int(save_per_updates * 0.25)

    max_samples = _round_up_to_even(max_samples)
    num_warmup_updates = _round_up_to_even(num_warmup_updates)
    save_per_updates = _round_up_to_even(save_per_updates)
    last_per_updates = _round_up_to_even(last_per_updates)
    if last_per_updates <= 0:
        last_per_updates = 2

    mel_hop_length = 256
    mel_sampling_rate = 24000

    # target
    wanted_max_updates = 1000000

    # train params
    gpus = gpu_count
    frames_per_gpu = batch_size_per_gpu  # 8 * 38400 = 307200
    grad_accum = 1

    # intermediate
    mini_batch_frames = frames_per_gpu * grad_accum * gpus
    mini_batch_hours = mini_batch_frames * mel_hop_length / mel_sampling_rate / 3600
    updates_per_epoch = total_hours / mini_batch_hours
    epochs = wanted_max_updates / updates_per_epoch

    learning_rate = 1e-5 if finetune else 7.5e-5

    return (
        batch_size_per_gpu,
        max_samples,
        num_warmup_updates,
        save_per_updates,
        last_per_updates,
        samples,
        learning_rate,
        int(epochs),
    )
989
+
990
+
991
def extract_and_save_ema_model(checkpoint_path: str, new_checkpoint_path: str, safetensors: bool) -> str:
    """Write a checkpoint's ``ema_model_state_dict`` to a new file.

    Args:
        checkpoint_path: Checkpoint to read (loaded with ``weights_only=True``).
        new_checkpoint_path: Destination; ``.pt`` / ``.safetensors`` in the
            path is swapped to match the chosen format.
        safetensors: Save with ``save_file`` when true, otherwise with
            ``torch.save`` wrapped under the ``ema_model_state_dict`` key.

    Returns:
        A status message; any exception is reported in the message rather
        than raised.
    """
    try:
        loaded = torch.load(checkpoint_path, weights_only=True)
        print("Original Checkpoint Keys:", loaded.keys())

        ema_weights = loaded.get("ema_model_state_dict", None)
        if ema_weights is None:
            return "No 'ema_model_state_dict' found in the checkpoint."

        if not safetensors:
            new_checkpoint_path = new_checkpoint_path.replace(".safetensors", ".pt")
            torch.save({"ema_model_state_dict": ema_weights}, new_checkpoint_path)
        else:
            new_checkpoint_path = new_checkpoint_path.replace(".pt", ".safetensors")
            save_file(ema_weights, new_checkpoint_path)

        return f"New checkpoint saved at: {new_checkpoint_path}"

    except Exception as err:
        return f"An error occurred: {err}"
1012
+
1013
+
1014
def expand_model_embeddings(ckpt_path, new_ckpt_path, num_new_tokens=42):
    """Append ``num_new_tokens`` rows to the EMA text-embedding weight and save a new checkpoint.

    The existing rows are copied unchanged; the added rows are drawn from
    ``torch.randn`` after seeding the Python, hash and torch RNGs with a
    fixed seed.  Only the
    ``ema_model.transformer.text_embed.text_embed.weight`` entry of
    ``ema_model_state_dict`` is modified.

    Returns:
        The new number of embedding rows.
    """
    fixed_seed = 666
    random.seed(fixed_seed)
    os.environ["PYTHONHASHSEED"] = str(fixed_seed)
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(fixed_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    checkpoint = torch.load(ckpt_path, map_location="cpu")

    ema_state = checkpoint.get("ema_model_state_dict", {})
    weight_key = "ema_model.transformer.text_embed.text_embed.weight"
    old_weights = ema_state[weight_key]

    old_rows = old_weights.size(0)
    width = old_weights.size(1)
    new_rows = old_rows + num_new_tokens

    # Fresh table: old rows first, randomly initialised rows after them.
    grown = torch.zeros((new_rows, width))
    grown[:old_rows] = ema_state[weight_key]
    grown[old_rows:] = torch.randn((num_new_tokens, width))
    ema_state[weight_key] = grown

    torch.save(checkpoint, new_ckpt_path)

    return new_rows
1045
+
1046
+
1047
def vocab_count(text):
    """Return, as a string, how many comma-separated fields ``text`` contains."""
    field_total = text.count(",") + 1
    return str(field_total)
1049
+
1050
+
1051
def vocab_extend(project_name, symbols, model_type):
    """Add missing symbols to the project vocabulary and grow the pretrained embedding to match.

    The base vocabulary is read from ``Emilia_ZH_EN_pinyin/vocab.txt``, the
    symbols not already present are appended and the result is written to
    the project's ``vocab.txt``.  The pretrained checkpoint for
    ``model_type`` is then fetched and re-saved with extra embedding rows as
    ``pretrained_model_1200000.pt`` in the project's checkpoint folder.

    Args:
        project_name: Project folder name under ``path_data``.
        symbols: Comma-separated symbols; spaces inside each item are removed.
        model_type: ``"F5-TTS"`` selects the F5 checkpoint, anything else the E2 one.

    Returns:
        A status message.
    """
    if symbols == "":
        return "Symbols empty!"

    path_project = os.path.join(path_data, project_name)
    file_vocab_project = os.path.join(path_project, "vocab.txt")

    file_vocab = os.path.join(path_data, "Emilia_ZH_EN_pinyin/vocab.txt")
    if not os.path.isfile(file_vocab):
        return f"the file {file_vocab} not found !"

    requested = symbols.split(",")
    if not requested:
        return "Symbols to extend not found."

    with open(file_vocab, "r", encoding="utf-8-sig") as f:
        vocab = f.read().split("\n")
    known_symbols = set(vocab)

    cleaned = (item.replace(" ", "") for item in requested)
    miss_symbols = [symbol for symbol in cleaned if symbol not in known_symbols]

    if not miss_symbols:
        return "Symbols are okay no need to extend."

    size_vocab = len(vocab)
    # Drop the last entry (presumably the empty string after a trailing
    # newline), append the new symbols, then restore the empty tail.
    vocab.pop()
    vocab.extend(miss_symbols)
    vocab.append("")

    with open(file_vocab_project, "w", encoding="utf-8") as f:
        f.write("\n".join(vocab))

    if model_type == "F5-TTS":
        pretrained_uri = "hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.pt"
    else:
        pretrained_uri = "hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.pt"
    ckpt_path = str(cached_path(pretrained_uri))

    # NOTE(review): one fewer embedding row is added than symbols appended to
    # the vocab file; confirm this offset is intended.
    added_rows = len(miss_symbols) - 1

    dataset_name = project_name.replace("_pinyin", "").replace("_char", "")
    new_ckpt_path = os.path.join(path_project_ckpts, dataset_name)
    os.makedirs(new_ckpt_path, exist_ok=True)

    # Add pretrained_ prefix to model when copying for consistency with finetune_cli.py
    new_ckpt_file = os.path.join(new_ckpt_path, "pretrained_model_1200000.pt")

    size = expand_model_embeddings(ckpt_path, new_ckpt_file, num_new_tokens=added_rows)

    vocab_new = "\n".join(miss_symbols)
    return f"vocab old size : {size_vocab}\nvocab new size : {size}\nvocab add : {added_rows}\nnew symbols :\n{vocab_new}"
1110
+
1111
+
1112
def vocab_check(project_name):
    """Report characters used in the project's metadata that the base vocabulary lacks.

    The base vocabulary is ``Emilia_ZH_EN_pinyin/vocab.txt``; the text column
    of each ``name|text`` line in ``metadata.csv`` is lower-cased, stripped
    and scanned character by character.

    Returns:
        ``(info, missing)`` where ``missing`` is the comma-joined list of
        unknown characters in first-seen order, or ``""`` when none are
        missing or a required file is absent.
    """
    path_project = os.path.join(path_data, project_name)
    file_metadata = os.path.join(path_project, "metadata.csv")

    file_vocab = os.path.join(path_data, "Emilia_ZH_EN_pinyin/vocab.txt")
    if not os.path.isfile(file_vocab):
        return f"the file {file_vocab} not found !", ""

    with open(file_vocab, "r", encoding="utf-8-sig") as f:
        vocab = set(f.read().split("\n"))

    if not os.path.isfile(file_metadata):
        return f"the file {file_metadata} not found !", ""

    with open(file_metadata, "r", encoding="utf-8-sig") as f:
        rows = f.read().split("\n")

    miss_symbols = []
    already_reported = set()
    for row in rows:
        fields = row.split("|")
        if len(fields) != 2:
            continue

        for ch in fields[1].lower().strip():
            if ch in vocab or ch in already_reported:
                continue
            already_reported.add(ch)
            miss_symbols.append(ch)

    if miss_symbols:
        info = f"The following symbols are missing in your language {len(miss_symbols)}\n\n"
        return info, ",".join(miss_symbols)

    return "You can train using your language !", ""
1155
+
1156
+
1157
def get_random_sample_prepare(project_name):
    """Pick a random row from the project's ``raw.arrow``.

    Returns:
        ``(text, audio_path)`` where ``text`` is a bracketed, quoted rendering
        of the row's text items, or ``("", None)`` when ``raw.arrow`` does not
        exist.
    """
    file_arrow = os.path.join(path_data, project_name, "raw.arrow")
    if not os.path.isfile(file_arrow):
        return "", None

    dataset = Dataset_.from_file(file_arrow)
    shuffle_seed = random.randint(0, 1000)
    picked = dataset.shuffle(seed=shuffle_seed).select([0])

    quoted_items = ["' " + item + " '" for item in picked["text"][0]]
    text = "[" + " , ".join(quoted_items) + "]"
    return text, picked["audio_path"][0]
1168
+
1169
+
1170
def get_random_sample_transcribe(project_name):
    """Pick a random ``name|text`` entry from the project's ``metadata.csv``.

    Returns:
        ``(text, audio_path)`` with the audio name resolved against the
        project's ``wavs`` folder, or ``("", None)`` when the metadata file is
        missing or has no valid line.
    """
    path_project = os.path.join(path_data, project_name)
    file_metadata = os.path.join(path_project, "metadata.csv")
    if not os.path.isfile(file_metadata):
        return "", None

    with open(file_metadata, "r", encoding="utf-8-sig") as f:
        raw_lines = f.read().split("\n")

    wavs_dir = os.path.join(path_project, "wavs")
    split_lines = (raw.split("|") for raw in raw_lines)
    # Only well-formed two-column lines are candidates; audio names are
    # resolved so absolute and extension-less entries also work.
    candidates = [
        [get_correct_audio_path(fields[0], wavs_dir), fields[1]]
        for fields in split_lines
        if len(fields) == 2
    ]

    if not candidates:
        return "", None

    audio_path, text = random.choice(candidates)
    return text, audio_path
1197
+
1198
+
1199
def get_random_sample_infer(project_name):
    """Return a random transcribed sample as ``(text, text, audio)``; the text is repeated in the first two slots."""
    sample_text, sample_audio = get_random_sample_transcribe(project_name)
    return sample_text, sample_text, sample_audio
1206
+
1207
+
1208
def infer(
    project, file_checkpoint, exp_name, ref_text, ref_audio, gen_text, nfe_step, use_ema, speed, seed, remove_silence
):
    """Generate speech with the selected checkpoint and return the output file.

    The ``F5TTS`` instance is cached in module globals and rebuilt only when
    the checkpoint, device or EMA choice changes.  While a training process
    is registered, inference is forced onto the CPU.

    Returns:
        ``(wav_path, device, seed_text)`` on success.  When the checkpoint
        file does not exist, ``(None, "checkpoint not found!", gr.update())``
        so that the number of returned values matches the success path.
    """
    global last_checkpoint, last_device, tts_api, last_ema

    if not os.path.isfile(file_checkpoint):
        # Three values, like the success path; the last one is a no-op update.
        return None, "checkpoint not found!", gr.update()

    device_test = "cpu" if training_process is not None else None

    needs_reload = (
        tts_api is None
        or last_checkpoint != file_checkpoint
        or last_device != device_test
        or last_ema != use_ema
    )
    if needs_reload:
        vocab_file = os.path.join(path_data, project, "vocab.txt")

        tts_api = F5TTS(
            model_type=exp_name, ckpt_file=file_checkpoint, vocab_file=vocab_file, device=device_test, use_ema=use_ema
        )

        # Record the cache keys only after the model was built, so a failed
        # load is retried instead of silently reusing the previous model.
        last_checkpoint = file_checkpoint
        last_device = device_test
        last_ema = use_ema

        print("update >> ", device_test, file_checkpoint, use_ema)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        tts_api.infer(
            gen_text=gen_text.lower().strip(),
            ref_text=ref_text.lower().strip(),
            ref_file=ref_audio,
            nfe_step=nfe_step,
            file_wave=f.name,
            speed=speed,
            seed=seed,
            remove_silence=remove_silence,
        )
        return f.name, tts_api.device, str(tts_api.seed)
1251
+
1252
+
1253
def check_finetune(finetune):
    """Return three Gradio updates that all set ``interactive`` to ``finetune``."""
    return tuple(gr.update(interactive=finetune) for _ in range(3))
1255
+
1256
+
1257
def get_checkpoints_project(project_name, is_gradio=True):
    """List a project's ``*.pt`` checkpoints, ordered pretrained, numbered, then ``model_last.pt``.

    Args:
        project_name: Project name; the ``_pinyin`` / ``_char`` suffix is
            removed to find the checkpoint folder under ``path_project_ckpts``.
        is_gradio: When true, return a Gradio update instead of a tuple.

    Returns:
        ``gr.update(choices=..., value=first)`` or ``(files, first)``;
        ``([], "")`` when ``project_name`` is ``None``.
    """
    if project_name is None:
        return [], ""
    project_name = project_name.replace("_pinyin", "").replace("_char", "")

    def _step_key(path):
        # ``model_<step>.pt`` sorts by step; any other name sorts after the
        # numbered ones by file name instead of raising.
        base = os.path.basename(path)
        try:
            return (0, int(base.split("_")[1].split(".")[0]), "")
        except (IndexError, ValueError):
            return (1, 0, base)

    if os.path.isdir(path_project_ckpts):
        files_checkpoints = glob(os.path.join(path_project_ckpts, project_name, "*.pt"))

        pretrained_checkpoints = []
        regular_checkpoints = []
        last_checkpoint = []
        for f in files_checkpoints:
            base = os.path.basename(f)
            if "model_last.pt" in base:
                last_checkpoint.append(f)
            if "pretrained_" in base:
                pretrained_checkpoints.append(f)
            elif "model_last.pt" not in base:
                regular_checkpoints.append(f)

        regular_checkpoints.sort(key=_step_key)

        # Combine in order: pretrained, regular, last
        files_checkpoints = pretrained_checkpoints + regular_checkpoints + last_checkpoint
    else:
        files_checkpoints = []

    selelect_checkpoint = files_checkpoints[0] if files_checkpoints else None

    if is_gradio:
        return gr.update(choices=files_checkpoints, value=selelect_checkpoint)

    return files_checkpoints, selelect_checkpoint
1289
+
1290
+
1291
def get_audio_project(project_name, is_gradio=True):
    """List the sample prefixes found in a project's ``samples`` checkpoint folder.

    Only ``*_gen.wav`` files are considered; each is returned without the
    ``_gen.wav`` suffix, ordered by the integer that follows the first
    underscore of the file name.

    Args:
        project_name: Project name; the ``_pinyin`` / ``_char`` suffix is removed.
        is_gradio: When true, return a Gradio update instead of a tuple.

    Returns:
        ``gr.update(choices=..., value=first)`` or ``(files, first)``;
        ``([], "")`` when ``project_name`` is ``None``.
    """
    if project_name is None:
        return [], ""
    project_name = project_name.replace("_pinyin", "").replace("_char", "")

    suffix = "_gen.wav"

    def _step_key(path):
        # Names without a numeric second field sort last instead of raising.
        base = os.path.basename(path)
        try:
            return (0, int(base.split("_")[1].split(".")[0]), "")
        except (IndexError, ValueError):
            return (1, 0, base)

    if os.path.isdir(path_project_ckpts):
        all_wavs = glob(os.path.join(path_project_ckpts, project_name, "samples", "*.wav"))
        # Filter before sorting so unrelated wav files cannot break the sort.
        generated = sorted((item for item in all_wavs if item.endswith(suffix)), key=_step_key)
        files_audios = [item[: -len(suffix)] for item in generated]
    else:
        files_audios = []

    selelect_checkpoint = files_audios[0] if files_audios else None

    if is_gradio:
        return gr.update(choices=files_audios, value=selelect_checkpoint)

    return files_audios, selelect_checkpoint
1310
+
1311
+
1312
+ def get_gpu_stats():
1313
+ gpu_stats = ""
1314
+
1315
+ if torch.cuda.is_available():
1316
+ gpu_count = torch.cuda.device_count()
1317
+ for i in range(gpu_count):
1318
+ gpu_name = torch.cuda.get_device_name(i)
1319
+ gpu_properties = torch.cuda.get_device_properties(i)
1320
+ total_memory = gpu_properties.total_memory / (1024**3) # in GB
1321
+ allocated_memory = torch.cuda.memory_allocated(i) / (1024**2) # in MB
1322
+ reserved_memory = torch.cuda.memory_reserved(i) / (1024**2) # in MB
1323
+
1324
+ gpu_stats += (
1325
+ f"GPU {i} Name: {gpu_name}\n"
1326
+ f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
1327
+ f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
1328
+ f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
1329
+ )
1330
+ elif torch.xpu.is_available():
1331
+ gpu_count = torch.xpu.device_count()
1332
+ for i in range(gpu_count):
1333
+ gpu_name = torch.xpu.get_device_name(i)
1334
+ gpu_properties = torch.xpu.get_device_properties(i)
1335
+ total_memory = gpu_properties.total_memory / (1024**3) # in GB
1336
+ allocated_memory = torch.xpu.memory_allocated(i) / (1024**2) # in MB
1337
+ reserved_memory = torch.xpu.memory_reserved(i) / (1024**2) # in MB
1338
+
1339
+ gpu_stats += (
1340
+ f"GPU {i} Name: {gpu_name}\n"
1341
+ f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
1342
+ f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
1343
+ f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
1344
+ )
1345
+ elif torch.backends.mps.is_available():
1346
+ gpu_count = 1
1347
+ gpu_stats += "MPS GPU\n"
1348
+ total_memory = psutil.virtual_memory().total / (
1349
+ 1024**3
1350
+ ) # Total system memory (MPS doesn't have its own memory)
1351
+ allocated_memory = 0
1352
+ reserved_memory = 0
1353
+
1354
+ gpu_stats += (
1355
+ f"Total system memory: {total_memory:.2f} GB\n"
1356
+ f"Allocated GPU memory (MPS): {allocated_memory:.2f} MB\n"
1357
+ f"Reserved GPU memory (MPS): {reserved_memory:.2f} MB\n"
1358
+ )
1359
+
1360
+ else:
1361
+ gpu_stats = "No GPU available"
1362
+
1363
+ return gpu_stats
1364
+
1365
+
1366
def get_cpu_stats():
    """Return a three-line report: CPU usage, system memory use and this process's nice value.

    ``psutil.cpu_percent`` is called with ``interval=1``.
    """
    cpu_usage = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()
    bytes_per_mb = 1024**2
    used_mb = memory.used / bytes_per_mb
    total_mb = memory.total / bytes_per_mb

    nice_value = psutil.Process(os.getpid()).nice()

    report_lines = [
        f"CPU Usage: {cpu_usage:.2f}%",
        f"System Memory: {used_mb:.2f} MB used / {total_mb:.2f} MB total ({memory.percent}% used)",
        f"Process Priority (Nice value): {nice_value}",
    ]
    return "\n".join(report_lines)
1384
+
1385
+
1386
def get_combined_stats():
    """Return the GPU and CPU reports joined under markdown headings."""
    gpu_section = get_gpu_stats()
    cpu_section = get_cpu_stats()
    return f"### GPU Stats\n{gpu_section}\n\n### CPU Stats\n{cpu_section}"
1391
+
1392
+
1393
def get_audio_select(file_sample):
    """Return the ``_ref.wav`` and ``_gen.wav`` paths for a sample prefix.

    ``None`` is passed through as ``(None, None)``.
    """
    if file_sample is None:
        return file_sample, file_sample
    return file_sample + "_ref.wav", file_sample + "_gen.wav"
1402
+
1403
+
1404
+ with gr.Blocks() as app:
1405
+ gr.Markdown(
1406
+ """
1407
+ # E2/F5 TTS Automatic Finetune
1408
+
1409
+ This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
1410
+
1411
+ * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
1412
+ * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
1413
+
1414
+ The checkpoints support English and Chinese.
1415
+
1416
+ For tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussions/143)
1417
+ """
1418
+ )
1419
+
1420
+ with gr.Row():
1421
+ projects, projects_selelect = get_list_projects()
1422
+ tokenizer_type = gr.Radio(label="Tokenizer Type", choices=["pinyin", "char", "custom"], value="pinyin")
1423
+ project_name = gr.Textbox(label="Project Name", value="my_speak")
1424
+ bt_create = gr.Button("Create a New Project")
1425
+
1426
+ with gr.Row():
1427
+ cm_project = gr.Dropdown(
1428
+ choices=projects, value=projects_selelect, label="Project", allow_custom_value=True, scale=6
1429
+ )
1430
+ ch_refresh_project = gr.Button("Refresh", scale=1)
1431
+
1432
+ bt_create.click(fn=create_data_project, inputs=[project_name, tokenizer_type], outputs=[cm_project])
1433
+
1434
+ with gr.Tabs():
1435
+ with gr.TabItem("Transcribe Data"):
1436
+ gr.Markdown("""```plaintext
1437
+ Skip this step if you have your dataset, metadata.csv, and a folder wavs with all the audio files.
1438
+ ```""")
1439
+
1440
+ ch_manual = gr.Checkbox(label="Audio from Path", value=False)
1441
+
1442
+ mark_info_transcribe = gr.Markdown(
1443
+ """```plaintext
1444
+ Place your 'wavs' folder and 'metadata.csv' file in the '{your_project_name}' directory.
1445
+
1446
+ my_speak/
1447
+
1448
+ └── dataset/
1449
+ ├── audio1.wav
1450
+ └── audio2.wav
1451
+ ...
1452
+ ```""",
1453
+ visible=False,
1454
+ )
1455
+
1456
+ audio_speaker = gr.File(label="Voice", type="filepath", file_count="multiple")
1457
+ txt_lang = gr.Text(label="Language", value="English")
1458
+ bt_transcribe = bt_create = gr.Button("Transcribe")
1459
+ txt_info_transcribe = gr.Text(label="Info", value="")
1460
+ bt_transcribe.click(
1461
+ fn=transcribe_all,
1462
+ inputs=[cm_project, audio_speaker, txt_lang, ch_manual],
1463
+ outputs=[txt_info_transcribe],
1464
+ )
1465
+ ch_manual.change(fn=check_user, inputs=[ch_manual], outputs=[audio_speaker, mark_info_transcribe])
1466
+
1467
+ random_sample_transcribe = gr.Button("Random Sample")
1468
+
1469
+ with gr.Row():
1470
+ random_text_transcribe = gr.Text(label="Text")
1471
+ random_audio_transcribe = gr.Audio(label="Audio", type="filepath")
1472
+
1473
+ random_sample_transcribe.click(
1474
+ fn=get_random_sample_transcribe,
1475
+ inputs=[cm_project],
1476
+ outputs=[random_text_transcribe, random_audio_transcribe],
1477
+ )
1478
+
1479
+ with gr.TabItem("Vocab Check"):
1480
+ gr.Markdown("""```plaintext
1481
+ Check the vocabulary for fine-tuning Emilia_ZH_EN to ensure all symbols are included. For fine-tuning a new language.
1482
+ ```""")
1483
+
1484
+ check_button = gr.Button("Check Vocab")
1485
+ txt_info_check = gr.Text(label="Info", value="")
1486
+
1487
+ gr.Markdown("""```plaintext
1488
+ Using the extended model, you can finetune to a new language that is missing symbols in the vocab. This creates a new model with a new vocabulary size and saves it in your ckpts/project folder.
1489
+ ```""")
1490
+
1491
+ exp_name_extend = gr.Radio(label="Model", choices=["F5-TTS", "E2-TTS"], value="F5-TTS")
1492
+
1493
+ with gr.Row():
1494
+ txt_extend = gr.Textbox(
1495
+ label="Symbols",
1496
+ value="",
1497
+ placeholder="To add new symbols, make sure to use ',' for each symbol",
1498
+ scale=6,
1499
+ )
1500
+ txt_count_symbol = gr.Textbox(label="New Vocab Size", value="", scale=1)
1501
+
1502
+ extend_button = gr.Button("Extend")
1503
+ txt_info_extend = gr.Text(label="Info", value="")
1504
+
1505
+ txt_extend.change(vocab_count, inputs=[txt_extend], outputs=[txt_count_symbol])
1506
+ check_button.click(fn=vocab_check, inputs=[cm_project], outputs=[txt_info_check, txt_extend])
1507
+ extend_button.click(
1508
+ fn=vocab_extend, inputs=[cm_project, txt_extend, exp_name_extend], outputs=[txt_info_extend]
1509
+ )
1510
+
1511
+ with gr.TabItem("Prepare Data"):
1512
+ gr.Markdown("""```plaintext
1513
+ Skip this step if you have your dataset, raw.arrow, duration.json, and vocab.txt
1514
+ ```""")
1515
+
1516
+ gr.Markdown(
1517
+ """```plaintext
1518
+ Place all your "wavs" folder and your "metadata.csv" file in your project name directory.
1519
+
1520
+ Supported audio formats: "wav", "mp3", "aac", "flac", "m4a", "alac", "ogg", "aiff", "wma", "amr"
1521
+
1522
+ Example wav format:
1523
+ my_speak/
1524
+
1525
+ ├── wavs/
1526
+ │ ├── audio1.wav
1527
+ │ └── audio2.wav
1528
+ | ...
1529
+
1530
+ └── metadata.csv
1531
+
1532
+ File format metadata.csv:
1533
+
1534
+ audio1|text1 or audio1.wav|text1 or your_path/audio1.wav|text1
1535
+ audio2|text1 or audio2.wav|text1 or your_path/audio2.wav|text1
1536
+ ...
1537
+
1538
+ ```"""
1539
+ )
1540
+ ch_tokenizern = gr.Checkbox(label="Create Vocabulary", value=False, visible=False)
1541
+
1542
+ bt_prepare = bt_create = gr.Button("Prepare")
1543
+ txt_info_prepare = gr.Text(label="Info", value="")
1544
+ txt_vocab_prepare = gr.Text(label="Vocab", value="")
1545
+
1546
+ bt_prepare.click(
1547
+ fn=create_metadata, inputs=[cm_project, ch_tokenizern], outputs=[txt_info_prepare, txt_vocab_prepare]
1548
+ )
1549
+
1550
+ random_sample_prepare = gr.Button("Random Sample")
1551
+
1552
+ with gr.Row():
1553
+ random_text_prepare = gr.Text(label="Tokenizer")
1554
+ random_audio_prepare = gr.Audio(label="Audio", type="filepath")
1555
+
1556
+ random_sample_prepare.click(
1557
+ fn=get_random_sample_prepare, inputs=[cm_project], outputs=[random_text_prepare, random_audio_prepare]
1558
+ )
1559
+
1560
+ with gr.TabItem("Train Data"):
1561
+ gr.Markdown("""```plaintext
1562
+ The auto-setting is still experimental. Please make sure that the epochs, save per updates, and last per updates are set correctly, or change them manually as needed.
1563
+ If you encounter a memory error, try reducing the batch size per GPU to a smaller number.
1564
+ ```""")
1565
+ with gr.Row():
1566
+ bt_calculate = bt_create = gr.Button("Auto Settings")
1567
+ lb_samples = gr.Label(label="Samples")
1568
+ batch_size_type = gr.Radio(label="Batch Size Type", choices=["frame", "sample"], value="frame")
1569
+
1570
+ with gr.Row():
1571
+ ch_finetune = bt_create = gr.Checkbox(label="Finetune", value=True)
1572
+ tokenizer_file = gr.Textbox(label="Tokenizer File", value="")
1573
+ file_checkpoint_train = gr.Textbox(label="Path to the Pretrained Checkpoint", value="")
1574
+
1575
+ with gr.Row():
1576
+ exp_name = gr.Radio(label="Model", choices=["F5TTS_Base", "E2TTS_Base"], value="F5TTS_Base")
1577
+ learning_rate = gr.Number(label="Learning Rate", value=1e-5, step=1e-5)
1578
+
1579
+ with gr.Row():
1580
+ batch_size_per_gpu = gr.Number(label="Batch Size per GPU", value=1000)
1581
+ max_samples = gr.Number(label="Max Samples", value=64)
1582
+
1583
+ with gr.Row():
1584
+ grad_accumulation_steps = gr.Number(label="Gradient Accumulation Steps", value=1)
1585
+ max_grad_norm = gr.Number(label="Max Gradient Norm", value=1.0)
1586
+
1587
+ with gr.Row():
1588
+ epochs = gr.Number(label="Epochs", value=10)
1589
+ num_warmup_updates = gr.Number(label="Warmup Updates", value=2)
1590
+
1591
+ with gr.Row():
1592
+ save_per_updates = gr.Number(label="Save per Updates", value=300)
1593
+ keep_last_n_checkpoints = gr.Number(
1594
+ label="Keep Last N Checkpoints",
1595
+ value=-1,
1596
+ step=1,
1597
+ precision=0,
1598
+ info="-1: Keep all checkpoints, 0: Only save final model_last.pt, N>0: Keep last N checkpoints",
1599
+ )
1600
+ last_per_updates = gr.Number(label="Last per Updates", value=100)
1601
+
1602
+ with gr.Row():
1603
+ ch_8bit_adam = gr.Checkbox(label="Use 8-bit Adam optimizer")
1604
+ mixed_precision = gr.Radio(label="mixed_precision", choices=["none", "fp16", "bf16"], value="none")
1605
+ cd_logger = gr.Radio(label="logger", choices=["wandb", "tensorboard"], value="wandb")
1606
+ start_button = gr.Button("Start Training")
1607
+ stop_button = gr.Button("Stop Training", interactive=False)
1608
+
1609
+ if projects_selelect is not None:
1610
+ (
1611
+ exp_name_value,
1612
+ learning_rate_value,
1613
+ batch_size_per_gpu_value,
1614
+ batch_size_type_value,
1615
+ max_samples_value,
1616
+ grad_accumulation_steps_value,
1617
+ max_grad_norm_value,
1618
+ epochs_value,
1619
+ num_warmup_updates_value,
1620
+ save_per_updates_value,
1621
+ keep_last_n_checkpoints_value,
1622
+ last_per_updates_value,
1623
+ finetune_value,
1624
+ file_checkpoint_train_value,
1625
+ tokenizer_type_value,
1626
+ tokenizer_file_value,
1627
+ mixed_precision_value,
1628
+ logger_value,
1629
+ bnb_optimizer_value,
1630
+ ) = load_settings(projects_selelect)
1631
+
1632
+ # Assigning values to the respective components
1633
+ exp_name.value = exp_name_value
1634
+ learning_rate.value = learning_rate_value
1635
+ batch_size_per_gpu.value = batch_size_per_gpu_value
1636
+ batch_size_type.value = batch_size_type_value
1637
+ max_samples.value = max_samples_value
1638
+ grad_accumulation_steps.value = grad_accumulation_steps_value
1639
+ max_grad_norm.value = max_grad_norm_value
1640
+ epochs.value = epochs_value
1641
+ num_warmup_updates.value = num_warmup_updates_value
1642
+ save_per_updates.value = save_per_updates_value
1643
+ keep_last_n_checkpoints.value = keep_last_n_checkpoints_value
1644
+ last_per_updates.value = last_per_updates_value
1645
+ ch_finetune.value = finetune_value
1646
+ file_checkpoint_train.value = file_checkpoint_train_value
1647
+ tokenizer_type.value = tokenizer_type_value
1648
+ tokenizer_file.value = tokenizer_file_value
1649
+ mixed_precision.value = mixed_precision_value
1650
+ cd_logger.value = logger_value
1651
+ ch_8bit_adam.value = bnb_optimizer_value
1652
+
1653
+ ch_stream = gr.Checkbox(label="Stream Output Experiment", value=True)
1654
+ txt_info_train = gr.Text(label="Info", value="")
1655
+
1656
+ list_audios, select_audio = get_audio_project(projects_selelect, False)
1657
+
1658
+ select_audio_ref = select_audio
1659
+ select_audio_gen = select_audio
1660
+
1661
+ if select_audio is not None:
1662
+ select_audio_ref += "_ref.wav"
1663
+ select_audio_gen += "_gen.wav"
1664
+
1665
+ with gr.Row():
1666
+ ch_list_audio = gr.Dropdown(
1667
+ choices=list_audios,
1668
+ value=select_audio,
1669
+ label="Audios",
1670
+ allow_custom_value=True,
1671
+ scale=6,
1672
+ interactive=True,
1673
+ )
1674
+ bt_stream_audio = gr.Button("Refresh", scale=1)
1675
+ bt_stream_audio.click(fn=get_audio_project, inputs=[cm_project], outputs=[ch_list_audio])
1676
+ cm_project.change(fn=get_audio_project, inputs=[cm_project], outputs=[ch_list_audio])
1677
+
1678
+ with gr.Row():
1679
+ audio_ref_stream = gr.Audio(label="Original", type="filepath", value=select_audio_ref)
1680
+ audio_gen_stream = gr.Audio(label="Generate", type="filepath", value=select_audio_gen)
1681
+
1682
+ ch_list_audio.change(
1683
+ fn=get_audio_select,
1684
+ inputs=[ch_list_audio],
1685
+ outputs=[audio_ref_stream, audio_gen_stream],
1686
+ )
1687
+
1688
+ start_button.click(
1689
+ fn=start_training,
1690
+ inputs=[
1691
+ cm_project,
1692
+ exp_name,
1693
+ learning_rate,
1694
+ batch_size_per_gpu,
1695
+ batch_size_type,
1696
+ max_samples,
1697
+ grad_accumulation_steps,
1698
+ max_grad_norm,
1699
+ epochs,
1700
+ num_warmup_updates,
1701
+ save_per_updates,
1702
+ keep_last_n_checkpoints,
1703
+ last_per_updates,
1704
+ ch_finetune,
1705
+ file_checkpoint_train,
1706
+ tokenizer_type,
1707
+ tokenizer_file,
1708
+ mixed_precision,
1709
+ ch_stream,
1710
+ cd_logger,
1711
+ ch_8bit_adam,
1712
+ ],
1713
+ outputs=[txt_info_train, start_button, stop_button],
1714
+ )
1715
+ stop_button.click(fn=stop_training, outputs=[txt_info_train, start_button, stop_button])
1716
+
1717
+ bt_calculate.click(
1718
+ fn=calculate_train,
1719
+ inputs=[
1720
+ cm_project,
1721
+ batch_size_type,
1722
+ max_samples,
1723
+ learning_rate,
1724
+ num_warmup_updates,
1725
+ save_per_updates,
1726
+ last_per_updates,
1727
+ ch_finetune,
1728
+ ],
1729
+ outputs=[
1730
+ batch_size_per_gpu,
1731
+ max_samples,
1732
+ num_warmup_updates,
1733
+ save_per_updates,
1734
+ last_per_updates,
1735
+ lb_samples,
1736
+ learning_rate,
1737
+ epochs,
1738
+ ],
1739
+ )
1740
+
1741
+ ch_finetune.change(
1742
+ check_finetune, inputs=[ch_finetune], outputs=[file_checkpoint_train, tokenizer_file, tokenizer_type]
1743
+ )
1744
+
1745
def setup_load_settings():
    """Return the training-tab components that receive loaded settings.

    The order mirrors the tuple unpacked from ``load_settings`` earlier in
    this UI definition (model name first, 8-bit Adam flag last), so the list
    can be passed directly as ``outputs=`` for events whose ``fn`` is
    ``load_settings``.
    """
    return [
        # model / optimisation basics
        exp_name,
        learning_rate,
        batch_size_per_gpu,
        batch_size_type,
        max_samples,
        grad_accumulation_steps,
        max_grad_norm,
        epochs,
        num_warmup_updates,
        # checkpointing cadence
        save_per_updates,
        keep_last_n_checkpoints,
        last_per_updates,
        # finetune / tokenizer inputs
        ch_finetune,
        file_checkpoint_train,
        tokenizer_type,
        tokenizer_file,
        # runtime options
        mixed_precision,
        cd_logger,
        ch_8bit_adam,
    ]
1768
+
1769
+ outputs = setup_load_settings()
1770
+
1771
+ cm_project.change(
1772
+ fn=load_settings,
1773
+ inputs=[cm_project],
1774
+ outputs=outputs,
1775
+ )
1776
+
1777
+ ch_refresh_project.click(
1778
+ fn=load_settings,
1779
+ inputs=[cm_project],
1780
+ outputs=outputs,
1781
+ )
1782
+
1783
+ with gr.TabItem("Test Model"):
1784
+ gr.Markdown("""```plaintext
1785
+ SOS: Check the use_ema setting (True or False) for your model to see what works best for you. use seed -1 from random
1786
+ ```""")
1787
+ exp_name = gr.Radio(label="Model", choices=["F5-TTS", "E2-TTS"], value="F5-TTS")
1788
+ list_checkpoints, checkpoint_select = get_checkpoints_project(projects_selelect, False)
1789
+
1790
+ with gr.Row():
1791
+ nfe_step = gr.Number(label="NFE Step", value=32)
1792
+ speed = gr.Slider(label="Speed", value=1.0, minimum=0.3, maximum=2.0, step=0.1)
1793
+ seed = gr.Number(label="Seed", value=-1, minimum=-1)
1794
+ remove_silence = gr.Checkbox(label="Remove Silence")
1795
+
1796
+ ch_use_ema = gr.Checkbox(label="Use EMA", value=True)
1797
+ with gr.Row():
1798
+ cm_checkpoint = gr.Dropdown(
1799
+ choices=list_checkpoints, value=checkpoint_select, label="Checkpoints", allow_custom_value=True
1800
+ )
1801
+ bt_checkpoint_refresh = gr.Button("Refresh")
1802
+
1803
+ random_sample_infer = gr.Button("Random Sample")
1804
+
1805
+ ref_text = gr.Textbox(label="Ref Text")
1806
+ ref_audio = gr.Audio(label="Audio Ref", type="filepath")
1807
+ gen_text = gr.Textbox(label="Gen Text")
1808
+
1809
+ random_sample_infer.click(
1810
+ fn=get_random_sample_infer, inputs=[cm_project], outputs=[ref_text, gen_text, ref_audio]
1811
+ )
1812
+
1813
+ with gr.Row():
1814
+ txt_info_gpu = gr.Textbox("", label="Device")
1815
+ seed_info = gr.Text(label="Seed :")
1816
+ check_button_infer = gr.Button("Infer")
1817
+
1818
+ gen_audio = gr.Audio(label="Audio Gen", type="filepath")
1819
+
1820
+ check_button_infer.click(
1821
+ fn=infer,
1822
+ inputs=[
1823
+ cm_project,
1824
+ cm_checkpoint,
1825
+ exp_name,
1826
+ ref_text,
1827
+ ref_audio,
1828
+ gen_text,
1829
+ nfe_step,
1830
+ ch_use_ema,
1831
+ speed,
1832
+ seed,
1833
+ remove_silence,
1834
+ ],
1835
+ outputs=[gen_audio, txt_info_gpu, seed_info],
1836
+ )
1837
+
1838
+ bt_checkpoint_refresh.click(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
1839
+ cm_project.change(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
1840
+
1841
+ with gr.TabItem("Reduce Checkpoint"):
1842
+ gr.Markdown("""```plaintext
1843
+ Reduce the model size from 5GB to 1.3GB. The new checkpoint can be used for inference or fine-tuning afterward, but it cannot be used to continue training.
1844
+ ```""")
1845
+ txt_path_checkpoint = gr.Text(label="Path to Checkpoint:")
1846
+ txt_path_checkpoint_small = gr.Text(label="Path to Output:")
1847
+ ch_safetensors = gr.Checkbox(label="Safetensors", value="")
1848
+ txt_info_reduse = gr.Text(label="Info", value="")
1849
+ reduse_button = gr.Button("Reduce")
1850
+ reduse_button.click(
1851
+ fn=extract_and_save_ema_model,
1852
+ inputs=[txt_path_checkpoint, txt_path_checkpoint_small, ch_safetensors],
1853
+ outputs=[txt_info_reduse],
1854
+ )
1855
+
1856
+ with gr.TabItem("System Info"):
1857
+ output_box = gr.Textbox(label="GPU and CPU Information", lines=20)
1858
+
1859
def update_stats():
    """Return whatever ``get_combined_stats()`` currently reports."""
    stats = get_combined_stats()
    return stats
1861
+
1862
+ update_button = gr.Button("Update Stats")
1863
+ update_button.click(fn=update_stats, outputs=output_box)
1864
+
1865
def auto_update():
    """Yield a single Gradio update carrying the latest stats.

    NOTE(review): the only use visible nearby passes this generator to
    ``gr.update(fn=auto_update, ...)``; confirm that call actually registers
    it as an event handler, otherwise this function is never invoked.
    """
    latest = update_stats()
    yield gr.update(value=latest)
1867
+
1868
+ gr.update(fn=auto_update, inputs=[], outputs=output_box)
1869
+
1870
+
1871
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
@click.option(
    "--api/--no-api",
    "-a",
    default=True,
    help="Allow API access (pass --no-api to disable)",
)
def main(port, host, share, api):
    """Queue and launch the Gradio app from the command line.

    Args:
        port: Server port forwarded to ``launch(server_port=...)``; ``None``
            leaves the choice to Gradio.
        host: Server name forwarded to ``launch(server_name=...)``; ``None``
            leaves the choice to Gradio.
        share: Forwarded to ``launch(share=...)``.
        api: Forwarded both to ``queue(api_open=...)`` and
            ``launch(show_api=...)``.

    The ``--api`` option used to be a lone flag whose default was already
    ``True``, so there was no spelling that matched the help text and turned
    API access off. It is now an explicit on/off pair; ``--api`` and ``-a``
    still parse and mean "enabled".
    """
    # `app` is the module-level Blocks instance; it is only read here, so no
    # `global` declaration is needed.
    print("Starting app...")
    app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
1886
+
1887
+
1888
+ if __name__ == "__main__":
1889
+ main()
src/f5_tts/train/train.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training script.
2
+
3
+ import os
4
+ from importlib.resources import files
5
+
6
+ import hydra
7
+
8
+ from f5_tts.model import CFM, DiT, Trainer, UNetT
9
+ from f5_tts.model.dataset import load_dataset
10
+ from f5_tts.model.utils import get_tokenizer
11
+
12
+ os.chdir(str(files("f5_tts").joinpath("../.."))) # change working directory to root of project (local editable)
13
+
14
+
15
@hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
def main(cfg):
    """Build a CFM model from the Hydra config and run training.

    Args:
        cfg: Hydra configuration. This function reads the ``model``,
            ``datasets``, ``optim`` and ``ckpts`` sections.

    Raises:
        ValueError: If ``cfg.model.name`` contains neither ``"F5TTS"`` nor
            ``"E2TTS"``, so no backbone class can be selected.
    """
    tokenizer = cfg.model.tokenizer
    mel_spec_type = cfg.model.mel_spec.mel_spec_type
    exp_name = f"{cfg.model.name}_{mel_spec_type}_{cfg.model.tokenizer}_{cfg.datasets.name}"

    # set text tokenizer
    if tokenizer != "custom":
        tokenizer_path = cfg.datasets.name
    else:
        tokenizer_path = cfg.model.tokenizer_path
    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

    # set model
    if "F5TTS" in cfg.model.name:
        model_cls = DiT
    elif "E2TTS" in cfg.model.name:
        model_cls = UNetT
    else:
        # Fail here with a clear message instead of an UnboundLocalError
        # when `model_cls` is first used below.
        raise ValueError(
            f"Unsupported model name {cfg.model.name!r}: expected it to contain 'F5TTS' or 'E2TTS'"
        )
    wandb_resume_id = None

    model = CFM(
        transformer=model_cls(**cfg.model.arch, text_num_embeds=vocab_size, mel_dim=cfg.model.mel_spec.n_mel_channels),
        mel_spec_kwargs=cfg.model.mel_spec,
        vocab_char_map=vocab_char_map,
    )

    # init trainer
    trainer = Trainer(
        model,
        epochs=cfg.optim.epochs,
        learning_rate=cfg.optim.learning_rate,
        num_warmup_updates=cfg.optim.num_warmup_updates,
        save_per_updates=cfg.ckpts.save_per_updates,
        # older configs may not define this key; -1 is the fallback value
        keep_last_n_checkpoints=getattr(cfg.ckpts, "keep_last_n_checkpoints", -1),
        checkpoint_path=str(files("f5_tts").joinpath(f"../../{cfg.ckpts.save_dir}")),
        batch_size=cfg.datasets.batch_size_per_gpu,
        batch_size_type=cfg.datasets.batch_size_type,
        max_samples=cfg.datasets.max_samples,
        grad_accumulation_steps=cfg.optim.grad_accumulation_steps,
        max_grad_norm=cfg.optim.max_grad_norm,
        logger=cfg.ckpts.logger,
        wandb_project="CFM-TTS",
        wandb_run_name=exp_name,
        wandb_resume_id=wandb_resume_id,
        last_per_updates=cfg.ckpts.last_per_updates,
        log_samples=True,
        bnb_optimizer=cfg.optim.bnb_optimizer,
        mel_spec_type=mel_spec_type,
        is_local_vocoder=cfg.model.vocoder.is_local,
        local_vocoder_path=cfg.model.vocoder.local_path,
    )

    train_dataset = load_dataset(cfg.datasets.name, tokenizer, mel_spec_kwargs=cfg.model.mel_spec)
    trainer.train(
        train_dataset,
        num_workers=cfg.datasets.num_workers,
        resumable_with_seed=666,  # seed for shuffling dataset
    )
+
74
+
75
+ if __name__ == "__main__":
76
+ main()