Luka512 committed on
Commit
54a22a0
·
verified ·
1 Parent(s): fc71f8d

Upload folder using huggingface_hub

Browse files
cosyvoice2.yaml CHANGED
@@ -21,14 +21,15 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size,
21
  # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
22
  # for system/third_party class/function, we do not require this.
23
  llm: !new:cosyvoice.llm.llm.Qwen2LM
24
- llm_input_size: !ref <llm_input_size>
25
- llm_output_size: !ref <llm_output_size>
 
26
  speech_token_size: 6561
27
  length_normalized_loss: True
28
  lsm_weight: 0
29
  mix_ratio: [5, 15]
30
- llm: !new:cosyvoice.llm.llm.Qwen2Encoder
31
- pretrain_path: !ref <qwen_pretrain_path>
32
  sampling: !name:cosyvoice.utils.common.ras_sampling
33
  top_p: 0.8
34
  top_k: 25
@@ -134,6 +135,7 @@ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
134
  get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
135
  token_path: !ref <qwen_pretrain_path>
136
  skip_special_tokens: True
 
137
  allowed_special: 'all'
138
  tokenize: !name:cosyvoice.dataset.processor.tokenize
139
  get_tokenizer: !ref <get_tokenizer>
@@ -141,7 +143,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
141
  filter: !name:cosyvoice.dataset.processor.filter
142
  max_length: 40960
143
  min_length: 100
144
- token_max_length: 200
145
  token_min_length: 1
146
  resample: !name:cosyvoice.dataset.processor.resample
147
  resample_rate: !ref <sample_rate>
@@ -170,9 +172,9 @@ sort: !name:cosyvoice.dataset.processor.sort
170
  sort_size: 500 # sort_size should be less than shuffle_size
171
  batch: !name:cosyvoice.dataset.processor.batch
172
  batch_type: 'dynamic'
173
- max_frames_in_batch: 4000
174
  padding: !name:cosyvoice.dataset.processor.padding
175
- use_spk_embedding: False # change to True during sft
176
 
177
 
178
  # dataset processor pipeline
@@ -205,15 +207,16 @@ data_pipeline_gan: [
205
 
206
  # llm flow train conf
207
  train_conf:
208
- optim: adam
209
  optim_conf:
210
  lr: 1e-5 # change to 1e-5 during sft
 
211
  scheduler: constantlr # change to constantlr during sft
212
  scheduler_conf:
213
  warmup_steps: 2500
214
- max_epoch: 200 # 200
215
  grad_clip: 5
216
- accum_grad: 2
217
  log_interval: 100
218
  save_per_step: 3000 # -1 this is where you can set the step-wise validation checkpoint, -1 means no step-wise validation checkpoint
219
 
@@ -227,7 +230,7 @@ train_conf_gan:
227
  optim_conf_d:
228
  lr: 0.0002 # use small lr for gan training
229
  scheduler_d: constantlr
230
- max_epoch: 200
231
  grad_clip: 5
232
  accum_grad: 1 # in gan training, accum_grad must be 1
233
  log_interval: 100
 
21
  # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
22
  # for system/third_party class/function, we do not require this.
23
  llm: !new:cosyvoice.llm.llm.Qwen2LM
24
+ # llm_input_size/llm_output_size are auto-set from backbone.hidden_size, so the values given here (0) are placeholders and are ignored
25
+ llm_input_size: 0
26
+ llm_output_size: 0
27
  speech_token_size: 6561
28
  length_normalized_loss: True
29
  lsm_weight: 0
30
  mix_ratio: [5, 15]
31
+ llm: !new:cosyvoice.llm.llm.HFBackbone # backbone-agnostic
32
+ pretrain_path: !ref <qwen_pretrain_path> # e.g., "Qwen/Qwen3-0.6B" or "mistralai/Mistral-7B-Instruct-v0.3"
33
  sampling: !name:cosyvoice.utils.common.ras_sampling
34
  top_p: 0.8
35
  top_k: 25
 
135
  get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
136
  token_path: !ref <qwen_pretrain_path>
137
  skip_special_tokens: True
138
+ # add_additional_specials: auto-detected based on token_path (True for blanken/CosyVoice models, False for custom HF backbones)
139
  allowed_special: 'all'
140
  tokenize: !name:cosyvoice.dataset.processor.tokenize
141
  get_tokenizer: !ref <get_tokenizer>
 
143
  filter: !name:cosyvoice.dataset.processor.filter
144
  max_length: 40960
145
  min_length: 100
146
+ token_max_length: 512 # raised from 200; TODO: confirm that longer token sequences are safe to train on
147
  token_min_length: 1
148
  resample: !name:cosyvoice.dataset.processor.resample
149
  resample_rate: !ref <sample_rate>
 
172
  sort_size: 500 # sort_size should be less than shuffle_size
173
  batch: !name:cosyvoice.dataset.processor.batch
174
  batch_type: 'dynamic'
175
+ max_frames_in_batch: 3000
176
  padding: !name:cosyvoice.dataset.processor.padding
177
+ use_spk_embedding: True # change to True during sft
178
 
179
 
180
  # dataset processor pipeline
 
207
 
208
  # llm flow train conf
209
  train_conf:
210
+ optim: adamw
211
  optim_conf:
212
  lr: 1e-5 # change to 1e-5 during sft
213
+ # weight_decay: 0.01
214
  scheduler: constantlr # change to constantlr during sft
215
  scheduler_conf:
216
  warmup_steps: 2500
217
+ max_epoch: 30 # 200
218
  grad_clip: 5
219
+ accum_grad: 4
220
  log_interval: 100
221
  save_per_step: 3000 # -1 this is where you can set the step-wise validation checkpoint, -1 means no step-wise validation checkpoint
222
 
 
230
  optim_conf_d:
231
  lr: 0.0002 # use small lr for gan training
232
  scheduler_d: constantlr
233
+ max_epoch: 20
234
  grad_clip: 5
235
  accum_grad: 1 # in gan training, accum_grad must be 1
236
  log_interval: 100
flow.decoder.estimator.fp32.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9dd1d177ed4a6506fa422a966682c54c3263e35aa338b3ac291ce8fabe7dc51
3
  size 286312346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83b402a64ac811ea48511431bd2c6151e66c021555ab1b96e547c22d4b1873c5
3
  size 286312346
flow.encoder.fp16.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ea3f0f03c5b644dacfa4924fe7056b2b696b6d2c28bcc8911aa642c468f2024
3
  size 116706755
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b574c286d17194656a139a7b630da7b8f45e9bc44450fa86ff92500794c8d881
3
  size 116706755
flow.encoder.fp32.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2aee0fcb8a16ab1f576d2c74629c06582acf83862aa32348d69dd3dfe5f046ee
3
  size 192369091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d66a11ce71cc206c1977630f8dcf89839274955e1bd8bf90aeeac781d16a96e
3
  size 192369091
flow.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d700e8c2f726530423875ebeadcaa09bada08878f38e087ca04c55032403834f
3
  size 450569991
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa7ff413cf98ffd7b8532761cdcf4f9a9da17375e0b4a66020c4448ddffc4368
3
  size 450569991
llm.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bc7ff1b5379c592982455777c1ac823c3e6ddefb53e2f0143d352c338f6de7e
3
  size 2567851323
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:647add6974a326d679083d92b300acb75dd49bfa9c815e524faaa2253342883d
3
  size 2567851323