Upload folder using huggingface_hub
- cosyvoice2.yaml +14 -11
- flow.decoder.estimator.fp32.onnx +1 -1
- flow.encoder.fp16.zip +1 -1
- flow.encoder.fp32.zip +1 -1
- flow.pt +1 -1
- llm.pt +1 -1
cosyvoice2.yaml
CHANGED
@@ -21,14 +21,15 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size,
 # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
 # for system/third_party class/function, we do not require this.
 llm: !new:cosyvoice.llm.llm.Qwen2LM
-    llm_input_size: 896
-    llm_output_size: 896
+    # llm_input_size/llm_output_size will be auto-set from backbone.hidden_size, so 896 here is ignored
+    llm_input_size: 0
+    llm_output_size: 0
     speech_token_size: 6561
     length_normalized_loss: True
     lsm_weight: 0
     mix_ratio: [5, 15]
-    llm: !new:cosyvoice.llm.llm.Qwen2Encoder
-        pretrain_path: !ref <qwen_pretrain_path>
+    llm: !new:cosyvoice.llm.llm.HFBackbone # backbone-agnostic
+        pretrain_path: !ref <qwen_pretrain_path> # e.g., "Qwen/Qwen3-0.6B" or "mistralai/Mistral-7B-Instruct-v0.3"
     sampling: !name:cosyvoice.utils.common.ras_sampling
         top_p: 0.8
         top_k: 25
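The HFBackbone class referenced here is not part of upstream CosyVoice, which ships Qwen2Encoder at this spot; a minimal sketch of what such a backbone-agnostic wrapper could look like, assuming it mirrors Qwen2Encoder's role and exposes hidden_size so that Qwen2LM can override the zeroed llm_input_size/llm_output_size:

# Hypothetical sketch only: HFBackbone, its constructor, and its forward
# signature are assumptions modeled on cosyvoice.llm.llm.Qwen2Encoder,
# not upstream CosyVoice API.
import torch
from transformers import AutoModelForCausalLM

class HFBackbone(torch.nn.Module):
    def __init__(self, pretrain_path: str):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(pretrain_path)
        # Any HF causal LM exposes hidden_size in its config; presumably this
        # is what lets the yaml leave llm_input_size/llm_output_size at 0.
        self.hidden_size = self.model.config.hidden_size

    def forward(self, xs: torch.Tensor, masks: torch.Tensor):
        # Dense embeddings in, last hidden states out, as Qwen2Encoder does.
        outs = self.model(
            inputs_embeds=xs,
            attention_mask=masks,
            output_hidden_states=True,
            return_dict=True,
        )
        return outs.hidden_states[-1], masks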
@@ -134,6 +135,7 @@ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
     token_path: !ref <qwen_pretrain_path>
     skip_special_tokens: True
+    # add_additional_specials: auto-detected based on token_path (True for blanken/CosyVoice models, False for custom HF backbones)
     allowed_special: 'all'
 tokenize: !name:cosyvoice.dataset.processor.tokenize
     get_tokenizer: !ref <get_tokenizer>
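The add_additional_specials comment suggests the tokenizer factory now decides from token_path whether to register CosyVoice's extra special tokens. A rough guess at the detection logic (the real get_qwen_tokenizer signature and the special-token list may differ):

# Assumed auto-detection, inferred from the yaml comment above; the actual
# implementation in cosyvoice.tokenizer.tokenizer may differ.
from transformers import AutoTokenizer

def get_qwen_tokenizer(token_path, skip_special_tokens=True, allowed_special='all'):
    tokenizer = AutoTokenizer.from_pretrained(token_path)
    # Heuristic from the comment: CosyVoice/blanken checkpoints need the extra
    # specials registered; stock HF backbones are left untouched.
    if any(k in token_path.lower() for k in ('cosyvoice', 'blanken')):
        tokenizer.add_special_tokens(
            {'additional_special_tokens': ['<|endofprompt|>']}  # example special
        )
    return tokenizer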
@@ -141,7 +143,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
 filter: !name:cosyvoice.dataset.processor.filter
     max_length: 40960
     min_length: 100
-    token_max_length: 200
+    token_max_length: 512 # not sure if this can just be changed?
     token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
     resample_rate: !ref <sample_rate>
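On the "not sure if this can just be changed?" question: in the upstream filter processor, token_max_length only gates which samples survive filtering, so raising it admits longer transcripts (at the cost of longer sequences downstream) rather than touching the model. Roughly:

# Approximate shape of cosyvoice.dataset.processor.filter; the exact field
# names and frame computation are from memory and may differ.
def filter(data, max_length=40960, min_length=100,
           token_max_length=512, token_min_length=1):
    for sample in data:
        num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
        if not (min_length < num_frames < max_length):
            continue
        if not (token_min_length < len(sample['text_token']) < token_max_length):
            continue
        yield sample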
@@ -170,9 +172,9 @@ sort: !name:cosyvoice.dataset.processor.sort
     sort_size: 500 # sort_size should be less than shuffle_size
 batch: !name:cosyvoice.dataset.processor.batch
     batch_type: 'dynamic'
-    max_frames_in_batch: 2000
+    max_frames_in_batch: 3000
 padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
+    use_spk_embedding: True # change to True during sft
 
 
 # dataset processor pipeline
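max_frames_in_batch controls dynamic batching: utterances are packed until the padded frame budget is hit, so 2000 -> 3000 trades GPU memory for larger effective batches. The usual wenet-style packing, sketched (field names are assumptions):

# Sketch of the 'dynamic' batch_type logic.
def dynamic_batch(data, max_frames_in_batch=3000):
    buf, longest = [], 0
    for sample in data:
        longest = max(longest, sample['speech_feat'].size(0))
        # A batch costs its padded size: longest utterance times batch count.
        if longest * (len(buf) + 1) > max_frames_in_batch:
            yield buf
            buf, longest = [sample], sample['speech_feat'].size(0)
        else:
            buf.append(sample)
    if buf:
        yield buf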
@@ -205,15 +207,16 @@ data_pipeline_gan: [
 
 # llm flow train conf
 train_conf:
-    optim: adam
+    optim: adamw
     optim_conf:
         lr: 1e-5 # change to 1e-5 during sft
+        # weight_decay: 0.01
     scheduler: constantlr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
-    max_epoch: 200
+    max_epoch: 30 # 200
     grad_clip: 5
-    accum_grad: 2
+    accum_grad: 4
     log_interval: 100
     save_per_step: 3000 # -1 this is where you can set the step-wise validation checkpoint, -1 means no step-wise validation checkpoint
 
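The optim string presumably maps straight onto a torch optimizer, so adam -> adamw switches to decoupled weight decay (hence the commented-out weight_decay: 0.01), and accum_grad: 4 quadruples the effective batch without extra memory. A sketch of how these keys are typically consumed (not the exact CosyVoice executor code):

import torch

def build_optimizer(model, optim='adamw', optim_conf=None):
    optim_conf = optim_conf or {'lr': 1e-5}
    cls = {'adam': torch.optim.Adam, 'adamw': torch.optim.AdamW}[optim]
    return cls(model.parameters(), **optim_conf)

def train_epoch(model, optimizer, batches, accum_grad=4, grad_clip=5):
    optimizer.zero_grad()
    for i, batch in enumerate(batches):
        loss = model(batch)['loss'] / accum_grad  # average across micro-batches
        loss.backward()
        if (i + 1) % accum_grad == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()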
@@ -227,7 +230,7 @@ train_conf_gan:
     optim_conf_d:
         lr: 0.0002 # use small lr for gan training
     scheduler_d: constantlr
-    max_epoch: 200
+    max_epoch: 20
     grad_clip: 5
     accum_grad: 1 # in gan training, accum_grad must be 1
     log_interval: 100
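The "accum_grad must be 1" constraint follows from the alternating update schedule: each step updates the discriminator and then the generator against the freshly updated discriminator, so accumulating across steps would mix gradients computed against different discriminator states. Schematically (the losses here are illustrative LSGAN terms, not CosyVoice's exact objectives):

import torch

def gan_step(generator, discriminator, optim_g, optim_d, batch):
    fake = generator(batch['mel'])
    real = batch['audio']
    # Discriminator first, on detached fakes so no generator grads flow.
    optim_d.zero_grad()
    d_loss = torch.mean((1 - discriminator(real)) ** 2) \
           + torch.mean(discriminator(fake.detach()) ** 2)
    d_loss.backward()
    optim_d.step()
    # Generator immediately after, against the updated discriminator.
    optim_g.zero_grad()
    g_loss = torch.mean((1 - discriminator(fake)) ** 2)
    g_loss.backward()
    optim_g.step()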
flow.decoder.estimator.fp32.onnx
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:83b402a64ac811ea48511431bd2c6151e66c021555ab1b96e547c22d4b1873c5
 size 286312346
flow.encoder.fp16.zip
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b574c286d17194656a139a7b630da7b8f45e9bc44450fa86ff92500794c8d881
 size 116706755
flow.encoder.fp32.zip
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0d66a11ce71cc206c1977630f8dcf89839274955e1bd8bf90aeeac781d16a96e
 size 192369091
flow.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:aa7ff413cf98ffd7b8532761cdcf4f9a9da17375e0b4a66020c4448ddffc4368
 size 450569991
llm.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:647add6974a326d679083d92b300acb75dd49bfa9c815e524faaa2253342883d
 size 2567851323
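The weight changes above only repoint the LFS pointers at retrained files; each size line is unchanged, which is consistent with re-exports of the same architecture. To confirm a downloaded file matches its pointer:

# Verify a downloaded weight against the oid recorded in its LFS pointer.
import hashlib

def sha256_of(path, chunk=1 << 20):
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        while blk := f.read(chunk):
            h.update(blk)
    return h.hexdigest()

assert sha256_of('llm.pt') == '647add6974a326d679083d92b300acb75dd49bfa9c815e524faaa2253342883d'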