Upload folder using huggingface_hub
- cosyvoice2.yaml +14 -11
- flow.decoder.estimator.fp32.onnx +1 -1
- flow.encoder.fp16.zip +1 -1
- flow.encoder.fp32.zip +1 -1
- flow.pt +1 -1
- llm.pt +1 -1
cosyvoice2.yaml
CHANGED
@@ -21,14 +21,15 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size,
 # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
 # for system/third_party class/function, we do not require this.
 llm: !new:cosyvoice.llm.llm.Qwen2LM
-    llm_input_size: 896
-    llm_output_size: 896
+    # llm_input_size/llm_output_size will be auto-set from backbone.hidden_size, so 896 here is ignored
+    llm_input_size: 0
+    llm_output_size: 0
     speech_token_size: 6561
     length_normalized_loss: True
     lsm_weight: 0
     mix_ratio: [5, 15]
-    llm: !new:cosyvoice.llm.llm.Qwen2Encoder
-        pretrain_path: !ref <qwen_pretrain_path>
+    llm: !new:cosyvoice.llm.llm.HFBackbone # backbone-agnostic
+        pretrain_path: !ref <qwen_pretrain_path> # e.g., "Qwen/Qwen3-0.6B" or "mistralai/Mistral-7B-Instruct-v0.3"
     sampling: !name:cosyvoice.utils.common.ras_sampling
         top_p: 0.8
         top_k: 25
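The HFBackbone class referenced here is not part of upstream CosyVoice, which ships Qwen2Encoder at this spot; a minimal sketch of what such a backbone-agnostic wrapper could look like, assuming it mirrors Qwen2Encoder's role and exposes hidden_size so that Qwen2LM can override the zeroed llm_input_size/llm_output_size:

# Hypothetical sketch only: HFBackbone, its constructor, and its forward
# signature are assumptions modeled on cosyvoice.llm.llm.Qwen2Encoder,
# not upstream CosyVoice API.
import torch
from transformers import AutoModelForCausalLM

class HFBackbone(torch.nn.Module):
    def __init__(self, pretrain_path: str):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(pretrain_path)
        # Any HF causal LM exposes hidden_size in its config; presumably this
        # is what lets the yaml leave llm_input_size/llm_output_size at 0.
        self.hidden_size = self.model.config.hidden_size

    def forward(self, xs: torch.Tensor, masks: torch.Tensor):
        # Dense embeddings in, last hidden states out, as Qwen2Encoder does.
        outs = self.model(
            inputs_embeds=xs,
            attention_mask=masks,
            output_hidden_states=True,
            return_dict=True,
        )
        return outs.hidden_states[-1], masks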
@@ -134,6 +135,7 @@ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
     token_path: !ref <qwen_pretrain_path>
     skip_special_tokens: True
+    # add_additional_specials: auto-detected based on token_path (True for blanken/CosyVoice models, False for custom HF backbones)
     allowed_special: 'all'
 tokenize: !name:cosyvoice.dataset.processor.tokenize
     get_tokenizer: !ref <get_tokenizer>
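The add_additional_specials comment suggests the tokenizer factory now decides from token_path whether to register CosyVoice's extra special tokens. A rough guess at the detection logic (the real get_qwen_tokenizer signature and the special-token list may differ):

# Assumed auto-detection, inferred from the yaml comment above; the actual
# implementation in cosyvoice.tokenizer.tokenizer may differ.
from transformers import AutoTokenizer

def get_qwen_tokenizer(token_path, skip_special_tokens=True, allowed_special='all'):
    tokenizer = AutoTokenizer.from_pretrained(token_path)
    # Heuristic from the comment: CosyVoice/blanken checkpoints need the extra
    # specials registered; stock HF backbones are left untouched.
    if any(k in token_path.lower() for k in ('cosyvoice', 'blanken')):
        tokenizer.add_special_tokens(
            {'additional_special_tokens': ['<|endofprompt|>']}  # example special
        )
    return tokenizer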
@@ -141,7 +143,7 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
 filter: !name:cosyvoice.dataset.processor.filter
     max_length: 40960
     min_length: 100
-    token_max_length: 200
+    token_max_length: 512 # not sure if this can just be changed?
     token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
     resample_rate: !ref <sample_rate>
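On the "not sure if this can just be changed?" question: in the upstream filter processor, token_max_length only gates which samples survive filtering, so raising it admits longer transcripts (at the cost of longer sequences downstream) rather than touching the model. Roughly:

# Approximate shape of cosyvoice.dataset.processor.filter; the exact field
# names and frame computation are from memory and may differ.
def filter(data, max_length=40960, min_length=100,
           token_max_length=512, token_min_length=1):
    for sample in data:
        num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
        if not (min_length < num_frames < max_length):
            continue
        if not (token_min_length < len(sample['text_token']) < token_max_length):
            continue
        yield sample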
@@ -170,9 +172,9 @@ sort: !name:cosyvoice.dataset.processor.sort
     sort_size: 500 # sort_size should be less than shuffle_size
 batch: !name:cosyvoice.dataset.processor.batch
     batch_type: 'dynamic'
-    max_frames_in_batch: 2000
+    max_frames_in_batch: 3000
 padding: !name:cosyvoice.dataset.processor.padding
-    use_spk_embedding: False # change to True during sft
+    use_spk_embedding: True # change to True during sft
 
 
 # dataset processor pipeline
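max_frames_in_batch controls dynamic batching: utterances are packed until the padded frame budget is hit, so 2000 -> 3000 trades GPU memory for larger effective batches. The usual wenet-style packing, sketched (field names are assumptions):

# Sketch of the 'dynamic' batch_type logic.
def dynamic_batch(data, max_frames_in_batch=3000):
    buf, longest = [], 0
    for sample in data:
        longest = max(longest, sample['speech_feat'].size(0))
        # A batch costs its padded size: longest utterance times batch count.
        if longest * (len(buf) + 1) > max_frames_in_batch:
            yield buf
            buf, longest = [sample], sample['speech_feat'].size(0)
        else:
            buf.append(sample)
    if buf:
        yield buf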
@@ -205,15 +207,16 @@ data_pipeline_gan: [
 
 # llm flow train conf
 train_conf:
-    optim: adam
+    optim: adamw
     optim_conf:
         lr: 1e-5 # change to 1e-5 during sft
+        # weight_decay: 0.01
     scheduler: constantlr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
-    max_epoch: 200
+    max_epoch: 30 # 200
     grad_clip: 5
-    accum_grad: 2
+    accum_grad: 4
     log_interval: 100
     save_per_step: 3000 # -1 this is where you can set the step-wise validation checkpoint, -1 means no step-wise validation checkpoint
 
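The optim string presumably maps straight onto a torch optimizer, so adam -> adamw switches to decoupled weight decay (hence the commented-out weight_decay: 0.01), and accum_grad: 4 quadruples the effective batch without extra memory. A sketch of how these keys are typically consumed (not the exact CosyVoice executor code):

import torch

def build_optimizer(model, optim='adamw', optim_conf=None):
    optim_conf = optim_conf or {'lr': 1e-5}
    cls = {'adam': torch.optim.Adam, 'adamw': torch.optim.AdamW}[optim]
    return cls(model.parameters(), **optim_conf)

def train_epoch(model, optimizer, batches, accum_grad=4, grad_clip=5):
    optimizer.zero_grad()
    for i, batch in enumerate(batches):
        loss = model(batch)['loss'] / accum_grad  # average across micro-batches
        loss.backward()
        if (i + 1) % accum_grad == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()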
@@ -227,7 +230,7 @@ train_conf_gan:
     optim_conf_d:
         lr: 0.0002 # use small lr for gan training
     scheduler_d: constantlr
-    max_epoch: 200
+    max_epoch: 20
     grad_clip: 5
     accum_grad: 1 # in gan training, accum_grad must be 1
     log_interval: 100
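The "accum_grad must be 1" constraint follows from the alternating update schedule: each step updates the discriminator and then the generator against the freshly updated discriminator, so accumulating across steps would mix gradients computed against different discriminator states. Schematically (the losses here are illustrative LSGAN terms, not CosyVoice's exact objectives):

import torch

def gan_step(generator, discriminator, optim_g, optim_d, batch):
    fake = generator(batch['mel'])
    real = batch['audio']
    # Discriminator first, on detached fakes so no generator grads flow.
    optim_d.zero_grad()
    d_loss = torch.mean((1 - discriminator(real)) ** 2) \
           + torch.mean(discriminator(fake.detach()) ** 2)
    d_loss.backward()
    optim_d.step()
    # Generator immediately after, against the updated discriminator.
    optim_g.zero_grad()
    g_loss = torch.mean((1 - discriminator(fake)) ** 2)
    g_loss.backward()
    optim_g.step()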
flow.decoder.estimator.fp32.onnx
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:83b402a64ac811ea48511431bd2c6151e66c021555ab1b96e547c22d4b1873c5
 size 286312346
flow.encoder.fp16.zip
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b574c286d17194656a139a7b630da7b8f45e9bc44450fa86ff92500794c8d881
 size 116706755
flow.encoder.fp32.zip
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0d66a11ce71cc206c1977630f8dcf89839274955e1bd8bf90aeeac781d16a96e
 size 192369091
flow.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:aa7ff413cf98ffd7b8532761cdcf4f9a9da17375e0b4a66020c4448ddffc4368
 size 450569991
llm.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:647add6974a326d679083d92b300acb75dd49bfa9c815e524faaa2253342883d
 size 2567851323
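The weight changes above only repoint the LFS pointers at retrained files; each size line is unchanged, which is consistent with re-exports of the same architecture. To confirm a downloaded file matches its pointer:

# Verify a downloaded weight against the oid recorded in its LFS pointer.
import hashlib

def sha256_of(path, chunk=1 << 20):
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        while blk := f.read(chunk):
            h.update(blk)
    return h.hexdigest()

assert sha256_of('llm.pt') == '647add6974a326d679083d92b300acb75dd49bfa9c815e524faaa2253342883d'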