| { | |
| "tts_version": "v1.5.0", | |
| "split": "opensource-en", | |
| "ttl_ckpt_path": "unknown.pt", | |
| "dp_ckpt_path": "unknown.pt", | |
| "ae_ckpt_path": "unknown.pt", | |
| "ttl_train": "unknown", | |
| "dp_train": "unknown", | |
| "ae_train": "unknown", | |
| "ttl": { | |
| "latent_dim": 24, | |
| "chunk_compress_factor": 6, | |
| "batch_expander": { | |
| "n_batch_expand": 6 | |
| }, | |
| "normalizer": { | |
| "scale": 0.25 | |
| }, | |
| "text_encoder": { | |
| "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json", | |
| "text_embedder": { | |
| "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json", | |
| "char_emb_dim": 256 | |
| }, | |
| "convnext": { | |
| "idim": 256, | |
| "ksz": 5, | |
| "intermediate_dim": 1024, | |
| "num_layers": 6, | |
| "dilation_lst": [ | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1 | |
| ] | |
| }, | |
| "attn_encoder": { | |
| "hidden_channels": 256, | |
| "filter_channels": 1024, | |
| "n_heads": 4, | |
| "n_layers": 4, | |
| "p_dropout": 0.0 | |
| }, | |
| "proj_out": { | |
| "idim": 256, | |
| "odim": 256 | |
| } | |
| }, | |
| "flow_matching": { | |
| "sig_min": 0 | |
| }, | |
| "style_encoder": { | |
| "proj_in": { | |
| "ldim": 24, | |
| "chunk_compress_factor": 6, | |
| "odim": 256 | |
| }, | |
| "convnext": { | |
| "idim": 256, | |
| "ksz": 5, | |
| "intermediate_dim": 1024, | |
| "num_layers": 6, | |
| "dilation_lst": [ | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1 | |
| ] | |
| }, | |
| "style_token_layer": { | |
| "input_dim": 256, | |
| "n_style": 50, | |
| "style_key_dim": 256, | |
| "style_value_dim": 256, | |
| "prototype_dim": 256, | |
| "n_units": 256, | |
| "n_heads": 2 | |
| } | |
| }, | |
| "speech_prompted_text_encoder": { | |
| "text_dim": 256, | |
| "style_dim": 256, | |
| "n_units": 256, | |
| "n_heads": 2 | |
| }, | |
| "uncond_masker": { | |
| "prob_both_uncond": 0.04, | |
| "prob_text_uncond": 0.01, | |
| "std": 0.1, | |
| "text_dim": 256, | |
| "n_style": 50, | |
| "style_key_dim": 256, | |
| "style_value_dim": 256 | |
| }, | |
| "vector_field": { | |
| "proj_in": { | |
| "ldim": 24, | |
| "chunk_compress_factor": 6, | |
| "odim": 512 | |
| }, | |
| "time_encoder": { | |
| "time_dim": 64, | |
| "hdim": 256 | |
| }, | |
| "main_blocks": { | |
| "n_blocks": 4, | |
| "time_cond_layer": { | |
| "idim": 512, | |
| "time_dim": 64 | |
| }, | |
| "style_cond_layer": { | |
| "idim": 512, | |
| "style_dim": 256 | |
| }, | |
| "text_cond_layer": { | |
| "idim": 512, | |
| "text_dim": 256, | |
| "n_heads": 4, | |
| "use_residual": true, | |
| "rotary_base": 10000, | |
| "rotary_scale": 10 | |
| }, | |
| "convnext_0": { | |
| "idim": 512, | |
| "ksz": 5, | |
| "intermediate_dim": 1024, | |
| "num_layers": 4, | |
| "dilation_lst": [ | |
| 1, | |
| 2, | |
| 4, | |
| 8 | |
| ] | |
| }, | |
| "convnext_1": { | |
| "idim": 512, | |
| "ksz": 5, | |
| "intermediate_dim": 1024, | |
| "num_layers": 1, | |
| "dilation_lst": [ | |
| 1 | |
| ] | |
| }, | |
| "convnext_2": { | |
| "idim": 512, | |
| "ksz": 5, | |
| "intermediate_dim": 1024, | |
| "num_layers": 1, | |
| "dilation_lst": [ | |
| 1 | |
| ] | |
| } | |
| }, | |
| "last_convnext": { | |
| "idim": 512, | |
| "ksz": 5, | |
| "intermediate_dim": 1024, | |
| "num_layers": 4, | |
| "dilation_lst": [ | |
| 1, | |
| 1, | |
| 1, | |
| 1 | |
| ] | |
| }, | |
| "proj_out": { | |
| "idim": 512, | |
| "chunk_compress_factor": 6, | |
| "ldim": 24 | |
| } | |
| } | |
| }, | |
| "ae": { | |
| "sample_rate": 44100, | |
| "n_delay": 0, | |
| "base_chunk_size": 512, | |
| "chunk_compress_factor": 1, | |
| "ldim": 24, | |
| "encoder": { | |
| "spec_processor": { | |
| "n_fft": 2048, | |
| "win_length": 2048, | |
| "hop_length": 512, | |
| "n_mels": 228, | |
| "sample_rate": 44100, | |
| "eps": 1e-05, | |
| "norm_mean": 0.0, | |
| "norm_std": 1.0 | |
| }, | |
| "ksz_init": 7, | |
| "ksz": 7, | |
| "num_layers": 10, | |
| "dilation_lst": [ | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1 | |
| ], | |
| "intermediate_dim": 2048, | |
| "idim": 1253, | |
| "hdim": 512, | |
| "odim": 24 | |
| }, | |
| "decoder": { | |
| "ksz_init": 7, | |
| "ksz": 7, | |
| "num_layers": 10, | |
| "dilation_lst": [ | |
| 1, | |
| 2, | |
| 4, | |
| 1, | |
| 2, | |
| 4, | |
| 1, | |
| 1, | |
| 1, | |
| 1 | |
| ], | |
| "intermediate_dim": 2048, | |
| "idim": 24, | |
| "hdim": 512, | |
| "head": { | |
| "idim": 512, | |
| "hdim": 2048, | |
| "odim": 512, | |
| "ksz": 3 | |
| } | |
| } | |
| }, | |
| "dp": { | |
| "latent_dim": 24, | |
| "chunk_compress_factor": 6, | |
| "normalizer": { | |
| "scale": 1.0 | |
| }, | |
| "sentence_encoder": { | |
| "char_emb_dim": 64, | |
| "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json", | |
| "text_embedder": { | |
| "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json", | |
| "char_emb_dim": 64 | |
| }, | |
| "convnext": { | |
| "idim": 64, | |
| "ksz": 5, | |
| "intermediate_dim": 256, | |
| "num_layers": 6, | |
| "dilation_lst": [ | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1, | |
| 1 | |
| ] | |
| }, | |
| "attn_encoder": { | |
| "hidden_channels": 64, | |
| "filter_channels": 256, | |
| "n_heads": 2, | |
| "n_layers": 2, | |
| "p_dropout": 0.0 | |
| }, | |
| "proj_out": { | |
| "idim": 64, | |
| "odim": 64 | |
| } | |
| }, | |
| "style_encoder": { | |
| "proj_in": { | |
| "ldim": 24, | |
| "chunk_compress_factor": 6, | |
| "odim": 64 | |
| }, | |
| "convnext": { | |
| "idim": 64, | |
| "ksz": 5, | |
| "intermediate_dim": 256, | |
| "num_layers": 4, | |
| "dilation_lst": [ | |
| 1, | |
| 1, | |
| 1, | |
| 1 | |
| ] | |
| }, | |
| "style_token_layer": { | |
| "input_dim": 64, | |
| "n_style": 8, | |
| "style_key_dim": 0, | |
| "style_value_dim": 16, | |
| "prototype_dim": 64, | |
| "n_units": 64, | |
| "n_heads": 2 | |
| } | |
| }, | |
| "predictor": { | |
| "sentence_dim": 64, | |
| "n_style": 8, | |
| "style_dim": 16, | |
| "hdim": 128, | |
| "n_layer": 2 | |
| } | |
| } | |
| } |