---
# Model configuration for TTS release v1.5.0 (split "opensource-en").
#
# NOTE(review): the mapping hierarchy in this file was rebuilt from a copy in
# which indentation had been lost. Nesting was inferred from key names and
# from matching dimensions between sections -- confirm it against the code
# that loads this file before relying on it.

# Release identification. Kept quoted so the values always load as strings.
tts_version: "v1.5.0"
split: "opensource-en"

# Checkpoint / training-run provenance for the three components configured
# below (ttl, dp, ae). All are recorded as unknown in this file.
ttl_ckpt_path: "unknown.pt"
dp_ckpt_path: "unknown.pt"
ae_ckpt_path: "unknown.pt"
ttl_train: "unknown"
dp_train: "unknown"
ae_train: "unknown"

# Hyperparameters of the `ttl` component.
ttl:
  latent_dim: 24
  chunk_compress_factor: 6
  batch_expander:
    n_batch_expand: 6
  normalizer:
    scale: 0.25
  text_encoder:
    # The same dictionary path is repeated in text_embedder; keep both in sync.
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      # One dilation entry per layer (length equals num_layers).
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 256
      filter_channels: 1024
      n_heads: 4
      n_layers: 4
      p_dropout: 0.0
    proj_out:
      idim: 256
      odim: 256
  flow_matching:
    sig_min: 0
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    style_token_layer:
      input_dim: 256
      n_style: 50
      style_key_dim: 256
      style_value_dim: 256
      prototype_dim: 256
      n_units: 256
      n_heads: 2
  speech_prompted_text_encoder:
    text_dim: 256
    style_dim: 256
    n_units: 256
    n_heads: 2
  uncond_masker:
    prob_both_uncond: 0.04
    prob_text_uncond: 0.01
    std: 0.1
    text_dim: 256
    # n_style / style_*_dim repeat the values of style_encoder.style_token_layer.
    n_style: 50
    style_key_dim: 256
    style_value_dim: 256
  vector_field:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 512
    time_encoder:
      time_dim: 64
      hdim: 256
    main_blocks:
      n_blocks: 4
      time_cond_layer:
        idim: 512
        time_dim: 64
      style_cond_layer:
        idim: 512
        style_dim: 256
      text_cond_layer:
        idim: 512
        text_dim: 256
        n_heads: 4
        use_residual: true
        rotary_base: 10000
        rotary_scale: 10
      convnext_0:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 4
        dilation_lst: [1, 2, 4, 8]
      convnext_1:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
      convnext_2:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
    last_convnext:
      idim: 512
      ksz: 5
      intermediate_dim: 1024
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    proj_out:
      idim: 512
      chunk_compress_factor: 6
      ldim: 24

# Hyperparameters of the `ae` component.
ae:
  sample_rate: 44100
  n_delay: 0
  base_chunk_size: 512
  chunk_compress_factor: 1
  ldim: 24
  encoder:
    spec_processor:
      n_fft: 2048
      win_length: 2048
      hop_length: 512
      n_mels: 228
      sample_rate: 44100
      # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
      # resolve a bare "1e-05" as a string rather than a float.
      eps: 1.0e-05
      norm_mean: 0.0
      norm_std: 1.0
    ksz_init: 7
    ksz: 7
    num_layers: 10
    dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    intermediate_dim: 2048
    # 1253 = (n_fft / 2 + 1) + n_mels = 1025 + 228.
    # NOTE(review): presumably linear-spectrogram bins concatenated with mel
    # bins -- confirm against the encoder implementation.
    idim: 1253
    hdim: 512
    odim: 24
  decoder:
    ksz_init: 7
    ksz: 7
    num_layers: 10
    dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
    intermediate_dim: 2048
    idim: 24
    hdim: 512
    # NOTE(review): placed under decoder because head.idim equals decoder.hdim;
    # confirm the intended parent.
    head:
      idim: 512
      hdim: 2048
      odim: 512
      ksz: 3

# Hyperparameters of the `dp` component.
dp:
  latent_dim: 24
  chunk_compress_factor: 6
  normalizer:
    scale: 1.0
  sentence_encoder:
    char_emb_dim: 64
    # The same dictionary path is repeated in text_embedder; keep both in sync.
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 64
      filter_channels: 256
      n_heads: 2
      n_layers: 2
      p_dropout: 0.0
    proj_out:
      idim: 64
      odim: 64
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    style_token_layer:
      input_dim: 64
      n_style: 8
      # NOTE(review): 0 here, unlike the 256 used by ttl -- confirm intentional.
      style_key_dim: 0
      style_value_dim: 16
      prototype_dim: 64
      n_units: 64
      n_heads: 2
  predictor:
    sentence_dim: 64
    n_style: 8
    style_dim: 16
    hdim: 128
    n_layer: 2

# Auxiliary resource files.
# NOTE(review): these are absolute, machine-specific paths; confirm they
# resolve in the target environment. They are placed at top level here --
# verify that the loader does not expect them under `dp`.
unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"