{
  "model": {
    "fm_decoder_downsampling_factor": [1, 2, 4, 2, 1],
    "fm_decoder_num_layers": [2, 2, 4, 4, 4],
    "fm_decoder_cnn_module_kernel": [31, 15, 7, 15, 31],
    "fm_decoder_feedforward_dim": 1536,
    "fm_decoder_num_heads": 4,
    "fm_decoder_dim": 512,
    "text_encoder_num_layers": 4,
    "text_encoder_feedforward_dim": 512,
    "text_encoder_cnn_module_kernel": 9,
    "text_encoder_num_heads": 4,
    "text_encoder_dim": 192,
    "query_head_dim": 32,
    "value_head_dim": 12,
    "pos_head_dim": 4,
    "pos_dim": 48,
    "time_embed_dim": 192,
    "text_embed_dim": 192,
    "feat_dim": 100
  },
  "feature": {
    "sampling_rate": 24000,
    "type": "vocos"
  }
}