{ "architectures": [ "S3GenModel" ], "cfm_inference_cfg_rate": 0.7, "cfm_sigma_min": 1e-06, "cfm_solver": "euler", "cfm_t_scheduler": "cosine", "decoder_act_fn": "gelu", "decoder_attention_head_dim": 64, "decoder_channels": [ 256 ], "decoder_in_channels": 320, "decoder_n_blocks": 4, "decoder_num_heads": 8, "decoder_num_mid_blocks": 12, "decoder_out_channels": 80, "dtype": "float32", "encoder_attention_heads": 8, "encoder_dropout_rate": 0.1, "encoder_linear_units": 2048, "encoder_num_blocks": 6, "encoder_output_size": 512, "fmax": 8000, "fmin": 0, "hop_length": 480, "input_frame_rate": 25, "mel_bins": 80, "model_type": "s3gen", "n_fft": 1920, "pre_lookahead_len": 3, "sampling_rate": 24000, "speaker_embed_dim": 192, "speaker_feat_dim": 80, "token_embed_dim": 512, "token_mel_ratio": 2, "transformers_version": "5.0.0.dev0", "vocab_size": 6561, "win_size": 1920 }