{
  "audio_vae": {
    "model": {
      "params": {
        "ddconfig": {
          "double_z": true,
          "mel_bins": 64,
          "z_channels": 8,
          "resolution": 256,
          "downsample_time": false,
          "in_channels": 2,
          "out_ch": 2,
          "ch": 128,
          "ch_mult": [
            1,
            2,
            4
          ],
          "num_res_blocks": 2,
          "attn_resolutions": [],
          "dropout": 0.0,
          "mid_block_add_attention": false,
          "norm_type": "pixel",
          "causality_axis": "height"
        },
        "sampling_rate": 16000
      }
    },
    "preprocessing": {
      "audio": {
        "sampling_rate": 16000,
        "max_wav_value": 32768.0,
        "duration": 5.12,
        "stereo": true,
        "causal_padding": 3
      },
      "stft": {
        "filter_length": 1024,
        "hop_length": 160,
        "win_length": 1024,
        "causal": true
      },
      "mel": {
        "n_mel_channels": 64,
        "mel_fmin": 0,
        "mel_fmax": 8000
      }
    }
  },
  "_class_name": "LTX2AudioDecoder"
}