{ "root": "pretrained_models", "structure": { "ckpts": { "multilingual_grl": { "path": "pretrained_models/ckpts/multilingual_grl", "description": "Main LEMAS-TTS model (multilingual_grl)", "files": [ { "path": "pretrained_models/ckpts/multilingual_grl/multilingual_grl.safetensors", "info": "Unconditional/text-conditioned non-autoregressive model weights with GRL, supporting multilingual TTS and editing (default Edit Model in Gradio)" } ] }, "multilingual_prosody": { "path": "pretrained_models/ckpts/multilingual_prosody", "description": "non-autoregressive variant with an additional prosody encoder", "files": [ { "path": "pretrained_models/ckpts/multilingual_prosody/multilingual_prosody.safetensors", "info": "Multilingual model weights with global prosody conditioning enabled (selectable in the model menu)" } ] }, "prosody_encoder": { "path": "pretrained_models/ckpts/prosody_encoder", "description": "Pretssel / UnitY2-style prosody encoder used by the non-autoregressive prosody backend", "files": [ { "path": "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json", "info": "Architecture configuration for the Pretssel/UnitY2 prosody encoder (dimensions, number of layers, etc.)" }, { "path": "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt", "info": "Prosody encoder weights, used to extract global prosody embeddings from reference audio" } ] }, "vocos-mel-24khz": { "path": "pretrained_models/ckpts/vocos-mel-24khz", "description": "Vocos neural vocoder (mel → 24kHz waveform)", "files": [ { "path": "pretrained_models/ckpts/vocos-mel-24khz/config.yaml", "info": "Vocos vocoder configuration defining mel feature dimensions and network architecture" }, { "path": "pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin", "info": "Main Vocos vocoder weights, used to decode mel features in the CFM backend" }, { "path": "pretrained_models/ckpts/vocos-mel-24khz/README.md", "info": "Documentation for the vocoder (origin, usage, and notes)" } ] } }, "data": { "multilingual_grl": { "path": "pretrained_models/data/multilingual_grl", "description": "Text vocabulary for the multilingual_grl model", "files": [ { "path": "pretrained_models/data/multilingual_grl/vocab.txt", "info": "phone vocabulary corresponding to the text embeddings of the multilingual_grl model" } ] }, "multilingual_prosody": { "path": "pretrained_models/data/multilingual_prosody", "description": "Text vocabulary for the multilingual_prosody model", "files": [ { "path": "pretrained_models/data/multilingual_prosody/vocab.txt", "info": "Shared text vocabulary used by the multilingual_prosody model" } ] } }, "demos": { "root": "pretrained_models/demos", "files": [ { "path": "pretrained_models/demos/test.wav", "info": "Simple test audio used for quick validation of gradio script" } ], "lemas_edit_test": { "path": "pretrained_models/demos/lemas_edit_test", "description": "Audio samples and alignment annotations for LEMAS-Edit demos", "subdirs": { "vocals": { "path": "pretrained_models/demos/lemas_edit_test/vocals", "files": [ { "path": "pretrained_models/demos/lemas_edit_test/vocals/en_AUD0000000214_S0001522.mp3", "info": "English demo audio used for AR/NAR editing examples" }, { "path": "pretrained_models/demos/lemas_edit_test/vocals/zh_emilia_zh_0008385782.mp3", "info": "Chinese demo audio used for multilingual editing examples" } ] }, "align": { "path": "pretrained_models/demos/lemas_edit_test/align", "files": [ { "path": "pretrained_models/demos/lemas_edit_test/align/en_AUD0000000214_S0001522.json", "info": "MMS 
alignment JSON for the English demo, including intervals, words, and modified_index used for editing" }, { "path": "pretrained_models/demos/lemas_edit_test/align/zh_emilia_zh_0008385782.json", "info": "MMS alignment JSON for the Chinese demo" } ] } } } }, "uvr5": { "path": "pretrained_models/uvr5", "description": "Kim Vocal UVR5 models and configurations (used for denoising)", "files": [ { "path": "pretrained_models/uvr5/Kim_Vocal_1.onnx", "info": "Main UVR5 model (ONNX format) for vocal/accompaniment separation and denoising" }, { "path": "pretrained_models/uvr5/MDX-Net-Kim-Vocal1.json", "info": "UVR5 model architecture and inference configuration (channels, frame length, etc.)" }, { "path": "pretrained_models/uvr5/model_data.json", "info": "UVR5 metadata including presets, model list, and default parameters" }, { "path": "pretrained_models/uvr5/model_name_mapper.json", "info": "Mapping from internal UVR5 model names to human-readable names for frontend selection" } ] }, "whisperx": { "path": "pretrained_models/whisperx", "description": "WhisperX VAD and segmentation model assets", "files": [ { "path": "pretrained_models/whisperx/whisperx-vad-segmentation.bin", "info": "WhisperX voice activity detection (VAD) weights used for long-audio segmentation and ASR alignment assistance" }, { "path": "pretrained_models/whisperx/whisperx-vad-segmentation.bak", "info": "Backup or legacy version of the VAD model, kept for safety" } ] } } }
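For quick sanity-checking of a download, a small Python sketch like the one below can walk this manifest and confirm that every listed file is actually present on disk. It relies only on the "structure", "files", and "path" keys used above; the manifest filename (pretrained_models_manifest.json) is an assumption, so point it at wherever this JSON is actually saved.

# Minimal sketch: verify that every file listed in the manifest exists on disk.
# The manifest filename below is hypothetical; adjust it to the real location.
import json
from pathlib import Path


def collect_file_paths(node):
    """Recursively yield every 'path' found inside 'files' lists."""
    if isinstance(node, dict):
        # File entries live in "files" lists and carry a "path" key.
        for entry in node.get("files", []):
            if "path" in entry:
                yield entry["path"]
        # Recurse into nested sections such as "subdirs".
        for key, value in node.items():
            if key != "files":
                yield from collect_file_paths(value)
    elif isinstance(node, list):
        for item in node:
            yield from collect_file_paths(item)


if __name__ == "__main__":
    manifest = json.loads(
        Path("pretrained_models_manifest.json").read_text(encoding="utf-8")
    )
    missing = [
        p for p in collect_file_paths(manifest["structure"]) if not Path(p).is_file()
    ]
    if missing:
        print("Missing files:")
        for p in missing:
            print(f"  {p}")
    else:
        print("All manifest files are present.")

Because every "path" in the manifest is already relative to the repository root, the check works from the repo root without extra path-joining against the top-level "root" field.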