{
"root": "pretrained_models",
"structure": {
"ckpts": {
"multilingual_grl": {
"path": "pretrained_models/ckpts/multilingual_grl",
"description": "Main LEMAS-TTS model (multilingual_grl)",
"files": [
{
"path": "pretrained_models/ckpts/multilingual_grl/multilingual_grl.safetensors",
"info": "Unconditional/text-conditioned non-autoregressive model weights with GRL, supporting multilingual TTS and editing (default Edit Model in Gradio)"
}
]
},
"multilingual_prosody": {
"path": "pretrained_models/ckpts/multilingual_prosody",
"description": "non-autoregressive variant with an additional prosody encoder",
"files": [
{
"path": "pretrained_models/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
"info": "Multilingual model weights with global prosody conditioning enabled (selectable in the model menu)"
}
]
},
"prosody_encoder": {
"path": "pretrained_models/ckpts/prosody_encoder",
"description": "Pretssel / UnitY2-style prosody encoder used by the non-autoregressive prosody backend",
"files": [
{
"path": "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json",
"info": "Architecture configuration for the Pretssel/UnitY2 prosody encoder (dimensions, number of layers, etc.)"
},
{
"path": "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt",
"info": "Prosody encoder weights, used to extract global prosody embeddings from reference audio"
}
]
},
"vocos-mel-24khz": {
"path": "pretrained_models/ckpts/vocos-mel-24khz",
"description": "Vocos neural vocoder (mel → 24kHz waveform)",
"files": [
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/config.yaml",
"info": "Vocos vocoder configuration defining mel feature dimensions and network architecture"
},
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin",
"info": "Main Vocos vocoder weights, used to decode mel features in the CFM backend"
},
{
"path": "pretrained_models/ckpts/vocos-mel-24khz/README.md",
"info": "Documentation for the vocoder (origin, usage, and notes)"
}
]
}
},
"data": {
"multilingual_grl": {
"path": "pretrained_models/data/multilingual_grl",
"description": "Text vocabulary for the multilingual_grl model",
"files": [
{
"path": "pretrained_models/data/multilingual_grl/vocab.txt",
"info": "phone vocabulary corresponding to the text embeddings of the multilingual_grl model"
}
]
},
"multilingual_prosody": {
"path": "pretrained_models/data/multilingual_prosody",
"description": "Text vocabulary for the multilingual_prosody model",
"files": [
{
"path": "pretrained_models/data/multilingual_prosody/vocab.txt",
"info": "Shared text vocabulary used by the multilingual_prosody model"
}
]
}
},
"demos": {
"root": "pretrained_models/demos",
"files": [
{
"path": "pretrained_models/demos/test.wav",
"info": "Simple test audio used for quick validation of gradio script"
}
],
"lemas_edit_test": {
"path": "pretrained_models/demos/lemas_edit_test",
"description": "Audio samples and alignment annotations for LEMAS-Edit demos",
"subdirs": {
"vocals": {
"path": "pretrained_models/demos/lemas_edit_test/vocals",
"files": [
{
"path": "pretrained_models/demos/lemas_edit_test/vocals/en_AUD0000000214_S0001522.mp3",
"info": "English demo audio used for AR/NAR editing examples"
},
{
"path": "pretrained_models/demos/lemas_edit_test/vocals/zh_emilia_zh_0008385782.mp3",
"info": "Chinese demo audio used for multilingual editing examples"
}
]
},
"align": {
"path": "pretrained_models/demos/lemas_edit_test/align",
"files": [
{
"path": "pretrained_models/demos/lemas_edit_test/align/en_AUD0000000214_S0001522.json",
"info": "MMS alignment JSON for the English demo, including intervals, words, and modified_index used for editing"
},
{
"path": "pretrained_models/demos/lemas_edit_test/align/zh_emilia_zh_0008385782.json",
"info": "MMS alignment JSON for the Chinese demo"
}
]
}
}
}
},
"uvr5": {
"path": "pretrained_models/uvr5",
"description": "Kim Vocal UVR5 models and configurations (used for denoising)",
"files": [
{
"path": "pretrained_models/uvr5/Kim_Vocal_1.onnx",
"info": "Main UVR5 model (ONNX format) for vocal/accompaniment separation and denoising"
},
{
"path": "pretrained_models/uvr5/MDX-Net-Kim-Vocal1.json",
"info": "UVR5 model architecture and inference configuration (channels, frame length, etc.)"
},
{
"path": "pretrained_models/uvr5/model_data.json",
"info": "UVR5 metadata including presets, model list, and default parameters"
},
{
"path": "pretrained_models/uvr5/model_name_mapper.json",
"info": "Mapping from internal UVR5 model names to human-readable names for frontend selection"
}
]
},
"whisperx": {
"path": "pretrained_models/whisperx",
"description": "WhisperX VAD and segmentation model assets",
"files": [
{
"path": "pretrained_models/whisperx/whisperx-vad-segmentation.bin",
"info": "WhisperX voice activity detection (VAD) weights used for long-audio segmentation and ASR alignment assistance"
},
{
"path": "pretrained_models/whisperx/whisperx-vad-segmentation.bak",
"info": "Backup or legacy version of the VAD model, kept for safety"
}
]
}
}
}
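
A minimal sketch (not part of the manifest itself) of how this JSON could be consumed: assuming the object above is saved as pretrained_models_manifest.json in the repository root, the script below walks every "files" list in the nested structure and reports any listed asset that is missing from disk. The manifest filename and the checking logic are illustrative assumptions, not part of LEMAS-TTS.

import json
import os


def collect_paths(node):
    """Recursively yield every file path listed under a "files" key."""
    if isinstance(node, dict):
        # Entries under "files" are objects of the form {"path": ..., "info": ...}.
        for entry in node.get("files", []):
            yield entry["path"]
        # Recurse into nested groups such as "structure", "subdirs", and
        # the per-model objects, skipping the "files" lists handled above.
        for key, value in node.items():
            if key != "files":
                yield from collect_paths(value)
    elif isinstance(node, list):
        for item in node:
            yield from collect_paths(item)


# Hypothetical manifest location; adjust to wherever this JSON is stored.
with open("pretrained_models_manifest.json", encoding="utf-8") as f:
    manifest = json.load(f)

missing = [p for p in collect_paths(manifest) if not os.path.isfile(p)]
for p in missing:
    print("missing:", p)
print(f"{len(missing)} missing file(s) out of those listed.")

Once the paths are verified, the individual checkpoints can be loaded with the matching libraries, e.g. safetensors.torch.load_file for the *.safetensors weights and torch.load for prosody_encoder_UnitY2.pt.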