Spaces:
Sleeping
Sleeping
| import spaces | |
| import gradio as gr | |
| import torch | |
| from TTS.api import TTS | |
| import os | |
| import argparse | |
| import os | |
| import sys | |
| import tempfile | |
| import librosa.display | |
| import numpy as np | |
| import torchaudio | |
| import traceback | |
| from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list | |
| from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt | |
| from TTS.tts.configs.xtts_config import XttsConfig | |
| from TTS.tts.models.xtts import Xtts | |
# Set before the model is instantiated below; presumably pre-accepts the Coqui
# terms-of-service prompt so startup does not block -- TODO confirm.
os.environ["COQUI_TOS_AGREED"] = "1"

# Fall back to CPU when CUDA is absent, in line with the
# torch.cuda.is_available() guards in clear_gpu_cache() and load_model(),
# instead of failing at import time on a CPU-only host.
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_bill_spa").to(device)

# Checkpoint, config and vocabulary files handed to load_model() by generate().
# Presumably this is the directory the TTS(...) call above populates -- verify.
_MODEL_DIR = '/home/user/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_bill_spa'
model_path = f'{_MODEL_DIR}/model.pth'
config_path = f'{_MODEL_DIR}/config.json'
vocab_path = f'{_MODEL_DIR}/vocab.json'
def clear_gpu_cache():
    """Empty PyTorch's CUDA cache; do nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
# Currently loaded XTTS model: written by load_model(), read by run_tts().
XTTS_MODEL = None


def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    """Build the XTTS model from its files and store it in ``XTTS_MODEL``.

    Args:
        xtts_checkpoint: Path of the model checkpoint file.
        xtts_config: Path of the JSON config file.
        xtts_vocab: Path of the vocabulary file.

    Returns:
        An error message string when any of the three paths is empty;
        otherwise ``None`` after the model has been loaded.
    """
    global XTTS_MODEL
    clear_gpu_cache()

    all_paths_given = xtts_checkpoint and xtts_config and xtts_vocab
    if not all_paths_given:
        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"

    model_config = XttsConfig()
    model_config.load_json(xtts_config)
    XTTS_MODEL = Xtts.init_from_config(model_config)

    print("Loading XTTS model! ")
    XTTS_MODEL.load_checkpoint(
        model_config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=False,
    )

    # Move to the GPU only when one is actually present.
    if torch.cuda.is_available():
        XTTS_MODEL.cuda()
    print("Model Loaded!")
def run_tts(lang, tts_text, speaker_audio_file):
    """Synthesize ``tts_text`` with the loaded model, cloning a reference voice.

    Args:
        lang: Language code forwarded as ``language`` to the model's
            ``inference`` call.
        tts_text: Text to speak.
        speaker_audio_file: Path of the reference recording from which the
            conditioning latents and speaker embedding are computed.

    Returns:
        A ``(out_path, speaker_audio_file)`` tuple. ``out_path`` is a ``.wav``
        file created with ``delete=False``, so it is not removed
        automatically.

    Raises:
        gr.Error: If no model has been loaded or no reference audio was given.
            The previous code returned a 3-tuple here while the success path
            returns a 2-tuple, which made callers that unpack two values fail
            with an unrelated ``ValueError``.
    """
    if XTTS_MODEL is None:
        raise gr.Error("You need to run the previous step to load the model !!")
    if not speaker_audio_file:
        raise gr.Error("You need to provide a reference audio file !!")

    model_config = XTTS_MODEL.config
    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
        audio_path=speaker_audio_file,
        gpt_cond_len=model_config.gpt_cond_len,
        max_ref_length=model_config.max_ref_len,
        sound_norm_refs=model_config.sound_norm_refs,
    )
    out = XTTS_MODEL.inference(
        text=tts_text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=model_config.temperature,  # Add custom parameters here
        length_penalty=model_config.length_penalty,
        repetition_penalty=model_config.repetition_penalty,
        top_k=model_config.top_k,
        top_p=model_config.top_p,
    )

    # Only reserve a unique path here and close the handle before writing:
    # a NamedTemporaryFile that is still open cannot be reopened by name on
    # every platform (notably Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        out_path = fp.name

    # Add a leading dimension before saving (assumes torchaudio.save wants
    # channels first -- TODO confirm).
    wav = torch.tensor(out["wav"]).unsqueeze(0)
    # 24000 is the sample rate in Hz; presumably the XTTS output rate --
    # TODO confirm against the model config.
    torchaudio.save(out_path, wav, 24000)
    print("Speech generated !")
    return out_path, speaker_audio_file
# Set only after load_model() has completed without error, so a failed load is
# retried on the next request instead of leaving a half-initialised model.
_MODEL_LOADED = False


def generate(text, audio):
    """Gradio handler: speak ``text`` in Spanish using the voice in ``audio``.

    The model is loaded on the first successful call and reused afterwards;
    reloading the checkpoint on every request only added latency.

    Args:
        text: Sentence to synthesize.
        audio: File path of the reference voice recording.

    Returns:
        Path of the generated ``.wav`` file.

    Raises:
        gr.Error: If no reference audio was supplied or ``load_model`` reports
            a problem through its returned message.
    """
    global _MODEL_LOADED

    # Reject a missing reference clip up front with a readable message rather
    # than failing later inside the synthesis call.
    if not audio:
        raise gr.Error("You need to provide a reference audio file !!")

    if not _MODEL_LOADED:
        # load_model() returns a message only when it could not proceed.
        load_error = load_model(model_path, config_path, vocab_path)
        if load_error:
            raise gr.Error(load_error)
        _MODEL_LOADED = True

    out_path, _speaker_audio_file = run_tts(lang='es', tts_text=text, speaker_audio_file=audio)
    return out_path
# Web UI: a sentence and a reference voice clip go in, synthesized audio
# comes out. Components are created in the same order as before.
_text_input = gr.Textbox(label='Frase a generar')
_voice_input = gr.Audio(type='filepath', label='Voz de referencia')
_audio_output = gr.Audio(type='filepath')

demo = gr.Interface(
    fn=generate,
    inputs=[_text_input, _voice_input],
    outputs=_audio_output,
)

# Start serving as soon as the module is executed.
demo.launch()