XTTS_finetuned_dani

Sleeping

App Files Files Community

XTTS_finetuned_dani / app.py

rockdrigoma

Update app.py

33b51a6 verified over 1 year ago

raw

history blame

3.29 kB

	import spaces
	import gradio as gr
	import torch
	from TTS.api import TTS
	import os
	import argparse
	import os
	import sys
	import tempfile
	import librosa.display
	import numpy as np

	import torchaudio
	import traceback
	from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
	from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt

	from TTS.tts.configs.xtts_config import XttsConfig
	from TTS.tts.models.xtts import Xtts

	# Set COQUI_TOS_AGREED before the TTS(...) call below (same statement order
	# as before). NOTE(review): presumably this pre-accepts the Coqui
	# terms-of-service prompt — confirm against the TTS library.
	os.environ["COQUI_TOS_AGREED"] = "1"

	device = "cuda"

	# Instantiate the model through the high-level TTS API and move it to `device`.
	tts = TTS("tts_models/multilingual/multi-dataset/xtts_bill_spa").to(device)

	# The checkpoint, config and vocab files all live in one directory.
	# NOTE(review): presumably this is where the TTS(...) call above stores the
	# downloaded model files — verify on the deployment host.
	_model_dir = "/home/user/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_bill_spa"
	model_path = f"{_model_dir}/model.pth"
	config_path = f"{_model_dir}/config.json"
	vocab_path = f"{_model_dir}/vocab.json"


	def clear_gpu_cache():
	    """Call ``torch.cuda.empty_cache()`` when CUDA is available; otherwise do nothing."""
	    if not torch.cuda.is_available():
	        return
	    torch.cuda.empty_cache()

	# Module-level handle to the XTTS model; stays None until load_model() assigns it.
	XTTS_MODEL = None


	def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
	    """Build an XTTS model from a config file and load its checkpoint into ``XTTS_MODEL``.

	    Args:
	        xtts_checkpoint: Path passed to ``load_checkpoint`` as ``checkpoint_path``.
	        xtts_config: Path of the JSON config read by ``XttsConfig.load_json``.
	        xtts_vocab: Path passed to ``load_checkpoint`` as ``vocab_path``.

	    Returns:
	        An error-message string when any of the three arguments is falsy;
	        otherwise ``None`` (the function falls off the end after loading).
	    """
	    global XTTS_MODEL
	    clear_gpu_cache()

	    if not (xtts_checkpoint and xtts_config and xtts_vocab):
	        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"

	    model_config = XttsConfig()
	    model_config.load_json(xtts_config)
	    # The global is assigned before the checkpoint is loaded, so it already
	    # points at the new (not yet loaded) model if load_checkpoint raises.
	    XTTS_MODEL = Xtts.init_from_config(model_config)

	    print("Loading XTTS model! ")
	    XTTS_MODEL.load_checkpoint(
	        model_config,
	        checkpoint_path=xtts_checkpoint,
	        vocab_path=xtts_vocab,
	        use_deepspeed=False,
	    )

	    cuda_ready = torch.cuda.is_available()
	    if cuda_ready:
	        XTTS_MODEL.cuda()

	    print("Model Loaded!")

	def run_tts(lang, tts_text, speaker_audio_file):
	    """Synthesize ``tts_text`` with the loaded XTTS model, conditioned on a reference voice.

	    Args:
	        lang: Language code forwarded to ``XTTS_MODEL.inference`` as ``language``.
	        tts_text: Text to synthesize.
	        speaker_audio_file: Path of the reference audio used to compute the
	            conditioning latents.

	    Returns:
	        A ``(out_path, speaker_audio_file)`` tuple. ``out_path`` is the path of
	        a ``.wav`` file created with ``delete=False``, so it is not removed
	        automatically.

	    Raises:
	        gr.Error: If no model has been loaded or no reference audio was given.
	            Previously this path returned a 3-tuple while the success path
	            returned a 2-tuple, which broke callers that unpack two values.
	    """
	    if XTTS_MODEL is None:
	        raise gr.Error("You need to run the previous step to load the model !!")
	    if not speaker_audio_file:
	        raise gr.Error("A reference speaker audio file is required.")

	    model_cfg = XTTS_MODEL.config
	    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
	        audio_path=speaker_audio_file,
	        gpt_cond_len=model_cfg.gpt_cond_len,
	        max_ref_length=model_cfg.max_ref_len,
	        sound_norm_refs=model_cfg.sound_norm_refs,
	    )
	    out = XTTS_MODEL.inference(
	        text=tts_text,
	        language=lang,
	        gpt_cond_latent=gpt_cond_latent,
	        speaker_embedding=speaker_embedding,
	        temperature=model_cfg.temperature,  # Add custom parameters here
	        length_penalty=model_cfg.length_penalty,
	        repetition_penalty=model_cfg.repetition_penalty,
	        top_k=model_cfg.top_k,
	        top_p=model_cfg.top_p,
	    )

	    # Reserve a unique file name, then close the handle before writing so the
	    # file is not written by name while it is still open here.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
	        out_path = fp.name

	    # unsqueeze(0) adds a leading dimension before saving.
	    # NOTE(review): 24000 is presumably the XTTS output sample rate — confirm.
	    wav = torch.tensor(out["wav"]).unsqueeze(0)
	    torchaudio.save(out_path, wav, 24000)
	    print("Speech generated !")

	    return out_path, speaker_audio_file


	@spaces.GPU(enable_queue=True)
	def generate(text, audio):
	    """Gradio handler: synthesize ``text`` in Spanish using ``audio`` as the reference voice.

	    Args:
	        text: Sentence to synthesize.
	        audio: File path of the reference voice recording.

	    Returns:
	        Path of the generated ``.wav`` file.

	    Raises:
	        gr.Error: If ``text`` is empty or whitespace-only, if ``audio`` is
	            missing, or if ``load_model`` reports an error.
	    """
	    # Validate up front so bad input is reported to the user instead of
	    # failing later inside the model code.
	    if not text or not text.strip():
	        raise gr.Error("Please enter the text to generate.")
	    if not audio:
	        raise gr.Error("Please provide a reference voice audio file.")

	    # load_model returns an error string on failure and None on success.
	    load_error = load_model(model_path, config_path, vocab_path)
	    if load_error:
	        raise gr.Error(load_error)

	    out_path, _ = run_tts(lang='es', tts_text=text, speaker_audio_file=audio)
	    return out_path

	# UI components, created in the same order as before: the two inputs
	# (sentence to synthesize, reference voice clip) and then the audio output.
	text_input = gr.Textbox(label='Frase a generar')
	reference_audio = gr.Audio(type='filepath', label='Voz de referencia')
	generated_audio = gr.Audio(type='filepath')

	# Wire the components to the generate() handler.
	demo = gr.Interface(
	    fn=generate,
	    inputs=[text_input, reference_audio],
	    outputs=generated_audio,
	)

	demo.launch()