Spaces:

Borio047
/

DG-TTS

Sleeping

App Files Files Community

DG-TTS / app.py

Borio047

Update app.py

cd931dd verified 7 days ago

raw

history blame contribute delete

2.9 kB

	import os
	import tempfile
	import uuid

	import gradio as gr
	import numpy as np
	import soundfile as sf
	import torch
	from transformers import VitsModel, VitsTokenizer, set_seed

	# Checkpoint used for synthesis: MMS-TTS English (picked as lighter than Bark).
	MODEL_ID = "facebook/mms-tts-eng"

	# Run on the GPU when torch can see one, otherwise fall back to the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Load the tokenizer and the VITS model once at import time; the model is
	# moved to the selected device as part of the same expression.
	tokenizer = VitsTokenizer.from_pretrained(MODEL_ID)
	model = VitsModel.from_pretrained(MODEL_ID).to(device)

	# Optional: seed the random number generators for reproducible output.
	set_seed(555)


	# Upper bound on the number of input characters that get synthesized;
	# kept small for speed and stability.
	MAX_CHARS = 300


	def generate_speech(text: str) -> str:
	    """Synthesize speech for *text* and return the path of the WAV file.

	    The text is stripped, capped at ``MAX_CHARS`` characters (the user is
	    warned when that happens) and lowercased, then tokenized and passed
	    through the MMS-TTS (VITS) model.  The resulting waveform is written to
	    a uniquely named file in the system temporary directory so it can be
	    handed to ``gr.Audio(type="filepath")``.

	    Args:
	        text: Raw text from the UI textbox; may be empty or ``None``.

	    Returns:
	        Filesystem path of the generated ``.wav`` file.

	    Raises:
	        gr.Error: If no text was entered, if the model fails on the given
	            text, or if it produces no audio samples.
	    """
	    cleaned = text.strip() if text else ""
	    if not cleaned:
	        raise gr.Error("Please enter some text 🙂")

	    if len(cleaned) > MAX_CHARS:
	        # Tell the user instead of silently dropping the tail of the text.
	        gr.Warning(
	            f"Text is longer than {MAX_CHARS} characters; "
	            "only the beginning will be spoken."
	        )
	        cleaned = cleaned[:MAX_CHARS]

	    # MMS-TTS is trained on lowercased, unpunctuated text → simple normalization
	    normalized_text = cleaned.lower()

	    # 1) Tokenize and move the tensors to the same device as the model.
	    inputs = tokenizer(text=normalized_text, return_tensors="pt").to(device)

	    # 2) Forward pass.  Re-seed immediately before it: seeding only once at
	    #    import time does not make individual requests reproducible, because
	    #    every call advances the global RNG state.  The file name below uses
	    #    uuid4, which is not driven by the seeded generators, so names stay
	    #    unique across calls.
	    set_seed(555)
	    try:
	        with torch.no_grad():
	            outputs = model(**inputs)
	    except RuntimeError as err:
	        # Surface model failures as a readable UI error rather than a bare
	        # traceback; the original exception is kept as the cause.
	        raise gr.Error(
	            "Speech synthesis failed for this text. "
	            "Please try a different English sentence."
	        ) from err

	    # 3) First (only) item of the batch as a float32 NumPy array on the CPU.
	    waveform = outputs.waveform[0].cpu().numpy().astype(np.float32)
	    if waveform.size == 0:
	        raise gr.Error(
	            "No audio could be generated for this text. "
	            "Please try a different English sentence."
	        )
	    sr = model.config.sampling_rate

	    # 4) Save as WAV in the system temp directory (instead of a hard-coded
	    #    "/tmp") under a collision-free name.
	    filepath = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
	    sf.write(filepath, waveform, sr)

	    # 5) Return file path for gr.Audio(type="filepath")
	    return filepath


	# Page layout: a heading and a short description, then a row with the text
	# input and trigger button on the left (wider) and the audio player on the
	# right.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        "# 🗣️ Англи текстийг яриа болгох \n\n --- Simple TTS with facebook/mms-tts-eng"
	    )
	    gr.Markdown(
	        "Энд англи дээр өгүүлбэрээ бичээд Яриаг үүсгэ товчийг дарж англи яриаг сонсоорой. \n\n"
	        "Model: `facebook/mms-tts-eng` (MMS-TTS, VITS-based)."
	    )

	    with gr.Row():
	        # Left column: what to say, and the button that starts synthesis.
	        with gr.Column(scale=2):
	            text_input = gr.Textbox(
	                lines=3,
	                label="Яриа болгох англи өгүүлбэр",
	                placeholder="Жишээ: Hello, this is my text-to-speech demo",
	            )
	            generate_button = gr.Button("Яриаг үүсгэнэ үү", variant="primary")

	        # Right column: player for the generated recording.  The handler
	        # returns a path string, hence type="filepath".
	        with gr.Column(scale=1):
	            audio_output = gr.Audio(type="filepath", label="Үүссэн бичлэг")

	    # Wire the button: textbox content in, WAV file path out.
	    generate_button.click(generate_speech, text_input, audio_output)

	# Start the app only when this file is executed as a script.
	# NOTE(review): ssr_mode=False presumably turns off Gradio's server-side
	# rendering — confirm against the installed Gradio version.
	if __name__ == "__main__":
	    demo.launch(ssr_mode=False)