Spaces:
Running
Running
File size: 2,296 Bytes
6ddd7e4 743b8f5 b5b5082 a8f789a 7890f41 743b8f5 7b8a54e ad0617c 3c03d8e 7b8a54e ad0617c 743b8f5 b5b5082 72db052 743b8f5 b5b5082 743b8f5 b5b5082 7890f41 b5b5082 a8f789a ea7eb85 b5b5082 743b8f5 b5b5082 58068be b5b5082 e30854e b5b5082 3f3d89a b5b5082 ea7eb85 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import spaces
import os, sys, logging
sys.path.append("neutts-air")
from neuttsair.neutts import NeuTTSAir
import numpy as np
import gradio as gr
# Default texts pre-filled into the UI text boxes.
DEFAULT_GEN_TEXT = "My name is Dave, and um, I'm from London."
# Presumably the transcript of the bundled dave.wav clip — verify against the sample.
DEFAULT_REF_TEXT = (
    "So I'm live on radio. And I say, well, my dear friend James here clearly, "
    "and the whole room just froze. Turns out I'd completely misspoken and "
    "mentioned our other friend."
)

# Sample clips are looked up in the neutts-air checkout, relative to the
# current working directory (not relative to this file).
SAMPLES_PATH = os.path.join(os.getcwd(), "neutts-air", "samples")
DEFAULT_REF_PATH = os.path.join(SAMPLES_PATH, "dave.wav")
# Root logger: INFO and above, timestamped, written to stdout instead of the
# default stderr stream.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# One module-level NeuTTSAir instance, created at import time and reused by
# every call to infer(). Both the backbone and the codec are configured for CPU.
tts = NeuTTSAir(
    backbone_repo="neuphonic/neutts-air",
    codec_repo="neuphonic/neucodec",
    backbone_device="cpu",
    codec_device="cpu",
)
@spaces.GPU()
def infer(
    ref_text: str,
    ref_audio_path: str,
    gen_text: str,
) -> tuple[int, np.ndarray]:
    """
    Generates speech using NeuTTS-Air given a reference audio and text, and new text to synthesize.

    Args:
        ref_text (str): The text corresponding to the reference audio.
        ref_audio_path (str): The file path to the reference audio.
        gen_text (str): The new text to synthesize.

    Returns:
        tuple [int, np.ndarray]: A tuple containing the sample rate (24000) and the generated audio waveform as a numpy array.
    """
    # NOTE: the docstring text above is deliberately left unchanged: with
    # mcp_server=True Gradio exposes it as the tool description. Error
    # behaviour is therefore documented here instead:
    #   raises gr.Error when no reference audio is supplied or when the text
    #   to generate is empty / whitespace-only.

    # The audio component yields None once the user clears it; fail early with
    # a readable message instead of an opaque error from the encoder.
    if not ref_audio_path:
        raise gr.Error("Please provide a reference audio sample.")
    if not gen_text or not gen_text.strip():
        raise gr.Error("Please enter the text to generate.")

    sample_rate = 24_000  # output rate reported to the Gradio audio component

    logging.info("Using reference: %s", ref_audio_path)
    gr.Info("Starting inference request!")

    gr.Info("Encoding reference...")
    ref_codes = tts.encode_reference(ref_audio_path)

    gr.Info(f"Generating audio for input text: {gen_text}")
    wav = tts.infer(gen_text, ref_codes, ref_text)

    return (sample_rate, wav)
# Gradio UI: three inputs (reference transcript, reference clip, text to
# speak) mapped positionally onto infer()'s parameters, one audio output.
demo = gr.Interface(
    title="NeuTTS-Air☁️",
    description="Upload a reference audio sample, provide the reference text, and enter new text to synthesize.",
    fn=infer,
    inputs=[
        gr.Textbox(value=DEFAULT_REF_TEXT, label="Reference Text"),
        # type="filepath" hands infer() a path string rather than raw samples.
        gr.Audio(value=DEFAULT_REF_PATH, label="Reference Audio", type="filepath"),
        gr.Textbox(value=DEFAULT_GEN_TEXT, label="Text to Generate"),
    ],
    # type="numpy" matches the (sample_rate, waveform) tuple infer() returns.
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)
# Script entry point: start the app only when run directly, not on import.
if __name__ == "__main__":
    # allowed_paths lets Gradio serve the bundled sample clips; mcp_server
    # additionally exposes the interface as an MCP tool; inbrowser opens a
    # browser tab on launch.
    demo.launch(allowed_paths=[SAMPLES_PATH], mcp_server=True, inbrowser=True)