File size: 3,117 Bytes
6ddd7e4
743b8f5
b5b5082
 
a8f789a
7890f41
f4b1b5b
7890f41
743b8f5
7b8a54e
ad0617c
3c03d8e
7b8a54e
ad0617c
743b8f5
 
 
 
 
 
b5b5082
72db052
743b8f5
b5b5082
743b8f5
b5b5082
7890f41
f4b1b5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5b5082
a8f789a
 
f4b1b5b
 
a8f789a
ea7eb85
 
 
 
f4b1b5b
ea7eb85
 
 
 
 
b5b5082
f4b1b5b
 
 
 
 
 
 
743b8f5
b5b5082
 
 
 
58068be
 
b5b5082
 
 
 
 
 
e30854e
f4b1b5b
 
b5b5082
 
3f3d89a
b5b5082
 
 
 
ea7eb85
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import spaces
import os, sys, logging
sys.path.append("neutts-air")
from neuttsair.neutts import NeuTTSAir
import numpy as np
import gradio as gr
from groq import Groq


# Sample assets live under ./neutts-air/samples, resolved against the
# current working directory at import time.
SAMPLES_PATH = os.path.join(os.getcwd(), "neutts-air", "samples")
DEFAULT_REF_PATH = os.path.join(SAMPLES_PATH, "dave.wav")

# Default reference text, used together with the default reference clip.
DEFAULT_REF_TEXT = (
    "So I'm live on radio. And I say, well, my dear friend James here clearly, "
    "and the whole room just froze. Turns out I'd completely misspoken and "
    "mentioned our other friend."
)
# Text pre-filled in the "text to generate" input.
DEFAULT_GEN_TEXT = "My name is Dave, and um, I'm from London."

# Route INFO-and-above records to stdout, prefixed with timestamp and level.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Single shared model instance, created once at import time.
# Both the backbone and the codec are requested on the CPU device.
tts = NeuTTSAir(
    backbone_repo="neuphonic/neutts-air",
    codec_repo="neuphonic/neucodec",
    backbone_device="cpu",
    codec_device="cpu",
)



def transcribe(file_path: str) -> str:
    """
    Transcribes an audio file to text using the Groq transcription API.

    The request uses the "whisper-large-v3-turbo" model with temperature 0
    and the "verbose_json" response format.

    Args:
        file_path (str): The file path of the audio to transcribe.
    Returns:
        str: The transcribed text. An empty string is returned (and a warning
            is logged) when the service yields no text.
    """
    # NOTE(review): Groq() is built without arguments, so credentials are
    # presumably picked up from the environment -- confirm for deployment.
    client = Groq()

    # Read the audio first so the file handle is closed before the
    # (potentially slow) network request is made.
    with open(file_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    transcription = client.audio.transcriptions.create(
        file=(file_path, audio_bytes),
        model="whisper-large-v3-turbo",
        temperature=0,
        response_format="verbose_json",
    )

    # Normalise a missing text field to "" so callers always get a string.
    text = transcription.text or ""
    if not text:
        # logging.warn is a deprecated alias; logging.warning is the supported call.
        logging.warning("Error while transcripting the reference audio.")
    return text

@spaces.GPU()
def infer(
    gen_text: str,
    ref_text: str = DEFAULT_REF_TEXT,
    ref_audio_path: str = DEFAULT_REF_PATH,
) -> tuple[int, np.ndarray]:
    """
    Generates speech using NeuTTS-Air given a reference audio and text, and new text to synthesize.

    If a non-default reference audio is supplied while the reference text is
    still the default one, the default text is discarded. Whenever the
    reference text ends up empty, it is obtained by transcribing the
    reference audio.

    Args:
        gen_text (str): The new text to synthesize.
        ref_text (str): The text corresponding to the reference audio.
        ref_audio_path (str): The file path to the reference audio.
    Returns:
        tuple [int, np.ndarray]: A tuple containing the sample rate (24000) and the generated audio waveform as a numpy array.
    Raises:
        gr.Error: If `gen_text` is missing/blank or no reference audio is provided.
            (`gr.Error` subclasses `Exception`, so existing handlers still apply.)
    """

    # Validate inputs up front so the user gets a readable message instead of
    # a TypeError from deeper in the pipeline.
    if gen_text is None or not gen_text.strip():
        raise gr.Error("Please insert the new text to synthesize.")
    if not ref_audio_path:
        # Happens when the audio input is cleared in the UI.
        raise gr.Error("Please provide a reference audio file.")

    # A custom clip paired with the untouched default text means the text
    # does not describe the clip; drop it so it gets transcribed below.
    # NOTE(review): this relies on the UI handing back exactly
    # DEFAULT_REF_PATH for the default clip -- confirm it is not rewritten
    # to a cached copy.
    if ref_audio_path != DEFAULT_REF_PATH and ref_text == DEFAULT_REF_TEXT:
        ref_text = ""
    if ref_text is None or not ref_text.strip():
        ref_text = transcribe(ref_audio_path)

    logging.info("Using reference: %s", ref_audio_path)
    gr.Info("Starting inference request!")
    gr.Info("Encoding reference...")
    ref_codes = tts.encode_reference(ref_audio_path)

    gr.Info(f"Generating audio for input text: {gen_text}")
    wav = tts.infer(gen_text, ref_codes, ref_text)

    return (24_000, wav)

# Input components, listed in the same order as the parameters of `infer`.
_interface_inputs = [
    gr.Textbox(value=DEFAULT_GEN_TEXT, label="Text to Generate"),
    gr.Textbox(value=DEFAULT_REF_TEXT, label="Reference Text (Optional)"),
    gr.Audio(value=DEFAULT_REF_PATH, label="Reference Audio", type="filepath"),
]
# `infer` returns a (sample_rate, waveform) tuple, hence the numpy audio type.
_interface_output = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    title="NeuTTS-Air☁️",
    description="Upload a reference audio sample, provide the reference text, and enter new text to synthesize.",
    fn=infer,
    inputs=_interface_inputs,
    outputs=_interface_output,
)

if __name__ == "__main__":
    # The samples directory (which holds the default reference clip) is
    # passed as an allowed path; the MCP server and auto-open-in-browser
    # options are both switched on.
    launch_options = dict(
        allowed_paths=[SAMPLES_PATH],
        mcp_server=True,
        inbrowser=True,
    )
    demo.launch(**launch_options)