NMachine committed on
Commit
de3e327
·
verified ·
1 Parent(s): 390ed5e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +62 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import gradio as gr

# Pick a device once at startup. The Maya1 LLM is placed automatically by
# device_map="auto", but the SNAC vocoder must be moved explicitly — the
# original hard-coded .to("cuda") crashed on CPU-only hosts.
SNAC_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Maya1 text-to-speech LLM, loaded in bfloat16 and sharded across available
# devices (device_map="auto" requires the `accelerate` package).
model = AutoModelForCausalLM.from_pretrained(
    "maya-research/maya1",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1")

# SNAC neural audio codec (24 kHz variant) decodes the LLM's audio tokens
# into a waveform; eval mode since it is inference-only here.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(SNAC_DEVICE)
15
+
16
# SNAC audio-token id range in the Maya1 vocabulary, per-stream codebook
# size, and number of tokens emitted per codec frame.
_SNAC_MIN_ID = 128266
_SNAC_MAX_ID = 156937
_CODEBOOK_SIZE = 4096
_TOKENS_PER_FRAME = 7


def _unpack_snac_frames(snac_tokens):
    """Regroup a flat list of SNAC token ids into the codec's 3 code streams.

    Each 7-token frame contributes 1 code to the coarse stream, 2 to the
    middle stream and 4 to the fine stream (interleaved as
    [c0, c1, c2, c2, c1, c2, c2]); a trailing partial frame is dropped.
    Returns three lists of ints in [0, _CODEBOOK_SIZE).
    """
    codes = ([], [], [])
    full_frames = len(snac_tokens) // _TOKENS_PER_FRAME
    for i in range(full_frames):
        frame = snac_tokens[i * _TOKENS_PER_FRAME:(i + 1) * _TOKENS_PER_FRAME]
        s = [(t - _SNAC_MIN_ID) % _CODEBOOK_SIZE for t in frame]
        codes[0].append(s[0])
        codes[1].extend((s[1], s[4]))
        codes[2].extend((s[2], s[3], s[5], s[6]))
    return codes


# Main generation function
def generate_voice(description, text):
    """Synthesize `text` spoken in a voice matching `description`.

    Runs the Maya1 LLM to produce SNAC audio tokens, decodes them with the
    SNAC vocoder, and writes a 24 kHz mono WAV file.

    Args:
        description: free-text voice description embedded in the prompt.
        text: the text to speak.

    Returns:
        Path of the generated WAV file ("output.wav").

    Raises:
        gr.Error: if the model emitted no complete audio frame (decoding an
            empty tensor would otherwise crash inside SNAC).
    """
    prompt = f'<description="{description}"> {text}'
    # Send inputs wherever device_map="auto" placed the model, instead of
    # assuming "cuda" — this also works on CPU-only hosts.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.4,
            top_p=0.9,
            do_sample=True,
        )

    # Keep only newly generated ids that fall in the SNAC audio-token range.
    generated_ids = outputs[0, inputs["input_ids"].shape[1]:]
    snac_tokens = [t.item() for t in generated_ids
                   if _SNAC_MIN_ID <= t <= _SNAC_MAX_ID]

    codes = _unpack_snac_frames(snac_tokens)
    if not codes[0]:
        raise gr.Error("The model produced no audio tokens — try different "
                       "text or voice description.")

    # Build the code tensors on the device the SNAC model actually lives on.
    snac_device = next(snac_model.parameters()).device
    codes_tensor = [
        torch.tensor(c, dtype=torch.long, device=snac_device).unsqueeze(0)
        for c in codes
    ]
    with torch.inference_mode():
        audio = snac_model.decoder(
            snac_model.quantizer.from_codes(codes_tensor)
        )[0, 0].cpu().numpy()

    out_path = "output.wav"
    sf.write(out_path, audio, 24000)  # snac_24khz model -> 24000 Hz WAV
    return out_path
48
+
49
# Gradio interface — no preset text, fully user-controlled
_voice_input = gr.Textbox(
    label="Voice Description (e.g., calm female voice with British accent)"
)
_text_input = gr.Textbox(label="Text to Speak (type anything you want)")
_audio_output = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=generate_voice,
    inputs=[_voice_input, _text_input],
    outputs=_audio_output,
    title="🎙️ Maya1 Voice Generator",
    description="Generate expressive emotional speech using the open-source Maya1 + SNAC pipeline.",
)

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
torch
transformers
accelerate  # required by device_map="auto" in app.py
soundfile
snac
gradio