|
|
import gradio as gr |
|
|
import asyncio |
|
|
import io |
|
|
import sys |
|
|
sys.path.insert(0, '.') |
|
|
|
|
|
|
|
|
# ``spaces`` is an optional dependency: when it cannot be imported, install a
# no-op stand-in so ``@spaces.GPU`` keeps working.
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Minimal stand-in for the ``spaces`` package.

        Only the ``GPU`` decorator is provided, and it does nothing.
        """

        @staticmethod
        def GPU(func=None, **_options):
            """Return the decorated function unchanged.

            Supports both decorator spellings:

            * bare: ``@spaces.GPU``
            * called with keyword options: ``@spaces.GPU(duration=60)``

            Any keyword options are accepted and ignored.
            """
            if func is None:
                # Called as ``@spaces.GPU(...)``: hand back a pass-through
                # decorator that will receive the function next.
                return lambda wrapped: wrapped
            return func

    spaces = SpacesMock()
|
|
|
|
|
from maya1.model_loader import Maya1Model |
|
|
from maya1.pipeline import Maya1Pipeline |
|
|
from maya1.prompt_builder import Maya1PromptBuilder |
|
|
from maya1.snac_decoder import SNACDecoder |
|
|
from maya1.constants import AUDIO_SAMPLE_RATE |
|
|
|
|
|
|
|
|
# Built-in voice presets, keyed by the label shown in the dropdown.
# Each entry holds:
#   "description"  - the voice description text for that character
#   "example_text" - sample text (with emotion tags) for that character
PRESET_CHARACTERS = {
    label: {"description": voice, "example_text": sample}
    for label, voice, sample in (
        (
            "Realistic: Sarcastic Male (American)",
            "Realistic male voice in the 30s age with a american accent. Low pitch, nasally timbre, conversational pacing, sarcastic tone delivery at low intensity, commercial domain, product_demo_voice role, formal delivery",
            "<sarcastic> He really stood up there and said we need to <chuckle> save the world. <sigh> What a joke.",
        ),
        (
            "Realistic: Excited Female (Asian-American)",
            "Realistic female voice in the 20s age with a asian_american accent. Normal pitch, smooth timbre, conversational pacing, neutral tone delivery at high intensity, viral_content domain, meme_voice role, formal delivery",
            "<excited> I am issuing a formal commendation for this particular item! It has exceeded all established metrics for excellence. <gasp> This is something I would actually spend my own money on. <laugh> Seriously!",
        ),
        (
            "Creative: Alpha Leader (Indian)",
            "Creative, alpha character. Male voice in their 30s with a indian accent. Normal pitch, nasally timbre, very_fast pacing, energetic tone at medium intensity.",
            "<angry> I don't want to hear excuses, I only want to see solutions! <sigh> Get your teams together, brainstorm for thirty minutes, and come back to me with a plan. <excited> Now move!",
        ),
        (
            "Creative: Vampire (Middle Eastern)",
            "Creative, vampire character. Male voice in their 40s with a middle_eastern accent. Low pitch, nasally timbre, very_slow pacing, excited tone at medium intensity.",
            "<whisper> Soon you will join me in this magnificent eternal darkness. <laugh> And we shall feast upon the world together, <excited> bound by this exquisite night forever. <mischievous>",
        ),
    )
}
|
|
|
|
|
|
|
|
# Lazily created singletons; all start unset and are populated by
# load_models() on first use.
model = prompt_builder = snac_decoder = pipeline = None

# Flipped to True by load_models() once every component above is ready.
models_loaded = False
|
|
|
|
|
def load_models():
    """Create the Maya1 model, prompt builder, SNAC decoder and pipeline once.

    The objects are stored in the module-level globals ``model``,
    ``prompt_builder``, ``snac_decoder`` and ``pipeline``. After a successful
    run ``models_loaded`` is set to True and later calls return immediately.
    """
    global model, prompt_builder, snac_decoder, pipeline, models_loaded

    # Already initialised by an earlier call: nothing to do.
    if models_loaded:
        return

    print("Loading Maya1 model with vLLM...")
    model_settings = {
        "model_path": "maya-research/maya1",
        "dtype": "bfloat16",
        "max_model_len": 8192,
        "gpu_memory_utilization": 0.85,
    }
    model = Maya1Model(**model_settings)

    print("Initializing prompt builder...")
    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)

    print("Loading SNAC decoder...")
    decoder_settings = {"device": "cuda", "enable_batching": False}
    snac_decoder = SNACDecoder(**decoder_settings)

    print("Initializing pipeline...")
    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)

    # Only mark as loaded after every component was built without raising.
    models_loaded = True
    print("Models loaded successfully!")
|
|
|
|
|
def preset_selected(preset_name):
    """Return ``(description, example_text)`` for the chosen preset.

    Unknown preset names yield a pair of empty strings.
    """
    entry = PRESET_CHARACTERS.get(preset_name)
    if entry is None:
        return "", ""
    return entry["description"], entry["example_text"]
|
|
|
|
|
@spaces.GPU
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """Generate emotional speech from description and text using vLLM.

    Args:
        preset_name: Label of the selected preset. Its stored description is
            used only as a fallback when ``description`` is empty.
        description: Voice description text.
        text: Text to speak (may contain emotion tags).
        temperature: Sampling temperature forwarded to the pipeline.
        max_tokens: Token budget forwarded to the pipeline.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path of
        a temporary mono 16-bit WAV file, or ``None`` when generation failed;
        in that case ``status_message`` describes the error.
    """
    try:
        load_models()

        # Prefer what the user typed. Unconditionally overwriting it with the
        # preset text made custom voice descriptions impossible, because the
        # dropdown always has a preset selected.
        if not description and preset_name and preset_name in PRESET_CHARACTERS:
            description = PRESET_CHARACTERS[preset_name]["description"]

        if not description or not text:
            return None, "Error: Please provide both description and text!"

        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

        # The pipeline API is a coroutine; drive it on a private event loop and
        # always close that loop, even when generation raises.
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            audio_bytes = loop.run_until_complete(
                pipeline.generate_speech(
                    description=description,
                    text=text,
                    temperature=temperature,
                    top_p=0.9,
                    max_tokens=max_tokens,
                    repetition_penalty=1.1,
                    seed=None,
                )
            )
        finally:
            loop.close()

        # Treat "no result" and "empty result" alike: neither is playable.
        if audio_bytes is None or len(audio_bytes) == 0:
            return None, "Error: Audio generation failed. Try different text or increase max_tokens."

        import tempfile
        import wave

        # The audio output component is configured with type="filepath", so
        # hand it a real file on disk rather than an in-memory buffer.
        # delete=False keeps the file around after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            with wave.open(tmp_file, 'wb') as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)  # 2 bytes per sample
                wav_file.setframerate(AUDIO_SAMPLE_RATE)
                wav_file.writeframes(audio_bytes)
            audio_path = tmp_file.name

        # Two bytes per sample, one channel: samples / rate = seconds.
        duration = len(audio_bytes) // 2 / AUDIO_SAMPLE_RATE
        status_msg = f"Generated {duration:.2f}s of emotional speech!"

        return audio_path, status_msg

    except Exception as e:
        # Top-level UI boundary: report the failure in the status box instead
        # of letting the exception escape.
        import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
|
|
|
|
|
|
|
|
# NOTE(review): the original indentation of this section was not recoverable;
# the nesting below (both sliders inside the accordion, the emotions/tips
# Markdown inside the right-hand column) is inferred - confirm against the
# intended layout.

# Resolve the first preset once; it seeds the dropdown and both text boxes.
_preset_names = list(PRESET_CHARACTERS.keys())
_default_preset = PRESET_CHARACTERS[_preset_names[0]]

with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Maya1 - Open Source Emotional Text-to-Speech

    **The best open source voice AI model with emotions!**

    Generate realistic and expressive speech with natural language voice design.
    Choose a preset character or create your own custom voice.

    [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
    """)

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            gr.Markdown("### Character Selection")

            preset_dropdown = gr.Dropdown(
                choices=_preset_names,
                label="Preset Characters",
                value=_preset_names[0],
                info="Quick pick from 4 preset characters",
            )

            gr.Markdown("### Voice Design")

            description_input = gr.Textbox(
                label="Voice Description",
                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                lines=3,
                value=_default_preset["description"],
            )

            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
                lines=4,
                value=_default_preset["example_text"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.1,
                    label="Temperature",
                    info="Lower = more stable, Higher = more creative",
                )

                max_tokens_slider = gr.Slider(
                    minimum=100,
                    maximum=2048,
                    value=500,
                    step=50,
                    label="Max Tokens",
                    info="More tokens = longer audio",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        # Right column: outputs and usage help.
        with gr.Column(scale=1):
            gr.Markdown("### Generated Audio")

            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )

            status_output = gr.Textbox(
                label="Status",
                lines=3,
                interactive=False,
            )

            gr.Markdown("""
            ### Supported Emotions

            `<angry>` `<appalled>` `<chuckle>` `<cry>` `<curious>` `<disappointed>`
            `<excited>` `<exhale>` `<gasp>` `<giggle>` `<gulp>` `<laugh>`
            `<laugh_harder>` `<mischievous>` `<sarcastic>` `<scream>` `<sigh>`
            `<sing>` `<snort>` `<whisper>`

            ### Tips
            - Use emotion tags naturally in your text
            - Longer text needs more max_tokens
            - Lower temperature for consistent results
            - Presets are great starting points!
            """)

    # Picking a preset refreshes the description and the sample text.
    preset_dropdown.change(
        fn=preset_selected,
        inputs=[preset_dropdown],
        outputs=[description_input, text_input],
    )

    # The button runs synthesis and fills the audio player and status box.
    generate_btn.click(
        fn=generate_speech,
        inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
        outputs=[audio_output, status_output],
    )

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|