# kitten-tts-mini / app.py
# Last commit: "Update app.py" by mpasila (816ad52, verified)
import gradio as gr
import numpy as np
import soundfile as sf
import tempfile
import os
from kittentts import KittenTTS
# Load the TTS model once at import time; the module-level instance is
# what the rest of this file refers to as `tts_model`.
_MODEL_REPO = "KittenML/kitten-tts-mini-0.1"

print("Loading Kitten TTS Mini model...")
tts_model = KittenTTS(_MODEL_REPO)
print("Model loaded successfully!")
# Voice identifiers listed in the README: styles 2 through 5, each with a
# male ("m") and a female ("f") variant, in that order.
AVAILABLE_VOICES = [
    f"expr-voice-{style_number}-{gender}"
    for style_number in range(2, 6)
    for gender in ("m", "f")
]
def generate_speech(text, voice):
    """Generate speech from text using Kitten TTS Mini.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected, as is text longer than 457 characters.
        voice: Voice identifier passed through to ``tts_model.generate``.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of a WAV file written with a sample rate of 24000, or ``None`` when
        validation or synthesis fails. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    # Treat a missing value the same as empty text instead of crashing on
    # None.strip()
    if text is None or not text.strip():
        return None, "Please enter some text to synthesize."
    # Check character limit
    if len(text) > 457:
        return None, f"❌ Text is too long ({len(text)} characters). Please limit to 457 characters or less."
    tmp_path = None
    try:
        # Generate audio
        print(f"Generating audio for: '{text[:50]}...' with voice: {voice}")
        audio = tts_model.generate(text, voice=voice)
        # Reserve a temporary path, then close the handle BEFORE soundfile
        # opens the file by name (an open NamedTemporaryFile cannot be
        # reopened by name on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            tmp_path = tmp_file.name
        sf.write(tmp_path, audio, 24000)
        return tmp_path, f"✅ Successfully generated audio with {voice} ({len(text)} characters)"
    except Exception as e:
        # Best-effort cleanup so a failed write does not leave an orphaned
        # temp file behind; the original error is still what gets reported.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        error_msg = f"❌ Error generating audio: {str(e)}"
        print(error_msg)
        return None, error_msg
def create_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` layout containing a header, an info box, the text
    and voice inputs, a generate button, status/audio outputs, example
    inputs, and a footer. Both the button's click event and the textbox's
    submit event call ``generate_speech``.

    Returns:
        The constructed ``gr.Blocks`` instance; it is not launched here.
    """
    with gr.Blocks(
        title="🐱 Kitten TTS Mini",
        theme=gr.themes.Soft(),
        css="""
        .main-header {
            text-align: center;
            margin-bottom: 2rem;
        }
        .info-box {
            background: var(--background-fill-secondary);
            color: var(--body-text-color);
            padding: 1rem;
            border-radius: 10px;
            border-left: 4px solid #4285f4;
            margin: 1rem 0;
        }
        .info-box h3, .info-box h4 {
            color: var(--body-text-color) !important;
            margin-top: 0;
        }
        .info-box ul, .info-box li, .info-box p {
            color: var(--body-text-color) !important;
        }
        .footer-box {
            background: var(--background-fill-secondary);
            color: var(--body-text-color);
            padding: 1rem;
            border-radius: 10px;
            margin: 2rem 0;
            text-align: center;
        }
        .footer-box p, .footer-box a {
            color: var(--body-text-color) !important;
        }
        .footer-box a:hover {
            color: #4285f4 !important;
        }
        """
    ) as demo:
        # Header
        gr.HTML("""
        <div class="main-header">
            <h1>🐱 Kitten TTS Mini 0.1</h1>
            <p>Open-source realistic text-to-speech with 80M parameters</p>
        </div>
        """)
        # Info box
        gr.HTML("""
        <div class="info-box">
            <h3>ℹ️ About Kitten TTS Mini</h3>
            <ul>
                <li><strong>Parameters:</strong> 80 million</li>
                <li><strong>File size:</strong> ~170MB</li>
                <li><strong>Sample rate:</strong> 24kHz</li>
                <li><strong>Voices:</strong> 8 different voices (male &amp; female)</li>
            </ul>
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=2):
                # Input text; max_length mirrors the 457-character limit
                # that generate_speech enforces.
                text_input = gr.Textbox(
                    label="📝 Text to Synthesize (max 457 characters)",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                    max_length=457,
                    show_label=True,
                    info="Character limit: 457"
                )
                # Voice selection
                voice_dropdown = gr.Dropdown(
                    choices=AVAILABLE_VOICES,
                    value='expr-voice-2-f',
                    label="🎭 Voice Selection",
                    info="Choose from available voices"
                )
                # Generate button
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg"
                )
            with gr.Column(scale=1):
                # Voice descriptions
                gr.HTML("""
                <div class="info-box">
                    <h4>🎭 Voice Guide</h4>
                    <p><strong>Format:</strong> expr-voice-{number}-{gender}</p>
                    <ul>
                        <li><strong>Numbers 2-5:</strong> Different voice styles</li>
                        <li><strong>m:</strong> Male voices</li>
                        <li><strong>f:</strong> Female voices</li>
                    </ul>
                </div>
                """)
        # Output section
        with gr.Row():
            with gr.Column():
                status_output = gr.Textbox(
                    label="📊 Status",
                    interactive=False
                )
                audio_output = gr.Audio(
                    label="🎵 Generated Audio",
                    type="filepath"
                )
        # Example inputs
        gr.Examples(
            examples=[
                ["Hello! This is Kitten TTS Mini, a high quality text-to-speech model.", "expr-voice-2-f"],
                ["Welcome to the world of open-source artificial intelligence and speech synthesis.", "expr-voice-3-m"],
                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.", "expr-voice-4-f"],
                ["Kitten TTS works without requiring a GPU, making it accessible for everyone to use.", "expr-voice-5-m"],
                ["Science and technology are advancing rapidly, bringing us closer to a better future.", "expr-voice-2-m"]
            ],
            inputs=[text_input, voice_dropdown],
            label="💡 Example Texts"
        )
        # Footer
        gr.HTML("""
        <div class="footer-box">
            <p><strong>🐱 Kitten TTS Mini</strong> | Built with ❤️ by the KittenML team</p>
            <p>Based on StyleTTS 2 architecture | Licensed under Apache 2.0</p>
            <p><a href="https://huggingface.co/KittenML/kitten-tts-mini-0.1" target="_blank">Model Card</a> |
            <a href="https://github.com/KittenML/KittenTTS" target="_blank">GitHub</a></p>
        </div>
        """)
        # Event handlers: the button click and the textbox submit event are
        # wired identically, so register both from one place.
        for register_event in (generate_btn.click, text_input.submit):
            register_event(
                fn=generate_speech,
                inputs=[text_input, voice_dropdown],
                outputs=[audio_output, status_output],
                show_progress=True
            )
    return demo
if __name__ == "__main__":
    # Options for serving the app: bind to all interfaces on port 7860,
    # no public share link, and surface errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    # Build the interface, then start the server
    demo = create_interface()
    demo.launch(**launch_options)