Spaces:

mpasila
/

kitten-tts-mini

Running

File size: 7,460 Bytes

import gradio as gr
import numpy as np
import soundfile as sf
import tempfile
import os
from kittentts import KittenTTS

# Load the model a single time at import; generate_speech() uses this
# module-level instance for every request.
print("Loading Kitten TTS Mini model...")
_MODEL_REPO_ID = "KittenML/kitten-tts-mini-0.1"
tts_model = KittenTTS(_MODEL_REPO_ID)
print("Model loaded successfully!")

# Available voices from the README. Identifiers have the form
# "expr-voice-<style>-<m|f>": styles 2 through 5, each with an "m" and an
# "f" variant, listed style by style.
AVAILABLE_VOICES = [
    f"expr-voice-{style}-{variant}"
    for style in range(2, 6)
    for variant in ("m", "f")
]

# Longest input accepted by generate_speech(); the text box in the UI is
# configured with the same limit.
_MAX_TEXT_CHARS = 457
# Sample rate (Hz) passed to soundfile when writing the generated audio.
_OUTPUT_SAMPLE_RATE = 24000


def generate_speech(text, voice):
    """Generate speech from text using Kitten TTS Mini.

    Args:
        text: The text to synthesize. ``None``, empty and whitespace-only
            values are rejected, as is anything longer than 457 characters.
        voice: Voice identifier forwarded to the model's ``generate`` call.

    Returns:
        A ``(audio_path, status)`` tuple. ``audio_path`` is the path of the
        WAV file that was written, or ``None`` when the input was rejected or
        generation failed. ``status`` is a human-readable message describing
        the outcome. Errors are reported through ``status`` rather than
        raised.
    """
    # ``text`` can be None when the function is called without a value;
    # treat that the same as an empty box instead of raising AttributeError.
    if text is None or not text.strip():
        return None, "Please enter some text to synthesize."

    # Check character limit
    if len(text) > _MAX_TEXT_CHARS:
        return None, f"❌ Text is too long ({len(text)} characters). Please limit to {_MAX_TEXT_CHARS} characters or less."

    try:
        # Generate audio
        print(f"Generating audio for: '{text[:50]}...' with voice: {voice}")
        audio = tts_model.generate(text, voice=voice)

        # Reserve a temp path and close our handle before soundfile opens
        # the file by name; writing to a path that is still held open fails
        # on some platforms.
        handle, wav_path = tempfile.mkstemp(suffix='.wav')
        os.close(handle)
        try:
            sf.write(wav_path, audio, _OUTPUT_SAMPLE_RATE)
        except Exception:
            # The caller never learns this path, so remove the file here
            # rather than leaving an orphan in the temp directory.
            if os.path.exists(wav_path):
                os.remove(wav_path)
            raise
    except Exception as e:
        # UI boundary: surface any failure as a status message.
        error_msg = f"❌ Error generating audio: {str(e)}"
        print(error_msg)
        return None, error_msg

    return wav_path, f"✅ Successfully generated audio with {voice} ({len(text)} characters)"

def create_interface():
    """Build the Gradio Blocks app for Kitten TTS Mini and return it.

    Components are created top to bottom in display order: header, model
    info box, the input column (text box, voice dropdown, generate button)
    beside a voice guide, the status and audio outputs, example prompts and
    a footer. Clicking the button and submitting the text box both call
    ``generate_speech``.

    Returns:
        The constructed ``gr.Blocks`` instance; it is not launched here.
    """
    # Static page assets are kept together here so the layout code below
    # stays short.
    page_css = """
        .main-header {
            text-align: center;
            margin-bottom: 2rem;
        }
        .info-box {
            background: var(--background-fill-secondary);
            color: var(--body-text-color);
            padding: 1rem;
            border-radius: 10px;
            border-left: 4px solid #4285f4;
            margin: 1rem 0;
        }
        .info-box h3, .info-box h4 {
            color: var(--body-text-color) !important;
            margin-top: 0;
        }
        .info-box ul, .info-box li, .info-box p {
            color: var(--body-text-color) !important;
        }
        .footer-box {
            background: var(--background-fill-secondary);
            color: var(--body-text-color);
            padding: 1rem;
            border-radius: 10px;
            margin: 2rem 0;
            text-align: center;
        }
        .footer-box p, .footer-box a {
            color: var(--body-text-color) !important;
        }
        .footer-box a:hover {
            color: #4285f4 !important;
        }
        """

    header_html = """
        <div class="main-header">
            <h1>🐱 Kitten TTS Mini 0.1</h1>
            <p>Open-source realistic text-to-speech with 80M parameters</p>
        </div>
        """

    about_html = """
        <div class="info-box">
            <h3>ℹ️ About Kitten TTS Mini</h3>
            <ul>
                <li><strong>Parameters:</strong> 80 million</li>
                <li><strong>File size:</strong> ~170MB</li>
                <li><strong>Sample rate:</strong> 24kHz</li>
                <li><strong>Voices:</strong> 8 different voices (male & female)</li>
            </ul>
        </div>
        """

    voice_guide_html = """
                <div class="info-box">
                    <h4>🎭 Voice Guide</h4>
                    <p><strong>Format:</strong> expr-voice-{number}-{gender}</p>
                    <ul>
                        <li><strong>Numbers 2-5:</strong> Different voice styles</li>
                        <li><strong>m:</strong> Male voices</li>
                        <li><strong>f:</strong> Female voices</li>
                    </ul>
                </div>
                """

    footer_html = """
        <div class="footer-box">
            <p><strong>🐱 Kitten TTS Mini</strong> | Built with ❤️ by the KittenML team</p>
            <p>Based on StyleTTS 2 architecture | Licensed under Apache 2.0</p>
            <p><a href="https://huggingface.co/KittenML/kitten-tts-mini-0.1" target="_blank">Model Card</a> | 
               <a href="https://github.com/KittenML/KittenTTS" target="_blank">GitHub</a></p>
        </div>
        """

    # Each row is [text, voice], matching the order of the Examples inputs.
    example_rows = [
        ["Hello! This is Kitten TTS Mini, a high quality text-to-speech model.", "expr-voice-2-f"],
        ["Welcome to the world of open-source artificial intelligence and speech synthesis.", "expr-voice-3-m"],
        ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.", "expr-voice-4-f"],
        ["Kitten TTS works without requiring a GPU, making it accessible for everyone to use.", "expr-voice-5-m"],
        ["Science and technology are advancing rapidly, bringing us closer to a better future.", "expr-voice-2-m"],
    ]

    soft_theme = gr.themes.Soft()

    with gr.Blocks(title="🐱 Kitten TTS Mini", theme=soft_theme, css=page_css) as demo:
        # Header followed by the model fact sheet
        gr.HTML(header_html)
        gr.HTML(about_html)

        with gr.Row():
            # Left: the controls the user interacts with
            with gr.Column(scale=2):
                prompt_box = gr.Textbox(
                    label="📝 Text to Synthesize (max 457 characters)",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                    max_length=457,
                    show_label=True,
                    info="Character limit: 457",
                )

                voice_picker = gr.Dropdown(
                    choices=AVAILABLE_VOICES,
                    value='expr-voice-2-f',
                    label="🎭 Voice Selection",
                    info="Choose from available voices",
                )

                run_button = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg",
                )

            # Right: explanation of the voice naming scheme
            with gr.Column(scale=1):
                gr.HTML(voice_guide_html)

        # Results: status message first, audio player beneath it
        with gr.Row():
            with gr.Column():
                status_box = gr.Textbox(
                    label="📊 Status",
                    interactive=False,
                )

                audio_player = gr.Audio(
                    label="🎵 Generated Audio",
                    type="filepath",
                )

        # Ready-made prompts that fill in the text box and voice dropdown
        gr.Examples(
            examples=example_rows,
            inputs=[prompt_box, voice_picker],
            label="💡 Example Texts",
        )

        gr.HTML(footer_html)

        # Wire the button click and the Enter key in the text box to the
        # same handler; the registration order (click, then submit) is kept.
        for register in (run_button.click, prompt_box.submit):
            register(
                fn=generate_speech,
                inputs=[prompt_box, voice_picker],
                outputs=[audio_player, status_box],
                show_progress=True,
            )

    return demo

if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7860 without a public share link, showing errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)