File size: 7,460 Bytes
97f8412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816ad52
 
 
 
97f8412
 
 
 
 
 
 
 
816ad52
97f8412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816ad52
 
97f8412
 
 
 
 
816ad52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97f8412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c649d3b
97f8412
 
c649d3b
 
 
 
97f8412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816ad52
97f8412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import gradio as gr
import numpy as np
import soundfile as sf
import tempfile
import os
from kittentts import KittenTTS

# Load the Kitten TTS Mini model once at import time; generate_speech()
# reuses this module-level instance for every request.
_MODEL_ID = "KittenML/kitten-tts-mini-0.1"

print("Loading Kitten TTS Mini model...")
tts_model = KittenTTS(_MODEL_ID)
print("Model loaded successfully!")

# Voice identifiers offered in the UI (per the original note, taken from the
# model README). Names follow the pattern expr-voice-{number}-{gender}, with
# numbers 2-5 and the male voice listed before the female one for each number.
AVAILABLE_VOICES = [
    f"expr-voice-{style_number}-{gender}"
    for style_number in range(2, 6)
    for gender in ("m", "f")
]

def generate_speech(text, voice):
    """Generate speech from text using Kitten TTS Mini.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected, as is anything longer than 457 characters.
        voice: Voice identifier, passed through unchanged to
            ``tts_model.generate``.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of a newly written WAV file on success and ``None`` on any failure;
        ``status_message`` is a human-readable string for the status box.
    """
    max_chars = 457
    sample_rate = 24000  # Hz; matches the 24kHz figure shown in the UI info box
    preview_chars = 50

    # Guard against None as well as blank input: calling .strip() on None
    # would raise instead of producing a friendly message.
    if text is None or not text.strip():
        return None, "Please enter some text to synthesize."

    # Check character limit
    if len(text) > max_chars:
        return None, f"❌ Text is too long ({len(text)} characters). Please limit to {max_chars} characters or less."

    wav_path = None
    try:
        # Only add an ellipsis to the log preview when text was actually cut.
        preview = text if len(text) <= preview_chars else f"{text[:preview_chars]}..."
        print(f"Generating audio for: '{preview}' with voice: {voice}")
        audio = tts_model.generate(text, voice=voice)

        # Reserve a unique path, then close the handle BEFORE writing:
        # reopening a still-open NamedTemporaryFile by name fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            wav_path = tmp_file.name
        sf.write(wav_path, audio, sample_rate)

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        # Remove the reserved temp file so failed requests do not pile up
        # orphaned files (delete=False means nothing else cleans it up).
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass
        error_msg = f"❌ Error generating audio: {str(e)}"
        print(error_msg)
        return None, error_msg

    return wav_path, f"✅ Successfully generated audio with {voice} ({len(text)} characters)"

def create_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app with a header, an info box, a text input and
    voice dropdown, a generate button, status and audio outputs, example
    inputs, and a footer. Both the button click and pressing Enter in the
    text box call ``generate_speech``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # Single source for the limit shown in the label, the info text and
    # enforced by the textbox; generate_speech applies the same 457 limit.
    max_chars = 457

    custom_css = """
        .main-header {
            text-align: center;
            margin-bottom: 2rem;
        }
        .info-box {
            background: var(--background-fill-secondary);
            color: var(--body-text-color);
            padding: 1rem;
            border-radius: 10px;
            border-left: 4px solid #4285f4;
            margin: 1rem 0;
        }
        .info-box h3, .info-box h4 {
            color: var(--body-text-color) !important;
            margin-top: 0;
        }
        .info-box ul, .info-box li, .info-box p {
            color: var(--body-text-color) !important;
        }
        .footer-box {
            background: var(--background-fill-secondary);
            color: var(--body-text-color);
            padding: 1rem;
            border-radius: 10px;
            margin: 2rem 0;
            text-align: center;
        }
        .footer-box p, .footer-box a {
            color: var(--body-text-color) !important;
        }
        .footer-box a:hover {
            color: #4285f4 !important;
        }
        """

    with gr.Blocks(
        title="🐱 Kitten TTS Mini",
        theme=gr.themes.Soft(),
        css=custom_css
    ) as demo:

        # Header
        gr.HTML("""
        <div class="main-header">
            <h1>🐱 Kitten TTS Mini 0.1</h1>
            <p>Open-source realistic text-to-speech with 80M parameters</p>
        </div>
        """)

        # Info box
        gr.HTML("""
        <div class="info-box">
            <h3>ℹ️ About Kitten TTS Mini</h3>
            <ul>
                <li><strong>Parameters:</strong> 80 million</li>
                <li><strong>File size:</strong> ~170MB</li>
                <li><strong>Sample rate:</strong> 24kHz</li>
                <li><strong>Voices:</strong> 8 different voices (male & female)</li>
            </ul>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Input text
                text_input = gr.Textbox(
                    label=f"📝 Text to Synthesize (max {max_chars} characters)",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=3,
                    max_lines=10,
                    max_length=max_chars,
                    show_label=True,
                    info=f"Character limit: {max_chars}"
                )

                # Voice selection
                voice_dropdown = gr.Dropdown(
                    choices=AVAILABLE_VOICES,
                    value='expr-voice-2-f',
                    label="🎭 Voice Selection",
                    info="Choose from available voices"
                )

                # Generate button
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # Voice descriptions
                gr.HTML("""
                <div class="info-box">
                    <h4>🎭 Voice Guide</h4>
                    <p><strong>Format:</strong> expr-voice-{number}-{gender}</p>
                    <ul>
                        <li><strong>Numbers 2-5:</strong> Different voice styles</li>
                        <li><strong>m:</strong> Male voices</li>
                        <li><strong>f:</strong> Female voices</li>
                    </ul>
                </div>
                """)

        # Output section
        with gr.Row():
            with gr.Column():
                status_output = gr.Textbox(
                    label="📊 Status",
                    interactive=False
                )

                audio_output = gr.Audio(
                    label="🎵 Generated Audio",
                    type="filepath"
                )

        # Example inputs (clicking one fills the text box and voice dropdown)
        gr.Examples(
            examples=[
                ["Hello! This is Kitten TTS Mini, a high quality text-to-speech model.", "expr-voice-2-f"],
                ["Welcome to the world of open-source artificial intelligence and speech synthesis.", "expr-voice-3-m"],
                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.", "expr-voice-4-f"],
                ["Kitten TTS works without requiring a GPU, making it accessible for everyone to use.", "expr-voice-5-m"],
                ["Science and technology are advancing rapidly, bringing us closer to a better future.", "expr-voice-2-m"]
            ],
            inputs=[text_input, voice_dropdown],
            label="💡 Example Texts"
        )

        # Footer
        gr.HTML("""
        <div class="footer-box">
            <p><strong>🐱 Kitten TTS Mini</strong> | Built with ❤️ by the KittenML team</p>
            <p>Based on StyleTTS 2 architecture | Licensed under Apache 2.0</p>
            <p><a href="https://huggingface.co/KittenML/kitten-tts-mini-0.1" target="_blank">Model Card</a> | 
               <a href="https://github.com/KittenML/KittenTTS" target="_blank">GitHub</a></p>
        </div>
        """)

        # Event handlers: the button click and the Enter key in the text box
        # trigger the same generation call, so wire both from one definition
        # to keep them from drifting apart.
        for register_handler in (generate_btn.click, text_input.submit):
            register_handler(
                fn=generate_speech,
                inputs=[text_input, voice_dropdown],
                outputs=[audio_output, status_output],
                show_progress=True
            )

    return demo

if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on every network
    # interface at port 7860 without a public share link, surfacing errors
    # in the browser.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)