Spaces:

mpasila
/

kitten-tts-mini

Running

App Files Files Community

kitten-tts-mini / app.py

mpasila

Update app.py

816ad52 verified 3 months ago

raw

history blame contribute delete

7.46 kB

	import gradio as gr
	import numpy as np
	import soundfile as sf
	import tempfile
	import os
	from kittentts import KittenTTS

	# Initialize the TTS model.
	# This runs once at module level; generate_speech() reads the resulting
	# `tts_model` global, so the same instance is reused for every call.
	print("Loading Kitten TTS Mini model...")
	tts_model = KittenTTS("KittenML/kitten-tts-mini-0.1")
	print("Model loaded successfully!")

	# Available voices from the README.
	# Identifiers follow the pattern expr-voice-<number>-<gender>: numbers 2
	# through 5, each listed as the "m" variant followed by the "f" variant.
	AVAILABLE_VOICES = [
	    f'expr-voice-{style}-{gender}'
	    for style in range(2, 6)
	    for gender in ('m', 'f')
	]

	def generate_speech(text, voice):
	    """Generate speech from text using Kitten TTS Mini.

	    Args:
	        text: Text to synthesize. ``None``, empty and whitespace-only
	            values are rejected, as is anything longer than 457 characters.
	        voice: Voice identifier, passed unchanged to ``tts_model.generate``.

	    Returns:
	        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the
	        path of a temporary ``.wav`` file written at a 24000 Hz sample
	        rate, or ``None`` when validation or generation fails; in every
	        case ``status_message`` describes the outcome.
	    """
	    max_chars = 457

	    # Treat a missing value like blank input: calling .strip() on None
	    # would raise AttributeError before the try block below is reached.
	    if text is None or not text.strip():
	        return None, "Please enter some text to synthesize."

	    # Check character limit
	    if len(text) > max_chars:
	        return None, f"❌ Text is too long ({len(text)} characters). Please limit to {max_chars} characters or less."

	    wav_path = None
	    try:
	        # Generate audio
	        print(f"Generating audio for: '{text[:50]}...' with voice: {voice}")
	        audio = tts_model.generate(text, voice=voice)

	        # Reserve a unique path and close our handle first, so the file is
	        # not held open while soundfile opens it again by name.
	        fd, wav_path = tempfile.mkstemp(suffix='.wav')
	        os.close(fd)
	        sf.write(wav_path, audio, 24000)
	        return wav_path, f"✅ Successfully generated audio with {voice} ({len(text)} characters)"

	    except Exception as e:
	        # UI boundary: report the failure as a status message rather than
	        # raising, and do not leave a partially written temp file behind.
	        if wav_path is not None:
	            try:
	                os.remove(wav_path)
	            except OSError:
	                pass  # best-effort cleanup; the original error matters more
	        error_msg = f"❌ Error generating audio: {str(e)}"
	        print(error_msg)
	        return None, error_msg

	def create_interface():
	    """Create the Gradio interface.

	    The page is laid out as: a header and an info box, a row holding the
	    inputs (text box, voice dropdown, generate button) beside a voice
	    guide, a status/audio output area, a set of example inputs and a
	    footer. The generate button's click event and the text box's submit
	    event are both wired to ``generate_speech``.

	    Returns:
	        The assembled ``gr.Blocks`` object; launching it is left to the
	        caller.
	    """
	    custom_css = """
	    .main-header {
	        text-align: center;
	        margin-bottom: 2rem;
	    }
	    .info-box {
	        background: var(--background-fill-secondary);
	        color: var(--body-text-color);
	        padding: 1rem;
	        border-radius: 10px;
	        border-left: 4px solid #4285f4;
	        margin: 1rem 0;
	    }
	    .info-box h3, .info-box h4 {
	        color: var(--body-text-color) !important;
	        margin-top: 0;
	    }
	    .info-box ul, .info-box li, .info-box p {
	        color: var(--body-text-color) !important;
	    }
	    .footer-box {
	        background: var(--background-fill-secondary);
	        color: var(--body-text-color);
	        padding: 1rem;
	        border-radius: 10px;
	        margin: 2rem 0;
	        text-align: center;
	    }
	    .footer-box p, .footer-box a {
	        color: var(--body-text-color) !important;
	    }
	    .footer-box a:hover {
	        color: #4285f4 !important;
	    }
	    """

	    with gr.Blocks(
	        title="🐱 Kitten TTS Mini",
	        theme=gr.themes.Soft(),
	        css=custom_css
	    ) as demo:

	        # Header
	        gr.HTML("""
	        <div class="main-header">
	            <h1>🐱 Kitten TTS Mini 0.1</h1>
	            <p>Open-source realistic text-to-speech with 80M parameters</p>
	        </div>
	        """)

	        # Info box
	        gr.HTML("""
	        <div class="info-box">
	            <h3>ℹ️ About Kitten TTS Mini</h3>
	            <ul>
	                <li><strong>Parameters:</strong> 80 million</li>
	                <li><strong>File size:</strong> ~170MB</li>
	                <li><strong>Sample rate:</strong> 24kHz</li>
	                <li><strong>Voices:</strong> 8 different voices (male & female)</li>
	            </ul>
	        </div>
	        """)

	        with gr.Row():
	            with gr.Column(scale=2):
	                # Input text
	                text_input = gr.Textbox(
	                    label="📝 Text to Synthesize (max 457 characters)",
	                    placeholder="Enter the text you want to convert to speech...",
	                    lines=3,
	                    max_lines=10,
	                    max_length=457,
	                    show_label=True,
	                    info="Character limit: 457"
	                )

	                # Voice selection
	                voice_dropdown = gr.Dropdown(
	                    choices=AVAILABLE_VOICES,
	                    value='expr-voice-2-f',
	                    label="🎭 Voice Selection",
	                    info="Choose from available voices"
	                )

	                # Generate button
	                generate_btn = gr.Button(
	                    "🎵 Generate Speech",
	                    variant="primary",
	                    size="lg"
	                )

	            with gr.Column(scale=1):
	                # Voice descriptions
	                gr.HTML("""
	                <div class="info-box">
	                    <h4>🎭 Voice Guide</h4>
	                    <p><strong>Format:</strong> expr-voice-{number}-{gender}</p>
	                    <ul>
	                        <li><strong>Numbers 2-5:</strong> Different voice styles</li>
	                        <li><strong>m:</strong> Male voices</li>
	                        <li><strong>f:</strong> Female voices</li>
	                    </ul>
	                </div>
	                """)

	        # Output section
	        with gr.Row():
	            with gr.Column():
	                status_output = gr.Textbox(
	                    label="📊 Status",
	                    interactive=False
	                )

	                audio_output = gr.Audio(
	                    label="🎵 Generated Audio",
	                    type="filepath"
	                )

	        # Example inputs
	        gr.Examples(
	            examples=[
	                ["Hello! This is Kitten TTS Mini, a high quality text-to-speech model.", "expr-voice-2-f"],
	                ["Welcome to the world of open-source artificial intelligence and speech synthesis.", "expr-voice-3-m"],
	                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.", "expr-voice-4-f"],
	                ["Kitten TTS works without requiring a GPU, making it accessible for everyone to use.", "expr-voice-5-m"],
	                ["Science and technology are advancing rapidly, bringing us closer to a better future.", "expr-voice-2-m"]
	            ],
	            inputs=[text_input, voice_dropdown],
	            label="💡 Example Texts"
	        )

	        # Footer
	        # The separators are plain "|" characters: a backslash before the
	        # pipe is not a valid Python escape sequence and would also show
	        # up as a literal backslash in the rendered page.
	        gr.HTML("""
	        <div class="footer-box">
	            <p><strong>🐱 Kitten TTS Mini</strong> | Built with ❤️ by the KittenML team</p>
	            <p>Based on StyleTTS 2 architecture | Licensed under Apache 2.0</p>
	            <p><a href="https://huggingface.co/KittenML/kitten-tts-mini-0.1" target="_blank">Model Card</a> |
	            <a href="https://github.com/KittenML/KittenTTS" target="_blank">GitHub</a></p>
	        </div>
	        """)

	        # Event handlers: the button click and the text box submit event
	        # (Enter key) share the same handler and input/output wiring.
	        for register in (generate_btn.click, text_input.submit):
	            register(
	                fn=generate_speech,
	                inputs=[text_input, voice_dropdown],
	                outputs=[audio_output, status_output],
	                show_progress=True
	            )

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the UI, then serve it with the options
	    # below (all interfaces, port 7860, no share link, errors shown).
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "show_error": True,
	    }
	    demo = create_interface()
	    demo.launch(**launch_options)