Spaces:

kazuhina
/

anime-tts

Running on Zero

App Files Files Community

anime-tts / app.py

kazuhina

Fix Spaces build error: Remove problematic examples and update flagging parameter

1eabe08 about 1 month ago

raw

history blame contribute delete

5.21 kB

	#!/usr/bin/env python3
	"""
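	Anime TTS - Gradio app for Japanese anime speech transcription.

	Despite the "TTS" name, this app performs speech-to-text: it loads the
	litagin/anime-whisper model through a transformers pipeline and transcribes
	uploaded Japanese anime audio.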
	"""

	import gradio as gr
	import torch
	import spaces
	from transformers import pipeline
	import tempfile
	import os
	from pathlib import Path

	# Initialize the anime-whisper model.
	print("Loading anime-whisper model...")

	# Generation parameters passed to every transcription call. Defined outside
	# the try block so the name is always bound, even when model loading fails.
	generate_kwargs = {
	    "language": "Japanese",
	    "no_repeat_ngram_size": 0,
	    "repetition_penalty": 1.0,
	}

	try:
	    # Query CUDA availability once; device, dtype and batch size all follow it.
	    _use_cuda = torch.cuda.is_available()

	    pipe = pipeline(
	        "automatic-speech-recognition",
	        model="litagin/anime-whisper",
	        device="cuda" if _use_cuda else "cpu",
	        torch_dtype=torch.float16 if _use_cuda else torch.float32,
	        chunk_length_s=30.0,
	        batch_size=64 if _use_cuda else 8,
	    )
	    print("Model loaded successfully!")

	except Exception as e:
	    # Deliberate best-effort boundary: keep the app running and surface the
	    # problem to the user through the fallback below instead of crashing.
	    print(f"Error loading model: {e}")

	    def pipe(*args, **kwargs):
	        """Stand-in for the ASR pipeline when the model failed to load.

	        Accepts any positional and keyword arguments (callers pass the audio
	        path plus ``generate_kwargs=...``) and always returns a dict with a
	        ``"text"`` key holding an error message.
	        """
	        return {"text": "Error: Model not loaded. Please check the model availability."}

	@spaces.GPU
	def transcribe_audio(audio_file):
	    """
	    Run the module-level ASR pipeline on one audio input and return its text.

	    Args:
	        audio_file: Either a string path to an audio file, or an object
	            exposing the path through a ``name`` attribute.

	    Returns:
	        str: The transcribed text on success; otherwise a human-readable
	        message describing why no transcription was produced. Exceptions
	        raised along the way are caught and reported as a message too.
	    """
	    try:
	        if not audio_file:
	            return "Please upload an audio file."

	        # Resolve the input to a filesystem path.
	        if hasattr(audio_file, 'name'):
	            # File-like object carrying its path
	            source_path = audio_file.name
	        elif isinstance(audio_file, str):
	            # Already a path string
	            source_path = audio_file
	        else:
	            return "Invalid audio file format."

	        # Bail out early when the path does not point at anything on disk.
	        if not os.path.exists(source_path):
	            return "Audio file not found."

	        print(f"Processing audio file: {source_path}")

	        # Run the model.
	        output = pipe(source_path, generate_kwargs=generate_kwargs)

	        # Anything other than a dict carrying 'text' counts as a failure.
	        if not isinstance(output, dict) or 'text' not in output:
	            return "Transcription failed. Please try again."

	        text = output['text']
	        # Log only the first 100 characters to keep the console readable.
	        print(f"Transcription successful: {text[:100]}...")
	        return text

	    except Exception as e:
	        failure = f"Error during transcription: {str(e)}"
	        print(failure)
	        return failure

	def create_demo():
	    """
	    Write a short test tone to a temporary WAV file and return its path.

	    The clip is a 3-second, 440 Hz sine wave at low amplitude (0.1), sampled
	    at 16 kHz. The temporary file is created with ``delete=False``, so it
	    stays on disk after this function returns.

	    Returns:
	        str: Path of the generated ``.wav`` file.
	    """
	    import numpy as np
	    import soundfile as sf

	    # Clip parameters: 16 kHz sampling, 3 seconds long.
	    rate_hz = 16000
	    seconds = 3
	    num_samples = int(rate_hz * seconds)

	    # Sample instants over [0, seconds), endpoint excluded.
	    instants = np.linspace(0, seconds, num_samples, endpoint=False)

	    # 440 Hz tone scaled down to a low volume.
	    angular = 2 * np.pi * 440
	    tone = np.sin(angular * instants) * 0.1

	    # Reserve a temporary .wav path and write the samples to it.
	    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as wav_file:
	        wav_path = wav_file.name
	        sf.write(wav_path, tone, rate_hz)
	    return wav_path

	# Build the Gradio interface from named pieces: one audio input, one text
	# output, the descriptive copy, the theme and the custom CSS.
	_audio_input = gr.Audio(
	    label="Upload Japanese Anime Audio File",
	    type="filepath",
	    format="wav"
	)

	_text_output = gr.Textbox(
	    label="Transcribed Japanese Text",
	    lines=5,
	    placeholder="The transcribed text will appear here..."
	)

	_TITLE = "🌍 Anime TTS - Japanese Anime Speech Recognition"

	_DESCRIPTION = """
	This application uses the anime-whisper model to transcribe Japanese anime speech to text.

	Features:
	- 🗾 Specialized for Japanese anime voice acting
	- 🎭 Handles emotional expressions and non-verbal sounds
	- 🎯 High accuracy for anime dialogue
	- 📝 Natural Japanese punctuation

	Supported audio formats: WAV, MP3, M4A, FLAC

	Note: This model works best with anime/visual novel dialogue and may perform less accurately with other types of Japanese speech.
	"""

	_theme = gr.themes.Soft(
	    primary_hue="blue",
	    secondary_hue="slate",
	    neutral_hue="slate"
	)

	_CUSTOM_CSS = """
	.gradio-container {max-width: 800px !important; margin: auto !important;}
	.title {text-align: center; color: #1e40af;}
	.description {text-align: center; font-size: 1.1em;}
	"""

	# Wire everything to the transcription function; no examples, no flagging.
	demo = gr.Interface(
	    fn=transcribe_audio,
	    inputs=[_audio_input],
	    outputs=[_text_output],
	    title=_TITLE,
	    description=_DESCRIPTION,
	    examples=None,
	    theme=_theme,
	    css=_CUSTOM_CSS,
	    flagging_mode="never",
	    submit_btn="🎵 Transcribe Audio",
	    stop_btn="⏹️ Stop"
	)

	if __name__ == "__main__":
	    # Startup banner, printed one line at a time.
	    for banner_line in (
	        "🚀 Starting Anime TTS App...",
	        "📱 Interface will be available at: http://localhost:7860",
	        "🌍 Using anime-whisper model by litagin",
	    ):
	        print(banner_line)

	    # Server settings: bind to all interfaces on port 7860, no public share
	    # link, debug mode off, errors shown in the UI.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "debug": False,
	        "show_error": True,
	    }

	    # Launch the interface
	    demo.launch(**launch_options)