anime-tts / app.py
kazuhina's picture
Fix Spaces build error: Remove problematic examples and update flagging parameter
1eabe08
#!/usr/bin/env python3
"""
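Anime TTS - Gradio app that transcribes Japanese anime speech to text
(speech recognition, despite the "TTS" in the name).
Loads the litagin/anime-whisper model locally through a transformers
automatic-speech-recognition pipeline.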
"""
import gradio as gr
import torch
import spaces
from transformers import pipeline
import tempfile
import os
from pathlib import Path
# Build the anime-whisper speech-recognition pipeline once, at import time.
# On success `pipe` is the transformers pipeline; on any failure it is
# replaced by a stub so the rest of the module can still be imported.
print("Loading anime-whisper model...")
try:
    # Decoding options handed to the pipeline on every transcription call:
    # Japanese output, with n-gram blocking and repetition penalty disabled.
    generate_kwargs = dict(
        language="Japanese",
        no_repeat_ngram_size=0,
        repetition_penalty=1.0,
    )

    # Pick device, precision and batch size together from a single CUDA check.
    if torch.cuda.is_available():
        _device, _dtype, _batch_size = "cuda", torch.float16, 64
    else:
        _device, _dtype, _batch_size = "cpu", torch.float32, 8

    pipe = pipeline(
        "automatic-speech-recognition",
        model="litagin/anime-whisper",
        device=_device,
        torch_dtype=_dtype,
        chunk_length_s=30.0,
        batch_size=_batch_size,
    )
    print("Model loaded successfully!")
except Exception as load_error:
    print(f"Error loading model: {load_error}")

    def pipe(*_args, **_kwargs):
        """Stand-in for the pipeline: ignore all input and report the failure."""
        return {
            "text": "Error: Model not loaded. Please check the model availability."
        }
@spaces.GPU
def transcribe_audio(audio_file):
    """Transcribe Japanese anime speech with the anime-whisper pipeline.

    Args:
        audio_file: Location of the audio to transcribe. Accepts a ``str``
            path, any ``os.PathLike`` (e.g. ``pathlib.Path``), or an object
            that exposes the path through a ``name`` attribute.

    Returns:
        str: The transcribed text on success; otherwise a human-readable
        message explaining why no transcription was produced. Exceptions
        raised while transcribing are caught and returned as text.
    """
    try:
        if not audio_file:
            return "Please upload an audio file."

        # Resolve the input to a filesystem path. Path-like objects must be
        # handled before the generic ``name`` lookup: ``pathlib.Path.name``
        # is only the final path component, not the full path.
        if isinstance(audio_file, (str, os.PathLike)):
            audio_path = os.fsdecode(audio_file)
        elif hasattr(audio_file, "name"):
            # File-like wrapper object carrying its path in ``.name``
            audio_path = audio_file.name
        else:
            return "Invalid audio file format."

        # Require a regular file so directories are rejected up front
        # instead of failing inside the pipeline.
        if not os.path.isfile(audio_path):
            return "Audio file not found."

        print(f"Processing audio file: {audio_path}")

        # Perform transcription
        result = pipe(audio_path, generate_kwargs=generate_kwargs)

        # Anything other than a dict with a "text" entry counts as a failure.
        if not (isinstance(result, dict) and "text" in result):
            return "Transcription failed. Please try again."

        transcribed_text = result["text"]
        # Log only a short preview to keep the console output readable.
        print(f"Transcription successful: {transcribed_text[:100]}...")
        return transcribed_text
    except Exception as e:
        # UI boundary: report the problem as text rather than raising.
        error_msg = f"Error during transcription: {e}"
        print(error_msg)
        return error_msg
def create_demo(*, sample_rate=16000, duration=3, frequency=440.0, amplitude=0.1):
    """Write a short sine-tone WAV file to a temporary path for testing.

    With the defaults this produces 3 seconds of a quiet 440 Hz tone sampled
    at 16 kHz (a tone, not silence).

    Args:
        sample_rate: Samples per second of the generated audio.
        duration: Length of the audio in seconds.
        frequency: Tone frequency in Hz.
        amplitude: Peak amplitude of the sine wave.

    Returns:
        str: Path of the generated ``.wav`` file. The file is not deleted
        automatically; the caller is responsible for removing it.
    """
    # Imported lazily so these packages are only needed when a demo clip
    # is actually generated.
    import numpy as np
    import soundfile as sf

    sample_count = int(sample_rate * duration)
    t = np.linspace(0, duration, sample_count, endpoint=False)
    demo_audio = np.sin(2 * np.pi * frequency * t) * amplitude

    # Reserve a unique file name, then close our handle before soundfile
    # opens the path itself, so the file is never held open twice.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        demo_path = f.name

    sf.write(demo_path, demo_audio, sample_rate)
    return demo_path
# Build the Gradio UI: one audio input wired to transcribe_audio, one text
# output. The multi-line strings below are kept flush-left because their
# content is passed to Gradio verbatim.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(
            label="Upload Japanese Anime Audio File",
            # Hand the callback a file path rather than raw sample data.
            type="filepath",
            format="wav",
        )
    ],
    outputs=[
        gr.Textbox(
            label="Transcribed Japanese Text",
            lines=5,
            placeholder="The transcribed text will appear here...",
        )
    ],
    title="🌍 Anime TTS - Japanese Anime Speech Recognition",
    description="""
This application uses the **anime-whisper** model to transcribe Japanese anime speech to text.
**Features:**
- 🗾 Specialized for Japanese anime voice acting
- 🎭 Handles emotional expressions and non-verbal sounds
- 🎯 High accuracy for anime dialogue
- 📝 Natural Japanese punctuation
**Supported audio formats:** WAV, MP3, M4A, FLAC
**Note:** This model works best with anime/visual novel dialogue and may perform less accurately with other types of Japanese speech.
""",
    # No bundled example clips.
    examples=None,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
        neutral_hue="slate",
    ),
    css="""
.gradio-container {max-width: 800px !important; margin: auto !important;}
.title {text-align: center; color: #1e40af;}
.description {text-align: center; font-size: 1.1em;}
""",
    flagging_mode="never",
    # NOTE(review): the ferris-wheel glyph below may be a mis-decoded musical
    # note (🎵); left unchanged — confirm against the original file.
    submit_btn="🎡 Transcribe Audio",
    stop_btn="⏹️ Stop",
)
# Script entry point: announce startup, then serve the interface.
if __name__ == "__main__":
    print("🚀 Starting Anime TTS App...")
    print("📱 Interface will be available at: http://localhost:7860")
    print("🌍 Using anime-whisper model by litagin")

    # Bind to all network interfaces on port 7860 (the port announced
    # above), without a public share link; show_error is enabled so
    # errors are reported in the UI.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True,
    )