#!/usr/bin/env python3
"""
Anime Whisper - Gradio app for Japanese anime speech-to-text
Uses the litagin/anime-whisper model to transcribe Japanese anime voice acting
"""

import gradio as gr
import torch
import spaces
from transformers import pipeline
import tempfile
import os

# Initialize the anime-whisper model
print("Loading anime-whisper model...")
try:
    # Configure generation parameters for optimal anime speech recognition
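    # no_repeat_ngram_size=0 and repetition_penalty=1.0 leave transformers'
    # repetition suppression disabled, so legitimately repeated lines in anime
    # dialogue are not trimmed from the transcription.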
    generate_kwargs = {
        "language": "Japanese",
        "no_repeat_ngram_size": 0,
        "repetition_penalty": 1.0,
    }
    
    # Initialize the pipeline with proper error handling
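    # chunk_length_s=30 lets the pipeline split long recordings into 30-second
    # chunks, and batch_size sets how many chunks are decoded per forward pass
    # (larger on GPU, smaller on CPU).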
    pipe = pipeline(
        "automatic-speech-recognition",
        model="litagin/anime-whisper",
        device="cuda" if torch.cuda.is_available() else "cpu",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        chunk_length_s=30.0,
        batch_size=64 if torch.cuda.is_available() else 8,
    )
    print("Model loaded successfully!")
    
except Exception as e:
    print(f"Error loading model: {e}")
    # Create a fallback function for when model loading fails
    def pipe(*args, **kwargs):
        return {"text": "Error: Model not loaded. Please check the model availability."}

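# On Hugging Face ZeroGPU Spaces, the spaces.GPU decorator reserves a GPU for
# the duration of each call to the decorated function; outside such Spaces it
# is effectively a pass-through.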
@spaces.GPU
def transcribe_audio(audio_file):
    """
    Transcribe Japanese anime speech using anime-whisper model
    
    Args:
        audio_file: Path to the audio file or uploaded file
        
    Returns:
        str: Transcribed Japanese text
    """
    try:
        if not audio_file:
            return "Please upload an audio file."
        
        # Handle different types of audio inputs
        if hasattr(audio_file, 'name'):
            # Gradio file object
            audio_path = audio_file.name
        elif isinstance(audio_file, str):
            # File path string
            audio_path = audio_file
        else:
            return "Invalid audio file format."
        
        # Check if file exists
        if not os.path.exists(audio_path):
            return "Audio file not found."
        
        print(f"Processing audio file: {audio_path}")
        
        # Perform transcription
        result = pipe(audio_path, generate_kwargs=generate_kwargs)
        
        # Return the transcribed text
        if isinstance(result, dict) and 'text' in result:
            transcribed_text = result['text']
            print(f"Transcription successful: {transcribed_text[:100]}...")
            return transcribed_text
        else:
            return "Transcription failed. Please try again."
            
    except Exception as e:
        error_msg = f"Error during transcription: {str(e)}"
        print(error_msg)
        return error_msg
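
# Example (hypothetical local smoke test, not part of the hosted app):
#   print(transcribe_audio("clip.wav"))  # "clip.wav" is a placeholder path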

def create_demo():
    """
    Create a temporary demo audio file for testing
    """
    # Create a temporary audio clip for the demo (a quiet 440 Hz tone)
    import numpy as np
    import soundfile as sf
    
    # Generate a 3-second, low-volume 440 Hz tone at 16 kHz
    sample_rate = 16000
    duration = 3
    t = np.linspace(0, duration, int(sample_rate * duration), False)
    demo_audio = np.sin(2 * np.pi * 440 * t) * 0.1  # 440Hz tone at low volume
    
    # Save demo audio to temporary file
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
        sf.write(f.name, demo_audio, sample_rate)
        return f.name
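
# Note: create_demo() is not currently wired into the interface below. One way
# to surface it, assuming Gradio accepts plain file paths as examples for a
# single audio input, would be to pass examples=[create_demo()] to gr.Interface.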

# Create Gradio interface
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(
            label="Upload Japanese Anime Audio File",
            type="filepath",
            format="wav"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Transcribed Japanese Text",
            lines=5,
            placeholder="The transcribed text will appear here..."
        )
    ],
    title="🌍 Anime Whisper - Japanese Anime Speech Recognition",
    description="""
    This application uses the **anime-whisper** model to transcribe Japanese anime speech to text.
    
    **Features:**
    - 🗾 Specialized for Japanese anime voice acting
    - 🎭 Handles emotional expressions and non-verbal sounds
    - 🎯 High accuracy for anime dialogue
    - 📝 Natural Japanese punctuation
    
    **Supported audio formats:** WAV, MP3, M4A, FLAC
    
    **Note:** This model works best with anime/visual novel dialogue and may perform less accurately with other types of Japanese speech.
    """,
    examples=None,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
        neutral_hue="slate"
    ),
    css="""
    .gradio-container {max-width: 800px !important; margin: auto !important;}
    .title {text-align: center; color: #1e40af;}
    .description {text-align: center; font-size: 1.1em;}
    """,
    flagging_mode="never",
    submit_btn="🎵 Transcribe Audio",
    stop_btn="⏹️ Stop"
)
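
# Note: flagging_mode, submit_btn and stop_btn assume a recent Gradio release;
# older Gradio versions used allow_flagging="never" instead of flagging_mode.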

if __name__ == "__main__":
    print("🚀 Starting Anime Whisper app...")
    print("📱 Interface will be available at: http://localhost:7860")
    print("🌍 Using the anime-whisper model by litagin")
    
    # Launch the interface
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True
    )