#!/usr/bin/env python3
"""
Anime Whisper - Gradio app for Japanese anime speech-to-text
Uses the litagin/anime-whisper model to transcribe Japanese anime voice acting
"""

import gradio as gr
import torch
import spaces
from transformers import pipeline
import tempfile
import os

# Initialize the anime-whisper model
print("Loading anime-whisper model...")
try:
    # Configure generation parameters for optimal anime speech recognition
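    # no_repeat_ngram_size=0 and repetition_penalty=1.0 leave transformers'
    # repetition suppression disabled, so legitimately repeated lines in anime
    # dialogue are not trimmed from the transcription.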
    generate_kwargs = {
        "language": "Japanese",
        "no_repeat_ngram_size": 0,
        "repetition_penalty": 1.0,
    }
    
    # Initialize the pipeline with proper error handling
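    # chunk_length_s=30 lets the pipeline split long recordings into 30-second
    # chunks, and batch_size sets how many chunks are decoded per forward pass
    # (larger on GPU, smaller on CPU).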
    pipe = pipeline(
        "automatic-speech-recognition",
        model="litagin/anime-whisper",
        device="cuda" if torch.cuda.is_available() else "cpu",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        chunk_length_s=30.0,
        batch_size=64 if torch.cuda.is_available() else 8,
    )
    print("Model loaded successfully!")
    
except Exception as e:
    print(f"Error loading model: {e}")
    # Create a fallback function for when model loading fails
    def pipe(*args, **kwargs):
        return {"text": "Error: Model not loaded. Please check the model availability."}

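# On Hugging Face ZeroGPU Spaces, the spaces.GPU decorator reserves a GPU for
# the duration of each call to the decorated function; outside such Spaces it
# is effectively a pass-through.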
@spaces.GPU
def transcribe_audio(audio_file):
    """
    Transcribe Japanese anime speech using anime-whisper model
    
    Args:
        audio_file: Path to the audio file or uploaded file
        
    Returns:
        str: Transcribed Japanese text
    """
    try:
        if not audio_file:
            return "Please upload an audio file."
        
        # Handle different types of audio inputs
        if hasattr(audio_file, 'name'):
            # Gradio file object
            audio_path = audio_file.name
        elif isinstance(audio_file, str):
            # File path string
            audio_path = audio_file
        else:
            return "Invalid audio file format."
        
        # Check if file exists
        if not os.path.exists(audio_path):
            return "Audio file not found."
        
        print(f"Processing audio file: {audio_path}")
        
        # Perform transcription
        result = pipe(audio_path, generate_kwargs=generate_kwargs)
        
        # Return the transcribed text
        if isinstance(result, dict) and 'text' in result:
            transcribed_text = result['text']
            print(f"Transcription successful: {transcribed_text[:100]}...")
            return transcribed_text
        else:
            return "Transcription failed. Please try again."
            
    except Exception as e:
        error_msg = f"Error during transcription: {str(e)}"
        print(error_msg)
        return error_msg
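
# Example (hypothetical local smoke test, not part of the hosted app):
#   print(transcribe_audio("clip.wav"))  # "clip.wav" is a placeholder path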

def create_demo():
    """
    Create a temporary demo audio file for testing
    """
    # Create a temporary audio clip for the demo (a quiet 440 Hz tone)
    import numpy as np
    import soundfile as sf
    
    # Generate a 3-second, low-volume 440 Hz tone at 16 kHz
    sample_rate = 16000
    duration = 3
    t = np.linspace(0, duration, int(sample_rate * duration), False)
    demo_audio = np.sin(2 * np.pi * 440 * t) * 0.1  # 440Hz tone at low volume
    
    # Save demo audio to temporary file
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
        sf.write(f.name, demo_audio, sample_rate)
        return f.name
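
# Note: create_demo() is not currently wired into the interface below. One way
# to surface it, assuming Gradio accepts plain file paths as examples for a
# single audio input, would be to pass examples=[create_demo()] to gr.Interface.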

# Create Gradio interface
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(
            label="Upload Japanese Anime Audio File",
            type="filepath",
            format="wav"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Transcribed Japanese Text",
            lines=5,
            placeholder="The transcribed text will appear here..."
        )
    ],
    title="🌍 Anime Whisper - Japanese Anime Speech Recognition",
    description="""
    This application uses the **anime-whisper** model to transcribe Japanese anime speech to text.
    
    **Features:**
    - 🗾 Specialized for Japanese anime voice acting
    - 🎭 Handles emotional expressions and non-verbal sounds
    - 🎯 High accuracy for anime dialogue
    - 📝 Natural Japanese punctuation
    
    **Supported audio formats:** WAV, MP3, M4A, FLAC
    
    **Note:** This model works best with anime/visual novel dialogue and may perform less accurately with other types of Japanese speech.
    """,
    examples=None,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
        neutral_hue="slate"
    ),
    css="""
    .gradio-container {max-width: 800px !important; margin: auto !important;}
    .title {text-align: center; color: #1e40af;}
    .description {text-align: center; font-size: 1.1em;}
    """,
    flagging_mode="never",
    submit_btn="🎵 Transcribe Audio",
    stop_btn="⏹️ Stop"
)
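
# Note: flagging_mode, submit_btn and stop_btn assume a recent Gradio release;
# older Gradio versions used allow_flagging="never" instead of flagging_mode.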

if __name__ == "__main__":
    print("🚀 Starting Anime Whisper app...")
    print("📱 Interface will be available at: http://localhost:7860")
    print("🌍 Using the anime-whisper model by litagin")
    
    # Launch the interface
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True
    )