File size: 5,207 Bytes
6a984a1 1eabe08 6a984a1 1eabe08 6a984a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
#!/usr/bin/env python3
"""
Anime TTS - Gradio app for using anime-whisper API
Uses litagin/anime-whisper model for Japanese anime voice transcription
"""
import gradio as gr
import torch
import spaces
from transformers import pipeline
import tempfile
import os
from pathlib import Path
# Load the anime-whisper speech-recognition pipeline once, at import time.
print("Loading anime-whisper model...")

# Decoding options passed along with every transcription request.
generate_kwargs = dict(
    language="Japanese",
    no_repeat_ngram_size=0,
    repetition_penalty=1.0,
)

try:
    # Device, dtype and batch size are all chosen from CUDA availability:
    # GPU -> float16 with large batches, CPU -> float32 with small batches.
    pipe = pipeline(
        "automatic-speech-recognition",
        model="litagin/anime-whisper",
        device="cpu" if not torch.cuda.is_available() else "cuda",
        torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16,
        chunk_length_s=30.0,
        batch_size=8 if not torch.cuda.is_available() else 64,
    )
    print("Model loaded successfully!")
except Exception as load_error:
    print(f"Error loading model: {load_error}")

    # Stand-in with the same call shape as the real pipeline, so the rest of
    # the module keeps working and reports the failure as result text.
    def pipe(*args, **kwargs):
        return dict(text="Error: Model not loaded. Please check the model availability.")
@spaces.GPU
def transcribe_audio(audio_file):
    """Transcribe Japanese anime speech with the module-level ``pipe``.

    Args:
        audio_file: Location of the audio to transcribe. Accepts a path
            string, an ``os.PathLike`` object (e.g. ``pathlib.Path``), or an
            uploaded-file object that exposes its path through ``.name``.

    Returns:
        str: The transcribed text on success; otherwise a human-readable
        message explaining why no transcription was produced. Exceptions
        are caught and returned as text, never raised to the caller.
    """
    try:
        if not audio_file:
            return "Please upload an audio file."

        # Path-like inputs must be resolved BEFORE the ``.name`` probe:
        # ``pathlib.Path.name`` is only the final path component, so using
        # it would drop the directory and the existence check would fail.
        if isinstance(audio_file, (str, os.PathLike)):
            audio_path = os.fspath(audio_file)
        elif hasattr(audio_file, "name"):
            # Uploaded-file object carrying its on-disk path in ``.name``.
            audio_path = audio_file.name
        else:
            return "Invalid audio file format."

        if not os.path.exists(audio_path):
            return "Audio file not found."

        print(f"Processing audio file: {audio_path}")

        # Run the ASR pipeline with the shared decoding options.
        result = pipe(audio_path, generate_kwargs=generate_kwargs)

        if isinstance(result, dict) and "text" in result:
            transcribed_text = result["text"]
            # Log only a prefix so long transcripts do not flood the console.
            print(f"Transcription successful: {transcribed_text[:100]}...")
            return transcribed_text
        return "Transcription failed. Please try again."
    except Exception as e:
        # UI boundary: report the failure as result text instead of crashing.
        error_msg = f"Error during transcription: {str(e)}"
        print(error_msg)
        return error_msg
def create_demo():
    """Write a short sine-tone WAV clip to a temporary file.

    The clip is 3 seconds of a 440 Hz tone at 0.1 amplitude, sampled at
    16 kHz. It is a tone, not silence.

    Returns:
        str: Path of the generated ``.wav`` file. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    import numpy as np
    import soundfile as sf

    sample_rate = 16000
    duration = 3
    # endpoint=False keeps the sample spacing at exactly 1 / sample_rate.
    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
    demo_audio = np.sin(2 * np.pi * 440 * t) * 0.1  # 440Hz tone at low volume

    # Reserve a unique file name, then close the handle before writing:
    # reopening a NamedTemporaryFile by name while it is still open is not
    # possible on every platform (notably Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        demo_path = f.name
    sf.write(demo_path, demo_audio, sample_rate)
    return demo_path
# Build the Gradio UI: one audio input wired to transcribe_audio, one text
# output. Emoji in the user-facing strings are written as real Unicode
# characters, and the Markdown description keeps blank lines between its
# paragraphs and the bullet list so the closing lines are not folded into
# the last bullet.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(
            label="Upload Japanese Anime Audio File",
            # "filepath" matches the path-string branch of transcribe_audio.
            type="filepath",
            format="wav",
        )
    ],
    outputs=[
        gr.Textbox(
            label="Transcribed Japanese Text",
            lines=5,
            placeholder="The transcribed text will appear here...",
        )
    ],
    title="🎌 Anime TTS - Japanese Anime Speech Recognition",
    description="""
This application uses the **anime-whisper** model to transcribe Japanese anime speech to text.

**Features:**

- 🗾 Specialized for Japanese anime voice acting
- 🎭 Handles emotional expressions and non-verbal sounds
- 🎯 High accuracy for anime dialogue
- 📝 Natural Japanese punctuation

**Supported audio formats:** WAV, MP3, M4A, FLAC

**Note:** This model works best with anime/visual novel dialogue and may perform less accurately with other types of Japanese speech.
""",
    examples=None,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
        neutral_hue="slate",
    ),
    css="""
.gradio-container {max-width: 800px !important; margin: auto !important;}
.title {text-align: center; color: #1e40af;}
.description {text-align: center; font-size: 1.1em;}
""",
    flagging_mode="never",
    submit_btn="🎵 Transcribe Audio",
    stop_btn="⏹️ Stop",
)
# Script entry point: announce startup, then serve the interface.
if __name__ == "__main__":
    print("🚀 Starting Anime TTS App...")
    print("📱 Interface will be available at: http://localhost:7860")
    print("🎌 Using anime-whisper model by litagin")

    # Launch the interface on a fixed port. Binding to 0.0.0.0 listens on
    # every network interface, not only on localhost.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True,
    )