|
|
|
|
|
""" |
|
|
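Anime TTS - Gradio app that transcribes Japanese anime speech to text.

Loads the litagin/anime-whisper model locally through a transformers
automatic-speech-recognition pipeline (despite the app's name, this is
speech-to-text, not text-to-speech).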
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import torch |
|
|
import spaces |
|
|
from transformers import pipeline |
|
|
import tempfile |
|
|
import os |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# The speech-recognition pipeline is built once, at import time, and reused by
# every transcription request.
print("Loading anime-whisper model...")

try:
    # Generation options forwarded to the pipeline on each transcription call.
    generate_kwargs = dict(
        language="Japanese",
        no_repeat_ngram_size=0,
        repetition_penalty=1.0,
    )

    # Pick device, precision and batch size from a single CUDA check:
    # GPU -> float16 with batches of 64, CPU -> float32 with batches of 8.
    _has_cuda = torch.cuda.is_available()
    if _has_cuda:
        _device, _dtype, _batch_size = "cuda", torch.float16, 64
    else:
        _device, _dtype, _batch_size = "cpu", torch.float32, 8

    pipe = pipeline(
        "automatic-speech-recognition",
        model="litagin/anime-whisper",
        device=_device,
        torch_dtype=_dtype,
        chunk_length_s=30.0,
        batch_size=_batch_size,
    )
    print("Model loaded successfully!")

except Exception as load_error:
    # Keep the app importable even when the model cannot be loaded: report the
    # failure and install a stand-in with the same call shape as the pipeline.
    print(f"Error loading model: {load_error}")

    def pipe(*args, **kwargs):
        """Stand-in for the pipeline; ignores its arguments and returns a fixed error payload."""
        return {"text": "Error: Model not loaded. Please check the model availability."}
|
|
|
|
|
@spaces.GPU
def transcribe_audio(audio_file):
    """Transcribe Japanese anime speech with the anime-whisper pipeline.

    Args:
        audio_file: The audio to transcribe. Either a filesystem path
            (``str`` or ``os.PathLike``) or an object that exposes the path
            on a ``name`` attribute (for example an open temporary file).

    Returns:
        str: The transcribed text on success; otherwise a human-readable
        message describing what went wrong. Failures are returned instead of
        raised, so the caller always receives a string.
    """
    try:
        if not audio_file:
            return "Please upload an audio file."

        # Real paths must be recognised BEFORE the ``name`` check:
        # ``pathlib.Path`` objects also have a ``name`` attribute, but it
        # holds only the final path component, which is not a usable path.
        if isinstance(audio_file, (str, os.PathLike)):
            audio_path = os.fspath(audio_file)
        elif hasattr(audio_file, "name"):
            audio_path = audio_file.name
        else:
            return "Invalid audio file format."

        if not os.path.exists(audio_path):
            return "Audio file not found."

        print(f"Processing audio file: {audio_path}")

        result = pipe(audio_path, generate_kwargs=generate_kwargs)

        if isinstance(result, dict) and "text" in result:
            transcribed_text = result["text"]
            # Log only a short preview so long transcripts don't flood stdout.
            print(f"Transcription successful: {transcribed_text[:100]}...")
            return transcribed_text

        return "Transcription failed. Please try again."

    except Exception as e:
        # Deliberately broad: any failure is turned into a message string for
        # the caller rather than propagating out of the handler.
        error_msg = f"Error during transcription: {str(e)}"
        print(error_msg)
        return error_msg
|
|
|
|
|
def create_demo():
    """Write a short synthetic test tone to a temporary WAV file.

    Despite its name, this helper builds no interface: it generates a
    3-second, 440 Hz sine wave sampled at 16 kHz (amplitude 0.1) and saves
    it with ``soundfile``.

    Returns:
        str: Path of the written ``.wav`` file. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    # Function-local imports: numpy and soundfile are only loaded when this
    # helper actually runs.
    import numpy as np
    import soundfile as sf

    rate_hz = 16000
    seconds = 3
    n_samples = int(rate_hz * seconds)

    # endpoint=False keeps the stop value itself out of the time axis.
    timeline = np.linspace(0, seconds, n_samples, endpoint=False)
    tone = 0.1 * np.sin(2 * np.pi * 440 * timeline)

    wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    with wav_file:
        wav_path = wav_file.name
        sf.write(wav_path, tone, rate_hz)
        return wav_path
|
|
|
|
|
|
|
|
# NOTE(review): the leading symbols in the title, the feature bullets and the
# button labels below look like emoji whose UTF-8 bytes were decoded with the
# wrong codepage -- confirm the file's encoding and restore the intended glyphs.

# Markdown text shown under the title.
_APP_DESCRIPTION = """
This application uses the **anime-whisper** model to transcribe Japanese anime speech to text.

**Features:**
- πΎ Specialized for Japanese anime voice acting
- π Handles emotional expressions and non-verbal sounds
- π― High accuracy for anime dialogue
- π Natural Japanese punctuation

**Supported audio formats:** WAV, MP3, M4A, FLAC

**Note:** This model works best with anime/visual novel dialogue and may perform less accurately with other types of Japanese speech.
"""

# Page-level CSS overrides: centred, width-capped layout.
_APP_CSS = """
.gradio-container {max-width: 800px !important; margin: auto !important;}
.title {text-align: center; color: #1e40af;}
.description {text-align: center; font-size: 1.1em;}
"""

# Input: an audio component configured with type="filepath", matching the
# path-based handling in transcribe_audio.
_audio_input = gr.Audio(
    type="filepath",
    format="wav",
    label="Upload Japanese Anime Audio File",
)

# Output: a five-line text box for the transcription.
_text_output = gr.Textbox(
    lines=5,
    label="Transcribed Japanese Text",
    placeholder="The transcribed text will appear here...",
)

_app_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
)

# The single-page UI: one audio input wired to transcribe_audio, one text output.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[_audio_input],
    outputs=[_text_output],
    title="π Anime TTS - Japanese Anime Speech Recognition",
    description=_APP_DESCRIPTION,
    examples=None,
    theme=_app_theme,
    css=_APP_CSS,
    flagging_mode="never",
    submit_btn="π΅ Transcribe Audio",
    stop_btn="βΉοΈ Stop",
)
|
|
|
|
|
if __name__ == "__main__":
    # Startup banner, printed only when the file is run as a script.
    for banner_line in (
        "π Starting Anime TTS App...",
        "π± Interface will be available at: http://localhost:7860",
        "π Using anime-whisper model by litagin",
    ):
        print(banner_line)

    # Serve on all network interfaces at port 7860, without a public share
    # link or debug mode, and with errors surfaced in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "show_error": True,
    }
    demo.launch(**launch_options)