import spaces  # MUST be the first import for ZeroGPU!
import gradio as gr
import torch
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import warnings
import os
from datetime import datetime
import tempfile
warnings.filterwarnings("ignore")
# Check which device is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
def format_timestamp(seconds):
"""Konverter sekunder til MM:SS-format"""
minutes = int(seconds // 60)
seconds = int(seconds % 60)
return f"{minutes:02d}:{seconds:02d}"
def add_timestamps_to_text(text, timestamp_interval=2):
"""Legg til tidsstempler hver N sekund i transkripsjonen"""
if not text or len(text.strip()) < 10:
return text
words = text.split()
if len(words) < 5:
return f"[00:00] {text}"
words_per_second = 1.8
words_per_interval = words_per_second * timestamp_interval
timestamped_text = []
current_time = 0
for i in range(0, len(words), int(words_per_interval)):
timestamp = f"[{format_timestamp(current_time)}] "
word_chunk = words[i:i + int(words_per_interval)]
text_chunk = " ".join(word_chunk)
timestamped_text.append(timestamp + text_chunk)
current_time += timestamp_interval
return "\n".join(timestamped_text)
@spaces.GPU  # ZeroGPU decorator: a GPU is attached for the duration of this call
def transcribe_audio_gpu(audio_input, model_choice="🇳🇴 NB-Whisper Large (Best for Norwegian)"):
"""GPU-akselerert transkripsjon med NB-Whisper"""
if audio_input is None:
return "Please upload an audio file or record audio.", "", None
try:
        # Derive a base name for the transcript file
if isinstance(audio_input, str):
original_filename = os.path.basename(audio_input)
base_name = os.path.splitext(original_filename)[0]
else:
original_filename = "recorded_audio.wav"
base_name = "recorded_audio"
        # Map the dropdown label to a model id
model_map = {
"πŸ‡³πŸ‡΄ NB-Whisper Large (Best for Norwegian)": "NbAiLab/nb-whisper-large",
"πŸ‡³πŸ‡΄ NB-Whisper Medium (Faster)": "NbAiLab/nb-whisper-medium",
"🌍 Whisper Large v3 (Universal)": "openai/whisper-large-v3"
}
model_id = model_map.get(model_choice, "NbAiLab/nb-whisper-large")
        # Load the model and processor onto the GPU
print(f"Loading {model_id} on GPU...")
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(
model_id,
            torch_dtype=torch.float16,  # half precision: set only here, not in generate()
            device_map="auto"
)
        # Load audio as 16 kHz mono, Whisper's expected input format
print("Loading audio file...")
audio_data, sr = librosa.load(audio_input, sr=16000, mono=True)
duration_minutes = len(audio_data) / sr / 60
print(f"Audio duration: {duration_minutes:.1f} minutes")
        # Prepare input features
input_features = processor(
audio_data,
sampling_rate=16000,
return_tensors="pt"
        ).input_features.to(model.device, torch.float16)  # cast to match the fp16 model
        # Generate the transcription (torch_dtype is not a generate() argument)
with torch.no_grad():
predicted_ids = model.generate(
input_features,
                max_length=448,  # Whisper's decoder supports at most 448 positions
min_length=20,
do_sample=False,
num_beams=1,
early_stopping=False,
pad_token_id=processor.tokenizer.eos_token_id,
use_cache=True
)
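        # Note: one generate() call covers at most Whisper's 30-second input
        # window, so longer files are truncated here; see the chunked
        # long-form sketch after this function.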
        # Decode the transcription
print("Decoding transcription...")
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
if not transcription or len(transcription) < 20:
return "Error: Could not generate transcription. Audio may be unclear.", "", None
        # Add estimated timestamps
print("Adding timestamps...")
final_transcription = add_timestamps_to_text(transcription, timestamp_interval=2)
        # Save the transcript so it can be downloaded
current_date = datetime.now().strftime("%Y-%m-%d")
current_time = datetime.now().strftime("%H-%M")
transcript_filename = f"transcript_{base_name}_{current_date}_{current_time}.txt"
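        # e.g. "transcript_interview_2025-01-15_14-30.txt" (illustrative values)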
transcript_content = f"Transcript - {original_filename}\n"
transcript_content += f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
transcript_content += f"Duration: {duration_minutes:.1f} minutes\n"
transcript_content += f"Model: {model_choice}\n"
transcript_content += f"Processing: GPU-accelerated with 2-second timestamps\n"
transcript_content += f"Generated by: Leadership by Heart - Norwegian Whisper Transcription\n"
transcript_content += "\n" + "="*70 + "\n\n"
transcript_content += final_transcription
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, transcript_filename)
with open(temp_file_path, 'w', encoding='utf-8') as f:
f.write(transcript_content)
print("GPU transcription complete!")
        return final_transcription, f"📁 File: {original_filename} ({duration_minutes:.1f} min)", temp_file_path
except Exception as e:
error_msg = f"GPU Error: {str(e)}"
print(error_msg)
        return error_msg, "Error processing file", None
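# A hedged sketch of chunked long-form decoding with model-produced timestamps,
# assuming the transformers ASR pipeline API (chunk_length_s and
# return_timestamps as documented for transformers pipelines); not wired into
# the UI below:
#
#     from transformers import pipeline
#     asr = pipeline(
#         "automatic-speech-recognition",
#         model="NbAiLab/nb-whisper-large",
#         torch_dtype=torch.float16,
#         device=0,
#         chunk_length_s=30,  # split long audio into 30-second windows
#     )
#     result = asr(audio_path, return_timestamps=True)
#     # result["chunks"]: list of {"timestamp": (start, end), "text": ...}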
# Gradio interface
with gr.Blocks(title="Norwegian Whisper Transcription", theme=gr.themes.Soft()) as demo:
gr.Markdown(
"""
# πŸŽ™οΈ Norwegian Whisper Transcription
### Leadership by Heart - GPU-Accelerated Transcription
**✨ NΓ₯ med ZeroGPU Power!**
- ⚑ **GPU-accelerert prosessering**
- πŸ‡³πŸ‡΄ **NB-Whisper Large optimalisert for norsk**
- πŸ“ **2-sekunders tidsstempler**
- πŸš€ **Komplett 13+ min filer pΓ₯ 2-3 minutter**
"""
)
with gr.Row():
with gr.Column():
audio_input = gr.Audio(
sources=["upload", "microphone"],
type="filepath",
label="πŸ“ Upload Audio or 🎀 Record"
)
model_dropdown = gr.Dropdown(
choices=[
"πŸ‡³πŸ‡΄ NB-Whisper Large (Best for Norwegian)",
"πŸ‡³πŸ‡΄ NB-Whisper Medium (Faster)",
"🌍 Whisper Large v3 (Universal)"
],
value="πŸ‡³πŸ‡΄ NB-Whisper Large (Best for Norwegian)",
label="πŸ€– Select Model",
info="NB-Whisper models are specifically trained on Norwegian speech"
)
            transcribe_btn = gr.Button("🚀 GPU Transcription", variant="primary", size="lg")
gr.Markdown(
"""
                **💡 GPU Acceleration:**
                - Runs on an Nvidia H200 GPU (ZeroGPU)
                - Half precision for maximum speed
                - Processes the complete file
                - Should match Jojo quality and speed!
"""
)
with gr.Column():
filename_display = gr.Textbox(
label="πŸ“„ Current File",
interactive=False,
placeholder="No file selected"
)
output_text = gr.Textbox(
label="πŸ“ Complete GPU Transcription",
lines=20,
max_lines=60,
placeholder="Your GPU-accelerated transcription will appear here..."
)
download_file = gr.File(
label="πŸ’Ύ Download Complete Transcript (.txt)"
)
# Event handlers
transcribe_btn.click(
fn=transcribe_audio_gpu,
inputs=[audio_input, model_dropdown],
outputs=[output_text, filename_display, download_file]
)
def show_filename(audio):
if audio is None:
return "No file selected"
if isinstance(audio, str):
filename = os.path.basename(audio)
try:
audio_data, sr = librosa.load(audio, sr=16000, mono=True)
duration_minutes = len(audio_data) / sr / 60
return f"πŸ“ {filename} ({duration_minutes:.1f} min)"
except:
return f"πŸ“ {filename}"
else:
return "πŸ“ recorded_audio.wav"
audio_input.change(
fn=show_filename,
inputs=[audio_input],
outputs=[filename_display]
)
if __name__ == "__main__":
demo.launch()