|
|
import os
import tempfile
import warnings
from datetime import datetime

# NOTE(review): `spaces` is kept ahead of `torch`, matching the original
# import order; ZeroGPU presumably expects it to be imported before CUDA is
# initialised -- confirm before reordering.
import spaces
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
|
# Suppress every Python warning process-wide to keep the application log quiet.
warnings.filterwarnings("ignore")

# Device label reported at start-up: "cuda" when a GPU is visible to torch,
# otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
|
|
|
|
|
def format_timestamp(seconds):
    """Convert a duration in seconds to a zero-padded ``MM:SS`` string.

    Args:
        seconds: Duration in seconds (int or float). Fractions are dropped
            and negative values are clamped to zero.

    Returns:
        The duration as ``"MM:SS"``. Minutes are not wrapped at 60, so one
        hour is rendered as ``"60:00"``.
    """
    whole_seconds = max(0, int(seconds))
    minutes, remainder = divmod(whole_seconds, 60)
    return f"{minutes:02d}:{remainder:02d}"
|
|
|
|
|
def add_timestamps_to_text(text, timestamp_interval=2):
    """Split a transcript into lines prefixed with estimated timestamps.

    The timestamps are NOT derived from the audio. The text is cut into
    fixed-size word groups using an assumed speaking rate of 1.8 words per
    second, and each group is labelled with a multiple of
    ``timestamp_interval``.

    Args:
        text: Transcript text. Falsy values and texts shorter than 10
            characters (after stripping) are returned unchanged.
        timestamp_interval: Nominal number of seconds covered by each output
            line. Must be positive.

    Returns:
        The text with one ``[MM:SS]``-prefixed line per word group. Texts
        with fewer than five words are returned as a single line prefixed
        with ``[00:00]``.

    Raises:
        ValueError: If ``timestamp_interval`` is not positive and the text is
            long enough to be split.
    """
    if not text or len(text.strip()) < 10:
        return text

    words = text.split()
    if len(words) < 5:
        return f"[00:00] {text}"

    if timestamp_interval <= 0:
        # A zero step would crash range(); a negative one would yield an
        # empty range and silently discard the whole transcript.
        raise ValueError("timestamp_interval must be positive")

    words_per_second = 1.8
    # At least one word per line, so tiny intervals cannot give a zero step.
    chunk_size = max(1, int(words_per_second * timestamp_interval))

    lines = []
    for index, start in enumerate(range(0, len(words), chunk_size)):
        stamp = format_timestamp(index * timestamp_interval)
        chunk = " ".join(words[start:start + chunk_size])
        lines.append(f"[{stamp}] {chunk}")
    return "\n".join(lines)
|
|
|
|
|
# UI label -> Hugging Face model id. The labels are received verbatim from the
# model dropdown, so they are reproduced exactly as they appear in the UI.
_MODEL_IDS = {
    "π³π΄ NB-Whisper Large (Best for Norwegian)": "NbAiLab/nb-whisper-large",
    "π³π΄ NB-Whisper Medium (Faster)": "NbAiLab/nb-whisper-medium",
    "π Whisper Large v3 (Universal)": "openai/whisper-large-v3",
}
_DEFAULT_MODEL_ID = "NbAiLab/nb-whisper-large"

_SAMPLE_RATE = 16000
# The Whisper feature extractor pads or truncates each input to 30 seconds.
_WINDOW_SECONDS = 30


def _ascii_label(label):
    """Return only the ASCII characters of a label, stripped of whitespace."""
    return "".join(ch for ch in str(label) if ch.isascii()).strip()


def _resolve_model_id(model_choice):
    """Map a dropdown label to a model id, falling back to the default model."""
    exact = _MODEL_IDS.get(model_choice)
    if exact is not None:
        return exact
    # The emoji prefixes are encoding-sensitive; compare the ASCII part so an
    # encoding mismatch does not silently select the default model.
    wanted = _ascii_label(model_choice)
    for label, model_id in _MODEL_IDS.items():
        if _ascii_label(label) == wanted:
            return model_id
    return _DEFAULT_MODEL_ID


def _transcribe_windows(audio_data, processor, model):
    """Transcribe audio in consecutive 30-second windows and join the text."""
    window_size = _SAMPLE_RATE * _WINDOW_SECONDS
    pieces = []
    # NOTE: windows are cut at fixed offsets, so a word that straddles a
    # boundary may be split between two windows.
    for start in range(0, len(audio_data), window_size):
        segment = audio_data[start:start + window_size]
        input_features = processor(
            segment,
            sampling_rate=_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features
        # Match the model's device and dtype; float32 features fed to a
        # half-precision model raise a dtype mismatch.
        input_features = input_features.to(model.device, dtype=model.dtype)

        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                # Stay within the decoder's positional limit.
                max_length=model.config.max_target_positions,
                do_sample=False,
                num_beams=1,
                pad_token_id=processor.tokenizer.eos_token_id,
                use_cache=True,
            )

        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
        if text:
            pieces.append(text)
    return " ".join(pieces)


@spaces.GPU
def transcribe_audio_gpu(audio_input, model_choice="π³π΄ NB-Whisper Large (Best for Norwegian)"):
    """Transcribe an audio file with a Whisper model and save the transcript.

    The audio is resampled to 16 kHz mono and transcribed window by window so
    that recordings longer than 30 seconds are covered in full. The joined
    text is then split into lines with estimated 2-second timestamps and
    written, together with a short header, to a ``.txt`` file.

    Args:
        audio_input: Path to the audio file, or ``None`` when nothing was
            provided.
        model_choice: Dropdown label selecting the model. Unknown labels fall
            back to ``NbAiLab/nb-whisper-large``.

    Returns:
        A 3-tuple ``(text, status, file_path)``: the timestamped transcript
        (or an error message), a one-line status string, and the path of the
        transcript file (``None`` on failure). Exceptions are caught and
        reported through the first element instead of being raised.
    """
    if audio_input is None:
        return "Please upload an audio file or record audio.", "", None

    try:
        if isinstance(audio_input, str):
            original_filename = os.path.basename(audio_input)
            base_name = os.path.splitext(original_filename)[0]
        else:
            original_filename = "recorded_audio.wav"
            base_name = "recorded_audio"

        model_id = _resolve_model_id(model_choice)

        # NOTE: the model is loaded on every call.
        print(f"Loading {model_id} on GPU...")
        processor = WhisperProcessor.from_pretrained(model_id)
        model = WhisperForConditionalGeneration.from_pretrained(
            model_id,
            # Half precision only when a GPU is available.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
        )

        print("Loading audio file...")
        audio_data, sr = librosa.load(audio_input, sr=_SAMPLE_RATE, mono=True)
        duration_minutes = len(audio_data) / sr / 60
        print(f"Audio duration: {duration_minutes:.1f} minutes")

        print("Decoding transcription...")
        transcription = _transcribe_windows(audio_data, processor, model)
        if not transcription or len(transcription) < 20:
            return "Error: Could not generate transcription. Audio may be unclear.", "", None

        print("Adding timestamps...")
        final_transcription = add_timestamps_to_text(transcription, timestamp_interval=2)

        # Take one timestamp so the file name and the header agree.
        now = datetime.now()
        transcript_filename = (
            f"transcript_{base_name}_{now.strftime('%Y-%m-%d')}_{now.strftime('%H-%M')}.txt"
        )

        header_lines = [
            f"Transcript - {original_filename}",
            f"Date: {now.strftime('%Y-%m-%d %H:%M:%S')}",
            f"Duration: {duration_minutes:.1f} minutes",
            f"Model: {model_choice}",
            "Processing: GPU-accelerated with 2-second timestamps",
            "Generated by: Leadership by Heart - Norwegian Whisper Transcription",
        ]
        transcript_content = "\n".join(header_lines) + "\n"
        transcript_content += "\n" + "=" * 70 + "\n\n"
        transcript_content += final_transcription

        # A per-request directory keeps the readable file name while avoiding
        # collisions between requests that share a name and minute.
        output_dir = tempfile.mkdtemp(prefix="transcript_")
        temp_file_path = os.path.join(output_dir, transcript_filename)
        with open(temp_file_path, "w", encoding="utf-8") as f:
            f.write(transcript_content)

        print("GPU transcription complete!")
        return (
            final_transcription,
            f"π File: {original_filename} ({duration_minutes:.1f} min)",
            temp_file_path,
        )

    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        error_msg = f"GPU Error: {str(e)}"
        print(error_msg)
        return error_msg, "Error processing file", None
|
|
|
|
|
|
|
|
def show_filename(audio):
    """Return the label shown in the "Current File" box for the selected audio.

    Args:
        audio: Value of the audio component: a file path, ``None`` when
            nothing is selected, or any other value.

    Returns:
        ``"No file selected"`` for ``None``; the file name with its duration
        in minutes for a path (file name only if the audio cannot be loaded);
        a fixed recorded-audio label for any other value.
    """
    if audio is None:
        return "No file selected"
    if not isinstance(audio, str):
        return "π recorded_audio.wav"

    filename = os.path.basename(audio)
    try:
        audio_data, sr = librosa.load(audio, sr=16000, mono=True)
        duration_minutes = len(audio_data) / sr / 60
    except Exception:
        # Best effort: the duration is optional, so fall back to the name.
        return f"π {filename}"
    return f"π {filename} ({duration_minutes:.1f} min)"


# NOTE(review): the Row/Column nesting below was reconstructed from statement
# order because the original indentation was lost -- confirm the layout.
with gr.Blocks(title="Norwegian Whisper Transcription", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # ποΈ Norwegian Whisper Transcription
        ### Leadership by Heart - GPU-Accelerated Transcription

        **β¨ Nå med ZeroGPU Power!**

        - β‘ **GPU-accelerert prosessering**
        - π³π΄ **NB-Whisper Large optimalisert for norsk**
        - π **2-sekunders tidsstempler**
        - π **Komplett 13+ min filer på 2-3 minutter**
        """
    )

    with gr.Row():
        # Left column: input, model selection and the action button.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="π Upload Audio or π€ Record",
            )

            # These labels are passed verbatim to transcribe_audio_gpu, which
            # uses them as lookup keys; keep them in sync with that function.
            model_dropdown = gr.Dropdown(
                choices=[
                    "π³π΄ NB-Whisper Large (Best for Norwegian)",
                    "π³π΄ NB-Whisper Medium (Faster)",
                    "π Whisper Large v3 (Universal)",
                ],
                value="π³π΄ NB-Whisper Large (Best for Norwegian)",
                label="π€ Select Model",
                info="NB-Whisper models are specifically trained on Norwegian speech",
            )

            transcribe_btn = gr.Button("π GPU Transcription", variant="primary", size="lg")

            gr.Markdown(
                """
                **π‘ GPU Acceleration:**

                - Bruker Nvidia H200 GPU (ZeroGPU)
                - Halvpresisjon for maksimal hastighet
                - Komplett filprosessering
                - Skal matche Jojo-kvalitet og fart!
                """
            )

        # Right column: file info, transcript text and the download link.
        with gr.Column():
            filename_display = gr.Textbox(
                label="π Current File",
                interactive=False,
                placeholder="No file selected",
            )

            output_text = gr.Textbox(
                label="π Complete GPU Transcription",
                lines=20,
                max_lines=60,
                placeholder="Your GPU-accelerated transcription will appear here...",
            )

            download_file = gr.File(
                label="πΎ Download Complete Transcript (.txt)"
            )

    # Run the transcription when the button is clicked.
    transcribe_btn.click(
        fn=transcribe_audio_gpu,
        inputs=[audio_input, model_dropdown],
        outputs=[output_text, filename_display, download_file],
    )

    # Refresh the file label whenever the selected audio changes.
    audio_input.change(
        fn=show_filename,
        inputs=[audio_input],
        outputs=[filename_display],
    )
|
|
|
|
|
if __name__ == "__main__":
    # Launch the Gradio app only when this file is run as a script.
    demo.launch()