Spaces:

petersvenning
/

norwegian-whisper-transcription

Sleeping

App Files Files Community

norwegian-whisper-transcription / app.py

petersvenning

Update app.py

579f288 verified 4 months ago

raw

history blame contribute delete

8.49 kB

	import spaces # MÅ være første import for ZeroGPU!
	import gradio as gr
	import torch
	import numpy as np
	from transformers import WhisperProcessor, WhisperForConditionalGeneration
	import librosa
	import warnings
	import os
	from datetime import datetime
	import tempfile
	# Silence all Python warnings for the rest of the process.
	warnings.filterwarnings("ignore")

	# Select the compute device: CUDA when torch reports it, otherwise CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	print(f"Using device: {device}")

	def format_timestamp(seconds):
	    """Render a duration in seconds as a zero-padded ``MM:SS`` string.

	    Fractional seconds are truncated. Minutes are not wrapped into hours,
	    so 3600 seconds is rendered as ``60:00``.
	    """
	    whole_minutes, leftover_seconds = divmod(seconds, 60)
	    return "{:02d}:{:02d}".format(int(whole_minutes), int(leftover_seconds))

	def add_timestamps_to_text(text, timestamp_interval=2):
	    """Prefix a transcript with estimated ``[MM:SS]`` markers, one per line.

	    The markers are NOT derived from the audio. The text is cut into
	    fixed-size word groups under an assumed speaking rate of 1.8 words per
	    second, and each group is labelled with a clock that advances by
	    ``timestamp_interval`` seconds per group.

	    Args:
	        text: Transcript text to annotate.
	        timestamp_interval: Seconds between consecutive markers.

	    Returns:
	        ``text`` unchanged when it is empty or its stripped length is under
	        10 characters; ``"[00:00] " + text`` when it has fewer than five
	        words; otherwise the word groups joined by newlines, each preceded
	        by its marker.

	    Raises:
	        ValueError: If ``timestamp_interval`` is not positive (only checked
	            once the text is long enough to be split into groups).
	    """
	    if not text or len(text.strip()) < 10:
	        return text
	    words = text.split()
	    if len(words) < 5:
	        return f"[00:00] {text}"
	    if timestamp_interval <= 0:
	        # A negative step would make range() empty and silently drop the
	        # whole transcript; fail loudly instead.
	        raise ValueError("timestamp_interval must be positive")

	    words_per_second = 1.8
	    # int() truncation yields 0 for short intervals and range() rejects a
	    # zero step, so always emit at least one word per marker.
	    words_per_chunk = max(1, int(words_per_second * timestamp_interval))

	    timestamped_lines = []
	    for chunk_index, start in enumerate(range(0, len(words), words_per_chunk)):
	        elapsed = chunk_index * timestamp_interval
	        chunk_text = " ".join(words[start:start + words_per_chunk])
	        timestamped_lines.append(f"[{format_timestamp(elapsed)}] {chunk_text}")
	    return "\n".join(timestamped_lines)

	def _transcribe_in_chunks(audio_data, processor, model, sampling_rate=16000):
	    """Transcribe audio of any length by running Whisper window by window.

	    The Whisper feature extractor pads or truncates every input to a single
	    fixed-length window (30 seconds by default), so passing a long recording
	    in one call would silently drop everything after the first window. The
	    signal is therefore split into consecutive windows that are transcribed
	    one at a time and joined with spaces.

	    Args:
	        audio_data: 1-D mono waveform sampled at ``sampling_rate``.
	        processor: Loaded ``WhisperProcessor``.
	        model: Loaded ``WhisperForConditionalGeneration``.
	        sampling_rate: Sample rate of ``audio_data`` in Hz.

	    Returns:
	        The concatenated transcript; an empty string when nothing was
	        recognised or the waveform is empty.
	    """
	    window = getattr(processor.feature_extractor, "n_samples", 30 * sampling_rate)
	    # The decoder cannot address more positions than the model config
	    # allows, so never request a longer sequence than that.
	    max_length = min(1024, getattr(model.config, "max_target_positions", 448))

	    pieces = []
	    for start in range(0, len(audio_data), window):
	        segment = audio_data[start:start + window]
	        # Features come out as float32; move them to the model's device and
	        # cast to its dtype (float16 here) to avoid a dtype mismatch.
	        input_features = processor(
	            segment,
	            sampling_rate=sampling_rate,
	            return_tensors="pt",
	        ).input_features.to(model.device, dtype=model.dtype)
	        with torch.no_grad():
	            # No min_length: forcing a minimum token count makes the model
	            # invent text for short or silent windows.
	            predicted_ids = model.generate(
	                input_features,
	                max_length=max_length,
	                do_sample=False,
	                num_beams=1,
	                pad_token_id=processor.tokenizer.eos_token_id,
	                use_cache=True,
	            )
	        piece = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
	        if piece:
	            pieces.append(piece)
	    return " ".join(pieces)


	@spaces.GPU  # ZeroGPU decorator
	def transcribe_audio_gpu(audio_input, model_choice="🇳🇴 NB-Whisper Large (Best for Norwegian)"):
	    """Transcribe an audio file with a Whisper model and save the result.

	    Loads the selected model in half precision, decodes the audio at
	    16 kHz mono, transcribes the full recording window by window, adds
	    estimated timestamps and writes a ``.txt`` transcript into the system
	    temporary directory.

	    Args:
	        audio_input: Path to the audio file, or ``None`` when nothing was
	            provided.
	        model_choice: Label of the model to use; unknown labels fall back
	            to ``NbAiLab/nb-whisper-large``.

	    Returns:
	        A 3-tuple ``(text, status, file_path)``: the timestamped transcript
	        (or an error message), a status line for the file display, and the
	        path of the written transcript file (``None`` on failure).
	    """
	    if audio_input is None:
	        return "Please upload an audio file or record audio.", "", None
	    try:
	        # Derive the display name and the base name of the output file.
	        if isinstance(audio_input, str):
	            original_filename = os.path.basename(audio_input)
	            base_name = os.path.splitext(original_filename)[0]
	        else:
	            original_filename = "recorded_audio.wav"
	            base_name = "recorded_audio"

	        # Map the dropdown label to a model repository id.
	        model_map = {
	            "🇳🇴 NB-Whisper Large (Best for Norwegian)": "NbAiLab/nb-whisper-large",
	            "🇳🇴 NB-Whisper Medium (Faster)": "NbAiLab/nb-whisper-medium",
	            "🌍 Whisper Large v3 (Universal)": "openai/whisper-large-v3",
	        }
	        model_id = model_map.get(model_choice, "NbAiLab/nb-whisper-large")

	        # Load model and processor; weights are kept in half precision.
	        print(f"Loading {model_id} on GPU...")
	        processor = WhisperProcessor.from_pretrained(model_id)
	        model = WhisperForConditionalGeneration.from_pretrained(
	            model_id,
	            torch_dtype=torch.float16,
	            device_map="auto",
	        )

	        # Decode the audio to 16 kHz mono.
	        print("Loading audio file...")
	        audio_data, sr = librosa.load(audio_input, sr=16000, mono=True)
	        duration_minutes = len(audio_data) / sr / 60
	        print(f"Audio duration: {duration_minutes:.1f} minutes")

	        # Transcribe the whole recording, not just the first window.
	        transcription = _transcribe_in_chunks(audio_data, processor, model, sampling_rate=sr)
	        print("Decoding transcription...")
	        if not transcription or len(transcription) < 20:
	            return "Error: Could not generate transcription. Audio may be unclear.", "", None

	        print("Adding timestamps...")
	        final_transcription = add_timestamps_to_text(transcription, timestamp_interval=2)

	        # Take one timestamp so the file name and header cannot disagree
	        # when the clock ticks over between calls.
	        now = datetime.now()
	        current_date = now.strftime("%Y-%m-%d")
	        current_time = now.strftime("%H-%M")
	        transcript_filename = f"transcript_{base_name}_{current_date}_{current_time}.txt"
	        transcript_content = "".join([
	            f"Transcript - {original_filename}\n",
	            f"Date: {now.strftime('%Y-%m-%d %H:%M:%S')}\n",
	            f"Duration: {duration_minutes:.1f} minutes\n",
	            f"Model: {model_choice}\n",
	            "Processing: GPU-accelerated with 2-second timestamps\n",
	            "Generated by: Leadership by Heart - Norwegian Whisper Transcription\n",
	            "\n" + "=" * 70 + "\n\n",
	            final_transcription,
	        ])
	        temp_file_path = os.path.join(tempfile.gettempdir(), transcript_filename)
	        with open(temp_file_path, 'w', encoding='utf-8') as f:
	            f.write(transcript_content)

	        print("GPU transcription complete!")
	        return final_transcription, f"📁 File: {original_filename} ({duration_minutes:.1f} min)", temp_file_path
	    except Exception as e:
	        # UI boundary: report the failure in the output box instead of
	        # letting the request crash.
	        error_msg = f"GPU Error: {str(e)}"
	        print(error_msg)
	        return error_msg, "Error processing file", None

	# Gradio interface: an input column (audio + model picker) next to an
	# output column (file info, transcript text, downloadable .txt file).
	with gr.Blocks(title="Norwegian Whisper Transcription", theme=gr.themes.Soft()) as demo:
	    gr.Markdown(
	        """
	        # 🎙️ Norwegian Whisper Transcription
	        ### Leadership by Heart - GPU-Accelerated Transcription
	        ✨ Nå med ZeroGPU Power!
	        - ⚡ GPU-accelerert prosessering
	        - 🇳🇴 NB-Whisper Large optimalisert for norsk
	        - 📍 2-sekunders tidsstempler
	        - 🚀 Komplett 13+ min filer på 2-3 minutter
	        """
	    )
	    with gr.Row():
	        with gr.Column():
	            audio_input = gr.Audio(
	                sources=["upload", "microphone"],
	                type="filepath",
	                label="📁 Upload Audio or 🎤 Record"
	            )
	            model_dropdown = gr.Dropdown(
	                choices=[
	                    "🇳🇴 NB-Whisper Large (Best for Norwegian)",
	                    "🇳🇴 NB-Whisper Medium (Faster)",
	                    "🌍 Whisper Large v3 (Universal)"
	                ],
	                value="🇳🇴 NB-Whisper Large (Best for Norwegian)",
	                label="🤖 Select Model",
	                info="NB-Whisper models are specifically trained on Norwegian speech"
	            )
	            transcribe_btn = gr.Button("🚀 GPU Transcription", variant="primary", size="lg")
	            gr.Markdown(
	                """
	                💡 GPU Acceleration:
	                - Bruker Nvidia H200 GPU (ZeroGPU)
	                - Halvpresisjon for maksimal hastighet
	                - Komplett filprosessering
	                - Skal matche Jojo-kvalitet og fart!
	                """
	            )
	        with gr.Column():
	            filename_display = gr.Textbox(
	                label="📄 Current File",
	                interactive=False,
	                placeholder="No file selected"
	            )
	            output_text = gr.Textbox(
	                label="📝 Complete GPU Transcription",
	                lines=20,
	                max_lines=60,
	                placeholder="Your GPU-accelerated transcription will appear here..."
	            )
	            download_file = gr.File(
	                label="💾 Download Complete Transcript (.txt)"
	            )

	    # Event handlers
	    transcribe_btn.click(
	        fn=transcribe_audio_gpu,
	        inputs=[audio_input, model_dropdown],
	        outputs=[output_text, filename_display, download_file]
	    )

	    def show_filename(audio):
	        """Return a display label (name and duration) for the selected audio.

	        Falls back to the bare file name when the audio cannot be decoded,
	        and to a fixed label when the value is not a file path.
	        """
	        if audio is None:
	            return "No file selected"
	        if not isinstance(audio, str):
	            return "📁 recorded_audio.wav"
	        filename = os.path.basename(audio)
	        try:
	            audio_data, sr = librosa.load(audio, sr=16000, mono=True)
	            duration_minutes = len(audio_data) / sr / 60
	            return f"📁 {filename} ({duration_minutes:.1f} min)"
	        except Exception:
	            # Best effort: an undecodable file still gets its name shown.
	            # Not a bare except, so KeyboardInterrupt/SystemExit propagate.
	            return f"📁 {filename}"

	    audio_input.change(
	        fn=show_filename,
	        inputs=[audio_input],
	        outputs=[filename_display]
	    )

	if __name__ == "__main__":
	    demo.launch()