palli23 committed on
Commit
d817784
·
1 Parent(s): 3407dd3

diarization1Mæló

Browse files
Files changed (1) hide show
  1. app.py +70 -49
app.py CHANGED
@@ -1,78 +1,99 @@
1
- # app.py for HF Spaces (ZeroGPU safe pyannote)
 
 
 
2
  import os
3
  import gradio as gr
4
  import spaces
5
  import tempfile
6
  import torch
7
-
8
- from torch.serialization import safe_globals
9
- from pyannote.audio.core.model import Model
10
- from pyannote.audio.core.task import Task, Specifications
11
- from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
12
- from typing import OrderedDict
13
-
14
  from transformers import pipeline
15
  from pyannote.audio import Pipeline
 
16
 
17
- # Required patches for ZeroGPU
18
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
19
- torch.serialization.add_safe_globals({
20
- "OrderedDict": OrderedDict,
21
- })
22
-
23
- MODEL_NAME = "palli23/whisper-small-sam_spjall"
24
 
 
 
 
25
  @spaces.GPU(duration=120)
26
  def transcribe_with_diarization(audio_path):
 
27
  if not audio_path:
28
- return "Hladdu upp hljóðskrá"
29
-
30
- # Fix strict unpickling in torch 2.6 (ZeroGPU)
 
 
 
31
  with safe_globals([
32
  torch.torch_version.TorchVersion,
33
- Model,
34
- Task,
35
- Specifications,
36
- SpeakerDiarization,
37
- OrderedDict,
38
  ]):
 
 
 
 
39
  diarization = Pipeline.from_pretrained(
40
- "pyannote/speaker-diarization-3.1",
41
- use_auth_token=os.getenv("HF_TOKEN")
42
  ).to("cuda")
43
-
44
- # Run diarization
45
- dia = diarization(audio_path)
46
 
47
- # Whisper model
 
 
 
 
 
48
  asr = pipeline(
49
- "automatic-speech-recognition",
50
- model=MODEL_NAME,
51
  device=0,
52
- use_auth_token=os.getenv("HF_TOKEN"),
53
  )
54
 
55
- # segment-by-segment ASR
56
- result = []
57
- for turn, _, speaker in dia.itertracks(yield_label=True):
58
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
59
- diarization.crop(audio_path, turn).export(f.name, format="wav")
60
- chunk = f.name
 
 
 
 
 
61
 
62
- text = asr(chunk)["text"].strip()
63
- os.unlink(chunk)
64
- result.append(f"[MÆLENDI {speaker}] {text}")
65
 
66
- return "\n".join(result) or "Enginn texti heyrðist."
 
67
 
 
 
 
 
 
 
 
 
 
68
  with gr.Blocks() as demo:
69
- gr.Markdown("# Íslenskt ASR + Mælendagreining")
70
- gr.Markdown("Whisper-small + pyannote 3.1 (ZeroGPU örugg útgáfa)")
71
-
72
- audio = gr.Audio(type="filepath", label="Hljóðskrá")
73
- btn = gr.Button("Transcribe með mælendum")
74
- out = gr.Textbox(lines=35, label="Úttak")
75
 
76
- btn.click(transcribe_with_diarization, inputs=audio, outputs=out)
 
77
 
 
78
  demo.launch(auth=("beta", "beta2025"))
 
1
+ # ============================================================
2
+ # app.py – Whisper-small + Pyannote 3.1 (ZeroGPU örugg útgáfa)
3
+ # ============================================================
4
+
5
  import os
6
  import gradio as gr
7
  import spaces
8
  import tempfile
9
  import torch
 
 
 
 
 
 
 
10
  from transformers import pipeline
11
  from pyannote.audio import Pipeline
12
+ from torch.serialization import safe_globals
13
 
14
# ------------------------------------------------------------
# Model identifiers (Hugging Face Hub repo IDs)
# ------------------------------------------------------------
ASR_MODEL = "palli23/whisper-small-sam_spjall"  # Whisper-small checkpoint, loaded by the transformers ASR pipeline below
DIAR_MODEL = "pyannote/speaker-diarization-3.1"  # pyannote diarization pipeline, loaded via Pipeline.from_pretrained
 
 
19
 
20
# ------------------------------------------------------------
# Main entry point – runs on ZeroGPU (120 s GPU budget)
# ------------------------------------------------------------
@spaces.GPU(duration=120)
def transcribe_with_diarization(audio_path):
    """Diarize *audio_path*, then transcribe each speaker turn.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the uploaded audio file (from ``gr.Audio``).

    Returns
    -------
    str
        One line per speaker turn, ``[MÆLENDI <speaker>] <text>``, or a
        user-facing Icelandic message when input/output is empty.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá."

    # PyTorch 2.6+ defaults torch.load to weights_only=True, so every
    # custom class inside the pyannote checkpoint must be allow-listed.
    # safe_globals() expects the actual class objects — dotted-path
    # *strings* (as the previous version passed) are silently useless.
    # Import paths match what this repo used before this commit.
    from pyannote.audio import Audio
    from pyannote.audio.core.model import Model
    from pyannote.audio.core.task import Specifications
    from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization

    with safe_globals([
        torch.torch_version.TorchVersion,
        Specifications,
        Model,
        SpeakerDiarization,
    ]):
        # ----------------------------
        # 1. Load diarization pipeline onto the GPU
        # ----------------------------
        diarization = Pipeline.from_pretrained(
            DIAR_MODEL,
            token=os.getenv("HF_TOKEN"),  # pyannote 3.1 takes `token`, not `use_auth_token`
        ).to("cuda")

        # Run speaker diarization over the whole file.
        diar = diarization(audio_path)

        # ----------------------------
        # 2. Whisper ASR pipeline (transformers)
        # ----------------------------
        asr = pipeline(
            task="automatic-speech-recognition",
            model=ASR_MODEL,
            device=0,
            token=os.getenv("HF_TOKEN"),
        )

        # ----------------------------
        # 3. Transcribe each speaker turn
        # ----------------------------
        # BUG FIX: `diar` is a pyannote Annotation; Annotation.crop()
        # returns another Annotation (labels, not audio) and has no
        # .export(), so the old tempfile/.export loop raised
        # AttributeError on the first turn.  Extract the waveform with
        # pyannote.audio.Audio instead and feed it straight to the
        # transformers pipeline — no temp files needed.
        audio_io = Audio(sample_rate=16000, mono="downmix")  # Whisper expects 16 kHz mono
        final_output = []
        for turn, _, speaker in diar.itertracks(yield_label=True):
            waveform, sample_rate = audio_io.crop(audio_path, turn)
            text = asr({
                "raw": waveform.squeeze(0).numpy(),
                "sampling_rate": sample_rate,
            })["text"].strip()
            final_output.append(f"[MÆLENDI {speaker}] {text}")

        return "\n".join(final_output) if final_output else "Ekkert heyrt í hljóðinu."
83
+
84
+
85
# ------------------------------------------------------------
# Gradio front-end: one audio input, one text output, one button
# ------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Íslenskt tal → texti + mælendagreining")
    gr.Markdown("Whisper-small + pyannote 3.1 • Virkar á ZeroGPU 5 mín hljóð max")

    src_audio = gr.Audio(type="filepath", label="Hladdu upp hljóðskrá (.wav / .mp3)")
    transcript_box = gr.Textbox(lines=30, label="Útskrift + mælendur")

    run_btn = gr.Button("Transcribe með mælendum", variant="primary")
    run_btn.click(transcribe_with_diarization, inputs=src_audio, outputs=transcript_box)

# Simple basic-auth gate for the public Space.
demo.launch(auth=("beta", "beta2025"))