Sidak Singh committed
Commit 66a7fab · 1 Parent(s): 7b7174c

can detect sentence ends
Files changed:
- __pycache__/transcriber.cpython-310.pyc +0 -0
- app.py +3 -0
- transcriber.py +6 -1
- working.py +0 -28
__pycache__/transcriber.cpython-310.pyc CHANGED

Binary files a/__pycache__/transcriber.cpython-310.pyc and b/__pycache__/transcriber.cpython-310.pyc differ
app.py CHANGED

@@ -2,6 +2,7 @@ import gradio as gr
 import numpy as np
 from transcriber import AudioProcessor
 
+
 # Create processor instance with more conservative settings
 processor = AudioProcessor(model_size="tiny.en", device="cpu")
 
@@ -22,6 +23,8 @@ def process_mic_audio(audio):
     # Get current transcription
     transcription = processor.get_transcription()
     print(transcription)
+    transcription = str(transcription)
+
 
     # Return status update and transcription
     buffer_seconds = buffer_size / processor.sample_rate
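Note on the app.py change: with this commit, get_transcription() no longer returns a plain string but a list of detected sentences (see the transcriber.py hunk below), so the new str(transcription) line coerces that list before it reaches the Gradio text output. A minimal sketch of what the coercion produces; the sample sentences are hypothetical stand-ins:

# Sketch of the coercion added above; the sample sentences are hypothetical
# stand-ins for what get_transcription() now returns.
sentences = ["hello there", "how are you"]
display_text = str(sentences)
print(display_text)   # "['hello there', 'how are you']" - the list's repr, not joined prose
# " ".join(sentences) would render as plain text instead, if that is preferred.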
transcriber.py CHANGED

@@ -3,6 +3,8 @@ import threading
 import time
 from faster_whisper import WhisperModel
 import scipy.signal as signal
+from typing import List
+from punctuators.models import SBDModelONNX
 
 class AudioProcessor:
     def __init__(self, model_size="tiny.en", device="cpu", compute_type="int8"):
@@ -26,6 +28,8 @@ class AudioProcessor:
         self.audio_model = WhisperModel(model_size, device=device, compute_type=compute_type)
         print(f"Initialized {model_size} model on {device}")
 
+        self.sentence_end_detect = SBDModelONNX.from_pretrained("sbd_multi_lang")
+
     def _trim_buffer_intelligently(self):
         """
         Trim the buffer while preserving transcription continuity
@@ -273,7 +277,8 @@ class AudioProcessor:
     def get_transcription(self):
         """Get the current transcription text"""
         with self.lock:
-            return self.full_transcription
+            results: List[List[str]] = self.sentence_end_detect.infer([self.full_transcription])
+            return results[0]
 
     def get_playback_audio(self):
         """Get properly formatted audio for Gradio playback"""
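Note on the transcriber.py change: the commit loads the punctuators sentence-boundary-detection model once in __init__ and runs it inside get_transcription(). As the diff shows, SBDModelONNX.infer() takes a batch of input texts and returns, for each input, a list of detected sentences, which is why the result is typed List[List[str]] and the method returns results[0]. A standalone sketch of that API as the diff uses it; the sample text is illustrative:

from typing import List

from punctuators.models import SBDModelONNX

# Same model the commit loads in AudioProcessor.__init__
sbd = SBDModelONNX.from_pretrained("sbd_multi_lang")

# infer() takes a batch of texts; each input comes back as a list of sentences
texts = ["hello there how are you I am doing fine"]  # illustrative unpunctuated input
results: List[List[str]] = sbd.infer(texts)
print(results[0])  # e.g. ["hello there", "how are you", ...] - one entry per detected sentence

Because get_transcription() now returns a list of sentence strings rather than a string, the str(transcription) coercion added to app.py in the same commit keeps the downstream text display working.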
working.py DELETED

@@ -1,28 +0,0 @@
-transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
-
-def transcribe(stream, new_chunk):
-    sr, y = new_chunk
-
-    # Convert to mono if stereo
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-
-    # Return the stream as state and a string representation of the array for display
-    return stream, str(stream)
-
-demo = gr.Interface(
-    transcribe,
-    ["state", gr.Audio(sources=["microphone"], streaming=True)],
-    ["state", "text"],
-    live=True,
-)
-
-demo.launch()
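Note on the deleted working.py: the prototype referenced pipeline, np, and gr without ever importing them (and never actually called the transcriber it built), so it could not have run as committed. For reference, a sketch of the imports it implicitly assumed, drawn from the standard transformers, Gradio, and NumPy packages:

# Imports the deleted prototype relied on but never declared (a sketch;
# without them the file would raise NameError as soon as it ran):
import numpy as np                  # np.float32, np.abs, np.max, np.concatenate
import gradio as gr                 # gr.Interface, gr.Audio
from transformers import pipeline   # builds the "automatic-speech-recognition" pipeline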