Spaces:

Scrapyard-Brampton
/

Testing

Sleeping

App Files Community

Scrapyard committed on Aug 8, 2025

Commit

bc075a6

1 Parent(s): a748eff

it works idk how but it does

Browse files

Files changed (1) hide show

app.py +163 -52

app.py CHANGED Viewed

@@ -1,68 +1,179 @@
 import gradio as gr
 import numpy as np
 from faster_whisper import WhisperModel
-from faster_whisper.transcribe import Segment
 audio_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
-transcription = ['']
-buffer = np.array([])
-def transcribe(SampleRate, data):
-    global buffer
-    if SampleRate * 3 >= len(buffer):
-        print("buffer big")
-        segments, info = audio_model.transcribe(buffer, beam_size=5)
-        result = (list(segments))
-        text = ""
-        if result and len(result) > 0:
-            text = result[0].text
-            print("Text:", text)
-        else:
-            text = ""
-            print("No text found")
-            print(result)
-        buffer = np.array([])
-        return(text)
-    else:
-        buffer = np.concatenate([buffer, data])
-        print("buffer small")
-        return None
-def normaliseData(audioInput, stream):
-    sr, y = audioInput
-    # Convert to mono if stereo
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-    words = transcribe(sr, y)
-    # Return the stream as state and a string representation of the array for display
-    return stream, words,
-with gr.Blocks() as demo:
-    audioInput = gr.Audio(sources=["microphone"], streaming=True)
-    audioOutput = gr.Textbox(label="Output")
-    state = gr.State()
-    audioInput.stream(
-        fn=normaliseData,
-        inputs=[audioInput, state],
-        outputs=[state, audioOutput] # try switching it arround
     )
-demo.launch()

 import gradio as gr
 import numpy as np
 from faster_whisper import WhisperModel
+import threading
+import time
+import scipy.signal as signal
+# Initialize the WhisperModel once at import time: "tiny.en" weights, CPU device, int8 compute type
 audio_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
+class AudioProcessor:
+    def __init__(self):
+        self.audio_buffer = np.array([])  # Stores raw audio for playback
+        self.sample_rate = 16000          # Default sample rate for whisper
+        self.lock = threading.Lock()      # Thread safety for buffer access
+        self.transcription = ['']         # List of transcription segments
+        self.min_process_length = 1 * self.sample_rate  # Process at least 1 second
+        self.max_buffer_size = 30 * self.sample_rate  # Maximum buffer size (30 seconds)
+        self.last_process_time = time.time()
+        self.process_interval = 1.0       # Process every 1 second
+    def add_audio(self, audio_data, sr):
+        """Add audio to the buffer and process if needed"""
+        with self.lock:
+            # Convert to mono if stereo
+            if audio_data.ndim > 1:
+                audio_data = audio_data.mean(axis=1)
+            # Keep original format without normalization
+            audio_data = audio_data.astype(np.float32)
+            # Resample properly if needed
+            if sr != self.sample_rate:
+                try:
+                    number_of_samples = int(len(audio_data) * self.sample_rate / sr)
+                    audio_data = signal.resample(audio_data, number_of_samples)
+                except Exception as e:
+                    print(f"Resampling error: {e}")
+                    ratio = self.sample_rate / sr
+                    audio_data = np.interp(
+                        np.arange(0, len(audio_data) * ratio, ratio),
+                        np.arange(0, len(audio_data)),
+                        audio_data
+                    )
+            # Add to buffer without renormalizing
+            if len(self.audio_buffer) == 0:
+                self.audio_buffer = audio_data
+            else:
+                self.audio_buffer = np.concatenate([self.audio_buffer, audio_data])
+            # Trim buffer if it gets too large
+            if len(self.audio_buffer) > self.max_buffer_size:
+                self.audio_buffer = self.audio_buffer[-self.max_buffer_size:]
+            # Check if we should process now
+            should_process = (
+                len(self.audio_buffer) >= self.min_process_length and
+                time.time() - self.last_process_time >= self.process_interval
+            )
+            if should_process:
+                self.last_process_time = time.time()
+                # Process the buffer in a separate thread to avoid blocking
+                threading.Thread(target=self._process_audio).start()
+            return len(self.audio_buffer)
+    def _process_audio(self):
+        """Process the current audio buffer (should be called in a separate thread)"""
+        with self.lock:
+            # Make a copy for processing
+            audio = self.audio_buffer.copy()
+        # Normalize for transcription
+        audio_norm = audio.astype(np.float32)
+        if np.max(np.abs(audio_norm)) > 0:
+            audio_norm = audio_norm / np.max(np.abs(audio_norm))
+        try:
+            # Transcribe with whisper
+            segments, info = audio_model.transcribe(audio_norm, beam_size=5)
+            result = list(segments)
+            if result:
+                with self.lock:
+                    # Update the transcription
+                    self.transcription = [seg.text for seg in result]
+        except Exception as e:
+            print(f"Transcription error: {e}")
+    def get_transcription(self):
+        """Get the current transcription text"""
+        with self.lock:
+            return " ".join(self.transcription)
+    def clear_buffer(self):
+        """Clear the audio buffer"""
+        with self.lock:
+            self.audio_buffer = np.array([])
+            self.transcription = ['']
+            return "Buffers cleared"
+    def get_playback_audio(self):
+        """Get properly formatted audio for Gradio playback"""
+        with self.lock:
+            if len(self.audio_buffer) == 0:
+                return None
+            # Make a copy and ensure proper format for Gradio
+            audio = self.audio_buffer.copy()
+            # Ensure audio is in the correct range for playback (-1 to 1)
+            if np.max(np.abs(audio)) > 0:
+                audio = audio / max(1.0, np.max(np.abs(audio)))
+            return (self.sample_rate, audio)
+# Create the single module-level processor instance used by the Gradio callbacks below
+processor = AudioProcessor()
+def process_mic_audio(audio):
+    """Process audio from Gradio microphone and update transcription"""
+    if audio is None:
+        return gr.update(), gr.update()
+    sr, y = audio
+    # Add to processor and possibly trigger transcription
+    buffer_size = processor.add_audio(y, sr)
+    # Get current transcription
+    transcription = processor.get_transcription()
+    # Return status update and transcription
+    buffer_seconds = buffer_size / processor.sample_rate
+    return (
+        f"Buffer size: {buffer_size} samples ({buffer_seconds:.2f} seconds)",
+        transcription
+    )
+def clear_audio_buffer():
+    """Clear the audio buffer"""
+    return processor.clear_buffer(), gr.update(), ""
+def get_current_buffer():
+    """Get the current buffer for playback"""
+    return processor.get_playback_audio()
+# Create Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Live Speech Recognition with Buffer Playback")
+    with gr.Row():
+        audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Microphone Input")  # streamed microphone source
+    with gr.Row():
+        status_output = gr.Textbox(label="Buffer Status", interactive=False)  # read-only status text
+        buffer_audio = gr.Audio(label="Current Buffer (Click to Play)", interactive=False)  # read-only player for the buffered audio
+    with gr.Row():
+        clear_btn = gr.Button("Clear Buffer")
+        play_btn = gr.Button("Get Buffer for Playback")
+    with gr.Row():
+        transcription_output = gr.Textbox(label="Live Transcription", lines=5, interactive=False)
+    # Wire events: each streamed chunk goes to process_mic_audio ('every' parameter removed for compatibility)
+    audio_input.stream(
+        process_mic_audio,
+        audio_input,
+        [status_output, transcription_output]
     )
+    clear_btn.click(clear_audio_buffer, None, [status_output, buffer_audio, transcription_output])  # handler returns one value per listed output
+    play_btn.click(get_current_buffer, None, buffer_audio)  # sends get_playback_audio()'s result to the player
+# Launch the interface (runs at import time; there is no __main__ guard)
+demo.launch()