Commit f4275bf · 1 parent: bd39e10
Transcript

Files changed:
- inference.py +19 -0
- shared.py +57 -8
- test_websocket.py +1 -0
- ui.py +10 -2
inference.py
CHANGED
@@ -10,6 +10,16 @@ import time
 from typing import Set, Dict, Any
 import traceback
 
+# Check for RealtimeSTT and install if needed
+try:
+    from RealtimeSTT import AudioToTextRecorder
+except ImportError:
+    import subprocess
+    import sys
+    print("Installing RealtimeSTT dependency...")
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "RealtimeSTT"])
+    from RealtimeSTT import AudioToTextRecorder
+
 # Set up logging
 logging.basicConfig(
     level=logging.INFO,
@@ -185,6 +195,15 @@ async def shutdown_event():
     try:
         diart.stop_recording()
         logger.info("Recording stopped")
+
+        # Shutdown RealtimeSTT properly if available
+        if hasattr(diart, 'recorder') and diart.recorder:
+            try:
+                diart.recorder.shutdown()
+                logger.info("Transcription model shut down")
+            except Exception as e:
+                logger.error(f"Error shutting down transcription model: {e}")
+
     except Exception as e:
         logger.error(f"Error stopping recording: {e}")
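Note: the shutdown() call matters because RealtimeSTT runs its transcription in background worker processes; without it the Python process can hang on exit. As a standalone illustration (not code from this commit; the model name is an assumption), the recorder lifecycle the diff relies on looks roughly like this:

# Sketch: construct / feed / shutdown, mirroring the calls used above.
import numpy as np
from RealtimeSTT import AudioToTextRecorder

recorder = AudioToTextRecorder(
    model="tiny.en",        # assumption: any faster-whisper model id works
    use_microphone=False,   # audio is pushed in via feed_audio() instead
    spinner=False,
)

# feed_audio() takes raw 16 kHz, 16-bit mono PCM bytes by default.
chunk = np.zeros(16000, dtype=np.int16)  # one second of silence
recorder.feed_audio(chunk.tobytes())

# recorder.text() would block until a complete utterance is detected,
# so it is only meaningful once real speech has been fed.
recorder.shutdown()  # terminates the worker processes cleanly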
shared.py
CHANGED
@@ -8,6 +8,9 @@ import torchaudio
 from scipy.spatial.distance import cosine
 from scipy.signal import resample
 import logging
+import urllib.request
+# Import RealtimeSTT for transcription
+from RealtimeSTT import AudioToTextRecorder
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -64,12 +67,26 @@ class SpeechBrainEncoder:
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
 
+    def _download_model(self):
+        """Download pre-trained SpeechBrain ECAPA-TDNN model if not present"""
+        model_url = "https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb/resolve/main/embedding_model.ckpt"
+        model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
+
+        if not os.path.exists(model_path):
+            print(f"Downloading ECAPA-TDNN model to {model_path}...")
+            urllib.request.urlretrieve(model_url, model_path)
+
+        return model_path
+
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             # Import SpeechBrain
             from speechbrain.pretrained import EncoderClassifier
 
+            # Get model path
+            model_path = self._download_model()
+
             # Load the pre-trained model
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
@@ -286,7 +303,7 @@ class RealtimeSpeakerDiarization:
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
-        self.recorder = None
+        self.recorder = None  # RealtimeSTT recorder
         self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
@@ -314,6 +331,25 @@ class RealtimeSpeakerDiarization:
                 change_threshold=self.change_threshold,
                 max_speakers=self.max_speakers
             )
+
+            # Initialize RealtimeSTT transcription model
+            self.recorder = AudioToTextRecorder(
+                spinner=False,
+                use_microphone=False,
+                model=FINAL_TRANSCRIPTION_MODEL,
+                language=TRANSCRIPTION_LANGUAGE,
+                silero_sensitivity=SILERO_SENSITIVITY,
+                webrtc_sensitivity=WEBRTC_SENSITIVITY,
+                post_speech_silence_duration=0.7,
+                min_length_of_recording=MIN_LENGTH_OF_RECORDING,
+                pre_recording_buffer_duration=PRE_RECORDING_BUFFER_DURATION,
+                enable_realtime_transcription=True,
+                realtime_processing_pause=0,
+                realtime_model_type=REALTIME_TRANSCRIPTION_MODEL,
+                on_realtime_transcription_stabilized=self.live_text_detected,
+                on_recording_complete=self.process_final_text
+            )
+
             logger.info("Models initialized successfully!")
             return True
         else:
@@ -416,6 +452,11 @@ class RealtimeSpeakerDiarization:
             self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
             self.sentence_thread.start()
 
+            # Start the RealtimeSTT recorder if not already started
+            if self.recorder and not getattr(self.recorder, '_is_running', False):
+                self.recorder.start()
+                logger.info("RealtimeSTT recorder started")
+
             return "Recording started successfully!"
 
         except Exception as e:
@@ -425,6 +466,15 @@ class RealtimeSpeakerDiarization:
     def stop_recording(self):
         """Stop the recording process"""
         self.is_running = False
+
+        # Stop the RealtimeSTT recorder
+        if self.recorder:
+            try:
+                self.recorder.stop()
+                logger.info("RealtimeSTT recorder stopped")
+            except Exception as e:
+                logger.error(f"Error stopping recorder: {e}")
+
         return "Recording stopped!"
 
     def clear_conversation(self):
@@ -573,6 +623,12 @@ class RealtimeSpeakerDiarization:
             # Add to audio processor buffer for speaker detection
            self.audio_processor.add_audio_chunk(audio_data)
 
+            # Feed to RealtimeSTT for transcription
+            if self.recorder:
+                # Convert to int16 for RealtimeSTT
+                audio_int16 = (audio_data * 32768).astype(np.int16)
+                self.recorder.feed_audio(audio_int16.tobytes())
+
             # Periodically extract embeddings for speaker detection
             embedding = None
             speaker_id = self.speaker_detector.current_speaker
@@ -582,12 +638,6 @@
                 embedding = self.audio_processor.extract_embedding_from_buffer()
                 if embedding is not None:
                     speaker_id, similarity = self.speaker_detector.add_embedding(embedding)
-
-                    # Add a simulated sentence for demo purposes
-                    if similarity < 0.5:
-                        with self.transcription_lock:
-                            self.full_sentences.append((f"[Audio segment {self.speaker_detector.segment_counter}]", speaker_id))
-                            self.update_conversation_display()
 
             # Return processing result
             return {
@@ -595,7 +645,6 @@
                 "buffer_size": len(self.audio_processor.audio_buffer),
                 "speaker_id": int(speaker_id) if not isinstance(speaker_id, int) else speaker_id,
                 "similarity": float(similarity) if embedding is not None and not isinstance(similarity, float) else similarity,
-                "latest_sentence": f"[Audio segment {self.speaker_detector.segment_counter}]" if similarity < 0.5 else None,
                 "conversation_html": self.current_conversation
             }
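The AudioToTextRecorder initialization above references configuration constants (FINAL_TRANSCRIPTION_MODEL, REALTIME_TRANSCRIPTION_MODEL, TRANSCRIPTION_LANGUAGE, SILERO_SENSITIVITY, WEBRTC_SENSITIVITY, MIN_LENGTH_OF_RECORDING, PRE_RECORDING_BUFFER_DURATION) whose definitions fall outside the hunks shown. A plausible definition block is sketched below; the two model names mirror the JavaScript fallbacks this commit adds to ui.py, and every other value is an assumption:

# Plausible config block elsewhere in shared.py (not shown in this diff).
FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"     # matches the ui.py fallback
REALTIME_TRANSCRIPTION_MODEL = "distil-small.en"  # matches the ui.py fallback
TRANSCRIPTION_LANGUAGE = "en"         # assumption
SILERO_SENSITIVITY = 0.4              # assumption: Silero VAD sensitivity, 0-1
WEBRTC_SENSITIVITY = 3                # assumption: WebRTC VAD aggressiveness, 0-3
MIN_LENGTH_OF_RECORDING = 0.7         # assumption: seconds
PRE_RECORDING_BUFFER_DURATION = 0.35  # assumption: seconds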
test_websocket.py
CHANGED
@@ -15,6 +15,7 @@ async def test_ws():
        audio = (np.random.randn(3200) * 3000).astype(np.int16)
        await websocket.send(audio.tobytes())
        print(f"Sent audio chunk {i+1}/20")
+        await asyncio.sleep(0.05)
 
    try:
        while True:
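The added asyncio.sleep(0.05) paces the synthetic stream so the server receives chunks over time instead of all at once. Assembled into a runnable client it looks roughly like the sketch below; only the loop body appears in the diff, so the endpoint URL and the receive loop are assumptions:

# Sketch of the full paced test client.
import asyncio
import numpy as np
import websockets

async def test_ws():
    uri = "ws://localhost:8000/ws_inference"  # assumed endpoint
    async with websockets.connect(uri) as websocket:
        for i in range(20):
            # 3200 int16 samples = 0.2 s of audio at 16 kHz
            audio = (np.random.randn(3200) * 3000).astype(np.int16)
            await websocket.send(audio.tobytes())
            print(f"Sent audio chunk {i+1}/20")
            await asyncio.sleep(0.05)

        try:
            while True:
                reply = await asyncio.wait_for(websocket.recv(), timeout=5)
                print(f"Received: {reply}")
        except asyncio.TimeoutError:
            print("No more responses; closing.")

asyncio.run(test_ws())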
ui.py
CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 from fastapi import FastAPI
-from shared import DEFAULT_CHANGE_THRESHOLD, DEFAULT_MAX_SPEAKERS, ABSOLUTE_MAX_SPEAKERS
+from shared import DEFAULT_CHANGE_THRESHOLD, DEFAULT_MAX_SPEAKERS, ABSOLUTE_MAX_SPEAKERS, FINAL_TRANSCRIPTION_MODEL, REALTIME_TRANSCRIPTION_MODEL
 print(gr.__version__)
 # Connection configuration (separate signaling server from model server)
 # These will be replaced at deployment time with the correct URLs
@@ -23,7 +23,10 @@ def build_ui():
 
     # Header and description
     gr.Markdown("# 🎤 Live Speaker Diarization")
-    gr.Markdown("Real-time speech recognition with automatic speaker identification")
+    gr.Markdown(f"Real-time speech recognition with automatic speaker identification")
+
+    # Add transcription model info
+    gr.Markdown(f"**Using Models:** Final: {FINAL_TRANSCRIPTION_MODEL}, Realtime: {REALTIME_TRANSCRIPTION_MODEL}")
 
     # Status indicator
     connection_status = gr.HTML(
@@ -459,6 +462,11 @@ def build_ui():
                 <li>Threshold: ${threshold}</li>
                 <li>Max Speakers: ${maxSpeakers}</li>
                 </ul>
+                <p>Transcription Models:</p>
+                <ul>
+                <li>Final: ${window.FINAL_TRANSCRIPTION_MODEL || "distil-large-v3"}</li>
+                <li>Realtime: ${window.REALTIME_TRANSCRIPTION_MODEL || "distil-small.en"}</li>
+                </ul>
                 `;
             }
         });
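One loose end: the status panel reads window.FINAL_TRANSCRIPTION_MODEL and window.REALTIME_TRANSCRIPTION_MODEL, but no hunk in this commit assigns those globals, so the || fallbacks will always be used. A hypothetical bridge from the Python constants could look like the sketch below (whether an inline script executes from gr.HTML depends on the Gradio version; gr.Blocks(head=...) is an alternative injection point):

# Hypothetical (not in this commit): expose the Python-side model names to
# the page's JavaScript so the status panel reflects the real configuration.
gr.HTML(f'''
<script>
window.FINAL_TRANSCRIPTION_MODEL = "{FINAL_TRANSCRIPTION_MODEL}";
window.REALTIME_TRANSCRIPTION_MODEL = "{REALTIME_TRANSCRIPTION_MODEL}";
</script>
''')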