Spaces:

Archime
/

canary_aed_streaming

Running on Zero

App Files Files Community

Archime committed 16 days ago

Commit

4f560c0

1 Parent(s): f76a38c

optimise read_and_stream_audio

Browse files

Files changed (1) hide show

app/utils.py +88 -76

app/utils.py CHANGED Viewed

@@ -5,6 +5,7 @@ import asyncio
 import os
 import time
 import numpy as np
 import spaces
 import hmac
 import hashlib
@@ -29,6 +30,11 @@ from app.silero_vad_engine import Silero_Vad_Engine
 from app.streaming_audio_processor import StreamingAudioProcessor,StreamingAudioProcessorConfig
 import nemo.collections.asr as nemo_asr
 READ_SIZE=4000
 # --------------------------------------------------------
 # Utility functions
@@ -68,7 +74,7 @@ def generate_coturn_config():
-def read_and_stream_audio(filepath_to_stream: str, session_hash_code: str,read_size:int =8000, sample_rate:int =16000):
     """
     Read an audio file and stream it chunk by chunk (read_size / sample_rate seconds per chunk; 0.5 s with the defaults).
     Handles errors safely and reports structured messages to the client.
@@ -84,55 +90,104 @@ def read_and_stream_audio(filepath_to_stream: str, session_hash_code: str,read_s
     try:
         segment = AudioSegment.from_file(filepath_to_stream)
         chunk_duration_ms = int((read_size/sample_rate)*1000)
         total_chunks = len(segment) // chunk_duration_ms + 1
         start_streaming(session_hash_code)
-        logging.info(f"[{session_hash_code}] Starting audio streaming {filepath_to_stream} ({total_chunks} chunks).")
-        for i, chunk in enumerate(segment[::chunk_duration_ms]):
             frame_rate = chunk.frame_rate
-            samples = np.array(chunk.get_array_of_samples()).reshape(1, -1)
             progress = round(((i + 1) / total_chunks) * 100, 2)
             if is_stop_requested(session_hash_code):
-                logging.info(f"[{session_hash_code}] Stop signal received. Terminating stream.")
-                yield ((frame_rate, samples), AdditionalOutputs({"stoped": True, "value": "STREAM_STOPED", "session_hash_code" : session_hash_code } ) )
                 break
-            yield ((frame_rate, samples), AdditionalOutputs({"progressed": True, "value": progress , "session_hash_code" : session_hash_code} ))
-            logging.debug(f"[{session_hash_code}] Sent chunk {i+1}/{total_chunks} ({progress}%).")
-            time.sleep(chunk_duration_ms/1000)
-            #  Save only if transcription is active
-            if os.path.exists(task_active_flag) :
-                chunk_dir = get_session_hashe_chunks_dir(session_hash_code)
-                if not os.path.exists(chunk_dir) :
                     os.makedirs(chunk_dir, exist_ok=True)
                 npz_path = os.path.join(chunk_dir, f"chunk_{i:05d}.npz")
-                chunk_array = np.array(chunk.get_array_of_samples(), dtype=np.int16)
-                if os.path.exists(task_active_flag):
-                    np.savez_compressed(npz_path, data=chunk_array, rate=frame_rate)
-                    logging.debug(f"[{session_hash_code}] Saved chunk {i}/{total_chunks} (transcribe active) ({progress}%) ({npz_path}).")
-            # raise_error()  # Optional injected test exception
-        logging.info(f"[{session_hash_code}] Audio streaming completed successfully.")
-    except asyncio.CancelledError:
-        yield from handle_stream_error(session_hash_code, "Streaming cancelled by user.")
-    except FileNotFoundError as e:
-        yield from handle_stream_error(session_hash_code, e)
     except Exception as e:
         yield from handle_stream_error(session_hash_code, e)
     finally:
         remove_active_stream_flag_file(session_hash_code)
-        logging.info(f"[{session_hash_code}] Stop flag reset.")
 # asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b-v2")
 asr_model = None
 @spaces.GPU
 def task_fake(session_hash_code: str,
         task_type, lang_source, lang_target,
@@ -143,23 +198,6 @@ def task_fake(session_hash_code: str,
     """Continuously read and delete .npz chunks while task is active."""
     global asr_model
     yield ("initializing the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
-    ### TODO
-    ##-----------
-    # conf = CanaryConfig.from_params(
-    #     task_type, SUPPORTED_LANGS_MAP.get(lang_source),SUPPORTED_LANGS_MAP.get(lang_target) ,
-    #     chunk_secs, left_context_secs, right_context_secs,
-    #     streaming_policy, alignatt_thr, waitk_lagging,
-    #     exclude_sink_frames, xatt_scores_layer, hallucinations_detector
-    # )
-    # canary_speech_engine = CanarySpeechEngine(asr_model,conf)
-    # silero_vad_engine = Silero_Vad_Engine()
-    # streaming_audio_processor_config = StreamingAudioProcessorConfig(
-    # read_size=READ_SIZE,
-    # silence_threshold_chunks=1
-    # )
-    # streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
-    ##-----------
     yield ("initialized the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
     yield (f"Task started for session {session_hash_code}", "info", None)
@@ -191,29 +229,22 @@ def task_fake(session_hash_code: str,
                     npz = np.load(fpath)
                     samples = npz["data"]
                     rate = int(npz["rate"])
-                    ##-----------
-                    # new_texts = streamer.process_chunk(samples)
-                    # for text in new_texts:
-                    #     print(text, end='', flush=True)
-                    #     yield (text, "success", text)
-                    #     logging.debug(f"[{session_hash_code}] {new_texts}")
-                    ##-----------
-                    ### TODO
                     text = f"Transcribed {fname}: {len(samples)} samples @ {rate}Hz\n"
                     yield (text, "success", fname)
                     os.remove(fpath)
                     logging.debug(f"[{session_hash_code}] Deleted processed chunk: {fname}")
                 except Exception as e:
                     logging.warning(f"[{session_hash_code}] Error processing {fname}: {e}")
                     yield (f"Error processing {fname}: {e}", "warning", fname)
-                    continue
                 time.sleep(0.1)
-        # TODO
-        ##-----------
-        # final_text = streamer.finalize_stream()
-        # yield (text, "success", final_text)
-        ##-----------
         yield ("DONE", "done", None)
         logging.info(f"[{session_hash_code}] task loop ended (flag removed).")
@@ -338,25 +369,6 @@ def task(session_hash_code: str,
         yield ("Task finished and cleaned up.", "done", None)
-def handle_stream_error(session_hash_code: str, error: Exception):
-    """
-    Handle streaming errors:
-    - Log the error
-    - Send structured info to client
-    - Reset stop flag
-    """
-    if isinstance(error, Exception):
-        msg = f"{type(error).__name__}: {str(error)}"
-    else:
-        msg = str(error)
-    logging.error(f"[{session_hash_code}] Streaming error: {msg}", exc_info=isinstance(error, Exception))
-    remove_active_stream_flag_file(session_hash_code)
-    yield ((16000,np.zeros(16000, dtype=np.float32).reshape(1, -1)), AdditionalOutputs({"errored": True, "value": msg, "session_hash_code" : session_hash_code}))
 # --- Decorator compatibility layer ---

 import os
 import time
 import numpy as np
 import spaces
 import hmac
 import hashlib
 from app.streaming_audio_processor import StreamingAudioProcessor,StreamingAudioProcessorConfig
 import nemo.collections.asr as nemo_asr
 READ_SIZE=4000
+import gradio as gr
+from typing import Generator
+from typing import Generator, Tuple, Any, Optional
+GradioAudioYield = Tuple[int, np.ndarray]
+StreamYield = Generator[Tuple[GradioAudioYield, AdditionalOutputs], None, None]
 # --------------------------------------------------------
 # Utility functions
+def read_and_stream_audio(filepath_to_stream: str, session_hash_code: str,read_size:int =8000, sample_rate:int =16000)  -> StreamYield:
     """
     Read an audio file and stream it chunk by chunk (read_size / sample_rate seconds per chunk; 0.5 s with the defaults).
     Handles errors safely and reports structured messages to the client.
     try:
         segment = AudioSegment.from_file(filepath_to_stream)
         chunk_duration_ms = int((read_size/sample_rate)*1000)
+        total_duration_ms = len(segment)
         total_chunks = len(segment) // chunk_duration_ms + 1
         start_streaming(session_hash_code)
+        logging.info(f"[{session_hash_code}] Starting stream: {filepath_to_stream} ({total_chunks} chunks, {chunk_duration_ms}ms steps).")
+        chunk_dir = get_session_hashe_chunks_dir(session_hash_code)
+        ensure_dir_exists = False
+        for i, start_ms in enumerate(range(0, total_duration_ms, chunk_duration_ms)):
+            end_ms = min(start_ms + chunk_duration_ms, total_duration_ms)
+            chunk = segment[start_ms:end_ms]
             frame_rate = chunk.frame_rate
+            samples_int16 = np.array(chunk.get_array_of_samples(), dtype=np.int16)
+            samples_float = (samples_int16 / 32768.0).astype(np.float32)
+            # Mono vs. stereo handling for Gradio
+            if chunk.channels > 1:
+                samples_reshaped = samples_float.reshape(-1, chunk.channels)
+            else:
+                samples_reshaped = samples_float.reshape(1, -1)
             progress = round(((i + 1) / total_chunks) * 100, 2)
+                        # Send to the client
             if is_stop_requested(session_hash_code):
+                logging.info(f"[{session_hash_code}] Stop signal received.")
+                samples = np.array(chunk.get_array_of_samples()).reshape(1, -1)
+                yield (
+                    (sample_rate, samples_reshaped),
+                    AdditionalOutputs({"stoped": True, "value": "STREAM_STOPPED", "session_hash_code": session_hash_code})
+                )
                 break
+            yield (
+                (frame_rate, samples_reshaped),
+                AdditionalOutputs({"progressed": True, "value": progress, "session_hash_code": session_hash_code})
+            )
+            if is_active_task(session_hash_code):
+                if not ensure_dir_exists:
                     os.makedirs(chunk_dir, exist_ok=True)
+                    ensure_dir_exists = True
                 npz_path = os.path.join(chunk_dir, f"chunk_{i:05d}.npz")
+                # Compression is enabled; note that it is slow (CPU-intensive)
+                if is_active_task(session_hash_code):
+                    np.savez_compressed(npz_path, data=samples_int16, rate=frame_rate)
+                    logging.debug(f"[{session_hash_code}] Saved chunk {i} to {npz_path}")
+            time.sleep(chunk_duration_ms/1000)
+            raise_error()  # Optional injected test exception
+        logging.info(f"[{session_hash_code}] Streaming completed.")
     except Exception as e:
         yield from handle_stream_error(session_hash_code, e)
     finally:
         remove_active_stream_flag_file(session_hash_code)
+        logging.info(f"[{session_hash_code}] Cleanup done.")
+def handle_stream_error(session_hash_code: str, error: Exception):
+    """
+    Report a streaming failure to the client as a single final item.
+
+    Steps:
+    - Log the error (with traceback) under the session's hash code
+    - Call remove_active_stream_flag_file() for the session
+    - Yield one item: 16000 zero samples (float32, shape (1, 16000)) at a
+      rate of 16000, paired with an AdditionalOutputs payload of the form
+      {"errored": True, "value": "<ExceptionType>: <message>", "session_hash_code": ...}
+
+    Args:
+        session_hash_code: Identifier of the session whose stream failed.
+        error: The exception that interrupted the stream.
+
+    Yields:
+        ((16000, empty_audio), AdditionalOutputs(...)) exactly once.
+    """
+    msg = f"{type(error).__name__}: {str(error)}"
+    # Pass the exception object instead of True so the traceback is logged
+    # even if this generator is advanced outside the originating except block.
+    logging.error(f"[{session_hash_code}] Stream Error: {msg}", exc_info=error)
+    remove_active_stream_flag_file(session_hash_code)
+    empty_audio = np.zeros((1, 16000), dtype=np.float32)
+    yield (
+            (16000, empty_audio),
+            AdditionalOutputs({"errored": True, "value": msg, "session_hash_code": session_hash_code})
+        )
 # asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b-v2")
 asr_model = None
+# @spaces.cache
+# def load_model():
+#     logging.info("Chargement du modèle ASR/AST de NeMo...")
+#     # Replace with your own model-loading logic
+#     model = nemo_asr.models.EncDecRNNTModel.restore_from("path/to/model.nemo")
+#     logging.info("Modèle chargé.")
+#     return model
+# # Load it only once, at script startup
+# ASR_MODEL = load_model()
 @spaces.GPU
 def task_fake(session_hash_code: str,
         task_type, lang_source, lang_target,
     """Continuously read and delete .npz chunks while task is active."""
     global asr_model
     yield ("initializing the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
     yield ("initialized the CanarySpeechEngine and Silero_Vad_Engine", "info", None)
     yield (f"Task started for session {session_hash_code}", "info", None)
                     npz = np.load(fpath)
                     samples = npz["data"]
                     rate = int(npz["rate"])
                     text = f"Transcribed {fname}: {len(samples)} samples @ {rate}Hz\n"
                     yield (text, "success", fname)
                     os.remove(fpath)
                     logging.debug(f"[{session_hash_code}] Deleted processed chunk: {fname}")
+                    # raise_error()
+                except EOFError as e:
+                    logging.warning(f"[{session_hash_code}] Error processing {fname}: {e}")
+                    yield (f"EOFError processing {fname}: {e}", "warning", fname)
                 except Exception as e:
                     logging.warning(f"[{session_hash_code}] Error processing {fname}: {e}")
                     yield (f"Error processing {fname}: {e}", "warning", fname)
+                    # continue
                 time.sleep(0.1)
         yield ("DONE", "done", None)
         logging.info(f"[{session_hash_code}] task loop ended (flag removed).")
         yield ("Task finished and cleaned up.", "done", None)
 # --- Decorator compatibility layer ---