Removed dynamic INT8 quantisation, reduced logging, shrank the audio queue, removed the mono mix-down; all optimisation
When `type="numpy"`, Gradio captures microphone audio as mono by default, so the unconditional stereo mix-down (`y.mean(axis=1)`) in `_preprocess` was unnecessary.
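If a stereo source ever does reach the app (a file upload rather than the microphone), a shape check is a cheaper safeguard than the unconditional mix-down that was removed. A minimal sketch; `to_mono` is an illustrative helper, not part of app.py:

```python
import numpy as np

def to_mono(y: np.ndarray) -> np.ndarray:
    # Gradio's type="numpy" microphone input is already 1-D mono,
    # so this branch is a no-op in the streaming path.
    if y.ndim == 2:            # (n_samples, n_channels) from a stereo source
        y = y.mean(axis=1)     # average the channels down to one
    return y
```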
The logs show that dynamic quantisation consistently fails:
```text
21:25:07 - load_model | audio_q=00, txt_q=00 | quantisation skipped (object proxy must define __deepcopy__())
```
The error indicates that the model (or one of its components) cannot be deep-copied because it lacks a `__deepcopy__` implementation, a known limitation of some PyTorch models and object proxies.
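`torch.quantization.quantize_dynamic` deep-copies the model before swapping modules unless `inplace=True`, which is where the proxy error originates. If quantisation were ever revisited, the failure could be reproduced up front instead of caught mid-call; a minimal sketch, with `try_dynamic_int8` as a hypothetical, untested helper:

```python
import copy
import torch

def try_dynamic_int8(model: torch.nn.Module) -> torch.nn.Module:
    # quantize_dynamic calls copy.deepcopy(model) when inplace=False (the
    # default), so a wrapped component without __deepcopy__ fails right here.
    try:
        copy.deepcopy(model)
    except Exception as e:
        print(f"quantisation skipped ({e})")
        return model
    return torch.quantization.quantize_dynamic(
        model,
        {torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU},
        dtype=torch.qint8,
    )
```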
Since quantisation is always skipped, the try/except block provides no performance benefit and adds complexity without value.
The app functions correctly without quantisation, as the successful transcriptions in the logs show (e.g. inference times of ~1.5 s for 4-second chunks, a real-time factor of roughly 0.38).
app.py (CHANGED):

```diff
@@ -29,17 +29,23 @@ logger = logging.getLogger("asr_app")
 # ────────────────────────────────────────────────
 # Constants
 # ────────────────────────────────────────────────
-SR = 16_000
-CHUNK_SECONDS = 4
+SR = 16_000
+CHUNK_SECONDS = 4
 CHUNK_SAMPLES = SR * CHUNK_SECONDS
-
+
+# ────────────────────────────────────────────────
+# Prepare UI Description data from README.md file
+# ────────────────────────────────────────────────
+with open('README.md', 'r', encoding='utf-8') as file:
+    content = file.read()
+README_CONTENT_without_YAML = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)
 
 # ────────────────────────────────────────────────
 # ASR Application
 # ────────────────────────────────────────────────
 class ASRApp:
     def __init__(self):
-        self.audio_queue = queue.Queue(maxsize=
+        self.audio_queue = queue.Queue(maxsize=8)
         self.transcript_queue = queue.Queue()
         self.transcript_list = []
         self._load_model()
@@ -60,41 +66,23 @@ class ASRApp:
             model_name="nvidia/parakeet-tdt-0.6b-v2",
             map_location="cpu",
         )
-        model.eval()
-        # ---- dynamic INT8 quantisation ----
-        try:
-            model = torch.quantization.quantize_dynamic(
-                model,
-                {torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU},
-                dtype=torch.qint8,
-            )
-            self._log("load_model", "INT8 quantisation applied")
-        except Exception as e:
-            self._log("load_model", f"quantisation skipped ({e})")
+        model.eval()
         self.asr_model = model
         self._log("load_model", f"model ready in {time.time()-t0:.1f}s")
-        # warm-up (1 × 1 s of zeros)
         with torch.inference_mode():
-            _ = self.asr_model.transcribe(
-                [np.zeros(SR, dtype=np.float32)]
-            )
+            _ = self.asr_model.transcribe([np.zeros(SR, dtype=np.float32)])
         self._log("load_model", "warm-up done")
-
+
     # ---------- threading ----------
     def _start_worker(self):
-        threading.Thread(
-            target=self._worker,
-            daemon=True,
-        ).start()
+        threading.Thread(target=self._worker, daemon=True).start()
 
     def _worker(self):
         buf = np.array([], dtype=np.float32)
         while True:
             try:
-                # accumulate until CHUNK_SAMPLES
                 while len(buf) < CHUNK_SAMPLES:
                     buf = np.concatenate([buf, self.audio_queue.get()])
-                self._log("_worker", f"buffer={len(buf)}")
                 chunk, buf = buf[:CHUNK_SAMPLES], buf[CHUNK_SAMPLES:]
                 self._log("_worker", f"→ transcribe {len(chunk)} samples")
                 t0 = time.time()
@@ -110,11 +98,7 @@ class ASRApp:
     # ---------- audio preprocessing ----------
     def _preprocess(self, audio):
         sr, y = audio
-
-        y = y.mean(axis=1)
-        if sr != SR:
-            # resample faster with polyphase filter
-            y = signal.resample_poly(y, SR, sr)
+        y = signal.resample_poly(y, SR, sr)
         y = y.astype(np.float32)
         y /= (np.abs(y).max() + 1e-9)
         return y
@@ -134,12 +118,6 @@ class ASRApp:
 # ────────────────────────────────────────────────
 # Gradio UI
 # ────────────────────────────────────────────────
-with open('README.md', 'r', encoding='utf-8') as file:
-    README_CONTENT = file.read()
-
-# Remove YAML frontmatter between the first pair of '---' lines
-README_CONTENT_without_YAML = re.sub(r'^---.*?---\s*', '', README_CONTENT, flags=re.DOTALL)
-
 asr_app = ASRApp()
 with gr.Blocks() as demo:
     mic = gr.Audio(
@@ -155,7 +133,7 @@ with gr.Blocks() as demo:
         fn=asr_app.stream_fn,
         inputs=mic,
         outputs=out,
-        stream_every=0.5,
+        stream_every=0.5,
     )
 
 asr_app._log("main", "launching UI")
```
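The rewritten `_preprocess` resamples unconditionally. That is safe because `scipy.signal.resample_poly` reduces `up` and `down` by their greatest common divisor, so when `sr == SR` the ratio collapses to 1/1 and the input comes back unchanged; the removed `if sr != SR` guard bought nothing. A quick sanity check of that SciPy behaviour:

```python
import numpy as np
from scipy import signal

SR = 16_000

# Same rate: (16000, 16000) reduces to (1, 1) -> no filtering, same samples.
y = np.random.randn(SR).astype(np.float32)
same = signal.resample_poly(y, SR, SR)
assert same.shape == y.shape and np.allclose(same, y)

# Real rate change: 48 kHz -> 16 kHz reduces to (1, 3), length shrinks 3x.
y48 = np.random.randn(3 * SR).astype(np.float32)
down = signal.resample_poly(y48, SR, 48_000)
assert down.shape[0] == SR
```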
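The README handling only moved to module scope; the frontmatter regex is unchanged. On a toy README (illustrative, not the project's actual file) it behaves like this:

```python
import re

readme = """---
title: ASR Demo
sdk: gradio
---
# Realtime ASR
Streaming speech-to-text demo.
"""

# With DOTALL, .*? spans the whole YAML block between the first two '---'
# markers; the trailing \s* also swallows the newline after the closer.
body = re.sub(r'^---.*?---\s*', '', readme, flags=re.DOTALL)
assert body.startswith("# Realtime ASR")
```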
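The smaller queue bound also lines up with the stream settings: at `stream_every=0.5`, a full queue of 8 entries holds 4 s of audio, exactly one `CHUNK_SECONDS` window, and at ~1.5 s of inference per 4 s chunk the worker drains it faster than the microphone fills it. A back-of-the-envelope check, assuming each stream event delivers roughly `stream_every` seconds of samples:

```python
STREAM_EVERY = 0.5    # seconds of audio per stream event (stream_every)
QUEUE_MAXSIZE = 8     # queue.Queue(maxsize=8)
CHUNK_SECONDS = 4

# A full queue buffers exactly one transcription window...
assert QUEUE_MAXSIZE * STREAM_EVERY == CHUNK_SECONDS

# ...and the worker keeps up as long as the real-time factor stays below 1.
INFERENCE_S = 1.5     # observed in the logs for a 4-second chunk
rtf = INFERENCE_S / CHUNK_SECONDS
assert rtf < 1        # ~0.38: drains faster than it fills
```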