WJ88 committed on
Commit
cbb0ac8
·
verified ·
1 Parent(s): 10e2040

Removed dynamic INT8 quantization, reduced logging, reduced audio queue size, removed the mono check (all optimizations)

Browse files

When type="numpy", Gradio captures microphone audio in mono by default, so the explicit stereo-to-mono down-mix check in _preprocess was removed.

The logs show that quantization consistently fails:
21:25:07 - load_model | audio_q=00, txt_q=00 | quantisation skipped (object proxy must define __deepcopy__())
The error indicates that the model (or one of its components) does not support dynamic quantization because of a missing __deepcopy__ implementation, a known limitation of some PyTorch models and proxy objects.
Since quantization is skipped and provides no performance benefit in this case, the try-except block is unnecessary and adds complexity without value.
The app functions correctly without quantization, as evidenced by successful transcriptions in the logs (e.g., inference times of ~1.5s for 4-second chunks).

Files changed (1) hide show
  1. app.py +16 -38
app.py CHANGED
@@ -29,17 +29,23 @@ logger = logging.getLogger("asr_app")
29
  # ────────────────────────────────────────────────
30
  # Constants
31
  # ────────────────────────────────────────────────
32
- SR = 16_000 # Model sample-rate
33
- CHUNK_SECONDS = 4 # seconds per inference window
34
  CHUNK_SAMPLES = SR * CHUNK_SECONDS
35
- README_CONTENT = ""
 
 
 
 
 
 
36
 
37
  # ────────────────────────────────────────────────
38
  # ASR Application
39
  # ────────────────────────────────────────────────
40
  class ASRApp:
41
  def __init__(self):
42
- self.audio_queue = queue.Queue(maxsize=100)
43
  self.transcript_queue = queue.Queue()
44
  self.transcript_list = []
45
  self._load_model()
@@ -60,41 +66,23 @@ class ASRApp:
60
  model_name="nvidia/parakeet-tdt-0.6b-v2",
61
  map_location="cpu",
62
  )
63
- model.eval() # inference mode
64
- # ---- dynamic INT8 quantisation ----
65
- try:
66
- model = torch.quantization.quantize_dynamic(
67
- model,
68
- {torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU},
69
- dtype=torch.qint8,
70
- )
71
- self._log("load_model", "INT8 quantisation applied")
72
- except Exception as e:
73
- self._log("load_model", f"quantisation skipped ({e})")
74
  self.asr_model = model
75
  self._log("load_model", f"model ready in {time.time()-t0:.1f}s")
76
- # warm-up (1 × 1 s of zeros)
77
  with torch.inference_mode():
78
- _ = self.asr_model.transcribe(
79
- [np.zeros(SR, dtype=np.float32)]
80
- )
81
  self._log("load_model", "warm-up done")
82
-
83
  # ---------- threading ----------
84
  def _start_worker(self):
85
- threading.Thread(
86
- target=self._worker,
87
- daemon=True,
88
- ).start()
89
 
90
  def _worker(self):
91
  buf = np.array([], dtype=np.float32)
92
  while True:
93
  try:
94
- # accumulate until CHUNK_SAMPLES
95
  while len(buf) < CHUNK_SAMPLES:
96
  buf = np.concatenate([buf, self.audio_queue.get()])
97
- self._log("_worker", f"buffer={len(buf)}")
98
  chunk, buf = buf[:CHUNK_SAMPLES], buf[CHUNK_SAMPLES:]
99
  self._log("_worker", f"→ transcribe {len(chunk)} samples")
100
  t0 = time.time()
@@ -110,11 +98,7 @@ class ASRApp:
110
  # ---------- audio preprocessing ----------
111
  def _preprocess(self, audio):
112
  sr, y = audio
113
- if y.ndim > 1:
114
- y = y.mean(axis=1)
115
- if sr != SR:
116
- # resample faster with polyphase filter
117
- y = signal.resample_poly(y, SR, sr)
118
  y = y.astype(np.float32)
119
  y /= (np.abs(y).max() + 1e-9)
120
  return y
@@ -134,12 +118,6 @@ class ASRApp:
134
  # ────────────────────────────────────────────────
135
  # Gradio UI
136
  # ────────────────────────────────────────────────
137
- with open('README.md', 'r', encoding='utf-8') as file:
138
- README_CONTENT = file.read()
139
-
140
- # Remove YAML frontmatter between the first pair of '---' lines
141
- README_CONTENT_without_YAML = re.sub(r'^---.*?---\s*', '', README_CONTENT, flags=re.DOTALL)
142
-
143
  asr_app = ASRApp()
144
  with gr.Blocks() as demo:
145
  mic = gr.Audio(
@@ -155,7 +133,7 @@ with gr.Blocks() as demo:
155
  fn=asr_app.stream_fn,
156
  inputs=mic,
157
  outputs=out,
158
- stream_every=0.5, # ↓ UI calls per second
159
  )
160
 
161
  asr_app._log("main", "launching UI")
 
29
  # ────────────────────────────────────────────────
30
  # Constants
31
  # ────────────────────────────────────────────────
32
+ SR = 16_000
33
+ CHUNK_SECONDS = 4
34
  CHUNK_SAMPLES = SR * CHUNK_SECONDS
35
+
36
+ # ────────────────────────────────────────────────
37
+ # Prepare UI Description data from README.md file
38
+ # ────────────────────────────────────────────────
39
+ with open('README.md', 'r', encoding='utf-8') as file:
40
+ content = file.read()
41
+ README_CONTENT_without_YAML = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)
42
 
43
  # ────────────────────────────────────────────────
44
  # ASR Application
45
  # ────────────────────────────────────────────────
46
  class ASRApp:
47
  def __init__(self):
48
+ self.audio_queue = queue.Queue(maxsize=8)
49
  self.transcript_queue = queue.Queue()
50
  self.transcript_list = []
51
  self._load_model()
 
66
  model_name="nvidia/parakeet-tdt-0.6b-v2",
67
  map_location="cpu",
68
  )
69
+ model.eval()
 
 
 
 
 
 
 
 
 
 
70
  self.asr_model = model
71
  self._log("load_model", f"model ready in {time.time()-t0:.1f}s")
 
72
  with torch.inference_mode():
73
+ _ = self.asr_model.transcribe([np.zeros(SR, dtype=np.float32)])
 
 
74
  self._log("load_model", "warm-up done")
75
+
76
  # ---------- threading ----------
77
  def _start_worker(self):
78
+ threading.Thread(target=self._worker, daemon=True).start()
 
 
 
79
 
80
  def _worker(self):
81
  buf = np.array([], dtype=np.float32)
82
  while True:
83
  try:
 
84
  while len(buf) < CHUNK_SAMPLES:
85
  buf = np.concatenate([buf, self.audio_queue.get()])
 
86
  chunk, buf = buf[:CHUNK_SAMPLES], buf[CHUNK_SAMPLES:]
87
  self._log("_worker", f"→ transcribe {len(chunk)} samples")
88
  t0 = time.time()
 
98
  # ---------- audio preprocessing ----------
99
  def _preprocess(self, audio):
100
  sr, y = audio
101
+ y = signal.resample_poly(y, SR, sr)
 
 
 
 
102
  y = y.astype(np.float32)
103
  y /= (np.abs(y).max() + 1e-9)
104
  return y
 
118
  # ────────────────────────────────────────────────
119
  # Gradio UI
120
  # ────────────────────────────────────────────────
 
 
 
 
 
 
121
  asr_app = ASRApp()
122
  with gr.Blocks() as demo:
123
  mic = gr.Audio(
 
133
  fn=asr_app.stream_fn,
134
  inputs=mic,
135
  outputs=out,
136
+ stream_every=0.5,
137
  )
138
 
139
  asr_app._log("main", "launching UI")