Removed dynamic INT8 quantisation, reduced logging, shrank the audio queue, removed the mono mix-down; all optimisation
When `type="numpy"`, Gradio captures microphone audio as mono by default, so the unconditional stereo mix-down (`y.mean(axis=1)`) in `_preprocess` was unnecessary.
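If a stereo source ever does reach the app (a file upload rather than the microphone), a shape check is a cheaper safeguard than the unconditional mix-down that was removed. A minimal sketch; `to_mono` is an illustrative helper, not part of app.py:

```python
import numpy as np

def to_mono(y: np.ndarray) -> np.ndarray:
    # Gradio's type="numpy" microphone input is already 1-D mono,
    # so this branch is a no-op in the streaming path.
    if y.ndim == 2:            # (n_samples, n_channels) from a stereo source
        y = y.mean(axis=1)     # average the channels down to one
    return y
```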
The logs show that dynamic quantisation consistently fails:
```text
21:25:07 - load_model | audio_q=00, txt_q=00 | quantisation skipped (object proxy must define __deepcopy__())
```
The error indicates that the model (or one of its components) cannot be deep-copied because it lacks a `__deepcopy__` implementation, a known limitation of some PyTorch models and object proxies.
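`torch.quantization.quantize_dynamic` deep-copies the model before swapping modules unless `inplace=True`, which is where the proxy error originates. If quantisation were ever revisited, the failure could be reproduced up front instead of caught mid-call; a minimal sketch, with `try_dynamic_int8` as a hypothetical, untested helper:

```python
import copy
import torch

def try_dynamic_int8(model: torch.nn.Module) -> torch.nn.Module:
    # quantize_dynamic calls copy.deepcopy(model) when inplace=False (the
    # default), so a wrapped component without __deepcopy__ fails right here.
    try:
        copy.deepcopy(model)
    except Exception as e:
        print(f"quantisation skipped ({e})")
        return model
    return torch.quantization.quantize_dynamic(
        model,
        {torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU},
        dtype=torch.qint8,
    )
```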
Since quantisation is always skipped, the try/except block provides no performance benefit and adds complexity without value.
The app functions correctly without quantisation, as the successful transcriptions in the logs show (e.g. inference times of ~1.5 s for 4-second chunks, a real-time factor of roughly 0.38).
app.py (CHANGED):

```diff
@@ -29,17 +29,23 @@ logger = logging.getLogger("asr_app")
 # ────────────────────────────────────────────────
 # Constants
 # ────────────────────────────────────────────────
-SR = 16_000
-CHUNK_SECONDS = 4
+SR = 16_000
+CHUNK_SECONDS = 4
 CHUNK_SAMPLES = SR * CHUNK_SECONDS
-
+
+# ────────────────────────────────────────────────
+# Prepare UI Description data from README.md file
+# ────────────────────────────────────────────────
+with open('README.md', 'r', encoding='utf-8') as file:
+    content = file.read()
+README_CONTENT_without_YAML = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)
 
 # ────────────────────────────────────────────────
 # ASR Application
 # ────────────────────────────────────────────────
 class ASRApp:
     def __init__(self):
-        self.audio_queue = queue.Queue(maxsize=
+        self.audio_queue = queue.Queue(maxsize=8)
         self.transcript_queue = queue.Queue()
         self.transcript_list = []
         self._load_model()
@@ -60,41 +66,23 @@ class ASRApp:
             model_name="nvidia/parakeet-tdt-0.6b-v2",
             map_location="cpu",
         )
-        model.eval()
-        # ---- dynamic INT8 quantisation ----
-        try:
-            model = torch.quantization.quantize_dynamic(
-                model,
-                {torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU},
-                dtype=torch.qint8,
-            )
-            self._log("load_model", "INT8 quantisation applied")
-        except Exception as e:
-            self._log("load_model", f"quantisation skipped ({e})")
+        model.eval()
         self.asr_model = model
         self._log("load_model", f"model ready in {time.time()-t0:.1f}s")
-        # warm-up (1 × 1 s of zeros)
         with torch.inference_mode():
-            _ = self.asr_model.transcribe(
-                [np.zeros(SR, dtype=np.float32)]
-            )
+            _ = self.asr_model.transcribe([np.zeros(SR, dtype=np.float32)])
         self._log("load_model", "warm-up done")
-
+
     # ---------- threading ----------
     def _start_worker(self):
-        threading.Thread(
-            target=self._worker,
-            daemon=True,
-        ).start()
+        threading.Thread(target=self._worker, daemon=True).start()
 
     def _worker(self):
         buf = np.array([], dtype=np.float32)
         while True:
             try:
-                # accumulate until CHUNK_SAMPLES
                 while len(buf) < CHUNK_SAMPLES:
                     buf = np.concatenate([buf, self.audio_queue.get()])
-                self._log("_worker", f"buffer={len(buf)}")
                 chunk, buf = buf[:CHUNK_SAMPLES], buf[CHUNK_SAMPLES:]
                 self._log("_worker", f"→ transcribe {len(chunk)} samples")
                 t0 = time.time()
@@ -110,11 +98,7 @@ class ASRApp:
     # ---------- audio preprocessing ----------
     def _preprocess(self, audio):
         sr, y = audio
-
-        y = y.mean(axis=1)
-        if sr != SR:
-            # resample faster with polyphase filter
-            y = signal.resample_poly(y, SR, sr)
+        y = signal.resample_poly(y, SR, sr)
         y = y.astype(np.float32)
         y /= (np.abs(y).max() + 1e-9)
         return y
@@ -134,12 +118,6 @@ class ASRApp:
 # ────────────────────────────────────────────────
 # Gradio UI
 # ────────────────────────────────────────────────
-with open('README.md', 'r', encoding='utf-8') as file:
-    README_CONTENT = file.read()
-
-# Remove YAML frontmatter between the first pair of '---' lines
-README_CONTENT_without_YAML = re.sub(r'^---.*?---\s*', '', README_CONTENT, flags=re.DOTALL)
-
 asr_app = ASRApp()
 with gr.Blocks() as demo:
     mic = gr.Audio(
@@ -155,7 +133,7 @@ with gr.Blocks() as demo:
         fn=asr_app.stream_fn,
         inputs=mic,
         outputs=out,
-        stream_every=0.5,
+        stream_every=0.5,
     )
 
 asr_app._log("main", "launching UI")
```
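The rewritten `_preprocess` resamples unconditionally. That is safe because `scipy.signal.resample_poly` reduces `up` and `down` by their greatest common divisor, so when `sr == SR` the ratio collapses to 1/1 and the input comes back unchanged; the removed `if sr != SR` guard bought nothing. A quick sanity check of that SciPy behaviour:

```python
import numpy as np
from scipy import signal

SR = 16_000

# Same rate: (16000, 16000) reduces to (1, 1) -> no filtering, same samples.
y = np.random.randn(SR).astype(np.float32)
same = signal.resample_poly(y, SR, SR)
assert same.shape == y.shape and np.allclose(same, y)

# Real rate change: 48 kHz -> 16 kHz reduces to (1, 3), length shrinks 3x.
y48 = np.random.randn(3 * SR).astype(np.float32)
down = signal.resample_poly(y48, SR, 48_000)
assert down.shape[0] == SR
```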
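The README handling only moved to module scope; the frontmatter regex is unchanged. On a toy README (illustrative, not the project's actual file) it behaves like this:

```python
import re

readme = """---
title: ASR Demo
sdk: gradio
---
# Realtime ASR
Streaming speech-to-text demo.
"""

# With DOTALL, .*? spans the whole YAML block between the first two '---'
# markers; the trailing \s* also swallows the newline after the closer.
body = re.sub(r'^---.*?---\s*', '', readme, flags=re.DOTALL)
assert body.startswith("# Realtime ASR")
```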
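The smaller queue bound also lines up with the stream settings: at `stream_every=0.5`, a full queue of 8 entries holds 4 s of audio, exactly one `CHUNK_SECONDS` window, and at ~1.5 s of inference per 4 s chunk the worker drains it faster than the microphone fills it. A back-of-the-envelope check, assuming each stream event delivers roughly `stream_every` seconds of samples:

```python
STREAM_EVERY = 0.5    # seconds of audio per stream event (stream_every)
QUEUE_MAXSIZE = 8     # queue.Queue(maxsize=8)
CHUNK_SECONDS = 4

# A full queue buffers exactly one transcription window...
assert QUEUE_MAXSIZE * STREAM_EVERY == CHUNK_SECONDS

# ...and the worker keeps up as long as the real-time factor stays below 1.
INFERENCE_S = 1.5     # observed in the logs for a 4-second chunk
rtf = INFERENCE_S / CHUNK_SECONDS
assert rtf < 1        # ~0.38: drains faster than it fills
```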