Manjunath Kudlur
committed on
Commit
·
f5961bb
1
Parent(s):
7e3cf0f
Detect repetition while decoding
Browse files
- decoder_worker.js +52 -2
- index.html +3 -3
decoder_worker.js
CHANGED
|
@@ -73,6 +73,46 @@ async function fetchModelWithProgress(url, modelName) {
|
|
| 73 |
let cfg = null;
|
| 74 |
let tailLatency = 0;
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
// Sessions
|
| 77 |
let adapterSession = null;
|
| 78 |
let decoderInitSession = null;
|
|
@@ -203,7 +243,9 @@ async function decodeAccumulated() {
|
|
| 203 |
await initDecoderCache(context);
|
| 204 |
|
| 205 |
const numFrames = accumulatedFeatures.dims[1];
|
| 206 |
-
|
|
|
|
|
|
|
| 207 |
|
| 208 |
const tokens = [1]; // BOS
|
| 209 |
for (let step = 0; step < maxTokens; step++) {
|
|
@@ -219,7 +261,15 @@ async function decodeAccumulated() {
|
|
| 219 |
}
|
| 220 |
|
| 221 |
tokens.push(maxIdx);
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
}
|
| 224 |
|
| 225 |
return tokenizer.decode(tokens, true);
|
|
|
|
| 73 |
let cfg = null;
|
| 74 |
let tailLatency = 0;
|
| 75 |
|
| 76 |
+
// Decoding config
|
| 77 |
+
const TOKENS_PER_SECOND = 6.5; // Max tokens per second of audio; multiplied by the audio duration to cap the decode loop's token budget
|
| 78 |
+
const FRAME_DURATION_MS = 20; // Each encoder frame is 20ms; used to convert a frame count into seconds of audio
|
| 79 |
+
|
| 80 |
+
// Check for repetitive token patterns that indicate decoding should stop
|
| 81 |
+
/**
 * Default repetition rules, each read as "a group of `size` tokens occurring
 * `repeats` times back-to-back at the end of the sequence":
 *   - 5 identical tokens            (e.g. [A,A,A,A,A])
 *   - 3 repeated identical pairs    (e.g. [A,B,A,B,A,B])
 *   - 2 repeated identical triples  (e.g. [A,B,C,A,B,C])
 */
const DEFAULT_REPETITION_PATTERNS = Object.freeze([
  Object.freeze({ size: 1, repeats: 5 }),
  Object.freeze({ size: 2, repeats: 3 }),
  Object.freeze({ size: 3, repeats: 2 }),
]);

/**
 * Tests whether `tokens` ends with one group of `size` tokens repeated
 * `repeats` times in a row.
 *
 * @param {Array} tokens - Token sequence to inspect (not modified).
 * @param {number} size - Length of the repeating group; integer >= 1.
 * @param {number} repeats - Required consecutive occurrences; integer >= 2.
 * @returns {boolean} True when the trailing `size * repeats` tokens are
 *   periodic with period `size`. False when the sequence is too short or the
 *   rule is malformed.
 */
function endsWithRepeatedGroup(tokens, size, repeats) {
  // A malformed rule would otherwise match trivially (the loop below would run
  // zero times), which would silently stop decoding on every step.
  if (!Number.isInteger(size) || !Number.isInteger(repeats) || size < 1 || repeats < 2) {
    return false;
  }

  const span = size * repeats;
  const len = tokens.length;
  if (len < span) return false;

  // A tail is `repeats` copies of one group exactly when every token in it
  // equals the token `size` positions later. Comparing in place avoids
  // allocating slices on every decode step.
  for (let i = len - span; i < len - size; i++) {
    if (tokens[i] !== tokens[i + size]) return false;
  }
  return true;
}

/**
 * Checks for repetitive token patterns at the end of `tokens` that indicate
 * decoding should stop.
 *
 * With the default rules this returns true when the sequence ends in five
 * identical tokens, three identical consecutive pairs, or two identical
 * consecutive triples. Sequences shorter than every rule's span return false.
 *
 * @param {Array} tokens - Tokens produced so far (not modified).
 * @param {ReadonlyArray<{size: number, repeats: number}>} [patterns] -
 *   Optional override of the repetition rules; defaults to
 *   DEFAULT_REPETITION_PATTERNS.
 * @returns {boolean} True if any rule matches the tail of `tokens`.
 */
function hasRepetition(tokens, patterns = DEFAULT_REPETITION_PATTERNS) {
  return patterns.some(({ size, repeats }) => endsWithRepeatedGroup(tokens, size, repeats));
}
|
| 115 |
+
|
| 116 |
// Sessions
|
| 117 |
let adapterSession = null;
|
| 118 |
let decoderInitSession = null;
|
|
|
|
| 243 |
await initDecoderCache(context);
|
| 244 |
|
| 245 |
const numFrames = accumulatedFeatures.dims[1];
|
| 246 |
+
// Calculate duration in seconds and max tokens based on that
|
| 247 |
+
const durationSeconds = (numFrames * FRAME_DURATION_MS) / 1000;
|
| 248 |
+
const maxTokens = Math.max(10, Math.floor(durationSeconds * TOKENS_PER_SECOND));
|
| 249 |
|
| 250 |
const tokens = [1]; // BOS
|
| 251 |
for (let step = 0; step < maxTokens; step++) {
|
|
|
|
| 261 |
}
|
| 262 |
|
| 263 |
tokens.push(maxIdx);
|
| 264 |
+
|
| 265 |
+
// Stop on EOS
|
| 266 |
+
if (maxIdx === 2) break;
|
| 267 |
+
|
| 268 |
+
// Stop on repetitive patterns
|
| 269 |
+
if (hasRepetition(tokens)) {
|
| 270 |
+
console.log('Stopping decode due to repetition detected');
|
| 271 |
+
break;
|
| 272 |
+
}
|
| 273 |
}
|
| 274 |
|
| 275 |
return tokenizer.decode(tokens, true);
|
index.html
CHANGED
|
@@ -207,8 +207,8 @@
|
|
| 207 |
display: flex;
|
| 208 |
flex-direction: column;
|
| 209 |
justify-content: center;
|
| 210 |
-
align-items:
|
| 211 |
-
text-align:
|
| 212 |
backdrop-filter: blur(10px);
|
| 213 |
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 214 |
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
|
|
@@ -404,7 +404,7 @@
|
|
| 404 |
background: rgba(0, 0, 0, 0.9);
|
| 405 |
border-radius: 0 0 12px 12px;
|
| 406 |
min-height: 80px;
|
| 407 |
-
text-align:
|
| 408 |
backdrop-filter: blur(10px);
|
| 409 |
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.5);
|
| 410 |
}
|
|
|
|
| 207 |
display: flex;
|
| 208 |
flex-direction: column;
|
| 209 |
justify-content: center;
|
| 210 |
+
align-items: flex-start;
|
| 211 |
+
text-align: left;
|
| 212 |
backdrop-filter: blur(10px);
|
| 213 |
border: 1px solid rgba(255, 255, 255, 0.1);
|
| 214 |
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
|
|
|
|
| 404 |
background: rgba(0, 0, 0, 0.9);
|
| 405 |
border-radius: 0 0 12px 12px;
|
| 406 |
min-height: 80px;
|
| 407 |
+
text-align: left;
|
| 408 |
backdrop-filter: blur(10px);
|
| 409 |
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.5);
|
| 410 |
}
|