Spaces:

UsefulSensors
/

moonshine-streaming-demo

Running

App Files Files Community

Manjunath Kudlur committed 6 days ago

Commit

7e3cf0f

1 Parent(s): 252a98f

Set minimum segment duration to 2s

Browse files

Files changed (1) hide show

streaming_asr.js +12 -3

streaming_asr.js CHANGED Viewed

@@ -16,8 +16,10 @@ const VAD_CHUNK_SAMPLES = 160;  // 10ms - optimal for TenVAD
 const ASR_CHUNK_SAMPLES = 320;  // 20ms - Moonshine frame size
 const ENCODER_BATCH_SAMPLES = 5120;  // 320ms - batch size for encoder
-const PRE_BUFFER_CHUNKS = 15;  // ~300ms at 20ms chunks
-const POST_BUFFER_CHUNKS = 3;  // ~60ms at 20ms chunks
 const MODEL_CONFIGS = {
     sleeker: {
@@ -210,6 +212,7 @@ class PipelinedStreamingASR {
         this.emaProb = 0;
         this.onsetCounter = 0;
         this.offsetCounter = 0;
         // Buffers
         this.vadBuffer = [];
@@ -569,9 +572,14 @@ class PipelinedStreamingASR {
                 this.onsetCounter = 0;
             }
         } else if (this.state === 'speech') {
             if (this.emaProb < this.offsetThreshold) {
                 this.offsetCounter++;
-                if (this.offsetCounter >= 3) {
                     this.endSegment();
                 }
             } else {
@@ -585,6 +593,7 @@ class PipelinedStreamingASR {
         this.state = 'speech';
         this.onsetCounter = 0;
         this.offsetCounter = 0;
         this.encoderBatchBuffer = [];  // Reset batch buffer for new segment
         // Tell encoder to start new segment

 const ASR_CHUNK_SAMPLES = 320;  // 20ms - Moonshine frame size
 const ENCODER_BATCH_SAMPLES = 5120;  // 320ms - batch size for encoder
+const PRE_BUFFER_CHUNKS = 25;  // ~500ms at 20ms chunks - capture more audio before onset
+const POST_BUFFER_CHUNKS = 5;  // ~100ms at 20ms chunks
+const MIN_SEGMENT_DURATION_MS = 2000;  // Minimum 2 seconds before allowing segment end
+const OFFSET_CHUNKS_REQUIRED = 10;  // ~100ms of silence needed to end segment
 const MODEL_CONFIGS = {
     sleeker: {
         this.emaProb = 0;
         this.onsetCounter = 0;
         this.offsetCounter = 0;
+        this.segmentStartTime = 0;
         // Buffers
         this.vadBuffer = [];
                 this.onsetCounter = 0;
             }
         } else if (this.state === 'speech') {
+            // Check if minimum segment duration has passed
+            const segmentDuration = Date.now() - this.segmentStartTime;
+            const minDurationMet = segmentDuration >= MIN_SEGMENT_DURATION_MS;
             if (this.emaProb < this.offsetThreshold) {
                 this.offsetCounter++;
+                // Only end segment if minimum duration met AND enough silence chunks
+                if (minDurationMet && this.offsetCounter >= OFFSET_CHUNKS_REQUIRED) {
                     this.endSegment();
                 }
             } else {
         this.state = 'speech';
         this.onsetCounter = 0;
         this.offsetCounter = 0;
+        this.segmentStartTime = Date.now();
         this.encoderBatchBuffer = [];  // Reset batch buffer for new segment
         // Tell encoder to start new segment