Spaces:

WillHeld
/

diva-audio-chat

Paused

App Files Files Community

Helw150 committed on Oct 11, 2024

Commit

3268a02

1 Parent(s): 5ded772

Multi turn

Browse files

Files changed (1) hide show

app.py +13 -5

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ resampler = Audio(sampling_rate=16_000)
 @spaces.GPU
 @torch.no_grad
-def diva_audio(audio_input, do_sample=False, temperature=0.001):
     sr, y = audio_input
     x = xxhash.xxh32(bytes(y)).hexdigest()
     y = y.astype(np.float32)
@@ -35,7 +35,12 @@ def diva_audio(audio_input, do_sample=False, temperature=0.001):
         resampler.encode_example({"array": y, "sampling_rate": sr})
     )
     yield from diva_model.generate_stream(
-        a["array"], None, do_sample=do_sample, max_new_tokens=256
     )
@@ -70,7 +75,7 @@ def run_vad(ori_audio, sr):
 def warm_up():
-    frames = b"\x00\x00" * 1024 * 2  # 1024 frames of 2 bytes each
     dur, frames, tcost = run_vad(frames, 16000)
     print(f"warm up done, time_cost: {tcost:.3f} s")
@@ -86,6 +91,7 @@ class AppState:
     started_talking: bool = False
     stopped: bool = False
     conversation: list = field(default_factory=list)
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
@@ -134,7 +140,9 @@ def response(state: AppState):
     )
     start = False
-    for resp in diva_audio((state.sampling_rate, state.stream)):
         if not start:
             state.conversation.append({"role": "assistant", "content": resp})
             start = True
@@ -142,7 +150,7 @@ def response(state: AppState):
             state.conversation[-1]["content"] = resp
         yield state, state.conversation
-    yield AppState(conversation=state.conversation), state.conversation
 def start_recording_user(state: AppState):

 @spaces.GPU
 @torch.no_grad
+def diva_audio(audio_input, do_sample=False, temperature=0.001, prev_outs=None):
     sr, y = audio_input
     x = xxhash.xxh32(bytes(y)).hexdigest()
     y = y.astype(np.float32)
         resampler.encode_example({"array": y, "sampling_rate": sr})
     )
     yield from diva_model.generate_stream(
+        a["array"],
+        None,
+        do_sample=do_sample,
+        max_new_tokens=256,
+        init_outputs=prev_outs,
+        return_outputs=True,
     )
 def warm_up():
+    frames = np.ones(2048)  # 2048 dummy samples (array of ones) used only to warm up the VAD
     dur, frames, tcost = run_vad(frames, 16000)
     print(f"warm up done, time_cost: {tcost:.3f} s")
     started_talking: bool = False
     stopped: bool = False
     conversation: list = field(default_factory=list)
+    model_outs: any = None
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     )
     start = False
+    for resp, outs in diva_audio(
+        (state.sampling_rate, state.stream), prev_outs=state.model_outs
+    ):
         if not start:
             state.conversation.append({"role": "assistant", "content": resp})
             start = True
             state.conversation[-1]["content"] = resp
         yield state, state.conversation
+    yield AppState(conversation=state.conversation, model_outs=outs), state.conversation
 def start_recording_user(state: AppState):