Spaces:
Paused
Paused
Helw150
committed on
Commit
·
3268a02
1
Parent(s):
5ded772
Multi turn
Browse files
app.py
CHANGED
|
@@ -26,7 +26,7 @@ resampler = Audio(sampling_rate=16_000)
|
|
| 26 |
|
| 27 |
@spaces.GPU
|
| 28 |
@torch.no_grad
|
| 29 |
-
def diva_audio(audio_input, do_sample=False, temperature=0.001):
|
| 30 |
sr, y = audio_input
|
| 31 |
x = xxhash.xxh32(bytes(y)).hexdigest()
|
| 32 |
y = y.astype(np.float32)
|
|
@@ -35,7 +35,12 @@ def diva_audio(audio_input, do_sample=False, temperature=0.001):
|
|
| 35 |
resampler.encode_example({"array": y, "sampling_rate": sr})
|
| 36 |
)
|
| 37 |
yield from diva_model.generate_stream(
|
| 38 |
-
a["array"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
)
|
| 40 |
|
| 41 |
|
|
@@ -70,7 +75,7 @@ def run_vad(ori_audio, sr):
|
|
| 70 |
|
| 71 |
|
| 72 |
def warm_up():
|
| 73 |
-
frames =
|
| 74 |
dur, frames, tcost = run_vad(frames, 16000)
|
| 75 |
print(f"warm up done, time_cost: {tcost:.3f} s")
|
| 76 |
|
|
@@ -86,6 +91,7 @@ class AppState:
|
|
| 86 |
started_talking: bool = False
|
| 87 |
stopped: bool = False
|
| 88 |
conversation: list = field(default_factory=list)
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
|
|
@@ -134,7 +140,9 @@ def response(state: AppState):
|
|
| 134 |
)
|
| 135 |
|
| 136 |
start = False
|
| 137 |
-
for resp in diva_audio(
|
|
|
|
|
|
|
| 138 |
if not start:
|
| 139 |
state.conversation.append({"role": "assistant", "content": resp})
|
| 140 |
start = True
|
|
@@ -142,7 +150,7 @@ def response(state: AppState):
|
|
| 142 |
state.conversation[-1]["content"] = resp
|
| 143 |
yield state, state.conversation
|
| 144 |
|
| 145 |
-
yield AppState(conversation=state.conversation), state.conversation
|
| 146 |
|
| 147 |
|
| 148 |
def start_recording_user(state: AppState):
|
|
|
|
| 26 |
|
| 27 |
@spaces.GPU
|
| 28 |
@torch.no_grad
|
| 29 |
+
def diva_audio(audio_input, do_sample=False, temperature=0.001, prev_outs=None):
|
| 30 |
sr, y = audio_input
|
| 31 |
x = xxhash.xxh32(bytes(y)).hexdigest()
|
| 32 |
y = y.astype(np.float32)
|
|
|
|
| 35 |
resampler.encode_example({"array": y, "sampling_rate": sr})
|
| 36 |
)
|
| 37 |
yield from diva_model.generate_stream(
|
| 38 |
+
a["array"],
|
| 39 |
+
None,
|
| 40 |
+
do_sample=do_sample,
|
| 41 |
+
max_new_tokens=256,
|
| 42 |
+
init_outputs=prev_outs,
|
| 43 |
+
return_outputs=True,
|
| 44 |
)
|
| 45 |
|
| 46 |
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
def warm_up():
|
| 78 |
+
frames = np.ones(2048) # 1024 frames of 2 bytes each
|
| 79 |
dur, frames, tcost = run_vad(frames, 16000)
|
| 80 |
print(f"warm up done, time_cost: {tcost:.3f} s")
|
| 81 |
|
|
|
|
| 91 |
started_talking: bool = False
|
| 92 |
stopped: bool = False
|
| 93 |
conversation: list = field(default_factory=list)
|
| 94 |
+
model_outs: any = None
|
| 95 |
|
| 96 |
|
| 97 |
def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
|
|
|
|
| 140 |
)
|
| 141 |
|
| 142 |
start = False
|
| 143 |
+
for resp, outs in diva_audio(
|
| 144 |
+
(state.sampling_rate, state.stream), prev_outs=state.model_outs
|
| 145 |
+
):
|
| 146 |
if not start:
|
| 147 |
state.conversation.append({"role": "assistant", "content": resp})
|
| 148 |
start = True
|
|
|
|
| 150 |
state.conversation[-1]["content"] = resp
|
| 151 |
yield state, state.conversation
|
| 152 |
|
| 153 |
+
yield AppState(conversation=state.conversation, model_outs=outs), state.conversation
|
| 154 |
|
| 155 |
|
| 156 |
def start_recording_user(state: AppState):
|