Update app.py

app.py CHANGED
@@ -62,7 +62,7 @@ class Usage(BaseModel):
     prompt_tokens: int
     completion_tokens: int
     total_tokens: int
-    prompt_tokens_details: Optional[PromptTokensDetails]
+    prompt_tokens_details: Optional[PromptTokensDetails] = None
 
 class ChatCompletionChoice(BaseModel):
     index: int
@@ -77,7 +77,7 @@ class ChatCompletionChunk(BaseModel):
     created: int = Field(...)
     model: str
     choices: List[ChatCompletionChoice]
-    usage: Optional[Usage]
+    usage: Optional[Usage] = None
 
 class ToolFunction(BaseModel):
     name: str
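Both `= None` additions fix the same pitfall, assuming Pydantic v2 here: `Optional[X]` only widens the annotation, it does not supply a default, so a `Usage` or `ChatCompletionChunk` built without these fields would raise a `ValidationError` (under v1 both spellings behaved the same). A minimal, self-contained sketch of the difference; the `PromptTokensDetails` body below is a placeholder assumption, not taken from app.py:

```python
from typing import Optional
from pydantic import BaseModel, ValidationError

class PromptTokensDetails(BaseModel):
    cached_tokens: int = 0  # placeholder field; not taken from app.py

class UsageBefore(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[PromptTokensDetails]  # still required in v2

class UsageAfter(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[PromptTokensDetails] = None  # truly optional

UsageAfter(prompt_tokens=1, completion_tokens=2, total_tokens=3)  # ok, defaults to None

try:
    UsageBefore(prompt_tokens=1, completion_tokens=2, total_tokens=3)
except ValidationError as err:
    print(err)  # 1 validation error: prompt_tokens_details is Field required
```

The same reasoning applies to `usage: Optional[Usage] = None`: OpenAI-style streaming responses only attach usage on the final chunk, so intermediate chunks must validate without it.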
@@ -493,7 +493,6 @@ async def chatResponseStream(request: ChatCompletionRequest, model_state: any, c
     clean_msg = cleanMessages(current_messages, enableReasoning)
     prompt = f"{clean_msg}\n\nAssistant:{' <think' if enableReasoning else ''}"
 
-    tool_buffer = ""
     tool_call_mode = False
 
     async with GPU_LOCK:
@@ -529,12 +528,9 @@ async def chatResponseStream(request: ChatCompletionRequest, model_state: any, c
     async with GPU_LOCK:
         try:
             tool_out, tool_tokens, tool_state = await runPrefill(request, "", [0], model_state)
-            temp_tokens = []
-
             current_gen = ""
 
             for i in range(200):
-                args = PIPELINE_ARGS(temperature=0.1, top_p=0.1)
                 tool_token = MODEL_STORAGE[request.model].pipeline.sample_logits(tool_out, temperature=0.1, top_p=0.1)
                 tool_out, tool_state = MODEL_STORAGE[request.model].model.forward([tool_token], tool_state)
 
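The last hunk is dead-code removal in the same spirit as the `tool_buffer` deletion above: `temp_tokens` was never read, and the `PIPELINE_ARGS` object was never used within this hunk, since `sample_logits` takes `temperature` and `top_p` directly. A hedged, self-contained sketch of the surviving loop shape; `pipeline` and `model` stand in for `MODEL_STORAGE[request.model]`, and the token accumulation and stop condition are illustrative assumptions, as they fall outside the lines shown:

```python
def generate_tool_call(pipeline, model, tool_out, tool_state, max_tokens=200):
    """Sketch of the decode loop left after the cleanup (not verbatim app.py)."""
    tokens = []
    for _ in range(max_tokens):  # hard cap on tool-call generation length
        # Near-greedy sampling (temperature=0.1, top_p=0.1) keeps the emitted
        # tool-call text close to deterministic; the knobs go straight into
        # sample_logits, so no separate args object is needed.
        tool_token = pipeline.sample_logits(tool_out, temperature=0.1, top_p=0.1)
        tokens.append(tool_token)
        # Feed the sampled token back through the model to advance its state.
        tool_out, tool_state = model.forward([tool_token], tool_state)
    return tokens, tool_state
```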