Update app.py
app.py CHANGED
@@ -462,14 +462,14 @@ class ModelWrapper:
         print(f"✅ Model loaded: {self.d_model}d × {self.n_layers}L × {self.n_heads}H")

     def generate_stream(self, prompt: str, max_new_tokens: int = 200,
-
+                        temperature: float = 0.8, top_k: int = 50, top_p: float = 0.9):
         """Generator that yields tokens one at a time for streaming"""
-        # Format prompt
-        if not prompt.startswith("
-            prompt = f"
+        # Format prompt in ChatML format
+        if not prompt.startswith("<|im_start|>"):
+            prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
         else:
-            if "
-                prompt = prompt + "
+            if "<|im_start|>assistant" not in prompt:
+                prompt = prompt + "<|im_start|>assistant\n"

         # Tokenize
         encoding = self.tokenizer.encode(prompt)
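For reference, the prompt-formatting branch added above wraps a bare user message in ChatML delimiters before tokenization. The sketch below restates that logic as a standalone function; the name format_chatml_prompt is illustrative and does not exist in app.py.

def format_chatml_prompt(prompt: str) -> str:
    # Plain user text: wrap it as a ChatML user turn and open an assistant turn.
    if not prompt.startswith("<|im_start|>"):
        return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    # Already ChatML-formatted: just make sure an assistant turn is open.
    if "<|im_start|>assistant" not in prompt:
        return prompt + "<|im_start|>assistant\n"
    return prompt

# Example: format_chatml_prompt("Hi") returns
# "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n"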
@@ -509,8 +509,11 @@ class ModelWrapper:
             # Decode the new token
             token_id = int(next_token[0, 0])

-            # Stop on EOS
-            if token_id
+            # Stop on EOS or end tokens
+            if token_id in [
+                self.tokenizer.token_to_id("<|endoftext|>"),
+                self.tokenizer.token_to_id("<|im_end|>")
+            ]:
                 break

             # Decode and yield the token
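One possible refinement of the stop check added above: the two special-token IDs could be resolved once, for example at the end of ModelWrapper.__init__, instead of being looked up on every generated token, and a None result (which the tokenizers library's token_to_id returns for tokens missing from the vocabulary) could be filtered out. A minimal sketch, assuming self.tokenizer is a tokenizers.Tokenizer and stop_token_ids is a new, hypothetical attribute:

# One-time setup, e.g. at the end of ModelWrapper.__init__:
self.stop_token_ids = {
    tid
    for tid in (
        self.tokenizer.token_to_id("<|endoftext|>"),
        self.tokenizer.token_to_id("<|im_end|>"),
    )
    if tid is not None  # token_to_id returns None for unknown tokens
}

# Then, inside the generation loop:
if token_id in self.stop_token_ids:
    break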
@@ -518,6 +521,7 @@ class ModelWrapper:
             response_text += token_text
             yield response_text

+
     def generate(self, prompt: str, max_new_tokens: int = 200,
                  temperature: float = 0.8, top_k: int = 50, top_p: float = 0.9):
         """Non-streaming generation (returns full response)"""
@@ -544,14 +548,15 @@ print(f"✅ Model downloaded to: {model_path}")
 # Load model
 model = ModelWrapper(model_path)

+
 def chat_fn(message, history, temperature, top_k, top_p, max_tokens):
-    # Build conversation context
+    # Build conversation context in ChatML format
     conversation = ""
     for user_msg, bot_msg in history:
-        conversation += f"
+        conversation += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{bot_msg}<|im_end|>\n"

     # Add current message
-    conversation += f"
+    conversation += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

     # Stream response token by token
     partial_response = ""
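The commit does not touch the UI wiring, but a chat_fn with this (message, history, temperature, top_k, top_p, max_tokens) signature and tuple-style history is the shape gr.ChatInterface expects when the sampling controls are passed as additional_inputs. A purely illustrative hook-up; the slider ranges and labels are assumptions, and newer Gradio versions may deliver history as message dicts rather than (user, bot) tuples:

import gradio as gr

demo = gr.ChatInterface(
    fn=chat_fn,  # a generator, so the chatbot streams partial_response as it grows
    additional_inputs=[
        gr.Slider(0.1, 2.0, value=0.8, label="Temperature"),
        gr.Slider(1, 200, value=50, step=1, label="Top-k"),
        gr.Slider(0.05, 1.0, value=0.9, label="Top-p"),
        gr.Slider(16, 1024, value=200, step=1, label="Max new tokens"),
    ],
)

demo.launch()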