feat(optim): load the model and tokenizer outside of the spaces wrapped method
On one side we lose the lazy init, but we benefit from ZeroGPU's tensor packing, so the model has a smaller memory footprint when idle. Besides, this way callers do not consume their GPU quota just to load the model: it is already downloaded, loaded in memory, and prepared for serving.
app.py
CHANGED
@@ -54,13 +54,14 @@ def _history_to_messages(history: List[Tuple[str, str]]) -> List[Dict[str, str]]
         msgs.append({"role": "assistant", "content": bot_msg})
     return msgs
 
+_ensure_loaded()
+
 @spaces.GPU(duration=120)
 def generate_stream(message: str, history: List[Tuple[str, str]]):
     """
     Minimal streaming chat function for gr.ChatInterface.
     Uses instruct chat template. No token UI. No extra controls.
     """
-    _ensure_loaded()
 
     messages = _history_to_messages(history) + [{"role": "user", "content": message}]
     inputs = _tokenizer.apply_chat_template(
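
For context, a minimal sketch of the pattern this commit moves to: loading the model and tokenizer at import time, which is what the module-level _ensure_loaded() call achieves. The body of _ensure_loaded(), the model id, and the dtype below are assumptions for illustration; they are not part of this diff.

# Sketch only: load the model and tokenizer at import time, outside the
# @spaces.GPU-wrapped function. On ZeroGPU the packed tensors sit in CPU
# memory while the Space is idle and are moved to the GPU only when a
# decorated function runs, so callers do not spend GPU quota on loading.
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

_MODEL_ID = "org/instruct-model"  # placeholder, not the Space's actual checkpoint

_tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
_model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, torch_dtype=torch.bfloat16)
_model.to("cuda")  # intercepted by ZeroGPU; no GPU is actually held while idle

@spaces.GPU(duration=120)
def generate_stream(message, history):
    # GPU quota is consumed here, for generation only, not for model loading.
    ...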