sparkleman committed · Commit aeaf225 · Parent(s): 89db2d9

UPDATE: Change gpu state display
app.py CHANGED

@@ -1,4 +1,10 @@
 from config import CONFIG, ModelConfig
+from utils import (
+    cleanMessages,
+    parse_think_response,
+    remove_nested_think_tags_stack,
+    format_bytes,
+)
 
 import os, copy, types, gc, sys, re, time, collections, asyncio
 from huggingface_hub import hf_hub_download
@@ -28,6 +34,15 @@ if "cuda" in CONFIG.STRATEGY.lower():
     nvmlInit()
     gpu_h = nvmlDeviceGetHandleByIndex(0)
 
+
+def logGPUState():
+    if "cuda" in CONFIG.STRATEGY:
+        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+        logger.info(
+            f"[STATUS] vram {format_bytes(gpu_info.total)} used {format_bytes(gpu_info.used)} free {format_bytes(gpu_info.free)}"
+        )
+
+
 torch.backends.cudnn.benchmark = True
 torch.backends.cudnn.allow_tf32 = True
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -56,7 +71,6 @@ from api_types import (
     ChatCompletionChoice,
     ChatCompletionMessage,
 )
-from utils import cleanMessages, parse_think_response, remove_nested_think_tags_stack
 
 
 class ModelStorage:
@@ -72,6 +86,8 @@ DEFAULT_REASONING_MODEL_NAME = None
 
 logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
 
+logGPUState()
+
 for model_config in CONFIG.MODELS:
     logger.info(f"Load Model - {model_config.SERVICE_NAME}")
 
@@ -109,6 +125,7 @@ for model_config in CONFIG.MODELS:
     MODEL_STORAGE[model_config.SERVICE_NAME].MODEL_CONFIG = model_config
     MODEL_STORAGE[model_config.SERVICE_NAME].model = tmp_model
     MODEL_STORAGE[model_config.SERVICE_NAME].pipeline = tmp_pipeline
+    logGPUState()
 
 
 logger.info(f"Load Model - DEFALUT_MODEL_NAME is `{DEFALUT_MODEL_NAME}`")
@@ -643,11 +660,7 @@ async def chat_completions(request: ChatCompletionRequest):
         raise f"Can not find `{modelName}`"
 
     async def chatResponseStreamDisconnect():
-
-        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
-        logger.info(
-            f"[STATUS] vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}"
-        )
+        logGPUState()
 
     model_state = None
     request_dict = request.model_dump()
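Standing alone, the new logging path amounts to the sketch below. It assumes the pynvml package (the source of the NVML calls in app.py), a visible NVIDIA GPU, and print standing in for the app's logger; the CONFIG.STRATEGY guard is omitted.

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from utils import format_bytes  # helper added by this commit

nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)  # first GPU, matching app.py

def logGPUState():
    # Mirrors the new helper in app.py, minus the CONFIG.STRATEGY check.
    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    print(
        f"[STATUS] vram {format_bytes(gpu_info.total)} "
        f"used {format_bytes(gpu_info.used)} "
        f"free {format_bytes(gpu_info.free)}"
    )

logGPUState()  # e.g. "[STATUS] vram 24.0000GB used 1.2345GB free 22.7655GB" (illustrative numbers)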
utils.py CHANGED

@@ -58,3 +58,13 @@ def remove_nested_think_tags_stack(text):
         else:
             i += 1
     return result
+
+
+def format_bytes(size):
+    power = 2**10
+    n = 0
+    power_labels = {0: "", 1: "K", 2: "M", 3: "G", 4: "T"}
+    while size > power:
+        size /= power
+        n += 1
+    return f"{size:.4f}{power_labels[n]+'B'}"
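Assuming utils.py is importable, a quick sanity check of the new helper (expected strings worked out by hand from the loop above; note that values of 1024 or below stay in plain bytes, since the loop only divides while size > power):

from utils import format_bytes

assert format_bytes(1000) == "1000.0000B"       # 1000 <= 1024, no unit scaling
assert format_bytes(1536) == "1.5000KB"         # 1536 / 1024
assert format_bytes(24 * 2**30) == "24.0000GB"  # e.g. a 24 GiB card's total vram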