sparkleman committed · Commit aeaf225 · Parent(s): 89db2d9

UPDATE: Change gpu state display
app.py CHANGED

@@ -1,4 +1,10 @@
 from config import CONFIG, ModelConfig
+from utils import (
+    cleanMessages,
+    parse_think_response,
+    remove_nested_think_tags_stack,
+    format_bytes,
+)
 
 import os, copy, types, gc, sys, re, time, collections, asyncio
 from huggingface_hub import hf_hub_download
@@ -28,6 +34,15 @@ if "cuda" in CONFIG.STRATEGY.lower():
     nvmlInit()
     gpu_h = nvmlDeviceGetHandleByIndex(0)
 
+
+def logGPUState():
+    if "cuda" in CONFIG.STRATEGY:
+        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+        logger.info(
+            f"[STATUS] vram {format_bytes(gpu_info.total)} used {format_bytes(gpu_info.used)} free {format_bytes(gpu_info.free)}"
+        )
+
+
 torch.backends.cudnn.benchmark = True
 torch.backends.cudnn.allow_tf32 = True
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -56,7 +71,6 @@ from api_types import (
     ChatCompletionChoice,
     ChatCompletionMessage,
 )
-from utils import cleanMessages, parse_think_response, remove_nested_think_tags_stack
 
 
 class ModelStorage:
@@ -72,6 +86,8 @@ DEFAULT_REASONING_MODEL_NAME = None
 
 logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
 
+logGPUState()
+
 for model_config in CONFIG.MODELS:
     logger.info(f"Load Model - {model_config.SERVICE_NAME}")
 
@@ -109,6 +125,7 @@ for model_config in CONFIG.MODELS:
     MODEL_STORAGE[model_config.SERVICE_NAME].MODEL_CONFIG = model_config
     MODEL_STORAGE[model_config.SERVICE_NAME].model = tmp_model
     MODEL_STORAGE[model_config.SERVICE_NAME].pipeline = tmp_pipeline
+    logGPUState()
 
 
 logger.info(f"Load Model - DEFALUT_MODEL_NAME is `{DEFALUT_MODEL_NAME}`")
@@ -643,11 +660,7 @@ async def chat_completions(request: ChatCompletionRequest):
         raise f"Can not find `{modelName}`"
 
     async def chatResponseStreamDisconnect():
-
-        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
-        logger.info(
-            f"[STATUS] vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}"
-        )
+        logGPUState()
 
     model_state = None
     request_dict = request.model_dump()
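Standing alone, the new logging path amounts to the sketch below. It assumes the pynvml package (the source of the NVML calls in app.py), a visible NVIDIA GPU, and print standing in for the app's logger; the CONFIG.STRATEGY guard is omitted.

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from utils import format_bytes  # helper added by this commit

nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)  # first GPU, matching app.py

def logGPUState():
    # Mirrors the new helper in app.py, minus the CONFIG.STRATEGY check.
    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    print(
        f"[STATUS] vram {format_bytes(gpu_info.total)} "
        f"used {format_bytes(gpu_info.used)} "
        f"free {format_bytes(gpu_info.free)}"
    )

logGPUState()  # e.g. "[STATUS] vram 24.0000GB used 1.2345GB free 22.7655GB" (illustrative numbers)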
utils.py CHANGED

@@ -58,3 +58,13 @@ def remove_nested_think_tags_stack(text):
         else:
             i += 1
     return result
+
+
+def format_bytes(size):
+    power = 2**10
+    n = 0
+    power_labels = {0: "", 1: "K", 2: "M", 3: "G", 4: "T"}
+    while size > power:
+        size /= power
+        n += 1
+    return f"{size:.4f}{power_labels[n]+'B'}"
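Assuming utils.py is importable, a quick sanity check of the new helper (expected strings worked out by hand from the loop above; note that values of 1024 or below stay in plain bytes, since the loop only divides while size > power):

from utils import format_bytes

assert format_bytes(1000) == "1000.0000B"       # 1000 <= 1024, no unit scaling
assert format_bytes(1536) == "1.5000KB"         # 1536 / 1024
assert format_bytes(24 * 2**30) == "24.0000GB"  # e.g. a 24 GiB card's total vram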