omaryasserhassan committed
Commit 730089e · verified · 1 Parent(s): 5f65aa8

Update app.py

Files changed (1)
  1. app.py +32 -20
app.py CHANGED
@@ -7,14 +7,20 @@ from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 
 # ---------- Minimal fixed config (fast on CPU) ----------
-REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"  # <-- 1B model (faster)
+REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"  # 1B = much faster on CPU
 FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
-CACHE_DIR = "/app/models"  # already prefetched in your build
+
+# Build-time prefetch location (Dockerfile step put model here)
+BUILD_DIR = "/app/models"
+MODEL_PATH = os.path.join(BUILD_DIR, FILENAME)
+
+# Writable runtime cache if the prebuilt file isn't present
+RUNTIME_CACHE = "/tmp/hf_cache"
 
 N_THREADS = min(4, os.cpu_count() or 2)
-N_BATCH = 8  # smaller = less thrash
+N_BATCH = 8
 N_CTX = 2048
-MAX_TOKENS = 16  # short, fast replies
+MAX_TOKENS = 16
 
 TEMPERATURE = 0.7
 TOP_P = 0.9
@@ -23,36 +29,44 @@ STOP = ["</s>", "<|eot_id|>"]
 # ---------- App ----------
 app = FastAPI(title="Simple Llama Server (1B fast)")
 model = None
+effective_model_path = None
 
 class PromptRequest(BaseModel):
     prompt: str
 
 @app.on_event("startup")
 def load_model():
-    global model
-    os.makedirs(CACHE_DIR, exist_ok=True)
-    local_path = hf_hub_download(
-        repo_id=REPO_ID,
-        filename=FILENAME,
-        cache_dir=CACHE_DIR,
-        local_dir_use_symlinks=False,
-    )
+    global model, effective_model_path
+
+    # 1) If the model exists from the Docker build, use it directly (no writes)
+    if os.path.isfile(MODEL_PATH):
+        effective_model_path = MODEL_PATH
+    else:
+        # 2) Otherwise, download to a writable temp cache (NOT under /app)
+        os.makedirs(RUNTIME_CACHE, exist_ok=True)
+        effective_model_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=FILENAME,
+            cache_dir=RUNTIME_CACHE,
+            local_dir_use_symlinks=False,
+        )
+
     t0 = time.time()
     model = Llama(
-        model_path=local_path,
+        model_path=effective_model_path,
         chat_format="llama-3",
         n_ctx=N_CTX,
         n_threads=N_THREADS,
         n_batch=N_BATCH,
-        use_mmap=True,
-        n_gpu_layers=0,
+        use_mmap=True,     # faster load
+        n_gpu_layers=0,    # CPU only
         verbose=False,
     )
-    print(f"[startup] model loaded in {time.time()-t0:.2f}s from {local_path}")
+    print(f"[startup] loaded {effective_model_path} in {time.time()-t0:.2f}s")
 
 @app.get("/health")
 def health():
-    return {"ok": model is not None}
+    return {"ok": model is not None, "model_path": effective_model_path}
 
 @app.post("/generate")
 def generate(req: PromptRequest):
@@ -71,6 +85,4 @@ def generate(req: PromptRequest):
         stop=STOP,
     )
     text = out["choices"][0]["message"]["content"]
-    dt = time.time() - t0
-    print(f"[infer] tokens={MAX_TOKENS} took {dt:.2f}s, prompt_len_chars={len(prompt)}")
-    return JSONResponse({"response": text, "timing_sec": round(dt, 2)})
+    return JSONResponse({"response": text, "timing_sec": round(time.time()-t0, 2)})
 
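The last hunk only shows the tail of generate(); the chat-completion call itself is unchanged and therefore hidden as context. A minimal sketch of what the full handler plausibly looks like, assuming a single-turn user message (the exact prompt construction is not shown in this diff):

@app.post("/generate")
def generate(req: PromptRequest):
    prompt = req.prompt
    t0 = time.time()
    # Assumed single-turn chat; the real handler may add a system message.
    out = model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        stop=STOP,
    )
    text = out["choices"][0]["message"]["content"]
    return JSONResponse({"response": text, "timing_sec": round(time.time() - t0, 2)})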
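The new config only checks whether /app/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf already exists; the Docker build step that puts it there is not part of this commit. A sketch of a build-time prefetch script the Dockerfile could run (the script name and the RUN step are assumptions):

# prefetch_model.py, a hypothetical helper invoked at build time, e.g. RUN python prefetch_model.py
import os
from huggingface_hub import hf_hub_download

REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
BUILD_DIR = "/app/models"

os.makedirs(BUILD_DIR, exist_ok=True)
# local_dir places the file directly at /app/models/<FILENAME>,
# which is exactly what os.path.isfile(MODEL_PATH) expects at startup.
path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=BUILD_DIR)
print(f"prefetched {path}")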
 
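With the container running, the two endpoints can be exercised as below; the port is an assumption, so match it to whatever uvicorn or the Dockerfile actually binds. Note that MAX_TOKENS = 16 keeps replies deliberately short, so responses will be clipped.

import requests

BASE = "http://localhost:7860"  # assumed port; adjust to your run command

print(requests.get(f"{BASE}/health").json())
# if the prebuilt file was found:
# {'ok': True, 'model_path': '/app/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf'}

r = requests.post(f"{BASE}/generate", json={"prompt": "Say hello in one short sentence."})
print(r.json())  # {'response': '...', 'timing_sec': ...}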