Update app.py
app.py (CHANGED)
@@ -10,20 +10,24 @@ from pydantic import BaseModel, Field
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 
-# ---------------- Config
+# ---------------- Config ----------------
 REPO_ID = os.getenv("REPO_ID", "bartowski/Llama-3.2-3B-Instruct-GGUF")
 FILENAME = os.getenv("FILENAME", "Llama-3.2-3B-Instruct-Q4_K_M.gguf")
 
-#
-
+# BUILD-TIME PREFETCH LOCATION (your Dockerfile downloads here)
+BUILD_CACHE_DIR = "/app/models"
+BUILD_MODEL_PATH = os.path.join(BUILD_CACHE_DIR, FILENAME)
 
-#
-
-
+# Preferred runtime cache (only used if model not found above)
+PREFERRED_CACHE_DIR = os.getenv("CACHE_DIR", "/app/models")
+
+# Inference knobs (conservative for small CPU Spaces)
+N_THREADS = min(4, (os.cpu_count() or 2))
+N_BATCH = int(os.getenv("N_BATCH", "16"))  # safer than 32/64 on tiny CPUs
 N_CTX = int(os.getenv("N_CTX", "2048"))
 
-#
-MAX_TOKENS = int(os.getenv("MAX_TOKENS", "
+# Sampling (keep short for latency)
+MAX_TOKENS = int(os.getenv("MAX_TOKENS", "48"))  # tighter → faster
 TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
 TOP_P = float(os.getenv("TOP_P", "0.9"))
 STOP_TOKENS = os.getenv("STOP_TOKENS", "</s>,<|eot_id|>").split(",")
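The new BUILD_MODEL_PATH fast path only pays off if the GGUF file is already sitting in /app/models when the container starts, which the comment attributes to a Dockerfile download step. A minimal prefetch script of the kind that step could invoke might look like the sketch below; the script name and the use of local_dir are assumptions, not part of this commit.

# download_model.py -- hypothetical build-time prefetch (e.g. RUN python download_model.py in the Dockerfile)
import os
from huggingface_hub import hf_hub_download

REPO_ID = os.getenv("REPO_ID", "bartowski/Llama-3.2-3B-Instruct-GGUF")
FILENAME = os.getenv("FILENAME", "Llama-3.2-3B-Instruct-Q4_K_M.gguf")

path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
    local_dir="/app/models",       # matches BUILD_CACHE_DIR in app.py
    local_dir_use_symlinks=False,  # store a real file rather than a symlink into the HF cache
)
print(f"Prefetched model to {path}")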
@@ -31,17 +35,14 @@ STOP_TOKENS = os.getenv("STOP_TOKENS", "</s>,<|eot_id|>").split(",")
 SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "").strip()
 CTX_SAFETY = int(os.getenv("CTX_SAFETY", "128"))
 
-# ---------------- App
+# ---------------- App ----------------
 app = FastAPI(title="Llama 3.2 3B Instruct (llama.cpp) API - Prompt Only")
 _model: Optional[Llama] = None
 _model_lock = threading.Lock()
+_effective_model_path: Optional[str] = None
 _effective_cache_dir: Optional[str] = None
 
 def _select_writable_cache_dir(preferred: str) -> str:
-    """
-    Pick the first writable directory from a list of candidates.
-    Tries to mkdir and write a tiny file to confirm writability.
-    """
     candidates = [
         preferred,
         os.path.join(os.path.expanduser("~"), ".cache", "hf_models"),
@@ -50,24 +51,26 @@ def _select_writable_cache_dir(preferred: str) -> str:
     for d in candidates:
         try:
             os.makedirs(d, exist_ok=True)
-
-            with open(
+            test_file = os.path.join(d, ".w")
+            with open(test_file, "w") as f:
                 f.write("ok")
-            os.remove(
+            os.remove(test_file)
             return d
         except Exception:
             continue
-    raise RuntimeError("No writable cache directory found
+    raise RuntimeError("No writable cache directory found")
 
-
-
-
-
-
+def _resolve_model_path() -> str:
+    """
+    1) If the model file exists at build path (/app/models/...), use it (fast path).
+    2) Else, download into first writable cache dir and return that path.
+    """
+    global _effective_cache_dir
+    if os.path.isfile(BUILD_MODEL_PATH):
+        return BUILD_MODEL_PATH
 
-    # pick a writable cache dir (handles /data permission issues)
     if _effective_cache_dir is None:
-        _effective_cache_dir = _select_writable_cache_dir(
+        _effective_cache_dir = _select_writable_cache_dir(PREFERRED_CACHE_DIR)
 
     local_path = hf_hub_download(
         repo_id=REPO_ID,
@@ -75,21 +78,33 @@ def get_model() -> Llama:
         cache_dir=_effective_cache_dir,
         local_dir_use_symlinks=False,
     )
+    return local_path
 
+# ---------------- Model loader ----------------
+def get_model() -> Llama:
+    global _model, _effective_model_path
+    if _model is not None:
+        return _model
+
+    # Resolve path without failing on /data permission
+    _effective_model_path = _resolve_model_path()
+
+    # llama.cpp init (CPU-friendly)
     _model = Llama(
-        model_path=
+        model_path=_effective_model_path,
         chat_format="llama-3",
        n_ctx=N_CTX,
         n_threads=N_THREADS,
         n_batch=N_BATCH,
+        use_mmap=True,    # faster load on CPU
+        n_gpu_layers=0,   # ensure pure CPU
         verbose=False,
     )
     return _model
 
 @app.on_event("startup")
 def _warm_start():
-    #
-    get_model()
+    get_model()  # force load at startup so first request is predictable
 
 # ---------------- Schemas ----------------
 class GenerateRequest(BaseModel):
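Most of the settings used above (N_BATCH, N_CTX, MAX_TOKENS, CACHE_DIR) come from environment variables, so they can be tuned without editing app.py; N_THREADS is the exception, being derived from os.cpu_count(). A local launch sketch under that assumption follows; the port and the example values are illustrative, not recommendations from this commit.

# run_local.py -- hypothetical local launcher that overrides the env-driven knobs before app.py is imported
import os

os.environ.setdefault("N_BATCH", "32")      # larger batch on a roomier CPU
os.environ.setdefault("MAX_TOKENS", "128")  # allow longer answers at the cost of latency
os.environ.setdefault("CACHE_DIR", "/tmp/models")

import uvicorn

# uvicorn imports "app:app" lazily, so the overrides above are visible to app.py
uvicorn.run("app:app", host="0.0.0.0", port=7860)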
@@ -116,28 +131,17 @@ def _fit_prompt_to_context(model: Llama, prompt: str) -> str:
 def health():
     try:
         _ = get_model()
-        return {
+        return {
+            "ok": True,
+            "model_path": _effective_model_path,
+            "cache_dir": _effective_cache_dir,
+            "n_threads": N_THREADS,
+            "n_batch": N_BATCH,
+            "n_ctx": N_CTX
+        }
     except Exception as e:
         return {"ok": False, "error": str(e)}
 
-@app.get("/config")
-def config():
-    return {
-        "repo_id": REPO_ID,
-        "filename": FILENAME,
-        "preferred_cache_dir": CACHE_DIR,
-        "effective_cache_dir": _effective_cache_dir,
-        "n_threads": N_THREADS,
-        "n_batch": N_BATCH,
-        "n_ctx": N_CTX,
-        "max_tokens": MAX_TOKENS,
-        "temperature": TEMPERATURE,
-        "top_p": TOP_P,
-        "stop_tokens": STOP_TOKENS,
-        "ctx_safety": CTX_SAFETY,
-        "has_system_prompt": bool(SYSTEM_PROMPT),
-    }
-
 @app.get("/warmup")
 def warmup():
     model = get_model()
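A quick smoke test of the reshaped /health payload and the existing /warmup route could look like the following; the base URL and the use of the requests library are assumptions.

# smoke_test.py -- hypothetical client check against a running instance of this Space
import requests

BASE = "http://localhost:7860"  # assumed local URL; substitute the Space's endpoint

health = requests.get(f"{BASE}/health", timeout=120).json()
print(health.get("ok"), health.get("model_path"), health.get("cache_dir"))

# /warmup loads the model (via get_model) so the first real request is not cold
print(requests.get(f"{BASE}/warmup", timeout=600).json())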
@@ -159,7 +163,7 @@ def warmup():
 def generate(req: GenerateRequest):
     """
     Non-streaming chat completion.
-    Accepts ONLY a prompt string; all other params are fixed
+    Accepts ONLY a prompt string; all other params are fixed here.
     """
     try:
         if not req.prompt or not req.prompt.strip():
@@ -203,6 +207,7 @@ def generate(req: GenerateRequest):
             "n_threads": N_THREADS,
        },
         "prompt_truncated": (fitted_prompt != user_prompt),
+        "effective_model_path": _effective_model_path,
         "effective_cache_dir": _effective_cache_dir,
     })
 
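And a matching call for the prompt-only endpoint, which now reports effective_model_path next to effective_cache_dir in its response. The route name and POST verb are assumed from the handler shown in the diff, not confirmed by it.

# generate_demo.py -- hypothetical client for the prompt-only generation endpoint
import requests

BASE = "http://localhost:7860"  # assumed local URL

resp = requests.post(
    f"{BASE}/generate",
    json={"prompt": "Explain what a GGUF file is in one sentence."},
    timeout=600,
)
data = resp.json()

# Fields visible in this diff's response payload
print(data.get("prompt_truncated"))
print(data.get("effective_model_path"))
print(data.get("effective_cache_dir"))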