omaryasserhassan committed
Commit 490925d · verified · 1 parent: 77fd55f

Update app.py

Files changed (1)
  1. app.py +35 -191
app.py CHANGED
@@ -1,217 +1,61 @@
  # app.py
  import os
- import time
- import threading
- from typing import Optional
-
  from fastapi import FastAPI, HTTPException
  from fastapi.responses import JSONResponse
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel
  from huggingface_hub import hf_hub_download
  from llama_cpp import Llama

  # ---------------- Config ----------------
- REPO_ID = os.getenv("REPO_ID", "bartowski/Llama-3.2-3B-Instruct-GGUF")
- FILENAME = os.getenv("FILENAME", "Llama-3.2-3B-Instruct-Q4_K_M.gguf")
-
- # BUILD-TIME PREFETCH LOCATION (your Dockerfile downloads here)
- BUILD_CACHE_DIR = "/app/models"
- BUILD_MODEL_PATH = os.path.join(BUILD_CACHE_DIR, FILENAME)
-
- # Preferred runtime cache (only used if model not found above)
- PREFERRED_CACHE_DIR = os.getenv("CACHE_DIR", "/app/models")
-
- # Inference knobs (conservative for small CPU Spaces)
- N_THREADS = min(4, (os.cpu_count() or 2))
- N_BATCH = int(os.getenv("N_BATCH", "16"))  # safer than 32/64 on tiny CPUs
- N_CTX = int(os.getenv("N_CTX", "2048"))
+ REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
+ FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
+ CACHE_DIR = "/app/models"

- # Sampling (keep short for latency)
- MAX_TOKENS = int(os.getenv("MAX_TOKENS", "48"))  # tighter → faster
- TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
- TOP_P = float(os.getenv("TOP_P", "0.9"))
- STOP_TOKENS = os.getenv("STOP_TOKENS", "</s>,<|eot_id|>").split(",")
-
- SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "").strip()
- CTX_SAFETY = int(os.getenv("CTX_SAFETY", "128"))
+ N_THREADS = min(4, os.cpu_count() or 2)
+ N_BATCH = 16
+ N_CTX = 2048
+ MAX_TOKENS = 64

  # ---------------- App ----------------
- app = FastAPI(title="Llama 3.2 3B Instruct (llama.cpp) API - Prompt Only")
- _model: Optional[Llama] = None
- _model_lock = threading.Lock()
- _effective_model_path: Optional[str] = None
- _effective_cache_dir: Optional[str] = None
-
- def _select_writable_cache_dir(preferred: str) -> str:
-     candidates = [
-         preferred,
-         os.path.join(os.path.expanduser("~"), ".cache", "hf_models"),
-         "/tmp/hf_models",
-     ]
-     for d in candidates:
-         try:
-             os.makedirs(d, exist_ok=True)
-             test_file = os.path.join(d, ".w")
-             with open(test_file, "w") as f:
-                 f.write("ok")
-             os.remove(test_file)
-             return d
-         except Exception:
-             continue
-     raise RuntimeError("No writable cache directory found")
-
- def _resolve_model_path() -> str:
-     """
-     1) If the model file exists at build path (/app/models/...), use it (fast path).
-     2) Else, download into first writable cache dir and return that path.
-     """
-     global _effective_cache_dir
-     if os.path.isfile(BUILD_MODEL_PATH):
-         return BUILD_MODEL_PATH
+ app = FastAPI(title="Simple Llama Server")
+ model = None

-     if _effective_cache_dir is None:
-         _effective_cache_dir = _select_writable_cache_dir(PREFERRED_CACHE_DIR)
+ class PromptRequest(BaseModel):
+     prompt: str

+ # ---------------- Startup ----------------
+ @app.on_event("startup")
+ def load_model():
+     global model
+     os.makedirs(CACHE_DIR, exist_ok=True)
      local_path = hf_hub_download(
          repo_id=REPO_ID,
          filename=FILENAME,
-         cache_dir=_effective_cache_dir,
+         cache_dir=CACHE_DIR,
          local_dir_use_symlinks=False,
      )
-     return local_path
-
- # ---------------- Model loader ----------------
- def get_model() -> Llama:
-     global _model, _effective_model_path
-     if _model is not None:
-         return _model
-
-     # Resolve path without failing on /data permission
-     _effective_model_path = _resolve_model_path()
-
-     # llama.cpp init (CPU-friendly)
-     _model = Llama(
-         model_path=_effective_model_path,
+     model = Llama(
+         model_path=local_path,
          chat_format="llama-3",
          n_ctx=N_CTX,
          n_threads=N_THREADS,
          n_batch=N_BATCH,
-         use_mmap=True,   # faster load on CPU
-         n_gpu_layers=0,  # ensure pure CPU
          verbose=False,
      )
-     return _model
-
- @app.on_event("startup")
- def _warm_start():
-     get_model()  # force load at startup so first request is predictable
-
- # ---------------- Schemas ----------------
- class GenerateRequest(BaseModel):
-     prompt: str = Field(..., description="User prompt text only.")
-
- # ---------------- Helpers ----------------
- def _fit_prompt_to_context(model: Llama, prompt: str) -> str:
-     """
-     Ensure tokens(prompt) + MAX_TOKENS + CTX_SAFETY <= N_CTX.
-     If over budget, truncate from the front (keep the tail).
-     """
-     toks = model.tokenize(prompt.encode("utf-8"))
-     budget = max(256, N_CTX - MAX_TOKENS - CTX_SAFETY)
-     if len(toks) <= budget:
-         return prompt
-     kept = model.detokenize(toks[-budget:])
-     try:
-         return kept.decode("utf-8", errors="ignore")
-     except Exception:
-         return kept.decode("utf-8", "ignore")
-
- # ---------------- Endpoints ----------------
- @app.get("/health")
- def health():
-     try:
-         _ = get_model()
-         return {
-             "ok": True,
-             "model_path": _effective_model_path,
-             "cache_dir": _effective_cache_dir,
-             "n_threads": N_THREADS,
-             "n_batch": N_BATCH,
-             "n_ctx": N_CTX
-         }
-     except Exception as e:
-         return {"ok": False, "error": str(e)}
-
- @app.get("/warmup")
- def warmup():
-     model = get_model()
-     messages = [{"role": "user", "content": "Say OK."}]
-     t0 = time.time()
-     with _model_lock:
-         out = model.create_chat_completion(
-             messages=messages,
-             max_tokens=8,
-             temperature=0.0,
-             top_p=1.0,
-             stop=STOP_TOKENS,
-         )
-     dt = time.time() - t0
-     text = out["choices"][0]["message"]["content"]
-     return {"ok": True, "ms": int(dt * 1000), "resp": text.strip()}

+ # ---------------- Endpoint ----------------
  @app.post("/generate")
- def generate(req: GenerateRequest):
-     """
-     Non-streaming chat completion.
-     Accepts ONLY a prompt string; all other params are fixed here.
-     """
-     try:
-         if not req.prompt or not req.prompt.strip():
-             raise HTTPException(status_code=400, detail="prompt must be a non-empty string")
-
-         model = get_model()
-         user_prompt = req.prompt.strip()
-         fitted_prompt = _fit_prompt_to_context(model, user_prompt)
-
-         messages = []
-         if SYSTEM_PROMPT:
-             messages.append({"role": "system", "content": SYSTEM_PROMPT})
-         messages.append({"role": "user", "content": fitted_prompt})
-
-         t0 = time.time()
-         with _model_lock:
-             out = model.create_chat_completion(
-                 messages=messages,
-                 max_tokens=MAX_TOKENS,
-                 temperature=TEMPERATURE,
-                 top_p=TOP_P,
-                 stop=STOP_TOKENS,
-             )
-         dt = time.time() - t0
-
-         text = out["choices"][0]["message"]["content"]
-         usage = out.get("usage", {})
-
-         return JSONResponse({
-             "ok": True,
-             "response": text,
-             "usage": usage,
-             "timing_sec": round(dt, 3),
-             "params": {
-                 "max_tokens": MAX_TOKENS,
-                 "temperature": TEMPERATURE,
-                 "top_p": TOP_P,
-                 "stop": STOP_TOKENS,
-                 "n_ctx": N_CTX,
-                 "n_batch": N_BATCH,
-                 "n_threads": N_THREADS,
-             },
-             "prompt_truncated": (fitted_prompt != user_prompt),
-             "effective_model_path": _effective_model_path,
-             "effective_cache_dir": _effective_cache_dir,
-         })
-
-     except HTTPException:
-         raise
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=str(e))
+ def generate(req: PromptRequest):
+     global model
+     if model is None:
+         raise HTTPException(status_code=500, detail="Model not loaded")
+
+     out = model.create_chat_completion(
+         messages=[{"role": "user", "content": req.prompt}],
+         max_tokens=MAX_TOKENS,
+         temperature=0.7,
+         top_p=0.9,
+         stop=["</s>", "<|eot_id|>"]
+     )
+     text = out["choices"][0]["message"]["content"]
+     return JSONResponse({"response": text})
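
For reference, a minimal client-side sketch of how the simplified /generate endpoint could be called once the app is running. The base URL and port (http://localhost:7860 here) are assumptions about how the Space or container is launched, not part of this commit; only the request body ({"prompt": ...}) and the response shape ({"response": ...}) come from the code above.

# client_example.py — hypothetical usage sketch, not part of this commit.
# Assumes the server above is reachable at http://localhost:7860; adjust the URL
# to wherever the Space/container actually serves the FastAPI app.
import requests

resp = requests.post(
    "http://localhost:7860/generate",
    json={"prompt": "Say hello in one short sentence."},
    timeout=120,  # first request can be slow on a small CPU instance
)
resp.raise_for_status()
print(resp.json()["response"])  # the new app returns {"response": "..."}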