Tim Luka Horstmann committed
Commit 9f40e8d · Parent(s): dcce920
Test first deployment

Files changed:
- Dockerfile +31 -0
- llm_server.py +70 -0
- requirements.txt +3 -0
Dockerfile
ADDED
@@ -0,0 +1,31 @@
FROM python:3.10-slim

ENV DEBIAN_FRONTEND=noninteractive \
    MODEL_REPO="unsloth/Qwen3-0.6B-GGUF" \
    MODEL_FILE="Qwen3-0.6B-Q4_K_M.gguf" \
    HF_HOME=/app/cache

# system deps + rust for llama-cpp
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential cmake git curl wget libgomp1 ca-certificates && \
    rm -rf /var/lib/apt/lists/* && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
    . "$HOME/.cargo/env" && rustup default stable

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python \
    && cd /tmp/llama-cpp-python \
    && FORCE_CMAKE=1 pip install --no-cache-dir . \
    && pip install --no-cache-dir "llama-cpp-python[server]" huggingface_hub \
    && rm -rf /tmp/llama-cpp-python


# Copy the LLM server code
COPY llm_server.py /app/llm_server.py

EXPOSE 7860
CMD ["uvicorn", "llm_server:app", "--host", "0.0.0.0", "--port", "7860"]
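Note (not part of the commit): the image bakes the model coordinates (MODEL_REPO, MODEL_FILE) into the environment but leaves the actual download to the server's startup hook. A minimal local pre-flight sketch, assuming llama-cpp-python and huggingface_hub are installed on the host, to check that this GGUF file downloads and loads on CPU before the image is built:

# pre-flight check (assumption: run on the host, outside the container)
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# same coordinates as the ENV block in the Dockerfile above
path = hf_hub_download(
    repo_id="unsloth/Qwen3-0.6B-GGUF",
    filename="Qwen3-0.6B-Q4_K_M.gguf",
)

# CPU-only load with settings similar to the server's
llm = Llama(model_path=path, n_ctx=1024, n_threads=2, n_gpu_layers=0)
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello."}],
    max_tokens=32,
)
print(out["choices"][0]["message"]["content"])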
llm_server.py
ADDED
@@ -0,0 +1,70 @@
import os, time, logging
from pathlib import Path
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from huggingface_hub import hf_hub_download, login

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

MODEL_DIR = Path("/app/pretrained_models/llm")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
MODEL_FILE = os.getenv("MODEL_FILE")
MODEL_PATH = MODEL_DIR / MODEL_FILE

@app.on_event("startup")
async def startup():
    logging.info("Starting LLM service…")
    if not MODEL_PATH.exists():
        token = os.getenv("HF_TOKEN")
        if token:
            login(token=token)
        hf_hub_download(
            repo_id=os.getenv("MODEL_REPO"),
            filename=MODEL_FILE,
            local_dir=str(MODEL_DIR)
        )
    global llm
    llm = Llama(
        model_path=str(MODEL_PATH),
        n_ctx=1024,
        n_threads=2,
        n_gpu_layers=0,
        use_mlock=True,
        f16_kv=True,
    )
    logging.info("LLM loaded.")

@app.post("/v1/chat/completions")
async def chat(req: dict):
    if req.get("model") != "llama-cpp":
        raise HTTPException(status_code=404, detail="Model not found")
    resp = llm.create_chat_completion(
        messages=req["messages"],
        max_tokens=req.get("max_tokens", 256),
        temperature=req.get("temperature", 0.7),
        top_p=req.get("top_p", 1.0),
        stream=False
    )
    return JSONResponse({
        "id": resp["id"],
        "object": "chat.completion",
        "created": resp.get("created", int(time.time())),
        "model": "llama-cpp",
        "choices": [{
            "index": 0,
            "message": {
                "role": resp["choices"][0]["message"]["role"],
                "content": resp["choices"][0]["message"]["content"],
            },
            "finish_reason": resp["choices"][0].get("finish_reason", "stop"),
        }],
        "usage": resp.get("usage", {}),
    })
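Note (not part of the commit): the endpoint mirrors the OpenAI chat-completions shape and only accepts the model name "llama-cpp". A minimal stdlib client sketch, assuming the container is running with port 7860 published on localhost (host and port are an assumption):

# hypothetical client for the endpoint above (host/port assumed)
import json, urllib.request

payload = {
    "model": "llama-cpp",  # any other value returns 404
    "messages": [{"role": "user", "content": "Give me one sentence about GGUF."}],
    "max_tokens": 64,
}
req = urllib.request.Request(
    "http://localhost:7860/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as r:
    resp = json.load(r)
print(resp["choices"][0]["message"]["content"])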
requirements.txt
ADDED
@@ -0,0 +1,3 @@
fastapi
uvicorn[standard]
huggingface_hub