Karan6933 committed
Commit 05944f9 · verified · 1 Parent(s): 3809b3c

Upload 8 files

Files changed (6)
  1. Dockerfile +5 -5
  2. app/main.py +25 -51
  3. app/model.py +105 -158
  4. app/prompt.py +2 -10
  5. app/schemas.py +6 -30
  6. requirements.txt +2 -4
Dockerfile CHANGED
@@ -1,25 +1,25 @@
+# Dockerfile
 FROM python:3.11-slim

 ENV PYTHONUNBUFFERED=1 \
-    CMAKE_ARGS="-DLLAMA_AVX2=ON" \
+    CMAKE_ARGS="-DLLAMA_AVX2=ON -DLLAMA_AVX=ON -DLLAMA_FMA=ON" \
     FORCE_CMAKE=1

-# System deps (required for llama.cpp)
+# System deps for llama.cpp compilation
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     cmake \
     git \
+    wget \
     && rm -rf /var/lib/apt/lists/*

 WORKDIR /app

-# Install python deps (IMPORTANT)
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt

-# Copy app
 COPY app/ ./app/

 EXPOSE 7860

-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app/main.py CHANGED
@@ -1,14 +1,13 @@
 # app/main.py
 """
-FastAPI application for serving Nanbeige4.1-3B model.
-CPU-ONLY optimized for Hugging Face Spaces (Docker).
+FastAPI app with llama.cpp backend.
 """

 import asyncio
 from contextlib import asynccontextmanager

 from fastapi import FastAPI
-from fastapi.responses import StreamingResponse, JSONResponse
+from fastapi.responses import StreamingResponse

 from app.model import load_model, generate_stream, generate
 from app.prompt import build_prompt
@@ -17,89 +16,64 @@ from app.schemas import GenerationRequest, GenerationResponse

 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """
-    Lifespan context manager for startup/shutdown events.
-    Loads model on startup to ensure it's ready for requests.
-    """
-    # Startup: Load model
-    print("Loading model on CPU...")
-    load_model()
-    print("Model loaded successfully on CPU")
+    """Startup: Download and load model."""
+    print("=" * 50)
+    print("Starting up - Loading GGUF model...")
+    print("=" * 50)
+    load_model()  # Pre-load on startup
+    print("Ready for requests!")
     yield
-    # Shutdown: Cleanup
     print("Shutting down...")


 app = FastAPI(
-    title="Nanbeige4.1-3B API (CPU)",
-    description="FastAPI wrapper for Nanbeige4.1-3B - CPU Optimized",
-    version="1.0.0",
+    title="Nanbeige3B-GGUF API",
+    description="Fast CPU inference with llama.cpp",
+    version="2.0.0",
     lifespan=lifespan
 )


 @app.get("/")
 async def health_check():
-    """Health check endpoint."""
     return {
-        "status": "ok",
-        "model": "Nanbeige4.1-3B",
+        "status": "ok",
+        "model": "Nanbeige-3B-GGUF",
+        "backend": "llama.cpp",
         "device": "cpu",
-        "mode": "float32"
+        "optimized": True
     }


 @app.post("/generate")
 async def generate_text(request: GenerationRequest):
-    """
-    Generate text from prompt.
-    Supports both streaming and non-streaming responses.
-    """
-    # Build final prompt with system instructions
     final_prompt = build_prompt(request.prompt)

     if request.stream:
-        # Streaming response
         async def stream_generator():
-            # Run sync generator in thread pool to not block event loop
            loop = asyncio.get_event_loop()

-            # Use run_in_executor for CPU-bound operations
-            def sync_generator():
-                return generate_stream(
+            def sync_gen():
+                for chunk in generate_stream(
                    final_prompt,
                    temperature=request.temperature,
                    max_tokens=request.max_tokens
-                )
+                ):
+                    yield chunk

-            # Get the generator
-            sync_gen = await loop.run_in_executor(None, sync_generator)
-
-            # Iterate through chunks
-            for chunk in sync_gen:
+            for chunk in sync_gen():
                if chunk:
-                    # SSE format
                    yield f"data: {chunk}\n\n"
-
            yield "data: [DONE]\n\n"

         return StreamingResponse(
            stream_generator(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            }
+            media_type="text/event-stream"
         )
     else:
-        # Non-streaming response - run in executor to not block
-        loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            lambda: generate(
-                final_prompt,
-                temperature=request.temperature,
-                max_tokens=request.max_tokens
-            )
+        result = generate(
+            final_prompt,
+            temperature=request.temperature,
+            max_tokens=request.max_tokens
         )
         return GenerationResponse(text=result)
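The /generate endpoint streams tokens as server-sent events: each chunk arrives as a "data: ..." line and the stream ends with "data: [DONE]". A minimal client sketch using the requests library (already pinned in requirements.txt); the script name and the localhost:7860 base URL are assumptions for local testing, not part of this commit:

# stream_client.py - hypothetical SSE client for the endpoint above
import requests

payload = {
    "prompt": "What is llama.cpp?",
    "temperature": 0.7,
    "max_tokens": 100,
    "stream": True,
}

with requests.post("http://localhost:7860/generate", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separator lines between SSE events
        chunk = line[len("data: "):]
        if chunk == "[DONE]":
            break
        print(chunk, end="", flush=True)
print()

For non-streaming use, send "stream": false and read the "text" field of the GenerationResponse JSON instead.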
app/model.py CHANGED
@@ -1,7 +1,7 @@
-# app/model.py - llama.cpp optimized version
+# app/model.py
 """
-CPU-optimized model loading using llama-cpp-python.
-2-4x faster than transformers on CPU.
+CPU-optimized model loading with automatic GGUF download.
+Uses llama.cpp for 2-4x faster inference on CPU.
 """

 import gc
@@ -9,188 +9,135 @@ import os
 from typing import Generator, Optional
 from pathlib import Path

-# Try to use llama.cpp, fallback to transformers
-try:
-    from llama_cpp import Llama
-    LLAMA_AVAILABLE = True
-except ImportError:
-    LLAMA_AVAILABLE = False
-    from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import hf_hub_download, list_repo_files
+from llama_cpp import Llama

 # Global singleton
-_llama_model = None
-_transformer_model = None
-_tokenizer = None
+_llama_model: Optional[Llama] = None

+# Model configuration
+MODEL_REPO = "TheBloke/Nanbeige-3B-GGUF"  # GGUF version is available here
+MODEL_FILE = "nanbeige-3b.Q4_K_M.gguf"  # 4-bit quantized, balanced quality/speed
+# If this one doesn't work, try: "nanbeige-3b.Q4_0.gguf" (faster, lower quality)
+# Or: "nanbeige-3b.Q5_K_M.gguf" (better quality, slower)

-def get_model_path() -> str:
+CACHE_DIR = "/tmp/models"
+
+
+def download_gguf_model() -> str:
     """
-    Returns path to GGUF model.
-    If GGUF not available, returns HF model name.
+    Download GGUF model from Hugging Face if it does not exist.
+    Returns local path to model file.
     """
-    # First check whether the GGUF has already been downloaded
-    gguf_path = "/tmp/models/nanbeige-3b-q4_0.gguf"
-    if os.path.exists(gguf_path):
-        return gguf_path
+    os.makedirs(CACHE_DIR, exist_ok=True)
+    local_path = os.path.join(CACHE_DIR, MODEL_FILE)
+
+    # Already downloaded
+    if os.path.exists(local_path):
+        print(f"GGUF model already exists: {local_path}")
+        return local_path
+
+    print(f"Downloading GGUF model: {MODEL_FILE}")
+    print(f"From: {MODEL_REPO}")
+    print("This may take a few minutes...")

-    # If not, return the HF model name instead
-    return "Nanbeige/Nanbeige4.1-3B"
+    try:
+        # Download from Hugging Face
+        downloaded_path = hf_hub_download(
+            repo_id=MODEL_REPO,
+            filename=MODEL_FILE,
+            cache_dir=CACHE_DIR,
+            local_dir=CACHE_DIR,
+            local_dir_use_symlinks=False
+        )
+        print(f"Model downloaded to: {downloaded_path}")
+        return downloaded_path
+
+    except Exception as e:
+        print(f"Error downloading GGUF model: {e}")
+        print("Falling back to smaller model or available alternative...")
+        raise


-def load_model():
+def load_model() -> Llama:
     """
-    Load model with llama.cpp if available (GGUF),
-    otherwise fallback to optimized transformers.
+    Load GGUF model with llama.cpp (optimized for CPU).
+    Downloads automatically if not present.
     """
-    global _llama_model, _transformer_model, _tokenizer
+    global _llama_model

-    # Already loaded
-    if _llama_model or _transformer_model:
-        return
+    if _llama_model is not None:
+        return _llama_model

-    model_path = get_model_path()
+    # Download if needed
+    model_path = download_gguf_model()

-    # If the model is in GGUF format, use llama.cpp (FAST)
-    if model_path.endswith(".gguf") and LLAMA_AVAILABLE:
-        print("Loading GGUF model with llama.cpp (optimized)...")
-        _llama_model = Llama(
-            model_path=model_path,
-            n_ctx=2048,
-            n_threads=4,  # CPU threads
-            n_batch=512,
-            verbose=False
-        )
-        print("Model loaded with llama.cpp")
+    print("Loading GGUF model with llama.cpp (CPU optimized)...")
+    print("This is 2-4x faster than transformers!")

-    # Otherwise fall back to transformers (SLOW but works)
-    else:
-        print("GGUF not available, using transformers (slower)...")
-        import torch
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-
-        model_name = "Nanbeige/Nanbeige4.1-3B"
-
-        _tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            use_fast=False
-        )
-
-        if _tokenizer.pad_token is None:
-            _tokenizer.pad_token = _tokenizer.eos_token
-
-        _transformer_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float32,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            device_map=None,
-        )
-        _transformer_model = _transformer_model.to("cpu")
-        _transformer_model.eval()
-
-        # Disable gradients
-        for param in _transformer_model.parameters():
-            param.requires_grad = False
-
-        print("Model loaded with transformers")
+    # CPU optimizations
+    _llama_model = Llama(
+        model_path=model_path,
+        n_ctx=2048,       # Context window
+        n_threads=4,      # CPU threads (tune based on your CPU)
+        n_batch=512,      # Batch size for prompt processing
+        verbose=False,    # Quiet mode
+        use_mmap=True,    # Memory mapping for faster loading
+        use_mlock=False,  # Don't lock memory (HF Spaces constraint)
+    )
+
+    print("Model loaded successfully!")
+    print("Threads: 4 | Context: 2048 | Quantization: Q4_K_M")

     gc.collect()
+    return _llama_model


-def generate_stream(prompt: str, temperature: float = 0.7, max_tokens: int = 100):
+def generate_stream(
+    prompt: str,
+    temperature: float = 0.7,
+    max_tokens: int = 200
+) -> Generator[str, None, None]:
     """
-    Generate with llama.cpp (fast) or transformers (slow).
+    Streaming generation with llama.cpp (FAST).
     """
-    load_model()
+    model = load_model()

-    # llama.cpp path (FAST - 2-4x speedup)
-    if _llama_model:
-        # llama.cpp native streaming
-        stream = _llama_model(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=0.95,
-            stream=True,
-            stop=["</s>", "User:", "Human:"]
-        )
-
-        for output in stream:
-            text = output["choices"][0]["text"]
-            if text:
-                yield text
+    # llama.cpp native streaming - very fast on CPU
+    stream = model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=0.95,
+        stream=True,
+        stop=["</s>", "User:", "Human:", "Assistant:"]
+    )
+
+    for output in stream:
+        text = output["choices"][0]["text"]
+        if text:
+            yield text

-    # Transformers fallback (SLOW)
-    else:
-        import torch
-        from threading import Thread
-        from transformers import TextIteratorStreamer
-
-        inputs = _tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
-        input_ids = inputs.input_ids
-
-        streamer = TextIteratorStreamer(
-            _tokenizer,
-            skip_prompt=True,
-            skip_special_tokens=True
-        )
-
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "max_new_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": 0.95,
-            "do_sample": True,
-            "pad_token_id": _tokenizer.pad_token_id,
-            "eos_token_id": _tokenizer.eos_token_id,
-            "streamer": streamer,
-            "use_cache": True,
-        }
-
-        thread = Thread(target=_transformer_model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        for text in streamer:
-            if text:
-                yield text
-
-        thread.join()
-
     gc.collect()


-def generate(prompt: str, temperature: float = 0.7, max_tokens: int = 100) -> str:
+def generate(
+    prompt: str,
+    temperature: float = 0.7,
+    max_tokens: int = 200
+) -> str:
     """
-    Non-streaming generation.
+    Non-streaming generation with llama.cpp.
     """
-    load_model()
+    model = load_model()

-    if _llama_model:
-        output = _llama_model(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=0.95,
-            stop=["</s>", "User:", "Human:"]
-        )
-        return output["choices"][0]["text"]
+    output = model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=0.95,
+        stop=["</s>", "User:", "Human:", "Assistant:"]
+    )

-    else:
-        import torch
-        inputs = _tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
-
-        with torch.no_grad():
-            output_ids = _transformer_model.generate(
-                inputs.input_ids,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                top_p=0.95,
-                do_sample=True,
-                pad_token_id=_tokenizer.pad_token_id,
-                eos_token_id=_tokenizer.eos_token_id,
-                use_cache=True,
-            )
-
-            new_tokens = output_ids[0][len(inputs.input_ids[0]):]
-            return _tokenizer.decode(new_tokens, skip_special_tokens=True)
+    gc.collect()
+    return output["choices"][0]["text"]
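Because load_model() is a lazy singleton, app/model.py can also be exercised directly, outside FastAPI. A minimal local smoke-test sketch; it assumes the configured MODEL_REPO/MODEL_FILE can actually be downloaded (the commit's own comments leave the exact filename open) and the script name is illustrative:

# smoke_test_model.py - hypothetical local check, not part of this commit
from app.model import generate, generate_stream

# Non-streaming: returns the whole completion as one string.
# The first call triggers the GGUF download into /tmp/models, so expect a delay.
print(generate("User: Say hi.\nAssistant:", temperature=0.2, max_tokens=32))

# Streaming: yields text fragments as llama.cpp produces them.
for piece in generate_stream("User: Count to three.\nAssistant:", max_tokens=32):
    print(piece, end="", flush=True)
print()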
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/prompt.py CHANGED
@@ -1,6 +1,6 @@
 # app/prompt.py
 """
-Prompt building utilities for Nanbeige model.
+Prompt building utilities.
 """

 SYSTEM_PROMPT = """Tu ek helpful assistant hai. Hamesha concise aur accurate jawab de.
@@ -11,13 +11,5 @@ SYSTEM_PROMPT = """Tu ek helpful assistant hai. Hamesha concise aur accurate jaw


 def build_prompt(user_input: str) -> str:
-    """
-    Build the final prompt by combining system prompt with user input.
-
-    Args:
-        user_input: Raw user query/input
-
-    Returns:
-        Formatted prompt string ready for model inference
-    """
+    """Build final prompt with system instructions."""
     return f"{SYSTEM_PROMPT}\n\nUser: {user_input}\nAssistant:"
app/schemas.py CHANGED
@@ -1,41 +1,17 @@
 # app/schemas.py
 """
-Pydantic schemas for API request/response validation.
+Pydantic schemas.
 """

 from pydantic import BaseModel, Field


 class GenerationRequest(BaseModel):
-    """Request schema for text generation endpoint."""
-
-    prompt: str = Field(
-        ...,
-        min_length=1,
-        description="Input prompt text"
-    )
-    temperature: float = Field(
-        default=0.7,
-        ge=0.0,
-        le=2.0,
-        description="Sampling temperature"
-    )
-    max_tokens: int = Field(
-        default=200,
-        ge=1,
-        le=512,
-        description="Maximum tokens to generate"
-    )
-    stream: bool = Field(
-        default=True,
-        description="Whether to stream the response"
-    )
+    prompt: str = Field(..., min_length=1)
+    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
+    max_tokens: int = Field(default=200, ge=1, le=1024)
+    stream: bool = Field(default=True)


 class GenerationResponse(BaseModel):
-    """Response schema for non-streaming generation."""
-
-    text: str = Field(
-        ...,
-        description="Generated text response"
-    )
+    text: str
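The request schema enforces the bounds the endpoint relies on: temperature in [0, 2], max_tokens in [1, 1024], streaming on by default. A minimal sketch of how validation behaves under pydantic v2 (illustrative, not part of this commit):

# Illustrative only: exercising the request schema from app/schemas.py.
from pydantic import ValidationError
from app.schemas import GenerationRequest

req = GenerationRequest(prompt="Hello")  # defaults: temperature=0.7, max_tokens=200, stream=True
print(req.model_dump())

try:
    GenerationRequest(prompt="Hello", max_tokens=5000)  # exceeds le=1024
except ValidationError as e:
    print(e.errors()[0]["type"])  # expected to report a "less_than_equal" violation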
 
 
 
 
 
requirements.txt CHANGED
@@ -2,9 +2,7 @@
 fastapi==0.115.0
 uvicorn[standard]==0.32.0
 pydantic==2.9.0
-transformers==4.46.0
-torch==2.5.0
-accelerate==1.0.0
-sentencepiece==0.2.0
+llama-cpp-python==0.3.2
 huggingface-hub==0.26.0
+requests==2.32.0
 python-multipart==0.0.12
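To confirm the trimmed dependency set is what actually ended up in the image, a small standard-library sketch can print the installed versions; the script name is hypothetical and the list simply mirrors requirements.txt:

# check_pins.py - hypothetical: report installed versions of the pinned packages
from importlib.metadata import version

for pkg in ["fastapi", "uvicorn", "pydantic", "llama-cpp-python",
            "huggingface-hub", "requests", "python-multipart"]:
    print(pkg, version(pkg))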