Karan6933 committed on
Commit 538c943 · verified · 1 Parent(s): 2069ca1

Upload 7 files

Files changed (7)
  1. Dockerfile +33 -0
  2. app/main.py +89 -0
  3. app/model.py +199 -0
  4. app/prompt.py +23 -0
  5. app/schemas.py +41 -0
  6. requirements.txt +11 -0
  7. run.sh +12 -0
Dockerfile ADDED
@@ -0,0 +1,33 @@
+ # Dockerfile
+ FROM python:3.11-slim
+
+ # Set environment variables for Hugging Face cache optimization
+ ENV PYTHONUNBUFFERED=1 \
+     PYTHONDONTWRITEBYTECODE=1 \
+     HF_HOME=/tmp/.huggingface \
+     TRANSFORMERS_CACHE=/tmp/.cache/huggingface \
+     HF_HUB_CACHE=/tmp/.cache/huggingface/hub
+
+ # Install minimal system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements first for layer caching
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY app/ ./app/
+
+ # Create cache directories
+ RUN mkdir -p /tmp/.cache/huggingface
+
+ # Expose Hugging Face Spaces default port
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app/main.py ADDED
@@ -0,0 +1,89 @@
+ # app/main.py
+ """
+ FastAPI application for serving the Nanbeige4.1-3B model.
+ Optimized for Hugging Face Spaces (CPU, Docker).
+ """
+
+ import asyncio
+ from contextlib import asynccontextmanager
+
+ from fastapi import FastAPI
+ from fastapi.responses import StreamingResponse
+
+ from app.model import load_model, generate_stream, generate
+ from app.prompt import build_prompt
+ from app.schemas import GenerationRequest, GenerationResponse
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     """
+     Lifespan context manager for startup/shutdown events.
+     Loads the model on startup so it is ready before the first request.
+     """
+     # Startup: load model
+     print("Loading model...")
+     load_model()
+     print("Model loaded successfully")
+     yield
+     # Shutdown: cleanup (if needed)
+     print("Shutting down...")
+
+
+ app = FastAPI(
+     title="Nanbeige4.1-3B API",
+     description="FastAPI wrapper for Nanbeige4.1-3B with streaming support",
+     version="1.0.0",
+     lifespan=lifespan
+ )
+
+
+ @app.get("/")
+ async def health_check():
+     """Health check endpoint."""
+     return {"status": "ok", "model": "Nanbeige4.1-3B"}
+
+
+ @app.post("/generate")
+ async def generate_text(request: GenerationRequest):
+     """
+     Generate text from a prompt.
+     Supports both streaming (SSE) and non-streaming responses.
+     """
+     # Build the final prompt with system instructions
+     final_prompt = build_prompt(request.prompt)
+
+     if request.stream:
+         # Streaming response
+         async def stream_generator():
+             sync_gen = generate_stream(
+                 final_prompt,
+                 temperature=request.temperature,
+                 max_tokens=request.max_tokens
+             )
+             # Pull from the sync generator in a worker thread so token
+             # generation does not block the event loop
+             sentinel = object()
+             while True:
+                 chunk = await asyncio.to_thread(next, sync_gen, sentinel)
+                 if chunk is sentinel:
+                     break
+                 if chunk:
+                     # SSE format
+                     yield f"data: {chunk}\n\n"
+
+             yield "data: [DONE]\n\n"
+
+         return StreamingResponse(
+             stream_generator(),
+             media_type="text/event-stream",
+             headers={
+                 "Cache-Control": "no-cache",
+                 "Connection": "keep-alive",
+             }
+         )
+     else:
+         # Non-streaming response: run the blocking generate() call in a
+         # worker thread so it does not stall the event loop
+         result = await asyncio.to_thread(
+             generate,
+             final_prompt,
+             temperature=request.temperature,
+             max_tokens=request.max_tokens
+         )
+         return GenerationResponse(text=result)
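
A minimal client for this API might look like the sketch below (not part of the commit). The endpoint path and request fields come from app/main.py and app/schemas.py; the base URL and prompts are illustrative, and the requests library is an extra dependency not listed in requirements.txt.

# client_example.py (illustrative)
import requests

BASE_URL = "http://localhost:7860"  # assumption: server running locally on the default port

# Non-streaming request
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Delhi ka mausam kaisa hai?", "stream": False, "max_tokens": 100},
    timeout=300,
)
print(resp.json()["text"])

# Streaming request: consume the SSE lines emitted by /generate
with requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Ek chhoti si kahani sunao", "stream": True},
    stream=True,
    timeout=300,
) as stream_resp:
    for line in stream_resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        print(payload, end="", flush=True)
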
app/model.py ADDED
@@ -0,0 +1,199 @@
+ # app/model.py
+ """
+ Model loading and inference utilities for Nanbeige/Nanbeige4.1-3B.
+ Implements a singleton pattern so the model is loaded only once.
+ """
+
+ import gc
+ from threading import Thread
+ from typing import Generator, Optional
+
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     TextIteratorStreamer,
+ )
+
+ # Global singleton instances
+ _tokenizer: Optional[AutoTokenizer] = None
+ _model: Optional[AutoModelForCausalLM] = None
+
+
+ def get_quantization_config() -> Optional[BitsAndBytesConfig]:
+     """
+     Configure 4-bit quantization for memory efficiency.
+     Returns None on CPU-only hosts (bitsandbytes 4-bit requires CUDA)
+     or if the config cannot be built.
+     """
+     if not torch.cuda.is_available():
+         return None
+     try:
+         # 4-bit quantization config for minimal memory footprint
+         return BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_use_double_quant=True,
+         )
+     except Exception:
+         return None
+
+
+ def load_model() -> tuple[AutoTokenizer, AutoModelForCausalLM]:
+     """
+     Load tokenizer and model with a singleton pattern.
+     Loads on the first call and returns the cached instances thereafter.
+
+     Returns:
+         Tuple of (tokenizer, model)
+     """
+     global _tokenizer, _model
+
+     if _tokenizer is not None and _model is not None:
+         return _tokenizer, _model
+
+     model_name = "Nanbeige/Nanbeige4.1-3B"
+
+     # Load tokenizer
+     _tokenizer = AutoTokenizer.from_pretrained(
+         model_name,
+         use_fast=False,
+         trust_remote_code=True
+     )
+
+     # Configure model loading for CPU
+     # Use torch.float16 for memory efficiency
+     model_kwargs = {
+         "torch_dtype": torch.float16,
+         "trust_remote_code": True,
+         "low_cpu_mem_usage": True,
+     }
+
+     # Use quantization if available, otherwise fall back to standard loading
+     quant_config = get_quantization_config()
+     if quant_config is not None:
+         model_kwargs["quantization_config"] = quant_config
+
+     # Load model
+     _model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         **model_kwargs
+     )
+
+     # Ensure the model is in eval mode
+     _model.eval()
+
+     # Clear caches to free memory
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+     return _tokenizer, _model
+
+
+ def generate_stream(
+     prompt: str,
+     temperature: float = 0.7,
+     max_tokens: int = 200
+ ) -> Generator[str, None, None]:
+     """
+     Generate text in a streaming fashion.
+
+     Args:
+         prompt: Input prompt text
+         temperature: Sampling temperature
+         max_tokens: Maximum tokens to generate
+
+     Yields:
+         Text chunks as they are generated
+     """
+     tokenizer, model = load_model()
+
+     # Tokenize input
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         add_special_tokens=False
+     )
+
+     # Move to the same device as the model
+     input_ids = inputs.input_ids.to(model.device)
+
+     # Generation parameters
+     generation_kwargs = {
+         "input_ids": input_ids,
+         "max_new_tokens": max_tokens,
+         "temperature": temperature,
+         "top_p": 0.95,
+         # Fall back to greedy decoding when temperature is 0
+         "do_sample": temperature > 0,
+         "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
+         "eos_token_id": tokenizer.eos_token_id,
+     }
+
+     # Stream generation through a TextIteratorStreamer
+     streamer = TextIteratorStreamer(
+         tokenizer,
+         skip_prompt=True,
+         skip_special_tokens=True
+     )
+     generation_kwargs["streamer"] = streamer
+
+     # Run generation in a separate thread to enable streaming
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     for text in streamer:
+         yield text
+
+     thread.join()
+
+     # Cleanup
+     gc.collect()
+
+
+ def generate(
+     prompt: str,
+     temperature: float = 0.7,
+     max_tokens: int = 200
+ ) -> str:
+     """
+     Generate text non-streaming (full response).
+
+     Args:
+         prompt: Input prompt text
+         temperature: Sampling temperature
+         max_tokens: Maximum tokens to generate
+
+     Returns:
+         Complete generated text
+     """
+     tokenizer, model = load_model()
+
+     # Tokenize input
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         add_special_tokens=False
+     )
+
+     input_ids = inputs.input_ids.to(model.device)
+
+     # Generate
+     with torch.no_grad():
+         output_ids = model.generate(
+             input_ids,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=0.95,
+             do_sample=temperature > 0,
+             pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
+             eos_token_id=tokenizer.eos_token_id,
+         )
+
+     # Decode only the newly generated tokens
+     new_tokens = output_ids[0][len(input_ids[0]):]
+     response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+     # Cleanup
+     gc.collect()
+
+     return response
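
For a quick local sanity check of these helpers outside the API, something like the following works (illustrative only; the first call downloads the model weights from the Hub, which can take a while on a CPU Space):

# smoke_test.py (illustrative)
from app.model import generate, generate_stream

if __name__ == "__main__":
    # Non-streaming: returns the full completion as a string
    print(generate("User: 2 + 2 kitna hota hai?\nAssistant:", max_tokens=32))

    # Streaming: chunks are printed as the model produces them
    for chunk in generate_stream("User: Ek shayari sunao\nAssistant:", max_tokens=64):
        print(chunk, end="", flush=True)
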
app/prompt.py ADDED
@@ -0,0 +1,23 @@
+ # app/prompt.py
+ """
+ Prompt building utilities for the Nanbeige model.
+ """
+
+ # The system prompt is intentionally written in Hinglish. Roughly: "You are a
+ # helpful assistant. Always answer concisely and accurately. If you don't know
+ # something, clearly say 'I don't know'. Never say that you are an AI model.
+ # Share only verified information. Converse in Hinglish."
+ SYSTEM_PROMPT = """Tu ek helpful assistant hai. Hamesha concise aur accurate jawab de.
+ - Agar kuch pata nahi, toh clearly bol "I don't know"
+ - Kabhi bhi mat bol ki tu ek AI model hai
+ - Sirf verified information share kar
+ - Hinglish mein baat kar"""
+
+
+ def build_prompt(user_input: str) -> str:
+     """
+     Build the final prompt by combining the system prompt with the user input.
+
+     Args:
+         user_input: Raw user query/input
+
+     Returns:
+         Formatted prompt string ready for model inference
+     """
+     return f"{SYSTEM_PROMPT}\n\nUser: {user_input}\nAssistant:"
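
build_prompt uses a plain system/User/Assistant string layout. If the Nanbeige4.1-3B tokenizer ships a chat template (common for instruction-tuned checkpoints, though not verified here), the same messages could be rendered through it instead; build_chat_prompt below is a hypothetical sketch that falls back to build_prompt when no template is present.

# Hypothetical alternative to build_prompt(); uses only what app/model.py
# and app/prompt.py already provide.
from app.model import load_model
from app.prompt import SYSTEM_PROMPT, build_prompt


def build_chat_prompt(user_input: str) -> str:
    tokenizer, _ = load_model()
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
    ]
    if getattr(tokenizer, "chat_template", None):
        # Render the conversation with the model's own template and leave
        # the assistant turn open for generation
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    # No chat template available: fall back to the plain-string prompt
    return build_prompt(user_input)
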
app/schemas.py ADDED
@@ -0,0 +1,41 @@
+ # app/schemas.py
+ """
+ Pydantic schemas for API request/response validation.
+ """
+
+ from pydantic import BaseModel, Field
+
+
+ class GenerationRequest(BaseModel):
+     """Request schema for text generation endpoint."""
+
+     prompt: str = Field(
+         ...,
+         min_length=1,
+         description="Input prompt text"
+     )
+     temperature: float = Field(
+         default=0.7,
+         ge=0.0,
+         le=2.0,
+         description="Sampling temperature"
+     )
+     max_tokens: int = Field(
+         default=200,
+         ge=1,
+         le=2048,
+         description="Maximum tokens to generate"
+     )
+     stream: bool = Field(
+         default=True,
+         description="Whether to stream the response"
+     )
+
+
+ class GenerationResponse(BaseModel):
+     """Response schema for non-streaming generation."""
+
+     text: str = Field(
+         ...,
+         description="Generated text response"
+     )
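
Example request objects built from the schemas above (prompts are illustrative):

from app.schemas import GenerationRequest

# Defaults apply: temperature=0.7, max_tokens=200, stream=True
streaming_req = GenerationRequest(prompt="Mumbai ke baare mein batao")

non_streaming_req = GenerationRequest(
    prompt="Ek short poem likho",
    temperature=0.5,
    max_tokens=120,
    stream=False,
)

# Pydantic v2 serialization helpers
print(streaming_req.model_dump())
print(non_streaming_req.model_dump_json())
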
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ # requirements.txt
+ fastapi==0.115.0
+ uvicorn[standard]==0.32.0
+ pydantic==2.9.0
+ transformers==4.46.0
+ torch==2.5.0
+ accelerate==1.0.0
+ sentencepiece==0.2.0
+ bitsandbytes==0.44.0
+ huggingface-hub==0.26.0
+ python-multipart==0.0.12
run.sh ADDED
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ # run.sh
+ # Production startup script for uvicorn server
+
+ exec uvicorn app.main:app \
+     --host 0.0.0.0 \
+     --port 7860 \
+     --workers 1 \
+     --loop uvloop \
+     --http httptools \
+     --proxy-headers \
+     --forwarded-allow-ips '*'