Spaces:

Karan6933
/

abcd

Build error

App Files Files Community

Karan6933 committed 5 days ago

Commit

c2f1385

verified ·

1 Parent(s): 0483cf5

Update api/main.py

Browse files

Files changed (1) hide show

api/main.py +86 -73

api/main.py CHANGED Viewed

@@ -1,81 +1,94 @@
-from fastapi import FastAPI
-from fastapi.responses import StreamingResponse
-from batcher import BatchScheduler
-from bridge import stream_batch
 import asyncio
-import time
-app = FastAPI()
-scheduler = BatchScheduler(max_batch=8, max_wait_ms=30)
-# In-memory chat history (per process, for demo)
-chat_histories = {}
-@app.post("/chat")
-async def chat(prompt: str, session_id: str = "default"):
-    # Simple history management
-    if session_id not in chat_histories:
-        chat_histories[session_id] = []
-    # Contextual prompt construction
-    history = "\n".join(chat_histories[session_id])
-    if history:
-        full_prompt = f"{history}\n{prompt}"
-    else:
-        full_prompt = prompt
-    # Get the queue for this request
-    token_queue = await scheduler.add(full_prompt)
-    # Generator to yield tokens from the queue
-    async def response_generator():
-        full_response = []
-        while True:
-            token = await token_queue.get()
-            if token is None:
-                break
-            yield token
-            full_response.append(token)
-        # After streaming is done, update history
-        # Note: This runs after the response closes, might need background task if strict
-        # But for generator, code continues after yield
-        response_text = "".join(full_response)
-        chat_histories[session_id].append(f"User: {prompt}")
-        chat_histories[session_id].append(f"AI: {response_text}")
-        # Keep history concise
-        if len(chat_histories[session_id]) > 10:
-            chat_histories[session_id] = chat_histories[session_id][-10:]
-    return StreamingResponse(response_generator(), media_type="text/plain")
-async def batch_loop():
-    print("Batch loop started...")
-    while True:
-        # Wait for a batch
-        batch = await scheduler.get_batch()
-        if not batch:
-            await asyncio.sleep(0.01) # Short sleep if empty
-            continue
-        # Process batch
-        prompts, queues = zip(*batch)
-        print(f"Processing batch of {len(prompts)} prompts")
-        # Stream from C++ engine
-        # Iterate over the generator which yields step-by-step tokens
-        for step_tokens in stream_batch(prompts):
-            for q, token in zip(queues, step_tokens):
-                if token is not None:
-                    q.put_nowait(token)
-            # Yield control to event loop to let FastAPI flush tokens
-            await asyncio.sleep(0)
-        # Signal done
-        for q in queues:
-            q.put_nowait(None)
-@app.on_event("startup")
-async def startup_event():
-    asyncio.create_task(batch_loop())

+import os
 import asyncio
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+from typing import List, Optional
+import logging
+from engine import init_engine, get_engine
+# Log at INFO level; this module logs through its own named logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Configuration: each value is read from the environment, with a default.
+MODEL_PATH = os.environ.get("MODEL_PATH", "model/model.gguf")
+MODEL_URL = os.environ.get(
+    "MODEL_URL",
+    "https://huggingface.co/prithivMLmods/Nanbeige4.1-3B-f32-GGUF/resolve/main/Nanbeige4.1-3B.Q8_0.gguf",
+)
+# Request body accepted by POST /generate.
+class GenerateRequest(BaseModel):
+    # Prompt text handed to the engine.
+    prompt: str
+    # Forwarded to the engine as ``max_tokens``.
+    max_tokens: int = 256
+    # Forwarded to the engine as ``temperature``.
+    temperature: float = 0.7
+    # True: tokens are streamed as text/plain; False: one {"text": ...} reply.
+    stream: bool = True
+# Request body accepted by POST /generate_batch.
+class BatchRequest(BaseModel):
+    # Prompts passed to the engine's batch call in one list.
+    prompts: List[str]
+    # Forwarded to the engine as ``max_tokens``.
+    max_tokens: int = 256
+    # Forwarded to the engine as ``temperature``.
+    temperature: float = 0.7
+def download_model():
+    """Download the model to ``MODEL_PATH`` unless a file already exists there.
+
+    The data is fetched from ``MODEL_URL`` into a ``.part`` file next to the
+    target and moved into place only after the transfer has finished, so an
+    interrupted download is never mistaken for a complete model on the next
+    start.
+
+    Raises:
+        OSError: If the target directory cannot be created or the download
+            fails (``urllib.error.URLError`` is a subclass of ``OSError``).
+    """
+    import urllib.request
+    if os.path.exists(MODEL_PATH):
+        return
+    # A bare file name has an empty dirname, and os.makedirs("") raises.
+    target_dir = os.path.dirname(MODEL_PATH)
+    if target_dir:
+        os.makedirs(target_dir, exist_ok=True)
+    logger.info(f"Downloading model from {MODEL_URL}")
+    partial_path = MODEL_PATH + ".part"
+    try:
+        urllib.request.urlretrieve(MODEL_URL, partial_path)
+        # Same-directory rename: MODEL_PATH only ever names a finished file.
+        os.replace(partial_path, MODEL_PATH)
+    finally:
+        # Still present only when the download or the rename failed.
+        if os.path.exists(partial_path):
+            os.remove(partial_path)
+    logger.info("Model downloaded")
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Prepare the model before the app serves requests; log on shutdown.
+
+    Startup calls ``download_model()`` and then ``init_engine`` with
+    ``MODEL_PATH``. The statements after the ``yield`` run when the
+    application shuts down.
+    """
+    engine_options = {"n_ctx": 4096, "n_threads": 4}
+    logger.info("Starting up...")
+    download_model()
+    init_engine(MODEL_PATH, **engine_options)
+    logger.info("Ready!")
+    yield  # the application handles requests while suspended here
+    logger.info("Shutting down...")
+# Application instance; startup and shutdown are delegated to ``lifespan``.
+app = FastAPI(title="Nanbeige LLM API", lifespan=lifespan)
+@app.post("/generate")
+async def generate(req: GenerateRequest):
+    """Single prompt generation with streaming"""
+    engine = get_engine()
+    async def token_stream():
+        # The one place where the request options are handed to the engine.
+        async for piece in engine.generate_stream(
+            req.prompt,
+            max_tokens=req.max_tokens,
+            temperature=req.temperature,
+        ):
+            yield piece
+    if not req.stream:
+        # Non-streaming: drain the generator and return the joined text.
+        pieces = [piece async for piece in token_stream()]
+        return {"text": "".join(pieces)}
+    # Streaming: tokens are sent as plain text while the engine yields them.
+    return StreamingResponse(token_stream(), media_type="text/plain")
+@app.post("/generate_batch")
+async def generate_batch(req: BatchRequest):
+    """Batch generation (multiple prompts)"""
+    engine = get_engine()
+    # Every prompt in the request shares the same sampling options.
+    sampling = {"max_tokens": req.max_tokens, "temperature": req.temperature}
+    # The engine's return value is passed through under the "results" key.
+    return {"results": await engine.generate_batch(req.prompts, **sampling)}
+@app.get("/health")
+async def health():
+    # Status probe: always reports "ok" and adds whether the engine object's
+    # ``_model`` attribute is set.
+    engine = get_engine()
+    model_loaded = engine._model is not None
+    return {"status": "ok", "model_loaded": model_loaded}