Spaces:

fugthchat
/

fugthdes

Sleeping

App Files Files Community

fugthchat committed on Nov 7

Commit

c8d6e8a

verified ·

1 Parent(s): ca47c49

Update app.py

Browse files

Files changed (1) hide show

app.py +140 -94

app.py CHANGED Viewed

@@ -1,57 +1,134 @@
 import os
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from llama_cpp import Llama
 from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import hf_hub_download
-import logging
-import threading
 from contextlib import asynccontextmanager
-# Set up logging
 logging.basicConfig(level=logging.INFO)
-# --- NEW, SMARTER MODEL MAP ---
-# We are swapping to better storytelling models
 MODEL_MAP = {
     "light": {
         "repo_id": "microsoft/Phi-3-mini-4k-instruct-gguf",
-        "filename": "Phi-3-mini-4k-instruct-q4.gguf" # 2.13 GB - MUCH smarter
     },
     "medium": {
         "repo_id": "TheBloke/DeepSeek-LLM-7B-Chat-GGUF",
-        "filename": "deepseek-llm-7b-chat.Q4_K_M.gguf" # 4.08 GB - High Quality
     },
     "heavy": {
         "repo_id": "TheBloke/DeepSeek-LLM-7B-Chat-GGUF",
-        "filename": "deepseek-llm-7b-chat.Q5_K_M.gguf" # 4.78 GB - Best Quality
     }
 }
-# --- GLOBAL CACHE & LOCK ---
 llm_cache = {}
-model_lock = threading.Lock()
-# --- LIFESPAN FUNCTION ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # This code runs ON STARTUP
-    logging.info("Server starting up... Acquiring lock to pre-load 'light' model (Phi-3).")
-    with model_lock:
-        get_llm_instance("light")
-    logging.info("Server is ready and 'light' model (Phi-3) is loaded.")
     yield
-    # This code runs ON SHUTDOWN
     logging.info("Server shutting down...")
     llm_cache.clear()
-# Pass the lifespan function to FastAPI
 app = FastAPI(lifespan=lifespan)
-# --- CORS ---
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -60,53 +137,15 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# --- Helper Function to Load Model ---
-def get_llm_instance(choice: str) -> Llama:
-    if choice not in MODEL_MAP:
-        logging.error(f"Invalid model choice: {choice}")
-        return None
-    if choice in llm_cache:
-        logging.info(f"Using cached model: {choice}")
-        return llm_cache[choice]
-    model_info = MODEL_MAP[choice]
-    repo_id = model_info["repo_id"]
-    filename = model_info["filename"]
-    try:
-        logging.info(f"Downloading model: {filename} from {repo_id}")
-        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
-        logging.info(f"Model downloaded to: {model_path}")
-        logging.info("Loading model into memory...")
-        llm = Llama(
-            model_path=model_path,
-            n_ctx=4096,
-            n_threads=2,
-            n_gpu_layers=0,
-            verbose=True
-        )
-        llm_cache.clear()
-        llm_cache[choice] = llm
-        logging.info(f"Model {choice} loaded successfully.")
-        return llm
-    except Exception as e:
-        logging.critical(f"CRITICAL ERROR: Failed to download/load model {filename}. Error: {e}", exc_info=True)
-        return None
-# --- API Data Models (SIMPLIFIED) ---
-class StoryPrompt(BaseModel):
     prompt: str
     model_choice: str
-    feedback: str = ""
-    story_memory: str = ""
 # --- API Endpoints ---
 @app.get("/")
 def get_status():
     loaded_model = list(llm_cache.keys())[0] if llm_cache else "None"
     return {
         "status": "AI server is online",
@@ -114,35 +153,42 @@ def get_status():
         "models": list(MODEL_MAP.keys())
     }
-@app.post("/generate")
-async def generate_story(prompt: StoryPrompt):
-    logging.info("Request received. Waiting to acquire model lock...")
-    with model_lock:
-        logging.info("Lock acquired. Processing request.")
-        try:
-            llm = get_llm_instance(prompt.model_choice)
-            if llm is None:
-                logging.error(f"Failed to get model for choice: {prompt.model_choice}")
-                return JSONResponse(status_code=503, content={"error": "The AI model is not available or failed to load."})
-            # We trust the frontend to build the full prompt
-            final_prompt = prompt.prompt
-            logging.info(f"Generating with {prompt.model_choice}...")
-            output = llm(
-                final_prompt,
-                max_tokens=512,
-                stop=["<|user|>", "<|endoftext|>", "user:"], # Added stop tokens for Phi-3
-                echo=False
-            )
-            generated_text = output["choices"][0]["text"].strip()
-            logging.info("Generation complete.")
-            return {"story_text": generated_text}
-        except Exception as e:
-            logging.error(f"An internal error occurred during generation: {e}", exc_info=True)
-            return JSONResponse(status_code=500, content={"error": "An unexpected error occurred."})
-        finally:
-            logging.info("Releasing model lock.")

 import os
+import uuid
+import threading
+import logging
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from llama_cpp import Llama
 from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import hf_hub_download
 from contextlib import asynccontextmanager
+# --- Setup ---
 logging.basicConfig(level=logging.INFO)
+# --- Model Map (Using the smarter Phi-3) ---
+# Keys are the model_choice values the API accepts; each entry names the
+# Hugging Face repo and the GGUF file that get_llm_instance() downloads.
+# The trailing size comments are the author's notes (presumably file size).
 MODEL_MAP = {
     "light": {
         "repo_id": "microsoft/Phi-3-mini-4k-instruct-gguf",
+        "filename": "Phi-3-mini-4k-instruct-q4.gguf" # 2.13 GB
     },
     "medium": {
         "repo_id": "TheBloke/DeepSeek-LLM-7B-Chat-GGUF",
+        "filename": "deepseek-llm-7b-chat.Q4_K_M.gguf" # 4.08 GB
     },
     "heavy": {
         "repo_id": "TheBloke/DeepSeek-LLM-7B-Chat-GGUF",
+        "filename": "deepseek-llm-7b-chat.Q5_K_M.gguf" # 4.78 GB
     }
 }
+# --- Global Caches & Locks ---
+# Loaded Llama instances keyed by model choice. get_llm_instance() clears the
+# dict before storing a newly loaded model, so it holds at most one entry.
 llm_cache = {}
+model_lock = threading.Lock() # Ensures only one model loads at a time
+llm_lock = threading.Lock() # Ensures only one generation job runs at a time
+# This is our new in-memory "database" for jobs
+# It will hold the status and results of background tasks
+# Shape: job_id -> {"status": ..., "result": ...}. Status starts as "pending",
+# becomes "processing" while the model runs, and ends as "complete" or "error".
+# NOTE(review): an entry is only removed when a finished job is polled, so jobs
+# that are never polled stay here for the life of the process.
+JOBS = {}
+# --- Helper: Load Model ---
+def get_llm_instance(choice: str) -> Llama:
+    with model_lock:
+        if choice not in MODEL_MAP:
+            logging.error(f"Invalid model choice: {choice}")
+            return None
+        if choice in llm_cache:
+            logging.info(f"Using cached model: {choice}")
+            return llm_cache[choice]
+        model_info = MODEL_MAP[choice]
+        repo_id = model_info["repo_id"]
+        filename = model_info["filename"]
+        try:
+            logging.info(f"Downloading model: {filename} from {repo_id}")
+            model_path = hf_hub_download(repo_id=repo_id, filename=filename)
+            logging.info(f"Model downloaded to: {model_path}")
+            logging.info("Loading model into memory...")
+            llm = Llama(
+                model_path=model_path,
+                n_ctx=4096,
+                n_threads=2,
+                n_gpu_layers=0,
+                verbose=True
+            )
+            llm_cache.clear()
+            llm_cache[choice] = llm
+            logging.info(f"Model {choice} loaded successfully.")
+            return llm
+        except Exception as e:
+            logging.critical(f"CRITICAL ERROR: Failed to download/load model {filename}. Error: {e}", exc_info=True)
+            return None
+# --- Helper: The Background AI Task ---
+def run_generation_in_background(job_id: str, model_choice: str, prompt: str):
+    """
+    This function runs in a separate thread.
+    It performs the long-running AI generation.
+    """
+    global JOBS
+    try:
+        # Acquire the lock. If another job is running, this will wait.
+        logging.info(f"Job {job_id}: Waiting to acquire LLM lock...")
+        with llm_lock:
+            logging.info(f"Job {job_id}: Lock acquired. Loading model.")
+            llm = get_llm_instance(model_choice)
+            if llm is None:
+                raise Exception("Model could not be loaded.")
+            JOBS[job_id]["status"] = "processing"
+            logging.info(f"Job {job_id}: Processing prompt...")
+            output = llm(
+                prompt,
+                max_tokens=512,
+                stop=["<|user|>", "<|endoftext|>", "user:"],
+                echo=False
+            )
+            generated_text = output["choices"][0]["text"].strip()
+            # Save the result and mark as complete
+            JOBS[job_id]["status"] = "complete"
+            JOBS[job_id]["result"] = generated_text
+            logging.info(f"Job {job_id}: Complete.")
+    except Exception as e:
+        logging.error(f"Job {job_id}: Failed. Error: {e}")
+        JOBS[job_id]["status"] = "error"
+        JOBS[job_id]["result"] = str(e)
+    finally:
+        # The lock is automatically released by the 'with' statement
+        logging.info(f"Job {job_id}: LLM lock released.")
+# --- FastAPI App & Lifespan ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    logging.info("Server starting up... Pre-loading 'light' model.")
+    get_llm_instance("light")
+    logging.info("Server is ready and 'light' model is loaded.")
     yield
     logging.info("Server shutting down...")
     llm_cache.clear()
 app = FastAPI(lifespan=lifespan)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# --- API Data Models ---
+# Request body for POST /submit_job.
+class SubmitPrompt(BaseModel):
+    # Prompt text; passed to the model unchanged.
     prompt: str
+    # Key into MODEL_MAP. It is not validated here: an unknown value makes the
+    # background job finish with status "error".
     model_choice: str
 # --- API Endpoints ---
 @app.get("/")
 def get_status():
+    """This is the 'wake up' and status check endpoint."""
     loaded_model = list(llm_cache.keys())[0] if llm_cache else "None"
     return {
         "status": "AI server is online",
         "models": list(MODEL_MAP.keys())
     }
+@app.post("/submit_job")
+async def submit_job(prompt: SubmitPrompt):
+    """
+    NEW: Instantly accepts a job and starts it in the background.
+    """
+    job_id = str(uuid.uuid4())
+    # Store the job as "pending"
+    JOBS[job_id] = {"status": "pending", "result": None}
+    # Start the background thread
+    thread = threading.Thread(
+        target=run_generation_in_background,
+        args=(job_id, prompt.model_choice, prompt.prompt)
+    )
+    thread.start()
+    logging.info(f"Job {job_id} submitted.")
+    # Return the Job ID to the user immediately
+    return {"job_id": job_id}
+@app.get("/get_job_status/{job_id}")
+async def get_job_status(job_id: str):
+    """
+    NEW: Allows the frontend to check on a job.
+    """
+    job = JOBS.get(job_id)
+    if job is None:
+        return JSONResponse(status_code=404, content={"error": "Job not found."})
+    # If the job is done, send the result and remove it from memory
+    if job["status"] in ["complete", "error"]:
+        result = job
+        del JOBS[job_id] # Clean up
+        return result
+    # If not done, just send the current status
+    return {"status": job["status"]}