Update app.py
app.py CHANGED

@@ -12,19 +12,20 @@ from contextlib import asynccontextmanager
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 
-# --- MODEL MAP ---
+# --- NEW, SMARTER MODEL MAP ---
+# We are swapping to better storytelling models
 MODEL_MAP = {
     "light": {
-        "repo_id": "
-        "filename": "
+        "repo_id": "microsoft/Phi-3-mini-4k-instruct-gguf",
+        "filename": "Phi-3-mini-4k-instruct-q4.gguf"  # 2.13 GB - MUCH smarter
     },
     "medium": {
-        "repo_id": "TheBloke/
-        "filename": "
+        "repo_id": "TheBloke/DeepSeek-LLM-7B-Chat-GGUF",
+        "filename": "deepseek-llm-7b-chat.Q4_K_M.gguf"  # 4.08 GB - High Quality
     },
-    "heavy": {
-        "repo_id": "TheBloke/
-        "filename": "
+    "heavy": {
+        "repo_id": "TheBloke/DeepSeek-LLM-7B-Chat-GGUF",
+        "filename": "deepseek-llm-7b-chat.Q5_K_M.gguf"  # 4.78 GB - Best Quality
     }
 }
 
@@ -36,10 +37,10 @@ model_lock = threading.Lock()
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # This code runs ON STARTUP
-    logging.info("Server starting up... Acquiring lock to pre-load model.")
+    logging.info("Server starting up... Acquiring lock to pre-load 'light' model (Phi-3).")
     with model_lock:
         get_llm_instance("light")
-    logging.info("Server is ready and 'light' model is loaded.")
+    logging.info("Server is ready and 'light' model (Phi-3) is loaded.")
 
     yield
 
@@ -97,20 +98,13 @@ def get_llm_instance(choice: str) -> Llama:
         return None
 
 # --- API Data Models (SIMPLIFIED) ---
-# We only need the full prompt and the model choice
-# The frontend will build the prompt.
 class StoryPrompt(BaseModel):
     prompt: str
     model_choice: str
-
-    # These are no longer used by the backend, but we include them
-    # so the frontend's request doesn't fail
     feedback: str = ""
     story_memory: str = ""
 
-
 # --- API Endpoints ---
-
 @app.get("/")
 def get_status():
     loaded_model = list(llm_cache.keys())[0] if llm_cache else "None"
@@ -122,10 +116,6 @@ def get_status():
 
 @app.post("/generate")
 async def generate_story(prompt: StoryPrompt):
-    """
-    Main generation endpoint.
-    This is now much simpler.
-    """
     logging.info("Request received. Waiting to acquire model lock...")
     with model_lock:
         logging.info("Lock acquired. Processing request.")
@@ -135,16 +125,14 @@ async def generate_story(prompt: StoryPrompt):
             logging.error(f"Failed to get model for choice: {prompt.model_choice}")
             return JSONResponse(status_code=503, content={"error": "The AI model is not available or failed to load."})
 
-        #
-        # We trust the frontend and use the prompt exactly as it was sent.
-        # We no longer re-format it.
+        # We trust the frontend to build the full prompt
        final_prompt = prompt.prompt
 
        logging.info(f"Generating with {prompt.model_choice}...")
        output = llm(
            final_prompt,
            max_tokens=512,
-           stop=["<|user|>", "<|endoftext|>"],
+           stop=["<|user|>", "<|endoftext|>", "user:"],  # Added stop tokens for Phi-3
            echo=False
        )
 
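For context, `get_llm_instance` is referenced in the hunks above but not changed by this commit, so its body is not shown. A minimal sketch of how a loader that consumes these `MODEL_MAP` entries typically looks with `huggingface_hub` and `llama-cpp-python` follows; it is an assumption for illustration, not the code in this repository, and the `n_ctx`/`n_threads` values and the single-model eviction policy are guesses.

# Hypothetical sketch only -- the real get_llm_instance in app.py may differ.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_MAP = {
    "light": {
        "repo_id": "microsoft/Phi-3-mini-4k-instruct-gguf",
        "filename": "Phi-3-mini-4k-instruct-q4.gguf",
    },
}

llm_cache = {}

def get_llm_instance(choice: str):
    # Return the cached instance if this model is already loaded
    if choice in llm_cache:
        return llm_cache[choice]
    config = MODEL_MAP.get(choice)
    if config is None:
        return None
    # Fetch the GGUF file (or reuse the local Hugging Face cache copy)
    model_path = hf_hub_download(repo_id=config["repo_id"], filename=config["filename"])
    # n_ctx / n_threads are placeholder values, not taken from this commit
    llm = Llama(model_path=model_path, n_ctx=4096, n_threads=4)
    llm_cache.clear()       # keep a single model resident at a time
    llm_cache[choice] = llm
    return llm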
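Because the backend now passes `prompt.prompt` to the model verbatim and the stop list targets Phi-3-style tags, the frontend is expected to send an already-formatted chat prompt. A hedged example of calling `/generate` is below; the host, port, and exact prompt template are assumptions.

import requests

# Phi-3-instruct style prompt; the exact template the frontend builds is an assumption.
prompt_text = (
    "<|user|>\n"
    "Write the opening scene of a cozy mystery set in a lighthouse.<|end|>\n"
    "<|assistant|>\n"
)

payload = {
    "prompt": prompt_text,
    "model_choice": "light",   # "light", "medium", or "heavy" per MODEL_MAP
    "feedback": "",            # accepted by StoryPrompt but ignored by the backend
    "story_memory": "",        # accepted by StoryPrompt but ignored by the backend
}

# Base URL is an assumption; point it at wherever the FastAPI app is running.
response = requests.post("http://localhost:8000/generate", json=payload, timeout=600)
print(response.json())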