Spaces: Running on Zero

Replace Gradio with FastAPI HTML console

Files changed:
- README.md +3 -3
- app.py +52 -57
- requirements.txt +0 -4
README.md CHANGED

````diff
@@ -20,8 +20,8 @@ endpoint via the `HF_ROUTER_API` environment variable.
 
 | File | Purpose |
 | ---- | ------- |
-| `app.py` | Loads the merged checkpoint on demand (tries `MODEL_REPO` first, then `router-…
-| `requirements.txt` | Minimal dependency set (transformers, bitsandbytes, torch, …
+| `app.py` | Loads the merged checkpoint on demand (tries `MODEL_REPO` first, then `router-qwen3-32b-merged`, `router-gemma3-merged`), exposes a `/v1/generate` API, and serves a small HTML console at `/gradio`. |
+| `requirements.txt` | Minimal dependency set (transformers, bitsandbytes, torch, fastapi). |
 | `.huggingface/spaces.yml` | Configures the Space for ZeroGPU hardware and disables automatic sleep. |
 
 ## Deployment Steps
@@ -39,7 +39,7 @@ endpoint via the `HF_ROUTER_API` environment variable.
 ```
 
 3. **Configure secrets**
-   - `MODEL_REPO` – defaults to `…
+   - `MODEL_REPO` – optional override; defaults to the fallback list (`router-qwen3-32b-merged`, `router-gemma3-merged`)
    - `HF_TOKEN` – token with read access to the merged model
 
 4. **Connect the main router UI**
````
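For reference, here is a minimal client for the `/v1/generate` endpoint the updated README describes, sketched under the assumption that the Space is reachable at a placeholder URL. The request fields mirror the payload the HTML console sends (see the `fetch` call in the `app.py` diff below), and the response is a JSON object with a single `text` field:

```python
import json
import urllib.request

SPACE_URL = "https://your-space.hf.space"  # placeholder; substitute your Space's URL

payload = {
    "prompt": "System instructions...\nUser: route this request.",
    "max_new_tokens": 600,
    "temperature": 0.2,
    "top_p": 0.9,
}

# POST the prompt and sampling parameters to the Space's generate endpoint.
req = urllib.request.Request(
    f"{SPACE_URL}/v1/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib.request.urlopen(req) as resp:
    result = json.loads(resp.read().decode("utf-8"))

print(result["text"])  # GenerateResponse carries the generated plan as `text`
```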
app.py CHANGED

```diff
@@ -6,26 +6,15 @@ from typing import Optional
 
 import torch
 from fastapi import FastAPI, HTTPException
+from fastapi.responses import HTMLResponse
 from pydantic import BaseModel
 
-import gradio as gr
-import spaces
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     BitsAndBytesConfig,
 )
 
-try:  # Load optional .env so Spaces and local runs behave the same.
-    from dotenv import load_dotenv
-except Exception:  # pragma: no cover
-    def load_dotenv(*_: object, **__: object) -> bool:
-        return False
-
-
-load_dotenv()
-
-
 MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "600"))
 DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", "0.2"))
 DEFAULT_TOP_P = float(os.environ.get("DEFAULT_TOP_P", "0.9"))
@@ -74,11 +63,9 @@ class GenerateResponse(BaseModel):
     text: str
 
 
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
 _MODEL = None
 
 
-@spaces.GPU(duration=120)
 def get_model() -> AutoModelForCausalLM:
     global _MODEL
     if _MODEL is None:
@@ -154,51 +141,59 @@ def generate_endpoint(payload: GeneratePayload) -> GenerateResponse:
     return GenerateResponse(text=text)
 
 
-…
+@fastapi_app.get("/gradio", response_class=HTMLResponse)
+def interactive_ui() -> str:
+    return """
+    <!doctype html>
+    <html>
+      <head>
+        <title>Router Control Room</title>
+        <style>
+          body { font-family: sans-serif; margin: 40px; max-width: 900px; }
+          textarea, input { width: 100%; }
+          textarea { height: 180px; }
+          pre { background: #111; color: #0f0; padding: 16px; border-radius: 8px; }
+        </style>
+      </head>
+      <body>
+        <h1>Router Control Room</h1>
+        <p>This lightweight UI calls <code>/v1/generate</code>. Provide a full router prompt below.</p>
+        <label>Prompt</label>
+        <textarea id="prompt" placeholder="Include system text + user query..."></textarea>
+        <label>Max new tokens</label>
+        <input id="max_tokens" type="number" value="600" min="64" max="1024" step="16" />
+        <label>Temperature</label>
+        <input id="temperature" type="number" value="0.2" min="0" max="2" step="0.05" />
+        <label>Top-p</label>
+        <input id="top_p" type="number" value="0.9" min="0" max="1" step="0.05" />
+        <button onclick="callRouter()">Generate plan</button>
+        <h2>Response</h2>
+        <pre id="response">(waiting)</pre>
+        <script>
+          async function callRouter() {
+            const resp = await fetch("/v1/generate", {
+              method: "POST",
+              headers: { "Content-Type": "application/json" },
+              body: JSON.stringify({
+                prompt: document.getElementById("prompt").value,
+                max_new_tokens: Number(document.getElementById("max_tokens").value),
+                temperature: Number(document.getElementById("temperature").value),
+                top_p: Number(document.getElementById("top_p").value)
+              })
+            });
+            const json = await resp.json();
+            document.getElementById("response").textContent = JSON.stringify(json, null, 2);
+          }
+        </script>
+      </body>
+    </html>
+    """
 
 
-
-app = gr.mount_gradio_app(fastapi_app, demo, path="/gradio")
+app = fastapi_app
 
 
 if __name__ == "__main__":  # pragma: no cover
-    pass
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
```
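The hunks above show only the edges of `get_model()`: a module-level `_MODEL` cache and a lazy loader that no longer needs the `@spaces.GPU` decorator. As a rough sketch of the lazy, quantized load-with-fallback behavior the README describes (the quantization settings, repo resolution order, and error handling below are assumptions, not the file's actual body):

```python
import os
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Candidate repos: MODEL_REPO first, then the fallbacks named in the README.
_CANDIDATES = [
    repo
    for repo in (
        os.environ.get("MODEL_REPO"),
        "router-qwen3-32b-merged",
        "router-gemma3-merged",
    )
    if repo
]

_MODEL = None


def get_model() -> AutoModelForCausalLM:
    """Load the first available checkpoint once, quantized to 4-bit."""
    global _MODEL
    if _MODEL is None:
        quant = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        last_error: Optional[Exception] = None
        for repo in _CANDIDATES:
            try:
                _MODEL = AutoModelForCausalLM.from_pretrained(
                    repo,
                    quantization_config=quant,
                    device_map="auto",
                    token=os.environ.get("HF_TOKEN"),
                )
                break
            except Exception as err:  # repo missing or unreadable; try the next
                last_error = err
        if _MODEL is None:
            raise RuntimeError("No router checkpoint could be loaded") from last_error
    return _MODEL
```

Caching at module scope means the first request pays the load cost and later requests reuse the same weights, which suits an on-demand ZeroGPU Space.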
requirements.txt CHANGED

```diff
@@ -1,9 +1,5 @@
-accelerate>=0.25.0
 bitsandbytes>=0.41.0
 fastapi>=0.110.0
-gradio>=4.38.0
-python-dotenv>=1.0.0
-spaces>=0.40.0
 torch>=2.1.0
 transformers>=4.40.0
 uvicorn>=0.22.0
```