Alovestocode committed
Commit ee25577 · verified · 1 Parent(s): 74309f5

Replace Gradio with FastAPI HTML console

Files changed (3):
  1. README.md +3 -3
  2. app.py +52 -57
  3. requirements.txt +0 -4
README.md CHANGED
@@ -20,8 +20,8 @@ endpoint via the `HF_ROUTER_API` environment variable.
 
 | File | Purpose |
 | ---- | ------- |
-| `app.py` | Loads the merged checkpoint on demand (tries `MODEL_REPO` first, then `router-llama31-merged`, `router-qwen3-32b-merged`, `router-gemma3-merged`), exposes a `/v1/generate` API, and ships an interactive Gradio UI for manual testing. |
-| `requirements.txt` | Minimal dependency set (transformers, bitsandbytes, torch, gradio, fastapi). |
+| `app.py` | Loads the merged checkpoint on demand (tries `MODEL_REPO` first, then `router-qwen3-32b-merged`, `router-gemma3-merged`), exposes a `/v1/generate` API, and serves a small HTML console at `/gradio`. |
+| `requirements.txt` | Minimal dependency set (transformers, bitsandbytes, torch, fastapi). |
 | `.huggingface/spaces.yml` | Configures the Space for ZeroGPU hardware and disables automatic sleep. |
 
 ## Deployment Steps
@@ -39,7 +39,7 @@ endpoint via the `HF_ROUTER_API` environment variable.
 ```
 
 3. **Configure secrets**
-   - `MODEL_REPO` – defaults to `Alovestocode/router-llama31-merged` (override if you need the larger Qwen/Gemma checkpoints)
+   - `MODEL_REPO` – optional override; defaults to the fallback list (`router-qwen3-32b-merged`, `router-gemma3-merged`)
    - `HF_TOKEN` – token with read access to the merged model
 
 4. **Connect the main router UI**
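
For reference, a minimal client sketch for the `/v1/generate` endpoint the README describes. The Space URL below is a placeholder, and the payload fields mirror the request/response models in `app.py` (`prompt`, `max_new_tokens`, `temperature`, `top_p` in; a single `text` field out):

```python
import requests  # assumes `requests` is available in the client environment

SPACE_URL = "https://your-space.hf.space"  # hypothetical URL; substitute your Space

resp = requests.post(
    f"{SPACE_URL}/v1/generate",
    json={
        "prompt": "Router system prompt + user query",
        "max_new_tokens": 600,
        "temperature": 0.2,
        "top_p": 0.9,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["text"])  # GenerateResponse carries the generated text
```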
app.py CHANGED
@@ -6,26 +6,15 @@ from typing import Optional
 
 import torch
 from fastapi import FastAPI, HTTPException
+from fastapi.responses import HTMLResponse
 from pydantic import BaseModel
 
-import gradio as gr
-import spaces
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     BitsAndBytesConfig,
 )
 
-try:  # Load optional .env so Spaces and local runs behave the same.
-    from dotenv import load_dotenv
-except Exception:  # pragma: no cover
-    def load_dotenv(*_: object, **__: object) -> bool:
-        return False
-
-
-load_dotenv()
-
-
 MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "600"))
 DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", "0.2"))
 DEFAULT_TOP_P = float(os.environ.get("DEFAULT_TOP_P", "0.9"))
@@ -74,11 +63,9 @@ class GenerateResponse(BaseModel):
     text: str
 
 
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
 _MODEL = None
 
 
-@spaces.GPU(duration=120)
 def get_model() -> AutoModelForCausalLM:
     global _MODEL
     if _MODEL is None:
@@ -154,51 +141,59 @@ def generate_endpoint(payload: GeneratePayload) -> GenerateResponse:
     return GenerateResponse(text=text)
 
 
-def gradio_infer(
-    prompt: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-) -> str:
-    return _generate(
-        prompt=prompt,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=top_p,
-    )
-
-
-with gr.Blocks(title="Router Model ZeroGPU Backend") as demo:
-    gr.Markdown(
-        f"### {MODEL_ID}\n"
-        "This Space serves a merged router checkpoint for the CourseGPT project. "
-        "Use the `/v1/generate` REST endpoint for programmatic access."
-    )
-    with gr.Row():
-        prompt_box = gr.Textbox(
-            label="Prompt",
-            placeholder="Router system prompt + user query…",
-            lines=8,
-        )
-    with gr.Row():
-        max_tokens = gr.Slider(64, 1024, MAX_NEW_TOKENS, step=16, label="max_new_tokens")
-        temperature = gr.Slider(0.0, 1.5, DEFAULT_TEMPERATURE, step=0.05, label="temperature")
-        top_p = gr.Slider(0.1, 1.0, DEFAULT_TOP_P, step=0.05, label="top_p")
-    output_box = gr.Textbox(label="Router Response", lines=10)
-    run_btn = gr.Button("Generate", variant="primary")
-    run_btn.click(
-        fn=gradio_infer,
-        inputs=[prompt_box, max_tokens, temperature, top_p],
-        outputs=output_box,
-    )
+@fastapi_app.get("/gradio", response_class=HTMLResponse)
+def interactive_ui() -> str:
+    return """
+    <!doctype html>
+    <html>
+      <head>
+        <title>Router Control Room</title>
+        <style>
+          body { font-family: sans-serif; margin: 40px; max-width: 900px; }
+          textarea, input { width: 100%; }
+          textarea { height: 180px; }
+          pre { background: #111; color: #0f0; padding: 16px; border-radius: 8px; }
+        </style>
+      </head>
+      <body>
+        <h1>Router Control Room</h1>
+        <p>This lightweight UI calls <code>/v1/generate</code>. Provide a full router prompt below.</p>
+        <label>Prompt</label>
+        <textarea id="prompt" placeholder="Include system text + user query..."></textarea>
+        <label>Max new tokens</label>
+        <input id="max_tokens" type="number" value="600" min="64" max="1024" step="16" />
+        <label>Temperature</label>
+        <input id="temperature" type="number" value="0.2" min="0" max="2" step="0.05" />
+        <label>Top-p</label>
+        <input id="top_p" type="number" value="0.9" min="0" max="1" step="0.05" />
+        <button onclick="callRouter()">Generate plan</button>
+        <h2>Response</h2>
+        <pre id="response">(waiting)</pre>
+        <script>
+          async function callRouter() {
+            const resp = await fetch("/v1/generate", {
+              method: "POST",
+              headers: { "Content-Type": "application/json" },
+              body: JSON.stringify({
+                prompt: document.getElementById("prompt").value,
+                max_new_tokens: Number(document.getElementById("max_tokens").value),
+                temperature: Number(document.getElementById("temperature").value),
+                top_p: Number(document.getElementById("top_p").value)
+              })
+            });
+            const json = await resp.json();
+            document.getElementById("response").textContent = JSON.stringify(json, null, 2);
+          }
+        </script>
+      </body>
+    </html>
+    """
 
 
-demo.queue()
-app = gr.mount_gradio_app(fastapi_app, demo, path="/gradio")
+app = fastapi_app
 
 
 if __name__ == "__main__":  # pragma: no cover
-    # Hugging Face Spaces will serve the exported `app` automatically.
-    # Running uvicorn manually can conflict with the platform proxy, so we deliberately
-    # no-op here.
-    pass
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
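
Because the new `/gradio` route returns static HTML rather than a mounted Gradio app, the console can be exercised without a GPU. A sketch with FastAPI's `TestClient` (which requires `httpx`, not part of requirements.txt), assuming importing `app` does not eagerly download the checkpoint since loading stays on demand:

```python
# Hypothetical smoke test for the replacement HTML console.
from fastapi.testclient import TestClient

from app import app  # app.py exports `app = fastapi_app`

client = TestClient(app)
resp = client.get("/gradio")
assert resp.status_code == 200
assert "Router Control Room" in resp.text  # <h1> from the embedded page
```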
 
requirements.txt CHANGED
@@ -1,9 +1,5 @@
-accelerate>=0.25.0
 bitsandbytes>=0.41.0
 fastapi>=0.110.0
-gradio>=4.38.0
-python-dotenv>=1.0.0
-spaces>=0.40.0
 torch>=2.1.0
 transformers>=4.40.0
 uvicorn>=0.22.0
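
With accelerate, gradio, python-dotenv, and spaces dropped, a quick import check (a sketch, assuming `pip install -r requirements.txt` completed) confirms the trimmed set still covers the app's runtime imports:

```python
# Confirm the five remaining pins import cleanly after the trim.
import bitsandbytes
import fastapi
import torch
import transformers
import uvicorn

print(fastapi.__version__, torch.__version__, transformers.__version__)
```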