Spaces: Running on Zero
Add tokenizer fallback cascade
Files changed:
- README.md (+1, -1)
- __pycache__/app.cpython-313.pyc (+0, -0)
- app.py (+32, -1)
README.md CHANGED

@@ -20,7 +20,7 @@ endpoint via the `HF_ROUTER_API` environment variable.
 
 | File | Purpose |
 | ---- | ------- |
-| `app.py` | Loads the merged checkpoint on demand (
+| `app.py` | Loads the merged checkpoint on demand (tries `MODEL_REPO` first, then `router-llama31-merged`, `router-qwen3-32b-merged`, `router-gemma3-merged`), exposes a `/v1/generate` API, and ships an interactive Gradio UI for manual testing. |
 | `requirements.txt` | Minimal dependency set (transformers, bitsandbytes, torch, gradio, fastapi). |
 | `.huggingface/spaces.yml` | Configures the Space for ZeroGPU hardware and disables automatic sleep. |
 
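The `/v1/generate` API called out in the table can be exercised from any HTTP client. Below is a minimal sketch using `requests`; the Space URL is a placeholder, and only the `prompt` field is confirmed by the `GeneratePayload` model in the app.py diff below, so treat everything else as an assumption rather than the endpoint's documented contract.

```python
# Hedged sketch: call the Space's /v1/generate endpoint.
# SPACE_URL is a placeholder; only the "prompt" field is taken
# from the GeneratePayload model shown in the app.py diff below.
import requests

SPACE_URL = "https://your-space.hf.space"  # placeholder Space URL

resp = requests.post(
    f"{SPACE_URL}/v1/generate",
    json={"prompt": "Route this request: 'refund my last order'."},
    timeout=120,  # on-demand checkpoint loading can make the first call slow
)
resp.raise_for_status()
print(resp.json())  # response schema is not part of this diff
```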
__pycache__/app.cpython-313.pyc CHANGED

Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
app.py CHANGED

@@ -26,12 +26,43 @@ except Exception: # pragma: no cover
 load_dotenv()
 
 
-MODEL_ID = os.environ.get("MODEL_REPO", "Alovestocode/router-llama31-merged")
 MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "600"))
 DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", "0.2"))
 DEFAULT_TOP_P = float(os.environ.get("DEFAULT_TOP_P", "0.9"))
 USE_4BIT = os.environ.get("LOAD_IN_4BIT", "1") not in {"0", "false", "False"}
 
+MODEL_FALLBACKS = [
+    "Alovestocode/router-llama31-merged",
+    "Alovestocode/router-qwen3-32b-merged",
+    "Alovestocode/router-gemma3-merged",
+]
+
+
+def _initialise_tokenizer() -> tuple[str, AutoTokenizer]:
+    errors: dict[str, str] = {}
+    candidates = []
+    explicit = os.environ.get("MODEL_REPO")
+    if explicit:
+        candidates.append(explicit)
+    for name in MODEL_FALLBACKS:
+        if name not in candidates:
+            candidates.append(name)
+    for candidate in candidates:
+        try:
+            tok = AutoTokenizer.from_pretrained(candidate, use_fast=False)
+            print(f"Loaded tokenizer from {candidate}")
+            return candidate, tok
+        except Exception as exc:  # pragma: no cover - download errors
+            errors[candidate] = str(exc)
+            print(f"Tokenizer load failed for {candidate}: {exc}")
+    raise RuntimeError(
+        "Unable to load any router model. Tried:\n" +
+        "\n".join(f"- {k}: {v}" for k, v in errors.items())
+    )
+
+
+MODEL_ID, tokenizer = _initialise_tokenizer()
+
 
 class GeneratePayload(BaseModel):
     prompt: str
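One detail of the cascade worth spelling out: an explicit `MODEL_REPO` is tried first and deduplicated against the built-in fallback list, so pointing it at a repo that is already a fallback does not produce a second download attempt. Here is a self-contained sketch of just that ordering logic, mirroring the first half of `_initialise_tokenizer`; `my-org/custom-router` is a hypothetical repo name used for illustration.

```python
# Standalone sketch of the candidate ordering in _initialise_tokenizer.
# "my-org/custom-router" is a hypothetical repo name for illustration.
MODEL_FALLBACKS = [
    "Alovestocode/router-llama31-merged",
    "Alovestocode/router-qwen3-32b-merged",
    "Alovestocode/router-gemma3-merged",
]


def candidate_order(explicit: str | None) -> list[str]:
    candidates: list[str] = []
    if explicit:
        candidates.append(explicit)  # MODEL_REPO wins, if set
    for name in MODEL_FALLBACKS:
        if name not in candidates:  # skip a duplicate of MODEL_REPO
            candidates.append(name)
    return candidates


# MODEL_REPO unset: fallbacks are tried in declaration order.
assert candidate_order(None) == MODEL_FALLBACKS
# Explicit repo is tried first, then all three fallbacks.
assert candidate_order("my-org/custom-router") == [
    "my-org/custom-router", *MODEL_FALLBACKS
]
# An explicit repo that is already a fallback is not tried twice.
assert candidate_order("Alovestocode/router-gemma3-merged") == [
    "Alovestocode/router-gemma3-merged",
    "Alovestocode/router-llama31-merged",
    "Alovestocode/router-qwen3-32b-merged",
]
```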