Alovestocode committed
Commit 82dcfd3 · verified · 1 Parent(s): f5c6fe4

Add tokenizer fallback cascade

Files changed (3)
  1. README.md +1 -1
  2. __pycache__/app.cpython-313.pyc +0 -0
  3. app.py +32 -1
README.md CHANGED
@@ -20,7 +20,7 @@ endpoint via the `HF_ROUTER_API` environment variable.
 
 | File | Purpose |
 | ---- | ------- |
-| `app.py` | Loads the merged checkpoint on demand (defaults to `Alovestocode/router-llama31-merged` for faster startup), exposes a `/v1/generate` API, and ships an interactive Gradio UI for manual testing. |
+| `app.py` | Loads the merged checkpoint on demand (tries `MODEL_REPO` first, then `router-llama31-merged`, `router-qwen3-32b-merged`, `router-gemma3-merged`), exposes a `/v1/generate` API, and ships an interactive Gradio UI for manual testing. |
 | `requirements.txt` | Minimal dependency set (transformers, bitsandbytes, torch, gradio, fastapi). |
 | `.huggingface/spaces.yml` | Configures the Space for ZeroGPU hardware and disables automatic sleep. |
 
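For quick manual testing of the `/v1/generate` endpoint mentioned above, a minimal client sketch follows. The Space URL is a hypothetical placeholder, and only the `prompt` field is confirmed by the `GeneratePayload` model in this commit; any other payload fields would be assumptions:

```python
import requests

# Hypothetical Space URL; replace with the actual deployment.
BASE_URL = "https://alovestocode-router.hf.space"

# Only `prompt` is confirmed by the GeneratePayload model in this diff.
resp = requests.post(
    f"{BASE_URL}/v1/generate",
    json={"prompt": "Route this query: summarise the attached report."},
    timeout=120,
)
resp.raise_for_status()
print(resp.json())
```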
__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -26,12 +26,43 @@ except Exception: # pragma: no cover
 load_dotenv()
 
 
-MODEL_ID = os.environ.get("MODEL_REPO", "Alovestocode/router-llama31-merged")
 MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "600"))
 DEFAULT_TEMPERATURE = float(os.environ.get("DEFAULT_TEMPERATURE", "0.2"))
 DEFAULT_TOP_P = float(os.environ.get("DEFAULT_TOP_P", "0.9"))
 USE_4BIT = os.environ.get("LOAD_IN_4BIT", "1") not in {"0", "false", "False"}
 
+MODEL_FALLBACKS = [
+    "Alovestocode/router-llama31-merged",
+    "Alovestocode/router-qwen3-32b-merged",
+    "Alovestocode/router-gemma3-merged",
+]
+
+
+def _initialise_tokenizer() -> tuple[str, AutoTokenizer]:
+    errors: dict[str, str] = {}
+    candidates = []
+    explicit = os.environ.get("MODEL_REPO")
+    if explicit:
+        candidates.append(explicit)
+    for name in MODEL_FALLBACKS:
+        if name not in candidates:
+            candidates.append(name)
+    for candidate in candidates:
+        try:
+            tok = AutoTokenizer.from_pretrained(candidate, use_fast=False)
+            print(f"Loaded tokenizer from {candidate}")
+            return candidate, tok
+        except Exception as exc:  # pragma: no cover - download errors
+            errors[candidate] = str(exc)
+            print(f"Tokenizer load failed for {candidate}: {exc}")
+    raise RuntimeError(
+        "Unable to load any router model. Tried:\n" +
+        "\n".join(f"- {k}: {v}" for k, v in errors.items())
+    )
+
+
+MODEL_ID, tokenizer = _initialise_tokenizer()
+
 
 class GeneratePayload(BaseModel):
     prompt: str
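As a self-contained illustration of the cascade's ordering (no downloads involved): an explicit `MODEL_REPO` is tried first, and the dedup check means pointing it at a repo already in `MODEL_FALLBACKS` does not list that repo twice. This sketch mirrors the candidate-building logic from `_initialise_tokenizer()`:

```python
import os

MODEL_FALLBACKS = [
    "Alovestocode/router-llama31-merged",
    "Alovestocode/router-qwen3-32b-merged",
    "Alovestocode/router-gemma3-merged",
]

def candidate_order() -> list[str]:
    # Mirrors _initialise_tokenizer(): the explicit MODEL_REPO (if set)
    # comes first, then the fallbacks, with duplicates skipped.
    candidates = []
    explicit = os.environ.get("MODEL_REPO")
    if explicit:
        candidates.append(explicit)
    for name in MODEL_FALLBACKS:
        if name not in candidates:
            candidates.append(name)
    return candidates

os.environ["MODEL_REPO"] = "Alovestocode/router-qwen3-32b-merged"
print(candidate_order())
# ['Alovestocode/router-qwen3-32b-merged',
#  'Alovestocode/router-llama31-merged',
#  'Alovestocode/router-gemma3-merged']
```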