Joel Lundgren committed
Commit 9ef29cf · 1 Parent(s): a22ca8b
Files changed (1):
  1. app.py +78 -12
app.py CHANGED
@@ -73,13 +73,24 @@ def get_llm(model_name, preferred_file: str | None = None):
     tokenizer_repo = tokenizer_repo_map[model_name]
 
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo)
+    # Ensure pad token exists (common for decoder-only models)
+    if tokenizer.pad_token_id is None:
+        try:
+            tokenizer.pad_token = tokenizer.eos_token
+        except Exception:
+            pass
 
     # Try a few common ONNX filenames found in community repos to avoid the
     # "Too many ONNX model files were found" ambiguity.
+    # Order: prefer int8, then q4f16, q4, general quantized, uint8, fp16, and finally generic.
     candidate_files = [
+        "model_int8.onnx",
+        "model_q4f16.onnx",
         "model_q4.onnx",
         "model_quantized.onnx",
-        "model_int8.onnx",
+        "model_uint8.onnx",
+        "model_fp16.onnx",
+        "model_bnb4.onnx",
         "model.onnx",
     ]
 
@@ -100,6 +111,7 @@ def get_llm(model_name, preferred_file: str | None = None):
                 subfolder="onnx",
                 file_name=fname,
             )
+            print(f"[ONNX] Loaded {onnx_repo}/onnx/{fname}")
             break
         except Exception as e:
             last_err = e
@@ -113,6 +125,12 @@ def get_llm(model_name, preferred_file: str | None = None):
         model.config.use_cache = False
     except Exception:
         pass
+    # Mirror in generation config as well
+    if hasattr(model, "generation_config") and hasattr(model.generation_config, "use_cache"):
+        try:
+            model.generation_config.use_cache = False
+        except Exception:
+            pass
 
     llm_cache[cache_key] = (model, tokenizer)
     return model, tokenizer
@@ -161,17 +179,47 @@ def generate_text(
     )
 
     inputs = tokenizer([text], return_tensors="pt")
+    # Ensure attention_mask is present and pad_token is defined
+    if "attention_mask" not in inputs:
+        inputs = tokenizer([text], return_tensors="pt", padding=True)
+    if getattr(tokenizer, "pad_token_id", None) is None and getattr(tokenizer, "eos_token_id", None) is not None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+
+    gen_kwargs = {
+        "max_new_tokens": int(max_new_tokens),
+        "do_sample": bool(do_sample),
+        "temperature": float(temperature),
+        "top_p": float(top_p),
+        "top_k": int(top_k),
+        "repetition_penalty": float(repetition_penalty),
+        "use_cache": False,
+    }
+    if getattr(tokenizer, "eos_token_id", None) is not None:
+        gen_kwargs["eos_token_id"] = tokenizer.eos_token_id
 
     with torch.inference_mode():
-        gen_ids = model.generate(
-            **inputs,
-            max_new_tokens=int(max_new_tokens),
-            do_sample=bool(do_sample),
-            temperature=float(temperature),
-            top_p=float(top_p),
-            top_k=int(top_k),
-            repetition_penalty=float(repetition_penalty),
-        )
+        try:
+            gen_ids = model.generate(
+                **inputs,
+                **gen_kwargs,
+            )
+        except Exception as e:
+            msg = str(e)
+            # Retry with int8 if KV cache shape mismatch and user didn't pick int8
+            if (
+                "past_key_values" in msg or "INVALID_ARGUMENT" in msg
+            ) and onnx_file_choice != "model_int8.onnx":
+                # Reload as int8 and retry once
+                model, tokenizer = get_llm(model_name, preferred_file="model_int8.onnx")
+                gen_kwargs["use_cache"] = False
+                gen_ids = model.generate(
+                    **inputs,
+                    **gen_kwargs,
+                )
+                # Mark that we switched variant
+                switched_variant_note = "\n[Note] Switched to model_int8.onnx due to KV-cache shape mismatch."
+            else:
+                raise
 
     # Decode only the newly generated tokens beyond the input length
     trimmed = [
@@ -179,6 +227,8 @@ def generate_text(
         for input_ids, output_ids in zip(inputs.input_ids, gen_ids)
     ]
     response = tokenizer.batch_decode(trimmed, skip_special_tokens=True)[0]
+    if 'switched_variant_note' in locals():
+        response = response + switched_variant_note
     return response
 
 def chat_respond(
@@ -233,7 +283,13 @@ def chat_respond(
     )
 
     trimmed = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, gen_ids)]
-    reply = tokenizer.batch_decode(trimmed, skip_special_tokens=True)[0]
+    try:
+        reply = tokenizer.batch_decode(trimmed, skip_special_tokens=True)[0]
+    except Exception as e:
+        # Gracefully surface decoding issues
+        reply = f"[Error] Failed to decode model output: {e}"
+    if 'switched_variant_note' in locals():
+        reply = reply + switched_variant_note
 
     new_history = (history or []) + [(message, reply)]
     return new_history, gr.update(value="")
@@ -251,7 +307,17 @@ with gr.Blocks() as demo:
     with gr.Tab("LLM Chat"):
        model_selector = gr.Dropdown(choices=["gemma3:1b", "qwen3:0.6b"], label="Select LLM Model")
        onnx_file_selector = gr.Dropdown(
-            choices=["auto", "model_q4.onnx", "model_int8.onnx", "model_quantized.onnx", "model.onnx"],
+            choices=[
+                "auto",
+                "model_int8.onnx",
+                "model_q4f16.onnx",
+                "model_q4.onnx",
+                "model_quantized.onnx",
+                "model_uint8.onnx",
+                "model_fp16.onnx",
+                "model_bnb4.onnx",
+                "model.onnx",
+            ],
            value="auto",
            label="ONNX file variant"
        )
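
Context note: the loop that consumes candidate_files (the hunk around new line 111) is only partially visible in this diff, so the sketch below is one plausible shape for that fallback loader, not the committed implementation. It assumes optimum.onnxruntime's ORTModelForCausalLM as the loader class and reuses the names onnx_repo, preferred_file, and last_err that do appear in the changed lines.

    # Minimal sketch of the candidate-file fallback, assuming optimum's ORT loader.
    # The loader class is an assumption; only the kwargs subfolder/file_name and the
    # break/last_err pattern are visible in the diff itself.
    from optimum.onnxruntime import ORTModelForCausalLM

    def load_first_available(onnx_repo: str, candidate_files: list[str], preferred_file: str | None = None):
        files = [preferred_file] + candidate_files if preferred_file else candidate_files
        last_err = None
        for fname in files:
            try:
                model = ORTModelForCausalLM.from_pretrained(
                    onnx_repo,
                    subfolder="onnx",
                    file_name=fname,  # pin one file to avoid the "Too many ONNX model files" ambiguity
                )
                print(f"[ONNX] Loaded {onnx_repo}/onnx/{fname}")
                return model
            except Exception as e:
                last_err = e  # remember the failure and try the next variant
        raise RuntimeError(f"No ONNX variant could be loaded from {onnx_repo}") from last_err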
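The dropdown now mirrors the new candidate order and exposes every variant the loader will try. How its value reaches get_llm is not part of this diff; one plausible wiring, with a hypothetical helper name, looks like this:

    # Hypothetical adapter between the Gradio dropdown and get_llm: "auto" means
    # "let candidate_files decide", any other choice pins that exact file.
    def resolve_preferred_file(onnx_file_choice: str) -> str | None:
        return None if onnx_file_choice == "auto" else onnx_file_choice

    # e.g. inside the click handler:
    # model, tokenizer = get_llm(model_name, preferred_file=resolve_preferred_file(onnx_file_choice))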