Spaces:

GrassData
/

cliptagger-12b

Running on A100

App Files Files Community

andrejrad committed on Aug 18

Commit

21b17c3

verified ·

1 Parent(s): 8ed3bc5

Update app.py

Browse files

Files changed (1) hide show

app.py +121 -136

app.py CHANGED Viewed

@@ -4,22 +4,21 @@ import gradio as gr
 from PIL import Image
 import torch
 import spaces
-# --------------------------
-# Environment
-# --------------------------
 MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
-HF_TOKEN = os.environ.get("HF_TOKEN")
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-DTYPE  = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 TEMP = 0.1
 MAX_NEW_TOKENS = 2000
-# --------------------------
-# Prompts (yours)
-# --------------------------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
     "You will be given instructions on the output format, what to caption, and how to perform your job. "
@@ -56,77 +55,105 @@ Rules:
 - Output **only the JSON**, no extra text or explanation.
 """
-# --------------------------
-# Load full VLM (Gemma-3)
-# --------------------------
-from transformers import AutoConfig, AutoProcessor, AutoTokenizer, AutoModelForCausalLM
-processor = tokenizer = model = None
-LOAD_ERROR = None
-try:
-    cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
-    if "clip" in cfg.__class__.__name__.lower():
-        raise RuntimeError(
-            f"MODEL_ID '{MODEL_ID}' resolves to a CLIP/encoder config. "
-            "Point MODEL_ID to your full VLM checkpoint (this repo's config shows gemma3)."
-        )
-    # Processor (has vision + tokenizer routing)
     try:
-        processor = AutoProcessor.from_pretrained(
             MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
         )
-    except TypeError:
-        processor = AutoProcessor.from_pretrained(
-            MODEL_ID, token=HF_TOKEN, trust_remote_code=True
         )
-    # Model
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        token=HF_TOKEN,
-        device_map="auto",
-        torch_dtype=DTYPE,
-        trust_remote_code=True,
-    )
-    # Tokenizer (fall back in case processor doesn't expose it)
-    tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
-        MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
-    )
-except Exception as e:
-    LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"
-# --------------------------
-# Inference
-# --------------------------
-def _build_messages(image: Image.Image):
-    return [
-        {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
-        {"role": "user", "content": [{"type": "image", "image": image},
-                                     {"type": "text",  "text": USER_PROMPT}]}
-    ]
-def _run(image: Image.Image) -> Tuple[str, Dict[str, Any], bool]:
     if image is None:
         return "Please upload an image.", None, False
-    if model is None or processor is None:
-        msg = (
-            "❌ Model failed to load.\n\n"
-            f"{LOAD_ERROR or 'Unknown error.'}\n"
-            "Check: MODEL_ID, HF_TOKEN, and that the repo includes processor + model shards."
-        )
-        return msg, None, False
-    # Build chat input
-    if hasattr(processor, "apply_chat_template"):
-        prompt = processor.apply_chat_template(
-            _build_messages(image), add_generation_prompt=True, tokenize=False
-        )
     else:
-        # Conservative fallback
         msgs = _build_messages(image)
         prompt = ""
         for m in msgs:
@@ -137,93 +164,49 @@ def _run(image: Image.Image) -> Tuple[str, Dict[str, Any], bool]:
                 elif chunk["type"] == "image":
                     prompt += f"{role}: [IMAGE]\n"
-    # Tokenize with vision
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
-    # Generation args
     gen_kwargs = dict(
         temperature=TEMP,
         max_new_tokens=MAX_NEW_TOKENS,
     )
-    # If your config has multiple eos ids (yours does: [1, 106]), pass them
-    eos_id = getattr(tokenizer, "eos_token_id", None)
-    try:
-        # prefer config’s eos_token_id if list-like
-        from transformers.utils import is_torch_available
-        cfg_eos = getattr(model.config, "eos_token_id", None)
-        if isinstance(cfg_eos, (list, tuple)):
-            gen_kwargs["eos_token_id"] = list(cfg_eos)
-        elif eos_id is not None:
-            gen_kwargs["eos_token_id"] = eos_id
-    except Exception:
-        if eos_id is not None:
-            gen_kwargs["eos_token_id"] = eos_id
-    # Ask model to emit strict JSON (supported in newer transformers for some models)
     try:
         gen_kwargs["response_format"] = {"type": "json_object"}
     except Exception:
         pass
     with torch.inference_mode():
-        out_ids = model.generate(**inputs, **gen_kwargs)
-    # Decode via processor if available (some VLMs override decode)
-    if hasattr(processor, "decode"):
-        text = processor.decode(out_ids[0], skip_special_tokens=True)
-    else:
-        text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
-    # Trim any echoed prompt
     if USER_PROMPT in text:
         text = text.split(USER_PROMPT)[-1].strip()
-    # Strict parse, with fallback to top-level {...}
-    try:
-        parsed = json.loads(text)
         return json.dumps(parsed, indent=2), parsed, True
-    except Exception:
-        m = re.search(r"\{(?:[^{}]|(?R))*\}", text, flags=re.DOTALL)
-        if m:
-            try:
-                parsed = json.loads(m.group(0))
-                return json.dumps(parsed, indent=2), parsed, True
-            except Exception:
-                pass
-        # Return raw text to help debug prompt adherence if needed
-        return text, None, False
-# --------------------------
-# Spaces GPU entry + warmup
-# --------------------------
-@spaces.GPU
-def annotate_image(pil: Image.Image):
-    return _run(pil)
 @spaces.GPU(duration=60)
 def _warmup():
-    if model is None or processor is None:
-        return "skip"
     try:
-        dummy = Image.new("RGB", (64, 64), (127, 127, 127))
-        _ = _run(dummy)
-        return "ok"
     except Exception as e:
         return f"warmup error: {e}"
-try:
-    _ = _warmup()
-except Exception:
-    pass
-# --------------------------
-# UI
-# --------------------------
-with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (Gemma-3 VLM)") as demo:
-    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT)\nUpload an image to get **strict JSON** annotations.")
-    if LOAD_ERROR:
-        with gr.Accordion("Startup Error Details", open=False):
-            gr.Markdown(f"```\n{LOAD_ERROR}\n```")
     with gr.Row():
         with gr.Column(scale=1):
@@ -234,10 +217,12 @@ with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe
             out_json = gr.JSON(label="Parsed JSON")
             ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)
-    def on_click(img):
-        text, js, ok = _run(img)
-        return text, js, ok
     btn.click(annotate_image, inputs=[image], outputs=[out_text, out_json, ok_flag])
 demo.queue(max_size=32).launch()

 from PIL import Image
 import torch
 import spaces
+from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoConfig
+# --------- ENV / PARAMS ----------
 MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
+HF_TOKEN = os.environ.get("HF_TOKEN")  # put this in Space -> Settings -> Variables & secrets
 TEMP = 0.1
 MAX_NEW_TOKENS = 2000
+# Lazy globals (ZeroGPU-safe)
+_processor: Any = None
+_tokenizer: Any = None
+_model: Any = None
+_last_load_error: str | None = None
+# --------- PROMPTS (yours) ----------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
     "You will be given instructions on the output format, what to caption, and how to perform your job. "
 - Output **only the JSON**, no extra text or explanation.
 """
+# --------- HELPERS ----------
def _json_extract(text: str):
    """Parse model output as JSON, tolerating surrounding non-JSON text.

    The whole string is tried first. If that fails, every ``{`` is tried in
    order as the start of a JSON object; the first one that decodes is
    returned and any text after it is ignored.

    Args:
        text: Raw decoded model output.

    Returns:
        The decoded JSON value, or ``None`` when nothing decodable is found.
    """
    try:
        return json.loads(text)
    except (TypeError, ValueError, RecursionError):
        # Not pure JSON (or not text at all): fall through to the scan below.
        pass
    if not isinstance(text, str):
        return None

    # The stdlib ``re`` module has no recursive ``(?R)`` construct, so a regex
    # cannot match balanced braces here. ``raw_decode`` copes with nesting and
    # with braces that appear inside JSON strings.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            # This brace does not open a valid object; try the next one.
            start = text.find("{", start + 1)
        else:
            return obj
    return None
def _build_messages(image: Image.Image):
    """Build the two-turn chat payload: a system turn, then a user turn.

    The system turn carries SYSTEM_PROMPT as a single text chunk. The user
    turn carries the image chunk followed by USER_PROMPT as a text chunk.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_chunks = [
        {"type": "image", "image": image},
        {"type": "text", "text": USER_PROMPT},
    ]
    user_turn = {"role": "user", "content": user_chunks}
    return [system_turn, user_turn]
+# --------- ZERO-GPU LAZY LOADER ----------
def _load_components(**model_overrides):
    """Load the processor, model and tokenizer for MODEL_ID.

    Args:
        **model_overrides: Extra keyword arguments forwarded to
            ``AutoModelForCausalLM.from_pretrained`` on top of the shared ones.

    Returns:
        A ``(processor, model, tokenizer)`` tuple.

    Raises:
        Whatever the underlying ``from_pretrained`` calls raise.
    """
    processor = AutoProcessor.from_pretrained(
        MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        token=HF_TOKEN,
        device_map="auto",
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        trust_remote_code=True,
        **model_overrides,
    )
    # Prefer the tokenizer exposed by the processor; load one only if it has none.
    tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
        MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
    )
    return processor, model, tokenizer


@spaces.GPU
def _ensure_loaded() -> str:
    """Populate the module-level processor/tokenizer/model on first use.

    Returns one of:
        "already_loaded": model and processor were already set.
        "ok_quant": the default load succeeded.
        "ok_dequant": the default load failed with an error mentioning
            "compressed_tensors" and the retry succeeded.
        "fail": loading failed; details are stored in ``_last_load_error``
            and the three globals are reset to ``None``.

    NOTE(review): this assumes globals assigned inside a ``@spaces.GPU`` call
    remain visible to later calls - confirm against the ZeroGPU process model.
    """
    global _processor, _tokenizer, _model, _last_load_error
    if _model is not None and _processor is not None:
        return "already_loaded"

    try:
        # Sanity check: refuse checkpoints whose config class looks like CLIP.
        cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
        if "clip" in cfg.__class__.__name__.lower():
            raise RuntimeError(
                f"MODEL_ID '{MODEL_ID}' resolves to CLIP/encoder config; need a causal VLM checkpoint."
            )
        _processor, _model, _tokenizer = _load_components()
        _last_load_error = None
        return "ok_quant"
    except Exception as e:
        if "compressed_tensors" not in str(e):
            _last_load_error = f"{e}\n{traceback.format_exc()}"
            _processor = _tokenizer = _model = None
            return "fail"
        # Retry only for compressed-tensors failures.
        # NOTE(review): quantization_config=None is intended to force a
        # dequantized load - confirm it really overrides the checkpoint's
        # own quantization settings.
        try:
            _processor, _model, _tokenizer = _load_components(quantization_config=None)
            _last_load_error = None
            return "ok_dequant"
        except Exception as e2:
            _last_load_error = f"{e}\n\nFallback failed:\n{e2}\n{traceback.format_exc()}"
            _processor = _tokenizer = _model = None
            return "fail"
+# --------- INFERENCE ----------
+@spaces.GPU
+def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
+    status = _ensure_loaded()
+    if status == "fail":
+        return f"❌ Load error:\n{_last_load_error}", None, False
     if image is None:
         return "Please upload an image.", None, False
+    # Prompt assembly
+    if hasattr(_processor, "apply_chat_template"):
+        prompt = _processor.apply_chat_template(_build_messages(image), add_generation_prompt=True, tokenize=False)
     else:
         msgs = _build_messages(image)
         prompt = ""
         for m in msgs:
                 elif chunk["type"] == "image":
                     prompt += f"{role}: [IMAGE]\n"
+    inputs = _processor(text=prompt, images=image, return_tensors="pt").to(_model.device)
     gen_kwargs = dict(
         temperature=TEMP,
         max_new_tokens=MAX_NEW_TOKENS,
     )
+    # respect multiple eos ids if present
+    eos = getattr(_model.config, "eos_token_id", None)
+    if eos is not None:
+        gen_kwargs["eos_token_id"] = eos
+    # Try JSON-only output (if supported)
     try:
         gen_kwargs["response_format"] = {"type": "json_object"}
     except Exception:
         pass
     with torch.inference_mode():
+        out = _model.generate(**inputs, **gen_kwargs)
+    text = (_processor.decode(out[0], skip_special_tokens=True)
+            if hasattr(_processor, "decode")
+            else _tokenizer.decode(out[0], skip_special_tokens=True))
     if USER_PROMPT in text:
         text = text.split(USER_PROMPT)[-1].strip()
+    parsed = _json_extract(text)
+    if isinstance(parsed, dict):
         return json.dumps(parsed, indent=2), parsed, True
+    return text, None, False
+# Optional: quick warmup to validate loading on first worker
 @spaces.GPU(duration=60)
 def _warmup():
     """Run the lazy loader once and report its status string.

     Any exception escaping the loader is turned into a "warmup error: ..."
     string instead of being raised.
     """
     try:
         load_status = _ensure_loaded()
     except Exception as exc:
         return f"warmup error: {exc}"
     return load_status
+# --------- UI ----------
+with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (ZeroGPU)") as demo:
+    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · ZeroGPU)\nUpload an image to get **strict JSON** annotations.")
     with gr.Row():
         with gr.Column(scale=1):
             out_json = gr.JSON(label="Parsed JSON")
             ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)
     btn.click(annotate_image, inputs=[image], outputs=[out_text, out_json, ok_flag])
+# best-effort warmup at startup: _warmup() is called synchronously and its result is discarded
+try:
+    _ = _warmup()
+except Exception:
+    pass
 demo.queue(max_size=32).launch()