Spaces:

davanstrien
/

vllm-index-card-extractor

Running on Zero

App Files Files Community

davanstrien HF Staff committed on Oct 6

Commit

bbe7feb

verified ·

1 Parent(s): 251c25d

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -84

app.py CHANGED Viewed

@@ -1,105 +1,103 @@
 import gradio as gr
-from PIL import Image as PILImage
 import os
 import json
 import spaces
-from typing import Optional
-from pydantic import BaseModel, Field
-import outlines
-from outlines.inputs import Chat, Image
-from transformers import Qwen3VLMoeForConditionalGeneration, AutoProcessor
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-# Define the metadata schema
-class CatalogCardMetadata(BaseModel):
-    """Structured metadata from a library catalog card."""
-    title: Optional[str] = Field(None, description="The main title or name on the card")
-    author: Optional[str] = Field(
-        None, description="Author, creator, or associated person/organization"
-    )
-    date: Optional[str] = Field(
-        None,
-        description="Any dates mentioned (publication, creation, or coverage dates)",
-    )
-    call_number: Optional[str] = Field(
-        None, description="Library classification or call number"
-    )
-    physical_description: Optional[str] = Field(
-        None, description="Details about the physical item (size, extent, format)"
-    )
-    subjects: Optional[list[str]] = Field(
-        None, description="Subject headings or topics"
-    )
-    notes: Optional[str] = Field(
-        None, description="Any additional notes or information"
-    )
-# Load model and processor with Outlines
-print("Loading Qwen3-VL-30B-A3B-Instruct model with Outlines...")
-hf_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
-    "Qwen/Qwen3-VL-30B-A3B-Instruct", torch_dtype="auto", device_map="auto"
 )
-hf_processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct")
-model = outlines.from_transformers(hf_model, hf_processor)
 print("Model loaded successfully!")
-EXTRACTION_PROMPT = """Extract all metadata from this library catalog card. Include title, author, dates, call number, physical description, subjects, and notes. If a field is not present, omit it."""
 @spaces.GPU
 def extract_metadata(image):
-    """Extract structured metadata from catalog card image using Outlines."""
     if image is None:
         return "Please upload an image."
     try:
         # Ensure image is PIL Image
-        print(f"DEBUG: Received image type: {type(image)}")
-        if not isinstance(image, PILImage.Image):
-            image = PILImage.open(image).convert("RGB")
-        print(f"DEBUG: After conversion, image type: {type(image)}")
-        print(f"DEBUG: Image format before setting: {image.format}")
-        # Set format (required by Outlines Image class)
-        if not image.format:
-            image.format = "PNG"
-        print(f"DEBUG: Image format after setting: {image.format}")
-        # Wrap in Outlines Image
-        outlines_image = Image(image)
-        print(f"DEBUG: Outlines Image created: {type(outlines_image)}")
-        print(f"DEBUG: Outlines Image.image type: {type(outlines_image.image)}")
-        # Create Chat prompt with Image (using simpler list format)
-        prompt = Chat(
-            messages=[
-                {
-                    "role": "user",
-                    "content": [EXTRACTION_PROMPT, outlines_image],
-                }
-            ]
         )
-        print(f"DEBUG: Chat prompt created successfully")
-        # Generate with structured output - guaranteed valid JSON
-        print(f"DEBUG: Starting generation...")
-        result = model(prompt, CatalogCardMetadata, max_new_tokens=512)
-        print(f"DEBUG: Generation complete, result type: {type(result)}")
-        # Parse and format (always valid JSON with Outlines)
-        metadata = CatalogCardMetadata.model_validate_json(result)
-        return json.dumps(metadata.model_dump(exclude_none=True), indent=2)
     except Exception as e:
-        import traceback
-        error_msg = f"Error during extraction: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
-        print(error_msg)
-        return error_msg
 # Create Gradio interface
 with gr.Blocks(title="Library Card Metadata Extractor") as demo:
@@ -118,14 +116,25 @@ with gr.Blocks(title="Library Card Metadata Extractor") as demo:
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 📤 Upload Catalog Card")
-            image_input = gr.Image(label="Library Catalog Card", type="pil")
             submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg")
         with gr.Column(scale=1):
             gr.Markdown("### 📋 Extracted Metadata (JSON)")
-            output = gr.Code(label="Metadata", language="json", lines=15)
-    submit_btn.click(fn=extract_metadata, inputs=image_input, outputs=output)
     gr.Markdown("---")
@@ -143,7 +152,7 @@ with gr.Blocks(title="Library Card Metadata Extractor") as demo:
         inputs=image_input,
         outputs=output,
         fn=extract_metadata,
-        cache_examples=False,
     )
     gr.Markdown("---")

 import gradio as gr
+from PIL import Image
 import os
+import torch
 import json
 import spaces
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from qwen_vl_utils import process_vision_info
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+# Load model and processor
+# Both are created once, at import time, as module-level globals that
+# extract_metadata() reads on every call. Weights are requested in bfloat16;
+# device placement is left to the library via device_map="auto".
+print("Loading Qwen3-VL-30B-A3B-Instruct model...")
+model = AutoModelForImageTextToText.from_pretrained(
+    "Qwen/Qwen3-VL-30B-A3B-Instruct",
+    torch_dtype=torch.bfloat16,
+    device_map="auto"
 )
+processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct")
 print("Model loaded successfully!")
+# Instruction sent alongside the card image. It names the JSON fields the
+# model should return and asks for null when a field is absent; nothing in
+# this file enforces that schema, so the reply may still be free-form text.
+EXTRACTION_PROMPT = """Extract all metadata from this library catalog card and return it as valid JSON with the following fields:
+- title: The main title or name on the card
+- author: Author, creator, or associated person/organization
+- date: Any dates mentioned (publication, creation, or coverage dates)
+- call_number: Library classification or call number
+- physical_description: Details about the physical item (size, extent, format)
+- subjects: Subject headings or topics
+- notes: Any additional notes or information
+Return ONLY the JSON object, nothing else. If a field is not present on the card, use null for that field."""
 @spaces.GPU
 def extract_metadata(image):
+    """Extract structured metadata from a catalog card image.
+
+    Args:
+        image: A PIL image, or a path/file object that ``Image.open`` accepts.
+            ``None`` means nothing was uploaded.
+
+    Returns:
+        str: Pretty-printed JSON when the model reply parses as JSON, the raw
+        model reply when it does not, or a short message when no image was
+        given or extraction failed.
+    """
     if image is None:
         return "Please upload an image."
     try:
         # Ensure image is PIL Image
+        if not isinstance(image, Image.Image):
+            image = Image.open(image).convert("RGB")
+        # Format messages for Qwen3-VL
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": EXTRACTION_PROMPT}
+                ]
+            }
+        ]
+        # Prepare inputs
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
         )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt"
+        )
+        inputs = inputs.to(model.device)
+        # Generate. do_sample=False selects greedy decoding, so no sampling
+        # parameters (temperature, top_p, ...) are passed: they would be
+        # ignored and only trigger a generation-config warning.
+        with torch.inference_mode():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=512,
+                do_sample=False
+            )
+        # Trim input tokens from output
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        # Decode output
+        output_text = processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
+        # Despite the prompt, the reply may arrive wrapped in a Markdown code
+        # fence (```json ... ```); peel it off before parsing.
+        candidate = output_text.strip()
+        if candidate.startswith("```"):
+            candidate = candidate.split("\n", 1)[-1].rsplit("```", 1)[0]
+        # Try to parse as JSON for pretty formatting
+        try:
+            json_data = json.loads(candidate)
+        except json.JSONDecodeError:
+            # If not valid JSON, return the untouched model reply
+            return output_text
+        # ensure_ascii=False keeps accented / non-Latin card text readable
+        # instead of escaping it to \uXXXX sequences.
+        return json.dumps(json_data, indent=2, ensure_ascii=False)
     except Exception as e:
+        # UI boundary: keep the message shown to the user short, but log the
+        # full traceback so failures can be diagnosed from the Space logs.
+        import traceback
+        traceback.print_exc()
+        return f"Error during extraction: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Library Card Metadata Extractor") as demo:
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 📤 Upload Catalog Card")
+            image_input = gr.Image(
+                label="Library Catalog Card",
+                type="pil"
+            )
             submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg")
         with gr.Column(scale=1):
             gr.Markdown("### 📋 Extracted Metadata (JSON)")
+            output = gr.Code(
+                label="Metadata",
+                language="json",
+                lines=15
+            )
+    submit_btn.click(
+        fn=extract_metadata,
+        inputs=image_input,
+        outputs=output
+    )
     gr.Markdown("---")
         inputs=image_input,
         outputs=output,
         fn=extract_metadata,
+        cache_examples=False
     )
     gr.Markdown("---")