ankandrew committed
Commit 2e3ddd8 · Parent(s): f0c7145
Update gradio demo
app.py CHANGED
@@ -1,11 +1,111 @@
+import subprocess
 import gradio as gr
 import spaces
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
 
 
-
-
-
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
 
+# Mapping user-friendly names to HF model IDs
+MODEL_NAMES = {
+    "Qwen2.5-VL-7B-Instruct-AWQ": "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
+    "Qwen2.5-VL-3B-Instruct-AWQ": "Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
+    "Qwen2.5-VL-7B-Instruct": "Qwen/Qwen2.5-VL-7B-Instruct",
+    "Qwen2.5-VL-3B-Instruct": "Qwen/Qwen2.5-VL-3B-Instruct",
+}
+
+
+@spaces.GPU(duration=300)
+def run_inference(model_key, input_type, text, image, video, fps):
+    """
+    Load the selected Qwen2.5-VL model and run inference on text, image, or video.
+    """
+    model_id = MODEL_NAMES[model_key]
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_id,
+        torch_dtype="auto",
+        device_map="auto"
+    )
+    processor = AutoProcessor.from_pretrained(model_id)
+
+    # Text-only inference
+    if input_type == "text":
+        inputs = processor(
+            text=text,
+            return_tensors="pt",
+            padding=True
+        )
+        inputs = inputs.to(model.device)
+        outputs = model.generate(**inputs, max_new_tokens=512)
+        return processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+    # Multimodal inference (image or video)
+    content = []
+    if input_type == "image" and image:
+        content.append({"type": "image", "image": image})
+    elif input_type == "video" and video:
+        # Ensure a file:// URI for local files
+        video_src = video if str(video).startswith("file://") else f"file://{video}"
+        content.append({"type": "video", "video": video_src, "fps": fps})
+    content.append({"type": "text", "text": text or ""})
+    msg = [{"role": "user", "content": content}]
+
+    # Prepare model inputs, forwarding video kwargs (e.g. fps)
+    text_prompt = processor.apply_chat_template(
+        msg, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
+    inputs = processor(
+        text=[text_prompt],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+        **video_kwargs
+    )
+    inputs = inputs.to(model.device)
+
+    gen_ids = model.generate(**inputs, max_new_tokens=512)
+    # Trim the prompt tokens from the generated ids before decoding
+    trimmed = [out_ids[len(inp_ids):] for inp_ids, out_ids in zip(inputs.input_ids, gen_ids)]
+    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
+
+
+# Build Gradio interface
+demo = gr.Blocks()
+with demo:
+    gr.Markdown("# Qwen2.5-VL Multimodal Demo")
+    model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model")
+    input_type = gr.Radio(["text", "image", "video"], label="Input Type")
+    text_input = gr.Textbox(lines=3, placeholder="Enter text...", visible=True)
+    image_input = gr.Image(type="filepath", visible=False)
+    video_input = gr.Video(visible=False)
+    fps_input = gr.Slider(minimum=0.1, maximum=30.0, step=0.1, value=2.0, label="FPS", visible=False)
+    output = gr.Textbox(label="Output")
+
+    # Show/hide inputs based on selection
+    def update_inputs(choice):
+        return (
+            gr.update(visible=(choice == "text")),
+            gr.update(visible=(choice == "image")),
+            gr.update(visible=(choice == "video")),
+            gr.update(visible=(choice == "video"))
+        )
+
+    input_type.change(update_inputs, input_type, [text_input, image_input, video_input, fps_input])
+    run_btn = gr.Button("Generate")
+    run_btn.click(
+        run_inference,
+        [model_select, input_type, text_input, image_input, video_input, fps_input],
+        output
+    )
+
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
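For reference, the image branch of run_inference reduces to the standalone sketch below. This is illustrative only, not part of the commit: it assumes transformers with Qwen2.5-VL support and qwen-vl-utils are installed, and sample.jpg is a hypothetical placeholder file.

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load one of the mapped model IDs the same way the demo does
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)

# The same message structure the demo builds for an image input
messages = [{"role": "user", "content": [
    {"type": "image", "image": "sample.jpg"},  # hypothetical local file
    {"type": "text", "text": "Describe this image."},
]}]

prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[prompt], images=image_inputs, videos=video_inputs,
    padding=True, return_tensors="pt"
).to(model.device)

gen_ids = model.generate(**inputs, max_new_tokens=512)
# Trim prompt tokens before decoding, exactly as run_inference does
trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, gen_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])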