Spaces:

prithivMLmods
/

Multimodal-VLM-v1.0

Running on Zero

App Files Files Community

prithivMLmods committed 24 days ago

Commit

da6c1e1

verified ·

1 Parent(s): 7d4c078

update app

Browse files

Files changed (1) hide show

app.py +36 -33

app.py CHANGED Viewed

@@ -355,7 +355,42 @@ def create_gradio_interface():
         gr.Markdown("Explore the capabilities of various Vision Language Models for tasks like OCR, VQA, and Object Detection.")
         with gr.Tabs():
-            # --- TAB 1: Document and General VLMs ---
             with gr.TabItem("📄 Document & General VLM"):
                 with gr.Row():
                     with gr.Column(scale=2):
@@ -390,39 +425,7 @@ def create_gradio_interface():
                     inputs=[image_input_doc, prompt_input_doc]
                 )
-            # --- TAB 2: Moondream3 Lab ---
-            with gr.TabItem("🌝 Moondream3"):
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
-                        md3_task_type = gr.Radio(
-                            choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
-                            label="Task Type", value="Object Detection"
-                        )
-                        md3_prompt_input = gr.Textbox(
-                            label="Prompt (object to detect/question to ask)",
-                            placeholder="e.g., 'car', 'person', 'What's in this image?'"
-                        )
-                        md3_max_objects = gr.Number(
-                            label="Max Objects (for Object Detection only)",
-                            value=10, minimum=1, maximum=50, step=1, visible=True
-                        )
-                        md3_generate_btn = gr.Button(value="Submit", variant="primary")
-                    with gr.Column(scale=1):
-                        md3_output_image = gr.Image(type="pil", label="Result", height=400)
-                        md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
-                        md3_output_time = gr.Markdown()
-                gr.Examples(
-                    examples=[
-                        ["md3/1.jpg", "Object Detection", "boats", 7],
-                        ["md3/2.jpg", "Point Detection", "children", 7],
-                        ["md3/3.png", "Caption", "", 5],
-                        ["md3/4.jpeg", "Visual Question Answering", "Analyze the GDP trend over the years.", 5],
-                    ],
-                    inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
-                    label="Click an example to populate inputs"
-                )
         process_btn.click(
             fn=process_document_stream,

         gr.Markdown("Explore the capabilities of various Vision Language Models for tasks like OCR, VQA, and Object Detection.")
         with gr.Tabs():
+            # --- TAB 1: Moondream3 Lab ---
+            with gr.TabItem("🌝 Moondream3"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
+                        md3_task_type = gr.Radio(
+                            choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
+                            label="Task Type", value="Object Detection"
+                        )
+                        md3_prompt_input = gr.Textbox(
+                            label="Prompt (object to detect/question to ask)",
+                            placeholder="e.g., 'car', 'person', 'What's in this image?'"
+                        )
+                        md3_max_objects = gr.Number(
+                            label="Max Objects (for Object Detection only)",
+                            value=10, minimum=1, maximum=50, step=1, visible=True
+                        )
+                        md3_generate_btn = gr.Button(value="Submit", variant="primary")
+                    with gr.Column(scale=1):
+                        md3_output_image = gr.Image(type="pil", label="Result", height=400)
+                        md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
+                        md3_output_time = gr.Markdown()
+                gr.Examples(
+                    examples=[
+                        ["md3/1.jpg", "Object Detection", "boats", 7],
+                        ["md3/2.jpg", "Point Detection", "children", 7],
+                        ["md3/3.png", "Caption", "", 5],
+                        ["md3/4.jpeg", "Visual Question Answering", "Analyze the GDP trend over the years.", 5],
+                    ],
+                    inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
+                    label="Click an example to populate inputs"
+                )
+            # --- TAB 2: Document and General VLMs ---
             with gr.TabItem("📄 Document & General VLM"):
                 with gr.Row():
                     with gr.Column(scale=2):
                     inputs=[image_input_doc, prompt_input_doc]
                 )
         process_btn.click(
             fn=process_document_stream,