multimodalart (HF Staff) committed
Commit b46c8a5 · verified · 1 Parent(s): 4f95f14

Update app.py

Files changed (1):
  1. app.py +7 -12
app.py CHANGED
@@ -31,9 +31,9 @@ try:
 
     model = AutoModelForImageTextToText.from_pretrained(
         MODEL_ID,
-        torch_dtype="auto", # Uses torch.float16 if CUDA is available, else float32
-        # attn_implementation=attn_implementation, # Enable if flash_attention_2 is installed and compatible
-        device_map="auto", # Automatically uses CUDA if available
+        torch_dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
+        device_map="auto",
         trust_remote_code=True
     )
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
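
The new call pins torch.bfloat16 and FlashAttention-2 unconditionally, where the old code let Transformers pick the dtype and left the attention backend at its default. If the app ever needs to run somewhere without flash-attn installed, a guarded variant along the following lines would keep loading working. This is a sketch under that assumption, not code from this commit; the real MODEL_ID value from app.py is elided.

```python
# Sketch only (not part of this commit): prefer bf16 + FlashAttention-2,
# fall back to the previous "auto" behaviour if flash-attn is unavailable.
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "..."  # placeholder; app.py defines the actual model ID

try:
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
        trust_remote_code=True,
    )
except Exception:
    # flash-attn missing or unsupported on this hardware: let Transformers
    # choose the dtype and use its default attention implementation.
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
```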
@@ -44,7 +44,6 @@ except Exception as e:
         "This might be due to network issues, an incorrect model ID, or missing dependencies (like flash_attention_2 if enabled by default in some config).\n" \
         "Ensure you have a stable internet connection and the necessary libraries installed."
     print(load_error_message)
-    # Fallback for Gradio UI to show error
 
 # --- Helper functions from the model card (or adapted) ---
 
@@ -56,24 +55,20 @@ def get_localization_prompt(pil_image: Image.Image, instruction: str) -> List[dict]:
     """
     guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
 
-    # The Qwen2-VL processor expects a list of dictionaries for messages.
-    # For apply_chat_template, the image can be represented by its object if the template handles it,
-    # or a placeholder. The Qwen processor inserts an image tag like <img></img>.
     return [
         {
             "role": "user",
             "content": [
                 {
                     "type": "image",
-                    "image": pil_image, # Passing the PIL image object here, as in the model card.
-                                        # `apply_chat_template` will convert this to an image tag.
+                    "image": pil_image,
                 },
                 {"type": "text", "text": f"{guidelines}\n{instruction}"},
             ],
         }
     ]
 
-@spaces.GPU
+@spaces.GPU(duration=120)
 def run_inference_localization(
     current_model: AutoModelForImageTextToText,
     current_processor: AutoProcessor,
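
The removed comments described how `apply_chat_template` replaces the `"type": "image"` entry with the processor's image placeholder. For reference, a typical consumption pattern for a Qwen2-VL-style processor looks like the sketch below; it assumes the `model`, `processor`, and `get_localization_prompt` already defined in app.py are in scope, and the file name and `max_new_tokens` value are illustrative, not taken from this commit.

```python
# Sketch only: how the message list above is commonly fed to a
# Qwen2-VL-style processor. "screenshot.png" and max_new_tokens are
# placeholders; model/processor/get_localization_prompt come from app.py.
from PIL import Image

pil_image = Image.open("screenshot.png")
messages = get_localization_prompt(pil_image, "Click the search bar")

# apply_chat_template renders the chat prompt, swapping the image entry
# for the processor's image placeholder tokens.
text_prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# The processor then pairs the rendered prompt with the actual pixel data.
inputs = processor(
    text=[text_prompt], images=[pil_image], return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=128)
# Strip the prompt tokens and decode only the newly generated part.
answer = processor.batch_decode(
    generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]
print(answer)  # expected to look like: Click(x, y)
```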
@@ -242,7 +237,7 @@ if not model_loaded:
 else:
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
-        gr.Markdown(description)
+        # gr.Markdown(description)
 
         with gr.Row():
             with gr.Column(scale=1):
@@ -264,7 +259,7 @@ else:
                 inputs=[input_image_component, instruction_component],
                 outputs=[output_coords_component, output_image_component],
                 fn=predict_click_location,
-                cache_examples=False, # Re-run for dynamic examples if needed, but False is safer for resource limits
+                cache_examples="lazy",
             )
 
             gr.Markdown(article)
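
The last two changes interact: `@spaces.GPU(duration=120)` asks ZeroGPU for up to 120 seconds of GPU time per call instead of the default allocation, and `cache_examples="lazy"` makes Gradio list the examples immediately and run/cache each one only the first time a visitor selects it, rather than executing them all at startup. A minimal wiring sketch follows, with stand-in component labels, a placeholder example row, and stubbed function bodies rather than app.py's actual code.

```python
# Sketch only: how the ZeroGPU decorator and lazy example caching fit together.
# Function bodies, labels, and the example row are stand-ins, not app.py's code.
import gradio as gr
import spaces  # ZeroGPU helper package available on Hugging Face Spaces


@spaces.GPU(duration=120)  # request up to 120 s of GPU time per invocation
def run_inference_localization(image, instruction):
    # ... model/processor inference would run here ...
    return "Click(100, 200)"


def predict_click_location(image, instruction):
    coords = run_inference_localization(image, instruction)
    return coords, image  # coordinates text plus the (annotated) screenshot


with gr.Blocks() as demo:
    input_image_component = gr.Image(type="pil", label="Screenshot")
    instruction_component = gr.Textbox(label="Instruction")
    output_coords_component = gr.Textbox(label="Predicted click")
    output_image_component = gr.Image(label="Annotated screenshot")

    gr.Examples(
        examples=[["screenshot.png", "Click the search bar"]],  # placeholder row
        inputs=[input_image_component, instruction_component],
        outputs=[output_coords_component, output_image_component],
        fn=predict_click_location,
        # "lazy": examples appear immediately; each one is executed and cached
        # only the first time a user clicks it, instead of at app startup.
        cache_examples="lazy",
    )

demo.launch()
```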
 