import re
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import spaces
from PIL import Image, ImageDraw, ImageFont

from prompt import OS_SYSTEM_PROMPT
from smolvlm_inference import TransformersModel

MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"
print(f"Loading model and processor for {MODEL_ID}...") |
|
|
model = None |
|
|
processor = None |
|
|
model_loaded = False |
|
|
load_error_message = "" |
|
|
|
|
|
|
|
|
|
|
|
model = TransformersModel( |
|
|
model_id=MODEL_ID, |
|
|
to_device="cuda:0", |
|
|
) |
|
|
|
|
|
|
|
|
title = "Smol2Operator Demo" |
|
|
|
|
|
description = """ |
|
|
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them. |
|
|
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities. |
|
|
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face: |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT |
|
|
|
|
|
|
|
|
def get_navigation_prompt(task, image, step=1):
    """
    Build the chat prompt for the navigation task.

    - task: the task to complete
    - image: the current screenshot of the web page
    - step: the current step of the task (currently unused; the prompt always
      reports "Previous actions: None")
    """
    return [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": SYSTEM_PROMPT},
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"},
            ],
        },
    ]
def array_to_image(image_array: np.ndarray) -> Image.Image:
    """Convert the numpy array supplied by the Gradio image widget into a PIL image."""
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    return Image.fromarray(np.uint8(image_array))
def parse_actions_from_response(response: str) -> list[str]:
    """Parse action code blocks from the model response using a regex pattern."""
    pattern = r"<code>\n(.*?)\n</code>"
    matches = re.findall(pattern, response, re.DOTALL)
    return matches
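# Illustrative example of the expected response format (the surrounding text is
# model-dependent and the action shown is hypothetical):
#
#     response = 'I will click the search box.\n<code>\nclick(x=0.32, y=0.44)\n</code>'
#     parse_actions_from_response(response)  # -> ['click(x=0.32, y=0.44)']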
def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract coordinates from action code for localization actions."""
    localization_actions = []

    # Coordinates may be positional or keyword arguments, e.g. click(0.5, 0.5)
    # or click(x=0.5, y=0.5). The leading \b keeps the 'click' pattern from
    # also matching inside 'double_click'.
    patterns = {
        'click': r'\bclick\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)',
    }

    for action_type, pattern in patterns.items():
        for match in re.finditer(pattern, action_code):
            if action_type == 'drag':
                # A drag yields two markers: the start point and the end point.
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type,
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type,
                })
            else:
                # If only one coordinate was captured, reuse it for both axes.
                x_val = match.group(1)
                y_val = match.group(2) if match.group(2) else x_val
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type,
                    })

    return localization_actions
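# Illustrative example (the action string is hypothetical; coordinates are
# assumed to be normalized to [0, 1]):
#
#     extract_coordinates_from_action("click(x=0.32, y=0.44)")
#     # -> [{'type': 'click', 'x': 0.32, 'y': 0.44, 'action': 'click'}]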
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create a copy of the image with localization markers drawn on it."""
    if not coordinates:
        return None

    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)

    # Coordinates are normalized to [0, 1], so scale them by the image size.
    width, height = img_copy.size

    font = ImageFont.load_default()

    colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple',
    }

    for i, coord in enumerate(coordinates):
        pixel_x = int(coord['x'] * width)
        pixel_y = int(coord['y'] * height)
        color = colors.get(coord['type'], 'red')

        # Draw a filled circle at the action location.
        circle_radius = 8
        draw.ellipse([
            pixel_x - circle_radius, pixel_y - circle_radius,
            pixel_x + circle_radius, pixel_y + circle_radius,
        ], fill=color, outline='white', width=2)

        # Label each marker with its action type and normalized coordinates.
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)

        # For drags, connect the start and end points with an arrow.
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width)
            end_y = int(next_coord['y'] * height)

            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)

            # Draw the arrowhead as a small triangle at the end point.
            arrow_size = 10
            dx = end_x - pixel_x
            dy = end_y - pixel_y
            length = (dx**2 + dy**2) ** 0.5
            if length > 0:
                dx_norm = dx / length
                dy_norm = dy / length
                arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
                arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
                arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
                arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5
                draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')

    return img_copy
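# Illustrative example (hypothetical coordinates; any PIL image works):
#
#     img = Image.open("./assets/google.png")
#     annotated = create_localized_image(img, [{'type': 'click', 'x': 0.32, 'y': 0.44, 'action': 'click'}])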
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    input_pil_image = array_to_image(input_numpy_image)
    assert isinstance(input_pil_image, Image.Image)

    prompt = get_navigation_prompt(task, input_pil_image)

    if model is None:
        raise ValueError(f"Model not loaded. {load_error_message}".strip())

    navigation_str = model.generate(prompt, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Parse the action code blocks and collect any coordinates they reference.
    actions = parse_actions_from_response(navigation_str)
    all_coordinates = []
    for action_code in actions:
        all_coordinates.extend(extract_coordinates_from_action(action_code))

    # Draw localization markers on a copy of the input image, if any were found.
    localized_image = None
    if all_coordinates:
        localized_image = create_localized_image(input_pil_image, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")

    return navigation_str, localized_image
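# Illustrative local invocation (assumes a CUDA device and the example assets
# below; normally this function is only called through the Gradio UI):
#
#     text, annotated = navigate(
#         np.array(Image.open("./assets/google.png")),
#         "Search for the name of the current UK Prime Minister.",
#     )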
example_1_image_path: str = "./assets/google.png"
example_1_image = Image.open(example_1_image_path)
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image_path: str = "./assets/huggingface.png"
example_2_image = Image.open(example_2_image_path)
example_2_task = "Find the most trending model."
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    gr.Markdown(description)

    with gr.Row():
        input_image_component = gr.Image(label="UI Image", height=500)
    with gr.Row():
        with gr.Column():
            task_component = gr.Textbox(
                label="Task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Call Agent", variant="primary")

        with gr.Column():
            output_coords_component = gr.Textbox(label="Agent Output", lines=10)

    submit_button.click(
        navigate,
        [input_image_component, task_component],
        [output_coords_component, input_image_component],
    )

    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[input_image_component, task_component],
        outputs=[output_coords_component, input_image_component],
        fn=navigate,
        cache_examples=True,
    )

demo.queue(api_open=False)
demo.launch(debug=True, share=True)