R-Kentaren committed (verified)
Commit a326747 · Parent(s): ef1a9fc

Create app.py

Files changed (1)
  1. app.py +281 -0
app.py ADDED
@@ -0,0 +1,281 @@
+ import spaces
+
+ import re
+ from typing import Tuple, Optional
+
+ import gradio as gr
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ from smolvlm_inference import TransformersModel
+
+ from prompt import OS_SYSTEM_PROMPT
+
+ # --- Configuration ---
+ MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"
+
+ # --- Model Loading (load once at startup) ---
+ print(f"Loading model {MODEL_ID}...")
+ model = None
+ load_error_message = ""
+
+ try:
+     model = TransformersModel(
+         model_id=MODEL_ID,
+         to_device="cuda:0",
+     )
+ except Exception as e:
+     # Keep the app importable even if loading fails; navigate() re-checks.
+     load_error_message = str(e)
+     print(f"Failed to load model: {load_error_message}")
+
+
+ title = "Smol2Operator Demo"
+
+ description = """
+ This is a demo of the Smol2Operator model, designed to interact with graphical user interfaces (GUIs) and perform actions within them.
+ This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
+ This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. 🤗
+ """
+
+
+ SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT
+
+
+ def get_navigation_prompt(task, image, step=1):
+     """
+     Build the chat prompt for the navigation task.
+     - task: The task to complete
+     - image: The current screenshot of the web page
+     - step: The current step of the task (unused in this single-step demo)
+     """
+     return [
+         {
+             "role": "system",
+             "content": [
+                 {"type": "text", "text": SYSTEM_PROMPT},
+             ],
+         },
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "image",
+                     "image": image,
+                 },
+                 {
+                     "type": "text",
+                     "text": (
+                         "Please generate the next move according to the UI screenshot, "
+                         "instruction and previous actions.\n\n"
+                         f"Instruction: {task}\n\n"
+                         "Previous actions:\nNone"
+                     ),
+                 },
+             ],
+         },
+     ]
+
+
+ def array_to_image(image_array: np.ndarray) -> Image.Image:
+     """Convert a NumPy array from the Gradio image component to a PIL Image."""
+     if image_array is None:
+         raise ValueError("No image provided. Please upload an image before submitting.")
+     return Image.fromarray(np.uint8(image_array))
+
+
+ def parse_actions_from_response(response: str) -> list[str]:
+     """Parse action code blocks from the model response using a regex pattern."""
+     pattern = r"<code>\n(.*?)\n</code>"
+     return re.findall(pattern, response, re.DOTALL)
+
+
+ def extract_coordinates_from_action(action_code: str) -> list[dict]:
+     """Extract normalized coordinates from action code for localization actions."""
+     localization_actions = []
+
+     # Patterns for the different action types. The negative lookbehind on
+     # 'click' keeps it from also matching inside 'double_click(...)'.
+     patterns = {
+         'click': r'(?<!_)click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
+         'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
+         'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
+         'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)',
+     }
+
+     for action_type, pattern in patterns.items():
+         for match in re.finditer(pattern, action_code):
+             if action_type == 'drag':
+                 # Drag carries both a source and a destination coordinate
+                 from_x, from_y, to_x, to_y = match.groups()
+                 localization_actions.append({
+                     'type': 'drag_from',
+                     'x': float(from_x),
+                     'y': float(from_y),
+                     'action': action_type,
+                 })
+                 localization_actions.append({
+                     'type': 'drag_to',
+                     'x': float(to_x),
+                     'y': float(to_y),
+                     'action': action_type,
+                 })
+             else:
+                 # Single-coordinate actions; fall back to x when y is omitted
+                 x_val = match.group(1)
+                 y_val = match.group(2) if match.group(2) else x_val
+                 if x_val and y_val:
+                     localization_actions.append({
+                         'type': action_type,
+                         'x': float(x_val),
+                         'y': float(y_val),
+                         'action': action_type,
+                     })
+
+     return localization_actions
+
+
+ def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
+     """Create a copy of the image with localization markers drawn on it."""
+     if not coordinates:
+         return None
+
+     img_copy = original_image.copy()
+     draw = ImageDraw.Draw(img_copy)
+     width, height = img_copy.size
+
+     # Pillow's built-in bitmap font for the labels
+     font = ImageFont.load_default()
+
+     # Color scheme for the different action types
+     colors = {
+         'click': 'red',
+         'double_click': 'blue',
+         'move_mouse': 'green',
+         'drag_from': 'orange',
+         'drag_to': 'purple',
+     }
+
+     for i, coord in enumerate(coordinates):
+         # Convert normalized coordinates to pixel coordinates
+         pixel_x = int(coord['x'] * width)
+         pixel_y = int(coord['y'] * height)
+         color = colors.get(coord['type'], 'red')
+
+         # Draw a circle at the coordinate
+         circle_radius = 8
+         draw.ellipse([
+             pixel_x - circle_radius, pixel_y - circle_radius,
+             pixel_x + circle_radius, pixel_y + circle_radius,
+         ], fill=color, outline='white', width=2)
+
+         # Add a text label next to the marker
+         label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
+         draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
+
+         # For drag actions, draw an arrow from source to destination
+         if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
+             next_coord = coordinates[i + 1]
+             end_x = int(next_coord['x'] * width)
+             end_y = int(next_coord['y'] * height)
+
+             # Arrow line
+             draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
+
+             # Arrowhead, oriented along the drag direction
+             arrow_size = 10
+             dx = end_x - pixel_x
+             dy = end_y - pixel_y
+             length = (dx**2 + dy**2) ** 0.5
+             if length > 0:
+                 dx_norm = dx / length
+                 dy_norm = dy / length
+                 arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
+                 arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
+                 arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
+                 arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5
+                 draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')
+
+     return img_copy
+
+
+ # --- Gradio processing function ---
+ @spaces.GPU
+ def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
+     if model is None:
+         raise ValueError(f"Model not loaded: {load_error_message}")
+
+     input_pil_image = array_to_image(input_numpy_image)
+     prompt = get_navigation_prompt(task, input_pil_image)
+
+     navigation_str = model.generate(prompt, max_new_tokens=500)
+     print(f"Navigation string: {navigation_str}")
+     navigation_str = navigation_str.strip()
+
+     # Parse action code blocks from the response
+     actions = parse_actions_from_response(navigation_str)
+
+     # Extract coordinates from every parsed action
+     all_coordinates = []
+     for action_code in actions:
+         all_coordinates.extend(extract_coordinates_from_action(action_code))
+
+     # Draw localization markers if any coordinates were found; otherwise
+     # return the original image so the uploaded preview is not cleared
+     localized_image = input_pil_image
+     if all_coordinates:
+         localized_image = create_localized_image(input_pil_image, all_coordinates)
+         print(f"Found {len(all_coordinates)} localization actions")
+
+     return navigation_str, localized_image
+
+
+ # --- Load Example Data ---
+ example_1_path = "./assets/google.png"
+ example_1_image = Image.open(example_1_path)
+ example_1_task = "Search for the name of the current UK Prime Minister."
+
+ example_2_path = "./assets/huggingface.png"
+ example_2_image = Image.open(example_2_path)
+ example_2_task = "Find the most trending model."
+
+
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
+     # gr.Markdown(description)
+
+     with gr.Row():
+         input_image_component = gr.Image(label="UI Image", height=500)
+     with gr.Row():
+         with gr.Column():
+             task_component = gr.Textbox(
+                 label="Task",
+                 placeholder="e.g., Search for the name of the current UK Prime Minister.",
+                 info="Type the task you want the model to complete.",
+             )
+             submit_button = gr.Button("Call Agent", variant="primary")
+
+         with gr.Column():
+             output_coords_component = gr.Textbox(label="Agent Output", lines=10)
+
+     submit_button.click(
+         navigate,
+         inputs=[input_image_component, task_component],
+         outputs=[output_coords_component, input_image_component],
+     )
+
+     gr.Examples(
+         examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
+         inputs=[input_image_component, task_component],
+         outputs=[output_coords_component, input_image_component],
+         fn=navigate,
+         cache_examples=True,
+     )
+
+ demo.queue(api_open=False)
+ demo.launch(debug=True, share=True)
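
For reference, here is a minimal sketch (not part of the commit) of what the parsing helpers do with a model response. The "<code>" wrapper comes from the regex in parse_actions_from_response; the exact action strings are assumptions inferred from the patterns in extract_coordinates_from_action, and coordinates are normalized to [0, 1] before being scaled to pixels in create_localized_image.

# Hypothetical usage; assumes the helpers defined in app.py above are in scope.
# The response format below is an assumption based on the parsing regexes.
sample_response = (
    "I will click the search bar, then drag the slider.\n"
    "<code>\n"
    "click(x=0.52, y=0.31)\n"
    "drag([0.20, 0.40], [0.80, 0.40])\n"
    "</code>"
)

actions = parse_actions_from_response(sample_response)
# actions == ['click(x=0.52, y=0.31)\ndrag([0.20, 0.40], [0.80, 0.40])']

coordinates = extract_coordinates_from_action(actions[0])
# coordinates == [
#     {'type': 'click',     'x': 0.52, 'y': 0.31, 'action': 'click'},
#     {'type': 'drag_from', 'x': 0.20, 'y': 0.40, 'action': 'drag'},
#     {'type': 'drag_to',   'x': 0.80, 'y': 0.40, 'action': 'drag'},
# ]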