Simplify agent code
e2bqwen.py  CHANGED  +3 -74
@@ -320,77 +320,6 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
         a.write(json.dumps(output_memory))
         a.close()
 
-    def write_memory_to_messages(self, summary_mode: Optional[bool] = False) -> List[Dict[str, Any]]:
-        """Convert memory to messages for the model"""
-        messages = [{"role": MessageRole.SYSTEM, "content": [{"type": "text", "text": self.system_prompt}]}]
-        # Get the last memory step
-        last_step = self.memory.steps[-1] if self.memory.steps else None
-        for memory_step in self.memory.steps:
-            if hasattr(memory_step, "task") and memory_step.task:
-                # Add task message if it exists
-                messages.append({
-                    "role": MessageRole.USER,
-                    "content": [{"type": "text", "text": memory_step.task}]
-                })
-                continue  # Skip to next step after adding task
-            if hasattr(memory_step, "model_output_message_plan") and memory_step.model_output_message_plan:
-                messages.append({
-                    "role": MessageRole.ASSISTANT,
-                    "content": [{"type": "text", "text": memory_step.model_output_message_plan.content, "agent_state": "plan"}]
-                })
-            # Process model output message if it exists
-            if hasattr(memory_step, "model_output") and memory_step.model_output:
-                messages.append({
-                    "role": MessageRole.ASSISTANT,
-                    "content": [{"type": "text", "text": memory_step.model_output}]
-                })
-
-            # Process observations and images
-            observation_content = []
-
-            # Add screenshot image paths if present
-            if memory_step is last_step and hasattr(memory_step, "observations_images") and memory_step.observations_images:
-                self.logger.log(f"Found {len(memory_step.observations_images)} image paths in step", level=LogLevel.DEBUG)
-                for img_path in memory_step.observations_images:
-                    if isinstance(img_path, str) and os.path.exists(img_path):
-                        observation_content.append({"type": "image", "image": img_path})
-                    elif isinstance(img_path, Image.Image):
-                        screenshot_path = f"screenshot_{int(time.time() * 1000)}.png"
-                        img_path.save(screenshot_path)
-                        observation_content.append({"type": "image", "image": screenshot_path})
-                    else:
-                        self.logger.log(f" - Skipping invalid image: {type(img_path)}", level=LogLevel.ERROR)
-
-            # Add text observations if any
-            if hasattr(memory_step, "observations") and memory_step.observations:
-                self.logger.log(" - Adding text observation", level=LogLevel.DEBUG)
-                observation_content.append({"type": "text", "text": f"Observation: {memory_step.observations}"})
-
-            # Add error if present and we didn't already add observations
-            if hasattr(memory_step, "error") and memory_step.error:
-                self.logger.log(" - Adding error message", level=LogLevel.DEBUG)
-                observation_content.append({"type": "text", "text": f"Error: {memory_step.error}"})
-
-            # Add a user message if we collected any content
-            if observation_content:
-                self.logger.log(f" - Adding user message with {len(observation_content)} content items", level=LogLevel.DEBUG)
-                messages.append({
-                    "role": MessageRole.USER,
-                    "content": observation_content
-                })
-
-        # # Check for images in final message list
-        # image_count = 0
-        # for msg in messages:
-        #     if isinstance(msg.get("content"), list):
-        #         for item in msg["content"]:
-        #             if isinstance(item, dict) and item.get("type") == "image":
-        #                 image_count += 1
-
-        # print(f"Created {len(messages)} messages with {image_count} image paths")
-
-        return messages
-
     def take_snapshot_callback(self, memory_step: ActionStep, agent=None) -> None:
         """Callback that takes a screenshot + memory snapshot after a step completes"""
@@ -529,7 +458,7 @@ class QwenVLAPIModel(Model):
                 img_byte_arr = io.BytesIO()
                 item["image"].save(img_byte_arr, format="PNG")
                 base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
-
+
                 content.append({
                     "type": "image_url",
                     "image_url": {
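
Note: the hunk above is truncated inside the `image_url` dict. For reference, here is a minimal self-contained sketch of the OpenAI-style multimodal payload this block appears to build; the data-URI `"url"` key follows the usual OpenAI-compatible convention and is an assumption, since the diff cuts off before it.

```python
# Hedged sketch: full shape of the content entry started in the hunk above.
# Only the lines visible in the diff are confirmed; the data-URI "url" key
# is the common OpenAI-compatible convention and is assumed here.
import base64
import io

from PIL import Image

img = Image.new("RGB", (8, 8))  # stand-in for item["image"]
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format="PNG")
base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

content = []
content.append({
    "type": "image_url",
    "image_url": {
        "url": f"data:image/png;base64,{base64_image}",  # assumed data-URI form
    },
})
```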
@@ -543,10 +472,10 @@ class QwenVLAPIModel(Model):
             formatted_messages.append({"role": role, "content": content})
 
         return formatted_messages
-
+
     def _call_hf_endpoint(self, formatted_messages, stop_sequences=None, **kwargs):
         """Call the Hugging Face OpenAI-compatible endpoint"""
-
+
         # Extract parameters with defaults
         max_tokens = kwargs.get("max_new_tokens", 512)
         temperature = kwargs.get("temperature", 0.7)
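
Since `_call_hf_endpoint` speaks the OpenAI-compatible protocol, the request it ultimately sends plausibly resembles the sketch below. Only `formatted_messages`, `stop_sequences`, and the `max_new_tokens`/`temperature` defaults appear in this diff; the endpoint URL, the `openai` client wiring, and the `"tgi"` placeholder model name are illustrative assumptions, not taken from the source.

```python
# Hedged sketch of an OpenAI-compatible call to a HF Inference Endpoint.
# Endpoint URL and client setup are hypothetical; parameter names mirror
# the defaults extracted above (max_new_tokens -> max_tokens, temperature).
from openai import OpenAI

client = OpenAI(
    base_url="https://example.endpoints.huggingface.cloud/v1/",  # hypothetical endpoint
    api_key="hf_xxx",  # your HF token
)

formatted_messages = [
    {"role": "user", "content": [{"type": "text", "text": "Describe the screen."}]}
]

response = client.chat.completions.create(
    model="tgi",                  # TGI-served endpoints commonly accept a placeholder name
    messages=formatted_messages,  # output of the message-formatting step above
    max_tokens=512,               # default mapped from kwargs.get("max_new_tokens", 512)
    temperature=0.7,              # default from kwargs.get("temperature", 0.7)
    stop=["<|im_end|>"],          # example stop_sequences value (assumed)
)
print(response.choices[0].message.content)
```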