Repair save + new prompts
- app.py (+9, -47)
- e2bqwen.py (+14, -5)
- eval.py (+11, -19)
app.py CHANGED

@@ -15,7 +15,7 @@ from dotenv import load_dotenv
 from smolagents import CodeAgent
 from smolagents.gradio_ui import GradioUI, stream_to_gradio
 
-from e2bqwen import QwenVLAPIModel, E2BVisionAgent
+from e2bqwen import QwenVLAPIModel, E2BVisionAgent, get_agent_summary_erase_images
 
 load_dotenv(override=True)
 
@@ -420,44 +420,13 @@ def generate_interaction_id(session_uuid)
     return f"{session_uuid}_{int(time.time())}"
 
 
-def chat_message_to_json(obj):
-    """Custom JSON serializer for ChatMessage and related objects"""
-    if hasattr(obj, "__dict__"):
-        # Create a copy of the object's __dict__ to avoid modifying the original
-        result = obj.__dict__.copy()
-
-        # Remove the 'raw' field which may contain non-serializable data
-        if "raw" in result:
-            del result["raw"]
-
-        # Process the content or tool_calls if they exist
-        if "content" in result and result["content"] is not None:
-            if hasattr(result["content"], "__dict__"):
-                result["content"] = chat_message_to_json(result["content"])
-
-        if "tool_calls" in result and result["tool_calls"] is not None:
-            result["tool_calls"] = [
-                chat_message_to_json(tc) for tc in result["tool_calls"]
-            ]
-
-        return result
-    elif isinstance(obj, (list, tuple)):
-        return [chat_message_to_json(item) for item in obj]
-    else:
-        return obj
-
-
 def save_final_status(folder, status: str, summary, error_message=None) -> None:
-    …
-            default=chat_message_to_json,
+    with open(os.path.join(folder, "metadata.json"), "w") as output_file:
+        output_file.write(
+            json.dumps(
+                {"status": status, "summary": summary, "error_message": error_message},
+            )
         )
-    )
-    output_file.close()
-
 
 def extract_browser_uuid(js_uuid):
     print(f"[BROWSER] Got browser UUID from JS: {js_uuid}")
@@ -494,13 +463,6 @@ def create_agent(data_dir, desktop)
     )
 
 
-def get_agent_summary_erase_images(agent):
-    for memory_step in agent.memory.steps:
-        if getattr(memory_step, "observations_images", None):
-            memory_step.observations_images = None
-    return agent.memory.get_succinct_steps()
-
-
 class EnrichedGradioUI(GradioUI):
     def log_user_message(self, text_input):
         import gradio as gr
@@ -563,9 +525,9 @@ class EnrichedGradioUI(GradioUI)
             yield stored_messages
 
            # THIS ERASES IMAGES FROM AGENT MEMORY, USE WITH CAUTION
-            …
+            if consent_storage:
+                summary = get_agent_summary_erase_images(session_state["agent"])
+                save_final_status(data_dir, "completed", summary = summary)
             yield stored_messages
 
         except Exception as e:
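
Taken together, the app.py changes move the image-erasing summary helper into e2bqwen.py and simplify how the final run status is persisted. The following is a minimal sketch of the resulting flow; the values for data_dir, consent_storage, and the hand-written summary are hypothetical stand-ins, not part of the diff:

import json
import os


def save_final_status(folder, status: str, summary, error_message=None) -> None:
    # Same shape as the new app.py version: plain json.dumps, no custom
    # serializer argument anymore.
    with open(os.path.join(folder, "metadata.json"), "w") as output_file:
        output_file.write(
            json.dumps({"status": status, "summary": summary, "error_message": error_message})
        )


# Hypothetical stand-ins for the Gradio session state (not part of the diff).
data_dir = "/tmp/example_run"
consent_storage = True
os.makedirs(data_dir, exist_ok=True)

if consent_storage:
    # In app.py the summary comes from get_agent_summary_erase_images(session_state["agent"]);
    # a hand-written message list stands in for it here.
    summary = [{"role": "assistant", "content": "Clicked the search box and typed the query."}]
    save_final_status(data_dir, "completed", summary=summary)
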
e2bqwen.py CHANGED

@@ -170,6 +170,15 @@ def draw_marker_on_image(image_copy, click_coordinates)
     return image_copy
 
 
+def get_agent_summary_erase_images(agent):
+    for memory_step in agent.memory.steps:
+        if hasattr(memory_step, "observations_images"):
+            memory_step.observations_images = None
+        if hasattr(memory_step, "task_images"):
+            memory_step.task_images = None
+    return agent.write_memory_to_messages()
+
+
 class E2BVisionAgent(CodeAgent):
     """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
 
@@ -220,7 +229,7 @@ class E2BVisionAgent(CodeAgent)
         self.step_callbacks.append(self.take_screenshot_callback)
 
     def initialize_system_prompt(self) -> str:
-        if …
+        if False:
         return """You are a desktop automation assistant that can control a remote desktop environment.
 You only have access to the following tools to interact with the desktop, no additional ones:
 - click(x, y): Performs a left-click at the specified coordinates
@@ -509,9 +518,9 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
 
         image_copy = image.copy()
 
-        …
+        if getattr(self, "click_coordinates", None):
+            print("DRAWING MARKER")
+            image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
 
         self.last_marked_screenshot = AgentImage(screenshot_path)
         print(f"Saved screenshot for step {current_step} to {screenshot_path}")
@@ -570,7 +579,7 @@ class QwenVLAPIModel(Model)
         super().__init__()
         self.model_id = model_id
         self.base_model = HfApiModel(
-            model_id="https://…
+            model_id="https://ahbeihft09ulicbf.us-east-1.aws.endpoints.huggingface.cloud",
             token=hf_token,
             max_tokens=4096,
         )
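
The helper that now lives in e2bqwen.py only needs the agent to expose memory.steps and write_memory_to_messages(); compared with the old app.py/eval.py copies it also clears task_images and returns write_memory_to_messages() instead of memory.get_succinct_steps(). A self-contained sketch of what it does to memory, where DummyStep, DummyMemory, and DummyAgent are illustrations rather than real smolagents classes:

class DummyStep:
    # Only the two attributes the helper touches are modelled.
    def __init__(self, observations_images=None, task_images=None):
        self.observations_images = observations_images
        self.task_images = task_images


class DummyMemory:
    def __init__(self, steps):
        self.steps = steps


class DummyAgent:
    def __init__(self, steps):
        self.memory = DummyMemory(steps)

    def write_memory_to_messages(self):
        # Stand-in for the smolagents method: report what remains after erasure.
        return [{"role": "assistant", "content": f"{len(self.memory.steps)} steps kept, images erased"}]


def get_agent_summary_erase_images(agent):
    # Same body as the helper added to e2bqwen.py.
    for memory_step in agent.memory.steps:
        if hasattr(memory_step, "observations_images"):
            memory_step.observations_images = None
        if hasattr(memory_step, "task_images"):
            memory_step.task_images = None
    return agent.write_memory_to_messages()


agent = DummyAgent([DummyStep(observations_images=["<screenshot bytes>"])])
print(get_agent_summary_erase_images(agent))
print(agent.memory.steps[0].observations_images)  # None: screenshots dropped from memory
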
eval.py CHANGED

@@ -9,7 +9,7 @@ from e2b_desktop import Sandbox
 from huggingface_hub import get_token
 from io import BytesIO
 from PIL import Image
-from e2bqwen import QwenVLAPIModel, E2BVisionAgent
+from e2bqwen import QwenVLAPIModel, E2BVisionAgent, get_agent_summary_erase_images
 
 from dotenv import load_dotenv
 
@@ -78,14 +78,6 @@ def create_agent(data_dir, desktop, max_steps: int)
     )
 
 
-def get_agent_summary_erase_images(agent):
-    """Get agent summary and erase images to save space"""
-    for memory_step in agent.memory.steps:
-        if getattr(memory_step, "observations_images", None):
-            memory_step.observations_images = None
-    return agent.memory.get_succinct_steps()
-
-
 def chat_message_to_json(obj):
     """Custom JSON serializer for ChatMessage and related objects"""
     if hasattr(obj, "__dict__"):
@@ -179,6 +171,7 @@ def run_example_once(example_name, example_text, run_index, example_dir, max_steps)
         )
         result = {"status": "failed", "run_dir": run_dir, "error": error_message}
     except Exception as e:
+        raise e
        error_message = f"Error setting up sandbox: {str(e)}"
        thread_safe_print(
            f" ✗ Example '{example_name}' run {run_index} failed: {error_message}"
@@ -195,6 +188,7 @@ def run_example_once(example_name, example_text, run_index, example_dir, max_steps)
 
     return result
 
+import traceback
 
 def run_example(example_name, example_text, num_runs, example_dir, max_steps):
     """Run a single example multiple times using threads for each run"""
@@ -217,8 +211,9 @@ def run_example(example_name, example_text, num_runs, example_dir, max_steps)
                 result = future.result()
                 results.append(result)
             except Exception as exc:
+                error_traceback = traceback.format_exc()
                 thread_safe_print(
-                    f" ✗ Run {run_index} for '{example_name}' generated an exception…
+                    f" ✗ Run {run_index} for '{example_name}' generated an exception:\n{error_traceback}"
                 )
                 results.append(
                     {"status": "error", "run_index": run_index, "error": str(exc)}
@@ -347,15 +342,12 @@ def main()
 
     # Examples from the original code
     examples = {
-        …
-        # "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
-        # "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
-        # "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
+        "puppies": "Find me pictures of cute puppies",
+        "gmaps": "Use Google Maps to find the Hugging Face HQ in Paris",
+        "wiki": "Go to Wikipedia and find what happend on April 4th",
+        "hello": "Write 'Hello World' in a text editor",
+        "commute": "Find out how long it takes to travel by train from Bern and Basel",
+        "hf_space": "Go to Hugging Face Spaces and then find the Space flux.1 schnell. Use the space to generate an image of a GPU",
     }
 
     # Create output directory if it doesn't exist
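
The eval.py logging change follows the usual pattern for surfacing full tracebacks from worker threads: future.result() re-raises the worker's exception in the collecting thread, where traceback.format_exc() can capture it. A runnable sketch of that pattern under assumed names (run_once and its failure condition are made up for illustration, standing in for run_example_once):

import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed


def run_once(run_index):
    # Hypothetical worker that fails on odd runs.
    if run_index % 2:
        raise RuntimeError(f"sandbox setup failed on run {run_index}")
    return {"status": "success", "run_index": run_index}


results = []
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = {executor.submit(run_once, i): i for i in range(4)}
    for future in as_completed(futures):
        run_index = futures[future]
        try:
            results.append(future.result())
        except Exception as exc:
            # future.result() re-raises the worker's exception here, so
            # traceback.format_exc() captures the full stack from inside run_once.
            error_traceback = traceback.format_exc()
            print(f" ✗ Run {run_index} generated an exception:\n{error_traceback}")
            results.append({"status": "error", "run_index": run_index, "error": str(exc)})

print(results)
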