Repair save + new prompts
- app.py (+9, -47)
- e2bqwen.py (+14, -5)
- eval.py (+11, -19)
app.py CHANGED

@@ -15,7 +15,7 @@ from dotenv import load_dotenv
 from smolagents import CodeAgent
 from smolagents.gradio_ui import GradioUI, stream_to_gradio
 
-from e2bqwen import QwenVLAPIModel, E2BVisionAgent
+from e2bqwen import QwenVLAPIModel, E2BVisionAgent, get_agent_summary_erase_images
 
 load_dotenv(override=True)
 
@@ -420,44 +420,13 @@ def generate_interaction_id(session_uuid)
     return f"{session_uuid}_{int(time.time())}"
 
 
-def chat_message_to_json(obj):
-    """Custom JSON serializer for ChatMessage and related objects"""
-    if hasattr(obj, "__dict__"):
-        # Create a copy of the object's __dict__ to avoid modifying the original
-        result = obj.__dict__.copy()
-
-        # Remove the 'raw' field which may contain non-serializable data
-        if "raw" in result:
-            del result["raw"]
-
-        # Process the content or tool_calls if they exist
-        if "content" in result and result["content"] is not None:
-            if hasattr(result["content"], "__dict__"):
-                result["content"] = chat_message_to_json(result["content"])
-
-        if "tool_calls" in result and result["tool_calls"] is not None:
-            result["tool_calls"] = [
-                chat_message_to_json(tc) for tc in result["tool_calls"]
-            ]
-
-        return result
-    elif isinstance(obj, (list, tuple)):
-        return [chat_message_to_json(item) for item in obj]
-    else:
-        return obj
-
-
 def save_final_status(folder, status: str, summary, error_message=None) -> None:
-    …
-            default=chat_message_to_json,
+    with open(os.path.join(folder, "metadata.json"), "w") as output_file:
+        output_file.write(
+            json.dumps(
+                {"status": status, "summary": summary, "error_message": error_message},
+            )
         )
-    )
-    output_file.close()
-
 
 def extract_browser_uuid(js_uuid):
     print(f"[BROWSER] Got browser UUID from JS: {js_uuid}")
@@ -494,13 +463,6 @@ def create_agent(data_dir, desktop)
     )
 
 
-def get_agent_summary_erase_images(agent):
-    for memory_step in agent.memory.steps:
-        if getattr(memory_step, "observations_images", None):
-            memory_step.observations_images = None
-    return agent.memory.get_succinct_steps()
-
-
 class EnrichedGradioUI(GradioUI):
     def log_user_message(self, text_input):
         import gradio as gr
@@ -563,9 +525,9 @@ class EnrichedGradioUI(GradioUI)
             yield stored_messages
 
            # THIS ERASES IMAGES FROM AGENT MEMORY, USE WITH CAUTION
-            …
+            if consent_storage:
+                summary = get_agent_summary_erase_images(session_state["agent"])
+                save_final_status(data_dir, "completed", summary = summary)
             yield stored_messages
 
         except Exception as e:
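
Taken together, the app.py changes move the image-erasing summary helper into e2bqwen.py and simplify how the final run status is persisted. The following is a minimal sketch of the resulting flow; the values for data_dir, consent_storage, and the hand-written summary are hypothetical stand-ins, not part of the diff:

import json
import os


def save_final_status(folder, status: str, summary, error_message=None) -> None:
    # Same shape as the new app.py version: plain json.dumps, no custom
    # serializer argument anymore.
    with open(os.path.join(folder, "metadata.json"), "w") as output_file:
        output_file.write(
            json.dumps({"status": status, "summary": summary, "error_message": error_message})
        )


# Hypothetical stand-ins for the Gradio session state (not part of the diff).
data_dir = "/tmp/example_run"
consent_storage = True
os.makedirs(data_dir, exist_ok=True)

if consent_storage:
    # In app.py the summary comes from get_agent_summary_erase_images(session_state["agent"]);
    # a hand-written message list stands in for it here.
    summary = [{"role": "assistant", "content": "Clicked the search box and typed the query."}]
    save_final_status(data_dir, "completed", summary=summary)
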
e2bqwen.py CHANGED

@@ -170,6 +170,15 @@ def draw_marker_on_image(image_copy, click_coordinates)
     return image_copy
 
 
+def get_agent_summary_erase_images(agent):
+    for memory_step in agent.memory.steps:
+        if hasattr(memory_step, "observations_images"):
+            memory_step.observations_images = None
+        if hasattr(memory_step, "task_images"):
+            memory_step.task_images = None
+    return agent.write_memory_to_messages()
+
+
 class E2BVisionAgent(CodeAgent):
     """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
 
@@ -220,7 +229,7 @@ class E2BVisionAgent(CodeAgent)
         self.step_callbacks.append(self.take_screenshot_callback)
 
     def initialize_system_prompt(self) -> str:
-        if …
+        if False:
         return """You are a desktop automation assistant that can control a remote desktop environment.
 You only have access to the following tools to interact with the desktop, no additional ones:
 - click(x, y): Performs a left-click at the specified coordinates
@@ -509,9 +518,9 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
 
         image_copy = image.copy()
 
-        …
+        if getattr(self, "click_coordinates", None):
+            print("DRAWING MARKER")
+            image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
 
         self.last_marked_screenshot = AgentImage(screenshot_path)
         print(f"Saved screenshot for step {current_step} to {screenshot_path}")
@@ -570,7 +579,7 @@ class QwenVLAPIModel(Model)
         super().__init__()
         self.model_id = model_id
         self.base_model = HfApiModel(
-            model_id="https://…
+            model_id="https://ahbeihft09ulicbf.us-east-1.aws.endpoints.huggingface.cloud",
             token=hf_token,
             max_tokens=4096,
         )
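
The helper that now lives in e2bqwen.py only needs the agent to expose memory.steps and write_memory_to_messages(); compared with the old app.py/eval.py copies it also clears task_images and returns write_memory_to_messages() instead of memory.get_succinct_steps(). A self-contained sketch of what it does to memory, where DummyStep, DummyMemory, and DummyAgent are illustrations rather than real smolagents classes:

class DummyStep:
    # Only the two attributes the helper touches are modelled.
    def __init__(self, observations_images=None, task_images=None):
        self.observations_images = observations_images
        self.task_images = task_images


class DummyMemory:
    def __init__(self, steps):
        self.steps = steps


class DummyAgent:
    def __init__(self, steps):
        self.memory = DummyMemory(steps)

    def write_memory_to_messages(self):
        # Stand-in for the smolagents method: report what remains after erasure.
        return [{"role": "assistant", "content": f"{len(self.memory.steps)} steps kept, images erased"}]


def get_agent_summary_erase_images(agent):
    # Same body as the helper added to e2bqwen.py.
    for memory_step in agent.memory.steps:
        if hasattr(memory_step, "observations_images"):
            memory_step.observations_images = None
        if hasattr(memory_step, "task_images"):
            memory_step.task_images = None
    return agent.write_memory_to_messages()


agent = DummyAgent([DummyStep(observations_images=["<screenshot bytes>"])])
print(get_agent_summary_erase_images(agent))
print(agent.memory.steps[0].observations_images)  # None: screenshots dropped from memory
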
eval.py CHANGED

@@ -9,7 +9,7 @@ from e2b_desktop import Sandbox
 from huggingface_hub import get_token
 from io import BytesIO
 from PIL import Image
-from e2bqwen import QwenVLAPIModel, E2BVisionAgent
+from e2bqwen import QwenVLAPIModel, E2BVisionAgent, get_agent_summary_erase_images
 
 from dotenv import load_dotenv
 
@@ -78,14 +78,6 @@ def create_agent(data_dir, desktop, max_steps: int)
     )
 
 
-def get_agent_summary_erase_images(agent):
-    """Get agent summary and erase images to save space"""
-    for memory_step in agent.memory.steps:
-        if getattr(memory_step, "observations_images", None):
-            memory_step.observations_images = None
-    return agent.memory.get_succinct_steps()
-
-
 def chat_message_to_json(obj):
     """Custom JSON serializer for ChatMessage and related objects"""
     if hasattr(obj, "__dict__"):
@@ -179,6 +171,7 @@ def run_example_once(example_name, example_text, run_index, example_dir, max_steps)
         )
         result = {"status": "failed", "run_dir": run_dir, "error": error_message}
     except Exception as e:
+        raise e
        error_message = f"Error setting up sandbox: {str(e)}"
        thread_safe_print(
            f" ✗ Example '{example_name}' run {run_index} failed: {error_message}"
@@ -195,6 +188,7 @@ def run_example_once(example_name, example_text, run_index, example_dir, max_steps)
 
     return result
 
+import traceback
 
 def run_example(example_name, example_text, num_runs, example_dir, max_steps):
     """Run a single example multiple times using threads for each run"""
@@ -217,8 +211,9 @@ def run_example(example_name, example_text, num_runs, example_dir, max_steps)
                 result = future.result()
                 results.append(result)
             except Exception as exc:
+                error_traceback = traceback.format_exc()
                 thread_safe_print(
-                    f" ✗ Run {run_index} for '{example_name}' generated an exception…
+                    f" ✗ Run {run_index} for '{example_name}' generated an exception:\n{error_traceback}"
                 )
                 results.append(
                     {"status": "error", "run_index": run_index, "error": str(exc)}
@@ -347,15 +342,12 @@ def main()
 
     # Examples from the original code
     examples = {
-        …
-        # "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
-        # "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
-        # "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
+        "puppies": "Find me pictures of cute puppies",
+        "gmaps": "Use Google Maps to find the Hugging Face HQ in Paris",
+        "wiki": "Go to Wikipedia and find what happend on April 4th",
+        "hello": "Write 'Hello World' in a text editor",
+        "commute": "Find out how long it takes to travel by train from Bern and Basel",
+        "hf_space": "Go to Hugging Face Spaces and then find the Space flux.1 schnell. Use the space to generate an image of a GPU",
     }
 
     # Create output directory if it doesn't exist
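
The eval.py logging change follows the usual pattern for surfacing full tracebacks from worker threads: future.result() re-raises the worker's exception in the collecting thread, where traceback.format_exc() can capture it. A runnable sketch of that pattern under assumed names (run_once and its failure condition are made up for illustration, standing in for run_example_once):

import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed


def run_once(run_index):
    # Hypothetical worker that fails on odd runs.
    if run_index % 2:
        raise RuntimeError(f"sandbox setup failed on run {run_index}")
    return {"status": "success", "run_index": run_index}


results = []
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = {executor.submit(run_once, i): i for i in range(4)}
    for future in as_completed(futures):
        run_index = futures[future]
        try:
            results.append(future.result())
        except Exception as exc:
            # future.result() re-raises the worker's exception here, so
            # traceback.format_exc() captures the full stack from inside run_once.
            error_traceback = traceback.format_exc()
            print(f" ✗ Run {run_index} generated an exception:\n{error_traceback}")
            results.append({"status": "error", "run_index": run_index, "error": str(exc)})

print(results)
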