Spaces:

smolagents
/

computer-agent

Paused

App Files Files Community

Miquel Farré committed on Mar 21

Commit

94436e0

1 Parent(s): 4ac90bc

adding terminal

Browse files

Files changed (2) hide show

app.py +127 -38
e2bqwen.py +39 -101

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ from textwrap import dedent
 import time
 from threading import Timer
 from e2bqwen import QwenVLAPIModel, E2BVisionAgent
 E2B_API_KEY = os.getenv("E2B_API_KEY")
@@ -242,9 +243,34 @@ function() {
             setTimeout(monitorForErrors, 3000);
         }
     });
 }
 """
 def upload_to_hf_and_remove(folder_path):
     repo_id = "open-agents/os-agent-logs"
@@ -367,8 +393,38 @@ def save_final_status(folder, status, details = None):
     a.write(json.dumps({"status":status,"details":str(details)}))
     a.close()
-def run_agent_task(task_input, request: gr.Request):
     session_hash = request.session_hash
     interaction_id = generate_interaction_id(request)
     desktop = get_or_create_sandbox(session_hash)
@@ -377,7 +433,7 @@ def run_agent_task(task_input, request: gr.Request):
     if not os.path.exists(data_dir):
         os.makedirs(data_dir)
     # Create the agent
     agent = E2BVisionAgent(
         model=model,
@@ -386,6 +442,7 @@ def run_agent_task(task_input, request: gr.Request):
         max_steps=200,
         verbosity_level=LogLevel.INFO,
         planning_interval=5,
     )
     # Construct the full task with instructions
@@ -404,28 +461,28 @@ def run_agent_task(task_input, request: gr.Request):
     """)
     try:
         # Run the agent
         result = agent.run(full_task)
-        save_final_status(data_dir, "completed", details = result)
-        return f"Task completed: {result}"
     except Exception as e:
         error_message = f"Error running agent: {str(e)} Details {traceback.format_exc()}"
         save_final_status(data_dir, "failed", details = error_message)
         print(error_message)
-        if 'Both endpoints failed' in error_message:
-            return "Error running agent - Model inference endpoints not ready. Try again later."
-        return "Error running agent"
     finally:
         upload_to_hf_and_remove(data_dir)
 # Create a Gradio app with Blocks
 with gr.Blocks(css=custom_css, js=custom_js) as demo:
-    #gr.HTML("""<h1 style="text-align: center">Personal Computer Assistant</h1>""")
-    # HTML output with simulated image and iframe - default to interactive
     html_output = gr.HTML(
         value=html_template.format(
             stream_url="",
@@ -435,13 +492,11 @@ with gr.Blocks(css=custom_css, js=custom_js) as demo:
         label="Output"
     )
     with gr.Row():
-        # Text input for task
         task_input = gr.Textbox(
             value="Find picture of cute puppies",
             label="Enter your command",
         )
-        # Examples
         gr.Examples(
             examples=[
                 "Check the commuting time between Bern and Zurich",
@@ -452,23 +507,49 @@ with gr.Blocks(css=custom_css, js=custom_js) as demo:
             label= "Example Tasks",
             examples_per_page=4
         )
-    # Results output
-    results_output = gr.Textbox(
-        label="Results",
-        interactive=False,
-        elem_id="results-output"
-    )
-    # Update button
     update_btn = gr.Button("Let's go!")
     # Function to set view-only mode
     def clear_and_set_view_only(task_input, request: gr.Request):
         # First clear the results, then set view-only mode
-        return "", update_html(False, request)
     # Function to set interactive mode
     def set_interactive_mode(request: gr.Request):
         return update_html(True, request)
@@ -484,34 +565,42 @@ with gr.Blocks(css=custom_css, js=custom_js) as demo:
             # This will keep the BSOD visible
             return gr.update()
     # Chain the events
-    # 1. Set view-only mode when button is clicked
     view_only_event = update_btn.click(
         fn=clear_and_set_view_only,
         inputs=[task_input],
-        outputs=[results_output, html_output]
     )
-    # 2. Then run the agent task
     task_result = view_only_event.then(
         fn=run_agent_task,
-        inputs=[task_input],
-        outputs=results_output
     )
-    # 3. Then check the result and conditionally set to interactive mode
     task_result.then(
         fn=check_and_set_interactive,
-        inputs=[results_output],  # Pass the result text to check
         outputs=html_output
     )
-    # Load the sandbox on app start with initial HTML
     demo.load(
-        fn=update_html,
-        inputs=[gr.Checkbox(value=True, visible=False)],  # Hidden checkbox with True value
-        outputs=html_output
     )
 # Launch the app
 if __name__ == "__main__":

 import time
 from threading import Timer
 from e2bqwen import QwenVLAPIModel, E2BVisionAgent
 E2B_API_KEY = os.getenv("E2B_API_KEY")
             setTimeout(monitorForErrors, 3000);
         }
     });
+    // Set up an interval to click the refresh button every 5 seconds
+    setInterval(function() {
+        const btn = document.getElementById('refresh-log-btn');
+        if (btn) btn.click();
+    }, 5000);
 }
 """
+def write_to_console_log(log_file_path, message):
+    """
+    Append a message, followed by a newline, to the specified log file.
+
+    Logging is best-effort: a failure is printed to stdout and reported
+    through the return value instead of being raised to the caller.
+
+    Parameters:
+        log_file_path (str): Path to the log file; None or "" disables logging
+        message (str): Message to append to the log file
+    Returns:
+        bool: True if the message was written, False otherwise
+    """
+    if not log_file_path:
+        return False
+    try:
+        # Explicit UTF-8 so writing non-ASCII text does not depend on the host locale
+        with open(log_file_path, 'a', encoding='utf-8') as log_file:
+            log_file.write(f"{message}\n")
+        return True
+    except Exception as e:
+        # Deliberately broad: a logging failure must never abort the caller
+        print(f"Error writing to log file: {str(e)}")
+        return False
 def upload_to_hf_and_remove(folder_path):
     repo_id = "open-agents/os-agent-logs"
     a.write(json.dumps({"status":status,"details":str(details)}))
     a.close()
+def get_log_file_path(session_hash):
+    """
+    Build the console log path for a session: TMP_DIR/<session_hash>/console.log.
+    Creates the session directory if it is missing; the log file itself is not created here.
+    """
+    log_dir = os.path.join(TMP_DIR, session_hash)
+    # exist_ok closes the check-then-create race when several callbacks
+    # resolve the same session's path at nearly the same time
+    os.makedirs(log_dir, exist_ok=True)
+    return os.path.join(log_dir, 'console.log')
+def initialize_session(interactive_mode, request: gr.Request):
+    """
+    Prepare the per-session console log and return the initial page state.
+
+    Reads the session hash from the request, resolves the session's log path
+    with get_log_file_path, and seeds the file with "Ready to go..." when it
+    does not exist yet.
+
+    Returns a tuple of (HTML produced by update_html for the requested mode,
+    session hash); demo.load routes these to html_output and session_hash_state.
+    """
     session_hash = request.session_hash
+    # Create session-specific log file
+    log_path = get_log_file_path(session_hash)
+    # Initialize log file if it doesn't exist (an existing log is left untouched)
+    if not os.path.exists(log_path):
+        with open(log_path, 'w') as f:
+            f.write(f"Ready to go...\n")
+    # Return HTML and session hash
+    return update_html(interactive_mode, request), session_hash
+# Function to read log content that gets the path from session hash
+def update_terminal_from_session(session_hash):
+    """Return the console text for a session, or a placeholder while no session hash is known."""
+    if session_hash:
+        return read_log_content(get_log_file_path(session_hash))
+    return "Waiting for session..."
+def run_agent_task(task_input, session_hash, request: gr.Request):
     interaction_id = generate_interaction_id(request)
     desktop = get_or_create_sandbox(session_hash)
     if not os.path.exists(data_dir):
         os.makedirs(data_dir)
+    log_file = get_log_file_path(session_hash)
     # Create the agent
     agent = E2BVisionAgent(
         model=model,
         max_steps=200,
         verbosity_level=LogLevel.INFO,
         planning_interval=5,
+        log_file = log_file
     )
     # Construct the full task with instructions
     """)
     try:
         # Run the agent
         result = agent.run(full_task)
+        save_final_status(data_dir, "completed", details = result)
+        return f"Task completed: {result}", gr.update(visible=True), gr.update(visible=False)
     except Exception as e:
         error_message = f"Error running agent: {str(e)} Details {traceback.format_exc()}"
         save_final_status(data_dir, "failed", details = error_message)
         print(error_message)
+        error_result = "Error running agent - Model inference endpoints not ready. Try again later." if 'Both endpoints failed' in error_message else "Error running agent"
+        return error_result, gr.update(visible=True), gr.update(visible=False)
     finally:
         upload_to_hf_and_remove(data_dir)
 # Create a Gradio app with Blocks
 with gr.Blocks(css=custom_css, js=custom_js) as demo:
+    #Storing session hash in a state variable
+    session_hash_state = gr.State(None)
     html_output = gr.HTML(
         value=html_template.format(
             stream_url="",
         label="Output"
     )
     with gr.Row():
         task_input = gr.Textbox(
             value="Find picture of cute puppies",
             label="Enter your command",
         )
         gr.Examples(
             examples=[
                 "Check the commuting time between Bern and Zurich",
             label= "Example Tasks",
             examples_per_page=4
         )
+    with gr.Group(visible=True) as terminal_container:
+        terminal = gr.Textbox(
+            value="Initializing...",
+            label='Console',
+            lines=5,
+            max_lines=10,
+            interactive=False
+        )
+        # Hidden refresh button
+        refresh_btn = gr.Button("Refresh", visible=False, elem_id="refresh-log-btn")
+    with gr.Group(visible=False) as results_container:
+        results_output = gr.Textbox(
+            label="Results",
+            interactive=False,
+            elem_id="results-output"
+        )
     update_btn = gr.Button("Let's go!")
+    def read_log_content(log_file, tail=4):
+        """
+        Return the last `tail` lines of a session's log file as one string.
+
+        A placeholder message is returned when no path is given or the file
+        does not exist yet. Any error while reading is returned as a
+        "Guru meditation" message rather than raised. A non-positive `tail`
+        yields an empty string.
+        """
+        from collections import deque
+        if not log_file:
+            return "Waiting for session..."
+        if not os.path.exists(log_file):
+            return "Waiting for machine from the future to boot..."
+        try:
+            # Undecodable bytes are replaced so one bad character cannot blank the console
+            with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
+                # A bounded deque keeps only the tail while streaming the file,
+                # so the periodic refresh never holds the whole log in memory
+                return "".join(deque(f, maxlen=max(tail, 0)))
+        except Exception as e:
+            return f"Guru meditation: {str(e)}"
     # Function to set view-only mode
     def clear_and_set_view_only(task_input, request: gr.Request):
         # First clear the results, then set view-only mode
+        return "", update_html(False, request), gr.update(visible=False), gr.update(visible=True)
     # Function to set interactive mode
     def set_interactive_mode(request: gr.Request):
         return update_html(True, request)
             # This will keep the BSOD visible
             return gr.update()
     # Chain the events
+    # 1. Set view-only mode when button is clicked and reset visibility
     view_only_event = update_btn.click(
         fn=clear_and_set_view_only,
         inputs=[task_input],
+        outputs=[results_output, html_output, results_container, terminal_container]
     )
+    # 2. Then run the agent task and update visibility
     task_result = view_only_event.then(
         fn=run_agent_task,
+        inputs=[task_input,session_hash_state],
+        outputs=[results_output, results_container, terminal_container]
     )
+    # 3. Set interactive mode when task completes successfully
     task_result.then(
         fn=check_and_set_interactive,
+        inputs=[results_output],
         outputs=html_output
     )
     demo.load(
+        fn=initialize_session,
+        inputs=[gr.Checkbox(value=True, visible=False)],
+        outputs=[html_output, session_hash_state]
     )
+    # Connect refresh button to update terminal
+    refresh_btn.click(
+        fn=update_terminal_from_session,
+        inputs=[session_hash_state],
+        outputs=[terminal]
+    )
 # Launch the app
 if __name__ == "__main__":

e2bqwen.py CHANGED Viewed

@@ -19,7 +19,26 @@ from smolagents.memory import ActionStep
 from smolagents.models import ChatMessage, MessageRole, Model
 from smolagents.monitoring import LogLevel
 class E2BVisionAgent(CodeAgent):
     """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
     def __init__(
@@ -31,14 +50,19 @@ class E2BVisionAgent(CodeAgent):
         max_steps: int = 200,
         verbosity_level: LogLevel = 4,
         planning_interval: int = 15,
         **kwargs
     ):
         self.desktop = desktop
         self.data_dir = data_dir
         self.planning_interval = planning_interval
         # Initialize Desktop
         self.width, self.height = self.desktop.get_screen_size()
         print(f"Screen size: {self.width}x{self.height}")
         # Set up temp directory
@@ -65,7 +89,9 @@ class E2BVisionAgent(CodeAgent):
         # Add default tools
         self._setup_desktop_tools()
         self.step_callbacks.append(self.take_snapshot_callback)
     def initialize_system_prompt(self):
@@ -156,6 +182,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
             """
             self.desktop.move_mouse(x, y)
             self.desktop.left_click()
             return f"Clicked at coordinates ({x}, {y})"
         @tool
@@ -168,6 +195,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
             """
             self.desktop.move_mouse(x, y)
             self.desktop.right_click()
             return f"Right-clicked at coordinates ({x}, {y})"
         @tool
@@ -180,6 +208,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
             """
             self.desktop.move_mouse(x, y)
             self.desktop.double_click()
             return f"Double-clicked at coordinates ({x}, {y})"
         @tool
@@ -191,6 +220,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
             return f"Moved mouse to coordinates ({x}, {y})"
         @tool
@@ -202,6 +232,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
                 delay_in_ms: Delay between keystrokes in milliseconds
             """
             self.desktop.write(text, delay_in_ms=delay_in_ms)
             return f"Typed text: '{text}'"
         @tool
@@ -214,6 +245,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
             if key == "enter":
                 key = "Return"
             self.desktop.press(key)
             return f"Pressed key: {key}"
         @tool
@@ -223,6 +255,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
             Args:
             """
             self.desktop.press(["alt", "left"])
             return "Went back one page"
         @tool
@@ -234,6 +267,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
             self.desktop.scroll(direction=direction, amount=amount)
             return f"Scrolled {direction} by {amount}"
         @tool
@@ -244,6 +278,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
                 seconds: Number of seconds to wait
             """
             time.sleep(seconds)
             return f"Waited for {seconds} seconds"
         @tool
@@ -260,6 +295,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
             self.desktop.open(url)
             # Give it time to load
             time.sleep(2)
             return f"Opened URL: {url}"
@@ -289,7 +325,6 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
         messages = [{"role": MessageRole.SYSTEM, "content": [{"type": "text", "text": self.system_prompt}]}]
         # Get the last memory step
         last_step = self.memory.steps[-1] if self.memory.steps else None
         for memory_step in self.memory.steps:
             if hasattr(memory_step, "task") and memory_step.task:
                 # Add task message if it exists
@@ -359,6 +394,8 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
     def take_snapshot_callback(self, memory_step: ActionStep, agent=None) -> None:
         """Callback that takes a screenshot + memory snapshot after a step completes"""
         current_step = memory_step.step_number
         print(f"Taking screenshot for step {current_step}")
         # Check if desktop is still running
@@ -407,105 +444,6 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
             print("E2B sandbox terminated")
-# class QwenVLAPIModel(Model):
-#     """Model wrapper for Qwen2.5VL API"""
-#     def __init__(
-#         self,
-#         model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
-#         provider: str = "hyperbolic"
-#     ):
-#         super().__init__()
-#         self.model_path = model_path
-#         self.model_id = model_path
-#         self.provider = provider
-#         self.client = InferenceClient(
-#             provider=self.provider,
-#         )
-#     def __call__(
-#         self,
-#         messages: List[Dict[str, Any]],
-#         stop_sequences: Optional[List[str]] = None,
-#         **kwargs
-#     ) -> ChatMessage:
-#         """Convert a list of messages to an API request and return the response"""
-#         # # Count images in messages - debug
-#         # image_count = 0
-#         # for msg in messages:
-#         #     if isinstance(msg.get("content"), list):
-#         #         for item in msg["content"]:
-#         #             if isinstance(item, dict) and item.get("type") == "image":
-#         #                 image_count += 1
-#         # print(f"QwenVLAPIModel received {len(messages)} messages with {image_count} images")
-#         # Format the messages for the API
-#         formatted_messages = []
-#         for msg in messages:
-#             role = msg["role"]
-#             if isinstance(msg["content"], list):
-#                 content = []
-#                 for item in msg["content"]:
-#                     if item["type"] == "text":
-#                         content.append({"type": "text", "text": item["text"]})
-#                     elif item["type"] == "image":
-#                         # Handle image path or direct image object
-#                         if isinstance(item["image"], str):
-#                             # Image is a path
-#                             with open(item["image"], "rb") as image_file:
-#                                 base64_image = base64.b64encode(image_file.read()).decode("utf-8")
-#                         else:
-#                             # Image is a PIL image or similar object
-#                             img_byte_arr = io.BytesIO()
-#                             item["image"].save(img_byte_arr, format="PNG")
-#                             base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
-#                         content.append({
-#                             "type": "image_url",
-#                             "image_url": {
-#                                 "url": f"data:image/png;base64,{base64_image}"
-#                             }
-#                         })
-#             else:
-#                 content = [{"type": "text", "text": msg["content"]}]
-#             formatted_messages.append({"role": role, "content": content})
-#         # Make the API request
-#         completion = self.client.chat.completions.create(
-#             model=self.model_path,
-#             messages=formatted_messages,
-#             max_tokens=kwargs.get("max_new_tokens", 512),
-#             temperature=kwargs.get("temperature", 0.7),
-#             top_p=kwargs.get("top_p", 0.9),
-#         )
-#         # Extract the response text
-#         output_text = completion.choices[0].message.content
-#         return ChatMessage(role=MessageRole.ASSISTANT, content=output_text)
-#     def to_dict(self) -> Dict[str, Any]:
-#         """Convert the model to a dictionary"""
-#         return {
-#             "class": self.__class__.__name__,
-#             "model_path": self.model_path,
-#             "provider": self.provider,
-#             # We don't save the API key for security reasons
-#         }
-#     @classmethod
-#     def from_dict(cls, data: Dict[str, Any]) -> "QwenVLAPIModel":
-#         """Create a model from a dictionary"""
-#         return cls(
-#             model_path=data.get("model_path", "Qwen/Qwen2.5-VL-72B-Instruct"),
-#             provider=data.get("provider", "hyperbolic"),
-#         )
 class QwenVLAPIModel(Model):
     """Model wrapper for Qwen2.5VL API with fallback mechanism"""

 from smolagents.models import ChatMessage, MessageRole, Model
 from smolagents.monitoring import LogLevel
+def write_to_console_log(log_file_path, message):
+    """
+    Append a message, followed by a newline, to the specified log file.
+
+    Logging is best-effort: a failure is printed to stdout and reported
+    through the return value instead of being raised to the caller.
+
+    Parameters:
+        log_file_path (str): Path to the log file; None or "" disables logging
+        message (str): Message to append to the log file
+    Returns:
+        bool: True if the message was written, False otherwise
+    """
+    if not log_file_path:
+        return False
+    try:
+        # Explicit UTF-8 so logged text (e.g. typed strings, URLs) with
+        # non-ASCII characters does not depend on the host locale
+        with open(log_file_path, 'a', encoding='utf-8') as log_file:
+            log_file.write(f"{message}\n")
+        return True
+    except Exception as e:
+        # Deliberately broad: a logging failure must never abort an agent action
+        print(f"Error writing to log file: {str(e)}")
+        return False
 class E2BVisionAgent(CodeAgent):
     """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
     def __init__(
         max_steps: int = 200,
         verbosity_level: LogLevel = 4,
         planning_interval: int = 15,
+        log_file = None,
         **kwargs
     ):
         self.desktop = desktop
         self.data_dir = data_dir
+        self.log_path = log_file
+        write_to_console_log(self.log_path, "Booting agent...")
         self.planning_interval = planning_interval
         # Initialize Desktop
         self.width, self.height = self.desktop.get_screen_size()
         print(f"Screen size: {self.width}x{self.height}")
+        write_to_console_log(self.log_path, f"Desktop resolution detected: {self.width}x{self.height}")
         # Set up temp directory
         # Add default tools
         self._setup_desktop_tools()
+        write_to_console_log(self.log_path, "Setting up agent tools...")
         self.step_callbacks.append(self.take_snapshot_callback)
+        write_to_console_log(self.log_path, "Studying an action plan... that will take a bit.")
     def initialize_system_prompt(self):
             """
             self.desktop.move_mouse(x, y)
             self.desktop.left_click()
+            write_to_console_log(self.log_path, f"Clicked at coordinates ({x}, {y})")
             return f"Clicked at coordinates ({x}, {y})"
         @tool
             """
             self.desktop.move_mouse(x, y)
             self.desktop.right_click()
+            write_to_console_log(self.log_path, f"Right-clicked at coordinates ({x}, {y})")
             return f"Right-clicked at coordinates ({x}, {y})"
         @tool
             """
             self.desktop.move_mouse(x, y)
             self.desktop.double_click()
+            write_to_console_log(self.log_path, f"Double-clicked at coordinates ({x}, {y})")
             return f"Double-clicked at coordinates ({x}, {y})"
         @tool
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
+            write_to_console_log(self.log_path, f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
         @tool
                 delay_in_ms: Delay between keystrokes in milliseconds
             """
             self.desktop.write(text, delay_in_ms=delay_in_ms)
+            write_to_console_log(self.log_path, f"Typed text: '{text}'")
             return f"Typed text: '{text}'"
         @tool
             if key == "enter":
                 key = "Return"
             self.desktop.press(key)
+            write_to_console_log(self.log_path, f"Pressed key: {key}")
             return f"Pressed key: {key}"
         @tool
             Args:
             """
             self.desktop.press(["alt", "left"])
+            write_to_console_log(self.log_path, "Went back one page")
             return "Went back one page"
         @tool
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
             self.desktop.scroll(direction=direction, amount=amount)
+            write_to_console_log(self.log_path, f"Scrolled {direction} by {amount}")
             return f"Scrolled {direction} by {amount}"
         @tool
                 seconds: Number of seconds to wait
             """
             time.sleep(seconds)
+            write_to_console_log(self.log_path, f"Waited for {seconds} seconds")
             return f"Waited for {seconds} seconds"
         @tool
             self.desktop.open(url)
             # Give it time to load
             time.sleep(2)
+            write_to_console_log(self.log_path, f"Opening URL: {url}")
             return f"Opened URL: {url}"
         messages = [{"role": MessageRole.SYSTEM, "content": [{"type": "text", "text": self.system_prompt}]}]
         # Get the last memory step
         last_step = self.memory.steps[-1] if self.memory.steps else None
         for memory_step in self.memory.steps:
             if hasattr(memory_step, "task") and memory_step.task:
                 # Add task message if it exists
     def take_snapshot_callback(self, memory_step: ActionStep, agent=None) -> None:
         """Callback that takes a screenshot + memory snapshot after a step completes"""
+        write_to_console_log(self.log_path, "Analyzing screen content...")
         current_step = memory_step.step_number
         print(f"Taking screenshot for step {current_step}")
         # Check if desktop is still running
             print("E2B sandbox terminated")
 class QwenVLAPIModel(Model):
     """Model wrapper for Qwen2.5VL API with fallback mechanism"""