Spaces:

smolagents
/

computer-agent

Paused

App Files Community

m-ric committed on Mar 27

Commit

70ee3f9

1 Parent(s): 52c8e1b

Switch to agent logging

Browse files

Files changed (2) hide show

app.py +78 -52
e2bqwen.py +14 -38

app.py CHANGED Viewed

@@ -35,25 +35,17 @@ model = QwenVLAPIModel(
 custom_css = """
-:root {
-    --body-background-fill: black!important;
-    --background-fill-secondary: #fad391!important;
-    --body-text-color: #f59e0b!important;
-    --block-text-color: #f59e0b!important;
-}
 .sandbox-container {
     position: relative;
     width: 910px;
     overflow: hidden;
     margin: auto;
 }
-.cyberpunk {
     height: 800px;
 }
-.minimal {
-    height: 700px;
-}
 .sandbox-frame {
     position: absolute;
     top: 0;
     left: 0;
@@ -61,9 +53,6 @@ custom_css = """
     height: 800px;
     pointer-events:none;
 }
-.minimal .sandbox-frame {
-    display: none;
-}
 .sandbox-iframe, .bsod-image {
     position: absolute;
@@ -72,19 +61,6 @@ custom_css = """
     border: 4px solid #444444;
     transform-origin: 0 0;
 }
-.cyberpunk .sandbox-iframe, .bsod-image {
-    /* top: 73px; */
-    top: 99px;
-    /* left: 74px; */
-    left: 110px;
-}
-.cyberpunk .sandbox-iframe {
-    transform: scale(0.535);
-    /* transform: scale(0.59); */
-}
-.minimal .sandbox-iframe {
-    transform: scale(0.65);
-}
 /* Colored label for task textbox */
 .primary-color-label label span {
@@ -100,11 +76,7 @@ custom_css = """
     flex-align:center;
     z-index: 100;
 }
-.cyberpunk .status-bar {
-    position: absolute;
-    bottom: 86px;
-    left: 355px;
-}
 .status-indicator {
     width: 15px;
     height: 15px;
@@ -117,9 +89,7 @@ custom_css = """
     padding: 0 10px;
     text-shadow: none;
 }
-.cyberpunk .status-text {
-    color: #fed244;
-}
 .status-interactive {
     background-color: #2ecc71;
     animation: blink 2s infinite;
@@ -186,7 +156,7 @@ sandbox_html_template = """
 <style>
 @import url('https://fonts.googleapis.com/css2?family=Oxanium:wght@200..800&display=swap');
 </style>
-<div class="sandbox-container {theme}">
     <div class="status-bar">
         <div class="status-indicator {status_class}"></div>
         <div class="status-text">{status_text}</div>
@@ -202,8 +172,9 @@ sandbox_html_template = """
 </div>
 """.replace("<<WIDTH>>", str(WIDTH+15)).replace("<<HEIGHT>>", str(HEIGHT+10))
-custom_js = """
-function() {
     // Function to check if sandbox is timing out
     const checkSandboxTimeout = function() {
         const timeElement = document.getElementById('sandbox-creation-time');
@@ -427,7 +398,7 @@ def get_or_create_sandbox(session_hash):
     return desktop
-def update_html(interactive_mode: bool, theme_checkbox: bool, request: gr.Request):
     session_hash = request.session_hash
     desktop = get_or_create_sandbox(session_hash)
     auth_key = desktop.stream.get_auth_key()
@@ -443,7 +414,6 @@ def update_html(interactive_mode: bool, theme_checkbox: bool, request: gr.Reques
     creation_time = SANDBOX_METADATA[session_hash]['created_at'] if session_hash in SANDBOX_METADATA else time.time()
     sandbox_html_content = sandbox_html_template.format(
-        theme="cyberpunk" if theme_checkbox else "minimal",
         stream_url=stream_url,
         status_class=status_class,
         status_text=status_text,
@@ -483,7 +453,7 @@ def initialize_session(interactive_mode, request: gr.Request):
         with open(log_path, 'w') as f:
             f.write(f"Ready to go...\n")
     # Return HTML and session hash
-    return update_html(interactive_mode, "cyberpunk", request), session_hash
 # Function to read log content that gets the path from session hash
@@ -564,7 +534,7 @@ class EnrichedGradioUI(GradioUI):
         finally:
             upload_to_hf_and_remove(data_dir)
-theme = gr.themes.Default(font=[gr.themes.GoogleFont("Oxanium"), "Futura", "sans-serif"], primary_hue="amber", secondary_hue="blue")
 # Create a Gradio app with Blocks
 with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as demo:
@@ -576,7 +546,6 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as de
     with gr.Row():
         sandbox_html = gr.HTML(
             value=sandbox_html_template.format(
-                theme="cyberpunk",
                 stream_url="",
                 status_class="status-interactive",
                 status_text="Interactive"
@@ -613,7 +582,64 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as de
                 )
             update_btn = gr.Button("Let's go!", variant="primary")
-            theme_checkbox = gr.Checkbox(label="Cyberpunk Mode", value=True)
     chatbot_display = gr.Chatbot(
         label="Agent's execution logs",
@@ -650,35 +676,35 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as de
     # Function to set view-only mode
     def clear_and_set_view_only(task_input, request: gr.Request):
         # First clear the results, then set view-only mode
-        return "", update_html(False, theme_checkbox, request), gr.update(visible=False)
     def set_interactive(request: gr.Request):
-        return update_html(True, theme_checkbox, request)
     is_interactive = gr.Checkbox(value=True, visible=False)
     # Chain the events
     view_only_event = update_btn.click(
         fn=clear_and_set_view_only,
-        inputs=[task_input, theme_checkbox],
         outputs=[results_output, sandbox_html, results_container]
     )
     view_only_event.then(agent_ui.interact_with_agent, [task_input, stored_messages, session_state, session_hash_state], [chatbot_display]).then(
         fn=set_interactive,
-        inputs=[theme_checkbox],
         outputs=sandbox_html
     )
-    theme_checkbox.change(
-        fn=update_html,
-        inputs=[is_interactive, theme_checkbox],
-        outputs=[sandbox_html]
-    )
     demo.load(
         fn=initialize_session,
         inputs=[is_interactive],
         outputs=[sandbox_html, session_hash_state],
     )
 # Launch the app

 custom_css = """
 .sandbox-container {
     position: relative;
     width: 910px;
     overflow: hidden;
     margin: auto;
 }
+.sandbox-container {
     height: 800px;
 }
 .sandbox-frame {
+    display: none;
     position: absolute;
     top: 0;
     left: 0;
     height: 800px;
     pointer-events:none;
 }
 .sandbox-iframe, .bsod-image {
     position: absolute;
     border: 4px solid #444444;
     transform-origin: 0 0;
 }
 /* Colored label for task textbox */
 .primary-color-label label span {
     flex-align:center;
     z-index: 100;
 }
 .status-indicator {
     width: 15px;
     height: 15px;
     padding: 0 10px;
     text-shadow: none;
 }
 .status-interactive {
     background-color: #2ecc71;
     animation: blink 2s infinite;
 <style>
 @import url('https://fonts.googleapis.com/css2?family=Oxanium:wght@200..800&display=swap');
 </style>
+<div class="sandbox-container">
     <div class="status-bar">
         <div class="status-indicator {status_class}"></div>
         <div class="status-text">{status_text}</div>
 </div>
 """.replace("<<WIDTH>>", str(WIDTH+15)).replace("<<HEIGHT>>", str(HEIGHT+10))
+custom_js = """function() {
+    document.body.classList.toggle('dark');
     // Function to check if sandbox is timing out
     const checkSandboxTimeout = function() {
         const timeElement = document.getElementById('sandbox-creation-time');
     return desktop
+def update_html(interactive_mode: bool, request: gr.Request):
     session_hash = request.session_hash
     desktop = get_or_create_sandbox(session_hash)
     auth_key = desktop.stream.get_auth_key()
     creation_time = SANDBOX_METADATA[session_hash]['created_at'] if session_hash in SANDBOX_METADATA else time.time()
     sandbox_html_content = sandbox_html_template.format(
         stream_url=stream_url,
         status_class=status_class,
         status_text=status_text,
         with open(log_path, 'w') as f:
             f.write(f"Ready to go...\n")
     # Return HTML and session hash
+    return update_html(interactive_mode, request), session_hash
 # Function to read log content that gets the path from session hash
         finally:
             upload_to_hf_and_remove(data_dir)
+theme = gr.themes.Default(font=["sans-serif"], primary_hue="amber", secondary_hue="blue")
 # Create a Gradio app with Blocks
 with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as demo:
     with gr.Row():
         sandbox_html = gr.HTML(
             value=sandbox_html_template.format(
                 stream_url="",
                 status_class="status-interactive",
                 status_text="Interactive"
                 )
             update_btn = gr.Button("Let's go!", variant="primary")
+            cyberpunk_toggle = gr.Checkbox(label="Go Cyberpunk!", value=False)
+            def apply_theme(cyberpunk_mode: bool):
+                if cyberpunk_mode:
+                    return """
+                        <style>
+                        :root {
+                            --body-background-fill: black!important;
+                            --background-fill-secondary: #fad391!important;
+                            --body-text-color: #f59e0b!important;
+                            --block-text-color: #f59e0b!important;
+                            --font: Oxanium;
+                        }
+                        .sandbox-frame {
+                            display: block!important;
+                        }
+                        .sandbox-iframe, .bsod-image {
+                            /* top: 73px; */
+                            top: 99px;
+                            /* left: 74px; */
+                            left: 110px;
+                        }
+                        .sandbox-iframe {
+                            transform: scale(0.535);
+                            /* transform: scale(0.59); */
+                        }
+                        .status-bar {
+                            position: absolute;
+                            bottom: 86px;
+                            left: 355px;
+                        }
+                        .status-text {
+                            color: #fed244;
+                        }
+                        </style>
+                    """
+                else:
+                    return """
+                        <style>
+                        .sandbox-container {
+                            height: 700px!important;
+                        }
+                        .sandbox-iframe {
+                            transform: scale(0.65);
+                        }
+                        </style>
+                    """
+            # Hidden HTML element to inject CSS dynamically
+            theme_styles = gr.HTML(apply_theme(False), visible=False)
+            cyberpunk_toggle.change(
+                fn=apply_theme,
+                inputs=[cyberpunk_toggle],
+                outputs=[theme_styles]
+            )
     chatbot_display = gr.Chatbot(
         label="Agent's execution logs",
     # Function to set view-only mode
     def clear_and_set_view_only(task_input, request: gr.Request):
         # First clear the results, then set view-only mode
+        return "", update_html(False, request), gr.update(visible=False)
     def set_interactive(request: gr.Request):
+        return update_html(True, request)
     is_interactive = gr.Checkbox(value=True, visible=False)
     # Chain the events
     view_only_event = update_btn.click(
         fn=clear_and_set_view_only,
+        inputs=[task_input],
         outputs=[results_output, sandbox_html, results_container]
     )
     view_only_event.then(agent_ui.interact_with_agent, [task_input, stored_messages, session_state, session_hash_state], [chatbot_display]).then(
         fn=set_interactive,
+        inputs=[],
         outputs=sandbox_html
     )
     demo.load(
         fn=initialize_session,
         inputs=[is_interactive],
         outputs=[sandbox_html, session_hash_state],
+        js="""
+() => {
+    document.body.classList.toggle('dark');
+}
+""",
     )
 # Launch the app

e2bqwen.py CHANGED Viewed

@@ -18,26 +18,6 @@ from smolagents import CodeAgent, tool, HfApiModel
 from smolagents.memory import ActionStep
 from smolagents.models import ChatMessage, MessageRole, Model
 from smolagents.monitoring import LogLevel
-def write_to_console_log(log_file_path, message):
-    """
-    Appends a message to the specified log file with a newline character.
-    Parameters:
-        log_file_path (str): Path to the log file
-        message (str): Message to append to the log file
-    """
-    if log_file_path is None:
-        return False
-    try:
-        # Open the file in append mode
-        with open(log_file_path, 'a') as log_file:
-            # Write the message followed by a newline
-            log_file.write(f"{message}\n")
-        return True
-    except Exception as e:
-        print(f"Error writing to log file: {str(e)}")
-        return False
 E2B_SYSTEM_PROMPT_TEMPLATE = """You are a desktop automation assistant that can control a remote desktop environment.
 On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
@@ -125,13 +105,10 @@ class E2BVisionAgent(CodeAgent):
         self.desktop = desktop
         self.data_dir = data_dir
         self.log_path = log_file
-        write_to_console_log(self.log_path, "Booting agent...")
         self.planning_interval = planning_interval
         # Initialize Desktop
         self.width, self.height = self.desktop.get_screen_size()
         print(f"Screen size: {self.width}x{self.height}")
-        write_to_console_log(self.log_path, f"Desktop resolution detected: {self.width}x{self.height}")
         # Set up temp directory
         os.makedirs(self.data_dir, exist_ok=True)
@@ -157,9 +134,9 @@ class E2BVisionAgent(CodeAgent):
         # Add default tools
         self._setup_desktop_tools()
-        write_to_console_log(self.log_path, "Setting up agent tools...")
         self.step_callbacks.append(self.take_snapshot_callback)
-        write_to_console_log(self.log_path, "Studying an action plan... that will take a bit.")
     def _setup_desktop_tools(self):
         """Register all desktop tools"""
@@ -173,7 +150,7 @@ class E2BVisionAgent(CodeAgent):
             """
             self.desktop.move_mouse(x, y)
             self.desktop.left_click()
-            write_to_console_log(self.log_path, f"Clicked at coordinates ({x}, {y})")
             return f"Clicked at coordinates ({x}, {y})"
         @tool
@@ -186,7 +163,7 @@ class E2BVisionAgent(CodeAgent):
             """
             self.desktop.move_mouse(x, y)
             self.desktop.right_click()
-            write_to_console_log(self.log_path, f"Right-clicked at coordinates ({x}, {y})")
             return f"Right-clicked at coordinates ({x}, {y})"
         @tool
@@ -199,7 +176,7 @@ class E2BVisionAgent(CodeAgent):
             """
             self.desktop.move_mouse(x, y)
             self.desktop.double_click()
-            write_to_console_log(self.log_path, f"Double-clicked at coordinates ({x}, {y})")
             return f"Double-clicked at coordinates ({x}, {y})"
         @tool
@@ -211,7 +188,7 @@ class E2BVisionAgent(CodeAgent):
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
-            write_to_console_log(self.log_path, f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
         @tool
@@ -223,7 +200,7 @@ class E2BVisionAgent(CodeAgent):
                 delay_in_ms: Delay between keystrokes in milliseconds
             """
             self.desktop.write(text, delay_in_ms=delay_in_ms)
-            write_to_console_log(self.log_path, f"Typed text: '{text}'")
             return f"Typed text: '{text}'"
         @tool
@@ -236,7 +213,7 @@ class E2BVisionAgent(CodeAgent):
             if key == "enter":
                 key = "Return"
             self.desktop.press(key)
-            write_to_console_log(self.log_path, f"Pressed key: {key}")
             return f"Pressed key: {key}"
         @tool
@@ -246,7 +223,7 @@ class E2BVisionAgent(CodeAgent):
             Args:
             """
             self.desktop.press(["alt", "left"])
-            write_to_console_log(self.log_path, "Went back one page")
             return "Went back one page"
         @tool
@@ -261,7 +238,7 @@ class E2BVisionAgent(CodeAgent):
             """
             self.desktop.drag([x1, y1], [x2, y2])
             message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
-            write_to_console_log(self.log_path, message)
             return message
         @tool
@@ -273,7 +250,7 @@ class E2BVisionAgent(CodeAgent):
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
             self.desktop.scroll(direction=direction, amount=amount)
-            write_to_console_log(self.log_path, f"Scrolled {direction} by {amount}")
             return f"Scrolled {direction} by {amount}"
         @tool
@@ -284,7 +261,7 @@ class E2BVisionAgent(CodeAgent):
                 seconds: Number of seconds to wait
             """
             time.sleep(seconds)
-            write_to_console_log(self.log_path, f"Waited for {seconds} seconds")
             return f"Waited for {seconds} seconds"
         @tool
@@ -301,7 +278,7 @@ class E2BVisionAgent(CodeAgent):
             self.desktop.open(url)
             # Give it time to load
             time.sleep(2)
-            write_to_console_log(self.log_path, f"Opening URL: {url}")
             return f"Opened URL: {url}"
@@ -330,8 +307,7 @@ class E2BVisionAgent(CodeAgent):
     def take_snapshot_callback(self, memory_step: ActionStep, agent=None) -> None:
         """Callback that takes a screenshot + memory snapshot after a step completes"""
-        print("FYI, here is the system prompt:", agent.system_prompt)
-        write_to_console_log(self.log_path, "Analyzing screen content...")
         current_step = memory_step.step_number
         print(f"Taking screenshot for step {current_step}")

 from smolagents.memory import ActionStep
 from smolagents.models import ChatMessage, MessageRole, Model
 from smolagents.monitoring import LogLevel
 E2B_SYSTEM_PROMPT_TEMPLATE = """You are a desktop automation assistant that can control a remote desktop environment.
 On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
         self.desktop = desktop
         self.data_dir = data_dir
         self.log_path = log_file
         self.planning_interval = planning_interval
         # Initialize Desktop
         self.width, self.height = self.desktop.get_screen_size()
         print(f"Screen size: {self.width}x{self.height}")
         # Set up temp directory
         os.makedirs(self.data_dir, exist_ok=True)
         # Add default tools
         self._setup_desktop_tools()
+        self.logger.log("Setting up agent tools...")
         self.step_callbacks.append(self.take_snapshot_callback)
+        self.logger.log("Studying an action plan... that will take a bit.")
     def _setup_desktop_tools(self):
         """Register all desktop tools"""
             """
             self.desktop.move_mouse(x, y)
             self.desktop.left_click()
+            self.logger.log(self.log_path, f"Clicked at coordinates ({x}, {y})")
             return f"Clicked at coordinates ({x}, {y})"
         @tool
             """
             self.desktop.move_mouse(x, y)
             self.desktop.right_click()
+            self.logger.log(self.log_path, f"Right-clicked at coordinates ({x}, {y})")
             return f"Right-clicked at coordinates ({x}, {y})"
         @tool
             """
             self.desktop.move_mouse(x, y)
             self.desktop.double_click()
+            self.logger.log(self.log_path, f"Double-clicked at coordinates ({x}, {y})")
             return f"Double-clicked at coordinates ({x}, {y})"
         @tool
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
+            self.logger.log(self.log_path, f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
         @tool
                 delay_in_ms: Delay between keystrokes in milliseconds
             """
             self.desktop.write(text, delay_in_ms=delay_in_ms)
+            self.logger.log(self.log_path, f"Typed text: '{text}'")
             return f"Typed text: '{text}'"
         @tool
             if key == "enter":
                 key = "Return"
             self.desktop.press(key)
+            self.logger.log(self.log_path, f"Pressed key: {key}")
             return f"Pressed key: {key}"
         @tool
             Args:
             """
             self.desktop.press(["alt", "left"])
+            self.logger.log(self.log_path, "Went back one page")
             return "Went back one page"
         @tool
             """
             self.desktop.drag([x1, y1], [x2, y2])
             message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
+            self.logger.log(self.log_path, message)
             return message
         @tool
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
             self.desktop.scroll(direction=direction, amount=amount)
+            self.logger.log(self.log_path, f"Scrolled {direction} by {amount}")
             return f"Scrolled {direction} by {amount}"
         @tool
                 seconds: Number of seconds to wait
             """
             time.sleep(seconds)
+            self.logger.log(self.log_path, f"Waited for {seconds} seconds")
             return f"Waited for {seconds} seconds"
         @tool
             self.desktop.open(url)
             # Give it time to load
             time.sleep(2)
+            self.logger.log(self.log_path, f"Opening URL: {url}")
             return f"Opened URL: {url}"
     def take_snapshot_callback(self, memory_step: ActionStep, agent=None) -> None:
         """Callback that takes a screenshot + memory snapshot after a step completes"""
+        self.logger.log(self.log_path, "Analyzing screen content...")
         current_step = memory_step.step_number
         print(f"Taking screenshot for step {current_step}")