Spaces:

smolagents
/

computer-use-agent

Running

App Files Files Community

A-Mahla committed on Nov 6

Commit

1e72ba8

1 Parent(s): 51747e6

NEW agent (#10)

Browse files

Files changed (8) hide show

cua2-core/src/cua2_core/models/models.py +6 -2
cua2-core/src/cua2_core/services/agent_utils/desktop_agent.py +58 -2
cua2-core/src/cua2_core/services/agent_utils/prompt.py +36 -5
cua2-core/src/cua2_core/services/instruction_service.py +14 -12
cua2-core/src/cua2_core/services/sandbox_service.py +29 -0
cua2-core/src/cua2_core/websocket/websocket_manager.py +1 -1
cua2-front/src/components/WelcomeScreen.tsx +8 -8
cua2-front/src/stores/agentStore.ts +2 -2

cua2-core/src/cua2_core/models/models.py CHANGED Viewed

@@ -77,7 +77,11 @@ class AgentAction(FunctionCall):
             seconds = args.get("seconds") or args.get("arg_0")
             return f"Wait for {seconds} seconds"
-        elif action_type == "open":
             url = args.get("url") or args.get("arg_0")
             return f"Open: {url}"
@@ -100,7 +104,7 @@ class AgentStep(BaseModel):
     step_evaluation: Literal["like", "dislike", "neutral"]
     error: Optional[str] = None
     thought: Optional[str] = None
-    actions: Optional[list[AgentAction]] = None
     @field_serializer("actions")
     def serialize_actions(self, actions: list[AgentAction], _info):

             seconds = args.get("seconds") or args.get("arg_0")
             return f"Wait for {seconds} seconds"
+        elif action_type == "open_url":
+            url = args.get("url") or args.get("arg_0")
+            return f"Open: {url}"
+        elif action_type == "launch":
             url = args.get("url") or args.get("arg_0")
             return f"Open: {url}"
     step_evaluation: Literal["like", "dislike", "neutral"]
     error: Optional[str] = None
     thought: Optional[str] = None
+    actions: list[AgentAction] = []
     @field_serializer("actions")
     def serialize_actions(self, actions: list[AgentAction], _info):

cua2-core/src/cua2_core/services/agent_utils/desktop_agent.py CHANGED Viewed

@@ -24,11 +24,13 @@ class E2BVisionAgent(CodeAgent):
         verbosity_level: LogLevel = 2,
         planning_interval: int | None = None,
         use_v1_prompt: bool = False,
         **kwargs,
     ):
         self.desktop = desktop
         self.data_dir = data_dir
         self.planning_interval = planning_interval
         # Initialize Desktop
         self.width, self.height = self.desktop.get_screen_size()
         print(f"Screen size: {self.width}x{self.height}")
@@ -60,6 +62,27 @@ class E2BVisionAgent(CodeAgent):
         self.logger.log("Setting up agent tools...")
         self._setup_desktop_tools()
     def _setup_desktop_tools(self):
         """Register all desktop tools"""
@@ -71,6 +94,9 @@ class E2BVisionAgent(CodeAgent):
                 x: The x coordinate (horizontal position)
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
             self.desktop.left_click()
             self.click_coordinates = [x, y]
@@ -85,6 +111,9 @@ class E2BVisionAgent(CodeAgent):
                 x: The x coordinate (horizontal position)
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
             self.desktop.right_click()
             self.click_coordinates = [x, y]
@@ -99,6 +128,9 @@ class E2BVisionAgent(CodeAgent):
                 x: The x coordinate (horizontal position)
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
             self.desktop.double_click()
             self.click_coordinates = [x, y]
@@ -113,6 +145,9 @@ class E2BVisionAgent(CodeAgent):
                 x: The x coordinate (horizontal position)
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
             self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
@@ -167,6 +202,11 @@ class E2BVisionAgent(CodeAgent):
                 x2: end x coordinate
                 y2: end y coordinate
             """
             self.desktop.drag([x1, y1], [x2, y2])
             message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
             self.logger.log(message)
@@ -182,6 +222,9 @@ class E2BVisionAgent(CodeAgent):
                 direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
             self.desktop.move_mouse(x, y)
             self.desktop.scroll(direction=direction, amount=amount)
             message = f"Scrolled {direction} by {amount}"
@@ -200,18 +243,30 @@ class E2BVisionAgent(CodeAgent):
             return f"Waited for {seconds} seconds"
         @tool
-        def open(url: str) -> str:
             """
             Directly opens a browser with the specified url: use this at start of web searches rather than trying to click the browser.
             Args:
                 url: The URL to open
             """
             self.desktop.open(url)
             time.sleep(2)
             self.logger.log(f"Opening URL: {url}")
             return f"Opened URL: {url}"
         # Register the tools
         self.tools["click"] = click
         self.tools["right_click"] = right_click
@@ -221,7 +276,8 @@ class E2BVisionAgent(CodeAgent):
         self.tools["press"] = press
         self.tools["scroll"] = scroll
         self.tools["wait"] = wait
-        self.tools["open"] = open
         self.tools["go_back"] = go_back
         self.tools["drag"] = drag
         self.tools["scroll"] = scroll

         verbosity_level: LogLevel = 2,
         planning_interval: int | None = None,
         use_v1_prompt: bool = False,
+        qwen_normalization: bool = True,
         **kwargs,
     ):
         self.desktop = desktop
         self.data_dir = data_dir
         self.planning_interval = planning_interval
+        self.qwen_normalization = qwen_normalization
         # Initialize Desktop
         self.width, self.height = self.desktop.get_screen_size()
         print(f"Screen size: {self.width}x{self.height}")
         self.logger.log("Setting up agent tools...")
         self._setup_desktop_tools()
+    def _qwen_unnormalization(self, arguments: dict[str, int]) -> dict[str, int]:
+        """
+        Unnormalize coordinates from 0-999 range to actual screen pixel coordinates.
+        Coordinates are identified by keys containing 'x' or 'y'.
+        Args:
+            arguments: Dictionary with coordinate parameters (keys containing 'x' or 'y')
+        Returns:
+            Dictionary with unnormalized pixel coordinates
+        """
+        unnormalized: dict[str, int] = {}
+        for key, value in arguments.items():
+            if "x" in key.lower() and "y" not in key.lower():
+                unnormalized[key] = int((value / 1000) * self.width)
+            elif "y" in key.lower():
+                unnormalized[key] = int((value / 1000) * self.height)
+            else:
+                unnormalized[key] = value
+        return unnormalized
     def _setup_desktop_tools(self):
         """Register all desktop tools"""
                 x: The x coordinate (horizontal position)
                 y: The y coordinate (vertical position)
             """
+            if self.qwen_normalization:
+                coords = self._qwen_unnormalization({"x": x, "y": y})
+                x, y = coords["x"], coords["y"]
             self.desktop.move_mouse(x, y)
             self.desktop.left_click()
             self.click_coordinates = [x, y]
                 x: The x coordinate (horizontal position)
                 y: The y coordinate (vertical position)
             """
+            if self.qwen_normalization:
+                coords = self._qwen_unnormalization({"x": x, "y": y})
+                x, y = coords["x"], coords["y"]
             self.desktop.move_mouse(x, y)
             self.desktop.right_click()
             self.click_coordinates = [x, y]
                 x: The x coordinate (horizontal position)
                 y: The y coordinate (vertical position)
             """
+            if self.qwen_normalization:
+                coords = self._qwen_unnormalization({"x": x, "y": y})
+                x, y = coords["x"], coords["y"]
             self.desktop.move_mouse(x, y)
             self.desktop.double_click()
             self.click_coordinates = [x, y]
                 x: The x coordinate (horizontal position)
                 y: The y coordinate (vertical position)
             """
+            if self.qwen_normalization:
+                coords = self._qwen_unnormalization({"x": x, "y": y})
+                x, y = coords["x"], coords["y"]
             self.desktop.move_mouse(x, y)
             self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
                 x2: end x coordinate
                 y2: end y coordinate
             """
+            if self.qwen_normalization:
+                coords = self._qwen_unnormalization(
+                    {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
+                )
+                x1, y1, x2, y2 = coords["x1"], coords["y1"], coords["x2"], coords["y2"]
             self.desktop.drag([x1, y1], [x2, y2])
             message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
             self.logger.log(message)
                 direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
+            if self.qwen_normalization:
+                coords = self._qwen_unnormalization({"x": x, "y": y})
+                x, y = coords["x"], coords["y"]
             self.desktop.move_mouse(x, y)
             self.desktop.scroll(direction=direction, amount=amount)
             message = f"Scrolled {direction} by {amount}"
             return f"Waited for {seconds} seconds"
         @tool
         def open_url(url: str) -> str:
             """
             Directly opens a browser with the specified url: use this at start of web searches rather than trying to click the browser.
             Args:
                 url: The URL to open
             """
             # Only an explicit http:// or https:// prefix counts as a scheme.
             # A bare startswith("http") check would also match scheme-less
             # hosts such as "httpbin.org" and leave them without a scheme.
             if not url.startswith(("http://", "https://")):
                 url = f"https://{url}"
             self.desktop.open(url)
             # Fixed pause after opening, before the tool returns.
             time.sleep(2)
             self.logger.log(f"Opening URL: {url}")
             return f"Opened URL: {url}"
+        @tool
+        def launch(app: str) -> str:
+            """
+            Launches the specified application
+            Args:
+                app: The application to launch
+            """
+            self.desktop.commands.run(f"{app}", background=True)
+            return f"Launched application: {app}"
         # Register the tools
         self.tools["click"] = click
         self.tools["right_click"] = right_click
         self.tools["press"] = press
         self.tools["scroll"] = scroll
         self.tools["wait"] = wait
+        self.tools["open_url"] = open_url
+        self.tools["launch"] = launch
         self.tools["go_back"] = go_back
         self.tools["drag"] = drag
         self.tools["scroll"] = scroll

cua2-core/src/cua2_core/services/agent_utils/prompt.py CHANGED Viewed

@@ -32,6 +32,32 @@ click(x, y)
 <environment>
 The desktop resolution is <<resolution_x>>x<<resolution_y>> pixels.
 You can only interact through the following tools:
 {%- for tool in tools.values() %}
@@ -42,7 +68,11 @@ You can only interact through the following tools:
 If a task requires a specific application or website, **use**:
 ```python
-open("app_or_url")
 ```
 to launch it before interacting.
 Never manually click the browser icon — use `open_url()` directly for web pages.
@@ -51,8 +81,9 @@ Never manually click the browser icon — use `open_url()` directly for web page
 ---
 <click_guidelines>
-- Always click using **real, visible coordinates** based on the current screenshot.
 - Click precisely **in the center** of the intended target (button, text, icon).
 - Avoid random or approximate coordinates.
 - If nothing changes after a click, check if you misclicked (green crosshair = last click position).
 - If a menu item shows a ▶ (triangle), it means it expands—click directly on the text, not the icon.
@@ -64,7 +95,7 @@ Never manually click the browser icon — use `open_url()` directly for web page
 <workflow_guidelines>
 - **ALWAYS START** by analyzing if the task requires opening an application or URL. If so, your **first action** must be:
   - For websites: `open_url("https://google.com")`
-  - For applications: `open("app_name")`
   - Never manually navigate to apps via clicking icons—use the open tools directly.
 - Complete one atomic action per step: e.g., **click**, **type**, or **wait**.
 - Never combine multiple tool calls in one step.
@@ -89,7 +120,7 @@ What I see: “Text Editor” visible under Accessories.
 Reflection: Clicking directly on “Text Editor”.
 Action:
 ```python
-open("text_editor")
 ```<end_code>
 Step 2
@@ -98,7 +129,7 @@ What I see: Text editor page.
 Reflection: Click on the text editor page to write "Hello World".
 Action:
 ```python
-click(52, 10)
 ```<end_code>
 Step 3

 <environment>
 The desktop resolution is <<resolution_x>>x<<resolution_y>> pixels.
+**Coordinate System:**
+- **IMPORTANT**: All coordinates must be specified in a **normalized range from 0 to 1000**.
+- The x-axis goes from 0 (left edge) to 1000 (right edge).
+- The y-axis goes from 0 (top edge) to 1000 (bottom edge).
+- The system will automatically convert these normalized coordinates to actual screen pixels.
+- Example: To click the center of the screen, use `click(500, 500)`.
+**System Information:**
+You are running on **Xubuntu** (Ubuntu with XFCE desktop environment).
+This is a lightweight setup with essential applications.
+**Available Default Applications:**
+- **File Manager**: Use terminal to browse and manage files (file browsing and management)
+- **Document/Calc Editor**: LibreOffice (document/calculator editor)
+- **Note-taking**: mousepad
+- **Terminal**: xfce4-terminal (command-line interface)
+- **Web Browser**: Firefox (use `open_url()` for websites)
+- **Image Viewer**: ristretto (image viewer)
+- **PDF Viewer**: xpdf (pdf viewer)
+**Important Notes:**
+- This is a **lightweight desktop environment** — do not assume specialized software is installed.
+- For tasks requiring specific applications not listed above, you may need to adapt or use available alternatives.
+- Always verify what's actually visible on the screen rather than assuming applications exist.
 You can only interact through the following tools:
 {%- for tool in tools.values() %}
 If a task requires a specific application or website, **use**:
 ```python
+open_url("https://google.com")
+launch("xfce4-terminal")
+launch("libreoffice --writer")
+launch("libreoffice --calc")
+launch("mousepad")
 ```
 to launch it before interacting.
 Never manually click the browser icon — use `open_url()` directly for web pages.
 ---
 <click_guidelines>
+- Always use **normalized coordinates (0-1000 range)** based on the current screenshot.
 - Click precisely **in the center** of the intended target (button, text, icon).
+- Coordinates must be integers between 0 and 1000 for both x and y axes.
 - Avoid random or approximate coordinates.
 - If nothing changes after a click, check if you misclicked (green crosshair = last click position).
 - If a menu item shows a ▶ (triangle), it means it expands—click directly on the text, not the icon.
 <workflow_guidelines>
 - **ALWAYS START** by analyzing if the task requires opening an application or URL. If so, your **first action** must be:
   - For websites: `open_url("https://google.com")`
+  - For applications: `launch("app_name")`
   - Never manually navigate to apps via clicking icons—use the open tools directly.
 - Complete one atomic action per step: e.g., **click**, **type**, or **wait**.
 - Never combine multiple tool calls in one step.
 Reflection: Clicking directly on “Text Editor”.
 Action:
 ```python
+launch("text_editor")
 ```<end_code>
 Step 2
 Reflection: Click on the text editor page to write "Hello World".
 Action:
 ```python
+click(150, 100)
 ```<end_code>
 Step 3

cua2-core/src/cua2_core/services/instruction_service.py CHANGED Viewed

@@ -14,12 +14,9 @@ class InstructionService:
     available_models = AVAILABLE_MODELS
     seed_topics = [
         "web browsing",
-        "email management",
-        "calendar scheduling",
-        "file management",
         "note-taking",
-        "system settings",
-        "text editing",
         "terminal commands",
     ]
@@ -60,32 +57,37 @@ class InstructionService:
         (
             "Generate a clear and specific web browsing task instruction for a desktop automation agent. "
             "The task should be goal-centric, focused on retrieving information or performing an action online. "
-            "You can specify a URL or website to visit. "
             "Return only the task instruction, nothing else. Keep it simple and focused on a single goal."
         ),
         (
             "Create a practical web browsing task for desktop automation. "
             "The task should focus on finding specific information or completing an online action. "
-            "Include a specific URL or website name if relevant to the goal. "
             "Provide only the task description without any additional explanation."
         ),
         (
             "Generate a specific web browsing task that a desktop automation agent can perform. "
             "The task should be about retrieving information or performing an action on a website. "
-            "You may specify URLs or web addresses. Keep it concrete and single-purpose. "
-            "Return just the task instruction."
         ),
         (
             "Provide a goal-oriented web browsing task instruction for a desktop agent. "
             "Focus on what information to find or what action to perform online. "
-            "Specify a URL or website if it helps achieve the goal. "
             "Output only the instruction."
         ),
         (
             "Think of a realistic web browsing task suitable for desktop automation. "
             "The task should be about accessing online information or performing a web-based action. "
-            "Include specific URLs or websites as needed. Keep it simple and goal-focused. "
-            "Return only the task."
         ),
     ]

     available_models = AVAILABLE_MODELS
     seed_topics = [
         "web browsing",
+        "file management (linux)",
         "note-taking",
+        "text/document editing (no existing document required, create a new one if needed, use libreoffice)",
         "terminal commands",
     ]
         (
             "Generate a clear and specific web browsing task instruction for a desktop automation agent. "
             "The task should be goal-centric, focused on retrieving information or performing an action online. "
+            "Directly specify a URL or website to visit (e.g., 'Go to google.com and search for...'). "
+            "Do NOT instruct the agent to open a browser application first - just specify the URL or web task directly. "
             "Return only the task instruction, nothing else. Keep it simple and focused on a single goal."
         ),
         (
             "Create a practical web browsing task for desktop automation. "
             "The task should focus on finding specific information or completing an online action. "
+            "Include a specific URL or website name and what to do there (e.g., 'Visit github.com and...'). "
+            "Do NOT include steps about opening a browser - just specify the web task directly. "
             "Provide only the task description without any additional explanation."
         ),
         (
             "Generate a specific web browsing task that a desktop automation agent can perform. "
             "The task should be about retrieving information or performing an action on a website. "
+            "Specify URLs or web addresses directly (e.g., 'Navigate to wikipedia.org and...'). "
+            "Do NOT mention opening a browser application - assume the agent will handle that automatically. "
+            "Keep it concrete and single-purpose. Return just the task instruction."
         ),
         (
             "Provide a goal-oriented web browsing task instruction for a desktop agent. "
             "Focus on what information to find or what action to perform online. "
+            "Specify a URL or website directly as part of the task (e.g., 'Go to amazon.com and...'). "
+            "Do NOT instruct to open a browser first - just state the URL and the web task. "
             "Output only the instruction."
         ),
         (
             "Think of a realistic web browsing task suitable for desktop automation. "
             "The task should be about accessing online information or performing a web-based action. "
+            "Include specific URLs or websites with the action to perform (e.g., 'Visit youtube.com and...'). "
+            "Do NOT include opening a browser as a separate step - just specify the web task directly. "
+            "Keep it simple and goal-focused. Return only the task."
         ),
     ]

cua2-core/src/cua2_core/services/sandbox_service.py CHANGED Viewed

@@ -88,3 +88,32 @@ class SandboxService:
                 await asyncio.to_thread(self.sandboxes[session_hash].kill)
                 del self.sandboxes[session_hash]
                 del self.sandbox_metadata[session_hash]

                 await asyncio.to_thread(self.sandboxes[session_hash].kill)
                 del self.sandboxes[session_hash]
                 del self.sandbox_metadata[session_hash]
+if __name__ == "__main__":
+    desktop: Sandbox = Sandbox.create(
+        api_key=os.getenv("E2B_API_KEY"),
+        resolution=(WIDTH, HEIGHT),
+        dpi=96,
+        timeout=SANDBOX_TIMEOUT,
+        template="k0wmnzir0zuzye6dndlw",
+    )
+    desktop.stream.start(require_auth=True)
+    setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
+    desktop.commands.run(setup_cmd)
+    print(
+        desktop.stream.get_url(
+            auto_connect=True,
+            view_only=False,
+            resize="scale",
+            auth_key=desktop.stream.get_auth_key(),
+        )
+    )
+    try:
+        while True:
+            application = input("Enter application to launch: ")
+            desktop.commands.run(f"{application} &")
+    except (KeyboardInterrupt, Exception):
+        pass
+    desktop.kill()

cua2-core/src/cua2_core/websocket/websocket_manager.py CHANGED Viewed

@@ -52,7 +52,7 @@ class WebSocketManager:
         try:
             await websocket.send_text(
                 json.dumps(
-                    message.model_dump(mode="json", context={"actions_as_json": False})
                 )
             )
         except Exception as e:

         try:
             await websocket.send_text(
                 json.dumps(
+                    message.model_dump(mode="json", context={"actions_as_json": True})
                 )
             )
         except Exception as e:

cua2-front/src/components/WelcomeScreen.tsx CHANGED Viewed

@@ -1,12 +1,12 @@
-import React, { useState, useEffect, useRef } from 'react';
-import { Box, Typography, Button, Container, Paper, TextField, IconButton, Select, MenuItem, FormControl, InputLabel, CircularProgress } from '@mui/material';
-import ShuffleIcon from '@mui/icons-material/Shuffle';
-import SendIcon from '@mui/icons-material/Send';
-import LightModeOutlined from '@mui/icons-material/LightModeOutlined';
 import DarkModeOutlined from '@mui/icons-material/DarkModeOutlined';
 import SmartToyIcon from '@mui/icons-material/SmartToy';
-import { useAgentStore, selectSelectedModelId, selectIsDarkMode, selectAvailableModels, selectIsLoadingModels } from '@/stores/agentStore';
-import { fetchAvailableModels, generateRandomQuestion } from '@/services/api';
 interface WelcomeScreenProps {
   onStartTask: (instruction: string, modelId: string) => void;
@@ -147,7 +147,7 @@ export const WelcomeScreen: React.FC<WelcomeScreenProps> = ({ onStartTask, isCon
             color: 'text.primary',
           }}
         >
-          CUA2 Agent
         </Typography>
         {/* Powered by smolagents */}

+import { fetchAvailableModels, generateRandomQuestion } from '@/services/api';
+import { selectAvailableModels, selectIsDarkMode, selectIsLoadingModels, selectSelectedModelId, useAgentStore } from '@/stores/agentStore';
 import DarkModeOutlined from '@mui/icons-material/DarkModeOutlined';
+import LightModeOutlined from '@mui/icons-material/LightModeOutlined';
+import SendIcon from '@mui/icons-material/Send';
+import ShuffleIcon from '@mui/icons-material/Shuffle';
 import SmartToyIcon from '@mui/icons-material/SmartToy';
+import { Box, Button, CircularProgress, Container, FormControl, IconButton, InputLabel, MenuItem, Paper, Select, TextField, Typography } from '@mui/material';
+import React, { useEffect, useRef, useState } from 'react';
 interface WelcomeScreenProps {
   onStartTask: (instruction: string, modelId: string) => void;
             color: 'text.primary',
           }}
         >
+          Computer Use Agent
         </Typography>
         {/* Powered by smolagents */}

cua2-front/src/stores/agentStore.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 import { create } from 'zustand';
 import { devtools } from 'zustand/middleware';
-import { AgentTrace, AgentStep, AgentTraceMetadata, FinalStep } from '@/types/agent';
 interface AgentState {
   // State
@@ -39,7 +39,7 @@ const initialState = {
   isAgentProcessing: false,
   isConnectingToE2B: false,
   vncUrl: '',
-  selectedModelId: 'Qwen/Qwen3-VL-8B-Instruct',
   availableModels: [],
   isLoadingModels: false,
   isConnected: false,

+import { AgentStep, AgentTrace, AgentTraceMetadata, FinalStep } from '@/types/agent';
 import { create } from 'zustand';
 import { devtools } from 'zustand/middleware';
 interface AgentState {
   // State
   isAgentProcessing: false,
   isConnectingToE2B: false,
   vncUrl: '',
+  selectedModelId: 'Qwen/Qwen3-VL-30B-A3B-Instruct',
   availableModels: [],
   isLoadingModels: false,
   isConnected: false,