From 6f1da7ba3add201078c0f96ff77a6de073a472dc Mon Sep 17 00:00:00 2001 From: Ordinal Inc Date: Sat, 26 Oct 2024 17:54:55 -0500 Subject: [PATCH 1/5] fixed retina macos issue --- computer_use_demo/app.py | 2 + computer_use_demo/tools/computer.py | 65 +++++++++++++++++++---------- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/computer_use_demo/app.py b/computer_use_demo/app.py index 3a4ad1a..60c6d2f 100644 --- a/computer_use_demo/app.py +++ b/computer_use_demo/app.py @@ -47,6 +47,8 @@ def setup_state(state): state["api_key"] = load_from_storage("api_key") or os.getenv("ANTHROPIC_API_KEY", "") if not state["api_key"]: print("API key not found. Please set it in the environment or storage.") + else: + print(f"API key loaded: {state['api_key'][:5]}...{state['api_key'][-5:]}") if "provider" not in state: state["provider"] = os.getenv("API_PROVIDER", "anthropic") or APIProvider.ANTHROPIC if "provider_radio" not in state: diff --git a/computer_use_demo/tools/computer.py b/computer_use_demo/tools/computer.py index c7bd0ef..7ae8faf 100644 --- a/computer_use_demo/tools/computer.py +++ b/computer_use_demo/tools/computer.py @@ -8,6 +8,8 @@ from pathlib import Path from typing import Literal, TypedDict from uuid import uuid4 +import io +from PIL import Image from anthropic.types.beta import BetaToolComputerUse20241022Param @@ -186,21 +188,18 @@ async def __call__( raise ToolError(f"Invalid action: {action}") - async def screenshot(self): - """Take a screenshot of the current screen and return a ToolResult with the base64 encoded image.""" - output_dir = Path(OUTPUT_DIR) - output_dir.mkdir(parents=True, exist_ok=True) - path = output_dir / f"screenshot_{uuid4().hex}.png" - - # Take screenshot using pyautogui + async def screenshot(self) -> ToolResult: screenshot = pyautogui.screenshot() - screenshot.save(str(path)) + img_byte_arr = io.BytesIO() + screenshot.save(img_byte_arr, format='PNG') + img_byte_arr = img_byte_arr.getvalue() - if path.exists(): - # Return a ToolResult instance instead of a dictionary - return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode()) - - raise ToolError(f"Failed to take screenshot: {path} does not exist.") + # Resize the image if it's too large + if len(img_byte_arr) > 5 * 1024 * 1024: + img_byte_arr = self.resize_image(img_byte_arr) + + base64_image = base64.b64encode(img_byte_arr).decode('utf-8') + return ToolResult(output="Screenshot taken", base64_image=base64_image) async def shell(self, command: str, take_screenshot=True) -> ToolResult: """Run a shell command and return the output, error, and optionally a screenshot.""" @@ -240,11 +239,20 @@ def scale_coordinates(self, source: ScalingSource, x: int, y: int): return round(x * x_scaling_factor), round(y * y_scaling_factor) def get_screen_size(self): - if platform.system() == "Windows": - # Command to get screen resolution on Windows + if platform.system() == "Darwin": # macOS + try: + output = subprocess.check_output(["system_profiler", "SPDisplaysDataType"]).decode('utf-8') + for line in output.split('\n'): + if "Resolution" in line: + resolution = line.split(':')[1].strip() + width, height = map(lambda x: int(x.split()[0]), resolution.split(' x ')) + return width, height + except Exception as e: + print(f"Error getting screen size: {e}") + return 1920, 1080 # Default fallback resolution + elif platform.system() == "Windows": + # Keep existing Windows code cmd = "wmic path Win32_VideoController get CurrentHorizontalResolution,CurrentVerticalResolution" - elif platform.system() == "Darwin": # macOS - cmd = "system_profiler SPDisplaysDataType | grep Resolution" else: # Linux or other OS cmd = "xrandr | grep '*' | awk '{print $1}'" @@ -254,9 +262,6 @@ def get_screen_size(self): if platform.system() == "Windows": lines = output.strip().split('\n')[1:] # Skip the header width, height = map(int, lines[0].split()) - elif platform.system() == "Darwin": - resolution = output.split()[0] - width, height = map(int, resolution.split('x')) else: resolution = output.strip().split()[0] width, height = map(int, resolution.split('x')) @@ -265,7 +270,7 @@ def get_screen_size(self): except subprocess.CalledProcessError as e: print(f"Error occurred: {e}") - return None, None # Return None or some default values + return 1920, 1080 # Default fallback resolution def get_mouse_position(self): @@ -281,4 +286,20 @@ def map_keys(self, text: str): """Map text to cliclick key codes if necessary.""" # For simplicity, return text as is # Implement mapping if special keys are needed - return text \ No newline at end of file + return text + + def resize_image(self, image_data: bytes, max_size: int = 5 * 1024 * 1024) -> bytes: + img = Image.open(io.BytesIO(image_data)) + + # Calculate the scaling factor + current_size = len(image_data) + scale_factor = (max_size / current_size) ** 0.5 + + # Resize the image + new_size = (int(img.width * scale_factor), int(img.height * scale_factor)) + img = img.resize(new_size, Image.LANCZOS) + + # Save the resized image to a bytes buffer + buffer = io.BytesIO() + img.save(buffer, format="PNG", optimize=True) + return buffer.getvalue() From f75680013fab7b995c64e4d3fa108e1373d69eff Mon Sep 17 00:00:00 2001 From: Ordinal Inc Date: Sat, 26 Oct 2024 17:59:52 -0500 Subject: [PATCH 2/5] macos fixes :) works on my m1 --- computer_use_demo/tools/computer.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/computer_use_demo/tools/computer.py b/computer_use_demo/tools/computer.py index 7ae8faf..c1f5290 100644 --- a/computer_use_demo/tools/computer.py +++ b/computer_use_demo/tools/computer.py @@ -282,24 +282,4 @@ def get_mouse_position(self): # Adjust for different coordinate system return int(loc.x), int(self.height - loc.y) - def map_keys(self, text: str): - """Map text to cliclick key codes if necessary.""" - # For simplicity, return text as is - # Implement mapping if special keys are needed - return text - - def resize_image(self, image_data: bytes, max_size: int = 5 * 1024 * 1024) -> bytes: - img = Image.open(io.BytesIO(image_data)) - - # Calculate the scaling factor - current_size = len(image_data) - scale_factor = (max_size / current_size) ** 0.5 - - # Resize the image - new_size = (int(img.width * scale_factor), int(img.height * scale_factor)) - img = img.resize(new_size, Image.LANCZOS) - - # Save the resized image to a bytes buffer - buffer = io.BytesIO() - img.save(buffer, format="PNG", optimize=True) - return buffer.getvalue() + def ma \ No newline at end of file From 554cb318dc130bf9586c066fcd64487bfff7463e Mon Sep 17 00:00:00 2001 From: Ordinal Inc Date: Sat, 26 Oct 2024 18:02:57 -0500 Subject: [PATCH 3/5] something happened --- computer_use_demo/tools/computer.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/computer_use_demo/tools/computer.py b/computer_use_demo/tools/computer.py index c1f5290..7ae8faf 100644 --- a/computer_use_demo/tools/computer.py +++ b/computer_use_demo/tools/computer.py @@ -282,4 +282,24 @@ def get_mouse_position(self): # Adjust for different coordinate system return int(loc.x), int(self.height - loc.y) - def ma \ No newline at end of file + def map_keys(self, text: str): + """Map text to cliclick key codes if necessary.""" + # For simplicity, return text as is + # Implement mapping if special keys are needed + return text + + def resize_image(self, image_data: bytes, max_size: int = 5 * 1024 * 1024) -> bytes: + img = Image.open(io.BytesIO(image_data)) + + # Calculate the scaling factor + current_size = len(image_data) + scale_factor = (max_size / current_size) ** 0.5 + + # Resize the image + new_size = (int(img.width * scale_factor), int(img.height * scale_factor)) + img = img.resize(new_size, Image.LANCZOS) + + # Save the resized image to a bytes buffer + buffer = io.BytesIO() + img.save(buffer, format="PNG", optimize=True) + return buffer.getvalue() From bee65f5de6f9fd69e796f9098d2b1ca3875c35f6 Mon Sep 17 00:00:00 2001 From: PyroFilmsFX <4606457+PyroFilmsFX@users.noreply.github.com> Date: Sat, 26 Oct 2024 18:04:20 -0500 Subject: [PATCH 4/5] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a095c7..d5fac33 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ ## 🌟 Overview This is an out-of-the-box (OOTB) solution for Claude's new Computer Use APIs. +MacOS working version (tested on m1 retina) + **No Docker** is required, and it theoretically supports **any platform**, with testing currently done on **Windows**. This project provides a user-friendly interface based on Gradio. 🎨 ## Update @@ -74,7 +76,7 @@ Desktop Interface - [ ] **Platform** - [x] **Windows** - [x] **Mobile** (Send command) - - [ ] **Mac** + - [x] **Mac** - [ ] **Mobile** (Be controlled) - [ ] **Support for More MLLMs** - [x] **Claude 3.5 Sonnet** 🎵 From 1dc2a2b46637997695e4b98be99908ca08d5eccf Mon Sep 17 00:00:00 2001 From: PyroFilmsFX <4606457+PyroFilmsFX@users.noreply.github.com> Date: Sat, 26 Oct 2024 18:04:31 -0500 Subject: [PATCH 5/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d5fac33..c8993cf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Computer Use - OOTB +# Computer Use - OOTB MacOS working version (tested on m1 retina) ## 🌟 Overview This is an out-of-the-box (OOTB) solution for Claude's new Computer Use APIs.