diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py index 96ecc831..af9cc96d 100644 --- a/src/askui/tools/agent_os.py +++ b/src/askui/tools/agent_os.py @@ -263,13 +263,18 @@ def disconnect(self) -> None: """ @abstractmethod - def screenshot(self, report: bool = True) -> Image.Image: + def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image: """ Captures a screenshot of the current display. Args: report (bool, optional): Whether to include the screenshot in reporting. Defaults to `True`. + unscaled (bool, optional): Whether to return the screenshot at its + full, real-screen resolution instead of the resolution shown to + the model. Only has an effect on scaling implementations (e.g. + `ComputerAgentOsFacade`); implementations that already return the + native resolution ignore it. Defaults to `False`. Returns: Image.Image: A PIL Image object containing the screenshot. diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py index 26aeb5d0..4e2f8c4f 100644 --- a/src/askui/tools/askui/askui_controller.py +++ b/src/askui/tools/askui/askui_controller.py @@ -349,13 +349,16 @@ def _stop_execution(self) -> None: @telemetry.record_call() @override - def screenshot(self, report: bool = True) -> Image.Image: + def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image: """ Take a screenshot of the current screen. Args: report (bool, optional): Whether to include the screenshot in reporting. Defaults to `True`. + unscaled (bool, optional): Accepted for interface compatibility. This + client always returns the native screen resolution, so it has no + effect. Defaults to `False`. Returns: Image.Image: A PIL Image object containing the screenshot. 
@@ -375,7 +378,8 @@ def screenshot(self, report: bool = True) -> Image.Image:
             screenResponse.bitmap.data,
         ).split()
         image = Image.merge("RGB", (b, g, r))
-        self._reporter.add_message("AgentOS", "screenshot()", image)
+        if report:
+            self._reporter.add_message("AgentOS", "screenshot()", image)
         return image
 
     @telemetry.record_call()
diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py
index 676a6454..57c7efa4 100644
--- a/src/askui/tools/computer_agent_os_facade.py
+++ b/src/askui/tools/computer_agent_os_facade.py
@@ -50,6 +50,7 @@ def __init__(
         image_scaler: ImageScaler,
     ) -> None:
         self._agent_os = agent_os
+        self._image_scaler = image_scaler
         self._scaler = CoordinateScaler(
             coordinate_space=coordinate_space,
             image_scaler=image_scaler,
@@ -66,10 +67,59 @@ def disconnect(self) -> None:
         self._agent_os.disconnect()
         self._scaler.real_screen_resolution = None
 
-    def screenshot(self, report: bool = True) -> Image.Image:
+    def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image:
+        """Capture a screenshot, scaled for the model unless `unscaled` is set.
+
+        Args:
+            report (bool, optional): Forwarded to the wrapped agent OS.
+                Defaults to `True`.
+            unscaled (bool, optional): If `True`, return the wrapped agent OS's
+                screenshot as-is and record its size as the scaler's real screen
+                resolution instead of scaling it. Defaults to `False`.
+
+        Returns:
+            Image.Image: The scaled screenshot, or the unscaled one if requested.
+        """
         screenshot = self._agent_os.screenshot(report=report)
+        if unscaled:
+            self._scaler.real_screen_resolution = screenshot.size
+            return screenshot
         return self._scaler.scale_screenshot(screenshot)
 
+    def scale_image_for_model(self, image: Image.Image) -> Image.Image:
+        """Apply the same scaling screenshots receive, without recording state.
+
+        Unlike `screenshot`, this does not update the coordinate scaler's
+        recorded resolutions, so it is safe to call on arbitrary images (e.g. a
+        cropped region) without corrupting coordinate mapping.
+
+        Args:
+            image (Image.Image): The image to scale for model consumption.
+
+        Returns:
+            Image.Image: The scaled image.
+        """
+        return self._image_scaler(image)
+
+    def scale_point_to_real_screen(
+        self, x: float, y: float, check_coordinates_in_bounds: bool = True
+    ) -> tuple[int, int]:
+        """Map a point from the model coordinate space to real screen pixels.
+
+        Args:
+            x (float): The horizontal coordinate in the model coordinate space.
+            y (float): The vertical coordinate in the model coordinate space.
+            check_coordinates_in_bounds (bool, optional): Whether to raise if the
+                mapped coordinate falls outside the screen. Set to `False` when the
+                caller clamps the result itself. Defaults to `True`.
+
+        Returns:
+            tuple[int, int]: The corresponding `(x, y)` in real screen pixels.
+        """
+        return self._scaler.scale_coordinates(
+            x, y, check_coordinates_in_bounds=check_coordinates_in_bounds
+        )
+
     def _take_silent_screenshot(self) -> Image.Image:
         return self.screenshot(report=False)
 
diff --git a/src/askui/tools/playwright/agent_os.py b/src/askui/tools/playwright/agent_os.py
index 6381be37..5f46e837 100644
--- a/src/askui/tools/playwright/agent_os.py
+++ b/src/askui/tools/playwright/agent_os.py
@@ -197,12 +197,15 @@ def disconnect(self) -> None:
         )
 
     @override
-    def screenshot(self, report: bool = True) -> Image.Image:
+    def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image:
         """Capture a screenshot of the current page.
 
         Args:
             report (bool, optional): Whether to include the screenshot in
                 reporting. Defaults to `True`.
+            unscaled (bool, optional): Accepted for interface compatibility. This
+                agent OS always returns the native page resolution, so it has no
+                effect. Defaults to `False`.
 
         Returns:
             Image.Image: A PIL Image object containing the screenshot.
diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py
index c6969fe4..1f68d61b 100644
--- a/src/askui/tools/playwright/agent_os_facade.py
+++ b/src/askui/tools/playwright/agent_os_facade.py
@@ -55,8 +55,12 @@ def disconnect(self) -> None:
         self._agent_os.disconnect()
         self._scaler.real_screen_resolution = None
 
-    def screenshot(self, report: bool = True) -> Image.Image:
+    def screenshot(self, report: bool = True, unscaled: bool = False) -> Image.Image:
         screenshot = self._agent_os.screenshot(report=report)
+        if unscaled:
+            # Mirror ComputerAgentOsFacade: record the native size on the scaler.
+            self._scaler.real_screen_resolution = screenshot.size
+            return screenshot
         return self._scaler.scale_screenshot(screenshot)
 
     def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
diff --git a/src/askui/tools/store/computer/experimental/__init__.py b/src/askui/tools/store/computer/experimental/__init__.py
index 43414e4b..c364ef26 100644
--- a/src/askui/tools/store/computer/experimental/__init__.py
+++ b/src/askui/tools/store/computer/experimental/__init__.py
@@ -8,6 +8,7 @@
     ComputerSetProcessInFocusTool,
     ComputerSetWindowInFocusTool,
 )
+from .zoom import ComputerZoomTool
 
 __all__ = [
     "ComputerGetFileNamesTool",
@@ -18,4 +19,5 @@
     "ComputerAddWindowAsVirtualDisplayTool",
     "ComputerSetWindowInFocusTool",
     "ComputerSetProcessInFocusTool",
+    "ComputerZoomTool",
 ]
diff --git a/src/askui/tools/store/computer/experimental/zoom.py b/src/askui/tools/store/computer/experimental/zoom.py
new file mode 100644
index 00000000..cae772ab
--- /dev/null
+++ b/src/askui/tools/store/computer/experimental/zoom.py
@@ -0,0 +1,152 @@
+from typing import cast
+
+from PIL import Image
+
+from askui.models.shared import ComputerBaseTool, ToolTags
+from askui.reporting import NULL_REPORTER, Reporter
+from askui.tools.computer_agent_os_facade import ComputerAgentOsFacade
+
+
+class ComputerZoomTool(ComputerBaseTool):
+    """
+    Views a region of the screen at full resolution to inspect small details.
+
+    Screenshots are downscaled before they reach the model, so small UI elements
+    (icons, tab titles, status-bar text, line numbers, tiny buttons) can become
+    illegible. This tool crops the requested region from the full-resolution
+    screenshot and returns it magnified. The returned image is only a magnified
+    view; coordinates for subsequent actions still use the original screen
+    coordinate space.
+
+    Args:
+        agent_os (`ComputerAgentOsFacade`, optional): The agent OS facade. Injected
+            automatically when the tool is registered with an agent.
+        reporter (`Reporter`, optional): Reporter used to show the cropped image
+            (the exact image handed to the model) in the report. Defaults to a
+            null reporter that discards messages.
+
+    Example:
+        ```python
+        from askui import ComputerAgent
+        from askui.tools.store.computer.experimental import ComputerZoomTool
+
+        with ComputerAgent(act_tools=[ComputerZoomTool()]) as agent:
+            agent.act("Enable the tiny checkbox next to 'Advanced options'")
+
+        with ComputerAgent() as agent:
+            agent.act(
+                "Enable the tiny checkbox next to 'Advanced options'",
+                tools=[ComputerZoomTool()],
+            )
+        ```
+    """
+
+    def __init__(
+        self,
+        agent_os: ComputerAgentOsFacade | None = None,
+        reporter: Reporter = NULL_REPORTER,
+    ) -> None:
+        super().__init__(
+            name="zoom",
+            description=(
+                "View a specific region of the screen at full resolution. This "
+                "is a last resort for reading content that is genuinely too small "
+                "to make out in the normal screenshot (e.g. tiny text, icons, "
+                "status-bar text, line numbers) when that detail is required to "
+                "decide your next action.\n"
+                "Use it sparingly. Before zooming, rely on the normal screenshot "
+                "you already have. Do NOT use this tool when:\n"
+                "- the relevant text or element is already legible in the normal "
+                "screenshot;\n"
+                "- you only need to locate or click an element (the normal "
+                "screenshot coordinates are sufficient for that);\n"
+                "- you have already zoomed into this region — do not re-zoom the "
+                "same area.\n"
+                "Provide the region as [x1, y1, x2, y2], the top-left and "
+                "bottom-right corners in the same coordinates you use for "
+                "clicking. The returned image is only a magnified view; "
+                "coordinates for subsequent actions still use the original screen "
+                "coordinate space."
+            ),
+            input_schema={
+                "type": "object",
+                "properties": {
+                    "region": {
+                        "type": "array",
+                        "description": (
+                            "The region to zoom into as [x1, y1, x2, y2]: the "
+                            "top-left and bottom-right corners in screen "
+                            "coordinates."
+                        ),
+                        "items": {"type": "number"},
+                        "minItems": 4,
+                        "maxItems": 4,
+                    },
+                },
+                "required": ["region"],
+            },
+            agent_os=agent_os,
+            required_tags=[ToolTags.SCALED_AGENT_OS.value],
+        )
+        self.is_cacheable = True
+        self._reporter = reporter
+
+    def __call__(self, region: list[float]) -> tuple[str, Image.Image]:
+        """Crop `region` from a full-resolution screenshot and return it.
+
+        Args:
+            region (list[float]): `[x1, y1, x2, y2]` corners in the model
+                coordinate space. Corners may be given in any order and are
+                clamped to the screenshot bounds.
+
+        Returns:
+            tuple[str, Image.Image]: A guidance message for the model and the
+                cropped image after model scaling.
+
+        Raises:
+            ValueError: If `region` does not contain exactly 4 values.
+        """
+        if len(region) != 4:  # noqa: PLR2004
+            error_msg = (
+                f"region must contain exactly 4 values [x1, y1, x2, y2], "
+                f"got {len(region)}"
+            )
+            raise ValueError(error_msg)
+
+        agent_os = cast("ComputerAgentOsFacade", self.agent_os)
+        # Suppress reporting of the uncropped screenshot; we report the crop below.
+        screenshot = agent_os.screenshot(unscaled=True, report=False)
+
+        # Map the model-space corners to real screen pixels. Skip the mapper's
+        # bounds check; we clamp to the screenshot below so a slightly oversized
+        # region from the model crops to the edge instead of erroring.
+        x1, y1, x2, y2 = region
+        left, top = agent_os.scale_point_to_real_screen(
+            x1, y1, check_coordinates_in_bounds=False
+        )
+        right, bottom = agent_os.scale_point_to_real_screen(
+            x2, y2, check_coordinates_in_bounds=False
+        )
+
+        left, right = sorted((left, right))
+        top, bottom = sorted((top, bottom))
+        left = max(0, min(left, screenshot.width - 1))
+        right = max(left + 1, min(right, screenshot.width))
+        top = max(0, min(top, screenshot.height - 1))
+        bottom = max(top + 1, min(bottom, screenshot.height))
+
+        crop = screenshot.crop((left, top, right, bottom))
+        crop = agent_os.scale_image_for_model(crop)
+        # Report the region in real screen pixels (where the crop was actually
+        # taken), not the raw coordinates the model passed.
+        self._reporter.add_message(
+            "AgentOS", f"zoom([{left}, {top}, {right}, {bottom}])", crop
+        )
+        message = (
+            f"Zoomed into region [{x1}, {y1}, {x2}, {y2}] shown at full "
+            "resolution. Coordinates for further actions remain in the original "
+            "screen coordinate space. Now proceed with the next action (e.g. "
+            "move/click) using those coordinates; do not zoom again unless a "
+            "different region is still too small to read."
+        )
+        return message, crop
diff --git a/tests/unit/tools/test_zoom_tool.py b/tests/unit/tools/test_zoom_tool.py
new file mode 100644
index 00000000..5dd88972
--- /dev/null
+++ b/tests/unit/tools/test_zoom_tool.py
@@ -0,0 +1,150 @@
+"""Tests for `ComputerZoomTool`.
+
+The zoom tool returns a magnified, full-resolution crop of a region the model
+specifies in its (downscaled) coordinate space. The region is mapped back to
+real screen pixels before cropping, so a small box in model space becomes a
+larger, more legible crop.
+"""
+
+from typing import cast
+from unittest.mock import MagicMock
+
+import pytest
+from PIL import Image
+from pytest_mock import MockerFixture
+
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
+from askui.models.shared.image_scaler import ContainedImageScaler
+from askui.tools.computer_agent_os_facade import ComputerAgentOsFacade
+from askui.tools.store.computer.experimental import ComputerZoomTool
+
+
+def _make_facade(
+    real_size: tuple[int, int],
+    coordinate_space: VlmCoordinateSpace | None = None,
+) -> ComputerAgentOsFacade:
+    """Create a facade wrapping a mocked agent OS with a known screen size."""
+    mock_os = MagicMock()
+    mock_os.tags = []
+    mock_os.screenshot.return_value = Image.new("RGB", real_size)
+    return ComputerAgentOsFacade(
+        mock_os,
+        coordinate_space=coordinate_space or PixelCoordinateSpace(),
+        image_scaler=ContainedImageScaler(),
+    )
+
+
+class TestComputerZoomTool:
+    """Most tests use a 2048x1536 screen: 2x the 1024x768 model space (no padding)."""
+
+    def test_crops_region_mapped_to_real_resolution(self) -> None:
+        facade = _make_facade((2048, 1536))
+        tool = ComputerZoomTool(agent_os=facade)
+
+        message, crop = tool(region=[100, 100, 200, 200])
+
+        # 100px box in model space -> 200px box at full resolution.
+        # The model-image scaler only downscales, so a crop within bounds
+        # passes through unchanged.
+        assert crop.size == (200, 200)
+        assert "[100, 100, 200, 200]" in message
+
+    def test_oversized_crop_is_scaled_to_model_bounds(self) -> None:
+        facade = _make_facade((2048, 1536))
+        tool = ComputerZoomTool(agent_os=facade)
+
+        # Full model space -> full 2048x1536 real region, larger than the
+        # 1024x768 scaler bounds, so it is downscaled like a screenshot.
+        _, crop = tool(region=[0, 0, 1024, 768])
+
+        assert crop.size == (1024, 768)
+
+    def test_requests_unscaled_screenshot(self, mocker: MockerFixture) -> None:
+        facade = _make_facade((2048, 1536))
+        spy = mocker.spy(facade, "screenshot")
+        tool = ComputerZoomTool(agent_os=facade)
+
+        tool(region=[0, 0, 100, 100])
+
+        assert any(call.kwargs.get("unscaled") is True for call in spy.call_args_list)
+
+    def test_normalizes_unordered_corners(self) -> None:
+        facade = _make_facade((2048, 1536))
+        tool = ComputerZoomTool(agent_os=facade)
+
+        _, crop = tool(region=[200, 200, 100, 100])
+
+        assert crop.size == (200, 200)
+
+    def test_rejects_region_with_wrong_length(self) -> None:
+        facade = _make_facade((2048, 1536))
+        tool = ComputerZoomTool(agent_os=facade)
+
+        with pytest.raises(ValueError, match="exactly 4 values"):
+            tool(region=[100, 100, 200])
+
+    def test_reports_the_cropped_image_not_the_full_screenshot(self) -> None:
+        reporter = MagicMock()
+        facade = _make_facade((2048, 1536))
+        tool = ComputerZoomTool(agent_os=facade, reporter=reporter)
+
+        _, crop = tool(region=[100, 100, 200, 200])
+
+        # The underlying screenshot is never fetched with reporting enabled, so
+        # the uncropped image is not shown in the report.
+        screenshot_mock = cast("MagicMock", facade._agent_os).screenshot
+        assert screenshot_mock.call_count >= 1
+        assert all(
+            call.kwargs.get("report") is False
+            for call in screenshot_mock.call_args_list
+        )
+        # Exactly the cropped image handed to the model is reported.
+        reporter.add_message.assert_called_once()
+        reported_image = reporter.add_message.call_args.args[2]
+        assert reported_image is crop
+
+    def test_reports_scaled_back_coordinates_not_model_coordinates(self) -> None:
+        reporter = MagicMock()
+        facade = _make_facade((2048, 1536))
+        tool = ComputerZoomTool(agent_os=facade, reporter=reporter)
+
+        tool(region=[100, 100, 200, 200])
+
+        # Model coords [100, 100, 200, 200] map 1:2 onto the real screen.
+        reported_message = reporter.add_message.call_args.args[1]
+        assert "200, 200, 400, 400" in reported_message
+        assert "100, 100, 200, 200" not in reported_message
+
+    def test_out_of_bounds_region_is_clamped_not_rejected(self) -> None:
+        facade = _make_facade((2048, 1536))
+        tool = ComputerZoomTool(agent_os=facade)
+
+        # Region extends well past the model bounds; it must clamp to the screen
+        # edge and crop the whole screen instead of raising.
+        _, crop = tool(region=[0, 0, 5000, 5000])
+
+        assert crop.size == (1024, 768)
+
+    def test_accepts_normalized_float_region(self) -> None:
+        facade = _make_facade((1920, 1080), NormalizedCoordinateSpace())
+        tool = ComputerZoomTool(agent_os=facade)
+
+        # Kimi-style 0.0-1.0 coordinates: 0.4..0.6 spans 20% of each axis.
+        _, crop = tool(region=[0.4, 0.4, 0.6, 0.6])
+
+        # 0.2 * 1920 = 384 wide, 0.2 * 1080 = 216 tall.
+        assert crop.size == (384, 216)
+
+    def test_returns_text_and_image(self) -> None:
+        facade = _make_facade((2048, 1536))
+        tool = ComputerZoomTool(agent_os=facade)
+
+        result = tool(region=[10, 20, 110, 120])
+
+        message, crop = result
+        assert isinstance(message, str)
+        assert isinstance(crop, Image.Image)