From b9fd35e645f3c4b974c6bb34750119aa39eb0572 Mon Sep 17 00:00:00 2001 From: Samir Mlika Date: Thu, 25 Jun 2026 11:21:07 +0100 Subject: [PATCH 1/3] feat: support PDF documents in tool results, get(), and file reads Let tools and file reads return PDFs to the model as a document content block, mirroring existing image handling. PDFs are sent as an Anthropic base64 `document` block (no beta header) and an OpenAI `file` content part. - agent_message_param: add DocumentBlockParam + Base64PdfSourceParam; allow document blocks in ToolResultBlockParam content - tools: render a returned PdfSource as a document block in tool results - anthropic/openai get models: accept PdfSource (Office docs stay unsupported) - LoadPdfTool: load a PDF from disk and hand it to the model - AgentOs.get_file: detect and return PDFs as PdfSource; sniff file type via filetype.guess and decode the base64 payload once - reporting: truncate base64 PDF (and any media) blobs to keep reports readable --- mypy.ini | 3 +- src/askui/__init__.py | 6 + src/askui/models/__init__.py | 4 + src/askui/models/anthropic/get_model.py | 28 ++-- src/askui/models/anthropic/messages_api.py | 28 ++++ src/askui/models/openai/get_model.py | 29 ++-- src/askui/models/openai/messages_api.py | 49 +++++-- .../models/shared/agent_message_param.py | 17 ++- src/askui/models/shared/tools.py | 99 +++++++++---- .../models/shared/truncation_strategies.py | 5 +- src/askui/reporting.py | 34 +++-- src/askui/tools/agent_os.py | 9 +- src/askui/tools/askui/askui_controller.py | 32 ++-- src/askui/tools/computer_agent_os_facade.py | 7 +- .../store/computer/experimental/get_file.py | 8 +- src/askui/tools/store/universal/__init__.py | 2 + .../tools/store/universal/load_pdf_tool.py | 111 ++++++++++++++ src/askui/utils/pdf_utils.py | 27 ++++ tests/unit/models/anthropic/__init__.py | 0 tests/unit/models/anthropic/test_get_model.py | 82 +++++++++++ tests/unit/models/openai/test_get_model.py | 30 ++-- tests/unit/models/openai/test_messages_api.py | 17 ++- tests/unit/models/shared/__init__.py | 0 .../models/shared/test_tool_call_result.py | 137 ++++++++++++++++++ tests/unit/test_reporting.py | 60 ++++++++ .../tools/askui/test_decode_file_payload.py | 51 +++++++ tests/unit/tools/store/__init__.py | 0 tests/unit/tools/store/test_load_pdf_tool.py | 62 ++++++++ 28 files changed, 824 insertions(+), 113 deletions(-) create mode 100644 src/askui/tools/store/universal/load_pdf_tool.py create mode 100644 tests/unit/models/anthropic/__init__.py create mode 100644 tests/unit/models/anthropic/test_get_model.py create mode 100644 tests/unit/models/shared/__init__.py create mode 100644 tests/unit/models/shared/test_tool_call_result.py create mode 100644 tests/unit/test_reporting.py create mode 100644 tests/unit/tools/askui/test_decode_file_payload.py create mode 100644 tests/unit/tools/store/__init__.py create mode 100644 tests/unit/tools/store/test_load_pdf_tool.py diff --git a/mypy.ini b/mypy.ini index cfb75eb0..8742130c 100644 --- a/mypy.ini +++ b/mypy.ini @@ -14,7 +14,8 @@ warn_unreachable = true strict_optional = true plugins = pydantic.mypy,sqlalchemy.ext.mypy.plugin exclude = (?x)( - ^src/askui/models/ui_tars_ep/ui_tars_api\.py$ + ^\.?venv/.*$ + | ^src/askui/models/ui_tars_ep/ui_tars_api\.py$ | ^src/askui/tools/askui/askui_ui_controller_grpc/.*$ ) mypy_path = src:tests diff --git a/src/askui/__init__.py b/src/askui/__init__.py index d3c09c37..4c276abd 100644 --- a/src/askui/__init__.py +++ b/src/askui/__init__.py @@ -14,11 +14,13 @@ from .locators import Locator from .models import ( Base64ImageSourceParam, + Base64PdfSourceParam, CacheControlEphemeralParam, CitationCharLocationParam, CitationContentBlockLocationParam, CitationPageLocationParam, ContentBlockParam, + DocumentBlockParam, ImageBlockParam, MessageParam, OnMessageCb, @@ -46,6 +48,7 @@ from .retry import ConfigurableRetry, Retry from .tools import ModifierKey, PcKey from .utils.image_utils import ImageSource +from .utils.pdf_utils import PdfSource from .utils.source_utils import InputSource try: @@ -76,6 +79,7 @@ "AgentSettings", "ActSettings", "Base64ImageSourceParam", + "Base64PdfSourceParam", "CacheControlEphemeralParam", "CitationCharLocationParam", "CitationContentBlockLocationParam", @@ -85,10 +89,12 @@ "ConversationCallback", "DEFAULT_GET_RESOLUTION", "DEFAULT_LOCATE_RESOLUTION", + "DocumentBlockParam", "GetSettings", "ImageBlockParam", "ImageSource", "InputSource", + "PdfSource", "Locator", "LocateSettings", "MessageParam", diff --git a/src/askui/models/__init__.py b/src/askui/models/__init__.py index be87ec60..f308467e 100644 --- a/src/askui/models/__init__.py +++ b/src/askui/models/__init__.py @@ -9,11 +9,13 @@ from .openrouter.settings import ChatCompletionsCreateSettings, OpenRouterSettings from .shared.agent_message_param import ( Base64ImageSourceParam, + Base64PdfSourceParam, CacheControlEphemeralParam, CitationCharLocationParam, CitationContentBlockLocationParam, CitationPageLocationParam, ContentBlockParam, + DocumentBlockParam, ImageBlockParam, MessageParam, TextBlockParam, @@ -28,12 +30,14 @@ __all__ = [ "ActModel", "Base64ImageSourceParam", + "Base64PdfSourceParam", "CacheControlEphemeralParam", "ChatCompletionsCreateSettings", "CitationCharLocationParam", "CitationContentBlockLocationParam", "CitationPageLocationParam", "ContentBlockParam", + "DocumentBlockParam", "FallbackGetModel", "FallbackLocateModel", "GetModel", diff --git a/src/askui/models/anthropic/get_model.py b/src/askui/models/anthropic/get_model.py index 421126e6..d1d87d7d 100644 --- a/src/askui/models/anthropic/get_model.py +++ b/src/askui/models/anthropic/get_model.py @@ -4,7 +4,10 @@ from typing_extensions import override -from askui.models.anthropic.messages_api import built_messages_for_get_and_locate +from askui.models.anthropic.messages_api import ( + built_messages_for_get_and_locate, + built_messages_for_get_pdf, +) from askui.models.anthropic.settings import UnexpectedResponseError from askui.models.exceptions import ( QueryNoResponseError, @@ -68,9 +71,9 @@ def get( response_schema: Type[ResponseSchema] | None, get_settings: GetSettings, ) -> ResponseSchema | str: - if isinstance(source, (PdfSource, OfficeDocumentSource)): + if isinstance(source, OfficeDocumentSource): err_msg = ( - f"PDF or Office Document processing is not supported for the model: " + f"Office Document processing is not supported for the model: " f"{self._model_id}" ) raise NotImplementedError(err_msg) @@ -78,14 +81,17 @@ def get( if response_schema is not None: error_msg = "Response schema is not yet supported for Anthropic" raise NotImplementedError(error_msg) - target_size = compute_contained_size( - source.root.width, - source.root.height, - get_settings.resolution.width, - get_settings.resolution.height, - ) - scaled_image = resize_image(source.root, target_size) - messages = built_messages_for_get_and_locate(scaled_image, query) + if isinstance(source, PdfSource): + messages = built_messages_for_get_pdf(source, query) + else: + target_size = compute_contained_size( + source.root.width, + source.root.height, + get_settings.resolution.width, + get_settings.resolution.height, + ) + scaled_image = resize_image(source.root, target_size) + messages = built_messages_for_get_and_locate(scaled_image, query) message = self._messages_api.create_message( messages=messages, model_id=self._model_id, diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py index 47ea2681..4cbe0950 100644 --- a/src/askui/models/anthropic/messages_api.py +++ b/src/askui/models/anthropic/messages_api.py @@ -28,8 +28,10 @@ ) from askui.models.shared.agent_message_param import ( Base64ImageSourceParam, + Base64PdfSourceParam, CacheControlEphemeralParam, ContentBlockParam, + DocumentBlockParam, ImageBlockParam, MessageParam, TextBlockParam, @@ -41,6 +43,7 @@ from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection from askui.utils.image_utils import image_to_base64 +from askui.utils.pdf_utils import PdfSource def _is_retryable_error(exception: BaseException) -> bool: @@ -107,6 +110,31 @@ def built_messages_for_get_and_locate( ] +def built_messages_for_get_pdf( + pdf_source: PdfSource, prompt: str +) -> list[MessageParam]: + # Anthropic accepts a base64 PDF `document` block (no beta header); placing + # the document before the text follows Anthropic's PDF best practices. + return [ + MessageParam( + role="user", + content=cast( + "list[ContentBlockParam]", + [ + DocumentBlockParam( + source=Base64PdfSourceParam( + data=pdf_source.to_base64(), + ), + ), + TextBlockParam( + text=prompt, + ), + ], + ), + ) + ] + + def _parse_to_anthropic_types( tools: ToolCollection | None, betas: list[str] | None = None, diff --git a/src/askui/models/openai/get_model.py b/src/askui/models/openai/get_model.py index f651c6d6..86e049db 100644 --- a/src/askui/models/openai/get_model.py +++ b/src/askui/models/openai/get_model.py @@ -64,7 +64,7 @@ def __init__( def _predict( self, - image_url: str, + source_part: dict[str, Any], instruction: str, prompt: GetSystemPrompt, response_schema: type[ResponseSchema] | None, @@ -103,12 +103,7 @@ def _predict( { "role": "user", "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url, - }, - }, + source_part, {"type": "text", "text": str(prompt) + instruction}, ], } @@ -148,17 +143,31 @@ def get( response_schema: type[ResponseSchema] | None, get_settings: GetSettings, ) -> ResponseSchema | str: - if isinstance(source, (PdfSource, OfficeDocumentSource)): + if isinstance(source, OfficeDocumentSource): err_msg = ( - "PDF or Office Document processing is not supported" + "Office Document processing is not supported" " for OpenAI-compatible models" ) raise NotImplementedError(err_msg) system_prompt = get_settings.system_prompt or SYSTEM_PROMPT_GET + if isinstance(source, PdfSource): + source_part: dict[str, Any] = { + "type": "file", + "file": { + "filename": "document.pdf", + "file_data": source.to_data_url(), + }, + } + else: + source_part = { + "type": "image_url", + "image_url": {"url": source.to_data_url()}, + } + response = self._predict( - image_url=source.to_data_url(), + source_part=source_part, instruction=query, prompt=system_prompt, response_schema=response_schema, diff --git a/src/askui/models/openai/messages_api.py b/src/askui/models/openai/messages_api.py index fd0ee20d..612fe3f8 100644 --- a/src/askui/models/openai/messages_api.py +++ b/src/askui/models/openai/messages_api.py @@ -16,6 +16,7 @@ BetaRedactedThinkingBlock, BetaThinkingBlock, ContentBlockParam, + DocumentBlockParam, ImageBlockParam, MessageParam, StopReason, @@ -56,27 +57,41 @@ def _image_block_to_openai(block: ImageBlockParam) -> dict[str, Any]: return {"type": "image_url", "image_url": {"url": url}} +def _document_block_to_openai(block: DocumentBlockParam) -> dict[str, Any]: + """Convert a `DocumentBlockParam` (PDF) to an OpenAI ``file`` content part.""" + data_url = f"data:{block.source.media_type};base64,{block.source.data}" + return { + "type": "file", + "file": { + "filename": "document.pdf", + "file_data": data_url, + }, + } + + def _serialize_tool_result_content( - content: str | list[TextBlockParam | ImageBlockParam], + content: str | list[TextBlockParam | ImageBlockParam | DocumentBlockParam], ) -> tuple[str, list[dict[str, Any]]]: """Serialize ``ToolResultBlockParam.content`` for OpenAI's ``tool`` role. - Returns the text portion as a string and any images as OpenAI content - parts (to be appended as a separate ``user`` message since the OpenAI - ``tool`` role only accepts string content). + Returns the text portion as a string and any images/documents as OpenAI + content parts (to be appended as a separate ``user`` message since the + OpenAI ``tool`` role only accepts string content). """ if isinstance(content, str): return content, [] text_parts: list[str] = [] - image_parts: list[dict[str, Any]] = [] + media_parts: list[dict[str, Any]] = [] for block in content: if isinstance(block, TextBlockParam): text_parts.append(block.text) - else: - image_parts.append(_image_block_to_openai(block)) + elif isinstance(block, ImageBlockParam): + media_parts.append(_image_block_to_openai(block)) + elif isinstance(block, DocumentBlockParam): + media_parts.append(_document_block_to_openai(block)) - return "\n".join(text_parts), image_parts + return "\n".join(text_parts), media_parts def _content_block_to_openai(block: ContentBlockParam) -> dict[str, Any] | None: @@ -88,6 +103,8 @@ def _content_block_to_openai(block: ContentBlockParam) -> dict[str, Any] | None: return {"type": "text", "text": block.text} if isinstance(block, ImageBlockParam): return _image_block_to_openai(block) + if isinstance(block, DocumentBlockParam): + return _document_block_to_openai(block) if isinstance(block, (BetaThinkingBlock, BetaRedactedThinkingBlock)): return None return None @@ -164,16 +181,16 @@ def _convert_user_message( """Convert a user message's content blocks to OpenAI format. ``ToolResultBlockParam`` blocks become ``tool`` role messages. - Images inside tool results are collected and appended as a separate - ``user`` message so the model can still see them. + Images and documents inside tool results are collected and appended as a + separate ``user`` message so the model can still see them. """ - tool_result_images: list[dict[str, Any]] = [] + tool_result_media: list[dict[str, Any]] = [] content_parts: list[dict[str, Any]] = [] for block in blocks: if isinstance(block, ToolResultBlockParam): - text_content, images = _serialize_tool_result_content(block.content) - tool_result_images.extend(images) + text_content, media = _serialize_tool_result_content(block.content) + tool_result_media.extend(media) result.append( { "role": "tool", @@ -189,9 +206,9 @@ def _convert_user_message( if content_parts: result.append({"role": "user", "content": content_parts}) - # Append images from tool results as a separate user message - if tool_result_images: - result.append({"role": "user", "content": tool_result_images}) + # Append images/documents from tool results as a separate user message + if tool_result_media: + result.append({"role": "user", "content": tool_result_media}) def _to_openai_tools(tools: ToolCollection) -> list[dict[str, Any]]: diff --git a/src/askui/models/shared/agent_message_param.py b/src/askui/models/shared/agent_message_param.py index 7b82e87e..8fbf1378 100644 --- a/src/askui/models/shared/agent_message_param.py +++ b/src/askui/models/shared/agent_message_param.py @@ -49,6 +49,12 @@ class Base64ImageSourceParam(BaseModel): type: Literal["base64"] = "base64" +class Base64PdfSourceParam(BaseModel): + data: str + media_type: Literal["application/pdf"] = "application/pdf" + type: Literal["base64"] = "base64" + + class CacheControlEphemeralParam(BaseModel): type: Literal["ephemeral"] = "ephemeral" @@ -59,6 +65,12 @@ class ImageBlockParam(BaseModel): cache_control: CacheControlEphemeralParam | None = None +class DocumentBlockParam(BaseModel): + source: Base64PdfSourceParam + type: Literal["document"] = "document" + cache_control: CacheControlEphemeralParam | None = None + + class TextBlockParam(BaseModel): text: str type: Literal["text"] = "text" @@ -70,7 +82,7 @@ class ToolResultBlockParam(BaseModel): tool_use_id: str type: Literal["tool_result"] = "tool_result" cache_control: CacheControlEphemeralParam | None = None - content: str | list[TextBlockParam | ImageBlockParam] + content: str | list[TextBlockParam | ImageBlockParam | DocumentBlockParam] is_error: bool = False @@ -96,6 +108,7 @@ class BetaRedactedThinkingBlock(BaseModel): ContentBlockParam = ( ImageBlockParam + | DocumentBlockParam | TextBlockParam | ToolResultBlockParam | ToolUseBlockParam @@ -135,11 +148,13 @@ class MessageParam(BaseModel): __all__ = [ "Base64ImageSourceParam", + "Base64PdfSourceParam", "CacheControlEphemeralParam", "CitationCharLocationParam", "CitationContentBlockLocationParam", "CitationPageLocationParam", "ContentBlockParam", + "DocumentBlockParam", "ImageBlockParam", "MessageParam", "TextBlockParam", diff --git a/src/askui/models/shared/tools.py b/src/askui/models/shared/tools.py index 74912911..524151bb 100644 --- a/src/askui/models/shared/tools.py +++ b/src/askui/models/shared/tools.py @@ -14,8 +14,10 @@ from fastmcp.tools import Tool as FastMcpTool from fastmcp.utilities.types import Image as FastMcpImage from mcp import Tool as McpTool +from mcp.types import BlobResourceContents as McpBlobResourceContents from mcp.types import ImageContent as McpImageContent from mcp.types import TextContent as McpTextContent +from mcp.types import TextResourceContents as McpTextResourceContents from PIL import Image from pydantic import BaseModel, ConfigDict, Field, PrivateAttr from typing_extensions import Self @@ -23,8 +25,10 @@ from askui.models.exceptions import AutomationError from askui.models.shared.agent_message_param import ( Base64ImageSourceParam, + Base64PdfSourceParam, CacheControlEphemeralParam, ContentBlockParam, + DocumentBlockParam, ImageBlockParam, TextBlockParam, ToolParam, @@ -34,10 +38,11 @@ from askui.tools import AgentOs from askui.tools.android.agent_os import AndroidAgentOs from askui.utils.image_utils import ImageSource, base64_to_image +from askui.utils.pdf_utils import PdfSource logger = logging.getLogger(__name__) -PrimitiveToolCallResult = Image.Image | None | str | BaseModel +PrimitiveToolCallResult = Image.Image | None | str | BaseModel | PdfSource ToolCallResult = ( PrimitiveToolCallResult @@ -52,40 +57,69 @@ ] = ["image/jpeg", "image/png", "image/gif", "image/webp"] +def _convert_mcp_resource( + resource: McpTextResourceContents | McpBlobResourceContents, +) -> TextBlockParam | DocumentBlockParam | None: + """Convert an MCP embedded resource into a content block. + + Text resources become a text block; PDF blob resources become a document + block. Other binary resources are unsupported and dropped. + """ + if isinstance(resource, McpTextResourceContents): + return TextBlockParam(text=resource.text) + if resource.mimeType == "application/pdf": + return DocumentBlockParam(source=Base64PdfSourceParam(data=resource.blob)) + logger.warning( + "Unsupported embedded resource media type", + extra={"media_type": resource.mimeType}, + ) + return None + + +def _convert_call_tool_result( + result: CallToolResult, +) -> list[TextBlockParam | ImageBlockParam | DocumentBlockParam]: + _result: list[TextBlockParam | ImageBlockParam | DocumentBlockParam] = [] + for block in result.content: + match block.type: + case "text": + _result.append(TextBlockParam(text=block.text)) + case "image": + media_type = block.mimeType + if media_type not in IMAGE_MEDIA_TYPES_SUPPORTED: + logger.warning( + "Unsupported image media type", + extra={"media_type": media_type}, + ) + continue + _result.append( + ImageBlockParam( + source=Base64ImageSourceParam( + media_type=media_type, + data=block.data, + ) + ) + ) + case "resource": + converted = _convert_mcp_resource(block.resource) + if converted is not None: + _result.append(converted) + case _: + logger.warning( + "Unsupported block type", + extra={"block_type": block.type}, + ) + return _result + + def _convert_to_content( result: ToolCallResult, -) -> list[TextBlockParam | ImageBlockParam]: +) -> list[TextBlockParam | ImageBlockParam | DocumentBlockParam]: if result is None: return [] if isinstance(result, CallToolResult): - _result: list[TextBlockParam | ImageBlockParam] = [] - for block in result.content: - match block.type: - case "text": - _result.append(TextBlockParam(text=block.text)) - case "image": - media_type = block.mimeType - if media_type not in IMAGE_MEDIA_TYPES_SUPPORTED: - logger.warning( - "Unsupported image media type", - extra={"media_type": media_type}, - ) - continue - _result.append( - ImageBlockParam( - source=Base64ImageSourceParam( - media_type=media_type, - data=block.data, - ) - ) - ) - case _: - logger.warning( - "Unsupported block type", - extra={"block_type": block.type}, - ) - return _result + return _convert_call_tool_result(result) if isinstance(result, str): return [TextBlockParam(text=result)] @@ -97,6 +131,13 @@ def _convert_to_content( for item in sublist ] + if isinstance(result, PdfSource): + return [ + DocumentBlockParam( + source=Base64PdfSourceParam(data=result.to_base64()), + ) + ] + if isinstance(result, BaseModel): return [TextBlockParam(text=result.model_dump_json())] diff --git a/src/askui/models/shared/truncation_strategies.py b/src/askui/models/shared/truncation_strategies.py index baf2d023..cc20ae60 100644 --- a/src/askui/models/shared/truncation_strategies.py +++ b/src/askui/models/shared/truncation_strategies.py @@ -11,6 +11,7 @@ Base64ImageSourceParam, CacheControlEphemeralParam, ContentBlockParam, + DocumentBlockParam, ImageBlockParam, MessageParam, TextBlockParam, @@ -599,7 +600,9 @@ def _strip_base64_images( elif isinstance(block, ToolResultBlockParam) and isinstance( block.content, list ): - new_nested: list[TextBlockParam | ImageBlockParam] = [] + new_nested: list[ + TextBlockParam | ImageBlockParam | DocumentBlockParam + ] = [] for nested in block.content: if ( stripped < max_to_strip diff --git a/src/askui/reporting.py b/src/askui/reporting.py index 78c521f9..4090ea9b 100644 --- a/src/askui/reporting.py +++ b/src/askui/reporting.py @@ -65,33 +65,43 @@ def _format_duration(seconds: float) -> str: return base -def truncate_base64_images(content: Any) -> Any: - """Replace base64 image data with a placeholder to keep reports readable. +def _base64_media_label(media_type: str) -> str: + """Return a human-friendly label for a base64 source's ``media_type``.""" + if media_type == "application/pdf": + return "PDF" + if media_type.startswith("image/"): + return "image" + return media_type + + +def truncate_base64_media(content: Any) -> Any: + """Replace base64 media data with a placeholder to keep reports readable. Walks the message content recursively and replaces the ``data`` field of - any base64 image source block (matching the schema of - ``Base64ImageSourceParam``, i.e. ``{"type": "base64", "data": "...", - "media_type": "image/..."}``) with a placeholder string. All other - content (including regular long strings like prompts or tool outputs) - is left untouched. + any base64 source block (matching the schema of ``Base64ImageSourceParam`` + / ``Base64PdfSourceParam``, i.e. ``{"type": "base64", "data": "...", + "media_type": "..."}``) with a placeholder string. This covers screenshots, + images, and PDF documents alike, so large binary blobs never bloat the + report. All other content (including regular long strings like prompts or + tool outputs) is left untouched. """ if isinstance(content, dict): content_dict: dict[Any, Any] = content media_type = content_dict.get("media_type") if ( isinstance(media_type, str) - and media_type.startswith("image/") and content_dict.get("type") == "base64" and "data" in content_dict ): - return {**content_dict, "data": "[Base64 image data truncated]"} + label = _base64_media_label(media_type) + return {**content_dict, "data": f"[Base64 {label} data truncated]"} return { - key: truncate_base64_images(value) for key, value in content_dict.items() + key: truncate_base64_media(value) for key, value in content_dict.items() } if isinstance(content, list): content_list: list[Any] = content - return [truncate_base64_images(item) for item in content_list] + return [truncate_base64_media(item) for item in content_list] return content @@ -414,7 +424,7 @@ def add_message( self._start_time = datetime.now(tz=timezone.utc) _images = normalize_to_pil_images(image) - _content = truncate_base64_images(content) + _content = truncate_base64_media(content) timestamp = datetime.now(tz=timezone.utc) formatted_content = self._format_content(_content) diff --git a/src/askui/tools/agent_os.py b/src/askui/tools/agent_os.py index af9cc96d..b72f9e74 100644 --- a/src/askui/tools/agent_os.py +++ b/src/askui/tools/agent_os.py @@ -5,6 +5,7 @@ from pydantic import BaseModel, ConfigDict, Field from askui.models.shared.tool_tags import ToolTags +from askui.utils.pdf_utils import PdfSource if TYPE_CHECKING: from askui.tools.askui.askui_ui_controller_grpc.generated import ( @@ -698,18 +699,18 @@ def get_file_names(self, absolute_directory_path: str) -> list[str]: """ raise NotImplementedError - def get_file(self, path: str) -> Image.Image | str: + def get_file(self, path: str) -> Image.Image | PdfSource | str: """ Read a file from the automation target (desktop Agent OS). - Binary image payloads are returned as `PIL.Image.Image` when recognized; - otherwise UTF-8 text when decodable. + Binary image payloads are returned as `PIL.Image.Image` when recognized, + PDF documents as `PdfSource`; otherwise UTF-8 text when decodable. Args: path (str): File path on the target system. Returns: - Image.Image | str: Decoded file contents. + Image.Image | PdfSource | str: Decoded file contents. Raises: NotImplementedError: If the implementation does not support this operation. diff --git a/src/askui/tools/askui/askui_controller.py b/src/askui/tools/askui/askui_controller.py index 4e2f8c4f..85dcb528 100644 --- a/src/askui/tools/askui/askui_controller.py +++ b/src/askui/tools/askui/askui_controller.py @@ -6,9 +6,11 @@ import time import types import uuid +from io import BytesIO from typing import Literal, Type import grpc +from filetype import guess # type: ignore[import-untyped] from google.protobuf.json_format import MessageToDict from PIL import Image from typing_extensions import Self, override @@ -76,7 +78,7 @@ GetSystemInfoResponseModel, ) from askui.utils.annotated_image import AnnotatedImage -from askui.utils.image_utils import base64_to_image +from askui.utils.pdf_utils import PdfSource from ..utils import process_exists, wait_for_port from .exceptions import ( @@ -1345,21 +1347,22 @@ def get_file_names(self, absolute_directory_path: str) -> list[str]: ) return res.response.fileNames - def get_file(self, path: str) -> Image.Image | str: + def get_file(self, path: str) -> Image.Image | PdfSource | str: """ Get the contents of a file at the given path on the device under automation. The controller returns the file as a Base64-encoded string, which is decoded and returned as `PIL.Image.Image` when the bytes can be opened - as an image (PNG, JPEG, BMP, GIF, WebP, TIFF, ...), or as `str` when - they decode cleanly as UTF-8 text. + as an image (PNG, JPEG, BMP, GIF, WebP, TIFF, ...), as `PdfSource` when + the bytes are a PDF document, or as `str` when they decode cleanly as + UTF-8 text. Args: path (str): The file path to read on the device under automation. Returns: - Image.Image | str: The decoded file contents. + Image.Image | PdfSource | str: The decoded file contents. Raises: DesktopAgentOsError: If the file cannot be read or the response is invalid. @@ -1386,6 +1389,11 @@ def get_file(self, path: str) -> Image.Image | str: ) return decoded + if isinstance(decoded, PdfSource): + detail = f"PDF ({len(decoded.to_bytes())} bytes)" + self._reporter.add_message("AgentOS", f"get_file({path}) -> {detail}") + return decoded + detail = f"text ({len(decoded)} chars)" self._reporter.add_message("AgentOS", f"get_file({path}) -> {detail}") return decoded @@ -1404,16 +1412,18 @@ def remove_virtual_displays(self) -> None: self._reporter.add_message("AgentOS", "remove_virtual_displays() -> done") @staticmethod - def _decode_file_payload(base64_data: str) -> Image.Image | str: - try: - return base64_to_image(base64_data) - except ValueError: - pass + def _decode_file_payload(base64_data: str) -> Image.Image | PdfSource | str: data = base64.b64decode(base64_data, validate=True) + kind = guess(data) + mime: str | None = kind.mime if kind is not None else None + if mime is not None and mime.startswith("image/"): + return Image.open(BytesIO(data)) + if mime == "application/pdf": + return PdfSource(data) if b"\x00" not in data: try: return data.decode("utf-8") except UnicodeDecodeError: pass - message = "File contents are neither a supported image nor UTF-8 text" + message = "File contents are neither a supported image, PDF, nor UTF-8 text" raise DesktopAgentOsError(message) diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index 57c7efa4..bc1545a9 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -17,6 +17,7 @@ ) from askui.tools.askui.askui_controller import RenderObjectStyle # noqa: TC001 from askui.tools.coordinate_scaler import CoordinateScaler +from askui.utils.pdf_utils import PdfSource if TYPE_CHECKING: from askui.tools.askui.askui_ui_controller_grpc.generated import ( @@ -336,7 +337,7 @@ def get_file_names(self, absolute_directory_path: str) -> list[str]: """ return self._agent_os.get_file_names(absolute_directory_path) - def get_file(self, path: str) -> Image.Image | str: + def get_file(self, path: str) -> Image.Image | PdfSource | str: """ Read a file from the automation target. @@ -344,7 +345,9 @@ def get_file(self, path: str) -> Image.Image | str: path (str): File path on the target system. Returns: - Image.Image | str: Decoded file contents. + Image.Image | PdfSource | str: Decoded file contents. Images are + scaled to the model's coordinate space; PDFs and text pass + through unchanged. """ response = self._agent_os.get_file(path) if isinstance(response, Image.Image): diff --git a/src/askui/tools/store/computer/experimental/get_file.py b/src/askui/tools/store/computer/experimental/get_file.py index b7bf5c93..c751c280 100644 --- a/src/askui/tools/store/computer/experimental/get_file.py +++ b/src/askui/tools/store/computer/experimental/get_file.py @@ -2,6 +2,7 @@ from askui.models.shared import ComputerBaseTool, ToolTags from askui.tools.agent_os import AgentOs +from askui.utils.pdf_utils import PdfSource class ComputerGetFileTool(ComputerBaseTool): @@ -29,8 +30,9 @@ def __init__(self, agent_os: AgentOs | None = None) -> None: name="get_file_tool", description=( "Reads a file at an absolute path on the computer under automation. " - "Returns UTF-8 text as a string, or a decoded image for " - "supported image formats. Unsupported binary types are rejected." + "Returns UTF-8 text as a string, a decoded image for supported " + "image formats, or a PDF document. Unsupported binary types are " + "rejected." ), input_schema={ "type": "object", @@ -51,5 +53,5 @@ def __init__(self, agent_os: AgentOs | None = None) -> None: ) self.is_cacheable = True - def __call__(self, absolute_file_path: str) -> Image.Image | str: + def __call__(self, absolute_file_path: str) -> Image.Image | PdfSource | str: return self.agent_os.get_file(absolute_file_path) diff --git a/src/askui/tools/store/universal/__init__.py b/src/askui/tools/store/universal/__init__.py index 1b3e1f0b..536b313f 100644 --- a/src/askui/tools/store/universal/__init__.py +++ b/src/askui/tools/store/universal/__init__.py @@ -7,6 +7,7 @@ from .get_current_time import GetCurrentTimeTool from .list_files_tool import ListFilesTool from .load_image_tool import LoadImageTool +from .load_pdf_tool import LoadPdfTool from .print_to_console import PrintToConsoleTool from .read_from_file_tool import ReadFromFileTool from .wait_tool import WaitTool @@ -24,4 +25,5 @@ "WaitWithProgressTool", "WriteToFileTool", "LoadImageTool", + "LoadPdfTool", ] diff --git a/src/askui/tools/store/universal/load_pdf_tool.py b/src/askui/tools/store/universal/load_pdf_tool.py new file mode 100644 index 00000000..0feb819e --- /dev/null +++ b/src/askui/tools/store/universal/load_pdf_tool.py @@ -0,0 +1,111 @@ +from pathlib import Path + +from askui.models.shared.tools import Tool +from askui.utils.pdf_utils import PdfSource + + +class LoadPdfTool(Tool): + """ + Tool for loading PDF documents from a directory on the filesystem. + + This tool allows the agent to load PDF files and hand them to the model for + analysis. The PDF is passed through unchanged as a document, so the model + can reason about text, tables, charts, and layout. Use it to read reports, + contracts, forms, or any other PDF-based content during execution. + + Args: + base_dir (str | Path): The base directory path where PDFs will be loaded + from. All PDF paths will be relative to this directory. + + Example: + ```python + from askui import ComputerAgent + from askui.tools.store.universal import LoadPdfTool + + with ComputerAgent() as agent: + agent.act( + "Summarize the key findings in 'report.pdf'", + tools=[LoadPdfTool(base_dir="documents")], + ) + ``` + + Example: + ```python + from askui import ComputerAgent + from askui.tools.store.universal import LoadPdfTool + + with ComputerAgent( + act_tools=[LoadPdfTool(base_dir="documents")] + ) as agent: + agent.act("Summarize the key findings in 'report.pdf'") + ``` + """ + + def __init__(self, base_dir: str | Path) -> None: + if not isinstance(base_dir, Path): + base_dir = Path(base_dir) + absolute = base_dir.absolute() + super().__init__( + name="load_pdf_tool", + description=( + "Loads a PDF document from the filesystem and returns it for " + f"analysis. The base directory is set to '{absolute}' during tool " + "initialization. All PDF paths are relative to this base directory. " + "The document is passed to the model in full (every page as both " + "text and image), so use this tool to read reports, contracts, " + "forms, or any PDF-based content, and to reason about its text, " + "tables, charts, and layout." + ), + input_schema={ + "type": "object", + "properties": { + "pdf_path": { + "type": "string", + "description": ( + "The relative path of the PDF file to load. The path is " + f"relative to the base directory '{absolute}' specified " + "during tool initialization. For example, if pdf_path is " + "'reports/q4.pdf', the PDF will be loaded from " + f"'{absolute}/reports/q4.pdf'." + ), + }, + }, + "required": [ + "pdf_path", + ], + }, + ) + self._base_dir = base_dir + self.is_cacheable = True + + def __call__(self, pdf_path: str = "") -> tuple[str, PdfSource]: + """ + Load a PDF from the specified path and return it for processing. + + Args: + pdf_path (str): The relative path of the PDF file to load, relative to + the base directory specified during tool initialization. + + Returns: + tuple[str, PdfSource]: A tuple containing a confirmation message + indicating the PDF was successfully loaded (including the full + absolute path) and the loaded `PdfSource` respectively. + + Raises: + FileNotFoundError: If the PDF file does not exist at the specified path. + FileExistsError: If the path exists but is not a file (e.g., a directory). + """ + absolute_pdf_path = self._base_dir / pdf_path + + if not absolute_pdf_path.exists(): + error_msg = f"PDF not found: {absolute_pdf_path}" + raise FileNotFoundError(error_msg) + + if not absolute_pdf_path.is_file(): + error_msg = f"Path is not a file: {absolute_pdf_path}" + raise FileExistsError(error_msg) + + return ( + f"PDF was successfully loaded from {absolute_pdf_path}", + PdfSource(absolute_pdf_path), + ) diff --git a/src/askui/utils/pdf_utils.py b/src/askui/utils/pdf_utils.py index 65a3170e..629548f7 100644 --- a/src/askui/utils/pdf_utils.py +++ b/src/askui/utils/pdf_utils.py @@ -1,3 +1,4 @@ +import base64 from io import BufferedReader, BytesIO from pathlib import Path @@ -27,6 +28,32 @@ def reader(self) -> BufferedReader | BytesIO: return self.root.open("rb") return BytesIO(self.root) + def to_bytes(self) -> bytes: + """Read the PDF source into bytes. + + Returns: + bytes: The PDF as bytes. + """ + with self.reader as reader: + return reader.read() + + def to_base64(self) -> str: + """Convert the PDF to a base64 string. + + Returns: + str: A base64 encoded string of the PDF. + """ + return base64.b64encode(self.to_bytes()).decode("utf-8") + + def to_data_url(self) -> str: + """Convert the PDF to a data URL. + + Returns: + str: A data URL string in the format + `"data:application/pdf;base64,..."`. + """ + return f"data:application/pdf;base64,{self.to_base64()}" + __all__ = [ "PdfSource", diff --git a/tests/unit/models/anthropic/__init__.py b/tests/unit/models/anthropic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/models/anthropic/test_get_model.py b/tests/unit/models/anthropic/test_get_model.py new file mode 100644 index 00000000..88fe81f5 --- /dev/null +++ b/tests/unit/models/anthropic/test_get_model.py @@ -0,0 +1,82 @@ +"""Unit tests for `AnthropicGetModel`. + +Claude processes PDFs server-side (each page as text + image), so `get()` sends +a PDF as a base64 `document` block rather than rasterising it client-side. +Images keep the existing resize-and-image-block path; Office documents remain +unsupported. +""" + +import base64 +from unittest.mock import MagicMock + +import pytest +from PIL import Image + +from askui.models.anthropic.get_model import AnthropicGetModel +from askui.models.shared.agent_message_param import ( + DocumentBlockParam, + ImageBlockParam, + MessageParam, + TextBlockParam, +) +from askui.models.shared.messages_api import MessagesApi +from askui.models.shared.settings import GetSettings +from askui.utils.excel_utils import OfficeDocumentSource +from askui.utils.image_utils import ImageSource +from askui.utils.pdf_utils import PdfSource + +_PDF_BYTES = b"%PDF-1.4\n1 0 obj<<>>endobj\ntrailer<<>>\n%%EOF" + + +def _model_returning(text: str) -> tuple[AnthropicGetModel, MagicMock]: + messages_api = MagicMock(spec=MessagesApi) + messages_api.create_message.return_value = MessageParam( + role="assistant", content=[TextBlockParam(text=text)] + ) + model = AnthropicGetModel(model_id="claude-sonnet-4-6", messages_api=messages_api) + return model, messages_api + + +class TestAnthropicGetModel: + def test_pdf_source_sends_document_block(self) -> None: + model, messages_api = _model_returning("42 pages") + + result = model.get( + query="How many pages?", + source=PdfSource(_PDF_BYTES), + response_schema=None, + get_settings=GetSettings(), + ) + + assert result == "42 pages" + blocks = messages_api.create_message.call_args.kwargs["messages"][0].content + assert isinstance(blocks[0], DocumentBlockParam) + assert blocks[0].source.media_type == "application/pdf" + assert base64.b64decode(blocks[0].source.data) == _PDF_BYTES + assert isinstance(blocks[1], TextBlockParam) + assert blocks[1].text == "How many pages?" + + def test_image_source_still_sends_image_block(self) -> None: + model, messages_api = _model_returning("a submit button") + + result = model.get( + query="What is shown?", + source=ImageSource(Image.new("RGB", (10, 10))), + response_schema=None, + get_settings=GetSettings(), + ) + + assert result == "a submit button" + blocks = messages_api.create_message.call_args.kwargs["messages"][0].content + assert isinstance(blocks[0], ImageBlockParam) + + def test_office_document_remains_unsupported(self) -> None: + model, _ = _model_returning("unused") + + with pytest.raises(NotImplementedError, match="Office Document"): + model.get( + query="Describe", + source=MagicMock(spec=OfficeDocumentSource), + response_schema=None, + get_settings=GetSettings(), + ) diff --git a/tests/unit/models/openai/test_get_model.py b/tests/unit/models/openai/test_get_model.py index fef21b9b..f7a0ab85 100644 --- a/tests/unit/models/openai/test_get_model.py +++ b/tests/unit/models/openai/test_get_model.py @@ -69,25 +69,35 @@ def test_no_response_raises_error(self) -> None: get_settings=GetSettings(), ) - def test_pdf_source_not_supported(self) -> None: + def test_pdf_source_sends_file_part(self) -> None: mock_client = MagicMock() + mock_client.chat.completions.create.return_value = _make_completion("3 pages") + source = MagicMock(spec=PdfSource) + source.to_data_url.return_value = "data:application/pdf;base64,abc" - model = OpenAIGetModel(model_id="qwen2.5vl", client=mock_client) - with pytest.raises(NotImplementedError, match="PDF or Office Document"): - model.get( - query="Describe", - source=source, - response_schema=None, - get_settings=GetSettings(), - ) + model = OpenAIGetModel(model_id="gpt-4o", client=mock_client) + result = model.get( + query="How many pages?", + source=source, + response_schema=None, + get_settings=GetSettings(), + ) + + assert result == "3 pages" + content = mock_client.chat.completions.create.call_args.kwargs["messages"][0][ + "content" + ] + file_part = next(part for part in content if part["type"] == "file") + assert file_part["file"]["file_data"] == "data:application/pdf;base64,abc" + assert file_part["file"]["filename"] == "document.pdf" def test_office_document_source_not_supported(self) -> None: mock_client = MagicMock() source = MagicMock(spec=OfficeDocumentSource) model = OpenAIGetModel(model_id="qwen2.5vl", client=mock_client) - with pytest.raises(NotImplementedError, match="PDF or Office Document"): + with pytest.raises(NotImplementedError, match="Office Document"): model.get( query="Describe", source=source, diff --git a/tests/unit/models/openai/test_messages_api.py b/tests/unit/models/openai/test_messages_api.py index 22fbcbce..5a6b7082 100644 --- a/tests/unit/models/openai/test_messages_api.py +++ b/tests/unit/models/openai/test_messages_api.py @@ -21,8 +21,10 @@ ) from askui.models.shared.agent_message_param import ( Base64ImageSourceParam, + Base64PdfSourceParam, BetaRedactedThinkingBlock, BetaThinkingBlock, + DocumentBlockParam, ImageBlockParam, MessageParam, TextBlockParam, @@ -114,7 +116,7 @@ def test_string_content(self) -> None: assert images == [] def test_text_blocks(self) -> None: - content: list[TextBlockParam | ImageBlockParam] = [ + content: list[TextBlockParam | ImageBlockParam | DocumentBlockParam] = [ TextBlockParam(text="line1"), TextBlockParam(text="line2"), ] @@ -123,7 +125,7 @@ def test_text_blocks(self) -> None: assert images == [] def test_image_blocks_extracted(self) -> None: - content: list[TextBlockParam | ImageBlockParam] = [ + content: list[TextBlockParam | ImageBlockParam | DocumentBlockParam] = [ TextBlockParam(text="screenshot"), ImageBlockParam( source=Base64ImageSourceParam(data="abc", media_type="image/png") @@ -134,6 +136,17 @@ def test_image_blocks_extracted(self) -> None: assert len(images) == 1 assert images[0]["type"] == "image_url" + def test_document_blocks_become_file_parts(self) -> None: + content: list[TextBlockParam | ImageBlockParam | DocumentBlockParam] = [ + TextBlockParam(text="report attached"), + DocumentBlockParam(source=Base64PdfSourceParam(data="cGRm")), + ] + text, media = _serialize_tool_result_content(content) + assert text == "report attached" + assert len(media) == 1 + assert media[0]["type"] == "file" + assert media[0]["file"]["file_data"] == "data:application/pdf;base64,cGRm" + class TestToOpenaiMessages: def test_simple_text_message(self) -> None: diff --git a/tests/unit/models/shared/__init__.py b/tests/unit/models/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/models/shared/test_tool_call_result.py b/tests/unit/models/shared/test_tool_call_result.py new file mode 100644 index 00000000..bb2328d0 --- /dev/null +++ b/tests/unit/models/shared/test_tool_call_result.py @@ -0,0 +1,137 @@ +"""Tests for converting tool-call results into provider-neutral content blocks. + +Tools may return a `PdfSource` to hand a PDF document to the model, mirroring +how returning a `PIL.Image` produces an image block. Anthropic accepts +`document` blocks inside `tool_result` content (base64 PDF, no beta header), so +a returned PDF is converted into a `DocumentBlockParam`. +""" + +import base64 +from typing import cast + +from fastmcp.client.client import CallToolResult +from mcp.types import ( + BlobResourceContents, + EmbeddedResource, + TextResourceContents, +) +from PIL import Image + +from askui.models.anthropic.messages_api import from_content_block +from askui.models.shared.agent_message_param import ( + Base64PdfSourceParam, + DocumentBlockParam, + ImageBlockParam, + TextBlockParam, + ToolResultBlockParam, +) +from askui.models.shared.tools import ( + _convert_call_tool_result, + _convert_mcp_resource, + _convert_to_content, +) +from askui.utils.pdf_utils import PdfSource + +# Smallest payload that is unambiguously a PDF; we only base64-encode the bytes, +# never parse them. +_PDF_BYTES = b"%PDF-1.4\n1 0 obj<<>>endobj\ntrailer<<>>\n%%EOF" + + +class TestConvertPdfToolResult: + def test_pdf_source_becomes_document_block(self) -> None: + result = _convert_to_content(PdfSource(_PDF_BYTES)) + + assert len(result) == 1 + block = result[0] + assert isinstance(block, DocumentBlockParam) + assert isinstance(block.source, Base64PdfSourceParam) + assert block.source.media_type == "application/pdf" + assert base64.b64decode(block.source.data) == _PDF_BYTES + + def test_pdf_alongside_text_and_image_preserves_order(self) -> None: + result = _convert_to_content( + [ + "see the attached report", + PdfSource(_PDF_BYTES), + Image.new("RGB", (2, 2)), + ] + ) + + assert [type(block) for block in result] == [ + TextBlockParam, + DocumentBlockParam, + ImageBlockParam, + ] + + def test_document_in_tool_result_serializes_for_anthropic(self) -> None: + block = ToolResultBlockParam( + tool_use_id="toolu_1", + content=[ + DocumentBlockParam(source=Base64PdfSourceParam(data="cGRm")), + ], + ) + + dumped = cast("dict", from_content_block(block)) + + document = dumped["content"][0] + assert document["type"] == "document" + assert document["source"] == { + "type": "base64", + "media_type": "application/pdf", + "data": "cGRm", + } + + +class TestConvertMcpResource: + """An MCP tool returns a PDF as an embedded blob resource, not an image.""" + + def test_pdf_blob_resource_becomes_document_block(self) -> None: + resource = BlobResourceContents( + uri="file:///doc.pdf", + mimeType="application/pdf", + blob=base64.b64encode(_PDF_BYTES).decode(), + ) + + block = _convert_mcp_resource(resource) + + assert isinstance(block, DocumentBlockParam) + assert base64.b64decode(block.source.data) == _PDF_BYTES + + def test_text_resource_becomes_text_block(self) -> None: + resource = TextResourceContents(uri="file:///a.txt", text="hello") + + block = _convert_mcp_resource(resource) + + assert isinstance(block, TextBlockParam) + assert block.text == "hello" + + def test_unsupported_blob_resource_is_dropped(self) -> None: + resource = BlobResourceContents( + uri="file:///a.bin", + mimeType="application/octet-stream", + blob="QUJD", + ) + + assert _convert_mcp_resource(resource) is None + + def test_embedded_pdf_resource_in_call_tool_result(self) -> None: + result = CallToolResult( + content=[ + EmbeddedResource( + type="resource", + resource=BlobResourceContents( + uri="file:///doc.pdf", + mimeType="application/pdf", + blob=base64.b64encode(_PDF_BYTES).decode(), + ), + ) + ], + structured_content=None, + meta=None, + ) + + blocks = _convert_call_tool_result(result) + + assert len(blocks) == 1 + assert isinstance(blocks[0], DocumentBlockParam) + assert base64.b64decode(blocks[0].source.data) == _PDF_BYTES diff --git a/tests/unit/test_reporting.py b/tests/unit/test_reporting.py new file mode 100644 index 00000000..1e87779c --- /dev/null +++ b/tests/unit/test_reporting.py @@ -0,0 +1,60 @@ +"""Tests for reporting helpers. + +`truncate_base64_media` keeps HTML/text reports readable by replacing large +base64 blobs (screenshots, images, PDF documents) with a short placeholder, +rather than dumping the full encoded payload into the report. +""" + +from typing import Any + +from askui.reporting import truncate_base64_media + + +def _base64_source(media_type: str) -> dict[str, Any]: + return {"type": "base64", "media_type": media_type, "data": "QUJD" * 5000} + + +class TestTruncateBase64Media: + def test_truncates_image_with_friendly_label(self) -> None: + result = truncate_base64_media( + {"type": "image", "source": _base64_source("image/png")} + ) + assert result["source"]["data"] == "[Base64 image data truncated]" + + def test_truncates_pdf_with_friendly_label(self) -> None: + result = truncate_base64_media( + {"type": "document", "source": _base64_source("application/pdf")} + ) + assert result["source"]["data"] == "[Base64 PDF data truncated]" + + def test_truncates_pdf_inside_tool_result(self) -> None: + content = [ + { + "type": "tool_result", + "tool_use_id": "toolu_1", + "content": [ + {"type": "text", "text": "PDF loaded"}, + {"type": "document", "source": _base64_source("application/pdf")}, + ], + } + ] + + result = truncate_base64_media(content) + + document = result[0]["content"][1] + assert document["source"]["data"] == "[Base64 PDF data truncated]" + # Non-media content is left untouched. + assert result[0]["content"][0]["text"] == "PDF loaded" + + def test_unknown_media_type_falls_back_to_raw_type(self) -> None: + result = truncate_base64_media( + {"type": "input_audio", "source": _base64_source("audio/wav")} + ) + assert result["source"]["data"] == "[Base64 audio/wav data truncated]" + + def test_leaves_plain_content_untouched(self) -> None: + assert truncate_base64_media("just a prompt") == "just a prompt" + assert truncate_base64_media({"type": "text", "text": "hello"}) == { + "type": "text", + "text": "hello", + } diff --git a/tests/unit/tools/askui/test_decode_file_payload.py b/tests/unit/tools/askui/test_decode_file_payload.py new file mode 100644 index 00000000..6d898e83 --- /dev/null +++ b/tests/unit/tools/askui/test_decode_file_payload.py @@ -0,0 +1,51 @@ +"""Tests for `AskUiControllerClient._decode_file_payload`. + +`get_file` decodes a Base64 payload from the controller and dispatches by the +detected MIME type (via ``filetype.guess``) rather than by trying to parse it as +each type in turn: images become `PIL.Image.Image`, PDFs become `PdfSource`, +and anything that decodes cleanly as UTF-8 becomes a string. +""" + +import base64 +import io + +import pytest +from PIL import Image + +from askui.tools.askui.askui_controller import ( + AskUiControllerClient, + DesktopAgentOsError, +) +from askui.utils.pdf_utils import PdfSource + +_PDF_BYTES = b"%PDF-1.4\n1 0 obj<<>>endobj\ntrailer<<>>\n%%EOF" + + +def _b64(data: bytes) -> str: + return base64.b64encode(data).decode("ascii") + + +def _png_b64() -> str: + buffer = io.BytesIO() + Image.new("RGB", (2, 2), "red").save(buffer, format="PNG") + return _b64(buffer.getvalue()) + + +class TestDecodeFilePayload: + def test_decodes_image(self) -> None: + result = AskUiControllerClient._decode_file_payload(_png_b64()) + assert isinstance(result, Image.Image) + assert result.format == "PNG" + + def test_decodes_pdf(self) -> None: + result = AskUiControllerClient._decode_file_payload(_b64(_PDF_BYTES)) + assert isinstance(result, PdfSource) + assert result.to_bytes() == _PDF_BYTES + + def test_decodes_utf8_text(self) -> None: + result = AskUiControllerClient._decode_file_payload(_b64(b"hello world")) + assert result == "hello world" + + def test_rejects_unsupported_binary(self) -> None: + with pytest.raises(DesktopAgentOsError): + AskUiControllerClient._decode_file_payload(_b64(b"\x00\x01\x02\x03")) diff --git a/tests/unit/tools/store/__init__.py b/tests/unit/tools/store/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/tools/store/test_load_pdf_tool.py b/tests/unit/tools/store/test_load_pdf_tool.py new file mode 100644 index 00000000..2a355296 --- /dev/null +++ b/tests/unit/tools/store/test_load_pdf_tool.py @@ -0,0 +1,62 @@ +"""Tests for `LoadPdfTool`. + +The tool loads a PDF from a base directory and returns it as a `PdfSource`, the +PDF counterpart to `LoadImageTool`. When the tool result is converted into +content blocks the `PdfSource` becomes a `document` block, so the model receives +the PDF in full. +""" + +import base64 +from pathlib import Path + +import pytest + +from askui.models.shared.agent_message_param import ( + Base64PdfSourceParam, + DocumentBlockParam, +) +from askui.models.shared.tools import _convert_to_content +from askui.tools.store.universal import LoadPdfTool +from askui.utils.pdf_utils import PdfSource + +_PDF_BYTES = b"%PDF-1.4\n1 0 obj<<>>endobj\ntrailer<<>>\n%%EOF" + + +class TestLoadPdfTool: + def test_loads_pdf_relative_to_base_dir(self, tmp_path: Path) -> None: + (tmp_path / "reports").mkdir() + (tmp_path / "reports" / "q4.pdf").write_bytes(_PDF_BYTES) + tool = LoadPdfTool(base_dir=tmp_path) + + message, source = tool(pdf_path="reports/q4.pdf") + + assert isinstance(source, PdfSource) + assert source.to_bytes() == _PDF_BYTES + assert "reports/q4.pdf" in message.replace("\\", "/") + + def test_result_converts_to_document_block(self, tmp_path: Path) -> None: + (tmp_path / "doc.pdf").write_bytes(_PDF_BYTES) + tool = LoadPdfTool(base_dir=tmp_path) + + blocks = _convert_to_content(tool(pdf_path="doc.pdf")) + + # tuple result -> confirmation text block + document block + document = next(b for b in blocks if isinstance(b, DocumentBlockParam)) + assert isinstance(document.source, Base64PdfSourceParam) + assert base64.b64decode(document.source.data) == _PDF_BYTES + + def test_missing_file_raises(self, tmp_path: Path) -> None: + tool = LoadPdfTool(base_dir=tmp_path) + + with pytest.raises(FileNotFoundError): + tool(pdf_path="nope.pdf") + + def test_directory_path_raises(self, tmp_path: Path) -> None: + (tmp_path / "sub").mkdir() + tool = LoadPdfTool(base_dir=tmp_path) + + with pytest.raises(FileExistsError): + tool(pdf_path="sub") + + def test_is_cacheable(self, tmp_path: Path) -> None: + assert LoadPdfTool(base_dir=tmp_path).is_cacheable is True From b3e9400cd64308fdb76380ccf1e9d5fe6b238468 Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Tue, 30 Jun 2026 13:10:39 +0200 Subject: [PATCH 2/3] address review comments --- src/askui/models/anthropic/messages_api.py | 1 + src/askui/models/openai/get_model.py | 4 +- src/askui/models/openai/messages_api.py | 3 +- .../models/shared/agent_message_param.py | 1 + src/askui/models/shared/tools.py | 49 +++++++++-- .../models/shared/truncation_strategies.py | 53 +++++++---- .../tools/store/universal/load_pdf_tool.py | 10 ++- src/askui/utils/pdf_utils.py | 64 ++++++++++++++ tests/unit/models/openai/test_get_model.py | 4 +- .../models/shared/test_tool_call_result.py | 87 ++++++++++++++++++- .../unit/models/test_truncation_strategies.py | 64 ++++++++++++++ tests/unit/tools/store/test_load_pdf_tool.py | 2 +- tests/unit/utils/test_pdf_utils.py | 59 +++++++++++++ 13 files changed, 369 insertions(+), 32 deletions(-) create mode 100644 tests/unit/utils/test_pdf_utils.py diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py index 4cbe0950..fd8ea39d 100644 --- a/src/askui/models/anthropic/messages_api.py +++ b/src/askui/models/anthropic/messages_api.py @@ -125,6 +125,7 @@ def built_messages_for_get_pdf( source=Base64PdfSourceParam( data=pdf_source.to_base64(), ), + title=pdf_source.filename, ), TextBlockParam( text=prompt, diff --git a/src/askui/models/openai/get_model.py b/src/askui/models/openai/get_model.py index 86e049db..861c3095 100644 --- a/src/askui/models/openai/get_model.py +++ b/src/askui/models/openai/get_model.py @@ -17,7 +17,7 @@ from askui.models.types.response_schemas import ResponseSchema, to_response_schema from askui.prompts.get_prompts import SYSTEM_PROMPT_GET from askui.utils.excel_utils import OfficeDocumentSource -from askui.utils.pdf_utils import PdfSource +from askui.utils.pdf_utils import DEFAULT_PDF_FILENAME, PdfSource from askui.utils.source_utils import Source logger = logging.getLogger(__name__) @@ -156,7 +156,7 @@ def get( source_part: dict[str, Any] = { "type": "file", "file": { - "filename": "document.pdf", + "filename": source.filename or DEFAULT_PDF_FILENAME, "file_data": source.to_data_url(), }, } diff --git a/src/askui/models/openai/messages_api.py b/src/askui/models/openai/messages_api.py index 612fe3f8..51eacc0d 100644 --- a/src/askui/models/openai/messages_api.py +++ b/src/askui/models/openai/messages_api.py @@ -30,6 +30,7 @@ from askui.models.shared.messages_api import MessagesApi from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection +from askui.utils.pdf_utils import DEFAULT_PDF_FILENAME logger = logging.getLogger(__name__) @@ -63,7 +64,7 @@ def _document_block_to_openai(block: DocumentBlockParam) -> dict[str, Any]: return { "type": "file", "file": { - "filename": "document.pdf", + "filename": block.title or DEFAULT_PDF_FILENAME, "file_data": data_url, }, } diff --git a/src/askui/models/shared/agent_message_param.py b/src/askui/models/shared/agent_message_param.py index 8fbf1378..d2794397 100644 --- a/src/askui/models/shared/agent_message_param.py +++ b/src/askui/models/shared/agent_message_param.py @@ -68,6 +68,7 @@ class ImageBlockParam(BaseModel): class DocumentBlockParam(BaseModel): source: Base64PdfSourceParam type: Literal["document"] = "document" + title: str | None = None cache_control: CacheControlEphemeralParam | None = None diff --git a/src/askui/models/shared/tools.py b/src/askui/models/shared/tools.py index 6c019c9e..fc461e2d 100644 --- a/src/askui/models/shared/tools.py +++ b/src/askui/models/shared/tools.py @@ -1,3 +1,4 @@ +import base64 import logging import re import types @@ -15,11 +16,12 @@ from fastmcp.utilities.types import Image as FastMcpImage from mcp import Tool as McpTool from mcp.types import BlobResourceContents as McpBlobResourceContents +from mcp.types import EmbeddedResource as McpEmbeddedResource from mcp.types import ImageContent as McpImageContent from mcp.types import TextContent as McpTextContent from mcp.types import TextResourceContents as McpTextResourceContents from PIL import Image -from pydantic import BaseModel, ConfigDict, Field, PrivateAttr +from pydantic import AnyUrl, BaseModel, ConfigDict, Field, PrivateAttr from typing_extensions import Self from askui.models.exceptions import AutomationError @@ -39,7 +41,7 @@ from askui.tools import ComputerAgentOS from askui.tools.android.agent_os import AndroidAgentOs from askui.utils.image_utils import ImageSource, base64_to_image -from askui.utils.pdf_utils import PdfSource +from askui.utils.pdf_utils import MAX_PDF_SIZE_BYTES, PdfSource logger = logging.getLogger(__name__) @@ -69,6 +71,17 @@ def _convert_mcp_resource( if isinstance(resource, McpTextResourceContents): return TextBlockParam(text=resource.text) if resource.mimeType == "application/pdf": + # ``blob`` is base64; the decoded size is ~3/4 of its length. + decoded_size = len(resource.blob) * 3 // 4 + if decoded_size > MAX_PDF_SIZE_BYTES: + logger.warning( + "Dropping PDF resource exceeding the maximum supported size", + extra={ + "size_bytes": decoded_size, + "max_size_bytes": MAX_PDF_SIZE_BYTES, + }, + ) + return None return DocumentBlockParam(source=Base64PdfSourceParam(data=resource.blob)) logger.warning( "Unsupported embedded resource media type", @@ -136,6 +149,7 @@ def _convert_to_content( return [ DocumentBlockParam( source=Base64PdfSourceParam(data=result.to_base64()), + title=result.filename, ) ] @@ -169,6 +183,16 @@ def _convert_to_mcp_content( src = ImageSource(result) return FastMcpImage(data=src.to_bytes(), format="png").to_image_content() + if isinstance(result, PdfSource): + return McpEmbeddedResource( + type="resource", + resource=McpBlobResourceContents( + uri=AnyUrl("resource://document.pdf"), + mimeType="application/pdf", + blob=result.to_base64(), + ), + ) + return result @@ -178,11 +202,25 @@ def _convert_from_mcp_tool_call_result( ) -> PrimitiveToolCallResult: if isinstance(result, str): return result + + if isinstance(result, McpEmbeddedResource): + resource = result.resource + if isinstance(resource, McpTextResourceContents): + return resource.text + if resource.mimeType == "application/pdf": + return PdfSource(base64.b64decode(resource.blob)) + msg = ( + "MCP tool returned an unsupported embedded resource media type: " + f"{resource.mimeType}. Expected a text resource or an " + "'application/pdf' blob resource." + ) + raise McpToolAdapterException(tool_name, msg) + if not isinstance(result, (McpTextContent, McpImageContent)): unexpected_type = type(result).__name__ msg = ( f"MCP tool returned unexpected content type: {unexpected_type}. " - "Expected McpTextContent or McpImageContent." + "Expected McpTextContent, McpImageContent, or McpEmbeddedResource." ) raise McpToolAdapterException(tool_name, msg) @@ -323,8 +361,9 @@ def from_mcp_tool( Notes: The underlying callable must return values this adapter can turn - into text or image: `str`, `McpTextContent`, or `McpImageContent` - (or a `list` / `tuple` of those). + into text, image, or PDF: `str`, `McpTextContent`, + `McpImageContent`, or an `McpEmbeddedResource` (text or + `application/pdf` blob) - or a `list` / `tuple` of those. Any other types raise `McpToolAdapterException`. Example: diff --git a/src/askui/models/shared/truncation_strategies.py b/src/askui/models/shared/truncation_strategies.py index cc20ae60..1a92e739 100644 --- a/src/askui/models/shared/truncation_strategies.py +++ b/src/askui/models/shared/truncation_strategies.py @@ -42,6 +42,9 @@ IMAGE_REMOVED_PLACEHOLDER = "[Screenshot removed to reduce message history length]" """Text used to replace stripped base64 images.""" +DOCUMENT_REMOVED_PLACEHOLDER = "[PDF document removed to reduce message history length]" +"""Text used to replace stripped base64 PDF documents.""" + def _has_orphaned_tool_results(msg: MessageParam) -> bool: """Check if a message contains tool_result blocks. @@ -546,28 +549,38 @@ def _remove_images(self) -> None: removed += removed_in_msg @staticmethod + def _is_strippable_media(block: ContentBlockParam) -> bool: + """Whether `block` is a heavy base64 media block subject to stripping. + + Covers base64 images and PDF documents alike - both are large base64 + blobs that should not accumulate unbounded in the message history. + URL-based images are never stripped. + """ + if isinstance(block, ImageBlockParam): + return isinstance(block.source, Base64ImageSourceParam) + return isinstance(block, DocumentBlockParam) + + @classmethod def _count_base64_images( + cls, messages: list[MessageParam], ) -> int: - """Count total base64 image blocks across messages.""" + """Count total strippable base64 media blocks (images and PDFs).""" count = 0 for msg in messages: if isinstance(msg.content, str): continue for block in msg.content: - if isinstance(block, ImageBlockParam) and isinstance( - block.source, Base64ImageSourceParam - ): + if cls._is_strippable_media(block): count += 1 elif isinstance(block, ToolResultBlockParam) and isinstance( block.content, list ): - for nested in block.content: - if isinstance(nested, ImageBlockParam) and isinstance( - nested.source, - Base64ImageSourceParam, - ): - count += 1 + count += sum( + 1 + for nested in block.content + if cls._is_strippable_media(nested) + ) return count @staticmethod @@ -597,6 +610,9 @@ def _strip_base64_images( ): new_content.append(TextBlockParam(text=IMAGE_REMOVED_PLACEHOLDER)) stripped += 1 + elif isinstance(block, DocumentBlockParam): + new_content.append(TextBlockParam(text=DOCUMENT_REMOVED_PLACEHOLDER)) + stripped += 1 elif isinstance(block, ToolResultBlockParam) and isinstance( block.content, list ): @@ -604,18 +620,21 @@ def _strip_base64_images( TextBlockParam | ImageBlockParam | DocumentBlockParam ] = [] for nested in block.content: - if ( - stripped < max_to_strip - and isinstance(nested, ImageBlockParam) - and isinstance( - nested.source, - Base64ImageSourceParam, - ) + if stripped >= max_to_strip: + new_nested.append(nested) + elif isinstance(nested, ImageBlockParam) and isinstance( + nested.source, + Base64ImageSourceParam, ): new_nested.append( TextBlockParam(text=IMAGE_REMOVED_PLACEHOLDER) ) stripped += 1 + elif isinstance(nested, DocumentBlockParam): + new_nested.append( + TextBlockParam(text=DOCUMENT_REMOVED_PLACEHOLDER) + ) + stripped += 1 else: new_nested.append(nested) new_content.append( diff --git a/src/askui/tools/store/universal/load_pdf_tool.py b/src/askui/tools/store/universal/load_pdf_tool.py index 0feb819e..bf19a5e1 100644 --- a/src/askui/tools/store/universal/load_pdf_tool.py +++ b/src/askui/tools/store/universal/load_pdf_tool.py @@ -93,7 +93,8 @@ def __call__(self, pdf_path: str = "") -> tuple[str, PdfSource]: Raises: FileNotFoundError: If the PDF file does not exist at the specified path. - FileExistsError: If the path exists but is not a file (e.g., a directory). + IsADirectoryError: If the path exists but is a directory, not a file. + PdfTooLargeError: If the PDF exceeds the maximum supported size. """ absolute_pdf_path = self._base_dir / pdf_path @@ -103,9 +104,12 @@ def __call__(self, pdf_path: str = "") -> tuple[str, PdfSource]: if not absolute_pdf_path.is_file(): error_msg = f"Path is not a file: {absolute_pdf_path}" - raise FileExistsError(error_msg) + raise IsADirectoryError(error_msg) + + pdf_source = PdfSource(absolute_pdf_path) + pdf_source.validate_size() return ( f"PDF was successfully loaded from {absolute_pdf_path}", - PdfSource(absolute_pdf_path), + pdf_source, ) diff --git a/src/askui/utils/pdf_utils.py b/src/askui/utils/pdf_utils.py index 629548f7..6cf95ed0 100644 --- a/src/askui/utils/pdf_utils.py +++ b/src/askui/utils/pdf_utils.py @@ -4,6 +4,29 @@ from pydantic import ConfigDict, RootModel +# Anthropic and OpenAI both reject PDFs larger than 32 MB. We guard against this +# at the source so the caller gets a clear error instead of an opaque provider +# 400/413 once the (base64-inflated) request is sent. +# See https://docs.anthropic.com/en/docs/build-with-claude/pdf-support +MAX_PDF_SIZE_BYTES = 32 * 1024 * 1024 + +# Fallback file name used when a PDF has no associated path (e.g. loaded from +# raw bytes), since some providers require a file name for document parts. +DEFAULT_PDF_FILENAME = "document.pdf" + + +class PdfTooLargeError(ValueError): + """Raised when a PDF exceeds the maximum size supported by the model.""" + + def __init__(self, size_bytes: int, max_size_bytes: int) -> None: + self.size_bytes = size_bytes + self.max_size_bytes = max_size_bytes + super().__init__( + f"PDF is {size_bytes} bytes, which exceeds the maximum supported " + f"size of {max_size_bytes} bytes (~{max_size_bytes // (1024 * 1024)} " + "MB). Reduce the file size or split the document." + ) + class PdfSource(RootModel): """A class that represents a PDF source. @@ -28,12 +51,44 @@ def reader(self) -> BufferedReader | BytesIO: return self.root.open("rb") return BytesIO(self.root) + @property + def filename(self) -> str | None: + """The file name of the PDF when loaded from a path, otherwise `None`.""" + if isinstance(self.root, Path): + return self.root.name + return None + + @property + def size_bytes(self) -> int: + """The size of the PDF in bytes (without reading it fully into memory).""" + if isinstance(self.root, Path): + return self.root.stat().st_size + return len(self.root) + + def validate_size(self, max_size_bytes: int = MAX_PDF_SIZE_BYTES) -> None: + """Raise `PdfTooLargeError` if the PDF exceeds `max_size_bytes`. + + Args: + max_size_bytes (int, optional): The maximum allowed size in bytes. + Defaults to `MAX_PDF_SIZE_BYTES`. + + Raises: + PdfTooLargeError: If the PDF is larger than `max_size_bytes`. + """ + size = self.size_bytes + if size > max_size_bytes: + raise PdfTooLargeError(size, max_size_bytes) + def to_bytes(self) -> bytes: """Read the PDF source into bytes. Returns: bytes: The PDF as bytes. + + Raises: + PdfTooLargeError: If the PDF exceeds `MAX_PDF_SIZE_BYTES`. """ + self.validate_size() with self.reader as reader: return reader.read() @@ -42,6 +97,9 @@ def to_base64(self) -> str: Returns: str: A base64 encoded string of the PDF. + + Raises: + PdfTooLargeError: If the PDF exceeds `MAX_PDF_SIZE_BYTES`. """ return base64.b64encode(self.to_bytes()).decode("utf-8") @@ -51,10 +109,16 @@ def to_data_url(self) -> str: Returns: str: A data URL string in the format `"data:application/pdf;base64,..."`. + + Raises: + PdfTooLargeError: If the PDF exceeds `MAX_PDF_SIZE_BYTES`. """ return f"data:application/pdf;base64,{self.to_base64()}" __all__ = [ + "DEFAULT_PDF_FILENAME", + "MAX_PDF_SIZE_BYTES", "PdfSource", + "PdfTooLargeError", ] diff --git a/tests/unit/models/openai/test_get_model.py b/tests/unit/models/openai/test_get_model.py index f7a0ab85..f444930e 100644 --- a/tests/unit/models/openai/test_get_model.py +++ b/tests/unit/models/openai/test_get_model.py @@ -75,6 +75,7 @@ def test_pdf_source_sends_file_part(self) -> None: source = MagicMock(spec=PdfSource) source.to_data_url.return_value = "data:application/pdf;base64,abc" + source.filename = "report.pdf" model = OpenAIGetModel(model_id="gpt-4o", client=mock_client) result = model.get( @@ -90,7 +91,8 @@ def test_pdf_source_sends_file_part(self) -> None: ] file_part = next(part for part in content if part["type"] == "file") assert file_part["file"]["file_data"] == "data:application/pdf;base64,abc" - assert file_part["file"]["filename"] == "document.pdf" + # The PDF's real file name is forwarded to OpenAI's ``file`` part. + assert file_part["file"]["filename"] == "report.pdf" def test_office_document_source_not_supported(self) -> None: mock_client = MagicMock() diff --git a/tests/unit/models/shared/test_tool_call_result.py b/tests/unit/models/shared/test_tool_call_result.py index bb2328d0..18edefd2 100644 --- a/tests/unit/models/shared/test_tool_call_result.py +++ b/tests/unit/models/shared/test_tool_call_result.py @@ -7,8 +7,10 @@ """ import base64 +from pathlib import Path from typing import cast +import pytest from fastmcp.client.client import CallToolResult from mcp.types import ( BlobResourceContents, @@ -26,11 +28,14 @@ ToolResultBlockParam, ) from askui.models.shared.tools import ( + McpToolAdapterException, _convert_call_tool_result, + _convert_from_mcp_tool_call_result, _convert_mcp_resource, _convert_to_content, + _convert_to_mcp_content, ) -from askui.utils.pdf_utils import PdfSource +from askui.utils.pdf_utils import MAX_PDF_SIZE_BYTES, PdfSource # Smallest payload that is unambiguously a PDF; we only base64-encode the bytes, # never parse them. @@ -57,11 +62,29 @@ def test_pdf_alongside_text_and_image_preserves_order(self) -> None: ] ) - assert [type(block) for block in result] == [ + expected_types: list[type] = [ TextBlockParam, DocumentBlockParam, ImageBlockParam, ] + assert [type(block) for block in result] == expected_types + + def test_pdf_source_from_path_sets_title(self, tmp_path: Path) -> None: + pdf = tmp_path / "report.pdf" + pdf.write_bytes(_PDF_BYTES) + + result = _convert_to_content(PdfSource(pdf)) + + block = result[0] + assert isinstance(block, DocumentBlockParam) + assert block.title == "report.pdf" + + def test_pdf_source_from_bytes_has_no_title(self) -> None: + result = _convert_to_content(PdfSource(_PDF_BYTES)) + + block = result[0] + assert isinstance(block, DocumentBlockParam) + assert block.title is None def test_document_in_tool_result_serializes_for_anthropic(self) -> None: block = ToolResultBlockParam( @@ -135,3 +158,63 @@ def test_embedded_pdf_resource_in_call_tool_result(self) -> None: assert len(blocks) == 1 assert isinstance(blocks[0], DocumentBlockParam) assert base64.b64decode(blocks[0].source.data) == _PDF_BYTES + + def test_oversized_pdf_resource_is_dropped(self) -> None: + # ``blob`` length implies a decoded size above the limit, so the + # resource is dropped instead of being forwarded to the provider. + oversized_blob = "A" * ((MAX_PDF_SIZE_BYTES + 1024) * 4 // 3) + resource = BlobResourceContents( + uri="file:///big.pdf", + mimeType="application/pdf", + blob=oversized_blob, + ) + + assert _convert_mcp_resource(resource) is None + + +class TestMcpPdfRoundTrip: + """PDFs survive the outbound (`to_mcp`) and inbound (`from_mcp`) MCP paths.""" + + def test_pdf_source_serialized_to_embedded_resource(self) -> None: + converted = _convert_to_mcp_content(PdfSource(_PDF_BYTES)) + + assert isinstance(converted, EmbeddedResource) + assert isinstance(converted.resource, BlobResourceContents) + assert converted.resource.mimeType == "application/pdf" + assert base64.b64decode(converted.resource.blob) == _PDF_BYTES + + def test_embedded_pdf_resource_becomes_pdf_source(self) -> None: + resource = EmbeddedResource( + type="resource", + resource=BlobResourceContents( + uri="file:///doc.pdf", + mimeType="application/pdf", + blob=base64.b64encode(_PDF_BYTES).decode(), + ), + ) + + result = _convert_from_mcp_tool_call_result("tool", resource) + + assert isinstance(result, PdfSource) + assert result.to_bytes() == _PDF_BYTES + + def test_text_embedded_resource_becomes_string(self) -> None: + resource = EmbeddedResource( + type="resource", + resource=TextResourceContents(uri="file:///a.txt", text="hello"), + ) + + assert _convert_from_mcp_tool_call_result("tool", resource) == "hello" + + def test_unsupported_embedded_resource_raises(self) -> None: + resource = EmbeddedResource( + type="resource", + resource=BlobResourceContents( + uri="file:///a.bin", + mimeType="application/octet-stream", + blob="QUJD", + ), + ) + + with pytest.raises(McpToolAdapterException): + _convert_from_mcp_tool_call_result("tool", resource) diff --git a/tests/unit/models/test_truncation_strategies.py b/tests/unit/models/test_truncation_strategies.py index 37adee7f..1e19855a 100644 --- a/tests/unit/models/test_truncation_strategies.py +++ b/tests/unit/models/test_truncation_strategies.py @@ -5,7 +5,9 @@ from askui.callbacks.conversation_callback import ConversationCallback from askui.models.shared.agent_message_param import ( Base64ImageSourceParam, + Base64PdfSourceParam, ContentBlockParam, + DocumentBlockParam, ImageBlockParam, MessageParam, TextBlockParam, @@ -20,6 +22,7 @@ ) IMAGE_REMOVED_PLACEHOLDER = "[Screenshot removed to reduce message history length]" +DOCUMENT_REMOVED_PLACEHOLDER = "[PDF document removed to reduce message history length]" # --------------------------------------------------------------------------- @@ -39,6 +42,10 @@ def _make_url_image_block() -> ImageBlockParam: ) +def _make_pdf_block() -> DocumentBlockParam: + return DocumentBlockParam(source=Base64PdfSourceParam(data="cGRm")) + + def _make_tool_result_with_image(tool_use_id: str = "tool_1") -> ToolResultBlockParam: return ToolResultBlockParam( tool_use_id=tool_use_id, @@ -49,6 +56,16 @@ def _make_tool_result_with_image(tool_use_id: str = "tool_1") -> ToolResultBlock ) +def _make_tool_result_with_pdf(tool_use_id: str = "tool_1") -> ToolResultBlockParam: + return ToolResultBlockParam( + tool_use_id=tool_use_id, + content=[ + TextBlockParam(text="result text"), + _make_pdf_block(), + ], + ) + + def _make_vlm_provider(usage: UsageParam | None = None) -> MagicMock: provider = MagicMock() provider.create_message.return_value = MessageParam( @@ -190,6 +207,53 @@ def test_strips_images_inside_tool_results(self) -> None: assert isinstance(tool_result.content[1], TextBlockParam) assert tool_result.content[1].text == IMAGE_REMOVED_PLACEHOLDER + def test_strips_oldest_pdf_documents(self) -> None: + strategy = _make_strategy(n_images_to_keep=1) + for i in range(3): + role = "user" if i % 2 == 0 else "assistant" + strategy.append_message( + MessageParam(role=role, content=[_make_pdf_block()]) + ) + truncated = strategy.truncated_messages + # Oldest two PDFs replaced by placeholders; most recent one preserved. + assert isinstance(truncated[0].content, list) + assert isinstance(truncated[0].content[0], TextBlockParam) + assert truncated[0].content[0].text == DOCUMENT_REMOVED_PLACEHOLDER + assert isinstance(truncated[1].content[0], TextBlockParam) + assert truncated[1].content[0].text == DOCUMENT_REMOVED_PLACEHOLDER + assert isinstance(truncated[2].content[0], DocumentBlockParam) + + def test_strips_pdf_documents_inside_tool_results(self) -> None: + strategy = _make_strategy(n_images_to_keep=0) + strategy.append_message( + MessageParam( + role="user", + content=[_make_tool_result_with_pdf("tool_1")], + ) + ) + content = strategy.truncated_messages[0].content + assert isinstance(content, list) + tool_result = content[0] + assert isinstance(tool_result, ToolResultBlockParam) + assert isinstance(tool_result.content, list) + assert isinstance(tool_result.content[0], TextBlockParam) + assert tool_result.content[0].text == "result text" + assert isinstance(tool_result.content[1], TextBlockParam) + assert tool_result.content[1].text == DOCUMENT_REMOVED_PLACEHOLDER + + def test_images_and_pdfs_share_the_keep_budget(self) -> None: + # n_images_to_keep counts heavy media (images + PDFs) combined. + strategy = _make_strategy(n_images_to_keep=1) + strategy.append_message(MessageParam(role="user", content=[_make_pdf_block()])) + strategy.append_message( + MessageParam(role="assistant", content=[_make_base64_image_block()]) + ) + truncated = strategy.truncated_messages + # The older PDF is stripped; the most recent media (the image) is kept. + assert isinstance(truncated[0].content[0], TextBlockParam) + assert truncated[0].content[0].text == DOCUMENT_REMOVED_PLACEHOLDER + assert isinstance(truncated[1].content[0], ImageBlockParam) + def test_preserves_non_image_blocks(self) -> None: strategy = _make_strategy(n_images_to_keep=0) strategy.append_message( diff --git a/tests/unit/tools/store/test_load_pdf_tool.py b/tests/unit/tools/store/test_load_pdf_tool.py index 2a355296..e0bd693a 100644 --- a/tests/unit/tools/store/test_load_pdf_tool.py +++ b/tests/unit/tools/store/test_load_pdf_tool.py @@ -55,7 +55,7 @@ def test_directory_path_raises(self, tmp_path: Path) -> None: (tmp_path / "sub").mkdir() tool = LoadPdfTool(base_dir=tmp_path) - with pytest.raises(FileExistsError): + with pytest.raises(IsADirectoryError): tool(pdf_path="sub") def test_is_cacheable(self, tmp_path: Path) -> None: diff --git a/tests/unit/utils/test_pdf_utils.py b/tests/unit/utils/test_pdf_utils.py new file mode 100644 index 00000000..5850edfa --- /dev/null +++ b/tests/unit/utils/test_pdf_utils.py @@ -0,0 +1,59 @@ +"""Unit tests for `PdfSource` size guarding and metadata.""" + +from pathlib import Path + +import pytest + +from askui.utils.pdf_utils import ( + MAX_PDF_SIZE_BYTES, + PdfSource, + PdfTooLargeError, +) + +_PDF_BYTES = b"%PDF-1.4\n1 0 obj<<>>endobj\ntrailer<<>>\n%%EOF" + + +class TestSizeBytes: + def test_size_from_bytes(self) -> None: + assert PdfSource(_PDF_BYTES).size_bytes == len(_PDF_BYTES) + + def test_size_from_path(self, tmp_path: Path) -> None: + pdf = tmp_path / "doc.pdf" + pdf.write_bytes(_PDF_BYTES) + assert PdfSource(pdf).size_bytes == len(_PDF_BYTES) + + +class TestValidateSize: + def test_within_limit_passes(self) -> None: + PdfSource(_PDF_BYTES).validate_size() # does not raise + + def test_over_limit_raises(self) -> None: + oversized = PdfSource(b"x" * (MAX_PDF_SIZE_BYTES + 1)) + with pytest.raises(PdfTooLargeError): + oversized.validate_size() + + def test_custom_limit(self) -> None: + with pytest.raises(PdfTooLargeError): + PdfSource(_PDF_BYTES).validate_size(max_size_bytes=1) + + def test_to_base64_enforces_limit(self) -> None: + oversized = PdfSource(b"x" * (MAX_PDF_SIZE_BYTES + 1)) + with pytest.raises(PdfTooLargeError): + oversized.to_base64() + + def test_error_reports_sizes(self) -> None: + oversized = PdfSource(b"x" * (MAX_PDF_SIZE_BYTES + 1)) + with pytest.raises(PdfTooLargeError) as exc_info: + oversized.validate_size() + assert exc_info.value.size_bytes == MAX_PDF_SIZE_BYTES + 1 + assert exc_info.value.max_size_bytes == MAX_PDF_SIZE_BYTES + + +class TestFilename: + def test_filename_from_path(self, tmp_path: Path) -> None: + pdf = tmp_path / "report.pdf" + pdf.write_bytes(_PDF_BYTES) + assert PdfSource(pdf).filename == "report.pdf" + + def test_filename_none_for_bytes(self) -> None: + assert PdfSource(_PDF_BYTES).filename is None From 10c2a1271ac2ef40abd75011face6c76f6088569 Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Tue, 30 Jun 2026 13:18:09 +0200 Subject: [PATCH 3/3] chore: add docs and example --- docs/07_tools.md | 28 +++++++- docs/10_extracting_data.md | 30 +++++---- examples/pdf_documents.py | 127 +++++++++++++++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 13 deletions(-) create mode 100644 examples/pdf_documents.py diff --git a/docs/07_tools.md b/docs/07_tools.md index 8f26424e..4fe8bb36 100644 --- a/docs/07_tools.md +++ b/docs/07_tools.md @@ -61,6 +61,7 @@ Work with any agent type, no special dependencies required. **Examples:** - `PrintToConsoleTool()` - Print messages to console output +- `LoadPdfTool(base_dir)` - Load a PDF from disk (relative to `base_dir`) and hand it to the model for analysis - Data processing and formatting tools - General utility functions @@ -72,6 +73,7 @@ Require `ComputerAgentOS` and work with `ComputerAgent` for desktop automation. **Examples:** - `ComputerSaveScreenshotTool(base_dir)` - Save screenshots to disk +- `ComputerGetFileTool()` - Read a file from the computer under automation; returns text, a decoded image, or a `PdfSource` for PDF documents (import from `askui.tools.store.computer.experimental`) - Window management - Device Automation @@ -173,11 +175,12 @@ The constructor defines the tool’s metadata and input requirements: Contains the actual business logic that runs when the tool is invoked. -Tools are flexible — they can return plain values, structured data, or even images. +Tools are flexible — they can return plain values, structured data, images, or PDF documents. A tool’s __call__ method may return: - str - numbers or other primitive values - PIL.Image.Image — image output +- PdfSource — a PDF document handed to the model as a document block (see below) - None - a list or tuple containing any of the above @@ -193,6 +196,29 @@ image = downscale_image(image, max_dimension=2000) This preserves the original aspect ratio and only downscales images whose longest side exceeds the limit. +**Returning a PDF:** A tool can return a `PdfSource` to hand a PDF document to the model, mirroring how returning a `PIL.Image.Image` produces an image. The PDF is forwarded unchanged — as a base64 `document` block to Anthropic Claude and as a `file` content part to OpenAI — so the model can reason about its text, tables, charts, and layout. + +```python +from pathlib import Path + +from askui.models.shared.tools import Tool +from askui.utils.pdf_utils import PdfSource + +class LoadInvoiceTool(Tool): + def __init__(self) -> None: + super().__init__( + name="load_invoice", + description="Loads the current invoice PDF for analysis.", + input_schema={"type": "object", "properties": {}}, + ) + + def __call__(self) -> PdfSource: + # Pass a Path (or raw bytes) — a plain str is interpreted as PDF bytes, not a path. + return PdfSource(Path("invoices/latest.pdf")) +``` + +PDFs returned from a tool must not exceed **32MB**; a larger PDF raises `PdfTooLargeError`. When a `PdfSource` is created from a path, its file name is forwarded to the model as the document title. + ### Complete Example Here’s a greeting tool that demonstrates all the key concepts: diff --git a/docs/10_extracting_data.md b/docs/10_extracting_data.md index e72ee913..bd428619 100644 --- a/docs/10_extracting_data.md +++ b/docs/10_extracting_data.md @@ -51,11 +51,11 @@ pip install "askui[office-document]" **Model Compatibility Matrix** -| File Format | AskUI Gemini | Anthropic Claude | Google Gemini -| ------------------- | ------------ | ---------------- | ---------- -| PDF (.pdf) | ✅ | ❌ | ✅ -| Excel (.xlsx, .xls) | ✅ | ✅ | ✅ -| Word (.docx, .doc) | ✅ | ✅ | ✅ +| File Format | AskUI Gemini | Anthropic Claude | Google Gemini | OpenAI +| ------------------- | ------------ | ---------------- | ------------- | ------ +| PDF (.pdf) | ✅ | ✅ | ✅ | ✅ +| Excel (.xlsx, .xls) | ✅ | ✅ | ✅ | ❌ +| Word (.docx, .doc) | ✅ | ✅ | ✅ | ❌ **General Limitations** - **Processing Model Restriction**: not all models support all document formats @@ -66,23 +66,29 @@ pip install "askui[office-document]" ### 📄 PDF Files (.pdf) - **MIME Types**: `application/pdf` -- **Maximum File Size**: 20MB -- **Processing Method**: **Depends on Usage Context** +- **Maximum File Size**: 32MB +- **Processing Method**: the PDF is passed to the model unchanged (every page as both text and image), so the model can reason about text, tables, charts, and layout. No Markdown conversion is performed. **Processing Workflow for PDF Files:** ```mermaid graph TD A[Call agent.get with PDF] --> B[Load as PdfSource] - B --> C[Send directly as binary to Gemini] - C --> D[Gemini processes content] - D --> E[Return results directly] - E --> F[No storage - process again for next call] + B --> C{Model provider} + C -->|Anthropic Claude| D[Send as base64 document block] + C -->|OpenAI| E[Send as base64 file content part] + C -->|AskUI / Google Gemini| F[Send directly as binary] + D --> G[Model processes content] + E --> G + F --> G + G --> H[Return results directly] + H --> I[No storage - process again for next call] ``` **PDF-Specific Limitations** -- **20MB file size limit** for PDF files +- **32MB file size limit** for PDF files. Larger files raise `PdfTooLargeError` before any request is sent. +- **No caching**: the PDF is re-sent to the model on every `get()` call. ### 📊 Excel Files (.xlsx, .xls) diff --git a/examples/pdf_documents.py b/examples/pdf_documents.py new file mode 100644 index 00000000..78683586 --- /dev/null +++ b/examples/pdf_documents.py @@ -0,0 +1,127 @@ +"""Example demonstrating PDF document support. + +The agent can read PDF documents and reason about their text, tables, charts, and +layout. A PDF is forwarded to the model unchanged - as a base64 ``document`` block to +Anthropic Claude and as a ``file`` content part to OpenAI - so no Markdown conversion +is performed. PDFs must not exceed 32MB; a larger file raises ``PdfTooLargeError``. + +Three entry points are shown: +1. `agent.get(source=...)` - extract information directly from a PDF on disk. +2. `LoadPdfTool` - let `act()` load a PDF from disk during execution. +3. `ComputerGetFileTool` - read a file off the computer under automation; PDFs come + back as a `PdfSource` (text files as `str`, images as `PIL.Image.Image`). + +A custom tool returning a `PdfSource` is also shown - any tool may hand a PDF to the +model the same way returning a `PIL.Image.Image` produces an image. + +Required environment variables (see .env): +- ASKUI_WORKSPACE_ID, ASKUI_TOKEN - for the default AskUI model stack + +Drop a `sample.pdf` next to this file (or change `PDF_PATH`) before running the +on-disk examples. +""" + +import logging +from pathlib import Path + +from askui import ComputerAgent +from askui.models.shared.tools import Tool +from askui.tools.store.computer.experimental import ComputerGetFileTool +from askui.tools.store.universal import LoadPdfTool +from askui.utils.pdf_utils import PdfSource + +logging.basicConfig( + level=logging.INFO, + format="[%(levelname)s] %(asctime)s %(pathname)s:%(lineno)d | %(message)s", +) +logger = logging.getLogger(__name__) + +HERE = Path(__file__).parent +PDF_PATH = HERE / "sample.pdf" + + +def extract_from_pdf_file() -> None: + """Extract information straight from a PDF on disk via `get()`. + + No screen interaction is needed - the PDF itself is the source. + """ + if not PDF_PATH.exists(): + logger.warning("No PDF at %s - skipping extract_from_pdf_file()", PDF_PATH) + return + + with ComputerAgent() as agent: + summary = agent.get( + "Summarize the key points of this document in 3 bullet points", + source=str(PDF_PATH), + ) + logger.info("PDF summary:\n%s", summary) + + +def load_pdf_during_act() -> None: + """Let `act()` load a PDF from disk through `LoadPdfTool`. + + `LoadPdfTool` resolves paths relative to its `base_dir`; the loaded PDF is handed + to the model in full (every page as both text and image). + """ + if not PDF_PATH.exists(): + logger.warning("No PDF at %s - skipping load_pdf_during_act()", PDF_PATH) + return + + with ComputerAgent(act_tools=[LoadPdfTool(base_dir=str(HERE))]) as agent: + agent.act( + f"Load '{PDF_PATH.name}', tell me what it is about, and list any headings " + "you find." + ) + + +def read_pdf_from_target_machine() -> None: + """Read a PDF off the computer under automation with `ComputerGetFileTool`. + + The controller returns the file decoded as a `PdfSource`, which the SDK forwards + to the model as a document block so it can reason over the full PDF. + """ + with ComputerAgent(act_tools=[ComputerGetFileTool()]) as agent: + agent.act( + "Read the PDF at '/home/user/report.pdf' on this machine and summarize " + "its first page." + ) + + +class LoadInvoiceTool(Tool): + """Custom tool that hands a fixed invoice PDF to the model. + + Returning a `PdfSource` from a tool mirrors returning a `PIL.Image.Image`: the PDF + is rendered as a document block in the tool result. Pass a `Path` (or raw bytes) - + a plain `str` is interpreted as PDF bytes, not a file path. + """ + + def __init__(self, pdf_path: Path) -> None: + super().__init__( + name="load_invoice", + description="Loads the current invoice PDF for analysis.", + input_schema={"type": "object", "properties": {}}, + ) + self._pdf_path = pdf_path + + def __call__(self) -> PdfSource: + return PdfSource(self._pdf_path) + + +def custom_pdf_returning_tool() -> None: + """Use a custom tool that returns a `PdfSource`.""" + if not PDF_PATH.exists(): + logger.warning("No PDF at %s - skipping custom_pdf_returning_tool()", PDF_PATH) + return + + with ComputerAgent(act_tools=[LoadInvoiceTool(pdf_path=PDF_PATH)]) as agent: + agent.act("Load the invoice and tell me its total amount.") + + +if __name__ == "__main__": + # Pick the scenario you want to try. + extract_from_pdf_file() + # load_pdf_during_act() + # read_pdf_from_target_machine() + # custom_pdf_returning_tool() + + logger.info("Done!")