askui · mlikasam-askui · Jun 25, 2026
diff --git a/mypy.ini b/mypy.ini
@@ -14,7 +14,8 @@ warn_unreachable = true
 strict_optional = true
 plugins = pydantic.mypy,sqlalchemy.ext.mypy.plugin
 exclude = (?x)(
-    ^src/askui/models/ui_tars_ep/ui_tars_api\.py$
+    ^\.?venv/.*$
+    | ^src/askui/models/ui_tars_ep/ui_tars_api\.py$
     | ^src/askui/tools/askui/askui_ui_controller_grpc/.*$
   )
 mypy_path = src:tests

diff --git a/src/askui/__init__.py b/src/askui/__init__.py
@@ -14,11 +14,13 @@
 from .locators import Locator
 from .models import (
     Base64ImageSourceParam,
+    Base64PdfSourceParam,
     CacheControlEphemeralParam,
     CitationCharLocationParam,
     CitationContentBlockLocationParam,
     CitationPageLocationParam,
     ContentBlockParam,
+    DocumentBlockParam,
     ImageBlockParam,
     MessageParam,
     OnMessageCb,
@@ -46,6 +48,7 @@
 from .retry import ConfigurableRetry, Retry
 from .tools import ModifierKey, PcKey
 from .utils.image_utils import ImageSource
+from .utils.pdf_utils import PdfSource
 from .utils.source_utils import InputSource
 
 try:
@@ -76,6 +79,7 @@
     "AgentSettings",
     "ActSettings",
     "Base64ImageSourceParam",
+    "Base64PdfSourceParam",
     "CacheControlEphemeralParam",
     "CitationCharLocationParam",
     "CitationContentBlockLocationParam",
@@ -85,10 +89,12 @@
     "ConversationCallback",
     "DEFAULT_GET_RESOLUTION",
     "DEFAULT_LOCATE_RESOLUTION",
+    "DocumentBlockParam",
     "GetSettings",
     "ImageBlockParam",
     "ImageSource",
     "InputSource",
+    "PdfSource",
     "Locator",
     "LocateSettings",
     "MessageParam",

diff --git a/src/askui/models/__init__.py b/src/askui/models/__init__.py
@@ -9,11 +9,13 @@
 from .openrouter.settings import ChatCompletionsCreateSettings, OpenRouterSettings
 from .shared.agent_message_param import (
     Base64ImageSourceParam,
+    Base64PdfSourceParam,
     CacheControlEphemeralParam,
     CitationCharLocationParam,
     CitationContentBlockLocationParam,
     CitationPageLocationParam,
     ContentBlockParam,
+    DocumentBlockParam,
     ImageBlockParam,
     MessageParam,
     TextBlockParam,
@@ -28,12 +30,14 @@
 __all__ = [
     "ActModel",
     "Base64ImageSourceParam",
+    "Base64PdfSourceParam",
     "CacheControlEphemeralParam",
     "ChatCompletionsCreateSettings",
     "CitationCharLocationParam",
     "CitationContentBlockLocationParam",
     "CitationPageLocationParam",
     "ContentBlockParam",
+    "DocumentBlockParam",
     "FallbackGetModel",
     "FallbackLocateModel",
     "GetModel",

diff --git a/src/askui/models/anthropic/get_model.py b/src/askui/models/anthropic/get_model.py
@@ -4,7 +4,10 @@
 
 from typing_extensions import override
 
-from askui.models.anthropic.messages_api import built_messages_for_get_and_locate
+from askui.models.anthropic.messages_api import (
+    built_messages_for_get_and_locate,
+    built_messages_for_get_pdf,
+)
 from askui.models.anthropic.settings import UnexpectedResponseError
 from askui.models.exceptions import (
     QueryNoResponseError,
@@ -68,24 +71,27 @@ def get(
         response_schema: Type[ResponseSchema] | None,
         get_settings: GetSettings,
     ) -> ResponseSchema | str:
-        if isinstance(source, (PdfSource, OfficeDocumentSource)):
+        if isinstance(source, OfficeDocumentSource):
             err_msg = (
-                f"PDF or Office Document processing is not supported for the model: "
+                f"Office Document processing is not supported for the model: "
                 f"{self._model_id}"
             )
             raise NotImplementedError(err_msg)
         try:
             if response_schema is not None:
                 error_msg = "Response schema is not yet supported for Anthropic"
                 raise NotImplementedError(error_msg)
-            target_size = compute_contained_size(
-                source.root.width,
-                source.root.height,
-                get_settings.resolution.width,
-                get_settings.resolution.height,
-            )
-            scaled_image = resize_image(source.root, target_size)
-            messages = built_messages_for_get_and_locate(scaled_image, query)
+            if isinstance(source, PdfSource):
+                messages = built_messages_for_get_pdf(source, query)
+            else:
+                target_size = compute_contained_size(
+                    source.root.width,
+                    source.root.height,
+                    get_settings.resolution.width,
+                    get_settings.resolution.height,
+                )
+                scaled_image = resize_image(source.root, target_size)
+                messages = built_messages_for_get_and_locate(scaled_image, query)
             message = self._messages_api.create_message(
                 messages=messages,
                 model_id=self._model_id,

diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py
@@ -28,8 +28,10 @@
 )
 from askui.models.shared.agent_message_param import (
     Base64ImageSourceParam,
+    Base64PdfSourceParam,
     CacheControlEphemeralParam,
     ContentBlockParam,
+    DocumentBlockParam,
     ImageBlockParam,
     MessageParam,
     TextBlockParam,
@@ -41,6 +43,7 @@
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
 from askui.utils.image_utils import image_to_base64
+from askui.utils.pdf_utils import PdfSource
 
 
 def _is_retryable_error(exception: BaseException) -> bool:
@@ -107,6 +110,31 @@ def built_messages_for_get_and_locate(
     ]
 
 
+def built_messages_for_get_pdf(
+    pdf_source: PdfSource, prompt: str
+) -> list[MessageParam]:
+    """Build the single user message for a `get` query against a PDF.
+
+    The PDF is sent as a base64 `document` block, for which Anthropic needs
+    no beta header.
+    """
+    document_block = DocumentBlockParam(
+        source=Base64PdfSourceParam(data=pdf_source.to_base64()),
+    )
+    prompt_block = TextBlockParam(text=prompt)
+    # Document before text: the ordering Anthropic's PDF best practices
+    # recommend.
+    return [
+        MessageParam(
+            role="user",
+            content=cast(
+                "list[ContentBlockParam]",
+                [document_block, prompt_block],
+            ),
+        )
+    ]
+
+
 def _parse_to_anthropic_types(
     tools: ToolCollection | None,
     betas: list[str] | None = None,

diff --git a/src/askui/models/openai/get_model.py b/src/askui/models/openai/get_model.py
@@ -64,7 +64,7 @@ def __init__(
 
     def _predict(
         self,
-        image_url: str,
+        source_part: dict[str, Any],
         instruction: str,
         prompt: GetSystemPrompt,
         response_schema: type[ResponseSchema] | None,
@@ -103,12 +103,7 @@ def _predict(
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_url,
-                            },
-                        },
+                        source_part,
                         {"type": "text", "text": str(prompt) + instruction},
                     ],
                 }
@@ -148,17 +143,31 @@ def get(
         response_schema: type[ResponseSchema] | None,
         get_settings: GetSettings,
     ) -> ResponseSchema | str:
-        if isinstance(source, (PdfSource, OfficeDocumentSource)):
+        if isinstance(source, OfficeDocumentSource):
             err_msg = (
-                "PDF or Office Document processing is not supported"
+                "Office Document processing is not supported"
                 " for OpenAI-compatible models"
             )
             raise NotImplementedError(err_msg)
 
         system_prompt = get_settings.system_prompt or SYSTEM_PROMPT_GET
 
+        if isinstance(source, PdfSource):
+            source_part: dict[str, Any] = {
+                "type": "file",
+                "file": {
+                    "filename": "document.pdf",
+                    "file_data": source.to_data_url(),
+                },
+            }
+        else:
+            source_part = {
+                "type": "image_url",
+                "image_url": {"url": source.to_data_url()},
+            }
+
         response = self._predict(
-            image_url=source.to_data_url(),
+            source_part=source_part,
             instruction=query,
             prompt=system_prompt,
             response_schema=response_schema,

diff --git a/src/askui/models/openai/messages_api.py b/src/askui/models/openai/messages_api.py
@@ -16,6 +16,7 @@
     BetaRedactedThinkingBlock,
     BetaThinkingBlock,
     ContentBlockParam,
+    DocumentBlockParam,
     ImageBlockParam,
     MessageParam,
     StopReason,
@@ -56,27 +57,41 @@ def _image_block_to_openai(block: ImageBlockParam) -> dict[str, Any]:
     return {"type": "image_url", "image_url": {"url": url}}
 
 
+def _document_block_to_openai(block: DocumentBlockParam) -> dict[str, Any]:
+    """Map a PDF `DocumentBlockParam` onto an OpenAI ``file`` content part."""
+    pdf = block.source
+    # The PDF is passed inline as a base64 ``data:`` URL; the filename is a
+    # fixed placeholder because the block carries no name of its own.
+    file_part = {
+        "filename": "document.pdf",
+        "file_data": f"data:{pdf.media_type};base64,{pdf.data}",
+    }
+    return {"type": "file", "file": file_part}
+
+
 def _serialize_tool_result_content(
-    content: str | list[TextBlockParam | ImageBlockParam],
+    content: str | list[TextBlockParam | ImageBlockParam | DocumentBlockParam],
 ) -> tuple[str, list[dict[str, Any]]]:
     """Serialize ``ToolResultBlockParam.content`` for OpenAI's ``tool`` role.
 
-    Returns the text portion as a string and any images as OpenAI content
-    parts (to be appended as a separate ``user`` message since the OpenAI
-    ``tool`` role only accepts string content).
+    Returns the text blocks joined with newlines, plus any images/documents as
+    OpenAI content parts (to be sent as a separate ``user`` message since the
+    OpenAI ``tool`` role only accepts string content).
     """
     if isinstance(content, str):
         return content, []
 
     text_parts: list[str] = []
-    image_parts: list[dict[str, Any]] = []
+    media_parts: list[dict[str, Any]] = []
     for block in content:
         if isinstance(block, TextBlockParam):
             text_parts.append(block.text)
-        else:
-            image_parts.append(_image_block_to_openai(block))
+        elif isinstance(block, ImageBlockParam):
+            media_parts.append(_image_block_to_openai(block))
+        elif isinstance(block, DocumentBlockParam):
+            media_parts.append(_document_block_to_openai(block))
 
-    return "\n".join(text_parts), image_parts
+    return "\n".join(text_parts), media_parts
 
 
 def _content_block_to_openai(block: ContentBlockParam) -> dict[str, Any] | None:
@@ -88,6 +103,8 @@ def _content_block_to_openai(block: ContentBlockParam) -> dict[str, Any] | None:
         return {"type": "text", "text": block.text}
     if isinstance(block, ImageBlockParam):
         return _image_block_to_openai(block)
+    if isinstance(block, DocumentBlockParam):
+        return _document_block_to_openai(block)
     if isinstance(block, (BetaThinkingBlock, BetaRedactedThinkingBlock)):
         return None
     return None
@@ -164,16 +181,16 @@ def _convert_user_message(
     """Convert a user message's content blocks to OpenAI format.
 
     ``ToolResultBlockParam`` blocks become ``tool`` role messages.
-    Images inside tool results are collected and appended as a separate
-    ``user`` message so the model can still see them.
+    Images and documents inside tool results are collected and appended as a
+    separate ``user`` message so the model can still see them.
     """
-    tool_result_images: list[dict[str, Any]] = []
+    tool_result_media: list[dict[str, Any]] = []
     content_parts: list[dict[str, Any]] = []
 
     for block in blocks:
         if isinstance(block, ToolResultBlockParam):
-            text_content, images = _serialize_tool_result_content(block.content)
-            tool_result_images.extend(images)
+            text_content, media = _serialize_tool_result_content(block.content)
+            tool_result_media.extend(media)
             result.append(
                 {
                     "role": "tool",
@@ -189,9 +206,9 @@ def _convert_user_message(
     if content_parts:
         result.append({"role": "user", "content": content_parts})
 
-    # Append images from tool results as a separate user message
-    if tool_result_images:
-        result.append({"role": "user", "content": tool_result_images})
+    # Append images/documents from tool results as a separate user message
+    if tool_result_media:
+        result.append({"role": "user", "content": tool_result_media})
 
 
 def _to_openai_tools(tools: ToolCollection) -> list[dict[str, Any]]:

diff --git a/src/askui/models/shared/agent_message_param.py b/src/askui/models/shared/agent_message_param.py
@@ -49,6 +49,12 @@ class Base64ImageSourceParam(BaseModel):
     type: Literal["base64"] = "base64"
 
 
+class Base64PdfSourceParam(BaseModel):  # inline base64 PDF source of a document block
+    data: str  # base64 PDF payload; callers add any `data:` URL prefix
+    media_type: Literal["application/pdf"] = "application/pdf"  # only PDFs allowed
+    type: Literal["base64"] = "base64"  # fixed tag: inline base64 source
+
+
 class CacheControlEphemeralParam(BaseModel):
     type: Literal["ephemeral"] = "ephemeral"
 
@@ -59,6 +65,12 @@ class ImageBlockParam(BaseModel):
     cache_control: CacheControlEphemeralParam | None = None
 
 
+class DocumentBlockParam(BaseModel):  # content block carrying a PDF document
+    source: Base64PdfSourceParam  # the PDF itself, as inline base64
+    type: Literal["document"] = "document"  # fixed block-type tag
+    cache_control: CacheControlEphemeralParam | None = None  # optional (default None)
+
+
 class TextBlockParam(BaseModel):
     text: str
     type: Literal["text"] = "text"
@@ -70,7 +82,7 @@ class ToolResultBlockParam(BaseModel):
     tool_use_id: str
     type: Literal["tool_result"] = "tool_result"
     cache_control: CacheControlEphemeralParam | None = None
-    content: str | list[TextBlockParam | ImageBlockParam]
+    content: str | list[TextBlockParam | ImageBlockParam | DocumentBlockParam]
     is_error: bool = False
 
 
@@ -96,6 +108,7 @@ class BetaRedactedThinkingBlock(BaseModel):
 
 ContentBlockParam = (
     ImageBlockParam
+    | DocumentBlockParam
     | TextBlockParam
     | ToolResultBlockParam
     | ToolUseBlockParam
@@ -135,11 +148,13 @@ class MessageParam(BaseModel):
 
 __all__ = [
     "Base64ImageSourceParam",
+    "Base64PdfSourceParam",
     "CacheControlEphemeralParam",
     "CitationCharLocationParam",
     "CitationContentBlockLocationParam",
     "CitationPageLocationParam",
     "ContentBlockParam",
+    "DocumentBlockParam",
     "ImageBlockParam",
     "MessageParam",
     "TextBlockParam",