Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ warn_unreachable = true
strict_optional = true
plugins = pydantic.mypy,sqlalchemy.ext.mypy.plugin
exclude = (?x)(
^src/askui/models/ui_tars_ep/ui_tars_api\.py$
^\.?venv/.*$
| ^src/askui/models/ui_tars_ep/ui_tars_api\.py$
| ^src/askui/tools/askui/askui_ui_controller_grpc/.*$
)
mypy_path = src:tests
Expand Down
6 changes: 6 additions & 0 deletions src/askui/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
from .locators import Locator
from .models import (
Base64ImageSourceParam,
Base64PdfSourceParam,
CacheControlEphemeralParam,
CitationCharLocationParam,
CitationContentBlockLocationParam,
CitationPageLocationParam,
ContentBlockParam,
DocumentBlockParam,
ImageBlockParam,
MessageParam,
OnMessageCb,
Expand Down Expand Up @@ -46,6 +48,7 @@
from .retry import ConfigurableRetry, Retry
from .tools import ModifierKey, PcKey
from .utils.image_utils import ImageSource
from .utils.pdf_utils import PdfSource
from .utils.source_utils import InputSource

try:
Expand Down Expand Up @@ -76,6 +79,7 @@
"AgentSettings",
"ActSettings",
"Base64ImageSourceParam",
"Base64PdfSourceParam",
"CacheControlEphemeralParam",
"CitationCharLocationParam",
"CitationContentBlockLocationParam",
Expand All @@ -85,10 +89,12 @@
"ConversationCallback",
"DEFAULT_GET_RESOLUTION",
"DEFAULT_LOCATE_RESOLUTION",
"DocumentBlockParam",
"GetSettings",
"ImageBlockParam",
"ImageSource",
"InputSource",
"PdfSource",
"Locator",
"LocateSettings",
"MessageParam",
Expand Down
4 changes: 4 additions & 0 deletions src/askui/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
from .openrouter.settings import ChatCompletionsCreateSettings, OpenRouterSettings
from .shared.agent_message_param import (
Base64ImageSourceParam,
Base64PdfSourceParam,
CacheControlEphemeralParam,
CitationCharLocationParam,
CitationContentBlockLocationParam,
CitationPageLocationParam,
ContentBlockParam,
DocumentBlockParam,
ImageBlockParam,
MessageParam,
TextBlockParam,
Expand All @@ -28,12 +30,14 @@
__all__ = [
"ActModel",
"Base64ImageSourceParam",
"Base64PdfSourceParam",
"CacheControlEphemeralParam",
"ChatCompletionsCreateSettings",
"CitationCharLocationParam",
"CitationContentBlockLocationParam",
"CitationPageLocationParam",
"ContentBlockParam",
"DocumentBlockParam",
"FallbackGetModel",
"FallbackLocateModel",
"GetModel",
Expand Down
28 changes: 17 additions & 11 deletions src/askui/models/anthropic/get_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

from typing_extensions import override

from askui.models.anthropic.messages_api import built_messages_for_get_and_locate
from askui.models.anthropic.messages_api import (
built_messages_for_get_and_locate,
built_messages_for_get_pdf,
)
from askui.models.anthropic.settings import UnexpectedResponseError
from askui.models.exceptions import (
QueryNoResponseError,
Expand Down Expand Up @@ -68,24 +71,27 @@ def get(
response_schema: Type[ResponseSchema] | None,
get_settings: GetSettings,
) -> ResponseSchema | str:
if isinstance(source, (PdfSource, OfficeDocumentSource)):
if isinstance(source, OfficeDocumentSource):
err_msg = (
f"PDF or Office Document processing is not supported for the model: "
f"Office Document processing is not supported for the model: "
f"{self._model_id}"
)
raise NotImplementedError(err_msg)
try:
if response_schema is not None:
error_msg = "Response schema is not yet supported for Anthropic"
raise NotImplementedError(error_msg)
target_size = compute_contained_size(
source.root.width,
source.root.height,
get_settings.resolution.width,
get_settings.resolution.height,
)
scaled_image = resize_image(source.root, target_size)
messages = built_messages_for_get_and_locate(scaled_image, query)
if isinstance(source, PdfSource):
messages = built_messages_for_get_pdf(source, query)
else:
target_size = compute_contained_size(
source.root.width,
source.root.height,
get_settings.resolution.width,
get_settings.resolution.height,
)
scaled_image = resize_image(source.root, target_size)
messages = built_messages_for_get_and_locate(scaled_image, query)
message = self._messages_api.create_message(
messages=messages,
model_id=self._model_id,
Expand Down
28 changes: 28 additions & 0 deletions src/askui/models/anthropic/messages_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@
)
from askui.models.shared.agent_message_param import (
Base64ImageSourceParam,
Base64PdfSourceParam,
CacheControlEphemeralParam,
ContentBlockParam,
DocumentBlockParam,
ImageBlockParam,
MessageParam,
TextBlockParam,
Expand All @@ -41,6 +43,7 @@
from askui.models.shared.prompts import SystemPrompt
from askui.models.shared.tools import ToolCollection
from askui.utils.image_utils import image_to_base64
from askui.utils.pdf_utils import PdfSource


def _is_retryable_error(exception: BaseException) -> bool:
Expand Down Expand Up @@ -107,6 +110,31 @@ def built_messages_for_get_and_locate(
]


def built_messages_for_get_pdf(
    pdf_source: PdfSource, prompt: str
) -> list[MessageParam]:
    """Build the single-turn user message for a `get` query against a PDF.

    Args:
        pdf_source: PDF whose base64 encoding (`to_base64()`) is embedded in
            the message as a document block.
        prompt: Query text sent alongside the document.

    Returns:
        A one-element list holding a `user` message whose content is the
        document block followed by the text block.
    """
    # Anthropic accepts a base64 PDF `document` block (no beta header); placing
    # the document before the text follows Anthropic's PDF best practices.
    document_block = DocumentBlockParam(
        source=Base64PdfSourceParam(data=pdf_source.to_base64()),
    )
    question_block = TextBlockParam(text=prompt)
    user_content = cast(
        "list[ContentBlockParam]",
        [document_block, question_block],
    )
    return [MessageParam(role="user", content=user_content)]


def _parse_to_anthropic_types(
tools: ToolCollection | None,
betas: list[str] | None = None,
Expand Down
29 changes: 19 additions & 10 deletions src/askui/models/openai/get_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(

def _predict(
self,
image_url: str,
source_part: dict[str, Any],
instruction: str,
prompt: GetSystemPrompt,
response_schema: type[ResponseSchema] | None,
Expand Down Expand Up @@ -103,12 +103,7 @@ def _predict(
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url,
},
},
source_part,
{"type": "text", "text": str(prompt) + instruction},
],
}
Expand Down Expand Up @@ -148,17 +143,31 @@ def get(
response_schema: type[ResponseSchema] | None,
get_settings: GetSettings,
) -> ResponseSchema | str:
if isinstance(source, (PdfSource, OfficeDocumentSource)):
if isinstance(source, OfficeDocumentSource):
err_msg = (
"PDF or Office Document processing is not supported"
"Office Document processing is not supported"
" for OpenAI-compatible models"
)
raise NotImplementedError(err_msg)

system_prompt = get_settings.system_prompt or SYSTEM_PROMPT_GET

if isinstance(source, PdfSource):
source_part: dict[str, Any] = {
"type": "file",
"file": {
"filename": "document.pdf",
"file_data": source.to_data_url(),
},
}
else:
source_part = {
"type": "image_url",
"image_url": {"url": source.to_data_url()},
}

response = self._predict(
image_url=source.to_data_url(),
source_part=source_part,
instruction=query,
prompt=system_prompt,
response_schema=response_schema,
Expand Down
49 changes: 33 additions & 16 deletions src/askui/models/openai/messages_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
BetaRedactedThinkingBlock,
BetaThinkingBlock,
ContentBlockParam,
DocumentBlockParam,
ImageBlockParam,
MessageParam,
StopReason,
Expand Down Expand Up @@ -56,27 +57,41 @@ def _image_block_to_openai(block: ImageBlockParam) -> dict[str, Any]:
return {"type": "image_url", "image_url": {"url": url}}


def _document_block_to_openai(block: DocumentBlockParam) -> dict[str, Any]:
    """Translate a PDF `DocumentBlockParam` into an OpenAI ``file`` content part.

    The base64 payload is wrapped in a ``data:`` URL built from the source's
    media type, and the part always carries the filename ``document.pdf``.
    """
    pdf_source = block.source
    file_payload: dict[str, Any] = {
        "filename": "document.pdf",
        "file_data": f"data:{pdf_source.media_type};base64,{pdf_source.data}",
    }
    return {"type": "file", "file": file_payload}


def _serialize_tool_result_content(
content: str | list[TextBlockParam | ImageBlockParam],
content: str | list[TextBlockParam | ImageBlockParam | DocumentBlockParam],
) -> tuple[str, list[dict[str, Any]]]:
"""Serialize ``ToolResultBlockParam.content`` for OpenAI's ``tool`` role.

Returns the text portion as a string and any images as OpenAI content
parts (to be appended as a separate ``user`` message since the OpenAI
``tool`` role only accepts string content).
Returns the text portion as a string and any images/documents as OpenAI
content parts (to be appended as a separate ``user`` message since the
OpenAI ``tool`` role only accepts string content).
"""
if isinstance(content, str):
return content, []

text_parts: list[str] = []
image_parts: list[dict[str, Any]] = []
media_parts: list[dict[str, Any]] = []
for block in content:
if isinstance(block, TextBlockParam):
text_parts.append(block.text)
else:
image_parts.append(_image_block_to_openai(block))
elif isinstance(block, ImageBlockParam):
media_parts.append(_image_block_to_openai(block))
elif isinstance(block, DocumentBlockParam):
media_parts.append(_document_block_to_openai(block))

return "\n".join(text_parts), image_parts
return "\n".join(text_parts), media_parts


def _content_block_to_openai(block: ContentBlockParam) -> dict[str, Any] | None:
Expand All @@ -88,6 +103,8 @@ def _content_block_to_openai(block: ContentBlockParam) -> dict[str, Any] | None:
return {"type": "text", "text": block.text}
if isinstance(block, ImageBlockParam):
return _image_block_to_openai(block)
if isinstance(block, DocumentBlockParam):
return _document_block_to_openai(block)
if isinstance(block, (BetaThinkingBlock, BetaRedactedThinkingBlock)):
return None
return None
Expand Down Expand Up @@ -164,16 +181,16 @@ def _convert_user_message(
"""Convert a user message's content blocks to OpenAI format.

``ToolResultBlockParam`` blocks become ``tool`` role messages.
Images inside tool results are collected and appended as a separate
``user`` message so the model can still see them.
Images and documents inside tool results are collected and appended as a
separate ``user`` message so the model can still see them.
"""
tool_result_images: list[dict[str, Any]] = []
tool_result_media: list[dict[str, Any]] = []
content_parts: list[dict[str, Any]] = []

for block in blocks:
if isinstance(block, ToolResultBlockParam):
text_content, images = _serialize_tool_result_content(block.content)
tool_result_images.extend(images)
text_content, media = _serialize_tool_result_content(block.content)
tool_result_media.extend(media)
result.append(
{
"role": "tool",
Expand All @@ -189,9 +206,9 @@ def _convert_user_message(
if content_parts:
result.append({"role": "user", "content": content_parts})

# Append images from tool results as a separate user message
if tool_result_images:
result.append({"role": "user", "content": tool_result_images})
# Append images/documents from tool results as a separate user message
if tool_result_media:
result.append({"role": "user", "content": tool_result_media})


def _to_openai_tools(tools: ToolCollection) -> list[dict[str, Any]]:
Expand Down
17 changes: 16 additions & 1 deletion src/askui/models/shared/agent_message_param.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ class Base64ImageSourceParam(BaseModel):
type: Literal["base64"] = "base64"


class Base64PdfSourceParam(BaseModel):
    """Source payload for a PDF supplied inline as base64 text.

    Mirrors the shape of `Base64ImageSourceParam`, but with the media type
    pinned to PDF.
    """

    # Base64-encoded PDF content.
    data: str
    # Only PDFs are representable; the literal rejects any other media type.
    media_type: Literal["application/pdf"] = "application/pdf"
    # Fixed tag marking this source as inline base64 data.
    type: Literal["base64"] = "base64"


class CacheControlEphemeralParam(BaseModel):
type: Literal["ephemeral"] = "ephemeral"

Expand All @@ -59,6 +65,12 @@ class ImageBlockParam(BaseModel):
cache_control: CacheControlEphemeralParam | None = None


class DocumentBlockParam(BaseModel):
    """Content block that carries a PDF document inside a message."""

    # The PDF payload, as inline base64 data.
    source: Base64PdfSourceParam
    # Fixed tag identifying this block as a document block.
    type: Literal["document"] = "document"
    # Optional cache-control marker; omitted (None) by default.
    cache_control: CacheControlEphemeralParam | None = None


class TextBlockParam(BaseModel):
text: str
type: Literal["text"] = "text"
Expand All @@ -70,7 +82,7 @@ class ToolResultBlockParam(BaseModel):
tool_use_id: str
type: Literal["tool_result"] = "tool_result"
cache_control: CacheControlEphemeralParam | None = None
content: str | list[TextBlockParam | ImageBlockParam]
content: str | list[TextBlockParam | ImageBlockParam | DocumentBlockParam]
is_error: bool = False


Expand All @@ -96,6 +108,7 @@ class BetaRedactedThinkingBlock(BaseModel):

ContentBlockParam = (
ImageBlockParam
| DocumentBlockParam
| TextBlockParam
| ToolResultBlockParam
| ToolUseBlockParam
Expand Down Expand Up @@ -135,11 +148,13 @@ class MessageParam(BaseModel):

__all__ = [
"Base64ImageSourceParam",
"Base64PdfSourceParam",
"CacheControlEphemeralParam",
"CitationCharLocationParam",
"CitationContentBlockLocationParam",
"CitationPageLocationParam",
"ContentBlockParam",
"DocumentBlockParam",
"ImageBlockParam",
"MessageParam",
"TextBlockParam",
Expand Down
Loading
Loading