Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 121 additions & 15 deletions src/askui/model_providers/askui_vlm_provider.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,99 @@
"""AskUIVlmProvider — VLM access via AskUI's hosted Anthropic proxy."""
"""AskUIVlmProvider — VLM access via AskUI's hosted model proxies."""

import os
from enum import Enum
from functools import cached_property
from typing import Any

from anthropic import Anthropic
from openai import OpenAI
from typing_extensions import override

from askui.model_providers.vlm_provider import VlmProvider
from askui.models.anthropic.messages_api import AnthropicMessagesApi
from askui.models.askui.inference_api_settings import AskUiInferenceApiSettings
from askui.models.openai.messages_api import OpenAIMessagesApi
from askui.models.shared.agent_message_param import (
MessageParam,
ThinkingConfigParam,
ToolChoiceParam,
)
from askui.models.shared.coordinate_space import (
PixelCoordinateSpace,
ScaledCoordinateSpace,
VlmCoordinateSpace,
)
from askui.models.shared.image_scaler import ImageScaler, PatchOptimizedImageScaler
from askui.models.shared.messages_api import MessagesApi
from askui.models.shared.prompts import SystemPrompt
from askui.models.shared.tools import ToolCollection

_DEFAULT_MODEL_ID = "claude-sonnet-4-6"
_DEFAULT_MAX_IMAGE_EDGE = 1024
# Claude emits native pixel coordinates; Gemini emits coordinates in a
# 1000x1000 normalised grid.
_ANTHROPIC_COORDINATE_SPACE = PixelCoordinateSpace()
_GOOGLE_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)


class _Backend(Enum):
    """AskUI proxy backend through which a model is served.

    The member selected for a model id decides which messages API the
    provider builds and which coordinate space it reports.
    """

    # Selected by `_infer_backend` for model ids containing "claude".
    ANTHROPIC = "anthropic"
    # Selected by `_infer_backend` for model ids containing "gemini".
    GOOGLE = "google"
    # Never returned by `_infer_backend`; `_messages_api` routes it to the
    # OpenAI-compatible proxy exactly like GOOGLE.
    OPENAI = "openai"


def _infer_backend(model_id: str) -> _Backend:
    """Map ``model_id`` to the AskUI proxy backend that serves it.

    Matching is a case-insensitive substring test, so vendor-prefixed ids such
    as ``google/gemini-...`` are recognised as well:

    - ids containing ``"claude"`` map to `_Backend.ANTHROPIC`
    - ids containing ``"gemini"`` map to `_Backend.GOOGLE`

    `_Backend.OPENAI` is never returned by this function.

    Args:
        model_id (str): The model identifier to classify.

    Returns:
        _Backend: The backend inferred for ``model_id``.

    Raises:
        ValueError: If ``model_id`` references neither a Claude nor a Gemini
            model.
    """
    lowered = model_id.lower()
    # Ordered on purpose: "claude" is tested before "gemini", so an id that
    # contains both markers resolves to ANTHROPIC.
    markers = (
        ("claude", _Backend.ANTHROPIC),
        ("gemini", _Backend.GOOGLE),
    )
    for marker, backend in markers:
        if marker in lowered:
            return backend
    error_msg = (
        f"Cannot infer a backend for model id {model_id!r}. Expected the model "
        f"id to reference a Claude or Gemini model."
    )
    raise ValueError(error_msg)


class AskUIVlmProvider(VlmProvider):
"""VLM provider that routes requests through AskUI's hosted Anthropic proxy.
"""VLM provider that routes requests through AskUI's hosted model proxies.

The proxy used is selected from `model_id`:

- Anthropic (Claude) models are served via the Anthropic-compatible proxy
(``/proxy/anthropic``) using the `AnthropicMessagesApi`.
- OpenAI-compatible models (e.g. Gemini) are served via the OpenAI-compatible
proxy (``/proxy/openai/v1/chat/completions``) using the `OpenAIMessagesApi`.

Supports Claude 4.x generation models. Credentials are read from environment
variables (`ASKUI_WORKSPACE_ID`, `ASKUI_TOKEN`) lazily — validation happens
on the first API call, not at construction time.
The backend is inferred from the model id by case-insensitive substring match
(see `_infer_backend`); a `ValueError` is raised when it cannot be determined.

Credentials are read from environment variables (`ASKUI_WORKSPACE_ID`,
`ASKUI_TOKEN`) lazily — validation happens on the first API call, not at
construction time.

Args:
askui_settings (`AskUiInferenceApiSettings` | None, optional):
Connection settings (workspace ID, token, base URL). Reads
from environment variables if not provided.
model_id (str | None, optional): Claude model to use. Defaults to
``"claude-sonnet-4-6"``.
client (`Anthropic` | None, optional): Pre-configured Anthropic client.
If provided, ``askui_settings`` is only used for the base URL.
model_id (str | None, optional): Model to use. Defaults to
``"claude-sonnet-4-6"``. Pass a Gemini model id (e.g.
``"gemini-3.5-pro"``) to route through the OpenAI-compatible proxy.
client (`Anthropic` | `OpenAI` | None, optional): Pre-configured client.
Pass an `Anthropic` client for Claude models or an `OpenAI` client
for Gemini models. It is used only when it matches the proxy the
configured ``model_id`` routes to; otherwise a client is built from
``askui_settings``.
image_scaler (`ImageScaler` | None, optional): Custom image preprocessing
callable. If ``None``, uses Anthropic-optimized patch-based scaling
controlled by ``image_edge_max``.
Expand All @@ -63,15 +119,15 @@ def __init__(
self,
askui_settings: AskUiInferenceApiSettings | None = None,
model_id: str | None = None,
client: Anthropic | None = None,
client: Anthropic | OpenAI | None = None,
image_scaler: ImageScaler | None = None,
image_edge_max: int | None = None,
) -> None:
self._askui_settings = askui_settings or AskUiInferenceApiSettings()
self._model_id_value = (
model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
)
self._injected_client = client
self._client = client
resolved_edge_max = (
image_edge_max
or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
Expand All @@ -91,11 +147,32 @@ def model_id(self) -> str:
def image_scaler(self) -> ImageScaler:
return self._image_scaler

@property
@override
def coordinate_space(self) -> VlmCoordinateSpace:
    """Coordinate grid in which the configured model reports positions.

    Returns the 1000x1000 normalised grid when the inferred backend is
    `_Backend.GOOGLE` (Gemini) and the native pixel space for every other
    backend.

    Raises:
        ValueError: Propagated from backend inference when the model id is
            neither a Claude nor a Gemini model.
    """
    emits_normalised_grid = self._backend is _Backend.GOOGLE
    return (
        _GOOGLE_COORDINATE_SPACE
        if emits_normalised_grid
        else _ANTHROPIC_COORDINATE_SPACE
    )

@cached_property
def _backend(self) -> _Backend:
    """Backend inferred from the configured model id, computed once per instance.

    Raises:
        ValueError: If `_infer_backend` cannot classify the model id.
    """
    configured_model_id = self._model_id_value
    return _infer_backend(configured_model_id)

@cached_property
def _messages_api(self) -> AnthropicMessagesApi:
"""Lazily initialise the AnthropicMessagesApi on first use."""
if self._injected_client is not None:
return AnthropicMessagesApi(client=self._injected_client)
def _messages_api(self) -> MessagesApi:
    """Build the `MessagesApi` that matches the configured model's backend.

    `_Backend.OPENAI` and `_Backend.GOOGLE` both use the OpenAI-compatible
    messages API; every other backend uses the Anthropic messages API.
    """
    openai_compatible_backends = (_Backend.OPENAI, _Backend.GOOGLE)
    if self._backend in openai_compatible_backends:
        return self._build_openai_messages_api()
    return self._build_anthropic_messages_api()

def _build_anthropic_messages_api(self) -> AnthropicMessagesApi:
if isinstance(self._client, Anthropic):
return AnthropicMessagesApi(client=self._client)

# TODO: askui_settings.verify_ssl is not considered! #noqa
# if self._askui_settings.verify_ssl:
Expand All @@ -110,6 +187,33 @@ def _messages_api(self) -> AnthropicMessagesApi:
)
return AnthropicMessagesApi(client=client)

def _build_openai_messages_api(self) -> OpenAIMessagesApi:
    """Create the `OpenAIMessagesApi` used for the OpenAI-compatible proxy.

    An injected `OpenAI` client is reused unchanged. Any other injected value
    (``None`` or a client of a different type) is ignored, and a new client
    targeting ``<base_url>/proxy/openai/v1`` is built from the AskUI settings.
    """
    openai_client = self._client
    if not isinstance(openai_client, OpenAI):
        # The api_key is a placeholder (see its literal value); the request
        # credential is the Authorization header taken from the AskUI settings.
        openai_client = OpenAI(
            api_key="DummyValueRequiredByOpenAIClient",
            base_url=f"{self._askui_settings.base_url}/proxy/openai/v1",
            default_headers={
                "Authorization": self._askui_settings.authorization_header
            },
        )
    return OpenAIMessagesApi(client=openai_client)

@override
def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
    """Append coordinate-grid instructions to the prompt for Gemini models.

    Only the `_Backend.GOOGLE` backend is augmented: the prompt section built
    by `coordinate_space` is appended after a blank line, telling the model
    which grid to emit so its output can be mapped back. For every other
    backend the prompt is returned unchanged (Claude emits pixel coordinates
    natively).

    Args:
        system (SystemPrompt): The system prompt to augment.

    Returns:
        SystemPrompt: ``system`` itself for non-Gemini backends, otherwise a
            new prompt with the coordinate section appended.

    Raises:
        ValueError: Propagated from backend inference when the model id is
            neither a Claude nor a Gemini model.
    """
    if self._backend is not _Backend.GOOGLE:
        return system
    coord_info = self.coordinate_space.build_prompt_section()
    # NOTE(review): the new prompt is built from the string form only; confirm
    # `SystemPrompt` carries no other fields that should be preserved.
    # `!s` is the f-string conversion equivalent to calling `str(system)`.
    return SystemPrompt(prompt=f"{system!s}\n\n{coord_info}")

@override
def create_message(
self,
Expand All @@ -122,6 +226,8 @@ def create_message(
temperature: float | None = None,
provider_options: dict[str, Any] | None = None,
) -> MessageParam:
if system is not None:
system = self.augment_system_prompt(system)
result: MessageParam = self._messages_api.create_message(
messages=messages,
model_id=self._model_id_value,
Expand Down
14 changes: 8 additions & 6 deletions src/askui/models/anthropic/messages_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,18 @@ def from_content_block(block: ContentBlockParam) -> BetaContentBlockParam:
"""Convert an internal content block to an Anthropic API-compatible dict.

Uses `model_dump()` to produce plain dicts compatible with Anthropic's
TypedDicts. Strips ``visual_representation`` from `ToolUseBlockParam`
as it is not accepted by the API.
TypedDicts. Strips ``visual_representation`` and ``extra_content`` from
`ToolUseBlockParam` as they are not accepted by the API.
"""
if isinstance(block, ToolUseBlockParam):
# visual_representation is an internal field (perceptual hash for cache
# validation) that does not exist in the Anthropic API schema. Sending
# it would cause the API to reject the request with an unknown-field error.
# visual_representation (perceptual hash for cache validation) and
# extra_content (provider-specific data, e.g. Gemini thought signatures)
# are internal fields that do not exist in the Anthropic API schema.
# Sending them would cause the API to reject the request with an
# unknown-field error.
return cast(
"BetaContentBlockParam",
block.model_dump(exclude={"visual_representation"}),
block.model_dump(exclude={"visual_representation", "extra_content"}),
)
return cast("BetaContentBlockParam", block.model_dump())

Expand Down
30 changes: 20 additions & 10 deletions src/askui/models/openai/messages_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,16 +137,19 @@ def _convert_assistant_message(
if isinstance(block, TextBlockParam):
text_parts.append(block.text)
elif isinstance(block, ToolUseBlockParam):
tool_calls.append(
{
"id": block.id,
"type": "function",
"function": {
"name": block.name,
"arguments": json.dumps(block.input),
},
}
)
tool_call: dict[str, Any] = {
"id": block.id,
"type": "function",
"function": {
"name": block.name,
"arguments": json.dumps(block.input),
},
}
# Echo back provider-specific data (e.g. Gemini thought signatures)
# so multi-turn tool calling keeps working.
if block.extra_content is not None:
tool_call["extra_content"] = block.extra_content
tool_calls.append(tool_call)
# Skip thinking blocks silently

openai_msg: dict[str, Any] = {"role": "assistant"}
Expand Down Expand Up @@ -235,11 +238,18 @@ def _parse_tool_calls(
},
)
arguments = {"raw_arguments": tool_call.function.arguments}
# Gemini (via the OpenAI-compatible API) attaches a `thought_signature`
# inside `extra_content` on each tool call. It must be echoed back on
# subsequent turns or the API rejects the request, so preserve it.
extra_content = (tool_call.model_extra or {}).get("extra_content")
content_blocks.append(
ToolUseBlockParam(
id=tool_call.id,
name=tool_call.function.name,
input=arguments,
extra_content=extra_content
if isinstance(extra_content, dict)
else None,
)
)

Expand Down
4 changes: 4 additions & 0 deletions src/askui/models/shared/agent_message_param.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ class ToolUseBlockParam(BaseModel):
type: Literal["tool_use"] = "tool_use"
cache_control: CacheControlEphemeralParam | None = None
visual_representation: str | None = None # Visual hash for cache validation
# Provider-specific data echoed back on subsequent turns. Used by Gemini via
# the OpenAI-compatible API to carry `thought_signature` (required for tool
# calls to keep working across turns). Not part of the Anthropic API schema.
extra_content: dict[str, Any] | None = None


class BetaThinkingBlock(BaseModel):
Expand Down
Loading
Loading