From 905e828cc6392871576ee14ae9b3f74437b6f01f Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Mon, 29 Jun 2026 13:50:49 +0200 Subject: [PATCH 1/3] feat: add support for new openai-compatible endpoint on askui API in askuiVlmProvider --- .../model_providers/askui_vlm_provider.py | 136 ++++++++++++++-- .../test_askui_vlm_provider.py | 151 ++++++++++++++++++ 2 files changed, 272 insertions(+), 15 deletions(-) create mode 100644 tests/unit/model_providers/test_askui_vlm_provider.py diff --git a/src/askui/model_providers/askui_vlm_provider.py b/src/askui/model_providers/askui_vlm_provider.py index 0337b79b..8c799a67 100644 --- a/src/askui/model_providers/askui_vlm_provider.py +++ b/src/askui/model_providers/askui_vlm_provider.py @@ -1,43 +1,99 @@ -"""AskUIVlmProvider — VLM access via AskUI's hosted Anthropic proxy.""" +"""AskUIVlmProvider — VLM access via AskUI's hosted model proxies.""" import os +from enum import Enum from functools import cached_property from typing import Any from anthropic import Anthropic +from openai import OpenAI from typing_extensions import override from askui.model_providers.vlm_provider import VlmProvider from askui.models.anthropic.messages_api import AnthropicMessagesApi from askui.models.askui.inference_api_settings import AskUiInferenceApiSettings +from askui.models.openai.messages_api import OpenAIMessagesApi from askui.models.shared.agent_message_param import ( MessageParam, ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + ScaledCoordinateSpace, + VlmCoordinateSpace, +) from askui.models.shared.image_scaler import ImageScaler, PatchOptimizedImageScaler +from askui.models.shared.messages_api import MessagesApi from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection _DEFAULT_MODEL_ID = "claude-sonnet-4-6" _DEFAULT_MAX_IMAGE_EDGE = 1024 +# Claude emits native pixel coordinates; Gemini emits coordinates in a +# 
1000x1000 normalised grid. +_ANTHROPIC_COORDINATE_SPACE = PixelCoordinateSpace() +_GOOGLE_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) + + +class _Backend(Enum): + """The AskUI proxy backend a model is served through.""" + + ANTHROPIC = "anthropic" + GOOGLE = "google" + OPENAI = "openai" + + +def _infer_backend(model_id: str) -> _Backend: + """Infer the AskUI proxy backend that serves ``model_id``. + + Claude models route to the Anthropic-compatible proxy; Gemini models (with + or without a ``google/`` vendor prefix) route to the OpenAI-compatible proxy. + + Raises: + ValueError: If no backend can be inferred from ``model_id``. + """ + normalized = model_id.lower() + if "claude" in normalized: + return _Backend.ANTHROPIC + if "gemini" in normalized: + return _Backend.GOOGLE + error_msg = ( + f"Cannot infer a backend for model id {model_id!r}. Expected the model " + f"id to reference a Claude or Gemini model." + ) + raise ValueError(error_msg) class AskUIVlmProvider(VlmProvider): - """VLM provider that routes requests through AskUI's hosted Anthropic proxy. + """VLM provider that routes requests through AskUI's hosted model proxies. + + The proxy used is selected from `model_id`: + + - Anthropic (Claude) models are served via the Anthropic-compatible proxy + (``/proxy/anthropic``) using the `AnthropicMessagesApi`. + - OpenAI-compatible models (e.g. Gemini) are served via the OpenAI-compatible + proxy (``/proxy/openai/v1/chat/completions``) using the `OpenAIMessagesApi`. - Supports Claude 4.x generation models. Credentials are read from environment - variables (`ASKUI_WORKSPACE_ID`, `ASKUI_TOKEN`) lazily — validation happens - on the first API call, not at construction time. + The backend is inferred from a substring of the model id (see `_infer_backend`); a + `ValueError` is raised when it cannot be determined. 
+ + Credentials are read from environment variables (`ASKUI_WORKSPACE_ID`, + `ASKUI_TOKEN`) lazily — validation happens on the first API call, not at + construction time. Args: askui_settings (`AskUiInferenceApiSettings` | None, optional): Connection settings (workspace ID, token, base URL). Reads from environment variables if not provided. - model_id (str | None, optional): Claude model to use. Defaults to - ``"claude-sonnet-4-6"``. - client (`Anthropic` | None, optional): Pre-configured Anthropic client. - If provided, ``askui_settings`` is only used for the base URL. + model_id (str | None, optional): Model to use. Defaults to + ``"claude-sonnet-4-6"``. Pass a Gemini model id (e.g. + ``"gemini-3.5-pro"``) to route through the OpenAI-compatible proxy. + client (`Anthropic` | `OpenAI` | None, optional): Pre-configured client. + Pass an `Anthropic` client for Claude models or an `OpenAI` client + for Gemini models. It is used only when it matches the proxy the + configured ``model_id`` routes to; otherwise a client is built from + ``askui_settings``. image_scaler (`ImageScaler` | None, optional): Custom image preprocessing callable. If ``None``, uses Anthropic-optimized patch-based scaling controlled by ``image_edge_max``. 
@@ -63,7 +119,7 @@ def __init__( self, askui_settings: AskUiInferenceApiSettings | None = None, model_id: str | None = None, - client: Anthropic | None = None, + client: Anthropic | OpenAI | None = None, image_scaler: ImageScaler | None = None, image_edge_max: int | None = None, ) -> None: @@ -71,7 +127,7 @@ def __init__( self._model_id_value = ( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) - self._injected_client = client + self._client = client resolved_edge_max = ( image_edge_max or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0")) @@ -91,11 +147,32 @@ def model_id(self) -> str: def image_scaler(self) -> ImageScaler: return self._image_scaler + @property + @override + def coordinate_space(self) -> VlmCoordinateSpace: + """The coordinate grid the configured model emits coordinates in. + + Gemini (OpenAI proxy) emits coordinates in a 1000x1000 normalised grid; + Claude emits native pixel coordinates. + """ + if self._backend is _Backend.GOOGLE: + return _GOOGLE_COORDINATE_SPACE + return _ANTHROPIC_COORDINATE_SPACE + + @cached_property + def _backend(self) -> _Backend: + return _infer_backend(self._model_id_value) + @cached_property - def _messages_api(self) -> AnthropicMessagesApi: - """Lazily initialise the AnthropicMessagesApi on first use.""" - if self._injected_client is not None: - return AnthropicMessagesApi(client=self._injected_client) + def _messages_api(self) -> MessagesApi: + """Lazily initialise the `MessagesApi` matching the configured model.""" + if self._backend is _Backend.OPENAI or self._backend is _Backend.GOOGLE: + return self._build_openai_messages_api() + return self._build_anthropic_messages_api() + + def _build_anthropic_messages_api(self) -> AnthropicMessagesApi: + if isinstance(self._client, Anthropic): + return AnthropicMessagesApi(client=self._client) # TODO askui_settings.verify_ssl are not considered! 
#noqa # if self._askui_settings.verify_ssl: @@ -110,6 +187,33 @@ def _messages_api(self) -> AnthropicMessagesApi: ) return AnthropicMessagesApi(client=client) + def _build_openai_messages_api(self) -> OpenAIMessagesApi: + if isinstance(self._client, OpenAI): + return OpenAIMessagesApi(client=self._client) + + client = OpenAI( + api_key="DummyValueRequiredByOpenAIClient", + base_url=f"{self._askui_settings.base_url}/proxy/openai/v1", + default_headers={ + "Authorization": self._askui_settings.authorization_header + }, + ) + return OpenAIMessagesApi(client=client) + + @override + def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: + """Append coordinate info to the system prompt for OpenAI-proxy models. + + Claude emits pixel coordinates natively, so the prompt is returned + unchanged. Models routed through the OpenAI proxy (e.g. Gemini) are told + which coordinate grid to emit so their output can be mapped back via + `coordinate_space`. + """ + if self._backend is not _Backend.GOOGLE: + return system + coord_info = self.coordinate_space.build_prompt_section() + return SystemPrompt(prompt=f"{str(system)}\n\n{coord_info}") + @override def create_message( self, @@ -122,6 +226,8 @@ def create_message( temperature: float | None = None, provider_options: dict[str, Any] | None = None, ) -> MessageParam: + if system is not None: + system = self.augment_system_prompt(system) result: MessageParam = self._messages_api.create_message( messages=messages, model_id=self._model_id_value, diff --git a/tests/unit/model_providers/test_askui_vlm_provider.py b/tests/unit/model_providers/test_askui_vlm_provider.py new file mode 100644 index 00000000..a3052632 --- /dev/null +++ b/tests/unit/model_providers/test_askui_vlm_provider.py @@ -0,0 +1,151 @@ +"""Unit tests for AskUIVlmProvider proxy routing.""" + +from unittest.mock import MagicMock + +import pytest +from anthropic import Anthropic +from openai import OpenAI + +from askui.model_providers.askui_vlm_provider 
import AskUIVlmProvider +from askui.models.anthropic.messages_api import AnthropicMessagesApi +from askui.models.askui.inference_api_settings import AskUiInferenceApiSettings +from askui.models.openai.messages_api import OpenAIMessagesApi +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + ScaledCoordinateSpace, +) +from askui.models.shared.prompts import SystemPrompt + +# Placeholder workspace id: all zeros except the mandatory UUIDv4 +# version (4) and variant (8) nibbles required by pydantic validation. +_WORKSPACE_ID = "00000000-0000-4000-8000-000000000000" + + +@pytest.fixture +def askui_settings() -> AskUiInferenceApiSettings: + return AskUiInferenceApiSettings( + workspace_id=_WORKSPACE_ID, + token="secret-token", + ) + + +class TestAskUIVlmProviderRouting: + def test_claude_model_uses_anthropic_messages_api( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="claude-sonnet-4-6", + ) + assert isinstance(provider._messages_api, AnthropicMessagesApi) + + def test_gemini_model_uses_openai_messages_api( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="gemini-2.5-pro", + ) + assert isinstance(provider._messages_api, OpenAIMessagesApi) + + def test_vendor_prefixed_gemini_uses_openai_messages_api( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="google/gemini-3.5-flash", + ) + assert isinstance(provider._messages_api, OpenAIMessagesApi) + + def test_unknown_model_raises( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="mystery-model-1", + ) + with pytest.raises(ValueError, match="Cannot infer a backend"): + _ = provider._messages_api + + def test_gemini_client_targets_openai_proxy( + self, 
askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="gemini-2.5-pro", + ) + api = provider._messages_api + assert isinstance(api, OpenAIMessagesApi) + assert str(api._client.base_url).rstrip("/").endswith("/proxy/openai/v1") + + def test_injected_anthropic_client_used_for_claude( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + mock_client = MagicMock(spec=Anthropic) + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="claude-sonnet-4-6", + client=mock_client, + ) + api = provider._messages_api + assert isinstance(api, AnthropicMessagesApi) + assert api._client is mock_client + + def test_injected_openai_client_used_for_gemini( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + mock_client = MagicMock(spec=OpenAI) + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="gemini-2.5-pro", + client=mock_client, + ) + api = provider._messages_api + assert isinstance(api, OpenAIMessagesApi) + assert api._client is mock_client + + +class TestAskUIVlmProviderSystemPrompt: + def test_claude_prompt_unchanged( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="claude-sonnet-4-6", + ) + system = SystemPrompt(prompt="Base prompt.") + assert provider.augment_system_prompt(system) is system + + def test_gemini_prompt_augmented_with_coordinates( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="gemini-2.5-pro", + ) + system = SystemPrompt(prompt="Base prompt.") + rendered = str(provider.augment_system_prompt(system)) + assert "Base prompt." 
in rendered + assert "1000x1000 normalised grid" in rendered + + +class TestAskUIVlmProviderCoordinateSpace: + def test_claude_uses_pixel_coordinate_space( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="claude-sonnet-4-6", + ) + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_gemini_uses_scaled_coordinate_space( + self, askui_settings: AskUiInferenceApiSettings + ) -> None: + provider = AskUIVlmProvider( + askui_settings=askui_settings, + model_id="google/gemini-3.5-flash", + ) + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) From 01f210b7999abf9ae2c23e07333f2276f921cc86 Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Mon, 29 Jun 2026 16:59:16 +0200 Subject: [PATCH 2/3] fix: preserve Gemini thought_signature for multi-turn tool calling by adding `extra_content` field to `ToolUseBlockParam` --- src/askui/models/anthropic/messages_api.py | 14 ++-- src/askui/models/openai/messages_api.py | 30 ++++++--- .../models/shared/agent_message_param.py | 4 ++ tests/unit/models/openai/test_messages_api.py | 67 +++++++++++++++++++ 4 files changed, 99 insertions(+), 16 deletions(-) diff --git a/src/askui/models/anthropic/messages_api.py b/src/askui/models/anthropic/messages_api.py index 47ea2681..86f8c50c 100644 --- a/src/askui/models/anthropic/messages_api.py +++ b/src/askui/models/anthropic/messages_api.py @@ -54,16 +54,18 @@ def from_content_block(block: ContentBlockParam) -> BetaContentBlockParam: """Convert an internal content block to an Anthropic API-compatible dict. Uses `model_dump()` to produce plain dicts compatible with Anthropic's - TypedDicts. Strips ``visual_representation`` from `ToolUseBlockParam` - as it is not accepted by the API. + TypedDicts. Strips ``visual_representation`` and ``extra_content`` from + `ToolUseBlockParam` as they are not accepted by the API. 
""" if isinstance(block, ToolUseBlockParam): - # visual_representation is an internal field (perceptual hash for cache - # validation) that does not exist in the Anthropic API schema. Sending - # it would cause the API to reject the request with an unknown-field error. + # visual_representation (perceptual hash for cache validation) and + # extra_content (provider-specific data, e.g. Gemini thought signatures) + # are internal fields that do not exist in the Anthropic API schema. + # Sending them would cause the API to reject the request with an + # unknown-field error. return cast( "BetaContentBlockParam", - block.model_dump(exclude={"visual_representation"}), + block.model_dump(exclude={"visual_representation", "extra_content"}), ) return cast("BetaContentBlockParam", block.model_dump()) diff --git a/src/askui/models/openai/messages_api.py b/src/askui/models/openai/messages_api.py index fd0ee20d..6d9b74d4 100644 --- a/src/askui/models/openai/messages_api.py +++ b/src/askui/models/openai/messages_api.py @@ -137,16 +137,19 @@ def _convert_assistant_message( if isinstance(block, TextBlockParam): text_parts.append(block.text) elif isinstance(block, ToolUseBlockParam): - tool_calls.append( - { - "id": block.id, - "type": "function", - "function": { - "name": block.name, - "arguments": json.dumps(block.input), - }, - } - ) + tool_call: dict[str, Any] = { + "id": block.id, + "type": "function", + "function": { + "name": block.name, + "arguments": json.dumps(block.input), + }, + } + # Echo back provider-specific data (e.g. Gemini thought signatures) + # so multi-turn tool calling keeps working. 
+ if block.extra_content is not None: + tool_call["extra_content"] = block.extra_content + tool_calls.append(tool_call) # Skip thinking blocks silently openai_msg: dict[str, Any] = {"role": "assistant"} @@ -235,11 +238,18 @@ def _parse_tool_calls( }, ) arguments = {"raw_arguments": tool_call.function.arguments} + # Gemini (via the OpenAI-compatible API) attaches a `thought_signature` + # inside `extra_content` on each tool call. It must be echoed back on + # subsequent turns or the API rejects the request, so preserve it. + extra_content = (tool_call.model_extra or {}).get("extra_content") content_blocks.append( ToolUseBlockParam( id=tool_call.id, name=tool_call.function.name, input=arguments, + extra_content=extra_content + if isinstance(extra_content, dict) + else None, ) ) diff --git a/src/askui/models/shared/agent_message_param.py b/src/askui/models/shared/agent_message_param.py index 7b82e87e..2d4c48ab 100644 --- a/src/askui/models/shared/agent_message_param.py +++ b/src/askui/models/shared/agent_message_param.py @@ -81,6 +81,10 @@ class ToolUseBlockParam(BaseModel): type: Literal["tool_use"] = "tool_use" cache_control: CacheControlEphemeralParam | None = None visual_representation: str | None = None # Visual hash for cache validation + # Provider-specific data echoed back on subsequent turns. Used by Gemini via + # the OpenAI-compatible API to carry `thought_signature` (required for tool + # calls to keep working across turns). Not part of the Anthropic API schema. 
+ extra_content: dict[str, Any] | None = None class BetaThinkingBlock(BaseModel): diff --git a/tests/unit/models/openai/test_messages_api.py b/tests/unit/models/openai/test_messages_api.py index 22fbcbce..ef1d0335 100644 --- a/tests/unit/models/openai/test_messages_api.py +++ b/tests/unit/models/openai/test_messages_api.py @@ -170,6 +170,36 @@ def test_user_message_with_image(self) -> None: assert parts[0] == {"type": "text", "text": "What is this?"} assert parts[1]["type"] == "image_url" + def test_tool_call_extra_content_round_tripped(self) -> None: + messages = [ + MessageParam( + role="assistant", + content=[ + ToolUseBlockParam( + id="call_1", + name="move_mouse", + input={"x": 1, "y": 2}, + extra_content={"google": {"thought_signature": "sig-abc"}}, + ), + ], + ) + ] + result = _to_openai_messages(messages) + tc = result[0]["tool_calls"][0] + assert tc["extra_content"] == {"google": {"thought_signature": "sig-abc"}} + + def test_tool_call_without_extra_content_omits_key(self) -> None: + messages = [ + MessageParam( + role="assistant", + content=[ + ToolUseBlockParam(id="call_1", name="screenshot", input={}), + ], + ) + ] + result = _to_openai_messages(messages) + assert "extra_content" not in result[0]["tool_calls"][0] + def test_assistant_message_with_tool_calls(self) -> None: messages = [ MessageParam( @@ -371,6 +401,43 @@ def test_text_and_tool_calls(self) -> None: assert isinstance(result.content[0], TextBlockParam) assert isinstance(result.content[1], ToolUseBlockParam) + def test_tool_call_extra_content_captured(self) -> None: + tool_calls = [ + ChatCompletionMessageToolCall.model_validate( + { + "id": "call_1", + "type": "function", + "function": {"name": "move_mouse", "arguments": "{}"}, + "extra_content": {"google": {"thought_signature": "sig-abc"}}, + } + ) + ] + completion = _make_completion( + tool_calls=tool_calls, finish_reason="tool_calls" + ) + result = _from_openai_response(completion) + assert isinstance(result.content, list) + block = 
result.content[0] + assert isinstance(block, ToolUseBlockParam) + assert block.extra_content == {"google": {"thought_signature": "sig-abc"}} + + def test_tool_call_without_extra_content_is_none(self) -> None: + tool_calls = [ + ChatCompletionMessageToolCall( + id="call_1", + type="function", + function=Function(name="screenshot", arguments="{}"), + ) + ] + completion = _make_completion( + tool_calls=tool_calls, finish_reason="tool_calls" + ) + result = _from_openai_response(completion) + assert isinstance(result.content, list) + block = result.content[0] + assert isinstance(block, ToolUseBlockParam) + assert block.extra_content is None + def test_usage_captured(self) -> None: completion = _make_completion( content="ok", prompt_tokens=50, completion_tokens=100 From 88eb46195c57d1020cd621ea513440861fca5484 Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Mon, 29 Jun 2026 17:05:14 +0200 Subject: [PATCH 3/3] fix: qa issue --- tests/unit/models/openai/test_messages_api.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/unit/models/openai/test_messages_api.py b/tests/unit/models/openai/test_messages_api.py index ef1d0335..fb490704 100644 --- a/tests/unit/models/openai/test_messages_api.py +++ b/tests/unit/models/openai/test_messages_api.py @@ -412,9 +412,7 @@ def test_tool_call_extra_content_captured(self) -> None: } ) ] - completion = _make_completion( - tool_calls=tool_calls, finish_reason="tool_calls" - ) + completion = _make_completion(tool_calls=tool_calls, finish_reason="tool_calls") result = _from_openai_response(completion) assert isinstance(result.content, list) block = result.content[0] @@ -429,9 +427,7 @@ def test_tool_call_without_extra_content_is_none(self) -> None: function=Function(name="screenshot", arguments="{}"), ) ] - completion = _make_completion( - tool_calls=tool_calls, finish_reason="tool_calls" - ) + completion = _make_completion(tool_calls=tool_calls, finish_reason="tool_calls") result = 
_from_openai_response(completion) assert isinstance(result.content, list) block = result.content[0]