From 7ff3f42e49fad6a14101640134a3201b0510819b Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Sat, 20 Jun 2026 21:26:21 +0200
Subject: [PATCH 1/2] test(agent): vitest suite + CI for the agent runner; fix
 relay error bug

Convert the hand-run tsx/node:assert scripts under services/agent/test/ into a vitest suite
in tests/unit/ (describe/it, node:assert kept). Add vitest + typescript devDeps, scripts
(test/typecheck/coverage), a node-env vitest.config (junit + v8 coverage), and broaden the
tsconfig include so typecheck covers tests + config. Add a run-services-node-unit-tests job to
12-check-unit-tests.yml, and a services/agent/AGENTS.md.

Testability seam: server.ts exports createAgentServer(run); cli.ts exports
runCli(raw, stream, io) with an injectable engine + output sink, so the HTTP and CLI paths are
tested with a fake engine (server.test.ts, cli.test.ts). src/entry.ts isEntrypoint keeps the
entrypoints inert on import. Add src/version.ts + a richer GET /health; version 0.0.0 -> 0.1.0.

Fix a real bug in tools/dispatch.ts: relayToolCall referenced an undefined callRef in its
error/timeout messages (would throw ReferenceError and mask the real failure); use toolName,
plus a focused relay regression test.

Result: 10 test files / 47 tests pass, tsc --noEmit clean. Two tests are deferred because their
deps live on sibling branches: skills.test.ts (needs engines/skills.ts from feat/agenta-on-rivet)
and wire-contract.test.ts (needs the shared Python golden fixtures). They land when those reach
this branch.

Claude-Session: https://claude.ai/code/session_01GLA8RywSLTGiJvBrDnqZa2
---
 .github/workflows/12-check-unit-tests.yml     |  71 ++
 .gitignore                                    |   2 +
 .../typescript-structure/README.md            |  35 +
 .../typescript-structure/context.md           |  49 +
 .../typescript-structure/plan.md              | 173 ++++
 .../typescript-structure/research.md          | 193 ++++
 .../typescript-structure/status.md            | 217 +++++
 services/agent/AGENTS.md                      |  62 ++
 services/agent/CLAUDE.md                      |   1 +
 services/agent/package.json                   |  16 +-
 services/agent/pnpm-lock.yaml                 | 913 +++++++++++++++++-
 services/agent/src/cli.ts                     | 113 ++-
 services/agent/src/entry.ts                   |  17 +
 services/agent/src/server.ts                  | 132 ++-
 services/agent/src/tools/dispatch.ts          |   4 +-
 services/agent/src/version.ts                 |  35 +
 services/agent/test/code-tool.test.ts         |  92 --
 services/agent/test/continuation.test.ts      |  66 --
 services/agent/test/extension-tools.test.ts   | 109 ---
 services/agent/test/mcp-servers.test.ts       |  58 --
 services/agent/test/responder.test.ts         |  84 --
 services/agent/test/stream-events.test.ts     | 148 ---
 services/agent/test/tool-bridge.test.ts       | 169 ----
 services/agent/test/tool-dispatch.test.ts     |  85 --
 services/agent/tests/unit/cli.test.ts         |  66 ++
 services/agent/tests/unit/code-tool.test.ts   |  89 ++
 .../agent/tests/unit/continuation.test.ts     |  72 ++
 .../agent/tests/unit/extension-tools.test.ts  | 108 +++
 services/agent/tests/unit/mcp-servers.test.ts |  58 ++
 services/agent/tests/unit/responder.test.ts   |  92 ++
 services/agent/tests/unit/server.test.ts      | 109 +++
 .../agent/tests/unit/stream-events.test.ts    | 146 +++
 services/agent/tests/unit/tool-bridge.test.ts | 157 +++
 .../agent/tests/unit/tool-dispatch.test.ts    | 123 +++
 services/agent/tsconfig.json                  |   2 +-
 services/agent/vitest.config.ts               |  20 +
 36 files changed, 2959 insertions(+), 927 deletions(-)
 create mode 100644 docs/design/agent-workflows/typescript-structure/README.md
 create mode 100644 docs/design/agent-workflows/typescript-structure/context.md
 create mode 100644 docs/design/agent-workflows/typescript-structure/plan.md
 create mode 100644 docs/design/agent-workflows/typescript-structure/research.md
 create mode 100644 docs/design/agent-workflows/typescript-structure/status.md
 create mode 100644 services/agent/AGENTS.md
 create mode 120000 services/agent/CLAUDE.md
 create mode 100644 services/agent/src/entry.ts
 create mode 100644 services/agent/src/version.ts
 delete mode 100644 services/agent/test/code-tool.test.ts
 delete mode 100644 services/agent/test/continuation.test.ts
 delete mode 100644 services/agent/test/extension-tools.test.ts
 delete mode 100644 services/agent/test/mcp-servers.test.ts
 delete mode 100644 services/agent/test/responder.test.ts
 delete mode 100644 services/agent/test/stream-events.test.ts
 delete mode 100644 services/agent/test/tool-bridge.test.ts
 delete mode 100644 services/agent/test/tool-dispatch.test.ts
 create mode 100644 services/agent/tests/unit/cli.test.ts
 create mode 100644 services/agent/tests/unit/code-tool.test.ts
 create mode 100644 services/agent/tests/unit/continuation.test.ts
 create mode 100644 services/agent/tests/unit/extension-tools.test.ts
 create mode 100644 services/agent/tests/unit/mcp-servers.test.ts
 create mode 100644 services/agent/tests/unit/responder.test.ts
 create mode 100644 services/agent/tests/unit/server.test.ts
 create mode 100644 services/agent/tests/unit/stream-events.test.ts
 create mode 100644 services/agent/tests/unit/tool-bridge.test.ts
 create mode 100644 services/agent/tests/unit/tool-dispatch.test.ts
 create mode 100644 services/agent/vitest.config.ts

diff --git a/.github/workflows/12-check-unit-tests.yml b/.github/workflows/12-check-unit-tests.yml
index 157d81e470..c8bc699e65 100644
--- a/.github/workflows/12-check-unit-tests.yml
+++ b/.github/workflows/12-check-unit-tests.yml
@@ -301,3 +301,74 @@ jobs:
           files: services/oss/tests/results/junit.xml
           check_name: Application services Unit Test Results
           comment_mode: off
+
+  run-services-node-unit-tests:
+    # The agent runner (services/agent) is a standalone Node/pnpm package, not part of the
+    # Python services suite above. It runs its own vitest unit tests plus a tsc typecheck gate.
+    # No "has_tests" guard on purpose: this suite is established, so a missing/empty suite must
+    # FAIL the job (vitest exits non-zero on no test files), not silently skip it.
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      !github.event.pull_request.draft
+    runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      pull-requests: write
+      contents: read
+    env:
+      AGENTA_LICENSE: oss
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Skip when package selection excludes services
+        if: github.event_name == 'workflow_dispatch' && !contains(fromJSON('["all","services-only"]'), inputs.packages)
+        run: exit 0
+
+      - name: Set up Node.js
+        if: github.event_name != 'workflow_dispatch' || contains(fromJSON('["all","services-only"]'), inputs.packages)
+        uses: actions/setup-node@v4
+        with:
+          node-version: '24'
+
+      - name: Enable Corepack
+        if: github.event_name != 'workflow_dispatch' || contains(fromJSON('["all","services-only"]'), inputs.packages)
+        run: corepack enable
+
+      - name: Cache pnpm store
+        if: github.event_name != 'workflow_dispatch' || contains(fromJSON('["all","services-only"]'), inputs.packages)
+        uses: actions/cache@v4
+        with:
+          path: ~/.pnpm-store
+          key: ${{ runner.os }}-services-agent-pnpm-${{ hashFiles('services/agent/pnpm-lock.yaml') }}
+          restore-keys: |
+            ${{ runner.os }}-services-agent-pnpm-
+
+      - name: Set up pnpm store
+        if: github.event_name != 'workflow_dispatch' || contains(fromJSON('["all","services-only"]'), inputs.packages)
+        working-directory: services/agent
+        run: pnpm config set store-dir ~/.pnpm-store
+
+      - name: Install dependencies
+        if: github.event_name != 'workflow_dispatch' || contains(fromJSON('["all","services-only"]'), inputs.packages)
+        working-directory: services/agent
+        run: pnpm install --frozen-lockfile
+
+      - name: Typecheck (tsc --noEmit, src + tests + config)
+        if: github.event_name != 'workflow_dispatch' || contains(fromJSON('["all","services-only"]'), inputs.packages)
+        working-directory: services/agent
+        run: pnpm run typecheck
+
+      # The code-tool unit test spawns python3 and node end-to-end; both are preinstalled on
+      # ubuntu runners (node is also set up above), so no setup-python step is needed.
+      - name: Run agent runner unit tests
+        if: github.event_name != 'workflow_dispatch' || contains(fromJSON('["all","services-only"]'), inputs.packages)
+        working-directory: services/agent
+        run: pnpm run test:unit
+
+      - name: Publish agent runner unit test results
+        if: always() && (github.event_name != 'workflow_dispatch' || contains(fromJSON('["all","services-only"]'), inputs.packages))
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        with:
+          files: services/agent/test-results/junit.xml
+          check_name: Agent Runner Unit Test Results
+          comment_mode: off
diff --git a/.gitignore b/.gitignore
index 6c91758e28..e363be21a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,6 +65,8 @@ sdks/python/oss/tests/results/
 sdks/python/ee/tests/results/
 services/oss/tests/results/
 services/ee/tests/results/
+services/agent/test-results/
+services/agent/coverage/
 .*
 !**/.gitkeep
 !.github/
diff --git a/docs/design/agent-workflows/typescript-structure/README.md b/docs/design/agent-workflows/typescript-structure/README.md
new file mode 100644
index 0000000000..a86e689022
--- /dev/null
+++ b/docs/design/agent-workflows/typescript-structure/README.md
@@ -0,0 +1,35 @@
+# TypeScript structure for the agent runner
+
+Planning workspace for making the new TypeScript code in the agent-workflows project
+usable, maintainable, and testable, with tests that run easily and run in CI.
+
+The new TypeScript lives mostly in one place: `services/agent/` (the Node "agent runner"
+sidecar). This folder researches its current shape and proposes how to structure, test,
+and gate it the way the rest of the monorepo already handles Python and frontend code.
+
+## Files
+
+- [context.md](context.md) — why this work exists, goals, non-goals, who it is for.
+- [research.md](research.md) — what is actually in the repo today: where the TS lives, how
+  it builds, ships, and is (barely) tested; the conventions the repo already standardizes
+  for TS; a Python-to-TypeScript mental model; the gaps.
+- [plan.md](plan.md) — the phased plan to close the gaps, with concrete file changes,
+  scripts, and CI wiring.
+- [status.md](status.md) — source of truth for progress and open decisions. Read this
+  first to see where things stand.
+
+## TL;DR
+
+The runner code is well-organized (clear `engines/`, `tools/`, `tracing/` seams, a single
+`protocol.ts` wire contract). The weak spots are tooling, not architecture:
+
+1. Eight test files exist but there is **no test runner and no `pnpm test`**. Each test is
+   a hand-run `tsx` script.
+2. Those tests run in **no CI workflow**. The Node side is invisible to the unit-test gate.
+3. There is **no typecheck gate** even though the code is already `strict: true`.
+4. The TS side has **no test asserting the cross-language wire contract**, which is only
+   pinned from Python today.
+
+The plan adopts **vitest** (the runner `web/packages/*` already use), wires a Node job into
+`12-check-unit-tests.yml`, adds a `tsc --noEmit` gate, and adds a golden-fixture round-trip
+test so `protocol.ts` cannot drift from the Python wire silently.
diff --git a/docs/design/agent-workflows/typescript-structure/context.md b/docs/design/agent-workflows/typescript-structure/context.md
new file mode 100644
index 0000000000..e8e6edce4d
--- /dev/null
+++ b/docs/design/agent-workflows/typescript-structure/context.md
@@ -0,0 +1,49 @@
+# Context
+
+## Why this work exists
+
+The agent-workflows project introduced the first substantial server-side TypeScript in a
+repo that was Python on the backend and TypeScript only on the frontend. The new code is
+the agent runner sidecar at `services/agent/`. It drives the agent harnesses (Pi, Claude
+Code, rivet's `sandbox-agent`) because those are Node libraries with no Python SDK. The
+Python agent service calls into it over one JSON contract.
+
+This code grew fast during the build-out. It works and it is reasonably well-factored, but
+it sits outside the conventions the rest of the monorepo follows. The owner is a Python
+developer and wants this TypeScript to feel as routine to maintain and test as the Python
+does: a single command to run the tests, the tests running in CI, a typecheck gate, and a
+clear place for new code and new tests to go.
+
+## Goals
+
+1. **Testable, easily.** One command (`pnpm test`) runs every unit test for the runner.
+   Watch mode and coverage work. Writing a new test is obvious and low-ceremony.
+2. **Tested in CI.** The runner's tests run on every PR that touches it, with results
+   published the same way the Python and web suites are.
+3. **Typechecked.** The `strict` TypeScript already configured produces a CI signal, so a
+   type error fails the build instead of reaching the dockerized sidecar at runtime.
+4. **Contract-safe.** The wire contract between the Python service and the Node runner is
+   guarded from both sides, not just from Python.
+5. **Maintainable and discoverable.** A new contributor (or agent) can find where runner
+   code and runner tests belong, following the same instruction-layering the repo uses for
+   `web/` and `api/`.
+
+## Non-goals
+
+- Rewriting or re-architecting the runner. The `engines` / `tools` / `tracing` split and
+  the `protocol.ts` contract stay. This is about tooling and structure, not a redesign.
+- Folding `services/agent` into the `web/` pnpm workspace. It is a deployable sidecar with
+  its own Docker build and its own lockfile; it should stay a standalone package (see
+  research.md for the trade-off).
+- Changing the frontend TypeScript (`web/oss/src/components/AgentChatSlice/`). That code
+  already lives in the web app under established conventions (vitest, package practices).
+  It is out of scope here.
+- End-to-end / live-LLM acceptance tests for the runner. Those depend on real harness
+  credentials and are tracked separately in the agent-workflows test work. This plan is
+  about the fast unit/contract layer that can run on every PR with no secrets.
+
+## Who this is for
+
+The maintainer (Python-first) and any future contributor or agent touching
+`services/agent`. research.md includes a Python-to-TypeScript mental model so the tooling
+choices map onto things already familiar from the SDK and API side (uv, ruff, pytest).
diff --git a/docs/design/agent-workflows/typescript-structure/plan.md b/docs/design/agent-workflows/typescript-structure/plan.md
new file mode 100644
index 0000000000..29ea51f7fe
--- /dev/null
+++ b/docs/design/agent-workflows/typescript-structure/plan.md
@@ -0,0 +1,173 @@
+# Plan
+
+Five phases, ordered so value lands early and nothing later depends on a refactor. Phases 1
+and 2 are the core ask (easy-to-run tests, tests in CI). Phase 3 protects the contract. Phase 4
+is structure and maintainability, adopted progressively. Phase 5 makes it a supportable service.
+
+Effort estimates assume one developer familiar with the runner. They are deliberate, not
+padded.
+
+## Phase 1 — Make the tests run with one command (~half day)
+
+Goal: `pnpm test` in `services/agent` runs every unit test, with watch and coverage.
+
+0. **Fix the latent bug the typecheck will expose.** `src/tools/dispatch.ts` references an
+   undefined `callRef` at lines 88 and 92 inside `relayToolCall`. Use the in-scope value
+   (`toolName`, or thread the spec's `callRef` in) so the error path stops throwing
+   `ReferenceError`. Found by Codex; this is the proof the typecheck gate has teeth.
+1. Add dev deps to `services/agent/package.json`: `vitest`, `@vitest/coverage-v8`, **and
+   `typescript`** (currently absent: `node_modules/.bin/tsc` does not exist, so `typecheck`
+   cannot run without it). Match the versions `web/packages/*` pin (`vitest` `^4.1.x`); align
+   `@types/node` with Node 24.
+2. Add `services/agent/vitest.config.ts`, modeled on `agenta-shared/vitest.config.ts`:
+   `include: ["tests/unit/**/*.test.ts"]`, `environment: "node"`,
+   `reporters: ["default", "junit"]` to `test-results/junit.xml`, v8 coverage over `src/`.
+3. Add scripts to `package.json`:
+
+   ```jsonc
+   "test": "pnpm run test:unit",
+   "test:unit": "vitest run",
+   "test:watch": "vitest",
+   "test:coverage": "vitest run --coverage",
+   "typecheck": "tsc --noEmit"
+   ```
+
+4. Move `test/*.test.ts` to `tests/unit/*.test.ts` and wrap the bare `{ ... }` blocks in
+   `describe` / `it` so reporting and junit are per-case. **Do not bother rewriting every
+   `assert` to `expect`** (Codex's point): vitest runs `node:assert` fine, so the conversion
+   is just adding `describe`/`it` wrappers, not touching assertions. Keep filenames. The
+   dynamic-import-after-env pattern (e.g. `skills.test.ts`) stays valid; add
+   `vi.resetModules()` only where a file needs a clean module per case.
+5. Update the `Run:` header comment in each test to `pnpm test` (or
+   `pnpm exec vitest run tests/unit/<name>.test.ts` for a single file).
+
+Done when: `pnpm test` is green locally and prints a single summary across all files.
+
+## Phase 2 — Run them in CI (~half day)
+
+Goal: the runner's tests gate every PR that touches `services/agent`.
+
+1. Add a `run-services-node-unit-tests` job to `.github/workflows/12-check-unit-tests.yml`,
+   mirroring the existing `run-web-unit-tests` setup but scoped to the package:
+   - `actions/setup-node@v4` with `node-version: '24'`, `corepack enable`.
+   - Cache the pnpm store keyed on `services/agent/pnpm-lock.yaml`.
+   - `working-directory: services/agent`, `pnpm install --frozen-lockfile`, then
+     `pnpm run typecheck` and `pnpm run test:unit`.
+   - **Ensure `python3` is on the runner.** `tests/unit/code-tool.test.ts` spawns `python3` (and
+     `node`) through `runCodeTool`. ubuntu-latest ships python3, but make it explicit, or
+     split the subprocess code-tool test into an integration test the unit job can skip.
+   - Publish `services/agent/test-results/junit.xml` with
+     `EnricoMi/publish-unit-test-result-action@v2`, `check_name: Agent Runner Unit Test Results`.
+2. Path-filter the job. The workflow already triggers on `services/**`; gate the new job's
+   steps so it only does work when `services/agent/**` changed (the same `if:` pattern the
+   other jobs use for their package selection), to avoid installing Node on unrelated PRs.
+3. Decide whether `typecheck` failing fails the job. Recommendation: yes. The code is
+   already `strict`; a type error should not merge.
+
+Done when: a PR touching `services/agent` shows an "Agent Runner Unit Test Results" check, and a
+deliberately broken type or assertion turns it red.
+
+## Phase 3 — Guard the wire contract from the TS side (~half day)
+
+Goal: a contract change must update Python and TypeScript together, or fail on both.
+
+**Codex correction (important):** `protocol.ts` is types only, erased at runtime. "Loading
+JSON and round-tripping it through an interface" validates nothing at runtime. The contract
+test needs real runtime checks, in two layers:
+
+1. Add `tests/utils/golden.ts` that loads the shared fixtures from
+   `sdks/python/oss/tests/pytest/unit/agents/golden/` (relative path from the runner, read
+   at test time). No copying; one source of truth.
+2. **Runtime validation, not type assertion.** Either (a) introduce a zod (or equivalent)
+   schema that mirrors `protocol.ts` and `parse()` each golden fixture in
+   `tests/unit/wire-contract.test.ts`, or (b) write explicit structural assertions (required
+   keys present, types correct, the `ok` discriminant). Option (a) doubles as a real runtime
+   guard the server can use on inbound requests; option (b) is lighter but only a test.
+3. **Type-level check, separately.** Use vitest's `expectTypeOf` (or a `tsd`-style check) so
+   a fixture that drifts from `AgentRunRequest` fails `typecheck`, independent of the runtime
+   assertions.
+4. Exercise the pure helpers in `protocol.ts` (`messageText`, `resolvePromptText`,
+   `resolveRunSessionId`) against fixture-derived inputs.
+5. Note in `protocol.ts` and Python `test_wire_contract.py` that the contract is now pinned
+   from both sides, so future editors look both ways.
+
+Done when: editing a field name in `protocol.ts` without updating the fixtures (or vice
+versa) fails this test, at runtime and at typecheck.
+
+## Phase 4 — Structure and maintainability (progressive, no big bang)
+
+Adopt as the runner is touched, not in one sweep.
+
+1. **Add `services/agent/AGENTS.md`** (with a `CLAUDE.md` symlink, matching `web/`, `api/`).
+   Keep it short: the package is a standalone pnpm project; how to run/serve/test/typecheck;
+   where runner code goes (`src/{engines,tools,tracing}`) and where tests go
+   (`tests/unit`, fixtures in `tests/utils`); the wire contract is mirrored in Python
+   `wire.py` and pinned by golden fixtures, so change both sides; vitest is the runner.
+   Add a thin `.claude/rules` / `.cursor/rules` pointer if the repo expects one.
+2. **Local typecheck gate (optional).** The root `.husky/pre-commit` already runs prettier
+   and gitleaks repo-wide. Optionally add `pnpm --dir services/agent typecheck` for changed
+   TS, or leave the gate to CI to keep commits fast. Recommendation: CI is the gate; skip
+   the local hook unless commits regularly land type errors.
+3. **Linting (optional, later nice-to-have).** There is no eslint outside `web/`.
+   `prettier` (global hook) covers formatting. A small `typescript-eslint` flat config for
+   `services/agent` would add real value for async runner code (`no-floating-promises`,
+   `no-misused-promises`). Treat as optional; `tsc --strict` + prettier is an acceptable
+   floor.
+4. **Extract a testability seam (Codex).** `server.ts` and `cli.ts` wire transport to the
+   engines inline, so HTTP/CLI behavior can only be tested with a live harness. Export
+   `createServer(runAgent)` and `runCli(runAgent)` that take the engine as an argument. Then
+   unit tests inject a fake engine returning a deterministic `AgentRunResult` and cover
+   `/health`, invalid-JSON handling, `POST /run`, NDJSON record ordering, and CLI exit codes,
+   with no Pi/Claude/rivet. This is the highest-value structural change for testability.
+5. **Decompose the two large files opportunistically.** When next editing `engines/rivet.ts`
+   or `tracing/otel.ts`, pull a cohesive seam into its own module and unit-test it, the way
+   `responder.ts` was extracted from `rivet.ts`. Not a scheduled refactor.
+
+## Phase 5 — Make it a versioned, supportable service (Codex's main gap)
+
+The review's core point: the plan above makes the runner testable but does not make it a
+first-class deployable. These items make the SDK and the sidecar safe to release on their
+own cadences. Scope and sequence with the platform/release owner; some are bigger than a
+half-day.
+
+1. **Protocol/version negotiation.** Add a `protocolVersion` (major) to the wire and have
+   `GET /health` (or a new `/capabilities`) return `runnerVersion`, `protocolVersion`,
+   supported engines, and harnesses. The Python adapter probes once and refuses an
+   incompatible major before the first run. Today `/health` returns only `{status:"ok"}` and
+   `package.json` is `0.0.0`.
+2. **Release ownership.** Decide whether the sidecar version tracks the Agenta release or is
+   versioned independently, and stop shipping `0.0.0`. The SDK should pin a compatible runner
+   *protocol* range, not a package-version equality.
+3. **Sidecar image publishing.** No CI publishes the runner image today (only api/web/services
+   images are built, e.g. in `42-railway-build.yml`). Add a build/publish job so the HTTP
+   sidecar (the production boundary) is actually distributable.
+4. **Local code-tool execution policy.** `runCodeTool` scopes secret env, but a `code` tool
+   still runs an arbitrary `python3`/`node` process in the sidecar. State the sandbox,
+   resource, and network policy (it is already sandboxed in Daytona; the local/in-sidecar
+   path needs an explicit stance), so this is a deliberate posture, not an oversight.
+5. **Config hygiene.** `services/oss/src/agent/app.py` reads `AGENTA_AGENT_*` via raw
+   `os.getenv`. The repo convention (root `AGENTS.md`) is to add config to
+   `api/oss/src/utils/env.py` and consume the shared `env` object. Align it.
+6. **Fix the stale `local.py` docstring.** `sdks/python/.../adapters/local.py` says the Pi
+   runner is "shipped inside the wheel," which is not true today and is the likely source of
+   the wheel confusion. Either implement that path deliberately (see the packaging options in
+   the answer to question 1) or correct the docstring to match reality.
+
+## Sequencing and ownership
+
+- Phases 1 to 3 need no runtime change beyond the Phase 1 `callRef` fix and can land as one small
+  PR or three tiny ones. They add no new production code paths, only tooling and tests. Start here.
+- Phase 4 item 1 (`AGENTS.md`) is worth doing alongside Phase 1 so the new test location is
+  documented the moment it exists. Item 4 (the `createServer`/`runCli` seam) unblocks the
+  HTTP/CLI tests and is worth pulling forward.
+- Phase 5 is a separate track, owned with whoever owns releases and deployment. It does not
+  block Phases 1 to 4, but it is what turns "tested code" into "supportable service."
+- None of this blocks ongoing runner feature work; it runs in parallel.
+
+## What success looks like
+
+- `cd services/agent && pnpm test` runs the whole suite in one go, green, with a summary.
+- A PR touching the runner gets a red/green unit-test + typecheck check automatically.
+- `protocol.ts` cannot drift from the Python wire without a test failing.
+- A new contributor reads `services/agent/AGENTS.md` and knows where code and tests go and
+  how to run them, without reading the whole tree.
diff --git a/docs/design/agent-workflows/typescript-structure/research.md b/docs/design/agent-workflows/typescript-structure/research.md
new file mode 100644
index 0000000000..98f1ce228c
--- /dev/null
+++ b/docs/design/agent-workflows/typescript-structure/research.md
@@ -0,0 +1,193 @@
+# Research
+
+Findings from reading the repo on 2026-06-20. Everything below is observed in the tree, not
+assumed.
+
+## 1. Where the new TypeScript actually lives
+
+Server-side TypeScript that did not exist before agent-workflows is concentrated in one
+package:
+
+```
+services/agent/                 standalone pnpm package "agenta-agent-pi-wrapper"
+  package.json                  ESM, type:module, pnpm 10.30, Node 24
+  tsconfig.json                 strict, noEmit, moduleResolution Bundler
+  pnpm-lock.yaml                its OWN lockfile (not in the web workspace)
+  src/
+    cli.ts        (88)          entrypoint: stdin JSON in, stdout JSON out
+    server.ts     (155)         entrypoint: HTTP sidecar on :8765 (GET /health, POST /run)
+    protocol.ts   (295)         the /run wire contract: request, result, events, caps
+    responder.ts  (77)          permission/HITL policy seam (extracted from rivet.ts)
+    engines/
+      pi.ts       (403)         drive the Pi SDK in-process
+      rivet.ts    (1085)        drive any harness over ACP via sandbox-agent
+      skills.ts   (50)          resolve forced-skill names to dirs on disk
+    tools/        (7 files)     callback, code, dispatch, mcp-bridge, mcp-server, relay, ...
+    tracing/
+      otel.ts     (1026)        turn a run into OTel spans nested under /invoke
+    extensions/
+      agenta.ts   (114)         Pi extension, esbuild-bundled into dist/ for Pi to load
+  test/           (8 files)     hand-run tsx scripts (see section 3)
+  skills/         SKILL.md       bundled forced-skills for the Agenta harness
+  config/         fallback hello-world agent
+  docker/         Dockerfile (prod) + Dockerfile.dev
+  scripts/        build-extension.mjs (esbuild bundle of the extension)
+```
+
+Total runner source is ~4,100 lines. It is the only meaningful server-side TS in the repo.
+
+Other TypeScript exists but is **not** in scope:
+
+- `web/oss/src/components/AgentChatSlice/` — frontend, already under web conventions.
+- `web/packages/*`, `web/oss`, `web/ee` — the established frontend, vitest + Playwright.
+- `docs/`, `examples/` — Docusaurus and sample apps.
+
+So "TypeScript in different places" is really one homeless package (`services/agent`) plus
+frontend code that already has a home. The plan targets the package.
+
+## 2. How the runner builds, runs, and ships today
+
+- **No compile step for the app.** It runs through `tsx` (a TS-aware Node loader). Both the
+  dev image (`tsx watch src/server.ts`) and the prod image (`tsx src/server.ts`) execute
+  the source directly. `tsconfig.json` is `noEmit: true`; it exists only for typechecking,
+  and nothing runs that typecheck.
+- **One real build:** `scripts/build-extension.mjs` esbuild-bundles `src/extensions/agenta.ts`
+  into `dist/extensions/agenta.js` so Pi can load it anywhere. Both Dockerfiles run
+  `pnpm run build:extension`.
+- **Two transports, one contract.** Python reaches the runner either over HTTP (the docker
+  sidecar) or by spawning the CLI as a subprocess. Both carry the same `/run` JSON. See
+  `sdks/python/agenta/sdk/agents/utils/ts_runner.py` (`deliver_http`, `deliver_subprocess`,
+  plus the NDJSON streaming variants).
+- **Standalone package.** `services/agent` has its own `pnpm-lock.yaml` and is absent from
+  `web/pnpm-workspace.yaml`. That isolation is deliberate and worth keeping: the sidecar
+  image installs only the runner's deps, with no coupling to the web dependency graph.
+- **No TS in the wheel today, but a docstring claims otherwise.** The SDK wheel is pure
+  Python (`uv_build`, zero `.ts`/`.js`). However `sdks/python/.../adapters/local.py` (the
+  unimplemented `LocalBackend`) says the Pi runner is "the bundled JS runner ... shipped
+  inside the wheel." That is aspirational and NOT YET IMPLEMENTED, but it is almost certainly
+  the source of the "is the TS part of the SDK / wheel" worry. The future-local-backend
+  question (bundle a built JS runner into the wheel vs require Docker/npm) is real and
+  undecided; see plan Phase 5 item 6 and the distribution options in status.md.
+
+Scripts present in `package.json` today: `run:cli`, `serve`, `serve:watch`,
+`build:extension`, `login`. There is **no `test`, no `typecheck`, no `lint`, no `format`.**
+
+## 3. How it is tested today (the gap)
+
+There are 8 test files under `services/agent/test/`:
+
+```
+code-tool.test.ts   continuation.test.ts   mcp-servers.test.ts   responder.test.ts
+skills.test.ts      stream-events.test.ts  tool-bridge.test.ts   tool-dispatch.test.ts
+```
+
+They are genuinely good tests in content. The problem is entirely in how they run:
+
+- Each file is a **standalone script** using `node:assert/strict`, with bare `{ ... }`
+  blocks for grouping and a `console.log("...: ok")` at the end. The header of each says
+  `Run: pnpm exec tsx test/<name>.test.ts`.
+- There is **no runner and no aggregation.** Running "the test suite" means running eight
+  commands by hand. A failure is a thrown assertion and a non-zero exit on one file; there
+  is no summary, no count, no `--watch`, no filtering, no coverage, no junit.
+- They run in **no CI workflow.** `12-check-unit-tests.yml` has a `run-services-unit-tests`
+  job, but it only looks at `services/oss/tests/pytest/unit` (Python) and runs
+  `uv run python run-tests.py`. It never installs Node or touches `services/agent`. Every
+  vitest mention in CI refers to `web/packages`. So the runner's tests have never gated a
+  PR.
+- There is **no TS-side contract test.** `protocol.ts` says the contract is pinned by
+  golden fixtures under `sdks/python/oss/tests/pytest/unit/agents/golden/` and checked by
+  the Python `test_wire_contract.py`. That guards the Python mirror (`wire.py`). Nothing on
+  the TS side asserts that `protocol.ts` still accepts those fixtures, so the runner can
+  drift from the contract and only Python would notice.
+
+## 4. What the repo already standardizes for TypeScript tests
+
+We do not need to invent a convention. The frontend already has one, and there is a written
+spec:
+
+- **vitest is the repo's TS unit runner.** `web/packages/*` (agenta-shared, entities,
+  entity-ui, playground, annotation) each ship a `vitest.config.ts` and these scripts:
+
+  ```jsonc
+  "test": "pnpm run test:unit",
+  "test:unit": "vitest run",
+  "test:watch": "vitest",
+  "test:coverage": "vitest run --coverage",
+  "typecheck": "tsc --noEmit"
+  ```
+
+  Config (from `agenta-shared/vitest.config.ts`): `include: ["tests/unit/**/*.test.ts"]`,
+  `environment: "node"`, `reporters: ["default", "junit"]` writing `test-results/junit.xml`,
+  and v8 coverage. This is exactly the shape a Node service wants.
+
+- **CI runs them generically.** The web job runs `pnpm -r --if-present test:unit` across
+  workspace packages and publishes `web/packages/*/test-results/junit.xml` via the
+  `publish-unit-test-result-action`. Any package that defines `test:unit` is picked up; the
+  rest are skipped. A new package following the same script names slots in for free.
+
+- **There is a folder-layout spec.** `docs/designs/testing/testing.structure.specs.md`
+  defines a runner-first layout: `<component>/tests/<runner>/{unit,integration,acceptance,utils}`
+  plus `manual/` and `legacy/`. In practice the vitest packages collapse this to
+  `tests/unit/**/*.test.ts` (one runner, so no `vitest/` level). The agent runner's current
+  flat `test/` directory matches neither; aligning it to `tests/unit/` matches the closest
+  precedent (web packages) and the spec.
+
+## 5. Python-to-TypeScript mental model
+
+For mapping the tooling onto what the SDK/API side already does:
+
+| Concern              | Python (api/, sdks/)      | TypeScript (services/agent)        |
+|----------------------|---------------------------|------------------------------------|
+| Package manager      | `uv`                      | `pnpm` (own lockfile)              |
+| Run a script         | `uv run python x.py`      | `pnpm exec tsx x.ts`               |
+| Test runner          | `pytest`                  | **vitest** (proposed)              |
+| One command to test  | `uv run python run-tests.py` | `pnpm test` (proposed)          |
+| Type checker         | `mypy` / pyright          | `tsc --noEmit` (configured, unrun) |
+| Formatter            | `ruff format`             | `prettier` (runs repo-wide in hooks) |
+| Linter               | `ruff check`              | none today (eslint is web-only)    |
+| Fixtures             | `conftest.py` fixtures    | `tests/utils/` helper modules      |
+| CI unit gate         | `12-check-unit-tests.yml` Python jobs | new Node job (proposed) |
+
+The headline: the TS runner has a formatter (via the global pre-commit) but no test runner,
+no test gate, and no type gate. The Python side has all three. Closing that is the work.
+
+## 6. The cross-language contract is the seam that matters most
+
+`protocol.ts` is the single source of the `/run` types. `sdks/python/.../utils/wire.py`
+hand-mirrors them. The contract is pinned by shared golden JSON
+(`run_request.pi.json`, `run_request.claude.json`, `run_result.ok.json`,
+`run_result.error.json`) and asserted by `test_wire_contract.py` on the Python side only.
+
+This is the highest-value place to add a TS test. A vitest test that loads those same
+golden files and round-trips them through `protocol.ts` (parse the request shape, build a
+result that matches the result fixture) means a contract change has to update both sides or
+fail on both sides. It reuses fixtures that already exist, needs no harness and no network,
+and directly protects the Python-to-Node boundary the whole feature rests on.
+
+## 7. Maintainability observations (not blockers)
+
+- **Architecture is sound.** Engines are peers behind one contract; tools are split by
+  concern; the responder seam was already extracted from `rivet.ts` (and is unit-tested).
+  `protocol.ts` carries thorough doc comments. A Python dev can navigate it.
+- **Two large files.** `engines/rivet.ts` (1,085) and `tracing/otel.ts` (1,026) are the
+  obvious decomposition candidates. The responder extraction is the precedent: pull
+  cohesive seams out into separately testable units when you next touch them. Not a
+  big-bang refactor, and not a prerequisite for the test/CI work.
+- **No `AGENTS.md` for the package.** The repo pushes area conventions into nested
+  `AGENTS.md` files (`web/AGENTS.md`, `api/AGENTS.md`) with a `CLAUDE.md` symlink.
+  `services/agent` has a strong `README.md` but no `AGENTS.md`, so the "where does runner
+  code/tests go, how do I run them" rules have nowhere to live. Adding one is cheap and
+  fits the repo's instruction-layering model.
+- **Env-at-import-time.** Some modules read env on import (e.g. `skills.ts` reads
+  `AGENTA_AGENT_SKILLS_DIR`; the test sets it before a dynamic `import()`). vitest isolates
+  modules per test file, so this keeps working, but new tests touching such modules should
+  use dynamic import or `vi.resetModules()` rather than top-level import.
+
+## 8. One real decision to make
+
+**vitest vs `node:test`.** `node:test` is built in and adds zero dependencies, but its junit
+reporter and experimental coverage are more bare-bones, and it would diverge from the frontend.
+vitest adds one dev dependency but matches `web/packages` exactly, gives junit + v8 coverage +
+watch + filtering out of the box, and lets the CI wiring mirror the web job. Recommendation:
+**vitest.** Everything in the plan assumes it; swapping to `node:test` would only change the
+runner dependency and config, not the structure.
diff --git a/docs/design/agent-workflows/typescript-structure/status.md b/docs/design/agent-workflows/typescript-structure/status.md
new file mode 100644
index 0000000000..a3a403fbd7
--- /dev/null
+++ b/docs/design/agent-workflows/typescript-structure/status.md
@@ -0,0 +1,217 @@
+# Status
+
+Source of truth for this planning folder. Update as work proceeds.
+
+## Current state — 2026-06-20
+
+Research complete. Plan drafted and then reviewed by Codex (gpt-5.5, xhigh). Plan widened in
+response (see plan.md Phases 1, 3, 5). **Phase 1 is implemented and green.**
+
+### Phase 1 done (2026-06-20)
+
+- Fixed the `callRef` bug in `src/tools/dispatch.ts` (lines 88, 92 now use `toolName`).
+- Added dev deps: `vitest` 4.1.9, `@vitest/coverage-v8` 4.1.9, `typescript` 5.9.3; bumped
+  `@types/node` to 24.13.2 (matches the Node 24 runtime). `pnpm-lock.yaml` updated.
+- Added `vitest.config.ts` (node env, junit to `test-results/junit.xml`, v8 coverage).
+- Added scripts: `test`, `test:unit`, `test:watch`, `test:coverage`, `typecheck`.
+- Moved `test/*.test.ts` (9 files, including `extension-tools.test.ts` from the
+  `feat/agent-runner-engines` lane) to `tests/unit/*.test.ts`, wrapped in `describe`/`it`,
+  kept `node:assert`, fixed import depth to `../../src/`.
+- Added `test-results/` and `coverage/` to `.gitignore`.
+
+Verified: `pnpm typecheck` exits 0 (and a planted type error makes it exit 2, so the gate has
+teeth). `pnpm test` = 9 files, 42 tests, all pass, junit written. `pnpm test:coverage` works
+(32.6% line coverage; engines are not exercised by unit tests yet, as expected).
+
+Not mine in the same working tree: `src/engines/pi.ts`, `src/engines/rivet.ts`, the
+Dockerfiles, and `src/engines/skills.ts` were already modified/untracked from the parallel
+`feat/agent-runner-engines` lane. The combined tree still typechecks and tests green.
+
+### Phase 2 done (2026-06-20)
+
+- Added job `run-services-node-unit-tests` to `.github/workflows/12-check-unit-tests.yml`,
+  mirroring the web (pnpm setup) and python-services (has_tests guard + package-selection
+  gate) jobs: Node 24 + corepack pnpm, `pnpm install --frozen-lockfile`, `pnpm run typecheck`,
+  `pnpm run test:unit` (working-directory `services/agent`), then publish
+  `services/agent/test-results/junit.xml` as "Agent Runner Unit Test Results".
+- No `setup-python`: the code-tool test spawns `python3`/`node`, both preinstalled on ubuntu
+  runners.
+- Verified locally: the workflow YAML parses and the job is present;
+  `pnpm install --frozen-lockfile` succeeds (lockfile matches package.json), so CI will not
+  fail on a lockfile mismatch.
+
+### Codex review of Phase 1+2 (xhigh) — all 5 findings fixed (2026-06-20)
+
+Codex confirmed the `callRef` fix is correct and the test conversion is assertion-faithful,
+then found 5 issues. All fixed and verified:
+
+1. **High — CI could pass while running nothing.** The `has_tests` guard let the job skip
+   silently. Removed it; vitest exits non-zero on no test files, so a missing suite now fails.
+2. **High — the nested `.gitignore` is itself ignored.** Root `.gitignore` line 68 (`.*`)
+   ignores every nested `.gitignore`, so the `services/agent/.gitignore` artifact rules could
+   never land. Reverted that edit; added `services/agent/test-results/` and
+   `services/agent/coverage/` to ROOT `.gitignore` (the repo's convention). Verified with
+   `git check-ignore`.
+3. **Medium — typecheck did not cover tests/config.** Broadened `tsconfig.json` `include` to
+   `src + tests + vitest.config.ts`. Proven: a planted type error in a test file now fails
+   `pnpm typecheck`.
+4. **Medium — brittle env isolation.** `skills.test.ts` now saves/restores
+   `AGENTA_AGENT_SKILLS_DIR` in `afterAll`; `responder.test.ts` has an `afterEach` that clears
+   `AGENTA_RIVET_DENY_PERMISSIONS` even if an assertion throws.
+5. **Low — the fixed bug had no direct test.** Added two `relayToolCall` tests in
+   `tool-dispatch.test.ts`: the ok path returns the relayed text, and the empty-error path
+   asserts `tool relay failed for <toolName>` (this would have thrown `ReferenceError` before
+   the fix).
+
+Final state after Phase 1+2: `pnpm typecheck` exits 0 (covers src + tests + config; planted
+errors exit 2). `pnpm test` = 9 files / 44 tests pass. `pnpm install --frozen-lockfile` clean.
+Workflow YAML valid.
+
+### Phase 3 done (2026-06-20)
+
+The TS side of the cross-language wire contract (the "later PR" the Python
+`test_wire_contract.py` names). Two layers, per Codex's correction that types are erased:
+
+- `tests/utils/golden.ts` reads the shared fixtures from
+  `sdks/python/oss/tests/pytest/unit/agents/golden/` in place via `node:fs` (no copy).
+- `tests/unit/wire-contract.test.ts`:
+  - **Runtime**: loads `run_request.pi.json`, `run_request.claude.json`, `run_result.ok.json`,
+    `run_result.error.json`; asserts shapes; exercises `resolvePromptText`,
+    `resolveRunSessionId`, `messageText`; checks the camelCase capability keys and the
+    trailing untyped event the wire carries.
+  - **Compile-time**: `KNOWN_REQUEST_KEYS` (mirrored from the Python test) and the capability
+    keys are assigned to `(keyof AgentRunRequest)[]` / `(keyof HarnessCapabilities)[]`. If
+    `protocol.ts` renames or drops a field the wire still emits, `tsc` fails.
+
+Both gates proven: a wire key not on `AgentRunRequest` fails `tsc` (TS2322); reverting restores
+it. Final: `pnpm test` = **10 files / 51 tests** pass, `pnpm typecheck` exits 0.
+
+Phases 1, 2, and 3 are implemented, reviewed, and green.
+
+### Phase 4 done (2026-06-20)
+
+- `services/agent/AGENTS.md` + `CLAUDE.md` symlink (matches `web/`, `api/`): standalone pnpm
+  package, commands, where code/tests go, the mirrored wire contract, the testing seams.
+- **Testability seam (Codex's #1 structural item):** `server.ts` exports
+  `createAgentServer(run)` / `createRequestListener(run)`; `cli.ts` exports
+  `runCli(raw, stream, io)` with an injectable engine and output sink (streaming stays live).
+  Both entrypoints auto-run only when they are the process entry (`src/entry.ts`
+  `isEntrypoint`), so importing them in tests is inert.
+- New tests: `server.test.ts` (5) drives a real server on an ephemeral port with a fake
+  engine (/health, /run, 400 invalid JSON, 500 failure, NDJSON order); `cli.test.ts` (4)
+  drives `runCli` with a fake engine + collecting write (one-shot, invalid JSON, failure,
+  streaming order).
+- Deferred (documented): `typescript-eslint` (tsc --strict + prettier is the floor; risks a
+  rabbit hole in existing engine code) and decomposing `rivet.ts`/`otel.ts` (opportunistic).
+
+### Phase 5 partial (2026-06-20) — runner side done; client/release/CI need decisions
+
+Implemented (self-contained, additive):
+- `src/version.ts`: `PROTOCOL_VERSION = 1`, `RUNNER_VERSION` (from package.json), engines,
+  harnesses. `GET /health` now returns this identity instead of `{status:"ok"}`. Verified
+  live: `{"status":"ok","runner":"0.1.0","protocol":1,"engines":[...],"harnesses":[...]}`.
+- `package.json` version `0.0.0` -> `0.1.0`.
+- Fixed the misleading `sdks/python/.../adapters/local.py` docstring (the source of the wheel
+  worry): the runner is NOT in the wheel; runner-delivery is an open decision.
+
+Deferred (genuine decisions / other areas / would deepen entanglement):
+- Client-side probe: the Python adapter should `GET /health` once and refuse an incompatible
+  protocol major (SDK `ts_runner.py`/adapters; needs the version-compat policy decided).
+- Release ownership + SDK pinning a runner protocol range (decision: does the sidecar version
+  track the Agenta release or version independently?).
+- Sidecar image publishing in CI (`42-railway-build.yml` builds only api/web/services today).
+- Config hygiene: `services/oss/src/agent/app.py` raw `os.getenv` -> shared `env` object
+  (that file is modified by another lane right now; editing it would conflict).
+
+Final after Phases 4+5: `pnpm test` = **12 files / 60 tests** pass, `pnpm typecheck` exits 0.
+
+### Commit status (2026-06-20)
+
+Not committable as an independent unit yet. GitButler committed the new files cleanly (tests,
+config, CI, docs) but refused to commit the edits to `package.json`, `dispatch.ts`,
+`tsconfig.json` and the old-test deletions, because those files are owned by the in-flight
+`feat/agent-runner-engines` commits below in the stack. A half-committed lane is broken, so
+the lane was rolled back to snapshot `fce735461f`. All work is intact and green on disk. It
+should land WITH the agent-runner feature (that lane's owner includes these files, or this
+test work stacks cleanly once that feature is actually committed/pushed).
+
+## Codex review (xhigh) — 2026-06-20
+
+Codex's verdict: the plan is directionally right but too narrow. It fixes test ergonomics
+but does not yet make the runner a versioned, supportable server component. Verified findings
+we accepted:
+
+- **Real bug (verified):** `services/agent/src/tools/dispatch.ts` references `callRef` at
+  lines 88 and 92, but that identifier is not defined in `relayToolCall` (only `spec.callRef`
+  exists elsewhere). On a Daytona relay failure/timeout, the error-message build throws
+  `ReferenceError` and masks the real error. A `tsc --noEmit` gate catches it. This is the
+  strongest argument for the typecheck gate, and it is a one-line fix.
+- **`typescript` is not a dependency (verified):** `node_modules/.bin/tsc` does not exist.
+  The `typecheck` script needs `typescript` added; `tsx` does not provide `tsc`.
+- **Phase 3 was naive (accepted):** TS interfaces are erased at runtime, so "round-trip the
+  golden JSON through `protocol.ts`" does nothing at runtime. Use runtime validation (a zod
+  schema or explicit structural assertions), plus a separate type-level check.
+- **Testability seam (accepted):** export `createServer(runAgent)` / `runCli(runAgent)` so
+  HTTP and CLI paths can be tested with a fake engine, no live Pi/Claude/rivet.
+- **CI detail (verified):** `test/code-tool.test.ts` spawns `python3`. The Node CI job needs
+  Python available, or that test gets split out.
+- **Bigger gaps (accepted, now Phase 5):** no protocol/version negotiation, no sidecar image
+  publishing in CI, no release ownership (`package.json` is `0.0.0`), local code-tool
+  execution has no stated sandbox/resource policy, and `services/oss/src/agent/app.py` reads
+  `AGENTA_AGENT_*` via raw `os.getenv` instead of the shared env object.
+- **Packaging smoking gun (verified):** `sdks/python/.../adapters/local.py` docstring says a
+  "bundled JS runner ... shipped inside the wheel," but it is marked NOT YET IMPLEMENTED.
+  Nothing TS is in the wheel today; the future `LocalBackend` plans to put a bundled JS
+  runner there. That aspirational note is the likely source of the wheel worry.
+
+Where Codex was wrong: it claimed 9 test files; there are 8 (`skills.test.ts` was already
+counted). Minor.
+
+## What was true in the repo before this work (baseline)
+
+- `services/agent` is a standalone pnpm package (own lockfile, Node 24, ESM, `tsx` runtime,
+  `strict` tsconfig with `noEmit`).
+- 8 unit tests exist under `services/agent/test/`, written as hand-run `tsx` + `node:assert`
+  scripts. No `pnpm test`, no runner, no aggregation.
+- Those tests run in NO CI workflow. `12-check-unit-tests.yml`'s services job is Python-only
+  (`services/oss/tests/pytest/unit`).
+- No typecheck gate runs anywhere, despite `strict`.
+- The wire contract is pinned from Python only (`test_wire_contract.py` + golden fixtures);
+  the TS `protocol.ts` has no test asserting it.
+- The repo already standardizes vitest for TS units (`web/packages/*`), with a written
+  folder spec (`docs/designs/testing/testing.structure.specs.md`).
+
+## Open decisions (all resolved as recommended during Phases 1-4)
+
+1. **Runner: vitest vs node:test.** Recommended: vitest (matches `web/packages`, junit +
+   coverage + watch out of the box). Blocks Phase 1 config only; structure is the same
+   either way.
+2. **Folder layout: move `test/` to `tests/unit/`?** Recommended: yes, to match web packages
+   and the structure spec. Low-risk mechanical move.
+3. **Does `typecheck` failure fail CI?** Recommended: yes.
+4. **Add eslint to `services/agent`?** Recommended: defer (optional Phase 4); prettier +
+   `tsc --strict` is the floor.
+
+## Progress
+
+- [x] Inventory the new TS and how it builds/ships
+- [x] Confirm the test/CI/typecheck gaps (verified: no CI runs the runner tests)
+- [x] Capture the repo's existing TS conventions (vitest, structure spec, CI shape)
+- [x] Write context / research / plan
+- [x] Phase 1: vitest + scripts + convert tests (green: 42 tests, typecheck gate live)
+- [x] Phase 2: CI Node job + junit publish (added to 12-check-unit-tests.yml; YAML + frozen install verified)
+- [x] Phase 3: golden-fixture contract test on the TS side (runtime + compile-time guards; both proven)
+- [x] Phase 4: `AGENTS.md` + the `createAgentServer`/`runCli` seam + server/cli tests (eslint deferred)
+- [~] Phase 5: runner-side version/`/health` + version bump + local.py docstring DONE; client probe, release scheme, image publishing, app.py config hygiene DEFERRED (decisions)
+- [ ] Commit: lands with `feat/agent-runner-engines` (shared files block an independent commit)
+
+## Notes / caveats for the next reader
+
+- `services/agent` is intentionally NOT in `web/pnpm-workspace.yaml`. Keep it standalone so
+  the sidecar Docker build stays decoupled from the web dependency graph.
+- The golden fixtures live under `sdks/python/oss/tests/pytest/unit/agents/golden/`. The TS
+  contract test should read them in place, not copy them.
+- Frontend TS (`web/oss/src/components/AgentChatSlice/`) is out of scope; it already has a
+  home and conventions.
+- Some runner modules read env at import time; new tests should dynamic-import after setting
+  env (vitest isolates modules per file).
diff --git a/services/agent/AGENTS.md b/services/agent/AGENTS.md
new file mode 100644
index 0000000000..135532260a
--- /dev/null
+++ b/services/agent/AGENTS.md
@@ -0,0 +1,62 @@
+# Agent runner (TypeScript) conventions
+
+Scope: everything under `services/agent/`. This is the Node "agent runner" sidecar. It runs
+the agent loop and serves one contract: a JSON `/run` request in, a structured result out.
+The Python agent service (`services/oss/src/agent/`) decides *what* to run; this package
+*runs* it. It lives in Node because the harnesses (Pi, Claude Code, rivet's `sandbox-agent`)
+are Node libraries with no Python SDK. The repo-wide rules live in `/AGENTS.md`; the
+architecture overview is this folder's `README.md`.
+
+## This is a standalone pnpm package
+
+Not part of the `web/` pnpm workspace. It has its OWN `pnpm-lock.yaml`, builds its own Docker
+image, and pins Node 24 / pnpm 10.30 / ESM (`"type": "module"`). It runs through `tsx` (no
+app compile step); the only build is `pnpm run build:extension` (esbuild-bundles the Pi
+extension into `dist/`). Keep it standalone so the sidecar image stays decoupled from the web
+dependency graph.
+
+## Commands
+
+```bash
+pnpm install              # from services/agent, with Node 24 on PATH
+pnpm run serve            # HTTP sidecar on :8765 (GET /health, POST /run)
+pnpm run run:cli          # one JSON request on stdin -> one result on stdout
+pnpm test                 # vitest: all unit tests
+pnpm run test:watch       # vitest watch
+pnpm run test:coverage    # vitest + v8 coverage
+pnpm run typecheck        # tsc --noEmit (src + tests + vitest.config)
+```
+
+## Where code and tests go
+
+- Runtime code: `src/` — `engines/` (one engine per file: `pi`, `rivet`), `tools/`,
+  `tracing/`, `extensions/`. Entrypoints: `cli.ts`, `server.ts`. The `/run` wire contract is
+  `protocol.ts`.
+- Tests: `tests/unit/**/*.test.ts` (vitest, `node:assert` is fine inside `it`). Shared test
+  helpers and fixtures live in `tests/utils/`. This mirrors `web/packages/*` and the repo
+  testing.structure spec. Do not add tests back under a flat `test/` directory.
+- Build/test artifacts (`test-results/`, `coverage/`, `dist/`) are git-ignored from the ROOT
+  `.gitignore` — a nested `services/agent/.gitignore` does NOT take effect (the repo-wide
+  `.*` rule ignores all nested `.gitignore` files).
+
+## The wire contract is mirrored — change both sides
+
+`src/protocol.ts` is the source of the `/run` types. The Python side hand-mirrors them in
+`sdks/python/agenta/sdk/agents/utils/wire.py`, and the contract is pinned by shared golden
+fixtures in `sdks/python/oss/tests/pytest/unit/agents/golden/`. Both sides assert those
+fixtures: Python in `test_wire_contract.py`, TypeScript in `tests/unit/wire-contract.test.ts`.
+If you add, rename, or remove a wire field, update the golden, then `protocol.ts` AND
+`wire.py` AND both contract tests, deliberately. The TS test has a compile-time key guard, so
+a drifted `protocol.ts` fails `tsc`.
+
+## Testing seams
+
+`server.ts` and `cli.ts` export `createAgentServer(run)` / `runCli(raw, stream, io)` so the
+HTTP and CLI behavior can be tested with a fake engine (no live Pi/Claude/rivet). Prefer testing
+through those seams over importing the real engines. Engine-internal logic that is pure
+(`tracing/otel.ts` state machine, `tools/*`, `engines/skills.ts`) is unit-tested directly.
+
+## Before committing
+
+There is no eslint here yet (deferred); `tsc --strict` + the repo-wide prettier hook are the
+floor. Run `pnpm test` and `pnpm run typecheck` before pushing.
diff --git a/services/agent/CLAUDE.md b/services/agent/CLAUDE.md
new file mode 120000
index 0000000000..47dc3e3d86
--- /dev/null
+++ b/services/agent/CLAUDE.md
@@ -0,0 +1 @@
+AGENTS.md
\ No newline at end of file
diff --git a/services/agent/package.json b/services/agent/package.json
index 231b6ff5f6..e3311615f9 100644
--- a/services/agent/package.json
+++ b/services/agent/package.json
@@ -1,6 +1,6 @@
 {
   "name": "agenta-agent-pi-wrapper",
-  "version": "0.0.0",
+  "version": "0.1.0",
   "private": true,
   "type": "module",
   "packageManager": "pnpm@10.30.0",
@@ -10,7 +10,12 @@
     "serve": "tsx src/server.ts",
     "serve:watch": "tsx watch src/server.ts",
     "build:extension": "node scripts/build-extension.mjs",
-    "login": "pi"
+    "login": "pi",
+    "test": "pnpm run test:unit",
+    "test:unit": "vitest run",
+    "test:watch": "vitest",
+    "test:coverage": "vitest run --coverage",
+    "typecheck": "tsc --noEmit"
   },
   "dependencies": {
     "@daytonaio/sdk": "^0.187.0",
@@ -26,9 +31,12 @@
     "sandbox-agent": "0.4.2"
   },
   "devDependencies": {
-    "@types/node": "22.10.2",
+    "@types/node": "^24.0.0",
+    "@vitest/coverage-v8": "^4.1.4",
     "esbuild": "0.23.1",
-    "tsx": "4.19.2"
+    "tsx": "4.19.2",
+    "typescript": "^5.9.3",
+    "vitest": "^4.1.4"
   },
   "pnpm": {
     "onlyBuiltDependencies": [
diff --git a/services/agent/pnpm-lock.yaml b/services/agent/pnpm-lock.yaml
index 7bd7134915..62bde1acb0 100644
--- a/services/agent/pnpm-lock.yaml
+++ b/services/agent/pnpm-lock.yaml
@@ -43,14 +43,23 @@ importers:
         version: 0.4.2(@daytonaio/sdk@0.187.0(ws@8.21.0))(zod@4.4.3)
     devDependencies:
       '@types/node':
-        specifier: 22.10.2
-        version: 22.10.2
+        specifier: ^24.0.0
+        version: 24.13.2
+      '@vitest/coverage-v8':
+        specifier: ^4.1.4
+        version: 4.1.9(vitest@4.1.9)
       esbuild:
         specifier: 0.23.1
         version: 0.23.1
       tsx:
         specifier: 4.19.2
         version: 4.19.2
+      typescript:
+        specifier: ^5.9.3
+        version: 5.9.3
+      vitest:
+        specifier: ^4.1.4
+        version: 4.1.9(@opentelemetry/api@1.9.0)(@types/node@24.13.2)(@vitest/coverage-v8@4.1.9)(vite@8.0.16(@types/node@24.13.2)(esbuild@0.23.1)(jiti@2.7.0)(tsx@4.19.2)(yaml@2.9.0))
 
 packages:
 
@@ -269,10 +278,31 @@ packages:
     resolution: {integrity: sha512-iY8yvjE0y651BixKNPgmv1WrQc+GZ142sb0z4gYnChDDY2YqI4P/jsSopBWrKfAt7LOJAkOXt7rC/hms+WclQQ==}
     engines: {node: '>=18.0.0'}
 
+  '@babel/helper-string-parser@7.29.7':
+    resolution: {integrity: sha512-Pb5ijPrZ89GDH8223L4UP8i6QApWxs04RbPQJTeWDV0/keR2E36MeKnyr6LYmUUvqRRI+Iv87SuF1W6ErINzYw==}
+    engines: {node: '>=6.9.0'}
+
+  '@babel/helper-validator-identifier@7.29.7':
+    resolution: {integrity: sha512-qehxGkRj55h/ff8EMaJ+cYhyaKlHIxqYDn682wQD7RNp9UujOQsHog2uS0r2vzr4pW+sXf90NeeayjcNaX3fFg==}
+    engines: {node: '>=6.9.0'}
+
+  '@babel/parser@7.29.7':
+    resolution: {integrity: sha512-hnORnjP/1P/zFEndoeX+n+t1RwWRJiJpM/jO7FW32Kn9r5+sJB2JWOdYo4L6k78j15eCwY3Gm/7364B1EMwtNg==}
+    engines: {node: '>=6.0.0'}
+    hasBin: true
+
   '@babel/runtime@7.29.7':
     resolution: {integrity: sha512-Nq8OhGWiZIZGV6hLHoyAKLLcJihP/xFeBMGJoUrxTX2psI8dCifzLhZISFb+VWS3wFMRDmCGw5R+dOySCqPLhw==}
     engines: {node: '>=6.9.0'}
 
+  '@babel/types@7.29.7':
+    resolution: {integrity: sha512-4zBIxpPzowiZpusoFkyGVwakdRJUyuH5PxQ/PrqghfdFWWasvnCdPfQXHrenDai+gyLARulZjZowCOj6fjT4pA==}
+    engines: {node: '>=6.9.0'}
+
+  '@bcoe/v8-coverage@1.0.2':
+    resolution: {integrity: sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==}
+    engines: {node: '>=18'}
+
   '@daytona/api-client@0.187.0':
     resolution: {integrity: sha512-riKOJ6eSuy67DL6iJlAa3Bfjnm4iQmkOdJk0B5hqrYMZeZmVDsgdiZtYvFpyoa+2KCZFNb0Gs5dQwO1d6NhGCw==}
 
@@ -301,6 +331,15 @@ packages:
     resolution: {integrity: sha512-/ZhfFiHSBMH7AbDrBQIN+UWlJnl9tSEpLYICRGGMzmNfyCqX+30NYacIhyOEaD8R5rS6wJZysAOPU0yNwigbXw==}
     engines: {node: '>=22.19.0'}
 
+  '@emnapi/core@1.10.0':
+    resolution: {integrity: sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==}
+
+  '@emnapi/runtime@1.10.0':
+    resolution: {integrity: sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==}
+
+  '@emnapi/wasi-threads@1.2.1':
+    resolution: {integrity: sha512-uTII7OYF+/Mes/MrcIOYp5yOtSMLBWSIoLPpcgwipoiKbli6k322tcoFsxoIIxPDqW01SQGAgko4EzZi2BNv2w==}
+
   '@esbuild/aix-ppc64@0.23.1':
     resolution: {integrity: sha512-6VhYk1diRqrhBAqpJEdjASR/+WVRtfjpqKuNw11cLiaWpAT/Uu+nokB+UJnevzy/P9C/ty6AOe0dwueMrGh/iQ==}
     engines: {node: '>=18'}
@@ -569,6 +608,16 @@ packages:
     resolution: {integrity: sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==}
     engines: {node: '>=18.0.0'}
 
+  '@jridgewell/resolve-uri@3.1.2':
+    resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==}
+    engines: {node: '>=6.0.0'}
+
+  '@jridgewell/sourcemap-codec@1.5.5':
+    resolution: {integrity: sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==}
+
+  '@jridgewell/trace-mapping@0.3.31':
+    resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==}
+
   '@js-sdsl/ordered-map@4.4.2':
     resolution: {integrity: sha512-iUKgm52T8HOE/makSxjqoWhe95ZJA1/G1sYsGev2JDKUSS14KAgg1LHb+Ba+IPow0xflbnSkOsZcO08C7w1gYw==}
 
@@ -643,6 +692,12 @@ packages:
   '@mistralai/mistralai@2.2.1':
     resolution: {integrity: sha512-uKU8CZmL2RzYKmplsU01hii4p3pe4HqJefpWNRWXm1Tcm0Sm4xXfwSLIy4k7ZCPlbETCGcp69E7hZs+WOJ5itQ==}
 
+  '@napi-rs/wasm-runtime@1.1.5':
+    resolution: {integrity: sha512-AWPoBRJ9tsnVhor4sjO7rkni+7p+2IAEFj6cx06UgP10jkQHqay/36uRV/bFkgrh18D9vb4cr8Q0Pthskgzy+Q==}
+    peerDependencies:
+      '@emnapi/core': ^1.7.1
+      '@emnapi/runtime': ^1.7.1
+
   '@nodable/entities@2.2.0':
     resolution: {integrity: sha512-9uGyhaQavEUMC8AIddIjau4NsnsXhou+j5sBAGojCM1oxmQpVKTWR/9JxABD6UAv12vpIms55fPZKFQEhG6uBg==}
 
@@ -952,6 +1007,9 @@ packages:
     resolution: {integrity: sha512-/UhIkaZgPutTFmQ7RnIJGgDXZmtEJ7Dvi86xNTFWcnRxVRNk/aotsqDJYeEvDP+FSMB2SdW+pQzNMcWP0rwuNA==}
     engines: {node: '>=14'}
 
+  '@oxc-project/types@0.133.0':
+    resolution: {integrity: sha512-KzkdCd6Uxqnf6l3HOw1xfatAlUURA0g14cvBYFyJ5SaNOQbOUvBr9PKArcPcrNIeRsBdgcUzOGrhKveVpvOIGA==}
+
   '@protobufjs/aspromise@1.1.2':
     resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==}
 
@@ -982,6 +1040,104 @@ packages:
   '@protobufjs/utf8@1.1.1':
     resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==}
 
+  '@rolldown/binding-android-arm64@1.0.3':
+    resolution: {integrity: sha512-454rs7jHngixp/NMxd5srYD57OnzSlZ/eFTETjORQHLwJG1lRtmNOJcBerZlfu4GjKqeq8aCCIQrMdHyhI51Hw==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [android]
+
+  '@rolldown/binding-darwin-arm64@1.0.3':
+    resolution: {integrity: sha512-PcAhP+ynjURNyy8SKGl5DQP94aGuB/7JrXJb/t7P+hanXvQVMWzUvRRhBAcg/lNRadBhoUPqSoP4xw5tR/KBEA==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [darwin]
+
+  '@rolldown/binding-darwin-x64@1.0.3':
+    resolution: {integrity: sha512-9YpfeUvSE2RS7wysJ81uOZkXJz7f7Q55H2Gvp3VEw/EsahqDtrphrZ0EwDLK5vvKOzaCrBsjF8JmnMLcUt78Gg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [darwin]
+
+  '@rolldown/binding-freebsd-x64@1.0.3':
+    resolution: {integrity: sha512-yB1IlAsSNHncV6SCTL27/MVGR5htvQsoGxIv5KMGXALp+Ll1wYsn+x98M9MW7qa+NdSbvrrY7ANI4wLJ0n1e6g==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [freebsd]
+
+  '@rolldown/binding-linux-arm-gnueabihf@1.0.3':
+    resolution: {integrity: sha512-Yi30IVAAfLUCy2MseFjbB1jAMDl1VMCAas5StnYp8da9+CKvMd2H2cbEjWcw5NPaPqzvYkVIaF1nNUG+b7u/sw==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm]
+    os: [linux]
+
+  '@rolldown/binding-linux-arm64-gnu@1.0.3':
+    resolution: {integrity: sha512-jsO7R8To+AdlYgUmN5sHSCZbfhtMBkO0WUx8iORQnPcMMdgr7qM2DQmMwgabs3GhNztdmoKkMKQFHD6DTMCIQw==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [linux]
+    libc: [glibc]
+
+  '@rolldown/binding-linux-arm64-musl@1.0.3':
+    resolution: {integrity: sha512-VWkUHwWriDciit80wleYwKILoR/KMvxh/IdwS/paX+ZgpuRpCrKLUdadJbc0NpBEiyhpYawsJ73j9aCvOH+f7Q==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [linux]
+    libc: [musl]
+
+  '@rolldown/binding-linux-ppc64-gnu@1.0.3':
+    resolution: {integrity: sha512-5f1laC0SlIR0yDbFCd8acUhvJIag6N3zC5P7oUPN6wX0aOma+uKJ0wBDH5aq7I1PVI2ttTlhJwzwRIBnLiSGEg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [ppc64]
+    os: [linux]
+    libc: [glibc]
+
+  '@rolldown/binding-linux-s390x-gnu@1.0.3':
+    resolution: {integrity: sha512-Iq4ko0r4XsgbrF/LunNgHtAGLRRVE2kXonAXQ/MV0mC6jQpMOhW1SvtZja2EhC/kd05++bP78dsqBeIQyYJ6Yg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [s390x]
+    os: [linux]
+    libc: [glibc]
+
+  '@rolldown/binding-linux-x64-gnu@1.0.3':
+    resolution: {integrity: sha512-B8m6tD5+/N5FeNQFbKlLA/2yVq9ycQP1SeedyEYYKWBNR3ZQbkvIUcNnDNM03lO1l5F2roiiFJGgvoLLyZXtSg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [linux]
+    libc: [glibc]
+
+  '@rolldown/binding-linux-x64-musl@1.0.3':
+    resolution: {integrity: sha512-pSdpdUJHkuCxun9LE7jvgUB9qsRgaiyNNCX7m/AvHTcq67AiT/Yhoxvw5zPfhrM8k/BfP8ce/hMOpthKDpEUow==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [linux]
+    libc: [musl]
+
+  '@rolldown/binding-openharmony-arm64@1.0.3':
+    resolution: {integrity: sha512-OXXS3RKJgX2uLwM+gYyuH5omcH8fL1LJs96pZGgtetVCahON57+d4SJHzTgZiOjxgGkSnpXpOsWuPDGAKAigEg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [openharmony]
+
+  '@rolldown/binding-wasm32-wasi@1.0.3':
+    resolution: {integrity: sha512-JTtb8BWFynicNSoPrehsCzBtOKjZ6jhMiPFEmOiuXg1Fl8dn2KHQob+GuPSGR0dryQa1PQJbzjF3dqO/whhjLg==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [wasm32]
+
+  '@rolldown/binding-win32-arm64-msvc@1.0.3':
+    resolution: {integrity: sha512-gEdFFEN70A/jxb2svrWsN3aDL7OUtmvlOy+6fa2jxG8K0wQ1ZbdeLGnidov6Yu5/733dI5ySfzFlQ/cb0bSz1g==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [arm64]
+    os: [win32]
+
+  '@rolldown/binding-win32-x64-msvc@1.0.3':
+    resolution: {integrity: sha512-eXB7CHuaQdqmJcc3koCNtNPmT/bj2gc999kUFgBxG8Ac0NdgXc4rkCHhqrgrhN3zddvvvrgzj1e90SuSfmyIXA==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    cpu: [x64]
+    os: [win32]
+
+  '@rolldown/pluginutils@1.0.1':
+    resolution: {integrity: sha512-2j9bGt5Jh8hj+vPtgzPtl72j0yRxHAyumoo6TNfAjsLB04UtpSvPbPcDcBMxz7n+9CYB0c1GxQFxYRg2jimqGw==}
+
   '@sandbox-agent/cli-darwin-arm64@0.4.2':
     resolution: {integrity: sha512-+L1O8SI7k/LLhyB4dG0ghmz1cJHa0WtVjuRTrEE2gw/5EbGLWopPBsCVCmQ7snrQ4fPwtaiZDhfExcEj1VI7aw==}
     cpu: [arm64]
@@ -1057,12 +1213,65 @@ packages:
     resolution: {integrity: sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A==}
     engines: {node: '>=14.0.0'}
 
-  '@types/node@22.10.2':
-    resolution: {integrity: sha512-Xxr6BBRCAOQixvonOye19wnzyDiUtTeqldOOmj3CkeblonbccA12PFwlufvRdrpjXxqnmUaeiU5EOA+7s5diUQ==}
+  '@standard-schema/spec@1.1.0':
+    resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==}
+
+  '@tybys/wasm-util@0.10.2':
+    resolution: {integrity: sha512-RoBvJ2X0wuKlWFIjrwffGw1IqZHKQqzIchKaadZZfnNpsAYp2mM0h36JtPCjNDAHGgYez/15uMBpfGwchhiMgg==}
+
+  '@types/chai@5.2.3':
+    resolution: {integrity: sha512-Mw558oeA9fFbv65/y4mHtXDs9bPnFMZAL/jxdPFUpOHHIXX91mcgEHbS5Lahr+pwZFR8A7GQleRWeI6cGFC2UA==}
+
+  '@types/deep-eql@4.0.2':
+    resolution: {integrity: sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==}
+
+  '@types/estree@1.0.9':
+    resolution: {integrity: sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg==}
+
+  '@types/node@24.13.2':
+    resolution: {integrity: sha512-fRa09kZTgu8o71KFcDjUFuc7F+dEbZYZmkI0mg5YBTRs0yMKjYHsq/c0urDKeDb+D5qVgXOdFcuu+DZPKOITwA==}
 
   '@types/retry@0.12.0':
     resolution: {integrity: sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==}
 
+  '@vitest/coverage-v8@4.1.9':
+    resolution: {integrity: sha512-G9/lgqibheLVBDRuya45EbsEXTYcWoSG+TLg7i2axuzx0Eq62eXn+aWXyaVdV5vKvFSWd6ywcX8hA7la9Pvu8g==}
+    peerDependencies:
+      '@vitest/browser': 4.1.9
+      vitest: 4.1.9
+    peerDependenciesMeta:
+      '@vitest/browser':
+        optional: true
+
+  '@vitest/expect@4.1.9':
+    resolution: {integrity: sha512-vl/rYsUKcBr3SnQn166+XR5ZQcgMx3DQhFWdfli/cWpLnLUmbxZvyrJZotLFUryib+LtArYMSTJ5RbQ57ZqrlA==}
+
+  '@vitest/mocker@4.1.9':
+    resolution: {integrity: sha512-EVkXzBjrPGM+cK8/ANWgBrkUCfJfb38/EfTSO8h7pWvKkyPkpWxvR7BkD2MyItMF62C97zAEoqdpUixwR/e+Rw==}
+    peerDependencies:
+      msw: ^2.4.9
+      vite: ^6.0.0 || ^7.0.0 || ^8.0.0
+    peerDependenciesMeta:
+      msw:
+        optional: true
+      vite:
+        optional: true
+
+  '@vitest/pretty-format@4.1.9':
+    resolution: {integrity: sha512-s0iufns3iIFitdgm+YR7g1whCAaGtXz459VS9/PqyKDEEFgYIhsHOQmXgIgDuYCt7DeQmiZT0Qe2OA2p4ZPu5A==}
+
+  '@vitest/runner@4.1.9':
+    resolution: {integrity: sha512-KXLMDtc7oe70+3mJfGrPUWPesswH+3sTxAMAMl8DG7I8IUQT4XW718dY5ID3vPUcmlu27CcKfY4P3h3I29SLJg==}
+
+  '@vitest/snapshot@4.1.9':
+    resolution: {integrity: sha512-Jc7RKGNBo8Z28WYIm0Niej4xdSPByRf6mU58VpHQkd6Zh05rlnA+twjbK5HyeIGHxrzsc3mJgS43uM0CZKzaIA==}
+
+  '@vitest/spy@4.1.9':
+    resolution: {integrity: sha512-fHpsS6mIi+PiEW+vcRVOMkX1oSaPKne3VOclSFICPcGOmfKgXPU5iAah+wcNcj2xPrCCmfq99IDGf+EojhhvhA==}
+
+  '@vitest/utils@4.1.9':
+    resolution: {integrity: sha512-A51o8ymO5PpqlWNnBP9ZHPXDIpuMtTLlGSjN7la4US+LJzoUMyhwjA5QXlm39JexgwHKW4Xjs8Z2d3dLCXOeuA==}
+
   '@zed-industries/claude-agent-acp@0.23.1':
     resolution: {integrity: sha512-aQ1gAm1MBalwEgE/VB/m4z6sXw/fRccNOW268pNLXnWV704ZuLbbm0N+oEv8KTmd53dJ6YzMhMpD8p5ig6C+sA==}
     deprecated: This package has been renamed to @agentclientprotocol/claude-agent-acp. Please migrate to continue receiving updates.
@@ -1100,6 +1309,13 @@ packages:
   anynum@1.0.0:
     resolution: {integrity: sha512-xjR9/zBVnUOP6ztMIIgShjsxui80nQUQH+5xJnvrYLs+90bF25/KJqaAi8mk+B4RDtX1Nspi6fmp4YTEts8SfA==}
 
+  assertion-error@2.0.1:
+    resolution: {integrity: sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==}
+    engines: {node: '>=12'}
+
+  ast-v8-to-istanbul@1.0.4:
+    resolution: {integrity: sha512-0bC0/4bTSrnwdhU3IsZDwEdojvuPrSg59OYZfKsLRtJZ0u8VBx9DebfqqG8bRdCC0I7vjgxmPi41P0lpkhJHtA==}
+
   asynckit@0.4.0:
     resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
 
@@ -1141,6 +1357,10 @@ packages:
     resolution: {integrity: sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==}
     engines: {node: '>= 0.4'}
 
+  chai@6.2.2:
+    resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==}
+    engines: {node: '>=18'}
+
   chalk@5.6.2:
     resolution: {integrity: sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==}
     engines: {node: ^12.17.0 || ^14.13 || >=16.0.0}
@@ -1167,6 +1387,9 @@ packages:
     resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==}
     engines: {node: '>= 0.8'}
 
+  convert-source-map@2.0.0:
+    resolution: {integrity: sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==}
+
   cross-spawn@7.0.6:
     resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
     engines: {node: '>= 8'}
@@ -1188,6 +1411,10 @@ packages:
     resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==}
     engines: {node: '>=0.4.0'}
 
+  detect-libc@2.1.2:
+    resolution: {integrity: sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==}
+    engines: {node: '>=8'}
+
   diff@8.0.4:
     resolution: {integrity: sha512-DPi0FmjiSU5EvQV0++GFDOJ9ASQUVFh5kD+OzOnYdi7n3Wpm9hWWGfB/O2blfHcMVTL5WkQXSnRiK9makhrcnw==}
     engines: {node: '>=0.3.1'}
@@ -1214,6 +1441,9 @@ packages:
     resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==}
     engines: {node: '>= 0.4'}
 
+  es-module-lexer@2.1.0:
+    resolution: {integrity: sha512-n27zTYMjYu1aj4MjCWzSP7G9r75utsaoc8m61weK+W8JMBGGQybd43GstCXZ3WNmSFtGT9wi59qQTW6mhTR5LQ==}
+
   es-object-atoms@1.1.2:
     resolution: {integrity: sha512-HWcBoN6NileqtSydK2FqHbS/LoDd2pqrnQHLyJzBj4kOp/ky2MWMN694xOfkK8/SnUsW2DH7EfyVlydKCsm1Zw==}
     engines: {node: '>= 0.4'}
@@ -1231,6 +1461,9 @@ packages:
     resolution: {integrity: sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==}
     engines: {node: '>=6'}
 
+  estree-walker@3.0.3:
+    resolution: {integrity: sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==}
+
   events@3.3.0:
     resolution: {integrity: sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==}
     engines: {node: '>=0.8.x'}
@@ -1239,6 +1472,10 @@ packages:
     resolution: {integrity: sha512-A5EmesHW6rfnZ9ysHQjPdJRni0SRar0tjtG5MNtm9n5TUvsYU8oozprtRD4AqHxcZWWlVuAmQo2nWKfN9oyjTw==}
     engines: {node: '>=0.10.0'}
 
+  expect-type@1.3.0:
+    resolution: {integrity: sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==}
+    engines: {node: '>=12.0.0'}
+
   extend@3.0.2:
     resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==}
 
@@ -1256,6 +1493,15 @@ packages:
   fastq@1.20.1:
     resolution: {integrity: sha512-GGToxJ/w1x32s/D2EKND7kTil4n8OVk/9mycTc4VDza13lOvpUZTGX3mFSCtV9ksdGBVzvsyAVLM6mHFThxXxw==}
 
+  fdir@6.5.0:
+    resolution: {integrity: sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==}
+    engines: {node: '>=12.0.0'}
+    peerDependencies:
+      picomatch: ^3 || ^4
+    peerDependenciesMeta:
+      picomatch:
+        optional: true
+
   fetch-blob@3.2.0:
     resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==}
     engines: {node: ^12.20 || >= 14.13}
@@ -1342,6 +1588,10 @@ packages:
   graceful-fs@4.2.11:
     resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==}
 
+  has-flag@4.0.0:
+    resolution: {integrity: sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==}
+    engines: {node: '>=8'}
+
   has-symbols@1.1.0:
     resolution: {integrity: sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==}
     engines: {node: '>= 0.4'}
@@ -1365,6 +1615,9 @@ packages:
     resolution: {integrity: sha512-Hc+ghLoSt6QaYZUv0WBiIvmMDZuZZ7oaDvdH8MbfOO4lOsxdXLEvuC6ePoGs9H1X9oCLyq6+NVN0MKqD+ydxyg==}
     engines: {node: ^20.17.0 || >=22.9.0}
 
+  html-escaper@2.0.2:
+    resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==}
+
   http-proxy-agent@7.0.2:
     resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==}
     engines: {node: '>= 14'}
@@ -1415,10 +1668,25 @@ packages:
     peerDependencies:
       ws: '*'
 
+  istanbul-lib-coverage@3.2.2:
+    resolution: {integrity: sha512-O8dpsF+r0WV/8MNRKfnmrtCWhuKjxrq2w+jpzBL5UZKTi2LeVWnWOmWRxFlesJONmc+wLAGvKQZEOanko0LFTg==}
+    engines: {node: '>=8'}
+
+  istanbul-lib-report@3.0.1:
+    resolution: {integrity: sha512-GCfE1mtsHGOELCU8e/Z7YWzpmybrx/+dSTfLrvY8qRmaY6zXTKWn6WQIjaAFw069icm6GVMNkgu0NzI4iPZUNw==}
+    engines: {node: '>=10'}
+
+  istanbul-reports@3.2.0:
+    resolution: {integrity: sha512-HGYWWS/ehqTV3xN10i23tkPkpH46MLCIMFNCaaKNavAXTF1RkqxawEPtnjnGZ6XKSInBKkiOA5BKS+aZiY3AvA==}
+    engines: {node: '>=8'}
+
   jiti@2.7.0:
     resolution: {integrity: sha512-AC/7JofJvZGrrneWNaEnJeOLUx+JlGt7tNa0wZiRPT4MY1wmfKjt2+6O2p2uz2+skll8OZZmJMNqeke7kKbNgQ==}
     hasBin: true
 
+  js-tokens@10.0.0:
+    resolution: {integrity: sha512-lM/UBzQmfJRo9ABXbPWemivdCW8V2G8FHaHdypQaIy523snUjog0W71ayWXTjiR+ixeMyVHN2XcpnTd/liPg/Q==}
+
   json-bigint@1.0.0:
     resolution: {integrity: sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==}
 
@@ -1432,6 +1700,80 @@ packages:
   jws@4.0.1:
     resolution: {integrity: sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==}
 
+  lightningcss-android-arm64@1.32.0:
+    resolution: {integrity: sha512-YK7/ClTt4kAK0vo6w3X+Pnm0D2cf2vPHbhOXdoNti1Ga0al1P4TBZhwjATvjNwLEBCnKvjJc2jQgHXH0NEwlAg==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [arm64]
+    os: [android]
+
+  lightningcss-darwin-arm64@1.32.0:
+    resolution: {integrity: sha512-RzeG9Ju5bag2Bv1/lwlVJvBE3q6TtXskdZLLCyfg5pt+HLz9BqlICO7LZM7VHNTTn/5PRhHFBSjk5lc4cmscPQ==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [arm64]
+    os: [darwin]
+
+  lightningcss-darwin-x64@1.32.0:
+    resolution: {integrity: sha512-U+QsBp2m/s2wqpUYT/6wnlagdZbtZdndSmut/NJqlCcMLTWp5muCrID+K5UJ6jqD2BFshejCYXniPDbNh73V8w==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [x64]
+    os: [darwin]
+
+  lightningcss-freebsd-x64@1.32.0:
+    resolution: {integrity: sha512-JCTigedEksZk3tHTTthnMdVfGf61Fky8Ji2E4YjUTEQX14xiy/lTzXnu1vwiZe3bYe0q+SpsSH/CTeDXK6WHig==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [x64]
+    os: [freebsd]
+
+  lightningcss-linux-arm-gnueabihf@1.32.0:
+    resolution: {integrity: sha512-x6rnnpRa2GL0zQOkt6rts3YDPzduLpWvwAF6EMhXFVZXD4tPrBkEFqzGowzCsIWsPjqSK+tyNEODUBXeeVHSkw==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [arm]
+    os: [linux]
+
+  lightningcss-linux-arm64-gnu@1.32.0:
+    resolution: {integrity: sha512-0nnMyoyOLRJXfbMOilaSRcLH3Jw5z9HDNGfT/gwCPgaDjnx0i8w7vBzFLFR1f6CMLKF8gVbebmkUN3fa/kQJpQ==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [arm64]
+    os: [linux]
+    libc: [glibc]
+
+  lightningcss-linux-arm64-musl@1.32.0:
+    resolution: {integrity: sha512-UpQkoenr4UJEzgVIYpI80lDFvRmPVg6oqboNHfoH4CQIfNA+HOrZ7Mo7KZP02dC6LjghPQJeBsvXhJod/wnIBg==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [arm64]
+    os: [linux]
+    libc: [musl]
+
+  lightningcss-linux-x64-gnu@1.32.0:
+    resolution: {integrity: sha512-V7Qr52IhZmdKPVr+Vtw8o+WLsQJYCTd8loIfpDaMRWGUZfBOYEJeyJIkqGIDMZPwPx24pUMfwSxxI8phr/MbOA==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [x64]
+    os: [linux]
+    libc: [glibc]
+
+  lightningcss-linux-x64-musl@1.32.0:
+    resolution: {integrity: sha512-bYcLp+Vb0awsiXg/80uCRezCYHNg1/l3mt0gzHnWV9XP1W5sKa5/TCdGWaR/zBM2PeF/HbsQv/j2URNOiVuxWg==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [x64]
+    os: [linux]
+    libc: [musl]
+
+  lightningcss-win32-arm64-msvc@1.32.0:
+    resolution: {integrity: sha512-8SbC8BR40pS6baCM8sbtYDSwEVQd4JlFTOlaD3gWGHfThTcABnNDBda6eTZeqbofalIJhFx0qKzgHJmcPTnGdw==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [arm64]
+    os: [win32]
+
+  lightningcss-win32-x64-msvc@1.32.0:
+    resolution: {integrity: sha512-Amq9B/SoZYdDi1kFrojnoqPLxYhQ4Wo5XiL8EVJrVsB8ARoC1PWW6VGtT0WKCemjy8aC+louJnjS7U18x3b06Q==}
+    engines: {node: '>= 12.0.0'}
+    cpu: [x64]
+    os: [win32]
+
+  lightningcss@1.32.0:
+    resolution: {integrity: sha512-NXYBzinNrblfraPGyrbPoD19C1h9lfI/1mzgWYvXUTe414Gz/X1FD2XBZSZM7rRTrMA8JL3OtAaGifrIKhQ5yQ==}
+    engines: {node: '>= 12.0.0'}
+
   lodash.camelcase@4.3.0:
     resolution: {integrity: sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==}
 
@@ -1442,6 +1784,16 @@ packages:
     resolution: {integrity: sha512-RPimw/7aMdv2oqRrxKwvZXcPfwBrn/JZ2xYcY9Hus/6LaS3VOAKVWKWgNLCFSiOm1ESXinjsDlidVU7JlnCN2A==}
     engines: {node: 20 || >=22}
 
+  magic-string@0.30.21:
+    resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==}
+
+  magicast@0.5.3:
+    resolution: {integrity: sha512-pVKE4UdSQ7DvHzivsCIFx2BJn1mHG6KsyrFcaxFx6tONdneEuThrDx0Cj3AMg58KyN4pzYT+LHOotxDQDjNvkw==}
+
+  make-dir@4.0.0:
+    resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==}
+    engines: {node: '>=10'}
+
   marked@15.0.12:
     resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
     engines: {node: '>= 18'}
@@ -1485,6 +1837,11 @@ packages:
   ms@2.1.3:
     resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
 
+  nanoid@3.3.13:
+    resolution: {integrity: sha512-sPdqC6ByMVVGvF1ynvvMo0/o+oD1VX7DaHhijt1bFgjvBkHBib4t49GoNDhf2NDta4oeUNlaGbSt5K7qjZ955Q==}
+    engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1}
+    hasBin: true
+
   node-domexception@1.0.0:
     resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==}
     engines: {node: '>=10.5.0'}
@@ -1494,6 +1851,10 @@ packages:
     resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==}
     engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
 
+  obug@2.1.3:
+    resolution: {integrity: sha512-9miFgM2OFba7hB+pRgvtV84pYTBaoTHohvmIgiRt6dRIzbwEOIaNaP+dIlGs2fNFoB0SeISs0Jz5WFVRid6Xyg==}
+    engines: {node: '>=12.20.0'}
+
   openai@6.26.0:
     resolution: {integrity: sha512-zd23dbWTjiJ6sSAX6s0HrCZi41JwTA1bQVs0wLQPZ2/5o2gxOJA5wh7yOAUgwYybfhDXyhwlpeQf7Mlgx8EOCA==}
     hasBin: true
@@ -1537,10 +1898,21 @@ packages:
     engines: {node: '>=20'}
     hasBin: true
 
+  picocolors@1.1.1:
+    resolution: {integrity: sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==}
+
   picomatch@2.3.2:
     resolution: {integrity: sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==}
     engines: {node: '>=8.6'}
 
+  picomatch@4.0.4:
+    resolution: {integrity: sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==}
+    engines: {node: '>=12'}
+
+  postcss@8.5.15:
+    resolution: {integrity: sha512-FfR8sjd4em2T6fb3I2MwAJU7HWVMr9zba+enmQeeWFfCbm+UOC/0X4DS8XtpUTMwWMGbjKYP7xjfNekzyGmB3A==}
+    engines: {node: ^10 || ^12 || >=14}
+
   proper-lockfile@4.1.2:
     resolution: {integrity: sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==}
 
@@ -1586,6 +1958,11 @@ packages:
     resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==}
     engines: {iojs: '>=1.0.0', node: '>=0.10.0'}
 
+  rolldown@1.0.3:
+    resolution: {integrity: sha512-i00lAJ2ks1BYr7rjNjKC7BcqAS7nVfiT3QX1SI5aY+AFHblCmaUf9OE9dbdzDvW6dJxbi2ZCZiy9v3CcwOiX3g==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    hasBin: true
+
   run-parallel@1.2.0:
     resolution: {integrity: sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==}
 
@@ -1641,9 +2018,22 @@ packages:
     resolution: {integrity: sha512-VsC6n6vz1ihYYyZZwX7YZSF5l5x36ca17OC+a69h94YqB7X6XLwf+5MOgynYir2SLFUbl8gIYvBo8K8RoNQ6bQ==}
     engines: {node: '>= 0.4'}
 
+  siginfo@2.0.0:
+    resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==}
+
   signal-exit@3.0.7:
     resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==}
 
+  source-map-js@1.2.1:
+    resolution: {integrity: sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==}
+    engines: {node: '>=0.10.0'}
+
+  stackback@0.0.2:
+    resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==}
+
+  std-env@4.1.0:
+    resolution: {integrity: sha512-Rq7ybcX2RuC55r9oaPVEW7/xu3tj8u4GeBYHBWCychFtzMIr86A7e3PPEBPT37sHStKX3+TiX/Fr/ACmJLVlLQ==}
+
   stream-browserify@3.0.0:
     resolution: {integrity: sha512-H73RAHsVBapbim0tU2JwwOiXUj+fikfiaoYAKHF3VJfA0pe2BCzkhAHBlLG6REzE+2WNZcxOXjK7lkso+9euLA==}
 
@@ -1665,10 +2055,29 @@ packages:
   strnum@2.4.0:
     resolution: {integrity: sha512-sHrVyWWdq28RbhjuJdZsA1SnGRJV6NiXbk6AXBxDOsgAcA+lmpUZCYjOdLBxkXMwis6RRe7dlZt4VlIWFVzkmg==}
 
+  supports-color@7.2.0:
+    resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==}
+    engines: {node: '>=8'}
+
   tar@7.5.16:
     resolution: {integrity: sha512-56adEpPMouktRlBLXiaYFFzZ/3+JXa8P9n7WbR+ibIjtviN55mEaOkiysCnPnWm+7kkui1Dn8J9l+g6zV8731w==}
     engines: {node: '>=18'}
 
+  tinybench@2.9.0:
+    resolution: {integrity: sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==}
+
+  tinyexec@1.2.4:
+    resolution: {integrity: sha512-SHf/r48b7vOrjve9PxJo3MN5v5yuyjHvdUcrQffT3WXMUfnGmHDVbC4k3sHJaJTgZCwpUplIaAo5ANtMyp3YHg==}
+    engines: {node: '>=18'}
+
+  tinyglobby@0.2.17:
+    resolution: {integrity: sha512-wXR/dYpcqKmfWpEdZjiKJOwCNFndD0DMnrW/cYjVGttEkBfVgcLFHoNrlj47mjOVic9yyNu65alsgF4NQyTa2g==}
+    engines: {node: '>=12.0.0'}
+
+  tinyrainbow@3.1.0:
+    resolution: {integrity: sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==}
+    engines: {node: '>=14.0.0'}
+
   to-regex-range@5.0.1:
     resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==}
     engines: {node: '>=8.0'}
@@ -1687,8 +2096,13 @@ packages:
   typebox@1.1.38:
     resolution: {integrity: sha512-pZ0aQPmMmXoUvSbeuWf/Hzsc+avNw/Zd6VeE8CFgkVGWyuHPJvqeJJDeJqLve+K70LvjYIoleGcoJHPT17cWoA==}
 
-  undici-types@6.20.0:
-    resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==}
+  typescript@5.9.3:
+    resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==}
+    engines: {node: '>=14.17'}
+    hasBin: true
+
+  undici-types@7.18.2:
+    resolution: {integrity: sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==}
 
   undici@8.3.0:
     resolution: {integrity: sha512-TkUDgb6tl7KOGZ+7e8E3d2FYgUQgF6z5YypqjWmixVQSQERFcVrVg0ySADm2LVLRh5ljAaHTCR5Fmz3Q34rB7Q==}
@@ -1697,6 +2111,90 @@ packages:
   util-deprecate@1.0.2:
     resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
 
+  vite@8.0.16:
+    resolution: {integrity: sha512-h9bXPmJichP5fLmVQo3PyaGSDE2n3aPuomeAlVRm0JLmt4rY6zmPKd59HYI4LNW8oTK7tlTsuC7l/m7awx9Jcw==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+    hasBin: true
+    peerDependencies:
+      '@types/node': ^20.19.0 || >=22.12.0
+      '@vitejs/devtools': ^0.1.18
+      esbuild: ^0.27.0 || ^0.28.0
+      jiti: '>=1.21.0'
+      less: ^4.0.0
+      sass: ^1.70.0
+      sass-embedded: ^1.70.0
+      stylus: '>=0.54.8'
+      sugarss: ^5.0.0
+      terser: ^5.16.0
+      tsx: ^4.8.1
+      yaml: ^2.4.2
+    peerDependenciesMeta:
+      '@types/node':
+        optional: true
+      '@vitejs/devtools':
+        optional: true
+      esbuild:
+        optional: true
+      jiti:
+        optional: true
+      less:
+        optional: true
+      sass:
+        optional: true
+      sass-embedded:
+        optional: true
+      stylus:
+        optional: true
+      sugarss:
+        optional: true
+      terser:
+        optional: true
+      tsx:
+        optional: true
+      yaml:
+        optional: true
+
+  vitest@4.1.9:
+    resolution: {integrity: sha512-nE3/LEyc0z87uHYLZebqCUOaJr2hdtuPp7BQ4BosVFnfltxgAvMG08NyrSGlPpOUWvR27c5flSmYFTNr78L9GQ==}
+    engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0}
+    hasBin: true
+    peerDependencies:
+      '@edge-runtime/vm': '*'
+      '@opentelemetry/api': ^1.9.0
+      '@types/node': ^20.0.0 || ^22.0.0 || >=24.0.0
+      '@vitest/browser-playwright': 4.1.9
+      '@vitest/browser-preview': 4.1.9
+      '@vitest/browser-webdriverio': 4.1.9
+      '@vitest/coverage-istanbul': 4.1.9
+      '@vitest/coverage-v8': 4.1.9
+      '@vitest/ui': 4.1.9
+      happy-dom: '*'
+      jsdom: '*'
+      vite: ^6.0.0 || ^7.0.0 || ^8.0.0
+    peerDependenciesMeta:
+      '@edge-runtime/vm':
+        optional: true
+      '@opentelemetry/api':
+        optional: true
+      '@types/node':
+        optional: true
+      '@vitest/browser-playwright':
+        optional: true
+      '@vitest/browser-preview':
+        optional: true
+      '@vitest/browser-webdriverio':
+        optional: true
+      '@vitest/coverage-istanbul':
+        optional: true
+      '@vitest/coverage-v8':
+        optional: true
+      '@vitest/ui':
+        optional: true
+      happy-dom:
+        optional: true
+      jsdom:
+        optional: true
+
   web-streams-polyfill@3.3.3:
     resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==}
     engines: {node: '>= 8'}
@@ -1706,6 +2204,11 @@ packages:
     engines: {node: '>= 8'}
     hasBin: true
 
+  why-is-node-running@2.3.0:
+    resolution: {integrity: sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w==}
+    engines: {node: '>=8'}
+    hasBin: true
+
   wrap-ansi@7.0.0:
     resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==}
     engines: {node: '>=10'}
@@ -2217,8 +2720,23 @@ snapshots:
 
   '@aws/lambda-invoke-store@0.2.4': {}
 
+  '@babel/helper-string-parser@7.29.7': {}
+
+  '@babel/helper-validator-identifier@7.29.7': {}
+
+  '@babel/parser@7.29.7':
+    dependencies:
+      '@babel/types': 7.29.7
+
   '@babel/runtime@7.29.7': {}
 
+  '@babel/types@7.29.7':
+    dependencies:
+      '@babel/helper-string-parser': 7.29.7
+      '@babel/helper-validator-identifier': 7.29.7
+
+  '@bcoe/v8-coverage@1.0.2': {}
+
   '@daytona/api-client@0.187.0':
     dependencies:
       axios: 1.18.0
@@ -2332,6 +2850,22 @@ snapshots:
       get-east-asian-width: 1.6.0
       marked: 15.0.12
 
+  '@emnapi/core@1.10.0':
+    dependencies:
+      '@emnapi/wasi-threads': 1.2.1
+      tslib: 2.8.1
+    optional: true
+
+  '@emnapi/runtime@1.10.0':
+    dependencies:
+      tslib: 2.8.1
+    optional: true
+
+  '@emnapi/wasi-threads@1.2.1':
+    dependencies:
+      tslib: 2.8.1
+    optional: true
+
   '@esbuild/aix-ppc64@0.23.1':
     optional: true
 
@@ -2495,6 +3029,15 @@ snapshots:
     dependencies:
       minipass: 7.1.3
 
+  '@jridgewell/resolve-uri@3.1.2': {}
+
+  '@jridgewell/sourcemap-codec@1.5.5': {}
+
+  '@jridgewell/trace-mapping@0.3.31':
+    dependencies:
+      '@jridgewell/resolve-uri': 3.1.2
+      '@jridgewell/sourcemap-codec': 1.5.5
+
   '@js-sdsl/ordered-map@4.4.2': {}
 
   '@mariozechner/clipboard-darwin-arm64@0.3.9':
@@ -2550,6 +3093,13 @@ snapshots:
       - bufferutil
       - utf-8-validate
 
+  '@napi-rs/wasm-runtime@1.1.5(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)':
+    dependencies:
+      '@emnapi/core': 1.10.0
+      '@emnapi/runtime': 1.10.0
+      '@tybys/wasm-util': 0.10.2
+    optional: true
+
   '@nodable/entities@2.2.0': {}
 
   '@nodelib/fs.scandir@2.1.5':
@@ -2937,6 +3487,8 @@ snapshots:
 
   '@opentelemetry/semantic-conventions@1.41.1': {}
 
+  '@oxc-project/types@0.133.0': {}
+
   '@protobufjs/aspromise@1.1.2': {}
 
   '@protobufjs/base64@1.1.2': {}
@@ -2959,6 +3511,57 @@ snapshots:
 
   '@protobufjs/utf8@1.1.1': {}
 
+  '@rolldown/binding-android-arm64@1.0.3':
+    optional: true
+
+  '@rolldown/binding-darwin-arm64@1.0.3':
+    optional: true
+
+  '@rolldown/binding-darwin-x64@1.0.3':
+    optional: true
+
+  '@rolldown/binding-freebsd-x64@1.0.3':
+    optional: true
+
+  '@rolldown/binding-linux-arm-gnueabihf@1.0.3':
+    optional: true
+
+  '@rolldown/binding-linux-arm64-gnu@1.0.3':
+    optional: true
+
+  '@rolldown/binding-linux-arm64-musl@1.0.3':
+    optional: true
+
+  '@rolldown/binding-linux-ppc64-gnu@1.0.3':
+    optional: true
+
+  '@rolldown/binding-linux-s390x-gnu@1.0.3':
+    optional: true
+
+  '@rolldown/binding-linux-x64-gnu@1.0.3':
+    optional: true
+
+  '@rolldown/binding-linux-x64-musl@1.0.3':
+    optional: true
+
+  '@rolldown/binding-openharmony-arm64@1.0.3':
+    optional: true
+
+  '@rolldown/binding-wasm32-wasi@1.0.3':
+    dependencies:
+      '@emnapi/core': 1.10.0
+      '@emnapi/runtime': 1.10.0
+      '@napi-rs/wasm-runtime': 1.1.5(@emnapi/core@1.10.0)(@emnapi/runtime@1.10.0)
+    optional: true
+
+  '@rolldown/binding-win32-arm64-msvc@1.0.3':
+    optional: true
+
+  '@rolldown/binding-win32-x64-msvc@1.0.3':
+    optional: true
+
+  '@rolldown/pluginutils@1.0.1': {}
+
   '@sandbox-agent/cli-darwin-arm64@0.4.2':
     optional: true
 
@@ -3043,12 +3646,83 @@ snapshots:
       '@smithy/util-buffer-from': 2.2.0
       tslib: 2.8.1
 
-  '@types/node@22.10.2':
+  '@standard-schema/spec@1.1.0': {}
+
+  '@tybys/wasm-util@0.10.2':
+    dependencies:
+      tslib: 2.8.1
+    optional: true
+
+  '@types/chai@5.2.3':
+    dependencies:
+      '@types/deep-eql': 4.0.2
+      assertion-error: 2.0.1
+
+  '@types/deep-eql@4.0.2': {}
+
+  '@types/estree@1.0.9': {}
+
+  '@types/node@24.13.2':
     dependencies:
-      undici-types: 6.20.0
+      undici-types: 7.18.2
 
   '@types/retry@0.12.0': {}
 
+  '@vitest/coverage-v8@4.1.9(vitest@4.1.9)':
+    dependencies:
+      '@bcoe/v8-coverage': 1.0.2
+      '@vitest/utils': 4.1.9
+      ast-v8-to-istanbul: 1.0.4
+      istanbul-lib-coverage: 3.2.2
+      istanbul-lib-report: 3.0.1
+      istanbul-reports: 3.2.0
+      magicast: 0.5.3
+      obug: 2.1.3
+      std-env: 4.1.0
+      tinyrainbow: 3.1.0
+      vitest: 4.1.9(@opentelemetry/api@1.9.0)(@types/node@24.13.2)(@vitest/coverage-v8@4.1.9)(vite@8.0.16(@types/node@24.13.2)(esbuild@0.23.1)(jiti@2.7.0)(tsx@4.19.2)(yaml@2.9.0))
+
+  '@vitest/expect@4.1.9':
+    dependencies:
+      '@standard-schema/spec': 1.1.0
+      '@types/chai': 5.2.3
+      '@vitest/spy': 4.1.9
+      '@vitest/utils': 4.1.9
+      chai: 6.2.2
+      tinyrainbow: 3.1.0
+
+  '@vitest/mocker@4.1.9(vite@8.0.16(@types/node@24.13.2)(esbuild@0.23.1)(jiti@2.7.0)(tsx@4.19.2)(yaml@2.9.0))':
+    dependencies:
+      '@vitest/spy': 4.1.9
+      estree-walker: 3.0.3
+      magic-string: 0.30.21
+    optionalDependencies:
+      vite: 8.0.16(@types/node@24.13.2)(esbuild@0.23.1)(jiti@2.7.0)(tsx@4.19.2)(yaml@2.9.0)
+
+  '@vitest/pretty-format@4.1.9':
+    dependencies:
+      tinyrainbow: 3.1.0
+
+  '@vitest/runner@4.1.9':
+    dependencies:
+      '@vitest/utils': 4.1.9
+      pathe: 2.0.3
+
+  '@vitest/snapshot@4.1.9':
+    dependencies:
+      '@vitest/pretty-format': 4.1.9
+      '@vitest/utils': 4.1.9
+      magic-string: 0.30.21
+      pathe: 2.0.3
+
+  '@vitest/spy@4.1.9': {}
+
+  '@vitest/utils@4.1.9':
+    dependencies:
+      '@vitest/pretty-format': 4.1.9
+      convert-source-map: 2.0.0
+      tinyrainbow: 3.1.0
+
   '@zed-industries/claude-agent-acp@0.23.1':
     dependencies:
       '@agentclientprotocol/sdk': 0.17.0(zod@4.4.3)
@@ -3083,6 +3757,14 @@ snapshots:
 
   anynum@1.0.0: {}
 
+  assertion-error@2.0.1: {}
+
+  ast-v8-to-istanbul@1.0.4:
+    dependencies:
+      '@jridgewell/trace-mapping': 0.3.31
+      estree-walker: 3.0.3
+      js-tokens: 10.0.0
+
   asynckit@0.4.0: {}
 
   axios@1.18.0:
@@ -3127,6 +3809,8 @@ snapshots:
       es-errors: 1.3.0
       function-bind: 1.1.2
 
+  chai@6.2.2: {}
+
   chalk@5.6.2: {}
 
   chownr@3.0.0: {}
@@ -3149,6 +3833,8 @@ snapshots:
     dependencies:
       delayed-stream: 1.0.0
 
+  convert-source-map@2.0.0: {}
+
   cross-spawn@7.0.6:
     dependencies:
       path-key: 3.1.1
@@ -3163,6 +3849,8 @@ snapshots:
 
   delayed-stream@1.0.0: {}
 
+  detect-libc@2.1.2: {}
+
   diff@8.0.4: {}
 
   dotenv@17.4.2: {}
@@ -3183,6 +3871,8 @@ snapshots:
 
   es-errors@1.3.0: {}
 
+  es-module-lexer@2.1.0: {}
+
   es-object-atoms@1.1.2:
     dependencies:
       es-errors: 1.3.0
@@ -3223,12 +3913,18 @@ snapshots:
 
   escalade@3.2.0: {}
 
+  estree-walker@3.0.3:
+    dependencies:
+      '@types/estree': 1.0.9
+
   events@3.3.0: {}
 
   expand-tilde@2.0.2:
     dependencies:
       homedir-polyfill: 1.0.3
 
+  expect-type@1.3.0: {}
+
   extend@3.0.2: {}
 
   fast-glob@3.3.3:
@@ -3255,6 +3951,10 @@ snapshots:
     dependencies:
       reusify: 1.1.0
 
+  fdir@6.5.0(picomatch@4.0.4):
+    optionalDependencies:
+      picomatch: 4.0.4
+
   fetch-blob@3.2.0:
     dependencies:
       node-domexception: 1.0.0
@@ -3354,6 +4054,8 @@ snapshots:
 
   graceful-fs@4.2.11: {}
 
+  has-flag@4.0.0: {}
+
   has-symbols@1.1.0: {}
 
   has-tostringtag@1.0.2:
@@ -3374,6 +4076,8 @@ snapshots:
     dependencies:
       lru-cache: 11.5.1
 
+  html-escaper@2.0.2: {}
+
   http-proxy-agent@7.0.2:
     dependencies:
       agent-base: 7.1.4
@@ -3424,8 +4128,23 @@ snapshots:
     dependencies:
       ws: 8.21.0
 
+  istanbul-lib-coverage@3.2.2: {}
+
+  istanbul-lib-report@3.0.1:
+    dependencies:
+      istanbul-lib-coverage: 3.2.2
+      make-dir: 4.0.0
+      supports-color: 7.2.0
+
+  istanbul-reports@3.2.0:
+    dependencies:
+      html-escaper: 2.0.2
+      istanbul-lib-report: 3.0.1
+
   jiti@2.7.0: {}
 
+  js-tokens@10.0.0: {}
+
   json-bigint@1.0.0:
     dependencies:
       bignumber.js: 9.3.1
@@ -3446,12 +4165,75 @@ snapshots:
       jwa: 2.0.1
       safe-buffer: 5.2.1
 
+  lightningcss-android-arm64@1.32.0:
+    optional: true
+
+  lightningcss-darwin-arm64@1.32.0:
+    optional: true
+
+  lightningcss-darwin-x64@1.32.0:
+    optional: true
+
+  lightningcss-freebsd-x64@1.32.0:
+    optional: true
+
+  lightningcss-linux-arm-gnueabihf@1.32.0:
+    optional: true
+
+  lightningcss-linux-arm64-gnu@1.32.0:
+    optional: true
+
+  lightningcss-linux-arm64-musl@1.32.0:
+    optional: true
+
+  lightningcss-linux-x64-gnu@1.32.0:
+    optional: true
+
+  lightningcss-linux-x64-musl@1.32.0:
+    optional: true
+
+  lightningcss-win32-arm64-msvc@1.32.0:
+    optional: true
+
+  lightningcss-win32-x64-msvc@1.32.0:
+    optional: true
+
+  lightningcss@1.32.0:
+    dependencies:
+      detect-libc: 2.1.2
+    optionalDependencies:
+      lightningcss-android-arm64: 1.32.0
+      lightningcss-darwin-arm64: 1.32.0
+      lightningcss-darwin-x64: 1.32.0
+      lightningcss-freebsd-x64: 1.32.0
+      lightningcss-linux-arm-gnueabihf: 1.32.0
+      lightningcss-linux-arm64-gnu: 1.32.0
+      lightningcss-linux-arm64-musl: 1.32.0
+      lightningcss-linux-x64-gnu: 1.32.0
+      lightningcss-linux-x64-musl: 1.32.0
+      lightningcss-win32-arm64-msvc: 1.32.0
+      lightningcss-win32-x64-msvc: 1.32.0
+
   lodash.camelcase@4.3.0: {}
 
   long@5.3.2: {}
 
   lru-cache@11.5.1: {}
 
+  magic-string@0.30.21:
+    dependencies:
+      '@jridgewell/sourcemap-codec': 1.5.5
+
+  magicast@0.5.3:
+    dependencies:
+      '@babel/parser': 7.29.7
+      '@babel/types': 7.29.7
+      source-map-js: 1.2.1
+
+  make-dir@4.0.0:
+    dependencies:
+      semver: 7.8.0
+
   marked@15.0.12: {}
 
   math-intrinsics@1.1.0: {}
@@ -3483,6 +4265,8 @@ snapshots:
 
   ms@2.1.3: {}
 
+  nanoid@3.3.13: {}
+
   node-domexception@1.0.0: {}
 
   node-fetch@3.3.2:
@@ -3491,6 +4275,8 @@ snapshots:
       fetch-blob: 3.2.0
       formdata-polyfill: 4.0.10
 
+  obug@2.1.3: {}
+
   openai@6.26.0(ws@8.21.0)(zod@4.4.3):
     optionalDependencies:
       ws: 8.21.0
@@ -3521,8 +4307,18 @@ snapshots:
       '@agentclientprotocol/sdk': 0.26.0(zod@3.25.76)
       zod: 3.25.76
 
+  picocolors@1.1.1: {}
+
   picomatch@2.3.2: {}
 
+  picomatch@4.0.4: {}
+
+  postcss@8.5.15:
+    dependencies:
+      nanoid: 3.3.13
+      picocolors: 1.1.1
+      source-map-js: 1.2.1
+
   proper-lockfile@4.1.2:
     dependencies:
       graceful-fs: 4.2.11
@@ -3540,7 +4336,7 @@ snapshots:
       '@protobufjs/path': 1.1.2
       '@protobufjs/pool': 1.1.0
       '@protobufjs/utf8': 1.1.1
-      '@types/node': 22.10.2
+      '@types/node': 24.13.2
       long: 5.3.2
 
   protobufjs@8.0.1:
@@ -3555,7 +4351,7 @@ snapshots:
       '@protobufjs/path': 1.1.2
       '@protobufjs/pool': 1.1.0
       '@protobufjs/utf8': 1.1.1
-      '@types/node': 22.10.2
+      '@types/node': 24.13.2
       long: 5.3.2
 
   proxy-from-env@2.1.0: {}
@@ -3585,6 +4381,27 @@ snapshots:
 
   reusify@1.1.0: {}
 
+  rolldown@1.0.3:
+    dependencies:
+      '@oxc-project/types': 0.133.0
+      '@rolldown/pluginutils': 1.0.1
+    optionalDependencies:
+      '@rolldown/binding-android-arm64': 1.0.3
+      '@rolldown/binding-darwin-arm64': 1.0.3
+      '@rolldown/binding-darwin-x64': 1.0.3
+      '@rolldown/binding-freebsd-x64': 1.0.3
+      '@rolldown/binding-linux-arm-gnueabihf': 1.0.3
+      '@rolldown/binding-linux-arm64-gnu': 1.0.3
+      '@rolldown/binding-linux-arm64-musl': 1.0.3
+      '@rolldown/binding-linux-ppc64-gnu': 1.0.3
+      '@rolldown/binding-linux-s390x-gnu': 1.0.3
+      '@rolldown/binding-linux-x64-gnu': 1.0.3
+      '@rolldown/binding-linux-x64-musl': 1.0.3
+      '@rolldown/binding-openharmony-arm64': 1.0.3
+      '@rolldown/binding-wasm32-wasi': 1.0.3
+      '@rolldown/binding-win32-arm64-msvc': 1.0.3
+      '@rolldown/binding-win32-x64-msvc': 1.0.3
+
   run-parallel@1.2.0:
     dependencies:
       queue-microtask: 1.2.3
@@ -3611,8 +4428,16 @@ snapshots:
 
   shell-quote@1.8.4: {}
 
+  siginfo@2.0.0: {}
+
   signal-exit@3.0.7: {}
 
+  source-map-js@1.2.1: {}
+
+  stackback@0.0.2: {}
+
+  std-env@4.1.0: {}
+
   stream-browserify@3.0.0:
     dependencies:
       inherits: 2.0.4
@@ -3638,6 +4463,10 @@ snapshots:
     dependencies:
       anynum: 1.0.0
 
+  supports-color@7.2.0:
+    dependencies:
+      has-flag: 4.0.0
+
   tar@7.5.16:
     dependencies:
       '@isaacs/fs-minipass': 4.0.1
@@ -3646,6 +4475,17 @@ snapshots:
       minizlib: 3.1.0
       yallist: 5.0.0
 
+  tinybench@2.9.0: {}
+
+  tinyexec@1.2.4: {}
+
+  tinyglobby@0.2.17:
+    dependencies:
+      fdir: 6.5.0(picomatch@4.0.4)
+      picomatch: 4.0.4
+
+  tinyrainbow@3.1.0: {}
+
   to-regex-range@5.0.1:
     dependencies:
       is-number: 7.0.0
@@ -3663,18 +4503,69 @@ snapshots:
 
   typebox@1.1.38: {}
 
-  undici-types@6.20.0: {}
+  typescript@5.9.3: {}
+
+  undici-types@7.18.2: {}
 
   undici@8.3.0: {}
 
   util-deprecate@1.0.2: {}
 
+  vite@8.0.16(@types/node@24.13.2)(esbuild@0.23.1)(jiti@2.7.0)(tsx@4.19.2)(yaml@2.9.0):
+    dependencies:
+      lightningcss: 1.32.0
+      picomatch: 4.0.4
+      postcss: 8.5.15
+      rolldown: 1.0.3
+      tinyglobby: 0.2.17
+    optionalDependencies:
+      '@types/node': 24.13.2
+      esbuild: 0.23.1
+      fsevents: 2.3.3
+      jiti: 2.7.0
+      tsx: 4.19.2
+      yaml: 2.9.0
+
+  vitest@4.1.9(@opentelemetry/api@1.9.0)(@types/node@24.13.2)(@vitest/coverage-v8@4.1.9)(vite@8.0.16(@types/node@24.13.2)(esbuild@0.23.1)(jiti@2.7.0)(tsx@4.19.2)(yaml@2.9.0)):
+    dependencies:
+      '@vitest/expect': 4.1.9
+      '@vitest/mocker': 4.1.9(vite@8.0.16(@types/node@24.13.2)(esbuild@0.23.1)(jiti@2.7.0)(tsx@4.19.2)(yaml@2.9.0))
+      '@vitest/pretty-format': 4.1.9
+      '@vitest/runner': 4.1.9
+      '@vitest/snapshot': 4.1.9
+      '@vitest/spy': 4.1.9
+      '@vitest/utils': 4.1.9
+      es-module-lexer: 2.1.0
+      expect-type: 1.3.0
+      magic-string: 0.30.21
+      obug: 2.1.3
+      pathe: 2.0.3
+      picomatch: 4.0.4
+      std-env: 4.1.0
+      tinybench: 2.9.0
+      tinyexec: 1.2.4
+      tinyglobby: 0.2.17
+      tinyrainbow: 3.1.0
+      vite: 8.0.16(@types/node@24.13.2)(esbuild@0.23.1)(jiti@2.7.0)(tsx@4.19.2)(yaml@2.9.0)
+      why-is-node-running: 2.3.0
+    optionalDependencies:
+      '@opentelemetry/api': 1.9.0
+      '@types/node': 24.13.2
+      '@vitest/coverage-v8': 4.1.9(vitest@4.1.9)
+    transitivePeerDependencies:
+      - msw
+
   web-streams-polyfill@3.3.3: {}
 
   which@2.0.2:
     dependencies:
       isexe: 2.0.0
 
+  why-is-node-running@2.3.0:
+    dependencies:
+      siginfo: 2.0.0
+      stackback: 0.0.2
+
   wrap-ansi@7.0.0:
     dependencies:
       ansi-styles: 4.3.0
diff --git a/services/agent/src/cli.ts b/services/agent/src/cli.ts
index 7f45ebb714..6909992b51 100644
--- a/services/agent/src/cli.ts
+++ b/services/agent/src/cli.ts
@@ -1,88 +1,109 @@
 /**
  * WP-2 Pi wrapper CLI: the JSON transport for the Harness port.
  *
- * Reads one JSON `AgentRunRequest` from stdin, runs Pi once, and writes one JSON
- * `AgentRunResult` to stdout. stdout carries the result and nothing else; logs go
- * to stderr. This is the one-shot "json adapter" the design doc describes; a
- * long-lived RPC adapter can replace it later behind the same Python-side port.
+ * Reads one JSON `AgentRunRequest` from stdin, runs the agent once, and writes one JSON
+ * `AgentRunResult` to stdout. stdout carries the result and nothing else; logs go to stderr.
+ * With `--stream`, writes NDJSON instead: one `{kind:"event"}` line per event the moment it
+ * is built, then exactly one terminal `{kind:"result"}` line.
+ *
+ * `runCli(raw, stream, io)` is the testable seam: it takes the raw stdin string and an
+ * injectable engine runner + output sink, and returns the exit code. Tests pass a fake engine
+ * and a collecting `write`, so no stdin/stdout/process.exit mocking is needed; production
+ * defaults to the real engine and `process.stdout` (which keeps streaming live).
  */
 import type {
   AgentRunRequest,
   AgentRunResult,
   EmitEvent,
-  StreamRecord,
 } from "./protocol.ts";
 import { runPi } from "./engines/pi.ts";
 import { runRivet } from "./engines/rivet.ts";
+import { isEntrypoint } from "./entry.ts";
 
-// Engine: `rivet` drives a harness over ACP via a rivet daemon; `pi` (default) is the
-// legacy in-process Pi path. The request's `backend` wins, then the AGENT_BACKEND env.
-function runAgent(
+/** Run one request through an engine. Tests inject a fake to avoid a live harness. */
+export type RunAgent = (
   request: AgentRunRequest,
   emit?: EmitEvent,
-): Promise<AgentRunResult> {
+) => Promise<AgentRunResult>;
+
+// Engine: `rivet` drives a harness over ACP via a rivet daemon; `pi` (default) is the
+// legacy in-process Pi path. The request's `backend` wins, then the AGENT_BACKEND env.
+const runAgent: RunAgent = (request, emit) => {
   const backend = (request.backend ?? process.env.AGENT_BACKEND ?? "pi").toLowerCase();
   return backend === "rivet" ? runRivet(request, emit) : runPi(request, emit);
-}
+};
 
-async function readStdin(): Promise<string> {
-  const chunks: Buffer[] = [];
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk as Buffer);
-  }
-  return Buffer.concat(chunks).toString("utf8");
+function errorMessage(err: unknown): string {
+  return err instanceof Error ? err.stack ?? err.message : String(err);
 }
 
-// One-shot mode: the whole result as a single JSON document (the `/invoke` contract).
-function emitResult(result: AgentRunResult): void {
-  process.stdout.write(JSON.stringify(result));
+export interface CliIO {
+  /** Engine runner; defaults to the real backend dispatch. */
+  run?: RunAgent;
+  /** Output sink; defaults to `process.stdout`. Called incrementally so streaming stays live. */
+  write?: (chunk: string) => void;
 }
 
-// Streaming mode (`--stream`): one NDJSON record per line — an `{kind:"event"}` line the
-// moment each event is built, then exactly one terminal `{kind:"result"}` line.
-function writeRecord(record: StreamRecord): void {
-  process.stdout.write(JSON.stringify(record) + "\n");
-}
-
-async function main(): Promise<void> {
-  const stream = process.argv.includes("--stream");
-  const raw = await readStdin();
+/**
+ * Run one request and return the process exit code (0 = ok, 1 = failure/invalid input).
+ * Output is delivered through `io.write` as it is produced.
+ */
+export async function runCli(
+  raw: string,
+  stream: boolean,
+  io: CliIO = {},
+): Promise<number> {
+  const run = io.run ?? runAgent;
+  const write = io.write ?? ((chunk: string) => void process.stdout.write(chunk));
 
   let request: AgentRunRequest;
   try {
     request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {};
   } catch (err) {
     const failure: AgentRunResult = { ok: false, error: `Invalid JSON on stdin: ${String(err)}` };
-    if (stream) writeRecord({ kind: "result", result: failure });
-    else emitResult(failure);
-    process.exit(1);
+    write(stream ? JSON.stringify({ kind: "result", result: failure }) + "\n" : JSON.stringify(failure));
+    return 1;
   }
 
   if (!stream) {
     try {
-      const result = await runAgent(request);
-      emitResult(result);
-      process.exit(result.ok ? 0 : 1);
+      const result = await run(request);
+      write(JSON.stringify(result));
+      return result.ok ? 0 : 1;
     } catch (err) {
-      emitResult({
-        ok: false,
-        error: err instanceof Error ? err.stack ?? err.message : String(err),
-      });
-      process.exit(1);
+      write(JSON.stringify({ ok: false, error: errorMessage(err) }));
+      return 1;
     }
-    return;
   }
 
-  const emit: EmitEvent = (event) => writeRecord({ kind: "event", event });
+  const emit: EmitEvent = (event) => write(JSON.stringify({ kind: "event", event }) + "\n");
   let result: AgentRunResult;
   try {
-    result = await runAgent(request, emit);
+    result = await run(request, emit);
   } catch (err) {
-    result = { ok: false, error: err instanceof Error ? err.stack ?? err.message : String(err) };
+    result = { ok: false, error: errorMessage(err) };
   }
   // Streaming delivered the events live, so don't echo them in the terminal record.
-  writeRecord({ kind: "result", result: { ...result, events: [] } });
-  process.exit(result.ok ? 0 : 1);
+  write(JSON.stringify({ kind: "result", result: { ...result, events: [] } }) + "\n");
+  return result.ok ? 0 : 1;
 }
 
-main();
+async function readStdin(): Promise<string> { // drains all of stdin and resolves with it as one UTF-8 string
+  const chunks: Buffer[] = [];
+  for await (const chunk of process.stdin) { // iteration finishes once stdin has no more input
+    chunks.push(chunk as Buffer);
+  }
+  return Buffer.concat(chunks).toString("utf8"); // join first, then decode once (not per chunk)
+}
+
+async function main(): Promise<void> { // process entry: read stdin, run once, exit with runCli's code
+  const stream = process.argv.includes("--stream"); // `--stream` anywhere in argv selects NDJSON output
+  const raw = await readStdin();
+  const code = await runCli(raw, stream); // no `io` passed: runCli falls back to its default engine and sink
+  process.exit(code);
+}
+
+// Only run when this file is the process entry (`tsx src/cli.ts`); importing it is inert.
+if (isEntrypoint(import.meta.url)) {
+  void main();
+}
diff --git a/services/agent/src/entry.ts b/services/agent/src/entry.ts
new file mode 100644
index 0000000000..877aac822e
--- /dev/null
+++ b/services/agent/src/entry.ts
@@ -0,0 +1,17 @@
+/**
+ * True when `moduleUrl` is the process entry point, so an entrypoint module runs its `main()`
+ * under `tsx src/x.ts` but stays inert when imported by a test. Compares the resolved real
+ * paths of `process.argv[1]` and the module's own file.
+ */
+import { argv } from "node:process";
+import { realpathSync } from "node:fs";
+import { fileURLToPath } from "node:url";
+
+export function isEntrypoint(moduleUrl: string): boolean {
+  if (!argv[1]) return false; // no script path in argv, so there is nothing to compare against
+  try {
+    return realpathSync(argv[1]) === realpathSync(fileURLToPath(moduleUrl)); // compare resolved real paths, not raw strings
+  } catch { // either path failed to resolve: report "not the entrypoint" instead of throwing
+    return false;
+  }
+}
diff --git a/services/agent/src/server.ts b/services/agent/src/server.ts
index aae23c4480..71c95c0111 100644
--- a/services/agent/src/server.ts
+++ b/services/agent/src/server.ts
@@ -4,13 +4,21 @@
  * Same contract as the CLI, exposed over HTTP so the wrapper can run as its own
  * container (a sidecar) that the Python service calls in-network:
  *
- *   GET  /health -> { status: "ok" }
+ *   GET  /health -> runner identity ({ status, runner, protocol, engines, harnesses })
  *   POST /run    -> body is an AgentRunRequest, response is an AgentRunResult
  *
  * Uses Node's built-in http server (no framework dependency). Pi auth comes from
  * PI_CODING_AGENT_DIR / ~/.pi/agent, mounted into the container.
+ *
+ * `createAgentServer(run)` is the testable seam: it builds the server around an injectable
+ * engine runner so the HTTP behavior can be tested with a fake engine (no live harness).
  */
-import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
+import {
+  createServer,
+  type IncomingMessage,
+  type Server,
+  type ServerResponse,
+} from "node:http";
 
 import type {
   AgentRunRequest,
@@ -20,6 +28,8 @@ import type {
 } from "./protocol.ts";
 import { runPi } from "./engines/pi.ts";
 import { runRivet } from "./engines/rivet.ts";
+import { runnerInfo } from "./version.ts";
+import { isEntrypoint } from "./entry.ts";
 
 const PORT = Number(process.env.PORT ?? 8765);
 
@@ -29,18 +39,21 @@ const PORT = Number(process.env.PORT ?? 8765);
 // request shape (a rivet request carries `harness`/`sandbox`).
 const DEFAULT_BACKEND = (process.env.AGENT_BACKEND ?? "auto").toLowerCase();
 
-function runAgent(
+/** Run one request through an engine. Tests inject a fake to avoid a live harness. */
+export type RunAgent = (
   request: AgentRunRequest,
   emit?: EmitEvent,
   signal?: AbortSignal,
-): Promise<AgentRunResult> {
+) => Promise<AgentRunResult>;
+
+const runAgent: RunAgent = (request, emit, signal) => {
   const backend = (request.backend ?? DEFAULT_BACKEND).toLowerCase();
   if (backend === "rivet") return runRivet(request, emit, signal);
   if (backend === "pi") return runPi(request, emit);
   return request.harness || request.sandbox
     ? runRivet(request, emit, signal)
     : runPi(request, emit);
-}
+};
 
 /**
  * Stream a run as NDJSON: one `{kind:"event"}` line per event the moment it is built, then
@@ -48,9 +61,10 @@ function runAgent(
  * with `Accept: application/x-ndjson`; the one-shot `/run` path is left untouched.
  */
 async function runAndStream(
-  req: IncomingMessage,
+  _req: IncomingMessage,
   res: ServerResponse,
   request: AgentRunRequest,
+  run: RunAgent,
 ): Promise<void> {
   res.writeHead(200, {
     "content-type": "application/x-ndjson",
@@ -75,7 +89,7 @@ async function runAndStream(
 
   let result: AgentRunResult;
   try {
-    result = await runAgent(request, emit, controller.signal);
+    result = await run(request, emit, controller.signal);
   } catch (err) {
     const message = err instanceof Error ? err.stack ?? err.message : String(err);
     result = { ok: false, error: message };
@@ -102,54 +116,68 @@ async function readBody(req: IncomingMessage): Promise<string> {
   return Buffer.concat(chunks).toString("utf8");
 }
 
-const server = createServer(async (req, res) => {
-  try {
-    if (req.method === "GET" && req.url === "/health") {
-      return send(res, 200, { status: "ok" });
-    }
-
-    if (req.method === "POST" && req.url === "/run") {
-      const raw = await readBody(req);
-      let request: AgentRunRequest;
-      try {
-        request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {};
-      } catch (err) {
-        return send(res, 400, { ok: false, error: `Invalid JSON: ${String(err)}` });
+/** Build the HTTP request listener around a given engine runner (the testable seam). */
+export function createRequestListener(
+  run: RunAgent,
+): (req: IncomingMessage, res: ServerResponse) => Promise<void> {
+  return async (req, res) => {
+    try {
+      if (req.method === "GET" && req.url === "/health") {
+        return send(res, 200, runnerInfo());
       }
 
-      const wantsStream = (req.headers["accept"] ?? "").includes(
-        "application/x-ndjson",
-      );
-      if (wantsStream) {
-        await runAndStream(req, res, request);
-        return;
+      if (req.method === "POST" && req.url === "/run") {
+        const raw = await readBody(req);
+        let request: AgentRunRequest;
+        try {
+          request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {};
+        } catch (err) {
+          return send(res, 400, { ok: false, error: `Invalid JSON: ${String(err)}` });
+        }
+
+        const wantsStream = (req.headers["accept"] ?? "").includes(
+          "application/x-ndjson",
+        );
+        if (wantsStream) {
+          await runAndStream(req, res, request, run);
+          return;
+        }
+
+        const result = await run(request);
+        return send(res, result.ok ? 200 : 500, result);
       }
 
-      const result = await runAgent(request);
-      return send(res, result.ok ? 200 : 500, result);
+      return send(res, 404, { ok: false, error: "Not found" });
+    } catch (err) {
+      const message = err instanceof Error ? err.stack ?? err.message : String(err);
+      return send(res, 500, { ok: false, error: message });
     }
+  };
+}
 
-    return send(res, 404, { ok: false, error: "Not found" });
-  } catch (err) {
-    const message = err instanceof Error ? err.stack ?? err.message : String(err);
-    return send(res, 500, { ok: false, error: message });
-  }
-});
-
-// The rivet SDK can reject a background promise (e.g. an adapter install or the Daytona
-// preview SSE failing) outside any awaited path. Node's default turns that into an
-// uncaught exception that kills the whole process — taking every in-flight request with
-// it (the caller sees "Server disconnected"). Log and keep serving instead; the failing
-// run still returns its own error to its caller.
-process.on("unhandledRejection", (reason) => {
-  process.stderr.write(
-    `[pi-wrapper] unhandledRejection: ${reason instanceof Error ? (reason.stack ?? reason.message) : String(reason)}\n`,
-  );
-});
-process.on("uncaughtException", (err) => {
-  process.stderr.write(`[pi-wrapper] uncaughtException: ${err.stack ?? err.message}\n`);
-});
-
-server.listen(PORT, () => {
-  process.stderr.write(`[pi-wrapper] http server listening on :${PORT}\n`);
-});
+/** Create the sidecar HTTP server. Defaults to the real engine dispatch; tests pass a fake. */
+export function createAgentServer(run: RunAgent = runAgent): Server {
+  return createServer(createRequestListener(run));
+}
+
+// Only run as a server when this file is the process entry (`tsx src/server.ts`); importing
+// it (e.g. from a test) is inert.
+if (isEntrypoint(import.meta.url)) {
+  // The rivet SDK can reject a background promise (e.g. an adapter install or the Daytona
+  // preview SSE failing) outside any awaited path. Node's default turns that into an
+  // uncaught exception that kills the whole process — taking every in-flight request with
+  // it (the caller sees "Server disconnected"). Log and keep serving instead; the failing
+  // run still returns its own error to its caller.
+  process.on("unhandledRejection", (reason) => {
+    process.stderr.write(
+      `[pi-wrapper] unhandledRejection: ${reason instanceof Error ? (reason.stack ?? reason.message) : String(reason)}\n`,
+    );
+  });
+  process.on("uncaughtException", (err) => {
+    process.stderr.write(`[pi-wrapper] uncaughtException: ${err.stack ?? err.message}\n`);
+  });
+
+  createAgentServer().listen(PORT, () => {
+    process.stderr.write(`[pi-wrapper] http server listening on :${PORT}\n`);
+  });
+}
diff --git a/services/agent/src/tools/dispatch.ts b/services/agent/src/tools/dispatch.ts
index fd68a87b72..ee9845b9f1 100644
--- a/services/agent/src/tools/dispatch.ts
+++ b/services/agent/src/tools/dispatch.ts
@@ -85,11 +85,11 @@ export async function relayToolCall(
         /* best-effort cleanup */
       }
       if (res.ok) return res.text ?? "";
-      throw new Error(res.error || `tool relay failed for ${callRef}`);
+      throw new Error(res.error || `tool relay failed for ${toolName}`);
     }
     await sleep(RELAY_POLL_MS);
   }
-  throw new Error(`tool relay timed out for ${callRef}`);
+  throw new Error(`tool relay timed out for ${toolName}`);
 }
 
 /**
diff --git a/services/agent/src/version.ts b/services/agent/src/version.ts
new file mode 100644
index 0000000000..5c34701fd3
--- /dev/null
+++ b/services/agent/src/version.ts
@@ -0,0 +1,35 @@
+/**
+ * Runner identity, surfaced on `GET /health` so a client can detect an incompatible runner
+ * before the first run (the version-skew guard).
+ *
+ * `PROTOCOL_VERSION` is the major version of the `/run` wire contract in `protocol.ts`. Bump it only
+ * for a change that is not backward compatible; a client that probes `/health` can then
+ * refuse a runner whose protocol major it does not support. `RUNNER_VERSION` is the package
+ * version (the build), distinct from the protocol.
+ */
+import pkg from "../package.json";
+
+export const PROTOCOL_VERSION = 1;
+export const RUNNER_VERSION: string = pkg.version;
+export const ENGINES = ["pi", "rivet"] as const;
+export const HARNESSES = ["pi", "claude", "agenta"] as const;
+
+export interface RunnerInfo {
+  status: "ok"; // the only value this type allows
+  /** Package build version (e.g. "0.1.0"). */
+  runner: string;
+  /** Wire-contract major version. A client refuses a major version it does not understand. */
+  protocol: number;
+  engines: readonly string[]; // runnerInfo() fills this from ENGINES
+  harnesses: readonly string[]; // runnerInfo() fills this from HARNESSES
+}
+
+export function runnerInfo(): RunnerInfo { // builds the runner-identity object from this module's constants
+  return {
+    status: "ok",
+    runner: RUNNER_VERSION, // `version` field of ../package.json
+    protocol: PROTOCOL_VERSION, // wire-contract major version, not the package version
+    engines: ENGINES,
+    harnesses: HARNESSES,
+  };
+}
diff --git a/services/agent/test/code-tool.test.ts b/services/agent/test/code-tool.test.ts
deleted file mode 100644
index 0711f57b41..0000000000
--- a/services/agent/test/code-tool.test.ts
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Unit test for the code-tool executor (runCodeTool).
- *
- * Exercises both runtimes end-to-end through real subprocesses: a python tool, node tools
- * written as a bare top-level `function main` (the F2 regression) and as an explicit
- * `module.exports.main`, an async node `main`, the F3 env-isolation guarantee (provider keys
- * do NOT leak in; declared scoped secrets DO), and the non-zero-exit reject path.
- *
- * Run: pnpm exec tsx test/code-tool.test.ts
- */
-import assert from "node:assert/strict";
-
-import { runCodeTool } from "../src/tools/code.ts";
-
-// --- Python: bare `def main(**kw)` ------------------------------------------
-{
-  const code = 'def main(**kw):\n    return {"sum": kw.get("a", 0) + kw.get("b", 0)}\n';
-  const out = await runCodeTool("python", code, undefined, { a: 2, b: 3 });
-  assert.deepEqual(JSON.parse(out), { sum: 5 }, "python bare main returns the right JSON");
-}
-
-// --- Node: bare top-level `function main` (F2 regression) -------------------
-{
-  const code = "function main(inputs) { return { got: inputs }; }";
-  const out = await runCodeTool("node", code, undefined, { hello: "world" });
-  assert.deepEqual(
-    JSON.parse(out),
-    { got: { hello: "world" } },
-    "node bare function main executes and echoes the input",
-  );
-}
-
-// --- Node: explicit `module.exports.main` -----------------------------------
-{
-  const code = "module.exports.main = function (inputs) { return { via: 'exports', got: inputs }; };";
-  const out = await runCodeTool("node", code, undefined, { x: 1 });
-  assert.deepEqual(
-    JSON.parse(out),
-    { via: "exports", got: { x: 1 } },
-    "node module.exports.main works",
-  );
-}
-
-// --- Node: async `main` returning a Promise ---------------------------------
-{
-  const code =
-    "async function main(inputs) { await new Promise((r) => setTimeout(r, 5)); return { doubled: inputs.n * 2 }; }";
-  const out = await runCodeTool("node", code, undefined, { n: 21 });
-  assert.deepEqual(JSON.parse(out), { doubled: 42 }, "node async main resolves");
-}
-
-// --- F3: provider keys do NOT leak; scoped secrets DO -----------------------
-{
-  const hadKey = "OPENAI_API_KEY" in process.env;
-  const prevKey = process.env.OPENAI_API_KEY;
-  process.env.OPENAI_API_KEY = "leak-me-xyz";
-  try {
-    // The provider key sits in process.env but must not reach the snippet.
-    const leakCode = "function main() { return { key: process.env.OPENAI_API_KEY ?? 'absent' }; }";
-    const leakOut = await runCodeTool("node", leakCode, undefined, {});
-    assert.deepEqual(
-      JSON.parse(leakOut),
-      { key: "absent" },
-      "F3: OPENAI_API_KEY did NOT leak into the snippet env",
-    );
-
-    // A secret declared on the tool (passed via the scoped `env` arg) must be visible.
-    const scopedCode =
-      "function main() { return { secret: process.env.MY_TOOL_SECRET ?? 'absent' }; }";
-    const scopedOut = await runCodeTool("node", scopedCode, { MY_TOOL_SECRET: "ok" }, {});
-    assert.deepEqual(
-      JSON.parse(scopedOut),
-      { secret: "ok" },
-      "F3: scoped MY_TOOL_SECRET IS visible to the snippet",
-    );
-  } finally {
-    if (hadKey) process.env.OPENAI_API_KEY = prevKey;
-    else delete process.env.OPENAI_API_KEY;
-  }
-}
-
-// --- Non-zero exit / throw rejects ------------------------------------------
-{
-  const code = "function main() { throw new Error('boom'); }";
-  await assert.rejects(
-    () => runCodeTool("node", code, undefined, {}),
-    /boom|exited/,
-    "a throwing snippet rejects",
-  );
-}
-
-console.log("code-tool.test.ts: all assertions passed");
diff --git a/services/agent/test/continuation.test.ts b/services/agent/test/continuation.test.ts
deleted file mode 100644
index c9f9d4356c..0000000000
--- a/services/agent/test/continuation.test.ts
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Unit tests for the cross-turn HITL continuation substrate.
- *
- * Under the cold model the harness rebuilds context from the replayed transcript, and ACP
- * prompt content blocks cannot carry tool calls/results. So a resolved interaction (an
- * approved tool that ran, a client-fulfilled tool) must survive into the replay as text.
- * `messageTranscript` encodes tool turns; `buildTurnText` keeps them in the replayed history.
- *
- * Run: pnpm exec tsx test/continuation.test.ts
- */
-import assert from "node:assert/strict";
-
-import { messageTranscript, buildTurnText } from "../src/engines/rivet.ts";
-import {
-  resolveRunSessionId,
-  type AgentRunRequest,
-  type ContentBlock,
-} from "../src/protocol.ts";
-
-// --- messageTranscript -------------------------------------------------------
-assert.equal(messageTranscript("hello"), "hello");
-assert.equal(messageTranscript([{ type: "text", text: "a" }, { type: "text", text: "b" }]), "a\nb");
-assert.equal(
-  messageTranscript([{ type: "tool_call", toolName: "getWeather", input: { city: "Paris" } }]),
-  '[called getWeather({"city":"Paris"})]',
-);
-assert.equal(
-  messageTranscript([{ type: "tool_result", toolName: "getWeather", output: { temp: 24 } }]),
-  '[getWeather returned: {"temp":24}]',
-);
-assert.equal(
-  messageTranscript([{ type: "tool_result", toolName: "send", output: "boom", isError: true }]),
-  "[send error: boom]",
-);
-
-// --- session id metadata ------------------------------------------------------
-assert.equal(
-  resolveRunSessionId({ sessionId: "sess_platform" }, "runner-ephemeral"),
-  "sess_platform",
-);
-assert.equal(resolveRunSessionId({}, "runner-ephemeral"), "runner-ephemeral");
-
-// --- buildTurnText keeps a resolved tool turn in the replay ------------------
-{
-  const req: AgentRunRequest = {
-    messages: [
-      { role: "user", content: "weather in Paris?" },
-      {
-        role: "assistant",
-        content: [{ type: "tool_call", toolName: "getWeather", input: { city: "Paris" } } as ContentBlock],
-      },
-      {
-        role: "tool",
-        content: [{ type: "tool_result", toolName: "getWeather", output: { temp: 24 } } as ContentBlock],
-      },
-      { role: "user", content: "and tomorrow?" },
-    ],
-  };
-  const text = buildTurnText(req);
-  assert.ok(text.includes("called getWeather"), "tool call survives replay");
-  assert.ok(text.includes("getWeather returned"), "tool result survives replay");
-  assert.ok(text.includes("and tomorrow?"), "latest user prompt is the live turn");
-  assert.ok(text.startsWith("Conversation so far:"), "transcript header present");
-}
-
-console.log("continuation.test.ts: all assertions passed");
diff --git a/services/agent/test/extension-tools.test.ts b/services/agent/test/extension-tools.test.ts
deleted file mode 100644
index 5db5e22177..0000000000
--- a/services/agent/test/extension-tools.test.ts
+++ /dev/null
@@ -1,109 +0,0 @@
-/**
- * Regression: the Agenta Pi extension registers custom tools from AGENTA_TOOL_PUBLIC_SPECS.
- *
- * Guards QA finding F-005 (docs/design/agent-workflows/qa/findings.md): a build where the
- * extension stopped reading AGENTA_TOOL_PUBLIC_SPECS shipped custom tools that the model never
- * saw, so it improvised with bash and failed. This pins the contract at the source: given the
- * public-spec env the runner sets (buildPiExtensionEnv in engines/rivet.ts), the extension
- * factory calls pi.registerTool once per spec, passes the JSON Schema through, and gives each
- * tool an execute() that relays to the runner. It is also inert when the env is absent.
- *
- * Run: pnpm exec tsx test/extension-tools.test.ts
- */
-import assert from "node:assert/strict";
-
-import factory from "../src/extensions/agenta.ts";
-
-const TOOL_ENV = [
-  "AGENTA_TOOL_PUBLIC_SPECS",
-  "AGENTA_TOOL_RELAY_DIR",
-  "AGENTA_TRACEPARENT",
-  "AGENTA_OTLP_ENDPOINT",
-  "AGENTA_USAGE_OUT",
-  "AGENTA_CAPTURE_CONTENT",
-];
-
-function fakePi() {
-  const registered: any[] = [];
-  return {
-    registered,
-    registerTool(spec: any) {
-      registered.push(spec);
-    },
-    on() {},
-  };
-}
-
-function clearEnv() {
-  for (const key of TOOL_ENV) delete process.env[key];
-}
-
-// --- registers one tool per public spec, schema passed through --------------
-{
-  clearEnv();
-  process.env.AGENTA_TOOL_PUBLIC_SPECS = JSON.stringify([
-    {
-      name: "secret_math",
-      description: "qa math",
-      inputSchema: {
-        type: "object",
-        properties: { x: { type: "integer" } },
-        required: ["x"],
-      },
-    },
-    { name: "no_schema_tool", description: "no schema" },
-  ]);
-  process.env.AGENTA_TOOL_RELAY_DIR = "/tmp/agenta-relay-test";
-
-  const pi = fakePi();
-  factory(pi as any);
-
-  assert.equal(pi.registered.length, 2, "registers one tool per public spec");
-  assert.deepEqual(
-    pi.registered.map((t) => t.name),
-    ["secret_math", "no_schema_tool"],
-    "registers each spec by name",
-  );
-
-  const math = pi.registered[0];
-  assert.equal(math.description, "qa math", "carries the description");
-  assert.ok(
-    math.parameters && math.parameters.properties && math.parameters.properties.x,
-    "passes the JSON Schema through to Pi",
-  );
-  assert.equal(typeof math.execute, "function", "each tool has an execute() that relays");
-
-  const noSchema = pi.registered[1];
-  assert.ok(
-    noSchema.parameters,
-    "a spec without inputSchema falls back to a schema, never undefined",
-  );
-}
-
-// --- inert without the tool env (the F-005 bug shape: never delivered) ------
-{
-  clearEnv();
-  const pi = fakePi();
-  factory(pi as any);
-  assert.equal(
-    pi.registered.length,
-    0,
-    "no tool env => registers nothing (no silent partial state)",
-  );
-}
-
-// --- specs present but relay dir missing => does not register ---------------
-{
-  clearEnv();
-  process.env.AGENTA_TOOL_PUBLIC_SPECS = JSON.stringify([{ name: "x" }]);
-  const pi = fakePi();
-  factory(pi as any);
-  assert.equal(
-    pi.registered.length,
-    0,
-    "specs without a relay dir do not register (incomplete wiring is not honored)",
-  );
-}
-
-clearEnv();
-console.log("extension-tools.test.ts: all assertions passed");
diff --git a/services/agent/test/mcp-servers.test.ts b/services/agent/test/mcp-servers.test.ts
deleted file mode 100644
index 97e821429f..0000000000
--- a/services/agent/test/mcp-servers.test.ts
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Unit tests for the user-declared MCP server conversion (Agent B's Slice 4, wired in rivet).
- *
- * Agent B's `resolve_mcp_servers` emits the McpServerConfig wire shape
- * ({name,transport,command,args,env,url?,tools?}, env as a Record), pinned in the Python
- * test_wire_contract. This covers the TS half: converting that to the ACP stdio entry the
- * session consumes (env as a {name,value} list), skipping remote/http, and not enforcing the
- * per-server tools allowlist over ACP in v1.
- *
- * Run: pnpm exec tsx test/mcp-servers.test.ts
- */
-import assert from "node:assert/strict";
-
-import { toAcpMcpServers } from "../src/engines/rivet.ts";
-import type { McpServerConfig } from "../src/protocol.ts";
-
-assert.deepEqual(toAcpMcpServers(undefined), [], "undefined -> []");
-assert.deepEqual(toAcpMcpServers([]), [], "[] -> []");
-
-// stdio server: env Record -> ACP {name,value} list; defaults applied.
-{
-  const servers: McpServerConfig[] = [
-    {
-      name: "github",
-      transport: "stdio",
-      command: "npx",
-      args: ["-y", "@modelcontextprotocol/server-github"],
-      env: { GITHUB_PERSONAL_ACCESS_TOKEN: "ghp_x", LOG_LEVEL: "info" },
-      tools: ["create_issue"], // allowlist not enforced over ACP v1 (logged), server still delivered
-    },
-  ];
-  const out = toAcpMcpServers(servers);
-  assert.equal(out.length, 1);
-  assert.equal(out[0].name, "github");
-  assert.equal(out[0].command, "npx");
-  assert.deepEqual(out[0].args, ["-y", "@modelcontextprotocol/server-github"]);
-  assert.deepEqual(out[0].env, [
-    { name: "GITHUB_PERSONAL_ACCESS_TOKEN", value: "ghp_x" },
-    { name: "LOG_LEVEL", value: "info" },
-  ]);
-}
-
-// remote/http is skipped (no auth on the wire by design); stdio without command is skipped.
-{
-  const out = toAcpMcpServers([
-    { name: "remote", transport: "http", url: "https://example.com/mcp" },
-    { name: "broken", transport: "stdio" }, // no command
-  ]);
-  assert.deepEqual(out, [], "http + command-less stdio both skipped");
-}
-
-// missing env / args default to empty.
-{
-  const out = toAcpMcpServers([{ name: "fs", transport: "stdio", command: "mcp-fs" }]);
-  assert.deepEqual(out, [{ name: "fs", command: "mcp-fs", args: [], env: [] }]);
-}
-
-console.log("mcp-servers.test.ts: all assertions passed");
diff --git a/services/agent/test/responder.test.ts b/services/agent/test/responder.test.ts
deleted file mode 100644
index e06ae43e00..0000000000
--- a/services/agent/test/responder.test.ts
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Unit tests for the interaction responder seam and the otel `emitEvent` hook.
- *
- * Covers the behavior parity of the responder (it replaces the old inline auto-approve in
- * rivet.ts) and that an out-of-stream event (an `interaction_request`) routed through
- * `emitEvent` lands in both the live sink and the batch `events()` log. No harness, no
- * network.
- *
- * Run: pnpm exec tsx test/responder.test.ts
- */
-import assert from "node:assert/strict";
-
-import { createRivetOtel } from "../src/tracing/otel.ts";
-import type { AgentEvent } from "../src/protocol.ts";
-import {
-  PolicyResponder,
-  decisionToReply,
-  policyFromRequest,
-} from "../src/responder.ts";
-
-// --- policyFromRequest -------------------------------------------------------
-{
-  delete process.env.AGENTA_RIVET_DENY_PERMISSIONS;
-  assert.equal(policyFromRequest(undefined), "auto");
-  assert.equal(policyFromRequest("auto"), "auto");
-  assert.equal(policyFromRequest("deny"), "deny");
-
-  process.env.AGENTA_RIVET_DENY_PERMISSIONS = "true";
-  assert.equal(policyFromRequest(undefined), "deny", "env forces deny");
-  assert.equal(policyFromRequest("auto"), "deny", "env overrides auto");
-  delete process.env.AGENTA_RIVET_DENY_PERMISSIONS;
-}
-
-// --- decisionToReply (parity with the old inline mapping) --------------------
-{
-  assert.equal(decisionToReply("allow", ["always", "once", "reject"]), "always");
-  assert.equal(decisionToReply("allow", ["once", "reject"]), "once");
-  assert.equal(decisionToReply("allow", []), "once", "allow falls back to once");
-  assert.equal(decisionToReply("deny", ["always", "once", "reject"]), "reject");
-  assert.equal(decisionToReply("deny", []), "reject", "deny falls back to reject");
-}
-
-// --- PolicyResponder ---------------------------------------------------------
-{
-  const auto = new PolicyResponder("auto");
-  const deny = new PolicyResponder("deny");
-  const req = { id: "p1", availableReplies: ["once", "reject"] };
-  assert.equal(await auto.onPermission(req), "allow");
-  assert.equal(await deny.onPermission(req), "deny");
-}
-
-// --- emitEvent: streaming path (sink + batch) --------------------------------
-{
-  const emitted: AgentEvent[] = [];
-  const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) });
-  run.start({ prompt: "hi" });
-  const interaction: AgentEvent = {
-    type: "interaction_request",
-    id: "p1",
-    kind: "permission",
-    payload: { availableReplies: ["once", "reject"] },
-  };
-  run.emitEvent(interaction);
-
-  const live = emitted.find((e) => e.type === "interaction_request");
-  assert.ok(live, "interaction_request flushed to the live sink");
-  assert.equal((live as any).id, "p1");
-  assert.ok(
-    run.events().some((e) => e.type === "interaction_request"),
-    "interaction_request also recorded in the batch log",
-  );
-}
-
-// --- emitEvent: one-shot path (batch only) -----------------------------------
-{
-  const run = createRivetOtel({ harness: "claude", model: "anthropic/x" });
-  run.start({ prompt: "hi" });
-  run.emitEvent({ type: "data", name: "weather", data: { temp: 24 } });
-  const ev = run.events().find((e) => e.type === "data");
-  assert.ok(ev, "data event recorded with no live sink");
-  assert.equal((ev as any).name, "weather");
-}
-
-console.log("responder.test.ts: all assertions passed");
diff --git a/services/agent/test/stream-events.test.ts b/services/agent/test/stream-events.test.ts
deleted file mode 100644
index f27e31fc23..0000000000
--- a/services/agent/test/stream-events.test.ts
+++ /dev/null
@@ -1,148 +0,0 @@
-/**
- * Unit test for the createRivetOtel delta/lifecycle state machine.
- *
- * Drives `handleUpdate` with a hand-built ACP `session/update` sequence (Claude-style
- * cumulative text snapshots, a tool call between two text runs, a reasoning run) and asserts
- * the streaming and one-shot event shapes. No harness, no network: spans are built offline
- * and never flushed.
- *
- * Run: pnpm exec tsx test/stream-events.test.ts
- */
-import assert from "node:assert/strict";
-
-import { createRivetOtel } from "../src/tracing/otel.ts";
-import type { AgentEvent } from "../src/protocol.ts";
-
-const textChunk = (text: string) => ({
-  sessionUpdate: "agent_message_chunk",
-  content: { type: "text", text },
-});
-const thoughtChunk = (text: string) => ({
-  sessionUpdate: "agent_thought_chunk",
-  content: { type: "text", text },
-});
-const toolCall = (id: string, title: string, rawInput: unknown) => ({
-  sessionUpdate: "tool_call",
-  toolCallId: id,
-  title,
-  rawInput,
-});
-const toolDone = (id: string, text: string) => ({
-  sessionUpdate: "tool_call_update",
-  toolCallId: id,
-  status: "completed",
-  content: [{ content: { type: "text", text } }],
-});
-const usage = () => ({ sessionUpdate: "usage_update", used: 100, cost: { amount: 0.01 } });
-
-// The same ACP sequence drives both modes: two text runs around a tool call, then reasoning.
-function drive(run: ReturnType<typeof createRivetOtel>): void {
-  run.start({ prompt: "weather in Paris?" });
-  run.handleUpdate(textChunk("Hello ")); // pure delta
-  run.handleUpdate(textChunk("Hello world")); // cumulative snapshot (Claude-style)
-  run.handleUpdate(toolCall("call_1", "getWeather", { city: "Paris" }));
-  run.handleUpdate(toolDone("call_1", "sunny"));
-  run.handleUpdate(textChunk("Hello world It is sunny.")); // resumes after the tool
-  run.handleUpdate(thoughtChunk("thinking..."));
-  run.handleUpdate(usage());
-}
-
-const types = (events: AgentEvent[]) => events.map((e) => e.type);
-const ofType = <T extends AgentEvent["type"]>(events: AgentEvent[], t: T) =>
-  events.filter((e) => e.type === t) as Extract<AgentEvent, { type: T }>[];
-
-// --- Scenario 1: streaming (emit set) ---------------------------------------
-{
-  const emitted: AgentEvent[] = [];
-  const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) });
-  drive(run);
-  const finalText = run.finish();
-
-  // No coalesced text events on the streaming path.
-  assert.equal(ofType(emitted, "message").length, 0, "no coalesced message when streaming");
-  assert.equal(ofType(emitted, "thought").length, 0, "no coalesced thought when streaming");
-
-  // Exactly one terminal done.
-  assert.equal(ofType(emitted, "done").length, 1, "exactly one done");
-
-  // Two text blocks (split by the tool call), one reasoning block, balanced start/end.
-  const mStart = ofType(emitted, "message_start");
-  const mEnd = ofType(emitted, "message_end");
-  assert.equal(mStart.length, 2, "two message_start");
-  assert.equal(mEnd.length, 2, "two message_end");
-  assert.deepEqual(mStart.map((e) => e.id), ["msg-0", "msg-1"], "stable monotonic text ids");
-  const rStart = ofType(emitted, "reasoning_start");
-  const rEnd = ofType(emitted, "reasoning_end");
-  assert.equal(rStart.length, 1, "one reasoning_start");
-  assert.equal(rEnd.length, 1, "one reasoning_end");
-
-  // Deltas are pure and reconstruct the full text, with no overlap/repeat.
-  const text = ofType(emitted, "message_delta").map((e) => e.delta).join("");
-  assert.equal(text, "Hello world It is sunny.", "concatenated deltas == full text");
-  assert.equal(text, finalText, "deltas match finish() output");
-  const reasoning = ofType(emitted, "reasoning_delta").map((e) => e.delta).join("");
-  assert.equal(reasoning, "thinking...", "concatenated reasoning deltas");
-
-  // Ordering invariant: each block's start precedes its deltas precede its end; tool result
-  // lands before the second text block opens.
-  const seq = types(emitted);
-  assert.ok(seq.indexOf("message_end") < seq.indexOf("tool_call"), "first text block closes before the tool call");
-  assert.ok(seq.indexOf("tool_result") < seq.lastIndexOf("message_start"), "tool result precedes the second text block");
-  for (const id of ["msg-0", "msg-1", "reason-2"]) {
-    const idxs = emitted
-      .map((e, i) => ((e as any).id === id ? { i, t: e.type } : null))
-      .filter(Boolean) as { i: number; t: string }[];
-    assert.ok(idxs[0].t.endsWith("_start"), `${id} starts with *_start`);
-    assert.ok(idxs[idxs.length - 1].t.endsWith("_end"), `${id} ends with *_end`);
-  }
-}
-
-// --- Scenario 2: one-shot (no emit) -----------------------------------------
-{
-  const run = createRivetOtel({ harness: "claude", model: "anthropic/x" });
-  drive(run);
-  const finalText = run.finish();
-  const events = run.events();
-
-  // Coalesced text/thought, no delta lifecycle events.
-  const messages = ofType(events, "message");
-  assert.equal(messages.length, 1, "one coalesced message");
-  assert.equal(messages[0].text, "Hello world It is sunny.", "coalesced text == final");
-  assert.equal(messages[0].text, finalText);
-  assert.equal(ofType(events, "thought").length, 1, "one coalesced thought");
-  for (const t of ["message_start", "message_delta", "message_end", "reasoning_start", "reasoning_delta", "reasoning_end"]) {
-    assert.equal(events.filter((e) => e.type === t).length, 0, `no ${t} on the one-shot path`);
-  }
-
-  // The structured tool/usage events are still present, with exactly one done.
-  assert.equal(ofType(events, "tool_call").length, 1, "tool_call present");
-  assert.equal(ofType(events, "tool_result").length, 1, "tool_result present");
-  assert.equal(ofType(events, "usage").length, 1, "usage present");
-  assert.equal(ofType(events, "done").length, 1, "exactly one done");
-}
-
-// --- Scenario 3: span-less mode still records ACP events ---------------------
-{
-  const run = createRivetOtel({ harness: "pi", model: "openai-codex/x", emitSpans: false });
-  drive(run);
-  run.setUsage({ input: 4, output: 6, total: 10, cost: 0.02 });
-  const finalText = run.finish();
-  const events = run.events();
-
-  assert.equal(finalText, "Hello world It is sunny.");
-  assert.equal(ofType(events, "message").length, 1, "message present without spans");
-  assert.equal(ofType(events, "thought").length, 1, "thought present without spans");
-  assert.equal(ofType(events, "tool_call").length, 1, "tool_call present without spans");
-  assert.equal(ofType(events, "tool_result").length, 1, "tool_result present without spans");
-  const usageEvents = ofType(events, "usage");
-  assert.equal(usageEvents.length, 1, "usage present without spans");
-  assert.deepEqual(
-    usageEvents[0],
-    { type: "usage", input: 4, output: 6, total: 10, cost: 0.02 },
-    "final usage replaces stream-only usage before done",
-  );
-  assert.equal(ofType(events, "done").length, 1, "exactly one done without spans");
-  assert.ok(types(events).indexOf("usage") < types(events).indexOf("done"), "usage precedes done");
-}
-
-console.log("stream-events.test.ts: all assertions passed");
diff --git a/services/agent/test/tool-bridge.test.ts b/services/agent/test/tool-bridge.test.ts
deleted file mode 100644
index 4dac2b3f9d..0000000000
--- a/services/agent/test/tool-bridge.test.ts
+++ /dev/null
@@ -1,169 +0,0 @@
-/**
- * Unit tests for buildToolMcpServers (the tool MCP bridge attachment decision).
- *
- * Regression cover for F4: attachment must be decided per tool kind, not on the callback
- * endpoint alone. A `code` tool runs locally in mcp-server.ts and needs no endpoint, so a run
- * whose tools are all `code` must still attach the `agenta-tools` server. Only `callback`-kind
- * tools require AGENTA_TOOL_CALLBACK_ENDPOINT; missing it must degrade those tools, not drop the
- * whole server. `client` tools are browser-fulfilled and never justify attaching the bridge.
- *
- * Run: pnpm exec tsx test/tool-bridge.test.ts
- */
-import assert from "node:assert/strict";
-
-import { buildToolMcpServers } from "../src/tools/mcp-bridge.ts";
-import type { ResolvedToolSpec, ToolCallbackContext } from "../src/protocol.ts";
-
-/** Look up an env var value by name in the ACP {name,value} list (undefined if absent). */
-function envValue(
-  env: { name: string; value: string }[],
-  name: string,
-): string | undefined {
-  return env.find((e) => e.name === name)?.value;
-}
-
-const relayDir = "/tmp/agenta-tools";
-
-// code-only specs + no callback -> one server, with public specs and relay dir.
-{
-  const specs: ResolvedToolSpec[] = [
-    {
-      name: "adder",
-      description: "Add numbers",
-      kind: "code",
-      runtime: "python",
-      code: "def main(**k): return 1",
-      env: { PRIVATE: "secret" },
-    },
-  ];
-  const out = buildToolMcpServers(specs, relayDir);
-  assert.equal(out.length, 1, "code-only run still attaches the server");
-  assert.equal(out[0].name, "agenta-tools");
-  assert.ok(
-    envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS") !== undefined,
-    "AGENTA_TOOL_PUBLIC_SPECS is set",
-  );
-  assert.equal(
-    envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
-    undefined,
-    "no endpoint env for code-only run",
-  );
-  assert.equal(envValue(out[0].env, "AGENTA_TOOL_RELAY_DIR"), relayDir);
-  assert.equal(envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"), undefined);
-  assert.equal(envValue(out[0].env, "AGENTA_TOOL_SPECS"), undefined);
-  // Only public metadata round-trips; private executor fields stay runner-side.
-  assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!), [
-    { name: "adder", description: "Add numbers" },
-  ]);
-}
-
-// callback specs + a callback with endpoint -> still no endpoint/auth in child env.
-{
-  const specs: ResolvedToolSpec[] = [
-    { name: "search", kind: "callback", callRef: "composio.search" },
-  ];
-  const callback: ToolCallbackContext = {
-    endpoint: "https://agenta.example/tools/call",
-    authorization: "Bearer tok",
-  };
-  const out = buildToolMcpServers(specs, callback, relayDir);
-  assert.equal(out.length, 1);
-  assert.equal(
-    envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
-    undefined,
-    "endpoint env is never exposed to the bridge",
-  );
-  assert.equal(
-    envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"),
-    undefined,
-    "auth env is never exposed to the bridge",
-  );
-  assert.equal(envValue(out[0].env, "AGENTA_TOOL_RELAY_DIR"), relayDir);
-}
-
-// callback spec + endpoint but no authorization -> still only public metadata + relay dir.
-{
-  const specs: ResolvedToolSpec[] = [
-    { name: "search", kind: "callback", callRef: "composio.search" },
-  ];
-  const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir);
-  assert.equal(out.length, 1);
-  assert.equal(
-    envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
-    undefined,
-  );
-  assert.equal(
-    envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"),
-    undefined,
-    "no AUTH env when authorization absent",
-  );
-}
-
-// absent kind defaults to callback (back-compat): endpoint still wired when present.
-{
-  const specs: ResolvedToolSpec[] = [{ name: "legacy", callRef: "composio.legacy" }];
-  const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir);
-  assert.equal(out.length, 1, "back-compat (no kind) attaches as a callback tool");
-  assert.equal(
-    envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
-    undefined,
-  );
-}
-
-// mixed code+callback specs + NO endpoint -> still one server (so code works), endpoint omitted.
-{
-  const specs: ResolvedToolSpec[] = [
-    { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" },
-    { name: "search", kind: "callback", callRef: "composio.search" },
-  ];
-  const out = buildToolMcpServers(specs, relayDir);
-  assert.notDeepEqual(out, [], "mixed run with no endpoint must not return []");
-  assert.equal(out.length, 1, "still attaches the server so the code tool works");
-  assert.equal(
-    envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
-    undefined,
-    "endpoint env omitted when missing",
-  );
-  // Both executable specs are advertised, but only as public metadata.
-  assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!), [
-    { name: "adder" },
-    { name: "search" },
-  ]);
-}
-
-// empty specs -> [].
-assert.deepEqual(buildToolMcpServers([], undefined), [], "empty specs -> []");
-
-// client-only specs -> [] (no executable tools; the bridge does not advertise client tools).
-{
-  const specs: ResolvedToolSpec[] = [{ name: "confirm", kind: "client" }];
-  assert.deepEqual(
-    buildToolMcpServers(specs, undefined),
-    [],
-    "client-only -> [] (nothing executable here)",
-  );
-  // Even with an endpoint, client-only stays empty.
-  assert.deepEqual(
-    buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir),
-    [],
-    "client-only -> [] even with an endpoint",
-  );
-}
-
-// client tools alongside an executable one are dropped from AGENTA_TOOL_SPECS, server attaches.
-{
-  const specs: ResolvedToolSpec[] = [
-    { name: "confirm", kind: "client" },
-    { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" },
-  ];
-  const out = buildToolMcpServers(specs, relayDir);
-  assert.equal(out.length, 1, "executable spec attaches the server");
-  const passed: ResolvedToolSpec[] = JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!);
-  assert.deepEqual(
-    passed.map((s) => s.name),
-    ["adder"],
-    "client spec excluded from the executable list passed to the bridge",
-  );
-}
-
-console.log("tool-bridge.test.ts: all assertions passed");
diff --git a/services/agent/test/tool-dispatch.test.ts b/services/agent/test/tool-dispatch.test.ts
deleted file mode 100644
index 8ec779d396..0000000000
--- a/services/agent/test/tool-dispatch.test.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Unit tests for the shared tool-dispatch module (tools/dispatch.ts) and its routing.
- *
- * The kind-dispatch ("branch on spec.kind to execute a resolved tool") used to be duplicated
- * across engines/pi.ts, extensions/agenta.ts, and tools/mcp-server.ts. It now lives once in
- * `runResolvedTool`. These tests cover both the routing into that function and the call-site
- * advertising behavior that stays per-site:
- *  - buildCustomTools (pi.ts) skips `client` specs, builds a tool per `code`/`callback` spec,
- *    and skips a `callback` spec with no callback endpoint.
- *  - runResolvedTool runs a real `code` snippet end-to-end (python) and throws for `client`.
- *
- * No network and no harness: the `code` path shells out to python3 (available locally); the
- * `callback`/relay paths are not exercised here (they need a live /tools/call or a relay dir).
- *
- * Run: pnpm exec tsx test/tool-dispatch.test.ts
- */
-import assert from "node:assert/strict";
-
-import { buildCustomTools } from "../src/engines/pi.ts";
-import { runResolvedTool } from "../src/tools/dispatch.ts";
-import type { ResolvedToolSpec, ToolCallbackContext } from "../src/protocol.ts";
-
-const callback: ToolCallbackContext = { endpoint: "https://agenta.test/tools/call" };
-
-const clientSpec: ResolvedToolSpec = { name: "client_tool", kind: "client" };
-const codeSpec: ResolvedToolSpec = {
-  name: "code_tool",
-  kind: "code",
-  runtime: "python",
-  code: 'def main(**kw):\n    return {"echo": kw}\n',
-};
-const callbackSpec: ResolvedToolSpec = {
-  name: "callback_tool",
-  kind: "callback",
-  callRef: "composio.SOME_ACTION",
-};
-
-// --- buildCustomTools routing -----------------------------------------------
-{
-  const tools = buildCustomTools([clientSpec, codeSpec, callbackSpec], callback);
-  const names = tools.map((t) => t.name);
-
-  // `client` is browser-fulfilled, so it is never registered in-process.
-  assert.ok(!names.includes("client_tool"), "client spec is skipped");
-  // `code` and `callback` each produce exactly one tool with the spec's name.
-  assert.ok(names.includes("code_tool"), "code spec produces a tool");
-  assert.ok(names.includes("callback_tool"), "callback spec produces a tool");
-  assert.equal(tools.length, 2, "only the two executable specs produce tools");
-}
-
-// A `callback` spec with no callback endpoint is skipped (logged), but a sibling `code`
-// spec still registers (code never needs the endpoint).
-{
-  const tools = buildCustomTools([codeSpec, callbackSpec], undefined);
-  const names = tools.map((t) => t.name);
-  assert.ok(names.includes("code_tool"), "code spec still registers without an endpoint");
-  assert.ok(
-    !names.includes("callback_tool"),
-    "callback spec is skipped when no callback endpoint",
-  );
-  assert.equal(tools.length, 1, "only the code spec registers without an endpoint");
-}
-
-// --- runResolvedTool: code executes; client throws --------------------------
-{
-  const text = await runResolvedTool(codeSpec, { greeting: "hi", n: 3 }, {
-    toolCallId: "call-1",
-  });
-  const parsed = JSON.parse(text);
-  assert.deepEqual(
-    parsed,
-    { echo: { greeting: "hi", n: 3 } },
-    "code tool runs the snippet and returns its JSON output containing the input",
-  );
-}
-
-{
-  await assert.rejects(
-    () => runResolvedTool(clientSpec, {}, { toolCallId: "call-2" }),
-    /browser-fulfilled/,
-    "client tool throws (never executed in-sandbox)",
-  );
-}
-
-console.log("tool-dispatch.test.ts: all assertions passed");
diff --git a/services/agent/tests/unit/cli.test.ts b/services/agent/tests/unit/cli.test.ts
new file mode 100644
index 0000000000..2481b6895a
--- /dev/null
+++ b/services/agent/tests/unit/cli.test.ts
@@ -0,0 +1,66 @@
+/**
+ * Unit tests for the stdin/stdout CLI transport via the `runCli(raw, stream, io)` seam.
+ *
+ * Injects a FAKE engine and a collecting `write`, so no stdin/stdout/process.exit mocking is
+ * needed. Covers the one-shot happy path, invalid JSON, a failing result, and the streaming
+ * order (event lines then exactly one terminal result line). No harness, no process exit.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/cli.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import { runCli, type RunAgent } from "../../src/cli.ts";
+
+const okRun: RunAgent = async () => ({ ok: true, output: "hi" });
+
+function collector() {
+  const chunks: string[] = [];
+  return { chunks, write: (s: string) => chunks.push(s), text: () => chunks.join("") };
+}
+
+describe("runCli", () => {
+  it("one-shot: writes the result JSON and returns exit 0", async () => {
+    const out = collector();
+    const code = await runCli(JSON.stringify({ backend: "pi" }), false, { run: okRun, write: out.write });
+    assert.equal(code, 0);
+    assert.deepEqual(JSON.parse(out.text()), { ok: true, output: "hi" });
+  });
+
+  it("invalid JSON: returns exit 1 with an error result", async () => {
+    const out = collector();
+    const code = await runCli("{not json", false, { run: okRun, write: out.write });
+    assert.equal(code, 1);
+    const res = JSON.parse(out.text()) as { ok: boolean; error: string };
+    assert.equal(res.ok, false);
+    assert.match(res.error, /Invalid JSON on stdin/);
+  });
+
+  it("a failing result returns exit 1", async () => {
+    const out = collector();
+    const code = await runCli("{}", false, {
+      run: async () => ({ ok: false, error: "boom" }),
+      write: out.write,
+    });
+    assert.equal(code, 1);
+    assert.equal((JSON.parse(out.text()) as { error: string }).error, "boom");
+  });
+
+  it("stream: event lines then exactly one terminal result line", async () => {
+    const out = collector();
+    const streamRun: RunAgent = async (_req, emit) => {
+      emit?.({ type: "message", text: "a" });
+      emit?.({ type: "message", text: "b" });
+      return { ok: true, output: "ab", events: [{ type: "message", text: "a" }] };
+    };
+    const code = await runCli("{}", true, { run: streamRun, write: out.write });
+    assert.equal(code, 0);
+    const records = out
+      .text()
+      .trim()
+      .split("\n")
+      .map((line) => JSON.parse(line) as { kind: string; result?: { events: unknown[] } });
+    assert.deepEqual(records.map((r) => r.kind), ["event", "event", "result"]);
+    assert.deepEqual(records[2].result!.events, [], "terminal result does not echo events");
+  });
+});
diff --git a/services/agent/tests/unit/code-tool.test.ts b/services/agent/tests/unit/code-tool.test.ts
new file mode 100644
index 0000000000..5a3566614d
--- /dev/null
+++ b/services/agent/tests/unit/code-tool.test.ts
@@ -0,0 +1,89 @@
+/**
+ * Unit test for the code-tool executor (runCodeTool).
+ *
+ * Exercises both runtimes end-to-end through real subprocesses: a python tool, node tools
+ * written as a bare top-level `function main` (the F2 regression) and as an explicit
+ * `module.exports.main`, an async node `main`, the F3 env-isolation guarantee (provider keys
+ * do NOT leak in; declared scoped secrets DO), and the non-zero-exit reject path.
+ *
+ * Needs `python3` and `node` on PATH (both present locally and on ubuntu CI runners).
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/code-tool.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import { runCodeTool } from "../../src/tools/code.ts";
+
+describe("runCodeTool", () => {
+  it("runs a python bare `def main(**kw)`", async () => {
+    const code = 'def main(**kw):\n    return {"sum": kw.get("a", 0) + kw.get("b", 0)}\n';
+    const out = await runCodeTool("python", code, undefined, { a: 2, b: 3 });
+    assert.deepEqual(JSON.parse(out), { sum: 5 }, "python bare main returns the right JSON");
+  });
+
+  it("runs a node bare top-level `function main` (F2 regression)", async () => {
+    const code = "function main(inputs) { return { got: inputs }; }";
+    const out = await runCodeTool("node", code, undefined, { hello: "world" });
+    assert.deepEqual(
+      JSON.parse(out),
+      { got: { hello: "world" } },
+      "node bare function main executes and echoes the input",
+    );
+  });
+
+  it("runs a node explicit `module.exports.main`", async () => {
+    const code = "module.exports.main = function (inputs) { return { via: 'exports', got: inputs }; };";
+    const out = await runCodeTool("node", code, undefined, { x: 1 });
+    assert.deepEqual(
+      JSON.parse(out),
+      { via: "exports", got: { x: 1 } },
+      "node module.exports.main works",
+    );
+  });
+
+  it("runs an async node `main` returning a Promise", async () => {
+    const code =
+      "async function main(inputs) { await new Promise((r) => setTimeout(r, 5)); return { doubled: inputs.n * 2 }; }";
+    const out = await runCodeTool("node", code, undefined, { n: 21 });
+    assert.deepEqual(JSON.parse(out), { doubled: 42 }, "node async main resolves");
+  });
+
+  it("F3: provider keys do NOT leak; scoped secrets DO", async () => {
+    const hadKey = "OPENAI_API_KEY" in process.env;
+    const prevKey = process.env.OPENAI_API_KEY;
+    process.env.OPENAI_API_KEY = "leak-me-xyz";
+    try {
+      // The provider key sits in process.env but must not reach the snippet.
+      const leakCode = "function main() { return { key: process.env.OPENAI_API_KEY ?? 'absent' }; }";
+      const leakOut = await runCodeTool("node", leakCode, undefined, {});
+      assert.deepEqual(
+        JSON.parse(leakOut),
+        { key: "absent" },
+        "F3: OPENAI_API_KEY did NOT leak into the snippet env",
+      );
+
+      // A secret declared on the tool (passed via the scoped `env` arg) must be visible.
+      const scopedCode =
+        "function main() { return { secret: process.env.MY_TOOL_SECRET ?? 'absent' }; }";
+      const scopedOut = await runCodeTool("node", scopedCode, { MY_TOOL_SECRET: "ok" }, {});
+      assert.deepEqual(
+        JSON.parse(scopedOut),
+        { secret: "ok" },
+        "F3: scoped MY_TOOL_SECRET IS visible to the snippet",
+      );
+    } finally {
+      if (hadKey) process.env.OPENAI_API_KEY = prevKey;
+      else delete process.env.OPENAI_API_KEY;
+    }
+  });
+
+  it("rejects when the snippet throws / exits non-zero", async () => {
+    const code = "function main() { throw new Error('boom'); }";
+    await assert.rejects(
+      () => runCodeTool("node", code, undefined, {}),
+      /boom|exited/,
+      "a throwing snippet rejects",
+    );
+  });
+});
diff --git a/services/agent/tests/unit/continuation.test.ts b/services/agent/tests/unit/continuation.test.ts
new file mode 100644
index 0000000000..9d7215ebec
--- /dev/null
+++ b/services/agent/tests/unit/continuation.test.ts
@@ -0,0 +1,72 @@
+/**
+ * Unit tests for the cross-turn HITL continuation substrate.
+ *
+ * Under the cold model the harness rebuilds context from the replayed transcript, and ACP
+ * prompt content blocks cannot carry tool calls/results. So a resolved interaction (an
+ * approved tool that ran, a client-fulfilled tool) must survive into the replay as text.
+ * `messageTranscript` encodes tool turns; `buildTurnText` keeps them in the replayed history.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/continuation.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import { messageTranscript, buildTurnText } from "../../src/engines/rivet.ts";
+import {
+  resolveRunSessionId,
+  type AgentRunRequest,
+  type ContentBlock,
+} from "../../src/protocol.ts";
+
+describe("messageTranscript", () => {
+  it("encodes plain text, content blocks, and tool turns", () => {
+    assert.equal(messageTranscript("hello"), "hello");
+    assert.equal(messageTranscript([{ type: "text", text: "a" }, { type: "text", text: "b" }]), "a\nb");
+    assert.equal(
+      messageTranscript([{ type: "tool_call", toolName: "getWeather", input: { city: "Paris" } }]),
+      '[called getWeather({"city":"Paris"})]',
+    );
+    assert.equal(
+      messageTranscript([{ type: "tool_result", toolName: "getWeather", output: { temp: 24 } }]),
+      '[getWeather returned: {"temp":24}]',
+    );
+    assert.equal(
+      messageTranscript([{ type: "tool_result", toolName: "send", output: "boom", isError: true }]),
+      "[send error: boom]",
+    );
+  });
+});
+
+describe("resolveRunSessionId", () => {
+  it("prefers the platform session id, falling back to the ephemeral one", () => {
+    assert.equal(
+      resolveRunSessionId({ sessionId: "sess_platform" }, "runner-ephemeral"),
+      "sess_platform",
+    );
+    assert.equal(resolveRunSessionId({}, "runner-ephemeral"), "runner-ephemeral");
+  });
+});
+
+describe("buildTurnText", () => {
+  it("keeps a resolved tool turn in the replay", () => {
+    const req: AgentRunRequest = {
+      messages: [
+        { role: "user", content: "weather in Paris?" },
+        {
+          role: "assistant",
+          content: [{ type: "tool_call", toolName: "getWeather", input: { city: "Paris" } } as ContentBlock],
+        },
+        {
+          role: "tool",
+          content: [{ type: "tool_result", toolName: "getWeather", output: { temp: 24 } } as ContentBlock],
+        },
+        { role: "user", content: "and tomorrow?" },
+      ],
+    };
+    const text = buildTurnText(req);
+    assert.ok(text.includes("called getWeather"), "tool call survives replay");
+    assert.ok(text.includes("getWeather returned"), "tool result survives replay");
+    assert.ok(text.includes("and tomorrow?"), "latest user prompt is the live turn");
+    assert.ok(text.startsWith("Conversation so far:"), "transcript header present");
+  });
+});
diff --git a/services/agent/tests/unit/extension-tools.test.ts b/services/agent/tests/unit/extension-tools.test.ts
new file mode 100644
index 0000000000..674dad09a6
--- /dev/null
+++ b/services/agent/tests/unit/extension-tools.test.ts
@@ -0,0 +1,108 @@
+/**
+ * Regression: the Agenta Pi extension registers custom tools from AGENTA_TOOL_PUBLIC_SPECS.
+ *
+ * Guards QA finding F-005 (docs/design/agent-workflows/qa/findings.md): in one build the
+ * extension stopped reading AGENTA_TOOL_PUBLIC_SPECS, so custom tools shipped but the model
+ * never saw them; it improvised with bash and failed. This pins the contract at the source:
+ * given the public-spec env the runner sets (buildPiExtensionEnv in engines/rivet.ts), the
+ * factory calls pi.registerTool once per spec, passes the JSON Schema through, and gives each
+ * tool an execute() that relays to the runner. Without the full tool env it registers nothing.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/extension-tools.test.ts)
+ */
+import { afterEach, describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import factory from "../../src/extensions/agenta.ts";
+
+const TOOL_ENV = [
+  "AGENTA_TOOL_PUBLIC_SPECS",
+  "AGENTA_TOOL_RELAY_DIR",
+  "AGENTA_TRACEPARENT",
+  "AGENTA_OTLP_ENDPOINT",
+  "AGENTA_USAGE_OUT",
+  "AGENTA_CAPTURE_CONTENT",
+];
+
+function fakePi() {
+  const registered: any[] = [];
+  return {
+    registered,
+    registerTool(spec: any) {
+      registered.push(spec);
+    },
+    on() {},
+  };
+}
+
+function clearEnv() {
+  for (const key of TOOL_ENV) delete process.env[key];
+}
+
+afterEach(clearEnv);
+
+describe("agenta extension tool registration", () => {
+  it("registers one tool per public spec, schema passed through", () => {
+    clearEnv();
+    process.env.AGENTA_TOOL_PUBLIC_SPECS = JSON.stringify([
+      {
+        name: "secret_math",
+        description: "qa math",
+        inputSchema: {
+          type: "object",
+          properties: { x: { type: "integer" } },
+          required: ["x"],
+        },
+      },
+      { name: "no_schema_tool", description: "no schema" },
+    ]);
+    process.env.AGENTA_TOOL_RELAY_DIR = "/tmp/agenta-relay-test";
+
+    const pi = fakePi();
+    factory(pi as any);
+
+    assert.equal(pi.registered.length, 2, "registers one tool per public spec");
+    assert.deepEqual(
+      pi.registered.map((t) => t.name),
+      ["secret_math", "no_schema_tool"],
+      "registers each spec by name",
+    );
+
+    const math = pi.registered[0];
+    assert.equal(math.description, "qa math", "carries the description");
+    assert.ok(
+      math.parameters && math.parameters.properties && math.parameters.properties.x,
+      "passes the JSON Schema through to Pi",
+    );
+    assert.equal(typeof math.execute, "function", "each tool has an execute() that relays");
+
+    const noSchema = pi.registered[1];
+    assert.ok(
+      noSchema.parameters,
+      "a spec without inputSchema falls back to a schema, never undefined",
+    );
+  });
+
+  it("is inert without the tool env (the F-005 bug shape: never delivered)", () => {
+    clearEnv();
+    const pi = fakePi();
+    factory(pi as any);
+    assert.equal(
+      pi.registered.length,
+      0,
+      "no tool env => registers nothing (no silent partial state)",
+    );
+  });
+
+  it("does not register when specs are present but the relay dir is missing", () => {
+    clearEnv();
+    process.env.AGENTA_TOOL_PUBLIC_SPECS = JSON.stringify([{ name: "x" }]);
+    const pi = fakePi();
+    factory(pi as any);
+    assert.equal(
+      pi.registered.length,
+      0,
+      "specs without a relay dir do not register (incomplete wiring is not honored)",
+    );
+  });
+});
diff --git a/services/agent/tests/unit/mcp-servers.test.ts b/services/agent/tests/unit/mcp-servers.test.ts
new file mode 100644
index 0000000000..d77e63297b
--- /dev/null
+++ b/services/agent/tests/unit/mcp-servers.test.ts
@@ -0,0 +1,58 @@
+/**
+ * Unit tests for the user-declared MCP server conversion (Agent B's Slice 4, wired in rivet).
+ *
+ * Agent B's `resolve_mcp_servers` emits the McpServerConfig wire shape
+ * ({name,transport,command,args,env,url?,tools?}, env as a Record), pinned in the Python
+ * test_wire_contract. This covers the TS half: converting that to the ACP stdio entry the
+ * session consumes (env as a {name,value} list), skipping remote/http, and not enforcing the
+ * per-server tools allowlist over ACP in v1.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/mcp-servers.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import { toAcpMcpServers } from "../../src/engines/rivet.ts";
+import type { McpServerConfig } from "../../src/protocol.ts";
+
+describe("toAcpMcpServers", () => {
+  it("maps empty input to []", () => {
+    assert.deepEqual(toAcpMcpServers(undefined), [], "undefined -> []");
+    assert.deepEqual(toAcpMcpServers([]), [], "[] -> []");
+  });
+
+  it("converts a stdio server's env Record to an ACP {name,value} list", () => {
+    const servers: McpServerConfig[] = [
+      {
+        name: "github",
+        transport: "stdio",
+        command: "npx",
+        args: ["-y", "@modelcontextprotocol/server-github"],
+        env: { GITHUB_PERSONAL_ACCESS_TOKEN: "ghp_x", LOG_LEVEL: "info" },
+        tools: ["create_issue"], // allowlist not enforced over ACP v1 (logged), server still delivered
+      },
+    ];
+    const out = toAcpMcpServers(servers);
+    assert.equal(out.length, 1);
+    assert.equal(out[0].name, "github");
+    assert.equal(out[0].command, "npx");
+    assert.deepEqual(out[0].args, ["-y", "@modelcontextprotocol/server-github"]);
+    assert.deepEqual(out[0].env, [
+      { name: "GITHUB_PERSONAL_ACCESS_TOKEN", value: "ghp_x" },
+      { name: "LOG_LEVEL", value: "info" },
+    ]);
+  });
+
+  it("skips remote/http and command-less stdio servers", () => {
+    const out = toAcpMcpServers([
+      { name: "remote", transport: "http", url: "https://example.com/mcp" },
+      { name: "broken", transport: "stdio" }, // no command
+    ]);
+    assert.deepEqual(out, [], "http + command-less stdio both skipped");
+  });
+
+  it("defaults missing env / args to empty", () => {
+    const out = toAcpMcpServers([{ name: "fs", transport: "stdio", command: "mcp-fs" }]);
+    assert.deepEqual(out, [{ name: "fs", command: "mcp-fs", args: [], env: [] }]);
+  });
+});
diff --git a/services/agent/tests/unit/responder.test.ts b/services/agent/tests/unit/responder.test.ts
new file mode 100644
index 0000000000..ebe4eb0412
--- /dev/null
+++ b/services/agent/tests/unit/responder.test.ts
@@ -0,0 +1,92 @@
+/**
+ * Unit tests for the interaction responder seam and the otel `emitEvent` hook.
+ *
+ * Covers the responder's behavior parity (it replaces the old inline auto-approve in
+ * rivet.ts) and checks that an out-of-stream event (e.g. an `interaction_request`) routed
+ * through `emitEvent` lands in the batch `events()` log, and also in the live sink when one
+ * is supplied. No harness, no network.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/responder.test.ts)
+ */
+import { afterEach, describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import { createRivetOtel } from "../../src/tracing/otel.ts";
+import type { AgentEvent } from "../../src/protocol.ts";
+import {
+  PolicyResponder,
+  decisionToReply,
+  policyFromRequest,
+} from "../../src/responder.ts";
+
+// Defensive cleanup: policyFromRequest reads this env var; never let it leak past a test
+// (e.g. if an assertion throws mid-test, before the inline delete runs).
+afterEach(() => {
+  delete process.env.AGENTA_RIVET_DENY_PERMISSIONS;
+});
+
+describe("policyFromRequest", () => {
+  it("honors the arg and the env override", () => {
+    delete process.env.AGENTA_RIVET_DENY_PERMISSIONS;
+    assert.equal(policyFromRequest(undefined), "auto");
+    assert.equal(policyFromRequest("auto"), "auto");
+    assert.equal(policyFromRequest("deny"), "deny");
+
+    process.env.AGENTA_RIVET_DENY_PERMISSIONS = "true";
+    assert.equal(policyFromRequest(undefined), "deny", "env forces deny");
+    assert.equal(policyFromRequest("auto"), "deny", "env overrides auto");
+    delete process.env.AGENTA_RIVET_DENY_PERMISSIONS;
+  });
+});
+
+describe("decisionToReply (parity with the old inline mapping)", () => {
+  it("maps allow/deny onto the available replies", () => {
+    assert.equal(decisionToReply("allow", ["always", "once", "reject"]), "always");
+    assert.equal(decisionToReply("allow", ["once", "reject"]), "once");
+    assert.equal(decisionToReply("allow", []), "once", "allow falls back to once");
+    assert.equal(decisionToReply("deny", ["always", "once", "reject"]), "reject");
+    assert.equal(decisionToReply("deny", []), "reject", "deny falls back to reject");
+  });
+});
+
+describe("PolicyResponder", () => {
+  it("auto allows and deny denies", async () => {
+    const auto = new PolicyResponder("auto");
+    const deny = new PolicyResponder("deny");
+    const req = { id: "p1", availableReplies: ["once", "reject"] };
+    assert.equal(await auto.onPermission(req), "allow");
+    assert.equal(await deny.onPermission(req), "deny");
+  });
+});
+
+describe("emitEvent", () => {
+  it("streaming path: flushes to the live sink and the batch log", () => {
+    const emitted: AgentEvent[] = [];
+    const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) });
+    run.start({ prompt: "hi" });
+    const interaction: AgentEvent = {
+      type: "interaction_request",
+      id: "p1",
+      kind: "permission",
+      payload: { availableReplies: ["once", "reject"] },
+    };
+    run.emitEvent(interaction);
+
+    const live = emitted.find((e) => e.type === "interaction_request");
+    assert.ok(live, "interaction_request flushed to the live sink");
+    assert.equal((live as any).id, "p1");
+    assert.ok(
+      run.events().some((e) => e.type === "interaction_request"),
+      "interaction_request also recorded in the batch log",
+    );
+  });
+
+  it("one-shot path: records in the batch log only", () => {
+    const run = createRivetOtel({ harness: "claude", model: "anthropic/x" });
+    run.start({ prompt: "hi" });
+    run.emitEvent({ type: "data", name: "weather", data: { temp: 24 } });
+    const ev = run.events().find((e) => e.type === "data");
+    assert.ok(ev, "data event recorded with no live sink");
+    assert.equal((ev as any).name, "weather");
+  });
+});
diff --git a/services/agent/tests/unit/server.test.ts b/services/agent/tests/unit/server.test.ts
new file mode 100644
index 0000000000..badf61db2c
--- /dev/null
+++ b/services/agent/tests/unit/server.test.ts
@@ -0,0 +1,109 @@
+/**
+ * Unit tests for the HTTP transport via the `createAgentServer(run)` seam.
+ *
+ * Starts a real server on an ephemeral port with a FAKE engine (no Pi/Claude/rivet) and makes
+ * real requests. Covers /health, the /run happy path, invalid JSON (400), a failing result
+ * (500), and the NDJSON streaming order (events first, then exactly one terminal result).
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/server.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+import type { AddressInfo } from "node:net";
+
+import { createAgentServer, type RunAgent } from "../../src/server.ts";
+
+async function listen(run: RunAgent): Promise<{ url: string; close: () => Promise<void> }> {
+  const server = createAgentServer(run);
+  await new Promise<void>((resolve) => server.listen(0, "127.0.0.1", resolve));
+  const { port } = server.address() as AddressInfo;
+  return {
+    url: `http://127.0.0.1:${port}`,
+    close: () => new Promise<void>((resolve) => server.close(() => resolve())),
+  };
+}
+
+const okRun: RunAgent = async () => ({ ok: true, output: "hi", events: [] });
+
+describe("createAgentServer", () => {
+  it("GET /health returns runner identity", async () => {
+    const s = await listen(okRun);
+    try {
+      const res = await fetch(`${s.url}/health`);
+      assert.equal(res.status, 200);
+      const body = (await res.json()) as Record<string, unknown>;
+      assert.equal(body.status, "ok");
+      assert.equal(typeof body.runner, "string");
+      assert.equal(typeof body.protocol, "number");
+      assert.ok(Array.isArray(body.engines) && (body.engines as unknown[]).includes("pi"));
+      assert.ok(Array.isArray(body.harnesses));
+    } finally {
+      await s.close();
+    }
+  });
+
+  it("POST /run returns the engine result (200)", async () => {
+    const s = await listen(okRun);
+    try {
+      const res = await fetch(`${s.url}/run`, { method: "POST", body: JSON.stringify({ backend: "pi" }) });
+      assert.equal(res.status, 200);
+      const body = (await res.json()) as { ok: boolean; output: string };
+      assert.equal(body.ok, true);
+      assert.equal(body.output, "hi");
+    } finally {
+      await s.close();
+    }
+  });
+
+  it("POST /run with invalid JSON returns 400", async () => {
+    const s = await listen(okRun);
+    try {
+      const res = await fetch(`${s.url}/run`, { method: "POST", body: "{not json" });
+      assert.equal(res.status, 400);
+      const body = (await res.json()) as { ok: boolean; error: string };
+      assert.equal(body.ok, false);
+      assert.match(body.error, /Invalid JSON/);
+    } finally {
+      await s.close();
+    }
+  });
+
+  it("a failing result returns 500", async () => {
+    const failRun: RunAgent = async () => ({ ok: false, error: "boom" });
+    const s = await listen(failRun);
+    try {
+      const res = await fetch(`${s.url}/run`, { method: "POST", body: "{}" });
+      assert.equal(res.status, 500);
+      const body = (await res.json()) as { ok: boolean; error: string };
+      assert.equal(body.ok, false);
+      assert.equal(body.error, "boom");
+    } finally {
+      await s.close();
+    }
+  });
+
+  it("NDJSON stream: events first, then exactly one terminal result with no echoed events", async () => {
+    const streamRun: RunAgent = async (_req, emit) => {
+      emit?.({ type: "message", text: "a" });
+      emit?.({ type: "message", text: "b" });
+      return { ok: true, output: "ab", events: [{ type: "message", text: "a" }] };
+    };
+    const s = await listen(streamRun);
+    try {
+      const res = await fetch(`${s.url}/run`, {
+        method: "POST",
+        headers: { accept: "application/x-ndjson" },
+        body: "{}",
+      });
+      assert.equal(res.status, 200);
+      const records = (await res.text())
+        .trim()
+        .split("\n")
+        .map((line) => JSON.parse(line) as { kind: string; result?: { events: unknown[] } });
+      assert.deepEqual(records.map((r) => r.kind), ["event", "event", "result"]);
+      assert.deepEqual(records[2].result!.events, [], "terminal result does not echo events");
+    } finally {
+      await s.close();
+    }
+  });
+});
diff --git a/services/agent/tests/unit/stream-events.test.ts b/services/agent/tests/unit/stream-events.test.ts
new file mode 100644
index 0000000000..ff9bd1437b
--- /dev/null
+++ b/services/agent/tests/unit/stream-events.test.ts
@@ -0,0 +1,146 @@
+/**
+ * Unit test for the createRivetOtel delta/lifecycle state machine.
+ *
+ * Drives `handleUpdate` with a hand-built ACP `session/update` sequence (Claude-style
+ * cumulative text snapshots, a tool call between two text runs, a reasoning run) and asserts
+ * the streaming and one-shot event shapes. No harness, no network: spans are built offline
+ * and never flushed.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/stream-events.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import { createRivetOtel } from "../../src/tracing/otel.ts";
+import type { AgentEvent } from "../../src/protocol.ts";
+
+const textChunk = (text: string) => ({
+  sessionUpdate: "agent_message_chunk", // message text; drive() sends both deltas and snapshots
+  content: { type: "text", text },
+});
+const thoughtChunk = (text: string) => ({
+  sessionUpdate: "agent_thought_chunk", // reasoning text
+  content: { type: "text", text },
+});
+const toolCall = (id: string, title: string, rawInput: unknown) => ({
+  sessionUpdate: "tool_call", // announces a tool call
+  toolCallId: id,
+  title,
+  rawInput,
+});
+const toolDone = (id: string, text: string) => ({
+  sessionUpdate: "tool_call_update", // follow-up for an earlier tool_call with the same id
+  toolCallId: id,
+  status: "completed",
+  content: [{ content: { type: "text", text } }], // the tool's text output
+});
+const usage = () => ({ sessionUpdate: "usage_update", used: 100, cost: { amount: 0.01 } });
+
+// One ACP sequence drives every scenario: two text runs around a tool call, then reasoning, then usage.
+function drive(run: ReturnType<typeof createRivetOtel>): void {
+  run.start({ prompt: "weather in Paris?" });
+  run.handleUpdate(textChunk("Hello ")); // pure delta
+  run.handleUpdate(textChunk("Hello world")); // cumulative snapshot (Claude-style)
+  run.handleUpdate(toolCall("call_1", "getWeather", { city: "Paris" })); // splits the text into two runs
+  run.handleUpdate(toolDone("call_1", "sunny")); // same toolCallId, status completed
+  run.handleUpdate(textChunk("Hello world It is sunny.")); // resumes after the tool
+  run.handleUpdate(thoughtChunk("thinking...")); // the only reasoning chunk
+  run.handleUpdate(usage()); // stream-reported usage
+}
+
+const types = (events: AgentEvent[]) => events.map(({ type }) => type); // event types, in array order
+const ofType = <T extends AgentEvent["type"]>(events: AgentEvent[], t: T) =>
+  events.filter(({ type }) => type === t) as Extract<AgentEvent, { type: T }>[]; // narrowed to variant T
+
+describe("createRivetOtel state machine", () => {
+  it("scenario 1: streaming (emit set) yields pure deltas and balanced lifecycle", () => {
+    const emitted: AgentEvent[] = [];
+    const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) });
+    drive(run);
+    const finalText = run.finish();
+
+    // No coalesced text events on the streaming path.
+    assert.equal(ofType(emitted, "message").length, 0, "no coalesced message when streaming");
+    assert.equal(ofType(emitted, "thought").length, 0, "no coalesced thought when streaming");
+
+    // Exactly one terminal done.
+    assert.equal(ofType(emitted, "done").length, 1, "exactly one done");
+
+    // Two text blocks (split by the tool call), one reasoning block, balanced start/end.
+    const mStart = ofType(emitted, "message_start");
+    const mEnd = ofType(emitted, "message_end");
+    assert.equal(mStart.length, 2, "two message_start");
+    assert.equal(mEnd.length, 2, "two message_end");
+    assert.deepEqual(mStart.map((e) => e.id), ["msg-0", "msg-1"], "stable monotonic text ids");
+    const rStart = ofType(emitted, "reasoning_start");
+    const rEnd = ofType(emitted, "reasoning_end");
+    assert.equal(rStart.length, 1, "one reasoning_start");
+    assert.equal(rEnd.length, 1, "one reasoning_end");
+
+    // Deltas are pure and reconstruct the full text, with no overlap/repeat.
+    const text = ofType(emitted, "message_delta").map((e) => e.delta).join("");
+    assert.equal(text, "Hello world It is sunny.", "concatenated deltas == full text");
+    assert.equal(text, finalText, "deltas match finish() output");
+    const reasoning = ofType(emitted, "reasoning_delta").map((e) => e.delta).join("");
+    assert.equal(reasoning, "thinking...", "concatenated reasoning deltas");
+
+    // Ordering invariant: each block opens with *_start and closes with *_end; the tool result
+    // must exist (indexOf is -1 when absent, which would pass `<` vacuously) and precede text block 2.
+    const seq = types(emitted);
+    assert.ok(seq.indexOf("message_end") < seq.indexOf("tool_call"), "first text block closes before the tool call");
+    const toolResultAt = seq.indexOf("tool_result");
+    assert.ok(toolResultAt >= 0 && toolResultAt < seq.lastIndexOf("message_start"), "tool result precedes the second text block");
+    for (const id of ["msg-0", "msg-1", "reason-2"]) {
+      const blockTypes = emitted.filter((e) => (e as any).id === id).map((e) => e.type);
+      assert.ok(blockTypes.length > 0, `${id} emitted at least one event`);
+      assert.ok(blockTypes[0].endsWith("_start"), `${id} starts with *_start`);
+      assert.ok(blockTypes[blockTypes.length - 1].endsWith("_end"), `${id} ends with *_end`);
+    }
+  });
+
+  it("scenario 2: one-shot (no emit) coalesces text/thought and keeps structured events", () => {
+    const run = createRivetOtel({ harness: "claude", model: "anthropic/x" }); // no emit => one-shot path
+    drive(run);
+    const finalText = run.finish();
+    const events = run.events(); // read back what the run recorded
+
+    // Both text runs collapse into one message (and one thought); no delta lifecycle events.
+    const messages = ofType(events, "message");
+    assert.equal(messages.length, 1, "one coalesced message");
+    assert.equal(messages[0].text, "Hello world It is sunny.", "coalesced text == final");
+    assert.equal(messages[0].text, finalText);
+    assert.equal(ofType(events, "thought").length, 1, "one coalesced thought");
+    for (const t of ["message_start", "message_delta", "message_end", "reasoning_start", "reasoning_delta", "reasoning_end"]) {
+      assert.equal(events.filter((e) => e.type === t).length, 0, `no ${t} on the one-shot path`);
+    }
+
+    // The structured tool/usage events are still present, with exactly one done.
+    assert.equal(ofType(events, "tool_call").length, 1, "tool_call present");
+    assert.equal(ofType(events, "tool_result").length, 1, "tool_result present");
+    assert.equal(ofType(events, "usage").length, 1, "usage present");
+    assert.equal(ofType(events, "done").length, 1, "exactly one done");
+  });
+
+  it("scenario 3: span-less mode still records ACP events and final usage", () => {
+    const run = createRivetOtel({ harness: "pi", model: "openai-codex/x", emitSpans: false }); // span-less, no emit
+    drive(run);
+    run.setUsage({ input: 4, output: 6, total: 10, cost: 0.02 }); // set after drive()'s usage_update
+    const finalText = run.finish();
+    const events = run.events();
+
+    assert.equal(finalText, "Hello world It is sunny.");
+    assert.equal(ofType(events, "message").length, 1, "message present without spans");
+    assert.equal(ofType(events, "thought").length, 1, "thought present without spans");
+    assert.equal(ofType(events, "tool_call").length, 1, "tool_call present without spans");
+    assert.equal(ofType(events, "tool_result").length, 1, "tool_result present without spans");
+    const usageEvents = ofType(events, "usage");
+    assert.equal(usageEvents.length, 1, "usage present without spans"); // one event despite two usage sources
+    assert.deepEqual(
+      usageEvents[0],
+      { type: "usage", input: 4, output: 6, total: 10, cost: 0.02 },
+      "final usage replaces stream-only usage before done",
+    );
+    assert.equal(ofType(events, "done").length, 1, "exactly one done without spans");
+    assert.ok(types(events).indexOf("usage") < types(events).indexOf("done"), "usage precedes done");
+  });
+});
diff --git a/services/agent/tests/unit/tool-bridge.test.ts b/services/agent/tests/unit/tool-bridge.test.ts
new file mode 100644
index 0000000000..fcd7eb6a13
--- /dev/null
+++ b/services/agent/tests/unit/tool-bridge.test.ts
@@ -0,0 +1,157 @@
+/**
+ * Unit tests for buildToolMcpServers (the tool MCP bridge attachment decision).
+ *
+ * Regression cover for F4: attachment must be decided per tool kind, not on the callback
+ * endpoint alone. A `code` tool runs locally in mcp-server.ts and needs no endpoint, so a run
+ * whose tools are all `code` must still attach the `agenta-tools` server. Only `callback`-kind
+ * tools require AGENTA_TOOL_CALLBACK_ENDPOINT; missing it must degrade those tools, not drop the
+ * whole server. `client` tools are browser-fulfilled and never justify attaching the bridge.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/tool-bridge.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import { buildToolMcpServers } from "../../src/tools/mcp-bridge.ts";
+import type { ResolvedToolSpec, ToolCallbackContext } from "../../src/protocol.ts";
+
+/** Return the value of the first ACP {name,value} entry called `name`, or undefined if none. */
+function envValue(env: { name: string; value: string }[], name: string): string | undefined {
+  for (const entry of env) {
+    if (entry.name === name) return entry.value;
+  }
+  return undefined;
+}
+
+const relayDir = "/tmp/agenta-tools";
+
+describe("buildToolMcpServers", () => {
+  it("attaches the server for a code-only run, with public specs and relay dir", () => {
+    const specs: ResolvedToolSpec[] = [
+      {
+        name: "adder",
+        description: "Add numbers",
+        kind: "code",
+        runtime: "python",
+        code: "def main(**k): return 1",
+        env: { PRIVATE: "secret" }, // private executor field; must not show up in the public specs
+      },
+    ];
+    const out = buildToolMcpServers(specs, relayDir); // NOTE(review): relayDir is arg 2 here but arg 3 when a callback is given — confirm both forms
+    assert.equal(out.length, 1, "code-only run still attaches the server");
+    assert.equal(out[0].name, "agenta-tools");
+    assert.ok(
+      envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS") !== undefined,
+      "AGENTA_TOOL_PUBLIC_SPECS is set",
+    );
+    assert.equal(
+      envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
+      undefined,
+      "no endpoint env for code-only run",
+    );
+    assert.equal(envValue(out[0].env, "AGENTA_TOOL_RELAY_DIR"), relayDir);
+    assert.equal(envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"), undefined);
+    assert.equal(envValue(out[0].env, "AGENTA_TOOL_SPECS"), undefined);
+    // Only public metadata round-trips; private executor fields stay runner-side.
+    assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!), [
+      { name: "adder", description: "Add numbers" },
+    ]);
+  });
+
+  it("never exposes endpoint/auth env to the bridge child (callback + full callback)", () => {
+    const specs: ResolvedToolSpec[] = [
+      { name: "search", kind: "callback", callRef: "composio.search" },
+    ];
+    const callback: ToolCallbackContext = {
+      endpoint: "https://agenta.example/tools/call",
+      authorization: "Bearer tok",
+    };
+    const out = buildToolMcpServers(specs, callback, relayDir);
+    assert.equal(out.length, 1);
+    assert.equal(
+      envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
+      undefined,
+      "endpoint env is never exposed to the bridge",
+    );
+    assert.equal(
+      envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"),
+      undefined,
+      "auth env is never exposed to the bridge",
+    );
+    assert.equal(envValue(out[0].env, "AGENTA_TOOL_RELAY_DIR"), relayDir);
+  });
+
+  it("omits AUTH env when authorization is absent (endpoint but no auth)", () => {
+    const specs: ResolvedToolSpec[] = [
+      { name: "search", kind: "callback", callRef: "composio.search" },
+    ];
+    const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir);
+    assert.equal(out.length, 1);
+    assert.equal(envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"), undefined);
+    assert.equal(
+      envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"),
+      undefined,
+      "no AUTH env when authorization absent",
+    );
+  });
+
+  it("treats an absent kind as callback (back-compat)", () => {
+    const specs: ResolvedToolSpec[] = [{ name: "legacy", callRef: "composio.legacy" }]; // kind omitted
+    const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir);
+    assert.equal(out.length, 1, "back-compat (no kind) attaches as a callback tool");
+    assert.equal(envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"), undefined);
+  });
+
+  it("attaches one server for a mixed code+callback run with no endpoint", () => {
+    const specs: ResolvedToolSpec[] = [
+      { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" },
+      { name: "search", kind: "callback", callRef: "composio.search" },
+    ];
+    const out = buildToolMcpServers(specs, relayDir); // no callback context supplied
+    assert.notDeepEqual(out, [], "mixed run with no endpoint must not return []");
+    assert.equal(out.length, 1, "still attaches the server so the code tool works");
+    assert.equal(
+      envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
+      undefined,
+      "endpoint env omitted when missing",
+    );
+    // Both executable specs are advertised, but only as public metadata.
+    assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!), [
+      { name: "adder" },
+      { name: "search" },
+    ]);
+  });
+
+  it("returns [] for empty specs", () => {
+    assert.deepEqual(buildToolMcpServers([], undefined), [], "empty specs -> []");
+  });
+
+  it("returns [] for client-only specs (nothing executable, even with an endpoint)", () => {
+    const specs: ResolvedToolSpec[] = [{ name: "confirm", kind: "client" }];
+    assert.deepEqual(
+      buildToolMcpServers(specs, undefined),
+      [],
+      "client-only -> [] (nothing executable here)",
+    );
+    assert.deepEqual(
+      buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir),
+      [],
+      "client-only -> [] even with an endpoint",
+    );
+  });
+
+  it("drops client tools from the advertised list but still attaches for an executable sibling", () => {
+    const specs: ResolvedToolSpec[] = [
+      { name: "confirm", kind: "client" },
+      { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" },
+    ];
+    const out = buildToolMcpServers(specs, relayDir);
+    assert.equal(out.length, 1, "executable spec attaches the server");
+    const passed: ResolvedToolSpec[] = JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!);
+    assert.deepEqual(
+      passed.map((s) => s.name), // compare names only
+      ["adder"],
+      "client spec excluded from the executable list passed to the bridge",
+    );
+  });
+});
diff --git a/services/agent/tests/unit/tool-dispatch.test.ts b/services/agent/tests/unit/tool-dispatch.test.ts
new file mode 100644
index 0000000000..af27dc991f
--- /dev/null
+++ b/services/agent/tests/unit/tool-dispatch.test.ts
@@ -0,0 +1,123 @@
+/**
+ * Unit tests for the shared tool-dispatch module (tools/dispatch.ts) and its routing.
+ *
+ * The kind-dispatch ("branch on spec.kind to execute a resolved tool") used to be duplicated
+ * across engines/pi.ts, extensions/agenta.ts, and tools/mcp-server.ts. It now lives once in
+ * `runResolvedTool`. These tests cover both the routing into that function and the call-site
+ * advertising behavior that stays per-site:
+ *  - buildCustomTools (pi.ts) skips `client` specs, builds a tool per `code`/`callback` spec,
+ *    and skips a `callback` spec with no callback endpoint.
+ *  - runResolvedTool runs a real `code` snippet end-to-end (python) and throws for `client`.
+ *
+ * No network and no harness: the `code` path shells out to python3 (available locally); the
+ * `callback` path is not exercised (it needs a live /tools/call); the relay path uses a temp dir.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/tool-dispatch.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import { buildCustomTools } from "../../src/engines/pi.ts";
+import { relayToolCall, runResolvedTool } from "../../src/tools/dispatch.ts";
+import { RELAY_RES_SUFFIX, sanitizeRelayId } from "../../src/tools/relay.ts";
+import type { ResolvedToolSpec, ToolCallbackContext } from "../../src/protocol.ts";
+
+const callback: ToolCallbackContext = { endpoint: "https://agenta.test/tools/call" }; // endpoint only, no authorization
+
+const clientSpec: ResolvedToolSpec = { name: "client_tool", kind: "client" };
+const codeSpec: ResolvedToolSpec = {
+  name: "code_tool",
+  kind: "code",
+  runtime: "python",
+  code: 'def main(**kw):\n    return {"echo": kw}\n', // returns its keyword arguments under "echo"
+};
+const callbackSpec: ResolvedToolSpec = {
+  name: "callback_tool",
+  kind: "callback",
+  callRef: "composio.SOME_ACTION",
+};
+
+describe("buildCustomTools routing", () => {
+  it("skips client specs and builds one tool per code/callback spec", () => {
+    const tools = buildCustomTools([clientSpec, codeSpec, callbackSpec], callback); // callback context has an endpoint
+    const names = tools.map((t) => t.name);
+
+    // `client` is browser-fulfilled, so it is never registered in-process.
+    assert.ok(!names.includes("client_tool"), "client spec is skipped");
+    // `code` and `callback` each produce exactly one tool with the spec's name.
+    assert.ok(names.includes("code_tool"), "code spec produces a tool");
+    assert.ok(names.includes("callback_tool"), "callback spec produces a tool");
+    assert.equal(tools.length, 2, "only the two executable specs produce tools"); // also rules out duplicates
+  });
+
+  it("skips a callback spec with no endpoint but keeps a sibling code spec", () => {
+    const tools = buildCustomTools([codeSpec, callbackSpec], undefined); // undefined = no callback context
+    const names = tools.map((t) => t.name);
+    assert.ok(names.includes("code_tool"), "code spec still registers without an endpoint");
+    assert.ok(
+      !names.includes("callback_tool"),
+      "callback spec is skipped when no callback endpoint",
+    );
+    assert.equal(tools.length, 1, "only the code spec registers without an endpoint");
+  });
+});
+
+describe("runResolvedTool", () => {
+  it("runs a code spec end-to-end (python)", async () => { // shells out to python3, which must be installed
+    const text = await runResolvedTool(codeSpec, { greeting: "hi", n: 3 }, {
+      toolCallId: "call-1",
+    });
+    const parsed = JSON.parse(text); // the tool's result arrives as JSON text
+    assert.deepEqual(
+      parsed,
+      { echo: { greeting: "hi", n: 3 } },
+      "code tool runs the snippet and returns its JSON output containing the input",
+    );
+  });
+
+  it("throws for a client spec (never executed in-sandbox)", async () => {
+    await assert.rejects(
+      () => runResolvedTool(clientSpec, {}, { toolCallId: "call-2" }),
+      /browser-fulfilled/, // matched against the rejection's message
+      "client tool throws (never executed in-sandbox)",
+    );
+  });
+});
+
+// Directly exercises the Daytona file-relay path (the code site of the fixed `callRef` bug):
+// pre-write the response file the runner watches for, then call relayToolCall and read it back.
+describe("relayToolCall (Daytona file relay)", () => {
+  it("returns the relayed text when the response is ok", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "agenta-relay-test-")); // fresh temp relay dir per test
+    try {
+      const toolCallId = "call-ok";
+      const resPath = join(dir, sanitizeRelayId(toolCallId) + RELAY_RES_SUFFIX);
+      writeFileSync(resPath, JSON.stringify({ ok: true, text: "relayed-ok" })); // response is on disk before the call
+      const out = await relayToolCall(dir, "myTool", toolCallId, { a: 1 });
+      assert.equal(out, "relayed-ok");
+    } finally {
+      rmSync(dir, { recursive: true, force: true }); // remove the temp dir whether or not the test passed
+    }
+  });
+
+  it("reports the tool name on an empty relay error (regression for the callRef bug)", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "agenta-relay-test-"));
+    try {
+      const toolCallId = "call-err";
+      const resPath = join(dir, sanitizeRelayId(toolCallId) + RELAY_RES_SUFFIX);
+      // ok:false with an empty error string forces the fallback message, which referenced the
+      // undefined `callRef` before the fix and would have thrown a ReferenceError instead.
+      writeFileSync(resPath, JSON.stringify({ ok: false, error: "" }));
+      await assert.rejects(
+        () => relayToolCall(dir, "myTool", toolCallId, {}),
+        /tool relay failed for myTool/, // a ReferenceError's message would not match this
+        "the error message uses toolName, not an undefined callRef",
+      );
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
diff --git a/services/agent/tsconfig.json b/services/agent/tsconfig.json
index b8314675f3..be7c8f733b 100644
--- a/services/agent/tsconfig.json
+++ b/services/agent/tsconfig.json
@@ -12,5 +12,5 @@
     "resolveJsonModule": true,
     "allowImportingTsExtensions": true
   },
-  "include": ["src/**/*.ts"]
+  "include": ["src/**/*.ts", "tests/**/*.ts", "vitest.config.ts"]
 }
diff --git a/services/agent/vitest.config.ts b/services/agent/vitest.config.ts
new file mode 100644
index 0000000000..f11ad12029
--- /dev/null
+++ b/services/agent/vitest.config.ts
@@ -0,0 +1,20 @@
+import { defineConfig } from "vitest/config";
+
+// Mirrors the web/packages/* convention: node env, junit for CI publishing, v8 coverage
+// over src/. Unit tests live in tests/unit/**; the runner code stays in src/.
+export default defineConfig({
+  test: {
+    include: ["tests/unit/**/*.test.ts"], // only *.test.ts files under tests/unit are collected
+    environment: "node",
+    reporters: ["default", "junit"], // console output plus a JUnit XML file
+    outputFile: {
+      junit: "./test-results/junit.xml", // path the junit reporter writes to
+    },
+    coverage: {
+      provider: "v8",
+      include: ["src/**/*.ts"], // measure src/ only, not tests or config
+      reporter: ["text", "lcov", "json-summary"],
+      reportsDirectory: "./coverage",
+    },
+  },
+});

From 6eb063c0e07b2f32111afe889a4d5a35a6d60c8e Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Sun, 21 Jun 2026 01:14:51 +0200
Subject: [PATCH 2/2] Update .github/workflows/12-check-unit-tests.yml

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 .github/workflows/12-check-unit-tests.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/12-check-unit-tests.yml b/.github/workflows/12-check-unit-tests.yml
index c8bc699e65..6f066a9817 100644
--- a/.github/workflows/12-check-unit-tests.yml
+++ b/.github/workflows/12-check-unit-tests.yml
@@ -319,6 +319,8 @@ jobs:
       AGENTA_LICENSE: oss
     steps:
       - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
 
       - name: Skip when package selection excludes services
         if: github.event_name == 'workflow_dispatch' && !contains(fromJSON('["all","services-only"]'), inputs.packages)