From 04b7c7dba0733f12da68aef4579ac31ee8a7c89e Mon Sep 17 00:00:00 2001 From: Kurt Stohrer Date: Tue, 12 May 2026 16:22:34 -0400 Subject: [PATCH] Add agent-loop e2e suite for style_update, a11y_fix, error_fix (ANN-6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stands up the agent round-trip plumbing as a measurable, CI-blocking artifact for the public demo and design-partner pitch. Each task type drives a full lifecycle on react-workflows (React+Vite) and vue-data-lab (Vue+Vite): - Capture the AgentLoopTarget component + tracer stylesheet - Seed a deterministic task shape via the per-MFE API - Open the host shell, exercise the task panel / a11y scan / error monitor as appropriate - Run a rule-based simulator that follows the same MCP-CLI sequence (annotask task / update-task) a real coding agent would - Verify the iframe DOM, axe rescan, or console stream reflects the fix after Vite HMR - Restore the captured files and emit a per-run JSON metric The simulator is intentionally rule-based for v1 — see docs/agent-loop-evals.md for the schema, the caveats around what is and isn't measured today, and where the LLM apply step plugs in for v2. Per-test metrics land under playgrounds/stress-test/e2e/annotask/reports/agent-loop/ and are uploaded as a CI artifact. A focused playwright config (agent-loop.config.ts) only spins up host + the two target MFEs so the new CI job stays under the broader stress-cluster cost. The existing pnpm test:e2e:stress:annotask script still picks the tests up via its directory filter. 
Co-Authored-By: Paperclip --- .github/workflows/ci.yml | 30 ++++ docs/agent-loop-evals.md | 137 ++++++++++++++++ package.json | 1 + .../src/AgentLoopTarget.tsx | 46 ++++++ .../src/agent-loop-target.css | 8 + .../apps/mfe-react-workflows/src/main.tsx | 12 ++ .../mfe-vue-data-lab/src/AgentLoopTarget.vue | 38 +++++ .../src/agent-loop-target.css | 6 + .../apps/mfe-vue-data-lab/src/main.ts | 8 + .../stress-test/e2e/agent-loop.config.ts | 39 +++++ .../e2e/annotask/agent-loop/a11y-fix.spec.ts | 144 ++++++++++++++++ .../e2e/annotask/agent-loop/error-fix.spec.ts | 155 ++++++++++++++++++ .../annotask/agent-loop/style-update.spec.ts | 130 +++++++++++++++ .../e2e/annotask/helpers/agent-loop/cli.ts | 61 +++++++ .../annotask/helpers/agent-loop/metrics.ts | 66 ++++++++ .../annotask/helpers/agent-loop/simulator.ts | 150 +++++++++++++++++ .../annotask/helpers/agent-loop/targets.ts | 71 ++++++++ 17 files changed, 1102 insertions(+) create mode 100644 docs/agent-loop-evals.md create mode 100644 playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx create mode 100644 playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css create mode 100644 playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue create mode 100644 playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css create mode 100644 playgrounds/stress-test/e2e/agent-loop.config.ts create mode 100644 playgrounds/stress-test/e2e/annotask/agent-loop/a11y-fix.spec.ts create mode 100644 playgrounds/stress-test/e2e/annotask/agent-loop/error-fix.spec.ts create mode 100644 playgrounds/stress-test/e2e/annotask/agent-loop/style-update.spec.ts create mode 100644 playgrounds/stress-test/e2e/annotask/helpers/agent-loop/cli.ts create mode 100644 playgrounds/stress-test/e2e/annotask/helpers/agent-loop/metrics.ts create mode 100644 playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts create mode 100644 
playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f7d338a..374fada 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,3 +58,33 @@ jobs: - run: npx playwright install --with-deps chromium - run: pnpm test:e2e --project=${{ matrix.project }} + + agent-loop: + runs-on: ubuntu-latest + needs: build-and-test + steps: + - uses: actions/checkout@v4 + + - uses: pnpm/action-setup@v4 + with: + version: 10 + + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: pnpm + + - run: pnpm install --frozen-lockfile + + - run: pnpm build + + - run: npx playwright install --with-deps chromium + + - run: pnpm test:e2e:stress:annotask:agent-loop + + - uses: actions/upload-artifact@v4 + if: always() + with: + name: agent-loop-metrics + path: playgrounds/stress-test/e2e/annotask/reports/agent-loop/ + if-no-files-found: warn diff --git a/docs/agent-loop-evals.md b/docs/agent-loop-evals.md new file mode 100644 index 0000000..a186e77 --- /dev/null +++ b/docs/agent-loop-evals.md @@ -0,0 +1,137 @@ +# Agent-loop evaluation harness + +The agent-loop e2e suite measures the Annotask round-trip — user +annotates → task lands in MCP → coding agent applies the change → +re-render verifies the fix — for the three highest-leverage task types +on the stress-test playground. + +It is the credibility artifact behind the public demo and the +design-partner pitch deck. The numbers it emits are how we'll know +whether shipping the next task type is helping or regressing the loop. + +> **v1 scope.** The simulator that stands in for the coding agent is +> deterministic and rule-based — not LLM-driven. The harness is here +> to measure *plumbing reliability* (does the task land, do the MCP +> tools work, does HMR pick the fix up, do metrics persist) so we can +> ship the public demo without a paid-LLM dependency. 
The follow-up +> ticket on **agent-apply quality** (tracked under +> [ANN-1](/ANN/issues/ANN-1) child issues) is where the real LLM gets +> wired into this same harness. + +## What each test proves + +| Task type | Test surface | Round-trip assertion | +| ------------- | ----------------------------------------------------------------------- | ----------------------------------------------------------------------------- | +| `style_update`| Tracer stylesheet on a known `data-agent-loop-target` element. | Iframe `getComputedStyle().color` flips after Vite HMR. | +| `a11y_fix` | `<img>` in the test-only target component with `alt` attribute removed. | `axe-core` rescan reports zero `image-alt` violations after the fix. | +| `error_fix` | `console.error()` injected into the target component. | Console listener sees zero tracer errors after the fix lands. | + +All three tests run on both **React+Vite** (`react-workflows`, port +4210) and **Vue+Vite** (`vue-data-lab`, port 4220) MFEs. Adding a new +framework target is a single entry in +`playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts`. + +## How the simulator stands in for an agent + +The agent simulator +(`playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts`) +calls the same `annotask` CLI flags a real coding agent would (`--mcp`, +`--server=…`) so we exercise the MCP-shaped tool surface end-to-end: + +1. `annotask task <id> --mcp` — hydrate full task detail +2. `annotask update-task <id> --status=in_progress --mcp` — lock it +3. **Apply step (rule-based for v1):** + - `style_update` — replace the `before` rgb literal with `after` in + `agent-loop-target.css` + - `a11y_fix` — for `rule: image-alt`, regex-inject `alt=""` on any + `<img>` missing the attribute + - `error_fix` — strip every line containing the test's tracer + comment marker +4. `annotask update-task <id> --status=review --resolution="…" --mcp` + +The apply step is what an LLM coding agent will replace in the v2 +ticket. 
The rest of the loop — lock, fetch context, mark review, +re-fetch denied tasks — is the production path. + +## Running the suite + +```bash +pnpm build # CLI must be built first; simulator uses dist/cli.js +pnpm test:e2e:stress:annotask # runs everything under playgrounds/stress-test/e2e/annotask/ +``` + +The broad Playwright config (`playgrounds/stress-test/e2e/playwright.config.ts`) boots the +host shell, the seven stress MFEs, and the four fast native API +services with `reuseExistingServer: true`. First boot takes about a +minute while Vite optimizes deps. To run only the agent-loop specs against the focused config (`agent-loop.config.ts`: host shell plus the two target MFEs), use `pnpm test:e2e:stress:annotask:agent-loop` — this is the command the CI job runs. + +The agent-loop specs run in `serial` mode per (framework × task type) +because each test mutates the AgentLoopTarget component file (or its tracer stylesheet) and +restores it in `afterEach`. Two concurrent style_update tests on the +same MFE would race on the file. + +## Reading the metrics output + +Each test writes one JSON file under +`playgrounds/stress-test/e2e/annotask/reports/agent-loop/`: + +```json +{ + "task_type": "style_update", + "app_id": "react-workflows", + "framework": "react+vite", + "outcome": "success", + "time_to_apply_ms": 412, + "retries": 0, + "denied_on_first_try": false, + "task_id": "task-abc123", + "resolution": "Swapped color from rgb(255, 0, 0) to rgb(0, 128, 0) in agent-loop-target.css", + "error_message": null, + "recorded_at": "2026-05-12T20:21:14.882Z" +} +``` + +Field meanings — useful when this seeds the eval dashboard: + +- **outcome** — `success` if the round-trip assertion passes; otherwise + `failure` with `error_message` set. +- **time_to_apply_ms** — wall-clock from simulator start to task + transitioning to `review`. Not the full round-trip — HMR and re-scan + time are reported in the Playwright test duration, not here. +- **retries** — always `0` in v1 (simulator does not loop). When the + LLM agent lands, the simulator will increment this on `denied` → + `in_progress` cycles. +- **denied_on_first_try** — placeholder for the v2 LLM apply harness. + The deterministic simulator never gets denied today. 
+- **task_id** / **resolution** — copied from the MCP-CLI response to + make it easy to grep back to the originating task without re-running + the suite. + +## v1 caveats (what's *not* tested yet) + +- The shell's inspector tool is not driven for `style_update` — tasks + are seeded via the per-MFE API. Driving the inspector tool is its own + test; the agent-loop suite focuses on what the agent does *after* + the task lands. +- The "Create Fix Task" button on `a11y_fix` is exercised in + `annotate.spec.ts`. The agent-loop suite seeds a deterministic task + shape directly so the simulator can run against a known anchor. +- The simulator's deterministic apply rules cover **one** failure mode + per task type. The v2 ticket on agent-apply quality expands rules + (or, more likely, replaces them with an LLM call) so the harness can + measure performance on the full task-type matrix. +- `retries` and `denied_on_first_try` are wired into the metric shape + but always zero/false in v1. The schema is locked so the dashboard + doesn't churn when the LLM agent ships. + +## How to add a new task type + +1. Add a deterministic apply function to `helpers/agent-loop/simulator.ts`. +2. Add a fixture to `AgentLoopTarget.{tsx,vue}` (or a sibling target + file) that the test can mutate to seed the failure mode. +3. Add a spec under `playgrounds/stress-test/e2e/annotask/agent-loop/` + following the same `capture → seed → drive shell → simulate → + verify → restore` pattern. +4. Extend `TaskTypeKey` in `helpers/agent-loop/metrics.ts` so the JSON + output stays type-checked. +5. Document the new task type in the table at the top of this file. 
diff --git a/package.json b/package.json index 786514c..5fd0f24 100644 --- a/package.json +++ b/package.json @@ -61,6 +61,7 @@ "stress-test:down": "docker compose -f playgrounds/stress-test/docker-compose.yml down", "test:e2e:stress": "playwright test --config playgrounds/stress-test/e2e/playwright.config.ts", "test:e2e:stress:annotask": "playwright test --config playgrounds/stress-test/e2e/playwright.config.ts annotask/ || true", + "test:e2e:stress:annotask:agent-loop": "playwright test --config playgrounds/stress-test/e2e/agent-loop.config.ts", "typecheck": "tsc --noEmit && vue-tsc --noEmit -p src/shell/tsconfig.json", "test": "vitest run", "test:watch": "vitest", diff --git a/playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx b/playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx new file mode 100644 index 0000000..d6cd159 --- /dev/null +++ b/playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx @@ -0,0 +1,46 @@ +/** + * Test-only target for agent-loop e2e tests. + * + * Always mounted but visually inert by default. The e2e tests in + * `playgrounds/stress-test/e2e/annotask/agent-loop/` mutate + * `agent-loop-target.css` to drive a known style change through Vite + * HMR and verify the round-trip. They also mutate this file to seed + * a11y violations and console errors, then run the agent simulator + * to apply a fix and restore the file in `afterEach`. + * + * The "Agent-loop e2e target" landmark only renders when the URL hash + * is `#agent-loop-target` so it stays invisible in normal stress-test + * use. 
+ */ +import { useEffect, useState } from 'react' +import './agent-loop-target.css' + +function useShowTarget(): boolean { + const [show, setShow] = useState( + typeof window !== 'undefined' && window.location.hash === '#agent-loop-target', + ) + useEffect(() => { + const handler = () => setShow(window.location.hash === '#agent-loop-target') + window.addEventListener('hashchange', handler) + return () => window.removeEventListener('hashchange', handler) + }, []) + return show +} + +export function AgentLoopTarget(): JSX.Element | null { + const show = useShowTarget() + if (!show) return null + return ( +
+

Agent-loop e2e target

+

Tracer element for agent-loop e2e tests.

+ +
+ ) +} diff --git a/playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css b/playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css new file mode 100644 index 0000000..0c6e5ab --- /dev/null +++ b/playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css @@ -0,0 +1,8 @@ +/* + * Agent-loop e2e: tracer stylesheet. Rewritten by the simulator during + * style_update tests, then restored in afterEach. Vite HMR picks up + * each edit and the test asserts the iframe's computed style flipped. + */ +[data-agent-loop-target='paragraph'] { + color: rgb(255, 0, 0); +} diff --git a/playgrounds/stress-test/apps/mfe-react-workflows/src/main.tsx b/playgrounds/stress-test/apps/mfe-react-workflows/src/main.tsx index a2999a7..9567f38 100644 --- a/playgrounds/stress-test/apps/mfe-react-workflows/src/main.tsx +++ b/playgrounds/stress-test/apps/mfe-react-workflows/src/main.tsx @@ -5,6 +5,7 @@ import { bootstrapTheme } from '@annotask/stress-ui-tokens' import { StrictMode } from 'react' import { createRoot } from 'react-dom/client' import { Root } from './Root' +import { AgentLoopTarget } from './AgentLoopTarget' bootstrapTheme() @@ -13,3 +14,14 @@ createRoot(document.getElementById('app')!).render( , ) + +// Agent-loop e2e target — only renders when the page hash is +// `#agent-loop-target`. Inert otherwise. 
+const agentLoopHost = document.createElement('div') +agentLoopHost.id = 'agent-loop-host' +document.body.appendChild(agentLoopHost) +createRoot(agentLoopHost).render( + + + , +) diff --git a/playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue new file mode 100644 index 0000000..a718945 --- /dev/null +++ b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue @@ -0,0 +1,38 @@ + + + + diff --git a/playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css new file mode 100644 index 0000000..fe498c8 --- /dev/null +++ b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css @@ -0,0 +1,6 @@ +/* + * Agent-loop e2e: tracer stylesheet. See the React sibling's notes. + */ +[data-agent-loop-target='paragraph'] { + color: rgb(255, 0, 0); +} diff --git a/playgrounds/stress-test/apps/mfe-vue-data-lab/src/main.ts b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/main.ts index b1c4902..5a687d9 100644 --- a/playgrounds/stress-test/apps/mfe-vue-data-lab/src/main.ts +++ b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/main.ts @@ -2,7 +2,15 @@ import '@annotask/stress-ui-tokens/tokens.css' import { bootstrapTheme } from '@annotask/stress-ui-tokens' import { createApp } from 'vue' import App from './App.vue' +import AgentLoopTarget from './AgentLoopTarget.vue' bootstrapTheme() createApp(App).mount('#app') + +// Agent-loop e2e target — only renders when the page hash is +// `#agent-loop-target`. Inert otherwise. 
+const agentLoopHost = document.createElement('div') +agentLoopHost.id = 'agent-loop-host' +document.body.appendChild(agentLoopHost) +createApp(AgentLoopTarget).mount(agentLoopHost) diff --git a/playgrounds/stress-test/e2e/agent-loop.config.ts b/playgrounds/stress-test/e2e/agent-loop.config.ts new file mode 100644 index 0000000..c7e5725 --- /dev/null +++ b/playgrounds/stress-test/e2e/agent-loop.config.ts @@ -0,0 +1,39 @@ +/** + * Focused Playwright config for the agent-loop e2e suite. Only spins up + * the host shell plus the two target MFEs (react-workflows, + * vue-data-lab) — the rest of the stress cluster is overkill for these + * specs and would triple the CI runtime. + * + * If you need to run against the full stress cluster instead, use + * `pnpm test:e2e:stress:annotask` which loads the broader config. + */ +import { defineConfig, devices } from '@playwright/test' + +const webServers = [ + { name: 'stress-host', command: 'pnpm dev:stress-host', url: 'http://localhost:4200' }, + { name: 'stress-react-workflows', command: 'pnpm dev:stress-react-workflows', url: 'http://localhost:4210' }, + { name: 'stress-vue-data-lab', command: 'pnpm dev:stress-vue-data-lab', url: 'http://localhost:4220' }, +] + +export default defineConfig({ + testDir: './annotask/agent-loop', + timeout: 90_000, + expect: { timeout: 15_000 }, + fullyParallel: false, + workers: 1, + retries: 0, + reporter: [['list'], ['./annotask/reporter.ts']], + use: { + trace: 'retain-on-failure', // retries is 0, so 'on-first-retry' would never record a trace + baseURL: 'http://localhost:4200', + ...devices['Desktop Chrome'], + }, + webServer: webServers.map(s => ({ + command: s.command, + url: s.url, + reuseExistingServer: true, + timeout: 120_000, + stdout: 'ignore', + stderr: 'pipe', + })), +}) diff --git a/playgrounds/stress-test/e2e/annotask/agent-loop/a11y-fix.spec.ts b/playgrounds/stress-test/e2e/annotask/agent-loop/a11y-fix.spec.ts new file mode 100644 index 0000000..c667061 --- /dev/null +++ b/playgrounds/stress-test/e2e/annotask/agent-loop/a11y-fix.spec.ts @@ 
-0,0 +1,144 @@ +/** + * a11y_fix agent loop — produce an image-alt violation, let the shell's + * axe-core scan catch it, seed an a11y_fix task with the rule context, + * then run the simulator (deterministic image-alt fix) and verify the + * violation is gone after re-scan. + */ +import { test, expect } from '@playwright/test' +import { writeFileSync } from 'node:fs' +import { APPS, apiUrl } from '../fixtures/apps' +import { AnnotaskShell } from '../fixtures/annotask-page' +import { SEL } from '../helpers/selectors' +import { AGENT_LOOP_APPS, capture, restore, type CapturedFile } from '../helpers/agent-loop/targets' +import { applyA11yFix } from '../helpers/agent-loop/simulator' +import { emptyMetric, writeMetric, type RunMetric } from '../helpers/agent-loop/metrics' + +const FEATURE_GROUP = 'agent-loop' +const FEATURE_ID = 'a11y-fix' + +/** Remove the `alt=""` (or `alt={...}`) attribute from every `<img>` tag in the + * AgentLoopTarget source, keeping the tag's other attributes ($1 before, $3 after). Keeps the file syntactically valid in both JSX and Vue + * templates because we control the markup. Returns the input unchanged when no `<img ... alt=...>` is found. */ +function breakImageAlt(file: string): string { + return file.replace(/<img([^>]*?)\salt=("[^"]*"|\{[^}]*\})([^>]*)>/g, '<img$1$3>') +} + +for (const target of AGENT_LOOP_APPS) { + const app = APPS.find(a => a.id === target.id) + if (!app) throw new Error(`agent-loop target ${target.id} is not in APPS`) + + test.describe(`[${target.id}] agent-loop · a11y_fix`, () => { + test.describe.configure({ mode: 'serial' }) + + let captured: CapturedFile[] = [] + let metric: RunMetric + + test.beforeEach(async () => { + captured = capture([target.componentPath, target.cssPath]) + metric = emptyMetric('a11y_fix', target.id, target.framework) + // Seed the broken state: image with no alt attribute. 
+ const broken = breakImageAlt(captured[0].contents) + if (broken === captured[0].contents) { + throw new Error( + `a11y_fix seed: could not break alt= attribute in ${target.componentPath}`, + ) + } + writeFileSync(target.componentPath, broken) + }) + + test.afterEach(async () => { + restore(captured) + writeMetric(metric) + }) + + test('agent fixes image-alt violation surfaced by axe scan', async ({ page, request }) => { + test.info().annotations.push({ + type: 'matrix', + description: `${target.id}/${FEATURE_GROUP}/${FEATURE_ID}`, + }) + + // 1. Load the iframe at the target hash and confirm the + // violation is surfaced by the shell's axe-core scan. + await page.goto(`http://localhost:${target.port}/#agent-loop-target`) + await expect(page.locator("[data-agent-loop-target='image']")).toBeVisible({ timeout: 10_000 }) + + const shell = new AnnotaskShell(page, app) + await shell.open() + // Route the iframe to the target hash via the toolbar input. + await page.locator(SEL.inputRoute).fill('/#agent-loop-target') + await page.locator(SEL.inputRoute).press('Enter') + + await shell.gotoAuditSection('a11y') + await page.locator(SEL.btnScanA11y).click() + await expect.poll(async () => { + return page.locator(SEL.a11yViolation).count() + }, { timeout: 15_000 }).toBeGreaterThan(0) + + // 2. Seed the a11y_fix task explicitly. (We could click the + // shell's "Create Fix Task" button instead — that path is + // exercised in `annotate.spec.ts` — but here we want a + // deterministic task shape for the simulator.) 
+ const desc = `agent-loop a11y_fix image-alt · ${target.id} · ${Date.now()}` + const seedRes = await request.post(apiUrl(app, '/tasks'), { + data: { + type: 'a11y_fix', + description: desc, + file: target.componentPath, + line: 1, + context: { + rule: 'image-alt', + impact: 'serious', + helpUrl: 'https://dequeuniversity.com/rules/axe/4.10/image-alt', + selector: "[data-agent-loop-target='image']", + }, + }, + }) + expect(seedRes.ok(), `seed POST failed: ${await seedRes.text()}`).toBeTruthy() + const taskId = (await seedRes.json()).task?.id ?? (await seedRes.json()).id + expect(taskId).toBeTruthy() + metric.task_id = taskId + + // 3. Run the simulator: deterministic alt="" injection. + const started = Date.now() + let result + try { + result = await applyA11yFix({ + taskId, + port: app.port, + componentPath: target.componentPath, + rule: 'image-alt', + }) + } catch (err) { + metric.error_message = err instanceof Error ? err.message : String(err) + throw err + } + metric.time_to_apply_ms = Date.now() - started + metric.resolution = result.resolution + + // 4. Reload iframe (HMR may have already applied), re-scan, + // expect violation count to be zero (or at least drop). + await page.locator(SEL.inputRoute).fill('/#agent-loop-target') + await page.locator(SEL.inputRoute).press('Enter') + await page.locator(SEL.btnScanA11y).click() + await expect.poll(async () => { + const rows = page.locator(SEL.a11yViolation) + const count = await rows.count() + let imageAltStill = 0 + for (let i = 0; i < count; i++) { + const text = (await rows.nth(i).textContent()) ?? '' + if (text.toLowerCase().includes('image-alt') || text.toLowerCase().includes('alternative text')) { + imageAltStill++ + } + } + return imageAltStill + }, { timeout: 15_000 }).toBe(0) + + // 5. Verify task state. + const taskRes = await request.get(apiUrl(app, `/tasks/${taskId}`)) + const task = (await taskRes.json()).task ?? 
(await taskRes.json()) + expect(task.status).toBe('review') + + metric.outcome = 'success' + }) + }) +} diff --git a/playgrounds/stress-test/e2e/annotask/agent-loop/error-fix.spec.ts b/playgrounds/stress-test/e2e/annotask/agent-loop/error-fix.spec.ts new file mode 100644 index 0000000..e68c6f0 --- /dev/null +++ b/playgrounds/stress-test/e2e/annotask/agent-loop/error-fix.spec.ts @@ -0,0 +1,155 @@ +/** + * error_fix agent loop — inject a `console.error` tracer into the + * AgentLoopTarget, confirm the shell's error monitor catches it, seed + * an error_fix task pointing at the marker line, run the simulator + * to delete the marker line, and verify the error stops firing. + */ +import { test, expect } from '@playwright/test' +import { writeFileSync } from 'node:fs' +import { APPS, apiUrl } from '../fixtures/apps' +import { AnnotaskShell } from '../fixtures/annotask-page' +import { SEL } from '../helpers/selectors' +import { AGENT_LOOP_APPS, capture, restore, type CapturedFile } from '../helpers/agent-loop/targets' +import { applyErrorFix } from '../helpers/agent-loop/simulator' +import { emptyMetric, writeMetric, type RunMetric } from '../helpers/agent-loop/metrics' + +const FEATURE_GROUP = 'agent-loop' +const FEATURE_ID = 'error-fix' +const TRACER = 'e2e-agent-loop-error-tracer' + +/** Inject a `console.error` call into the AgentLoopTarget render path. + * For React, we inject above the `
` JSX (legal inside + * fragment-free returns we control). For Vue, we inject inside the + * ``, + ) + } + // React/TSX — drop the error inside the component body before the + // JSX `return`. Match the closing brace of `useShowTarget(...)` line + // and append on a new line. + return file.replace( + /export function AgentLoopTarget\(\): JSX\.Element \| null \{\n(\s+)const show = useShowTarget\(\)/, + `export function AgentLoopTarget(): JSX.Element | null {\n$1const show = useShowTarget()\n$1console.error('${TRACER}') // ${TRACER}`, + ) +} + +for (const target of AGENT_LOOP_APPS) { + const app = APPS.find(a => a.id === target.id) + if (!app) throw new Error(`agent-loop target ${target.id} is not in APPS`) + + test.describe(`[${target.id}] agent-loop · error_fix`, () => { + test.describe.configure({ mode: 'serial' }) + + let captured: CapturedFile[] = [] + let metric: RunMetric + + test.beforeEach(async () => { + captured = capture([target.componentPath]) + metric = emptyMetric('error_fix', target.id, target.framework) + const broken = breakWithErrorTracer(captured[0].contents) + if (broken === captured[0].contents) { + throw new Error( + `error_fix seed: could not inject tracer into ${target.componentPath}`, + ) + } + writeFileSync(target.componentPath, broken) + }) + + test.afterEach(async () => { + restore(captured) + writeMetric(metric) + }) + + test('agent removes throwing line and console error stops firing', async ({ page, request }) => { + test.info().annotations.push({ + type: 'matrix', + description: `${target.id}/${FEATURE_GROUP}/${FEATURE_ID}`, + }) + + // 1. Open the shell and route the iframe to the target hash. 
+ const consoleErrors: string[] = [] + page.on('console', msg => { + if (msg.type() === 'error' && msg.text().includes(TRACER)) { + consoleErrors.push(msg.text()) + } + }) + + const shell = new AnnotaskShell(page, app) + await shell.open() + await page.locator(SEL.inputRoute).fill('/#agent-loop-target') + await page.locator(SEL.inputRoute).press('Enter') + await expect(page.locator(SEL.iframe)).toBeVisible() + + // 2. Confirm the shell's error monitor catches the tracer. + await shell.gotoAuditSection('errors') + await expect.poll(async () => { + const rows = page.locator(SEL.errorRow) + const count = await rows.count() + for (let i = 0; i < count; i++) { + const text = (await rows.nth(i).textContent()) ?? '' + if (text.includes(TRACER)) return true + } + return false + }, { timeout: 15_000 }).toBe(true) + + // 3. Seed the error_fix task with the marker line as anchor. + const desc = `agent-loop error_fix · ${target.id} · ${Date.now()}` + const seedRes = await request.post(apiUrl(app, '/tasks'), { + data: { + type: 'error_fix', + description: desc, + file: target.componentPath, + line: 1, + context: { + message: TRACER, + marker: TRACER, + severity: 'error', + }, + }, + }) + expect(seedRes.ok(), `seed POST failed: ${await seedRes.text()}`).toBeTruthy() + const taskId = (await seedRes.json()).task?.id ?? (await seedRes.json()).id + expect(taskId).toBeTruthy() + metric.task_id = taskId + + // 4. Run the simulator: strip the tracer line. + const started = Date.now() + let result + try { + result = await applyErrorFix({ + taskId, + port: app.port, + componentPath: target.componentPath, + marker: TRACER, + }) + } catch (err) { + metric.error_message = err instanceof Error ? err.message : String(err) + throw err + } + metric.time_to_apply_ms = Date.now() - started + metric.resolution = result.resolution + + // 5. Reload iframe, observe no further tracer errors. 
+ consoleErrors.length = 0 + await page.locator(SEL.inputRoute).fill('/#agent-loop-target') + await page.locator(SEL.inputRoute).press('Enter') + await page.waitForTimeout(2_000) // HMR settle window + expect( + consoleErrors, + `tracer console.error still firing after fix: ${consoleErrors.join(' | ')}`, + ).toHaveLength(0) + + // 6. Verify task state. + const taskRes = await request.get(apiUrl(app, `/tasks/${taskId}`)) + const task = (await taskRes.json()).task ?? (await taskRes.json()) + expect(task.status).toBe('review') + + metric.outcome = 'success' + }) + }) +} diff --git a/playgrounds/stress-test/e2e/annotask/agent-loop/style-update.spec.ts b/playgrounds/stress-test/e2e/annotask/agent-loop/style-update.spec.ts new file mode 100644 index 0000000..263d02c --- /dev/null +++ b/playgrounds/stress-test/e2e/annotask/agent-loop/style-update.spec.ts @@ -0,0 +1,130 @@ +/** + * style_update agent loop — annotate a styled element, simulator + * rewrites the tracer stylesheet, iframe DOM picks the change up via + * Vite HMR. + * + * v1 caveat: the apply step is rule-based, not LLM-driven (see + * `docs/agent-loop-evals.md`). The test exercises the full task + * lifecycle (pending → in_progress → review) and verifies the rendered + * DOM, but does not yet exercise the shell's inspector tool to create + * the task — the seed goes straight through the per-MFE API. 
+ */
+import { test, expect } from '@playwright/test'
+import { APPS, apiUrl } from '../fixtures/apps'
+import { AnnotaskShell } from '../fixtures/annotask-page'
+import { SEL } from '../helpers/selectors'
+import { AGENT_LOOP_APPS, capture, restore, type CapturedFile } from '../helpers/agent-loop/targets'
+import { applyStyleUpdate } from '../helpers/agent-loop/simulator'
+import { emptyMetric, writeMetric, type RunMetric } from '../helpers/agent-loop/metrics'
+
+const FEATURE_GROUP = 'agent-loop'
+const FEATURE_ID = 'style-update'
+
+for (const target of AGENT_LOOP_APPS) {
+  const app = APPS.find(a => a.id === target.id)
+  if (!app) throw new Error(`agent-loop target ${target.id} is not in APPS`)
+
+  test.describe(`[${target.id}] agent-loop · style_update`, () => {
+    test.describe.configure({ mode: 'serial' })
+
+    let captured: CapturedFile[] = []
+    let metric: RunMetric
+
+    test.beforeEach(async () => {
+      captured = capture([target.cssPath, target.componentPath])
+      metric = emptyMetric('style_update', target.id, target.framework)
+    })
+
+    test.afterEach(async ({}, testInfo) => {
+      // Assertion failures bypass the simulator's catch; record them here too.
+      metric.error_message ??= testInfo.error?.message ?? null
+      restore(captured)
+      writeMetric(metric)
+    })
+
+    test('agent applies color token swap and iframe re-renders', async ({ page, request }) => {
+      test.info().annotations.push({
+        type: 'matrix',
+        description: `${target.id}/${FEATURE_GROUP}/${FEATURE_ID}`,
+      })
+
+      // 1. Seed the style_update task via the per-MFE API. The
+      //    `context.changes` block mirrors what the shell's inspector
+      //    emits today (see src/shell/composables/useStyleEditor.ts).
+      const desc = `agent-loop style_update · ${target.id} · ${Date.now()}`
+      const seedRes = await request.post(apiUrl(app, '/tasks'), {
+        data: {
+          type: 'style_update',
+          description: desc,
+          file: target.cssPath,
+          line: 4,
+          context: {
+            element: "[data-agent-loop-target='paragraph']",
+            changes: [
+              {
+                type: 'style_update',
+                element: "[data-agent-loop-target='paragraph']",
+                property: 'color',
+                before: 'rgb(255, 0, 0)',
+                after: 'rgb(0, 128, 0)',
+              },
+            ],
+          },
+        },
+      })
+      expect(seedRes.ok(), `seed POST failed: ${await seedRes.text()}`).toBeTruthy()
+      const seedBody = await seedRes.json()
+      const taskId = seedBody.task?.id ?? seedBody.id
+      expect(taskId).toBeTruthy()
+      metric.task_id = taskId
+
+      // 2. Drive the shell — boots, shows the seeded task in the panel.
+      const shell = new AnnotaskShell(page, app)
+      await shell.open()
+      await shell.openTasksPanel()
+      await expect(
+        page.locator(SEL.taskCard).filter({ hasText: desc }),
+      ).toBeVisible({ timeout: 5_000 })
+
+      // 3. Run the simulator (same MCP-CLI sequence a real agent
+      //    follows). It locks, rewrites the CSS, and marks `review`.
+      const started = Date.now()
+      let result: Awaited<ReturnType<typeof applyStyleUpdate>>
+      try {
+        result = await applyStyleUpdate({
+          taskId,
+          port: app.port,
+          cssPath: target.cssPath,
+          selector: "[data-agent-loop-target='paragraph']",
+          property: 'color',
+          before: 'rgb(255, 0, 0)',
+          after: 'rgb(0, 128, 0)',
+        })
+      } catch (err) {
+        metric.error_message = err instanceof Error ? err.message : String(err)
+        throw err
+      }
+      metric.time_to_apply_ms = Date.now() - started
+      metric.resolution = result.resolution
+
+      // 4. Verify the rendered DOM picks up the HMR-applied change.
+      //    We load the dev app directly with the target hash so the
+      //    AgentLoopTarget component mounts and the tracer stylesheet
+      //    is in scope. `toHaveCSS` retries until the timeout elapses.
+      await page.goto(`http://localhost:${target.port}/#agent-loop-target`)
+      const targetEl = page.locator("[data-agent-loop-target='paragraph']")
+      await expect(targetEl).toBeVisible({ timeout: 10_000 })
+      await expect(targetEl).toHaveCSS('color', 'rgb(0, 128, 0)', { timeout: 10_000 })
+
+      // 5. Verify task transitioned to review with a resolution note.
+      const taskRes = await request.get(apiUrl(app, `/tasks/${taskId}`))
+      expect(taskRes.ok()).toBeTruthy()
+      const taskBody = await taskRes.json()
+      const task = taskBody.task ?? taskBody
+      expect(task.status).toBe('review')
+      expect(task.resolution).toBeTruthy()
+
+      metric.outcome = 'success'
+    })
+  })
+}
diff --git a/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/cli.ts b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/cli.ts
new file mode 100644
index 0000000..586a788
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/cli.ts
@@ -0,0 +1,76 @@
+/**
+ * Thin wrappers around the bundled annotask CLI that match the MCP
+ * tool sequence in `skills/annotask-apply/SKILL.md`. Tests should
+ * prefer these over hand-rolling HTTP calls so we exercise the same
+ * CLI surface a real agent would use.
+ */
+import { execFileSync } from 'node:child_process'
+import { existsSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { fileURLToPath } from 'node:url'
+
+const __filename = fileURLToPath(import.meta.url)
+const __dirname = dirname(__filename)
+
+const REPO_ROOT = join(__dirname, '..', '..', '..', '..', '..', '..')
+const CLI_ENTRY = join(REPO_ROOT, 'dist', 'cli.js')
+
+/**
+ * Runs the built annotask CLI synchronously and returns its stdout.
+ *
+ * @throws Error when `dist/cli.js` has not been built or the CLI run fails.
+ */
+function runCli(args: string[]): string {
+  if (!existsSync(CLI_ENTRY)) {
+    throw new Error(`annotask CLI not built at ${CLI_ENTRY} — run 'pnpm build' first`)
+  }
+  try {
+    // Reuse the Node binary running the tests instead of whichever `node` is on PATH.
+    return execFileSync(process.execPath, [CLI_ENTRY, ...args], {
+      encoding: 'utf8',
+      stdio: ['ignore', 'pipe', 'pipe'],
+    })
+  } catch (err: unknown) {
+    const msg = err instanceof Error ? err.message : String(err)
+    throw new Error(`annotask CLI failed (${args.join(' ')}): ${msg}`)
+  }
+}
+
+/** Fetches one task via `annotask task <id> --mcp` and returns the parsed JSON. */
+export function getTask(port: number, id: string): Record<string, unknown> {
+  const out = runCli(['task', id, '--mcp', `--server=http://localhost:${port}`])
+  return JSON.parse(out) as Record<string, unknown>
+}
+
+/**
+ * Lists tasks via `annotask tasks --mcp`, optionally filtered by status.
+ * Accepts either a bare array or a `{ tasks: [...] }` envelope from the CLI.
+ */
+export function listTasks(port: number, status?: string): Array<Record<string, unknown>> {
+  const args = ['tasks', '--mcp', `--server=http://localhost:${port}`]
+  if (status) args.push(`--status=${status}`)
+  const parsed = JSON.parse(runCli(args))
+  return Array.isArray(parsed) ? parsed : (parsed.tasks ?? [])
+}
+
+/**
+ * Moves a task to `status` via `annotask update-task --mcp`, attaching
+ * `resolution` when one is given, and returns the parsed JSON reply.
+ */
+export function updateTaskStatus(
+  port: number,
+  id: string,
+  status: string,
+  resolution?: string,
+): Record<string, unknown> {
+  const args = [
+    'update-task',
+    id,
+    '--mcp',
+    `--server=http://localhost:${port}`,
+    `--status=${status}`,
+  ]
+  if (resolution) args.push(`--resolution=${resolution}`)
+  const out = runCli(args)
+  return JSON.parse(out) as Record<string, unknown>
+}
diff --git a/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/metrics.ts b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/metrics.ts
new file mode 100644
index 0000000..99405af
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/metrics.ts
@@ -0,0 +1,77 @@
+/**
+ * Per-run agent-loop eval metrics.
+ *
+ * Writes one timestamped JSON file per (task type, app) run under
+ * `playgrounds/stress-test/e2e/annotask/reports/agent-loop/`. The
+ * shape is intentionally small — see `docs/agent-loop-evals.md` for
+ * the schema and the v1 caveats around what each field means.
+ */
+import { mkdirSync, writeFileSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { fileURLToPath } from 'node:url'
+
+const __filename = fileURLToPath(import.meta.url)
+const __dirname = dirname(__filename)
+
+const REPORTS_DIR = join(__dirname, '..', '..', 'reports', 'agent-loop')
+
+export type TaskTypeKey = 'style_update' | 'a11y_fix' | 'error_fix'
+export type Outcome = 'success' | 'failure'
+
+export interface RunMetric {
+  task_type: TaskTypeKey
+  app_id: string
+  framework: string
+  outcome: Outcome
+  /** Wall-clock ms around the simulator call (lock → apply → review); null if it threw. */
+  time_to_apply_ms: number | null
+  retries: number
+  denied_on_first_try: boolean
+  task_id: string | null
+  resolution: string | null
+  error_message: string | null
+  /** ISO 8601 string. */
+  recorded_at: string
+}
+
+/**
+ * Builds a timestamped report file name. Characters outside
+ * `[A-Za-z0-9._-]` in the app id become `_` so the id can never
+ * introduce a path separator into the name.
+ */
+function safeFileName(metric: RunMetric): string {
+  const appId = metric.app_id.replace(/[^A-Za-z0-9._-]/g, '_')
+  return `${metric.task_type}__${appId}__${Date.now()}.json`
+}
+
+/** Writes `metric` as pretty-printed JSON under REPORTS_DIR and returns the file path. */
+export function writeMetric(metric: RunMetric): string {
+  mkdirSync(REPORTS_DIR, { recursive: true })
+  const file = join(REPORTS_DIR, safeFileName(metric))
+  writeFileSync(file, JSON.stringify(metric, null, 2))
+  return file
+}
+
+/**
+ * Returns a metric pre-filled as a `failure` with no timings, so a
+ * run that aborts early is still recorded as a failure.
+ */
+export function emptyMetric(
+  taskType: TaskTypeKey,
+  appId: string,
+  framework: string,
+): RunMetric {
+  return {
+    task_type: taskType,
+    app_id: appId,
+    framework,
+    outcome: 'failure',
+    time_to_apply_ms: null,
+    retries: 0,
+    denied_on_first_try: false,
+    task_id: null,
+    resolution: null,
+    error_message: null,
+    recorded_at: new Date().toISOString(),
+  }
+}
diff --git a/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts
new file mode 100644
index 0000000..4a3425c
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts
@@ -0,0 +1,177 @@
+/**
+ * Agent-loop simulator.
+ *
+ * Drives the same MCP-shaped tool sequence a real coding agent would
+ * follow when working through `skills/annotask-apply/SKILL.md`:
+ *
+ *   1. `annotask_get_task` to fetch full task detail
+ *   2. `annotask_update_task` → `in_progress` to lock the task
+ *   3. Apply a deterministic fix to the source file the task points at
+ *   4. `annotask_update_task` → `review` with a one-line resolution
+ *
+ * Quality of the apply step is intentionally rule-based, not LLM-driven —
+ * v1 of this harness exists to measure the *loop plumbing*, not the
+ * agent's reasoning. See `docs/agent-loop-evals.md` for what that
+ * means and which ticket owns LLM apply quality.
+ */
+import { readFileSync, writeFileSync } from 'node:fs'
+import { extname } from 'node:path'
+import { getTask, updateTaskStatus } from './cli'
+
+export interface SimulatorResult {
+  taskId: string
+  resolution: string
+  /** Wall-clock ms from the initial task fetch through the `review` update. */
+  durationMs: number
+  /** Always 0 for v1 — the simulator never re-tries. */
+  retries: number
+}
+
+export interface StyleUpdateInput {
+  taskId: string
+  port: number
+  cssPath: string
+  selector: string
+  property: string
+  before: string
+  after: string
+}
+
+export interface A11yFixInput {
+  taskId: string
+  port: number
+  componentPath: string
+  /** axe-core rule id; v1 only supports `image-alt`. */
+  rule: string
+}
+
+export interface ErrorFixInput {
+  taskId: string
+  port: number
+  componentPath: string
+  /** Marker comment that the simulator removes. */
+  marker: string
+}
+
+/**
+ * Shared lifecycle for every simulated fix: fetch the task, lock it
+ * (`in_progress`), run `apply`, then mark it `review` with the
+ * resolution derived from what `apply` returned.
+ *
+ * If `apply` throws, the error propagates and the task is left
+ * `in_progress` — nothing here resets the status.
+ */
+async function lockAndReview<T>(
+  taskId: string,
+  port: number,
+  apply: () => T,
+  resolutionFor: (applied: T) => string,
+): Promise<SimulatorResult> {
+  const started = Date.now()
+  // Hydrate full task detail (mirrors annotask_get_task) — surfaces a
+  // clear error if the test never seeded the task.
+  getTask(port, taskId)
+  updateTaskStatus(port, taskId, 'in_progress')
+  const applied = apply()
+  const resolution = resolutionFor(applied)
+  updateTaskStatus(port, taskId, 'review', resolution)
+  return { taskId, resolution, durationMs: Date.now() - started, retries: 0 }
+}
+
+/**
+ * Simulates a `style_update` fix by swapping the first literal
+ * occurrence of `input.before` with `input.after` in the stylesheet.
+ *
+ * @throws Error when the replacement leaves the file unchanged.
+ */
+export async function applyStyleUpdate(input: StyleUpdateInput): Promise<SimulatorResult> {
+  return lockAndReview(
+    input.taskId,
+    input.port,
+    () => {
+      const css = readFileSync(input.cssPath, 'utf8')
+      // Swap the first literal occurrence of `before`; the tracer CSS is
+      // hand-shaped so a single replacement is safe. The replacer callback
+      // keeps `$&`-style patterns in `after` from being expanded.
+      const next = css.replace(input.before, () => input.after)
+      if (next === css) {
+        throw new Error(
+          `style_update simulator: '${input.before}' not found in ${input.cssPath}`,
+        )
+      }
+      writeFileSync(input.cssPath, next)
+      return { property: input.property, before: input.before, after: input.after }
+    },
+    a => `Swapped ${a.property} from ${a.before} to ${a.after} in agent-loop-target.css`,
+  )
+}
+
+/**
+ * Simulates an `a11y_fix` for the `image-alt` rule by adding `alt=""`
+ * to every `<img>` tag in the component that has no `alt=` attribute.
+ *
+ * @throws Error for any other rule, or when no such `<img>` exists.
+ */
+export async function applyA11yFix(input: A11yFixInput): Promise<SimulatorResult> {
+  return lockAndReview(
+    input.taskId,
+    input.port,
+    () => {
+      if (input.rule !== 'image-alt') {
+        throw new Error(
+          `a11y_fix simulator: rule '${input.rule}' is not in the v1 deterministic rule set`,
+        )
+      }
+      const file = readFileSync(input.componentPath, 'utf8')
+      // Match an opening <img> tag that doesn't already have an `alt=`
+      // attribute and inject `alt=""`. Works for both JSX and Vue
+      // templates because we never spread props onto <img> in these
+      // tracer files.
+      const next = file.replace(
+        /<img(?![^>]*\balt=)([^>]*?)(\s*\/?)>/g,
+        '<img$1 alt=""$2>',
+      )
+      if (next === file) {
+        throw new Error(
+          `a11y_fix simulator: no <img> without alt found in ${input.componentPath}`,
+        )
+      }
+      writeFileSync(input.componentPath, next)
+      return { rule: input.rule }
+    },
+    a => `Added alt="" to <img> per WCAG ${a.rule}`,
+  )
+}
+
+/**
+ * Simulates an `error_fix` by deleting every line of the component
+ * that contains `input.marker`.
+ *
+ * @throws Error when no line contains the marker.
+ */
+export async function applyErrorFix(input: ErrorFixInput): Promise<SimulatorResult> {
+  return lockAndReview(
+    input.taskId,
+    input.port,
+    () => {
+      const file = readFileSync(input.componentPath, 'utf8')
+      const lines = file.split('\n')
+      const matched: number[] = []
+      const kept = lines.filter((line, idx) => {
+        if (line.includes(input.marker)) {
+          matched.push(idx + 1)
+          return false
+        }
+        return true
+      })
+      if (matched.length === 0) {
+        throw new Error(
+          `error_fix simulator: marker '${input.marker}' not found in ${input.componentPath}`,
+        )
+      }
+      writeFileSync(input.componentPath, kept.join('\n'))
+      return { matched, ext: extname(input.componentPath) }
+    },
+    a => `Removed ${a.matched.length} line(s) marked '${a.ext}' tracer`,
+  )
+}
diff --git a/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts
new file mode 100644
index 0000000..c4da6f3
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts
@@ -0,0 +1,87 @@
+/**
+ * Agent-loop e2e: per-MFE target file layout.
+ *
+ * Each target MFE has a dedicated test-only component plus a tracer
+ * stylesheet. Tests rewrite these files via the simulator, verify the
+ * iframe DOM picks up the change through Vite HMR, then restore the
+ * captured originals in `afterEach`.
+ */
+import { readFileSync, writeFileSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { fileURLToPath } from 'node:url'
+
+const __filename = fileURLToPath(import.meta.url)
+const __dirname = dirname(__filename)
+
+const REPO_ROOT = join(__dirname, '..', '..', '..', '..', '..', '..')
+
+export interface AgentLoopApp {
+  /** stress-test MFE id (matches `playgrounds/stress-test/e2e/annotask/fixtures/apps.ts`) */
+  id: 'react-workflows' | 'vue-data-lab'
+  /** dev server port */
+  port: number
+  /** Human-readable framework label, used in metrics. */
+  framework: 'react+vite' | 'vue+vite'
+  /** Absolute path to the AgentLoopTarget component file. */
+  componentPath: string
+  /** Absolute path to the tracer stylesheet. */
+  cssPath: string
+}
+
+/** The MFEs the agent-loop specs run against. */
+export const AGENT_LOOP_APPS: AgentLoopApp[] = [
+  {
+    id: 'react-workflows',
+    port: 4210,
+    framework: 'react+vite',
+    componentPath: join(
+      REPO_ROOT,
+      'playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx',
+    ),
+    cssPath: join(
+      REPO_ROOT,
+      'playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css',
+    ),
+  },
+  {
+    id: 'vue-data-lab',
+    port: 4220,
+    framework: 'vue+vite',
+    componentPath: join(
+      REPO_ROOT,
+      'playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue',
+    ),
+    cssPath: join(
+      REPO_ROOT,
+      'playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css',
+    ),
+  },
+]
+
+/** Snapshot of one source file taken before a test mutates it. */
+export interface CapturedFile {
+  path: string
+  contents: string
+}
+
+/** Reads each path as UTF-8 so the originals can be written back later. */
+export function capture(paths: string[]): CapturedFile[] {
+  return paths.map(p => ({ path: p, contents: readFileSync(p, 'utf8') }))
+}
+
+/**
+ * Writes every captured file back to disk. Each file is attempted even
+ * if an earlier write fails, so one bad path cannot leave the others
+ * mutated; the first failure is rethrown afterwards.
+ */
+export function restore(files: CapturedFile[]): void {
+  const failures: unknown[] = []
+  for (const f of files) {
+    try {
+      writeFileSync(f.path, f.contents)
+    } catch (err) {
+      failures.push(err)
+    }
+  }
+  if (failures.length > 0) throw failures[0]
+}