From 04b7c7dba0733f12da68aef4579ac31ee8a7c89e Mon Sep 17 00:00:00 2001
From: Kurt Stohrer <kurtstohrer@gmail.com>
Date: Tue, 12 May 2026 16:22:34 -0400
Subject: [PATCH] Add agent-loop e2e suite for style_update, a11y_fix,
 error_fix (ANN-6)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stands up the agent round-trip plumbing as a measurable, CI-blocking
artifact for the public demo and design-partner pitch.

Each task type drives a full lifecycle on react-workflows (React+Vite)
and vue-data-lab (Vue+Vite):

- Capture the AgentLoopTarget component + tracer stylesheet
- Seed a deterministic task shape via the per-MFE API
- Open the host shell, exercise the task panel / a11y scan / error
  monitor as appropriate
- Run a rule-based simulator that follows the same MCP-CLI sequence
  (annotask task / update-task) a real coding agent would
- Verify the iframe DOM, axe rescan, or console stream reflects the
  fix after Vite HMR
- Restore the captured files and emit a per-run JSON metric

The simulator is intentionally rule-based for v1 — see
docs/agent-loop-evals.md for the schema, the caveats around what is
and isn't measured today, and where the LLM apply step plugs in for
v2. Per-test metrics land under
playgrounds/stress-test/e2e/annotask/reports/agent-loop/ and are
uploaded as a CI artifact.

A focused playwright config (agent-loop.config.ts) only spins up host
+ the two target MFEs so the new CI job stays under the broader
stress-cluster cost. The existing pnpm test:e2e:stress:annotask script
still picks the tests up via its directory filter.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 .github/workflows/ci.yml                      |  30 ++++
 docs/agent-loop-evals.md                      | 137 ++++++++++++++++
 package.json                                  |   1 +
 .../src/AgentLoopTarget.tsx                   |  46 ++++++
 .../src/agent-loop-target.css                 |   8 +
 .../apps/mfe-react-workflows/src/main.tsx     |  12 ++
 .../mfe-vue-data-lab/src/AgentLoopTarget.vue  |  38 +++++
 .../src/agent-loop-target.css                 |   6 +
 .../apps/mfe-vue-data-lab/src/main.ts         |   8 +
 .../stress-test/e2e/agent-loop.config.ts      |  39 +++++
 .../e2e/annotask/agent-loop/a11y-fix.spec.ts  | 144 ++++++++++++++++
 .../e2e/annotask/agent-loop/error-fix.spec.ts | 155 ++++++++++++++++++
 .../annotask/agent-loop/style-update.spec.ts  | 130 +++++++++++++++
 .../e2e/annotask/helpers/agent-loop/cli.ts    |  61 +++++++
 .../annotask/helpers/agent-loop/metrics.ts    |  66 ++++++++
 .../annotask/helpers/agent-loop/simulator.ts  | 150 +++++++++++++++++
 .../annotask/helpers/agent-loop/targets.ts    |  71 ++++++++
 17 files changed, 1102 insertions(+)
 create mode 100644 docs/agent-loop-evals.md
 create mode 100644 playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx
 create mode 100644 playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css
 create mode 100644 playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue
 create mode 100644 playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css
 create mode 100644 playgrounds/stress-test/e2e/agent-loop.config.ts
 create mode 100644 playgrounds/stress-test/e2e/annotask/agent-loop/a11y-fix.spec.ts
 create mode 100644 playgrounds/stress-test/e2e/annotask/agent-loop/error-fix.spec.ts
 create mode 100644 playgrounds/stress-test/e2e/annotask/agent-loop/style-update.spec.ts
 create mode 100644 playgrounds/stress-test/e2e/annotask/helpers/agent-loop/cli.ts
 create mode 100644 playgrounds/stress-test/e2e/annotask/helpers/agent-loop/metrics.ts
 create mode 100644 playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts
 create mode 100644 playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f7d338a..374fada 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -58,3 +58,33 @@ jobs:
       - run: npx playwright install --with-deps chromium
 
       - run: pnpm test:e2e --project=${{ matrix.project }}
+
+  agent-loop: # agent round-trip e2e suite (style_update, a11y_fix, error_fix)
+    runs-on: ubuntu-latest
+    needs: build-and-test
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: pnpm/action-setup@v4
+        with:
+          version: 10
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 20
+          cache: pnpm
+
+      - run: pnpm install --frozen-lockfile
+
+      - run: pnpm build # the agent simulator uses the built CLI (dist/cli.js)
+
+      - run: npx playwright install --with-deps chromium
+
+      - run: pnpm test:e2e:stress:annotask:agent-loop # focused config: host + the two target MFEs
+
+      - uses: actions/upload-artifact@v4
+        if: always() # upload the per-run metrics even when the suite fails
+        with:
+          name: agent-loop-metrics
+          path: playgrounds/stress-test/e2e/annotask/reports/agent-loop/
+          if-no-files-found: warn
diff --git a/docs/agent-loop-evals.md b/docs/agent-loop-evals.md
new file mode 100644
index 0000000..a186e77
--- /dev/null
+++ b/docs/agent-loop-evals.md
@@ -0,0 +1,137 @@
+# Agent-loop evaluation harness
+
+The agent-loop e2e suite measures the Annotask round-trip — user
+annotates → task lands in MCP → coding agent applies the change →
+re-render verifies the fix — for the three highest-leverage task types
+on the stress-test playground.
+
+It is the credibility artifact behind the public demo and the
+design-partner pitch deck. The numbers it emits are how we'll know
+whether shipping the next task type is helping or regressing the loop.
+
+> **v1 scope.** The simulator that stands in for the coding agent is
+> deterministic and rule-based — not LLM-driven. The harness is here
+> to measure *plumbing reliability* (does the task land, do the MCP
+> tools work, does HMR pick the fix up, do metrics persist) so we can
+> ship the public demo without a paid-LLM dependency. The follow-up
+> ticket on **agent-apply quality** (tracked under
+> [ANN-1](/ANN/issues/ANN-1) child issues) is where the real LLM gets
+> wired into this same harness.
+
+## What each test proves
+
+| Task type     | Test surface                                                            | Round-trip assertion                                                          |
+| ------------- | ----------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
+| `style_update`| Tracer stylesheet on a known `data-agent-loop-target` element.          | Iframe `getComputedStyle().color` flips after Vite HMR.                       |
+| `a11y_fix`    | `<img>` in the test-only target component with `alt` attribute removed. | `axe-core` rescan reports zero `image-alt` violations after the fix.           |
+| `error_fix`   | `console.error(<tracer>)` injected into the target component.           | Console listener sees zero tracer errors after the fix lands.                  |
+
+All three tests run on both **React+Vite** (`react-workflows`, port
+4210) and **Vue+Vite** (`vue-data-lab`, port 4220) MFEs. Adding a new
+framework target is a single entry in
+`playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts`.
+
+## How the simulator stands in for an agent
+
+The agent simulator
+(`playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts`)
+calls the same `annotask` CLI flags a real coding agent would (`--mcp`,
+`--server=…`) so we exercise the MCP-shaped tool surface end-to-end:
+
+1. `annotask task <id> --mcp` — hydrate full task detail
+2. `annotask update-task <id> --status=in_progress --mcp` — lock it
+3. **Apply step (rule-based for v1):**
+   - `style_update` — replace the `before` rgb literal with `after` in
+     `agent-loop-target.css`
+   - `a11y_fix` — for `rule: image-alt`, regex-inject `alt=""` on any
+     `<img>` missing the attribute
+   - `error_fix` — strip every line containing the test's tracer
+     comment marker
+4. `annotask update-task <id> --status=review --resolution="…" --mcp`
+
+The apply step is what an LLM coding agent will replace in the v2
+ticket. The rest of the loop — lock, fetch context, mark review,
+re-fetch denied tasks — is the production path.
+
+## Running the suite
+
+```bash
+pnpm build                       # CLI must be built first; simulator uses dist/cli.js
+pnpm test:e2e:stress:annotask    # runs everything under playgrounds/stress-test/e2e/annotask/
+```
+
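+`pnpm test:e2e:stress:annotask` loads `playwright.config.ts`, which boots
+the host shell, the seven stress MFEs, and the four fast native API
+services with `reuseExistingServer: true`. First boot takes about a
+minute while Vite optimizes deps. CI instead runs the focused `pnpm test:e2e:stress:annotask:agent-loop` (`agent-loop.config.ts`: host + the two target MFEs); unlike the broad script it has no `|| true`, so failures block.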
+
+The agent-loop specs run in `serial` mode per (framework × task type)
+because each test mutates the AgentLoopTarget files on disk and restores
+them in `afterEach`. `agent-loop.config.ts` also pins `workers: 1`:
+specs for different task types edit the same files and would otherwise race.
+
+## Reading the metrics output
+
+Each test writes one JSON file under
+`playgrounds/stress-test/e2e/annotask/reports/agent-loop/`:
+
+```json
+{
+  "task_type": "style_update",
+  "app_id": "react-workflows",
+  "framework": "react+vite",
+  "outcome": "success",
+  "time_to_apply_ms": 412,
+  "retries": 0,
+  "denied_on_first_try": false,
+  "task_id": "task-abc123",
+  "resolution": "Swapped color from rgb(255, 0, 0) to rgb(0, 128, 0) in agent-loop-target.css",
+  "error_message": null,
+  "recorded_at": "2026-05-12T20:21:14.882Z"
+}
+```
+
+Field meanings — useful when this seeds the eval dashboard:
+
+- **outcome** — `success` if the round-trip assertion passes; otherwise
+  `failure` with `error_message` set.
+- **time_to_apply_ms** — wall-clock from simulator start to task
+  transitioning to `review`. Not the full round-trip — HMR and re-scan
+  time are reported in the Playwright test duration, not here.
+- **retries** — always `0` in v1 (simulator does not loop). When the
+  LLM agent lands, the simulator will increment this on `denied` →
+  `in_progress` cycles.
+- **denied_on_first_try** — placeholder for the v2 LLM apply harness.
+  The deterministic simulator never gets denied today.
+- **task_id** / **resolution** — copied from the MCP-CLI response to
+  make it easy to grep back to the originating task without re-running
+  the suite.
+
+## v1 caveats (what's *not* tested yet)
+
+- The shell's inspector tool is not driven for `style_update` — tasks
+  are seeded via the per-MFE API. Driving the inspector tool is its own
+  test; the agent-loop suite focuses on what the agent does *after*
+  the task lands.
+- The "Create Fix Task" button on `a11y_fix` is exercised in
+  `annotate.spec.ts`. The agent-loop suite seeds a deterministic task
+  shape directly so the simulator can run against a known anchor.
+- The simulator's deterministic apply rules cover **one** failure mode
+  per task type. The v2 ticket on agent-apply quality expands rules
+  (or, more likely, replaces them with an LLM call) so the harness can
+  measure performance on the full task-type matrix.
+- `retries` and `denied_on_first_try` are wired into the metric shape
+  but always zero/false in v1. The schema is locked so the dashboard
+  doesn't churn when the LLM agent ships.
+
+## How to add a new task type
+
+1. Add a deterministic apply function to `helpers/agent-loop/simulator.ts`.
+2. Add a fixture to `AgentLoopTarget.{tsx,vue}` (or a sibling target
+   file) that the test can mutate to seed the failure mode.
+3. Add a spec under `playgrounds/stress-test/e2e/annotask/agent-loop/`
+   following the same `capture → seed → drive shell → simulate →
+   verify → restore` pattern.
+4. Extend `TaskTypeKey` in `helpers/agent-loop/metrics.ts` so the JSON
+   output stays type-checked.
+5. Document the new task type in the table at the top of this file.
diff --git a/package.json b/package.json
index 786514c..5fd0f24 100644
--- a/package.json
+++ b/package.json
@@ -61,6 +61,7 @@
     "stress-test:down": "docker compose -f playgrounds/stress-test/docker-compose.yml down",
     "test:e2e:stress": "playwright test --config playgrounds/stress-test/e2e/playwright.config.ts",
     "test:e2e:stress:annotask": "playwright test --config playgrounds/stress-test/e2e/playwright.config.ts annotask/ || true",
+    "test:e2e:stress:annotask:agent-loop": "playwright test --config playgrounds/stress-test/e2e/agent-loop.config.ts",
     "typecheck": "tsc --noEmit && vue-tsc --noEmit -p src/shell/tsconfig.json",
     "test": "vitest run",
     "test:watch": "vitest",
diff --git a/playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx b/playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx
new file mode 100644
index 0000000..d6cd159
--- /dev/null
+++ b/playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx
@@ -0,0 +1,46 @@
+/**
+ * Test-only target for agent-loop e2e tests.
+ *
+ * Always mounted but visually inert by default. The e2e tests in
+ * `playgrounds/stress-test/e2e/annotask/agent-loop/` mutate
+ * `agent-loop-target.css` to drive a known style change through Vite
+ * HMR and verify the round-trip. They also mutate this file to seed
+ * a11y violations and console errors, then run the agent simulator
+ * to apply a fix and restore the file in `afterEach`.
+ *
+ * The "Agent-loop e2e target" landmark only renders when the URL hash
+ * is `#agent-loop-target` so it stays invisible in normal stress-test
+ * use.
+ */
+import { useEffect, useState } from 'react'
+import './agent-loop-target.css'
+
+function useShowTarget(): boolean {
+  const [show, setShow] = useState(
+    typeof window !== 'undefined' && window.location.hash === '#agent-loop-target',
+  )
+  useEffect(() => {
+    const handler = () => setShow(window.location.hash === '#agent-loop-target')
+    window.addEventListener('hashchange', handler)
+    return () => window.removeEventListener('hashchange', handler)
+  }, [])
+  return show
+}
+
+export function AgentLoopTarget(): JSX.Element | null {
+  const show = useShowTarget()
+  if (!show) return null
+  return (
+    <section data-testid="agent-loop-target" aria-labelledby="agent-loop-target-heading">
+      <h2 id="agent-loop-target-heading">Agent-loop e2e target</h2>
+      <p data-agent-loop-target="paragraph">Tracer element for agent-loop e2e tests.</p>
+      <img
+        data-agent-loop-target="image"
+        src="data:image/svg+xml;utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' width='8' height='8'%3E%3C/svg%3E"
+        alt=""
+        width={8}
+        height={8}
+      />
+    </section>
+  )
+}
diff --git a/playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css b/playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css
new file mode 100644
index 0000000..0c6e5ab
--- /dev/null
+++ b/playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css
@@ -0,0 +1,8 @@
+/*
+ * Agent-loop e2e: tracer stylesheet. Rewritten by the simulator during
+ * style_update tests, then restored in afterEach. Vite HMR picks up
+ * each edit and the test asserts the iframe's computed style flipped.
+ */
+[data-agent-loop-target='paragraph'] {
+  color: rgb(255, 0, 0);
+}
diff --git a/playgrounds/stress-test/apps/mfe-react-workflows/src/main.tsx b/playgrounds/stress-test/apps/mfe-react-workflows/src/main.tsx
index a2999a7..9567f38 100644
--- a/playgrounds/stress-test/apps/mfe-react-workflows/src/main.tsx
+++ b/playgrounds/stress-test/apps/mfe-react-workflows/src/main.tsx
@@ -5,6 +5,7 @@ import { bootstrapTheme } from '@annotask/stress-ui-tokens'
 import { StrictMode } from 'react'
 import { createRoot } from 'react-dom/client'
 import { Root } from './Root'
+import { AgentLoopTarget } from './AgentLoopTarget'
 
 bootstrapTheme()
 
@@ -13,3 +14,14 @@ createRoot(document.getElementById('app')!).render(
     <Root />
   </StrictMode>,
 )
+
+// Agent-loop e2e target — only renders when the page hash is
+// `#agent-loop-target`. Inert otherwise.
+const agentLoopHost = document.createElement('div')
+agentLoopHost.id = 'agent-loop-host'
+document.body.appendChild(agentLoopHost)
+createRoot(agentLoopHost).render(
+  <StrictMode>
+    <AgentLoopTarget />
+  </StrictMode>,
+)
diff --git a/playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue
new file mode 100644
index 0000000..a718945
--- /dev/null
+++ b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue
@@ -0,0 +1,38 @@
+<!--
+  Test-only target for agent-loop e2e tests. See the React sibling
+  `AgentLoopTarget.tsx` for the full rationale. Only renders when the
+  page is loaded with the `#agent-loop-target` hash.
+-->
+<script setup lang="ts">
+import { onMounted, onUnmounted, ref } from 'vue'
+import './agent-loop-target.css'
+
+const show = ref(
+  typeof window !== 'undefined' && window.location.hash === '#agent-loop-target',
+)
+
+function update() {
+  show.value = window.location.hash === '#agent-loop-target'
+}
+
+onMounted(() => window.addEventListener('hashchange', update))
+onUnmounted(() => window.removeEventListener('hashchange', update))
+</script>
+
+<template>
+  <section
+    v-if="show"
+    data-testid="agent-loop-target"
+    aria-labelledby="agent-loop-target-heading"
+  >
+    <h2 id="agent-loop-target-heading">Agent-loop e2e target</h2>
+    <p data-agent-loop-target="paragraph">Tracer element for agent-loop e2e tests.</p>
+    <img
+      data-agent-loop-target="image"
+      src="data:image/svg+xml;utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' width='8' height='8'%3E%3C/svg%3E"
+      alt=""
+      width="8"
+      height="8"
+    />
+  </section>
+</template>
diff --git a/playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css
new file mode 100644
index 0000000..fe498c8
--- /dev/null
+++ b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css
@@ -0,0 +1,6 @@
+/*
+ * Agent-loop e2e: tracer stylesheet. See the React sibling's notes.
+ */
+[data-agent-loop-target='paragraph'] {
+  color: rgb(255, 0, 0);
+}
diff --git a/playgrounds/stress-test/apps/mfe-vue-data-lab/src/main.ts b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/main.ts
index b1c4902..5a687d9 100644
--- a/playgrounds/stress-test/apps/mfe-vue-data-lab/src/main.ts
+++ b/playgrounds/stress-test/apps/mfe-vue-data-lab/src/main.ts
@@ -2,7 +2,15 @@ import '@annotask/stress-ui-tokens/tokens.css'
 import { bootstrapTheme } from '@annotask/stress-ui-tokens'
 import { createApp } from 'vue'
 import App from './App.vue'
+import AgentLoopTarget from './AgentLoopTarget.vue'
 
 bootstrapTheme()
 
 createApp(App).mount('#app')
+
+// Agent-loop e2e target — only renders when the page hash is
+// `#agent-loop-target`. Inert otherwise.
+const agentLoopHost = document.createElement('div')
+agentLoopHost.id = 'agent-loop-host'
+document.body.appendChild(agentLoopHost)
+createApp(AgentLoopTarget).mount(agentLoopHost)
diff --git a/playgrounds/stress-test/e2e/agent-loop.config.ts b/playgrounds/stress-test/e2e/agent-loop.config.ts
new file mode 100644
index 0000000..c7e5725
--- /dev/null
+++ b/playgrounds/stress-test/e2e/agent-loop.config.ts
@@ -0,0 +1,39 @@
+/**
+ * Focused Playwright config for the agent-loop e2e suite. Only spins up
+ * the host shell plus the two target MFEs (react-workflows,
+ * vue-data-lab) — the rest of the stress cluster is overkill for these
+ * specs and would triple the CI runtime.
+ *
+ * If you need to run against the full stress cluster instead, use
+ * `pnpm test:e2e:stress:annotask` which loads the broader config.
+ */
+import { defineConfig, devices } from '@playwright/test'
+
+const webServers = [
+  { name: 'stress-host', command: 'pnpm dev:stress-host', url: 'http://localhost:4200' },
+  { name: 'stress-react-workflows', command: 'pnpm dev:stress-react-workflows', url: 'http://localhost:4210' },
+  { name: 'stress-vue-data-lab', command: 'pnpm dev:stress-vue-data-lab', url: 'http://localhost:4220' },
+]
+
+export default defineConfig({
+  testDir: './annotask/agent-loop',
+  timeout: 90_000,
+  expect: { timeout: 15_000 },
+  fullyParallel: false,
+  workers: 1, // specs rewrite the same target files on disk; never run two at once
+  retries: 0,
+  reporter: [['list'], ['./annotask/reporter.ts']],
+  use: {
+    trace: 'retain-on-failure', // retries is 0, so 'on-first-retry' would never record a trace
+    baseURL: 'http://localhost:4200',
+    ...devices['Desktop Chrome'],
+  },
+  webServer: webServers.map(s => ({
+    command: s.command,
+    url: s.url,
+    reuseExistingServer: true,
+    timeout: 120_000,
+    stdout: 'ignore',
+    stderr: 'pipe',
+  })),
+})
diff --git a/playgrounds/stress-test/e2e/annotask/agent-loop/a11y-fix.spec.ts b/playgrounds/stress-test/e2e/annotask/agent-loop/a11y-fix.spec.ts
new file mode 100644
index 0000000..c667061
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/agent-loop/a11y-fix.spec.ts
@@ -0,0 +1,144 @@
+/**
+ * a11y_fix agent loop — produce an image-alt violation, let the shell's
+ * axe-core scan catch it, seed an a11y_fix task with the rule context,
+ * then run the simulator (deterministic image-alt fix) and verify the
+ * violation is gone after re-scan.
+ */
+import { test, expect } from '@playwright/test'
+import { writeFileSync } from 'node:fs'
+import { APPS, apiUrl } from '../fixtures/apps'
+import { AnnotaskShell } from '../fixtures/annotask-page'
+import { SEL } from '../helpers/selectors'
+import { AGENT_LOOP_APPS, capture, restore, type CapturedFile } from '../helpers/agent-loop/targets'
+import { applyA11yFix } from '../helpers/agent-loop/simulator'
+import { emptyMetric, writeMetric, type RunMetric } from '../helpers/agent-loop/metrics'
+
+const FEATURE_GROUP = 'agent-loop'
+const FEATURE_ID = 'a11y-fix'
+
+/** Strip the `alt` attribute (`alt="…"` or JSX `alt={…}`) and its leading
+ *  whitespace from every `<img …>` tag in `file`; other attributes are
+ *  kept. Returns `file` unchanged when no `<img>` carries an `alt`. */
+function breakImageAlt(file: string): string {
+  return file.replace(/<img([^>]*?)\salt=("[^"]*"|\{[^}]*\})([^>]*)>/g, '<img$1$3>')
+}
+
+for (const target of AGENT_LOOP_APPS) {
+  const app = APPS.find(a => a.id === target.id)
+  if (!app) throw new Error(`agent-loop target ${target.id} is not in APPS`)
+
+  test.describe(`[${target.id}] agent-loop · a11y_fix`, () => {
+    test.describe.configure({ mode: 'serial' })
+
+    let captured: CapturedFile[] = []
+    let metric: RunMetric
+
+    test.beforeEach(async () => {
+      captured = capture([target.componentPath, target.cssPath])
+      metric = emptyMetric('a11y_fix', target.id, target.framework)
+      // Seed the broken state: image with no alt attribute.
+      const broken = breakImageAlt(captured[0].contents)
+      if (broken === captured[0].contents) {
+        throw new Error(
+          `a11y_fix seed: could not break alt= attribute in ${target.componentPath}`,
+        )
+      }
+      writeFileSync(target.componentPath, broken)
+    })
+
+    test.afterEach(async () => { // restore sources, then persist the metric with its failure cause
+      restore(captured)
+      writeMetric({ ...metric, error_message: metric.error_message ?? test.info().error?.message ?? null })
+    })
+
+    test('agent fixes image-alt violation surfaced by axe scan', async ({ page, request }) => {
+      test.info().annotations.push({
+        type: 'matrix',
+        description: `${target.id}/${FEATURE_GROUP}/${FEATURE_ID}`,
+      })
+
+      // 1. Load the iframe at the target hash and confirm the
+      //    violation is surfaced by the shell's axe-core scan.
+      await page.goto(`http://localhost:${target.port}/#agent-loop-target`)
+      await expect(page.locator("[data-agent-loop-target='image']")).toBeVisible({ timeout: 10_000 })
+
+      const shell = new AnnotaskShell(page, app)
+      await shell.open()
+      // Route the iframe to the target hash via the toolbar input.
+      await page.locator(SEL.inputRoute).fill('/#agent-loop-target')
+      await page.locator(SEL.inputRoute).press('Enter')
+
+      await shell.gotoAuditSection('a11y')
+      await page.locator(SEL.btnScanA11y).click()
+      await expect.poll(async () => {
+        return page.locator(SEL.a11yViolation).count()
+      }, { timeout: 15_000 }).toBeGreaterThan(0)
+
+      // 2. Seed the a11y_fix task explicitly. (We could click the
+      //    shell's "Create Fix Task" button instead — that path is
+      //    exercised in `annotate.spec.ts` — but here we want a
+      //    deterministic task shape for the simulator.)
+      const desc = `agent-loop a11y_fix image-alt · ${target.id} · ${Date.now()}`
+      const seedRes = await request.post(apiUrl(app, '/tasks'), {
+        data: {
+          type: 'a11y_fix',
+          description: desc,
+          file: target.componentPath,
+          line: 1,
+          context: {
+            rule: 'image-alt',
+            impact: 'serious',
+            helpUrl: 'https://dequeuniversity.com/rules/axe/4.10/image-alt',
+            selector: "[data-agent-loop-target='image']",
+          },
+        },
+      })
+      expect(seedRes.ok(), `seed POST failed: ${await seedRes.text()}`).toBeTruthy()
+      const taskId = (await seedRes.json()).task?.id ?? (await seedRes.json()).id
+      expect(taskId).toBeTruthy()
+      metric.task_id = taskId
+
+      // 3. Run the simulator: deterministic alt="" injection.
+      const started = Date.now()
+      let result
+      try {
+        result = await applyA11yFix({
+          taskId,
+          port: app.port,
+          componentPath: target.componentPath,
+          rule: 'image-alt',
+        })
+      } catch (err) {
+        metric.error_message = err instanceof Error ? err.message : String(err)
+        throw err
+      }
+      metric.time_to_apply_ms = Date.now() - started
+      metric.resolution = result.resolution
+
+      // 4. Reload iframe (HMR may have already applied), re-scan, and
+      //    require zero remaining image-alt rows (other rules may persist).
+      await page.locator(SEL.inputRoute).fill('/#agent-loop-target')
+      await page.locator(SEL.inputRoute).press('Enter')
+      await page.locator(SEL.btnScanA11y).click()
+      await expect.poll(async () => {
+        const rows = page.locator(SEL.a11yViolation)
+        const count = await rows.count()
+        let imageAltStill = 0
+        for (let i = 0; i < count; i++) {
+          const text = (await rows.nth(i).textContent()) ?? ''
+          if (text.toLowerCase().includes('image-alt') || text.toLowerCase().includes('alternative text')) {
+            imageAltStill++
+          }
+        }
+        return imageAltStill
+      }, { timeout: 15_000 }).toBe(0)
+
+      // 5. Verify task state.
+      const taskRes = await request.get(apiUrl(app, `/tasks/${taskId}`))
+      const task = (await taskRes.json()).task ?? (await taskRes.json())
+      expect(task.status).toBe('review')
+
+      metric.outcome = 'success'
+    })
+  })
+}
diff --git a/playgrounds/stress-test/e2e/annotask/agent-loop/error-fix.spec.ts b/playgrounds/stress-test/e2e/annotask/agent-loop/error-fix.spec.ts
new file mode 100644
index 0000000..e68c6f0
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/agent-loop/error-fix.spec.ts
@@ -0,0 +1,155 @@
+/**
+ * error_fix agent loop — inject a `console.error` tracer into the
+ * AgentLoopTarget, confirm the shell's error monitor catches it, seed
+ * an error_fix task pointing at the marker line, run the simulator
+ * to delete the marker line, and verify the error stops firing.
+ */
+import { test, expect } from '@playwright/test'
+import { writeFileSync } from 'node:fs'
+import { APPS, apiUrl } from '../fixtures/apps'
+import { AnnotaskShell } from '../fixtures/annotask-page'
+import { SEL } from '../helpers/selectors'
+import { AGENT_LOOP_APPS, capture, restore, type CapturedFile } from '../helpers/agent-loop/targets'
+import { applyErrorFix } from '../helpers/agent-loop/simulator'
+import { emptyMetric, writeMetric, type RunMetric } from '../helpers/agent-loop/metrics'
+
+const FEATURE_GROUP = 'agent-loop'
+const FEATURE_ID = 'error-fix'
+const TRACER = 'e2e-agent-loop-error-tracer'
+
+/** Inject a `console.error` tracer line into the AgentLoopTarget source.
+ *  For React, the line goes in the component body right after
+ *  `const show = useShowTarget()`. For Vue, it goes just before the
+ *  first `</script>` tag. Returns `file` unchanged if nothing matches. */
+function breakWithErrorTracer(file: string): string {
+  if (file.includes('<script setup')) {
+    // Vue SFC — append the error line at the end of <script setup>.
+    return file.replace(
+      /<\/script>/,
+      `console.error('${TRACER}') // ${TRACER}\n</script>`,
+    )
+  }
+  // React/TSX — drop the error inside the component body before the
+  // JSX `return`. Match the component signature plus its first
+  // statement and append the tracer on a new line at the same indent.
+  return file.replace(
+    /export function AgentLoopTarget\(\): JSX\.Element \| null \{\n(\s+)const show = useShowTarget\(\)/,
+    `export function AgentLoopTarget(): JSX.Element | null {\n$1const show = useShowTarget()\n$1console.error('${TRACER}') // ${TRACER}`,
+  )
+}
+
+for (const target of AGENT_LOOP_APPS) {
+  const app = APPS.find(a => a.id === target.id)
+  if (!app) throw new Error(`agent-loop target ${target.id} is not in APPS`)
+
+  test.describe(`[${target.id}] agent-loop · error_fix`, () => {
+    test.describe.configure({ mode: 'serial' })
+
+    let captured: CapturedFile[] = []
+    let metric: RunMetric
+
+    test.beforeEach(async () => {
+      captured = capture([target.componentPath])
+      metric = emptyMetric('error_fix', target.id, target.framework)
+      const broken = breakWithErrorTracer(captured[0].contents)
+      if (broken === captured[0].contents) {
+        throw new Error(
+          `error_fix seed: could not inject tracer into ${target.componentPath}`,
+        )
+      }
+      writeFileSync(target.componentPath, broken)
+    })
+
+    test.afterEach(async () => { // restore sources, then persist the metric with its failure cause
+      restore(captured)
+      writeMetric({ ...metric, error_message: metric.error_message ?? test.info().error?.message ?? null })
+    })
+
+    test('agent removes throwing line and console error stops firing', async ({ page, request }) => {
+      test.info().annotations.push({
+        type: 'matrix',
+        description: `${target.id}/${FEATURE_GROUP}/${FEATURE_ID}`,
+      })
+
+      // 1. Open the shell and route the iframe to the target hash.
+      const consoleErrors: string[] = []
+      page.on('console', msg => {
+        if (msg.type() === 'error' && msg.text().includes(TRACER)) {
+          consoleErrors.push(msg.text())
+        }
+      })
+
+      const shell = new AnnotaskShell(page, app)
+      await shell.open()
+      await page.locator(SEL.inputRoute).fill('/#agent-loop-target')
+      await page.locator(SEL.inputRoute).press('Enter')
+      await expect(page.locator(SEL.iframe)).toBeVisible()
+
+      // 2. Confirm the shell's error monitor catches the tracer.
+      await shell.gotoAuditSection('errors')
+      await expect.poll(async () => {
+        const rows = page.locator(SEL.errorRow)
+        const count = await rows.count()
+        for (let i = 0; i < count; i++) {
+          const text = (await rows.nth(i).textContent()) ?? ''
+          if (text.includes(TRACER)) return true
+        }
+        return false
+      }, { timeout: 15_000 }).toBe(true)
+
+      // 3. Seed the error_fix task with the marker line as anchor.
+      const desc = `agent-loop error_fix · ${target.id} · ${Date.now()}`
+      const seedRes = await request.post(apiUrl(app, '/tasks'), {
+        data: {
+          type: 'error_fix',
+          description: desc,
+          file: target.componentPath,
+          line: 1,
+          context: {
+            message: TRACER,
+            marker: TRACER,
+            severity: 'error',
+          },
+        },
+      })
+      expect(seedRes.ok(), `seed POST failed: ${await seedRes.text()}`).toBeTruthy()
+      const taskId = (await seedRes.json()).task?.id ?? (await seedRes.json()).id
+      expect(taskId).toBeTruthy()
+      metric.task_id = taskId
+
+      // 4. Run the simulator: strip the tracer line.
+      const started = Date.now()
+      let result
+      try {
+        result = await applyErrorFix({
+          taskId,
+          port: app.port,
+          componentPath: target.componentPath,
+          marker: TRACER,
+        })
+      } catch (err) {
+        metric.error_message = err instanceof Error ? err.message : String(err)
+        throw err
+      }
+      metric.time_to_apply_ms = Date.now() - started
+      metric.resolution = result.resolution
+
+      // 5. Reload iframe, observe no further tracer errors.
+      consoleErrors.length = 0
+      await page.locator(SEL.inputRoute).fill('/#agent-loop-target')
+      await page.locator(SEL.inputRoute).press('Enter')
+      await page.waitForTimeout(2_000) // HMR settle window
+      expect(
+        consoleErrors,
+        `tracer console.error still firing after fix: ${consoleErrors.join(' | ')}`,
+      ).toHaveLength(0)
+
+      // 6. Verify task state.
+      const taskRes = await request.get(apiUrl(app, `/tasks/${taskId}`))
+      const task = (await taskRes.json()).task ?? (await taskRes.json())
+      expect(task.status).toBe('review')
+
+      metric.outcome = 'success'
+    })
+  })
+}
diff --git a/playgrounds/stress-test/e2e/annotask/agent-loop/style-update.spec.ts b/playgrounds/stress-test/e2e/annotask/agent-loop/style-update.spec.ts
new file mode 100644
index 0000000..263d02c
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/agent-loop/style-update.spec.ts
@@ -0,0 +1,130 @@
+/**
+ * style_update agent loop — annotate a styled element, simulator
+ * rewrites the tracer stylesheet, iframe DOM picks the change up via
+ * Vite HMR.
+ *
+ * v1 caveat: the apply step is rule-based, not LLM-driven (see
+ * `docs/agent-loop-evals.md`). The test exercises the full task
+ * lifecycle (pending → in_progress → review) and verifies the rendered
+ * DOM, but does not yet exercise the shell's inspector tool to create
+ * the task — the seed goes straight through the per-MFE API.
+ */
+import { test, expect } from '@playwright/test'
+import { APPS, apiUrl } from '../fixtures/apps'
+import { AnnotaskShell } from '../fixtures/annotask-page'
+import { SEL } from '../helpers/selectors'
+import { AGENT_LOOP_APPS, capture, restore, type CapturedFile } from '../helpers/agent-loop/targets'
+import { applyStyleUpdate } from '../helpers/agent-loop/simulator'
+import { emptyMetric, writeMetric, type RunMetric } from '../helpers/agent-loop/metrics'
+
+const FEATURE_GROUP = 'agent-loop' // middle segment of the `matrix` annotation pushed by the test below
+const FEATURE_ID = 'style-update' // last segment of that annotation
+
+for (const target of AGENT_LOOP_APPS) { // one describe block per target MFE
+  const app = APPS.find(a => a.id === target.id)
+  if (!app) throw new Error(`agent-loop target ${target.id} is not in APPS`)
+
+  test.describe(`[${target.id}] agent-loop · style_update`, () => {
+    test.describe.configure({ mode: 'serial' })
+
+    let captured: CapturedFile[] = []
+    let metric: RunMetric
+
+    test.beforeEach(async () => {
+      metric = emptyMetric('style_update', target.id, target.framework) // first, so afterEach has a fresh record even if capture throws
+      captured = capture([target.cssPath, target.componentPath])
+    })
+
+    test.afterEach(async ({}, testInfo) => {
+      // Failures outside the simulator call never reach the catch below; take their message from Playwright.
+      metric.error_message = metric.error_message ?? testInfo.error?.message ?? null
+      try { restore(captured) } finally { writeMetric(metric) } // a failed restore must not drop the metric
+    })
+
+    test('agent applies color token swap and iframe re-renders', async ({ page, request }) => {
+      test.info().annotations.push({
+        type: 'matrix',
+        description: `${target.id}/${FEATURE_GROUP}/${FEATURE_ID}`,
+      })
+
+      // 1. Seed the style_update task via the per-MFE API. The
+      //    `context.changes` block mirrors what the shell's inspector
+      //    emits today (see src/shell/composables/useStyleEditor.ts).
+      const desc = `agent-loop style_update · ${target.id} · ${Date.now()}`
+      const seedRes = await request.post(apiUrl(app, '/tasks'), {
+        data: {
+          type: 'style_update',
+          description: desc,
+          file: target.cssPath,
+          line: 4,
+          context: {
+            element: "[data-agent-loop-target='paragraph']",
+            changes: [
+              {
+                type: 'style_update',
+                element: "[data-agent-loop-target='paragraph']",
+                property: 'color',
+                before: 'rgb(255, 0, 0)',
+                after: 'rgb(0, 128, 0)',
+              },
+            ],
+          },
+        },
+      })
+      expect(seedRes.ok(), `seed POST failed: ${await seedRes.text()}`).toBeTruthy()
+      const seedBody = await seedRes.json()
+      const taskId = seedBody.task?.id ?? seedBody.id // the id may sit under `task` or at the top level
+      expect(taskId).toBeTruthy()
+      metric.task_id = taskId
+
+      // 2. Drive the shell — boots, shows the seeded task in the panel.
+      const shell = new AnnotaskShell(page, app)
+      await shell.open()
+      await shell.openTasksPanel()
+      await expect(
+        page.locator(SEL.taskCard).filter({ hasText: desc }),
+      ).toBeVisible({ timeout: 5_000 })
+
+      // 3. Run the simulator (same MCP-CLI sequence a real agent
+      //    follows). It locks, rewrites the CSS, and marks `review`.
+      const started = Date.now()
+      let result
+      try {
+        result = await applyStyleUpdate({
+          taskId,
+          port: app.port,
+          cssPath: target.cssPath,
+          selector: "[data-agent-loop-target='paragraph']",
+          property: 'color',
+          before: 'rgb(255, 0, 0)',
+          after: 'rgb(0, 128, 0)',
+        })
+      } catch (err) {
+        metric.error_message = err instanceof Error ? err.message : String(err)
+        throw err
+      }
+      metric.time_to_apply_ms = Date.now() - started // covers only the simulator call above
+      metric.resolution = result.resolution
+
+      // 4. Verify the iframe DOM picks up the HMR-applied change. We load the
+      //    dev app directly with the target hash so the AgentLoopTarget
+      //    component mounts and the tracer stylesheet is in scope.
+      await page.goto(`http://localhost:${target.port}/#agent-loop-target`)
+      const targetEl = page.locator("[data-agent-loop-target='paragraph']")
+      await expect(targetEl).toBeVisible({ timeout: 10_000 })
+      await expect.poll(async () => {
+        return await targetEl.evaluate(el => getComputedStyle(el).color)
+      }, { timeout: 10_000 }).toBe('rgb(0, 128, 0)')
+
+      // 5. Verify task transitioned to review with a resolution note.
+      const taskRes = await request.get(apiUrl(app, `/tasks/${taskId}`))
+      expect(taskRes.ok()).toBeTruthy()
+      const taskBody = await taskRes.json()
+      const task = taskBody.task ?? taskBody
+      expect(task.status).toBe('review')
+      expect(task.resolution).toBeTruthy()
+
+      metric.outcome = 'success' // only reached when every assertion above passed
+    })
+  })
+}
diff --git a/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/cli.ts b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/cli.ts
new file mode 100644
index 0000000..586a788
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/cli.ts
@@ -0,0 +1,61 @@
+/**
+ * Thin wrappers around the bundled annotask CLI that match the MCP
+ * tool sequence in `skills/annotask-apply/SKILL.md`. Tests should
+ * prefer these over hand-rolling HTTP calls so we exercise the same
+ * CLI surface a real agent would use.
+ */
+import { execFileSync } from 'node:child_process'
+import { existsSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { fileURLToPath } from 'node:url'
+
+const __filename = fileURLToPath(import.meta.url) // module URL → file-system path of this module
+const __dirname = dirname(__filename)
+
+const REPO_ROOT = join(__dirname, '..', '..', '..', '..', '..', '..') // six directories up from this module's directory
+const CLI_ENTRY = join(REPO_ROOT, 'dist', 'cli.js') // CLI bundle; runCli refuses to run when it is missing
+
+/** Runs the built annotask CLI with `args` and returns its stdout (UTF-8); throws if the build is missing or the child fails or times out. */
+function runCli(args: string[]): string {
+  if (!existsSync(CLI_ENTRY)) {
+    throw new Error(`annotask CLI not built at ${CLI_ENTRY} — run 'pnpm build' first`)
+  }
+  try {
+    // Same Node binary as the test run (not PATH's `node`); the timeout keeps a hung CLI from blocking the worker forever.
+    return execFileSync(process.execPath, [CLI_ENTRY, ...args], {
+      encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'], timeout: 30_000,
+    })
+  } catch (err: unknown) {
+    const msg = err instanceof Error ? err.message : String(err)
+    throw new Error(`annotask CLI failed (${args.join(' ')}): ${msg}`)
+  }
+}
+
+/** Runs `task <id> --mcp` against the dev server on `port` and returns the CLI output parsed as JSON. */
+export function getTask(port: number, id: string): Record<string, unknown> {
+  return JSON.parse(runCli(['task', id, '--mcp', `--server=http://localhost:${port}`])) as Record<string, unknown>
+}
+
+/** Runs `tasks --mcp`, adding `--status=<status>` when given; accepts a bare array or a `{ tasks }` envelope from the CLI. */
+export function listTasks(port: number, status?: string): Array<Record<string, unknown>> {
+  const statusFlag = status ? [`--status=${status}`] : []
+  const payload = JSON.parse(runCli(['tasks', '--mcp', `--server=http://localhost:${port}`, ...statusFlag]))
+  return Array.isArray(payload) ? payload : (payload.tasks ?? [])
+}
+
+/**
+ * Invokes `update-task <id> --mcp --server=… --status=<status>` through the
+ * bundled CLI and returns its stdout parsed as JSON.
+ *
+ * `--resolution=<text>` is appended only when `resolution` is a non-empty
+ * string.
+ */
+export function updateTaskStatus(
+  port: number,
+  id: string,
+  status: string,
+  resolution?: string,
+): Record<string, unknown> {
+  const base = ['update-task', id, '--mcp', `--server=http://localhost:${port}`, `--status=${status}`]
+  const cliArgs = resolution ? [...base, `--resolution=${resolution}`] : base
+  return JSON.parse(runCli(cliArgs)) as Record<string, unknown>
+}
diff --git a/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/metrics.ts b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/metrics.ts
new file mode 100644
index 0000000..99405af
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/metrics.ts
@@ -0,0 +1,66 @@
+/**
+ * Per-run agent-loop eval metrics.
+ *
+ * Writes one JSON file per run (named by task type, app and timestamp) under
+ * `playgrounds/stress-test/e2e/annotask/reports/agent-loop/`. The
+ * shape is intentionally small — see `docs/agent-loop-evals.md` for
+ * the schema and the v1 caveats around what each field means.
+ */
+import { mkdirSync, writeFileSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { fileURLToPath } from 'node:url'
+
+const __filename = fileURLToPath(import.meta.url) // module URL → file-system path of this module
+const __dirname = dirname(__filename)
+
+const REPORTS_DIR = join(__dirname, '..', '..', 'reports', 'agent-loop') // two levels above this module's directory, then reports/agent-loop
+
+export type TaskTypeKey = 'style_update' | 'a11y_fix' | 'error_fix' // task types a metric can be recorded for
+export type Outcome = 'success' | 'failure' // emptyMetric starts every record at 'failure'
+
+export interface RunMetric { // one record per test run; serialised to JSON by writeMetric
+  task_type: TaskTypeKey
+  app_id: string // also embedded in the report file name
+  framework: string
+  outcome: Outcome
+  /** Wall-clock ms for the apply step as timed by the caller; `null` until it has been measured. */
+  time_to_apply_ms: number | null
+  retries: number // emptyMetric initialises this to 0
+  denied_on_first_try: boolean // emptyMetric initialises this to false
+  task_id: string | null
+  resolution: string | null
+  error_message: string | null
+  /** ISO 8601 string. */
+  recorded_at: string
+}
+
+function safeFileName(metric: RunMetric): string { // <task_type>__<app_id>__<epoch ms>.json; app_id chars outside [\w.-] (e.g. "/") become "_"
+  return `${metric.task_type}__${metric.app_id.replace(/[^\w.-]+/g, '_')}__${Date.now()}.json`
+}
+
+export function writeMetric(metric: RunMetric): string { // returns the path of the JSON file it wrote
+  mkdirSync(REPORTS_DIR, { recursive: true }) // the reports directory may not exist yet
+  const outPath = join(REPORTS_DIR, safeFileName(metric))
+  writeFileSync(outPath, JSON.stringify(metric, null, 2)) // 2-space indented JSON
+  return outPath
+}
+
+/**
+ * Fresh metric for one run: outcome `failure`, counters at zero, every
+ * measured field `null`, `recorded_at` stamped with the current time.
+ */
+export function emptyMetric(
+  taskType: TaskTypeKey,
+  appId: string,
+  framework: string,
+): RunMetric {
+  const recordedAt = new Date().toISOString()
+  const blank: RunMetric = {
+    task_type: taskType, app_id: appId, framework,
+    outcome: 'failure', time_to_apply_ms: null,
+    retries: 0, denied_on_first_try: false,
+    task_id: null, resolution: null, error_message: null,
+    recorded_at: recordedAt,
+  }
+  return blank
+}
diff --git a/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts
new file mode 100644
index 0000000..4a3425c
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/simulator.ts
@@ -0,0 +1,150 @@
+/**
+ * Agent-loop simulator.
+ *
+ * Drives the same MCP-shaped tool sequence a real coding agent would
+ * follow when working through `skills/annotask-apply/SKILL.md`:
+ *
+ *   1. `annotask_get_task` to fetch full task detail
+ *   2. `annotask_update_task` → `in_progress` to lock the task
+ *   3. Apply a deterministic fix to the source file the task points at
+ *   4. `annotask_update_task` → `review` with a one-line resolution
+ *
+ * Quality of the apply step is intentionally rule-based, not LLM-driven —
+ * v1 of this harness exists to measure the *loop plumbing*, not the
+ * agent's reasoning. See `docs/agent-loop-evals.md` for what that
+ * means and which ticket owns LLM apply quality.
+ */
+import { readFileSync, writeFileSync } from 'node:fs'
+import { extname } from 'node:path'
+import { getTask, updateTaskStatus } from './cli'
+
+export interface SimulatorResult { // what every apply* helper resolves with
+  taskId: string
+  resolution: string // the text sent with the `review` status update
+  /** Wall-clock ms from just before the get-task call until the `review` update returns. */
+  durationMs: number
+  /** Always 0 for v1 — the simulator never re-tries. */
+  retries: number
+}
+
+export interface StyleUpdateInput { // argument of applyStyleUpdate
+  taskId: string
+  port: number // dev-server port; becomes `--server=http://localhost:<port>` for the CLI
+  cssPath: string // stylesheet that is read and rewritten
+  selector: string // not read by applyStyleUpdate in v1
+  property: string // only echoed in the resolution text
+  before: string // literal text searched for in the stylesheet
+  after: string // text that replaces the first occurrence of `before`
+}
+
+export interface A11yFixInput { // argument of applyA11yFix
+  taskId: string
+  port: number // dev-server port; becomes `--server=http://localhost:<port>` for the CLI
+  componentPath: string // component source that is read and rewritten
+  /** axe-core rule id; v1 only supports `image-alt`. */
+  rule: string
+}
+
+export interface ErrorFixInput { // argument of applyErrorFix
+  taskId: string
+  port: number // dev-server port; becomes `--server=http://localhost:<port>` for the CLI
+  componentPath: string // component source that is read and rewritten
+  /** Substring to look for: every line of the component that contains it is removed. */
+  marker: string
+}
+
+// Shared lifecycle: get task → `in_progress` → `apply()` → `review` with the
+// text from `resolutionFor`. Nothing is caught here, so a throw from `apply`
+// leaves the task in `in_progress` and rejects the returned promise.
+async function lockAndReview<T>(
+  taskId: string,
+  port: number,
+  apply: () => T,
+  resolutionFor: (applied: T) => string,
+): Promise<SimulatorResult> {
+  const t0 = Date.now()
+  getTask(port, taskId) // result unused; mirrors annotask_get_task and relies on the CLI failing for an unseeded task
+  updateTaskStatus(port, taskId, 'in_progress')
+  const resolution = resolutionFor(apply())
+  updateTaskStatus(port, taskId, 'review', resolution)
+  return { taskId, resolution, durationMs: Date.now() - t0, retries: 0 }
+}
+
+/**
+ * style_update apply step: swaps the first `before` in the stylesheet for `after` (no selector/property scoping), then marks `review`.
+ */
+export async function applyStyleUpdate(input: StyleUpdateInput): Promise<SimulatorResult> {
+  return lockAndReview(
+    input.taskId,
+    input.port,
+    () => {
+      const css = readFileSync(input.cssPath, 'utf8')
+      // Function replacer: `after` is inserted verbatim, so `$&`, `$1`, `$$` … in it are not expanded.
+      const next = css.replace(input.before, () => input.after)
+      if (next === css) {
+        throw new Error(`style_update simulator: '${input.before}' not found in ${input.cssPath}`)
+      }
+      writeFileSync(input.cssPath, next)
+      return { property: input.property, before: input.before, after: input.after }
+    },
+    a => `Swapped ${a.property} from ${a.before} to ${a.after} in agent-loop-target.css`,
+  )
+}
+
+/**
+ * a11y_fix apply step for the axe `image-alt` rule (the only rule v1
+ * handles): appends `alt=""` to every `<img …>` opening tag in the component
+ * that has no `alt=` attribute, then marks the task `review`.
+ *
+ * Rejects for any other rule, or when no tag was changed. Both checks run
+ * inside the apply callback, i.e. after the task was moved to `in_progress`.
+ */
+export async function applyA11yFix(input: A11yFixInput): Promise<SimulatorResult> {
+  const patchImgTags = (): { rule: string } => {
+    if (input.rule !== 'image-alt') {
+      throw new Error(
+        `a11y_fix simulator: rule '${input.rule}' is not in the v1 deterministic rule set`,
+      )
+    }
+    const source = readFileSync(input.componentPath, 'utf8')
+    // $1 = existing attributes, $2 = optional whitespace + `/` of a self-closing
+    // tag; the lookahead skips tags that already carry `alt=`.
+    const imgWithoutAlt = /<img(?![^>]*\balt=)([^>]*?)(\s*\/?)>/g
+    const patched = source.replace(imgWithoutAlt, '<img$1 alt=""$2>')
+    if (patched === source) {
+      throw new Error(
+        `a11y_fix simulator: no <img> without alt found in ${input.componentPath}`,
+      )
+    }
+    writeFileSync(input.componentPath, patched)
+    return { rule: input.rule }
+  }
+  return lockAndReview(input.taskId, input.port, patchImgTags, a => `Added alt="" to <img> per WCAG ${a.rule}`)
+}
+
+/**
+ * error_fix apply step: deletes every line of the component that contains
+ * `input.marker`, then marks the task `review`. Rejects when no line matches.
+ * NOTE(review): the resolution text quotes the file extension, not the
+ * marker — confirm that wording is intended.
+ */
+export async function applyErrorFix(input: ErrorFixInput): Promise<SimulatorResult> {
+  const dropMarkedLines = (): { matched: number[]; ext: string } => {
+    const matched: number[] = [] // 1-based numbers of the removed lines
+    const kept: string[] = []
+    readFileSync(input.componentPath, 'utf8')
+      .split('\n')
+      .forEach((text, idx) => {
+        if (text.includes(input.marker)) matched.push(idx + 1)
+        else kept.push(text)
+      })
+    if (matched.length === 0) {
+      throw new Error(
+        `error_fix simulator: marker '${input.marker}' not found in ${input.componentPath}`,
+      )
+    }
+    writeFileSync(input.componentPath, kept.join('\n'))
+    return { matched, ext: extname(input.componentPath) }
+  }
+  return lockAndReview(input.taskId, input.port, dropMarkedLines, a => `Removed ${a.matched.length} line(s) marked '${a.ext}' tracer`)
+}
diff --git a/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts
new file mode 100644
index 0000000..c4da6f3
--- /dev/null
+++ b/playgrounds/stress-test/e2e/annotask/helpers/agent-loop/targets.ts
@@ -0,0 +1,71 @@
+/**
+ * Agent-loop e2e: per-MFE target file layout.
+ *
+ * Each target MFE has a dedicated test-only component plus a tracer
+ * stylesheet. Tests rewrite these files via the simulator, verify the
+ * iframe DOM picks up the change through Vite HMR, then restore the
+ * captured originals in `afterEach`.
+ */
+import { readFileSync, writeFileSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { fileURLToPath } from 'node:url'
+
+const __filename = fileURLToPath(import.meta.url) // module URL → file-system path of this module
+const __dirname = dirname(__filename)
+
+const REPO_ROOT = join(__dirname, '..', '..', '..', '..', '..', '..') // six directories up; the target paths below are joined onto it
+
+export interface AgentLoopApp { // one target MFE of the agent-loop suite
+  /** stress-test MFE id (matches `playgrounds/stress-test/e2e/annotask/fixtures/apps.ts`) */
+  id: 'react-workflows' | 'vue-data-lab'
+  /** Dev server port; the style_update spec navigates to `http://localhost:<port>/#agent-loop-target`. */
+  port: number
+  /** Human-readable framework label, used in metrics. */
+  framework: 'react+vite' | 'vue+vite'
+  /** Absolute path to the AgentLoopTarget component file. */
+  componentPath: string
+  /** Absolute path to the tracer stylesheet. */
+  cssPath: string
+}
+
+/**
+ * MFEs exercised by the agent-loop specs. Both file paths are resolved
+ * against REPO_ROOT; the specs snapshot them with `capture`, let the
+ * simulator rewrite them, and put them back with `restore`.
+ *
+ * NOTE(review): `port` presumably has to equal the port of the matching
+ * `APPS` fixture entry (the specs read both) — keep the two in sync.
+ */
+export const AGENT_LOOP_APPS: AgentLoopApp[] = [
+  // React + Vite MFE: the tracer component is a .tsx module, the
+  // stylesheet a plain .css file in the same src directory.
+  {
+    id: 'react-workflows',
+    port: 4210,
+    framework: 'react+vite',
+    componentPath: join(REPO_ROOT, 'playgrounds/stress-test/apps/mfe-react-workflows/src/AgentLoopTarget.tsx'),
+    cssPath: join(REPO_ROOT, 'playgrounds/stress-test/apps/mfe-react-workflows/src/agent-loop-target.css'),
+  },
+  // Vue + Vite MFE: same layout, but the tracer component is a
+  // single-file .vue component.
+  {
+    id: 'vue-data-lab',
+    port: 4220,
+    framework: 'vue+vite',
+    componentPath: join(REPO_ROOT, 'playgrounds/stress-test/apps/mfe-vue-data-lab/src/AgentLoopTarget.vue'),
+    cssPath: join(REPO_ROOT, 'playgrounds/stress-test/apps/mfe-vue-data-lab/src/agent-loop-target.css'),
+  },
+]
+
+export interface CapturedFile { // snapshot produced by `capture`, consumed by `restore`
+  path: string // file the contents were read from
+  contents: string // full file text, decoded as UTF-8
+}
+
+export function capture(paths: string[]): CapturedFile[] { // UTF-8 snapshot per path, in the order given
+  return paths.map(path => ({ path, contents: readFileSync(path, 'utf8') }))
+}
+
+export function restore(files: CapturedFile[]): void { // writes every snapshot back to its original path
+  files.forEach(({ path, contents }) => writeFileSync(path, contents))
+}