Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
8ef82ea
docs(plans): scope pi-ai migration spike (#1205)
christso May 2, 2026
ee9ccbd
chore: fix pre-existing import order in targets-validator
christso May 2, 2026
55f8d06
docs(plans): commit to Path B for pi-ai migration
christso May 2, 2026
3eaca6e
refactor(core): port OpenAI provider + rubric-generator to pi-ai (ste…
christso May 2, 2026
dba722e
fix(core): handle assistant/tool roles + safer baseUrl/cost in pi-ai …
christso May 2, 2026
a7d51f0
refactor(core): treat pi-ai as a normal dep — drop dynamic-import dance
christso May 3, 2026
ca907df
refactor(core): pre-resolve pi-ai Model in OpenAIProvider constructor
christso May 3, 2026
18236ac
fix(cli): declare @mariozechner/pi-ai as a runtime dep
christso May 3, 2026
6dbb106
feat(core): teach Provider.invoke about tools + multi-step
christso May 3, 2026
43f6076
refactor(core): migrate all grader consumers off asLanguageModel
christso May 3, 2026
c0fe2b2
refactor(core): drop ai-sdk entirely; all providers on pi-ai
christso May 3, 2026
5b31b75
chore(core): rename ai-sdk.ts → llm-providers.ts; add pi-ai-shim sync…
christso May 3, 2026
64aeab9
fix(core): root-cause pi-ai type resolution; delete shim
christso May 3, 2026
88586cb
chore(core): freshen comments + per-provider fallback metadata + cast…
christso May 3, 2026
14564a2
chore(core): simplify resolvePiModel fallback to universal 128K/16K
christso May 3, 2026
52275fc
docs: add MiMo targets to targets.yaml; document max_output_tokens fo…
christso May 3, 2026
32af2aa
docs: add MiMo direct API target with Bitwarden key; update targets.y…
christso May 4, 2026
55ab661
docs: remove bws references from targets.yaml templates
christso May 4, 2026
ced0b2b
chore(deps): bump @mariozechner/pi-ai ^0.62.0 → ^0.72.1
christso May 4, 2026
9f28c3f
docs: remove spike plan doc (content moved to #1205)
christso May 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,24 @@ targets:
provider: openrouter
api_key: ${{ OPENROUTER_API_KEY }}
model: ${{ OPENROUTER_MODEL }}

# ── MiMo (Xiaomi) via OpenRouter ───────────────────────────────────
- name: mimo
provider: openrouter
api_key: ${{ OPENROUTER_API_KEY }}
model: xiaomi/mimo-v2.5-pro
grader_target: grader

- name: mimo-flash
provider: openrouter
api_key: ${{ OPENROUTER_API_KEY }}
model: xiaomi/mimo-v2-flash
grader_target: grader

- name: mimo-direct
provider: openai
base_url: https://token-plan-sgp.xiaomimimo.com/v1
api_key: ${{ XIAOMI_MIMO_API_KEY }}
model: xiaomi/mimo-v2.5-pro
max_output_tokens: 131072
grader_target: grader
2 changes: 1 addition & 1 deletion apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@
"test:watch": "bun test --watch"
},
"dependencies": {
"@ai-sdk/openai": "^3.0.0",
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
"@github/copilot-sdk": "^0.1.25",
"@hono/node-server": "^1.19.11",
"@inquirer/prompts": "^8.2.1",
"@mariozechner/pi-ai": "^0.72.1",
"@openai/codex-sdk": "^0.104.0",
"cmd-ts": "^0.14.3",
"dotenv": "^16.4.5",
Expand Down
31 changes: 31 additions & 0 deletions apps/cli/src/templates/.agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,34 @@ targets:
cwd: ${{ CLI_EVALS_DIR }}
healthcheck:
command: uv run ./mock_cli.py --healthcheck

# ── MiMo (Xiaomi) via OpenRouter ───────────────────────────────────
# All MiMo models are available through OpenRouter via an OpenAI-compatible API.
# See https://openrouter.ai/xiaomi/mimo-v2.5-pro for pricing and limits.
#
# Models:
# mimo-v2.5-pro — 1M context, 131K output, flagship
# mimo-v2-pro — 1M context, ~131K output
# mimo-v2.5 — 1M context, ~131K output, multimodal
# mimo-v2-flash — 262K context, 65K output, fast MoE (open-source)
# mimo-v2-omni — 262K context, 65K output, omni-modal
- name: mimo
provider: openrouter
api_key: ${{ OPENROUTER_API_KEY }}
model: xiaomi/mimo-v2.5-pro

- name: mimo-flash
provider: openrouter
api_key: ${{ OPENROUTER_API_KEY }}
model: xiaomi/mimo-v2-flash

# ── Direct provider (not through OpenRouter) ───────────────────────
# For providers not in pi-ai's model registry, set max_output_tokens
# to match your model's actual output limit. Without it, the output limit
# defaults to 16K tokens, which may cap output below the model's capability.
# - name: mimo-direct
# provider: openai
# base_url: https://token-plan-sgp.xiaomimimo.com/v1
# api_key: ${{ XIAOMI_MIMO_API_KEY }}
# model: xiaomi/mimo-v2.5-pro
# max_output_tokens: 131072
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*
* Validates that reasoning tokens, cached tokens, duration, cost,
* and other metrics survive the JSONL → artifact conversion pipeline
* for: claude-cli, codex, copilot-cli, pi-coding-agent, and llm (ai-sdk).
* for: claude-cli, codex, copilot-cli, pi-coding-agent, and llm (pi-ai).
*/
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
import { existsSync, mkdtempSync, readFileSync, rmSync } from 'node:fs';
Expand Down
346 changes: 309 additions & 37 deletions bun.lock

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,5 @@
"tsup": "8.3.5",
"typescript": "5.8.3",
"yaml": "^2.8.3"
},
"dependencies": {
"@openrouter/ai-sdk-provider": "^2.3.3"
}
}
7 changes: 1 addition & 6 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,9 @@
"dependencies": {
"@agentclientprotocol/sdk": "^0.14.1",
"@agentv/eval": "workspace:*",
"@ai-sdk/anthropic": "^3.0.0",
"@ai-sdk/azure": "^3.0.0",
"@ai-sdk/google": "^3.0.0",
"@ai-sdk/openai": "^3.0.0",
"@github/copilot-sdk": "^0.1.25",
"@mariozechner/pi-ai": "^0.72.1",
"@openai/codex-sdk": "^0.104.0",
"@openrouter/ai-sdk-provider": "^2.3.1",
"ai": "^6.0.0",
"fast-glob": "^3.3.3",
"json5": "^2.2.3",
"micromatch": "^4.0.8",
Expand Down
19 changes: 9 additions & 10 deletions packages/core/src/evaluation/generators/rubric-generator.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { generateText } from 'ai';
import { z } from 'zod';

import type { Provider } from '../providers/types.js';
import { extractLastAssistantContent } from '../providers/types.js';
import type { RubricItem } from '../types.js';

const rubricItemSchema = z.object({
Expand All @@ -24,6 +24,10 @@ export interface GenerateRubricsOptions {

/**
* Generate rubrics from expected outcome using an LLM.
*
* Calls the provider through `Provider.invoke()` — each call is a single
* non-streaming, non-tool-using completion. The call and JSON parsing are
* attempted up to 3 times in total to absorb model formatting variance.
*/
export async function generateRubrics(
options: GenerateRubricsOptions,
Expand All @@ -32,11 +36,6 @@ export async function generateRubrics(

const prompt = buildPrompt(criteria, question, referenceAnswer);

const model = provider.asLanguageModel?.();
if (!model) {
throw new Error('Provider does not support language model interface');
}

const system = `You are an expert at creating evaluation rubrics.
You must return a valid JSON object matching this schema:
{
Expand All @@ -55,12 +54,12 @@ You must return a valid JSON object matching this schema:

for (let attempt = 1; attempt <= 3; attempt++) {
try {
const { text } = await generateText({
model,
system,
prompt,
const response = await provider.invoke({
question: prompt,
systemPrompt: system,
});

const text = extractLastAssistantContent(response.output);
const cleaned = text.replace(/```json\n?|```/g, '').trim();
result = rubricGenerationSchema.parse(JSON.parse(cleaned));
break;
Expand Down
26 changes: 0 additions & 26 deletions packages/core/src/evaluation/graders/composite.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import { generateText } from 'ai';

import { extractLastAssistantContent } from '../providers/types.js';
import type {
AssertionEntry,
Expand Down Expand Up @@ -340,30 +338,6 @@ export class CompositeGrader implements Grader {
};

try {
const model = graderProvider.asLanguageModel?.();
if (model) {
const { text } = await generateText({
model,
system: systemPrompt,
prompt: userPrompt,
});

const data = freeformEvaluationSchema.parse(parseJsonFromText(text));
const score = clampScore(data.score);
const assertions: AssertionEntry[] = Array.isArray(data.assertions)
? data.assertions.slice(0, 8)
: [];

return {
score,
verdict: scoreToVerdict(score),
assertions,
expectedAspectCount: Math.max(assertions.length, 1),
graderRawRequest,
scores,
};
}

const response = await graderProvider.invoke({
question: userPrompt,
systemPrompt,
Expand Down
Loading
Loading