From 0f43d8aa358754c5aa1a9593b1c4b440cb78171b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 2 May 2026 15:42:04 +0200 Subject: [PATCH 1/6] feat: add shell grader and workspace env preflight checks (#1207, #1208) Adds two new eval features: **Shell grader** (`type: shell`): runs a shell command and checks its stdout. - No `expected`: passes when exit code is 0 - `expected` with no `operator`: exact string match (trimmed stdout) - `expected` + `operator` (>, <, >=, <=, ==, !=): numeric float comparison **Workspace env preflight** (`workspace.env`): declares required system dependencies that are checked once before before_all hooks run. Fails fast with a clear diagnostic listing all missing commands/modules. Example: ```yaml workspace: env: required_commands: [ffmpeg, pandoc] required_python_modules: [PIL, openai] assertions: - type: shell command: "pdfinfo report.pdf | grep Pages | awk '{print $2}'" operator: ">=" expected: "5" ``` Closes #1207, #1208 Co-Authored-By: Claude Sonnet 4.6 --- packages/core/src/evaluation/graders/index.ts | 2 + packages/core/src/evaluation/graders/shell.ts | 104 ++++++++++++++++++ .../src/evaluation/loaders/grader-parser.ts | 43 ++++++++ packages/core/src/evaluation/orchestrator.ts | 56 ++++++++++ .../evaluation/registry/builtin-graders.ts | 3 + packages/core/src/evaluation/types.ts | 73 +++++++++++- packages/core/src/evaluation/yaml-parser.ts | 24 +++- .../test/evaluation/graders/shell.test.ts | 81 ++++++++++++++ .../evaluation/loaders/grader-parser.test.ts | 54 +++++++++ packages/eval/src/assertion.ts | 1 + 10 files changed, 439 insertions(+), 2 deletions(-) create mode 100644 packages/core/src/evaluation/graders/shell.ts create mode 100644 packages/core/test/evaluation/graders/shell.test.ts diff --git a/packages/core/src/evaluation/graders/index.ts b/packages/core/src/evaluation/graders/index.ts index 8a1a28800..171c394d1 100644 --- a/packages/core/src/evaluation/graders/index.ts +++ 
b/packages/core/src/evaluation/graders/index.ts @@ -58,6 +58,8 @@ export type { LlmGraderOptions } from './llm-grader.js'; export { formatToolCalls } from './format-tool-calls.js'; +export { ShellGrader } from './shell.js'; + export { SkillTriggerGrader } from './skill-trigger.js'; export { assembleLlmGraderPrompt } from './llm-grader-prompt.js'; diff --git a/packages/core/src/evaluation/graders/shell.ts b/packages/core/src/evaluation/graders/shell.ts new file mode 100644 index 000000000..b0ede2f5a --- /dev/null +++ b/packages/core/src/evaluation/graders/shell.ts @@ -0,0 +1,104 @@ +/** + * Shell grader: runs a shell command and checks its stdout. + * + * Pass/fail logic: + * - No `expected`: passes when exit code is 0. + * - `expected`, no `operator`: trims stdout and compares as exact string. + * - `expected` + `operator`: parses stdout and expected as floats, compares numerically. + * + * The command runs in the workspace directory when available. + * + * To add a new comparison operator: extend `ShellOperator` in types.ts and add a + * case to `compareNumeric` below. 
+ */ + +import { execShellWithStdin } from '../../runtime/exec.js'; +import type { ShellGraderConfig, ShellOperator } from '../types.js'; +import { scoreToVerdict } from './scoring.js'; +import type { EvaluationContext, EvaluationScore, Grader } from './types.js'; + +function compareNumeric(actual: number, operator: ShellOperator, expected: number): boolean { + switch (operator) { + case '>': + return actual > expected; + case '<': + return actual < expected; + case '>=': + return actual >= expected; + case '<=': + return actual <= expected; + case '==': + return actual === expected; + case '!=': + return actual !== expected; + } +} + +export class ShellGrader implements Grader { + readonly kind = 'shell'; + + constructor(private readonly config: ShellGraderConfig) {} + + async evaluate(context: EvaluationContext): Promise { + const { command, expected, operator } = this.config; + const cwd = context.workspacePath; + + let result: { stdout: string; stderr: string; exitCode: number }; + try { + result = await execShellWithStdin(command, '', { cwd }); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + return { + score: 0, + verdict: 'fail', + assertions: [{ text: `Shell command error: ${msg}`, passed: false }], + expectedAspectCount: 1, + }; + } + + const stdout = result.stdout.trim(); + + let passed: boolean; + let assertionText: string; + + if (expected === undefined) { + passed = result.exitCode === 0; + assertionText = passed + ? 
`Command exited with code 0` + : `Command exited with code ${result.exitCode}`; + } else if (operator !== undefined) { + const actualNum = parseFloat(stdout); + const expectedNum = parseFloat(expected); + if (Number.isNaN(actualNum) || Number.isNaN(expectedNum)) { + return { + score: 0, + verdict: 'fail', + assertions: [ + { + text: `Cannot compare numerically: stdout="${stdout}", expected="${expected}"`, + passed: false, + }, + ], + expectedAspectCount: 1, + }; + } + passed = compareNumeric(actualNum, operator, expectedNum); + assertionText = passed + ? `${actualNum} ${operator} ${expectedNum} (passed)` + : `${actualNum} ${operator} ${expectedNum} (failed)`; + } else { + passed = stdout === expected; + assertionText = passed + ? `stdout "${stdout}" equals expected "${expected}"` + : `stdout "${stdout}" does not equal expected "${expected}"`; + } + + const score = passed ? 1 : 0; + return { + score, + verdict: scoreToVerdict(score), + assertions: [{ text: assertionText, passed }], + expectedAspectCount: 1, + }; + } +} diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts index 7bdd9d01a..6fb8322db 100644 --- a/packages/core/src/evaluation/loaders/grader-parser.ts +++ b/packages/core/src/evaluation/loaders/grader-parser.ts @@ -1371,6 +1371,45 @@ async function parseGraderList( continue; } + if (typeValue === 'shell') { + const command = asString(rawEvaluator.command); + if (!command) { + logWarning(`Skipping shell evaluator '${name}' in '${evalId}': missing command`); + continue; + } + const expected = asString(rawEvaluator.expected); + const rawOperator = asString(rawEvaluator.operator); + const validOperators = ['>', '<', '>=', '<=', '==', '!='] as const; + const operator = validOperators.includes(rawOperator as (typeof validOperators)[number]) + ? 
(rawOperator as (typeof validOperators)[number]) + : undefined; + if (rawOperator && !operator) { + logWarning( + `Skipping shell evaluator '${name}' in '${evalId}': invalid operator "${rawOperator}". Valid: ${validOperators.join(', ')}`, + ); + continue; + } + const weight = validateWeight(rawEvaluator.weight, name, evalId); + const { required, min_score } = parseRequiredAndMinScore( + rawEvaluator.required, + (rawEvaluator as Record).min_score as JsonValue | undefined, + name, + evalId, + ); + evaluators.push({ + name, + type: 'shell', + command, + ...(expected !== undefined ? { expected } : {}), + ...(operator !== undefined ? { operator } : {}), + ...(weight !== undefined ? { weight } : {}), + ...(required !== undefined ? { required } : {}), + ...(min_score !== undefined ? { min_score } : {}), + ...(negate !== undefined ? { negate } : {}), + }); + continue; + } + const graderTarget = rawEvaluator.target; let graderTargetName: string | undefined; if (graderTarget !== undefined) { @@ -1734,6 +1773,10 @@ function generateAssertionName(typeValue: string, rawEvaluator: JsonObject): str return 'is-json'; case 'equals': return value ? `equals-${value}` : 'equals'; + case 'shell': { + const cmd = asString(rawEvaluator.command); + return cmd ? `shell-${cmd.slice(0, 30)}` : 'shell'; + } case 'rubrics': return 'rubrics'; default: diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 8d37c8b91..e3da97d6e 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -958,6 +958,20 @@ export async function runEvaluation( setupLog('Docker image pull complete'); } + // Run preflight environment checks (fail fast before any hooks or test cases) + if (suiteWorkspace?.env) { + try { + await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? undefined, setupLog); + setupLog('preflight checks passed'); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + if (sharedWorkspacePath && !useStaticWorkspace) { + await cleanupWorkspace(sharedWorkspacePath).catch(() => {}); + } + throw new Error(message); + } + } + // Execute before_all (runs ONCE before first test per workspace) const suiteHooksEnabled = hooksEnabled(suiteWorkspace); const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all; @@ -3924,3 +3938,45 @@ function computeWeightedMean( return totalWeight > 0 ? weightedSum / totalWeight : 0; } + +/** + * Run preflight environment checks for workspace.env config. + * Fails fast if any required command or Python module is missing. + * Called once before before_all hooks, so long evals abort immediately on missing deps. + */ +async function runPreflightChecks( + env: import('./types.js').WorkspaceEnvConfig, + cwd: string | undefined, + log: (msg: string) => void, +): Promise { + const execFileAsync = promisify(execFile); + const missing: string[] = []; + + for (const cmd of env.required_commands ?? []) { + log(`preflight: checking command "${cmd}"`); + try { + if (process.platform === 'win32') { + await execFileAsync('where', [cmd], { cwd }); + } else { + await execFileAsync('sh', ['-c', `command -v ${cmd}`], { cwd }); + } + } catch { + missing.push(`command: ${cmd}`); + } + } + + for (const mod of env.required_python_modules ?? 
[]) { + log(`preflight: checking Python module "${mod}"`); + try { + await execFileAsync('python3', ['-c', `import ${mod}`], { cwd }); + } catch { + missing.push(`python module: ${mod}`); + } + } + + if (missing.length > 0) { + throw new Error( + `Preflight checks failed — missing dependencies:\n${missing.map((m) => ` • ${m}`).join('\n')}\n\nInstall the missing dependencies before running this eval.`, + ); + } +} diff --git a/packages/core/src/evaluation/registry/builtin-graders.ts b/packages/core/src/evaluation/registry/builtin-graders.ts index b24eb20b0..d08f4a625 100644 --- a/packages/core/src/evaluation/registry/builtin-graders.ts +++ b/packages/core/src/evaluation/registry/builtin-graders.ts @@ -15,6 +15,7 @@ import { type Grader, LatencyGrader, LlmGrader, + ShellGrader, SkillTriggerGrader, TokenUsageGrader, ToolTrajectoryGrader, @@ -54,6 +55,7 @@ import type { LatencyGraderConfig, LlmGraderConfig, RegexGraderConfig, + ShellGraderConfig, SkillTriggerGraderConfig, StartsWithGraderConfig, TokenUsageGraderConfig, @@ -427,6 +429,7 @@ export function createBuiltinRegistry(): GraderRegistry { .register('regex', regexFactory) .register('is-json', isJsonFactory) .register('equals', equalsFactory) + .register('shell', (config) => new ShellGrader(config as ShellGraderConfig)) .register('inline-assert', (config) => { // biome-ignore lint/suspicious/noExplicitAny: symbol key access requires any const fn = (config as any)[INLINE_ASSERT_FN] as diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 53828126e..e30ddbf6b 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -188,6 +188,7 @@ const GRADER_KIND_VALUES = [ 'equals', 'rubrics', 'inline-assert', + 'shell', ] as const; export type GraderKind = (typeof GRADER_KIND_VALUES)[number]; @@ -339,6 +340,25 @@ export type DockerWorkspaceConfig = { readonly cpus?: number; }; +/** + * Preflight environment requirements for the workspace. 
+ * Checked once before before_all hooks run. Fails fast if anything is missing. + * + * @example + * ```yaml + * workspace: + * env: + * required_commands: [ffmpeg, pandoc] + * required_python_modules: [PIL, openai] + * ``` + */ +export type WorkspaceEnvConfig = { + /** Shell commands that must be present in PATH (checked via `command -v`) */ + readonly required_commands?: readonly string[]; + /** Python modules that must be importable (checked via `python3 -c "import <module>"`) */ + readonly required_python_modules?: readonly string[]; +}; + export type WorkspaceConfig = { /** Template directory or .code-workspace file. Directories are copied to temp workspace. * .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */ @@ -359,6 +379,8 @@ export type WorkspaceConfig = { * Used as default cwd for hook commands so that file-referenced templates resolve * relative paths from their own directory, not the eval file's directory. */ readonly workspaceFileDir?: string; + /** Preflight environment requirements. Checked before before_all hooks run. */ + readonly env?: WorkspaceEnvConfig; }; export type CodeGraderConfig = { @@ -868,6 +890,54 @@ export type InlineAssertEvaluatorConfig = { readonly negate?: boolean; }; +/** + * Numeric comparison operators for shell assertions. + * When set, the shell command stdout is compared numerically against `expected`. + */ +export type ShellOperator = '>' | '<' | '>=' | '<=' | '==' | '!='; + +/** + * Configuration for the shell grader. + * Runs a shell command and checks its stdout against an expected value. + * + * - With no `expected`: passes when the command exits with code 0. + * - With `expected` and no `operator`: trims stdout and compares exactly. + * - With `expected` and `operator`: parses both sides as floats and compares numerically. + * + * The command runs in the workspace directory (if available). 
+ * + * @example Exact match + * ```yaml + * - type: shell + * command: "wc -l output.txt | awk '{print $1}'" + * expected: "42" + * ``` + * + * @example Numeric comparison + * ```yaml + * - type: shell + * command: "pdfinfo report.pdf | grep Pages | awk '{print $2}'" + * operator: ">=" + * expected: "5" + * ``` + */ +export type ShellGraderConfig = { + readonly name: string; + readonly type: 'shell'; + /** Shell command to execute */ + readonly command: string; + /** Expected stdout value. If omitted, only the exit code (0 = pass) is checked. */ + readonly expected?: string; + /** Numeric comparison operator. When set, both stdout and expected are parsed as floats. */ + readonly operator?: ShellOperator; + readonly weight?: number; + readonly required?: boolean | number; + /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ + readonly min_score?: number; + /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ + readonly negate?: boolean; +}; + export type GraderConfig = | CodeGraderConfig | LlmGraderConfig @@ -891,7 +961,8 @@ export type GraderConfig = | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig - | InlineAssertEvaluatorConfig; + | InlineAssertEvaluatorConfig + | ShellGraderConfig; /** * A single turn in a multi-turn conversation evaluation. diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 8aa397c4c..ba25e993a 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -54,6 +54,7 @@ import type { TrialsConfig, TurnFailurePolicy, WorkspaceConfig, + WorkspaceEnvConfig, WorkspaceHookConfig, WorkspaceHooksConfig, WorkspaceScriptConfig, @@ -853,8 +854,9 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi const mode = explicitMode ?? (workspacePath ? 
'static' : undefined); const docker = parseDockerWorkspaceConfig(obj.docker); + const env = parseWorkspaceEnvConfig(obj.env); - if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker) + if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env) return undefined; return { @@ -865,6 +867,26 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi ...(mode !== undefined && { mode }), ...(workspacePath !== undefined && { path: workspacePath }), ...(docker !== undefined && { docker }), + ...(env !== undefined && { env }), + }; +} + +function parseWorkspaceEnvConfig(raw: unknown): WorkspaceEnvConfig | undefined { + if (!isJsonObject(raw)) return undefined; + const obj = raw as Record; + + const required_commands = Array.isArray(obj.required_commands) + ? (obj.required_commands.filter((c) => typeof c === 'string') as string[]) + : undefined; + const required_python_modules = Array.isArray(obj.required_python_modules) + ? 
(obj.required_python_modules.filter((m) => typeof m === 'string') as string[]) + : undefined; + + if (!required_commands?.length && !required_python_modules?.length) return undefined; + + return { + ...(required_commands?.length && { required_commands }), + ...(required_python_modules?.length && { required_python_modules }), }; } diff --git a/packages/core/test/evaluation/graders/shell.test.ts b/packages/core/test/evaluation/graders/shell.test.ts new file mode 100644 index 000000000..6862e9831 --- /dev/null +++ b/packages/core/test/evaluation/graders/shell.test.ts @@ -0,0 +1,81 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { ShellGraderConfig } from '../../../src/evaluation/types.js'; +import { ShellGrader } from '../../../src/evaluation/graders/shell.js'; +import type { EvaluationContext } from '../../../src/evaluation/graders/types.js'; + +const mockContext = (workspacePath?: string): EvaluationContext => + ({ + candidate: '', + workspacePath, + evalCase: { id: 'test', input: [] }, + }) as unknown as EvaluationContext; + +const grader = (extra: Partial = {}) => + new ShellGrader({ name: 'test-shell', type: 'shell', command: 'echo 5', ...extra }); + +describe('ShellGrader', () => { + it('passes when command exits 0 and no expected', async () => { + const result = await grader({ command: 'true' }).evaluate(mockContext()); + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails when command exits non-zero and no expected', async () => { + const result = await grader({ command: 'false' }).evaluate(mockContext()); + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + }); + + it('passes on exact string match', async () => { + const result = await grader({ command: 'echo hello', expected: 'hello' }).evaluate( + mockContext(), + ); + expect(result.score).toBe(1); + }); + + it('fails on string mismatch', async () => { + const result = await grader({ command: 'echo hello', expected: 'world' 
}).evaluate( + mockContext(), + ); + expect(result.score).toBe(0); + expect(result.assertions[0].text).toContain('does not equal'); + }); + + it.each([ + ['>', 10, 5, 1], + ['>', 5, 10, 0], + ['<', 3, 10, 1], + ['>=', 5, 5, 1], + ['<=', 5, 5, 1], + ['==', 7, 7, 1], + ['!=', 7, 5, 1], + ['!=', 5, 5, 0], + ] as const)( + 'numeric operator %s: actual=%d, expected=%d → score=%d', + async (op, actual, expected, score) => { + const result = await grader({ + command: `echo ${actual}`, + operator: op, + expected: String(expected), + }).evaluate(mockContext()); + expect(result.score).toBe(score); + }, + ); + + it('fails with clear message when stdout is not a number for numeric comparison', async () => { + const result = await grader({ + command: 'echo notanumber', + operator: '>=', + expected: '5', + }).evaluate(mockContext()); + expect(result.score).toBe(0); + expect(result.assertions[0].text).toContain('Cannot compare numerically'); + }); + + it('returns score 0 when command errors', async () => { + const result = await grader({ command: 'nonexistent_command_xyz_abc_987' }).evaluate( + mockContext(), + ); + expect(result.score).toBe(0); + }); +}); diff --git a/packages/core/test/evaluation/loaders/grader-parser.test.ts b/packages/core/test/evaluation/loaders/grader-parser.test.ts index de1984798..8e75db73c 100644 --- a/packages/core/test/evaluation/loaders/grader-parser.test.ts +++ b/packages/core/test/evaluation/loaders/grader-parser.test.ts @@ -14,6 +14,7 @@ import type { LatencyGraderConfig, LlmGraderConfig, RegexGraderConfig, + ShellGraderConfig, } from '../../../src/evaluation/types.js'; describe('parseGraders - deterministic assertion types', () => { @@ -211,6 +212,59 @@ describe('parseGraders - deterministic assertion types', () => { expect(evaluators).toBeUndefined(); }); + it('parses type: shell with command only', async () => { + const evaluators = await parseGraders( + { evaluators: [{ name: 'check-exit', type: 'shell', command: 'echo hi' }] }, + undefined, + 
[tempDir], + 'test-1', + ); + expect(evaluators).toHaveLength(1); + const config = evaluators?.[0] as ShellGraderConfig; + expect(config.type).toBe('shell'); + expect(config.command).toBe('echo hi'); + expect(config.expected).toBeUndefined(); + expect(config.operator).toBeUndefined(); + }); + + it('parses type: shell with expected and operator', async () => { + const evaluators = await parseGraders( + { + evaluators: [ + { name: 'check-pages', type: 'shell', command: 'echo 14', expected: '5', operator: '>=' }, + ], + }, + undefined, + [tempDir], + 'test-1', + ); + expect(evaluators).toHaveLength(1); + const config = evaluators?.[0] as ShellGraderConfig; + expect(config.type).toBe('shell'); + expect(config.expected).toBe('5'); + expect(config.operator).toBe('>='); + }); + + it('skips shell evaluator with invalid operator', async () => { + const evaluators = await parseGraders( + { evaluators: [{ name: 'bad-op', type: 'shell', command: 'echo 1', operator: '???' }] }, + undefined, + [tempDir], + 'test-1', + ); + expect(evaluators).toBeUndefined(); + }); + + it('skips shell evaluator with missing command', async () => { + const evaluators = await parseGraders( + { evaluators: [{ name: 'no-cmd', type: 'shell' }] }, + undefined, + [tempDir], + 'test-1', + ); + expect(evaluators).toBeUndefined(); + }); + it('parses type: rubrics with criteria as llm-grader', async () => { const evaluators = await parseGraders( { diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index 1d654f329..658a31488 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -59,6 +59,7 @@ export type AssertionType = | 'equals' | 'regex' | 'is-json' + | 'shell' // legacy snake_case aliases (still accepted) | 'llm_grader' | 'code_grader' From 6c63a36be51f855564435987edf26b262a3ccd42 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 2 May 2026 15:45:04 +0200 Subject: [PATCH 2/6] fix: resolve lint errors in shell grader and targets-validator 
imports Co-Authored-By: Claude Sonnet 4.6 --- packages/core/src/evaluation/graders/shell.ts | 6 +++--- .../core/src/evaluation/validation/targets-validator.ts | 2 +- packages/core/test/evaluation/graders/shell.test.ts | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/core/src/evaluation/graders/shell.ts b/packages/core/src/evaluation/graders/shell.ts index b0ede2f5a..27673033b 100644 --- a/packages/core/src/evaluation/graders/shell.ts +++ b/packages/core/src/evaluation/graders/shell.ts @@ -64,11 +64,11 @@ export class ShellGrader implements Grader { if (expected === undefined) { passed = result.exitCode === 0; assertionText = passed - ? `Command exited with code 0` + ? 'Command exited with code 0' : `Command exited with code ${result.exitCode}`; } else if (operator !== undefined) { - const actualNum = parseFloat(stdout); - const expectedNum = parseFloat(expected); + const actualNum = Number.parseFloat(stdout); + const expectedNum = Number.parseFloat(expected); if (Number.isNaN(actualNum) || Number.isNaN(expectedNum)) { return { score: 0, diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index 7d9fc74f8..524b66548 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -1,12 +1,12 @@ import { readFile } from 'node:fs/promises'; import path from 'node:path'; +import { interpolateEnv } from '../interpolation.js'; import { CLI_PLACEHOLDERS, COMMON_TARGET_SETTINGS, findDeprecatedCamelCaseTargetWarnings, } from '../providers/targets.js'; -import { interpolateEnv } from '../interpolation.js'; import { KNOWN_PROVIDERS, PROVIDER_ALIASES } from '../providers/types.js'; import { parseYamlValue } from '../yaml-loader.js'; import type { ValidationError, ValidationResult } from './types.js'; diff --git a/packages/core/test/evaluation/graders/shell.test.ts 
b/packages/core/test/evaluation/graders/shell.test.ts index 6862e9831..5d977e2ef 100644 --- a/packages/core/test/evaluation/graders/shell.test.ts +++ b/packages/core/test/evaluation/graders/shell.test.ts @@ -1,7 +1,7 @@ -import { describe, expect, it, vi } from 'vitest'; -import type { ShellGraderConfig } from '../../../src/evaluation/types.js'; +import { describe, expect, it } from 'vitest'; import { ShellGrader } from '../../../src/evaluation/graders/shell.js'; import type { EvaluationContext } from '../../../src/evaluation/graders/types.js'; +import type { ShellGraderConfig } from '../../../src/evaluation/types.js'; const mockContext = (workspacePath?: string): EvaluationContext => ({ From 49989f24c47a11931d72bc447825ba5f7e03ebe8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 3 May 2026 06:21:18 +0200 Subject: [PATCH 3/6] refactor: replace shell grader with code-grader plain-text fallback (#1210) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per design review: the `shell` grader type violated the "audit existing primitives first" principle — `code-grader` already runs shell commands. Promptfoo solves this the same way (javascript/python fallbacks, no dedicated shell type). Remove the `shell` grader type entirely and instead extend `code-grader` to accept plain-text stdout without requiring the JSON protocol: | stdout (trimmed, case-insensitive) | score | |---|---| | empty string | 1 if exit 0, 0 if exit non-zero | | "true", "pass", "1" | 1 | | "false", "fail", "0" | 0 | | numeric string | clamped float | | anything else | 1 if exit 0, 0 if exit non-zero | Scripts that write to stderr on non-zero exit still surface as errors (existing behavior). Silent non-zero exits (e.g. `[ "$pages" -ge 5 ]`) use exit-code convention. 
Usage: # numeric comparison via exit code - type: code-grader command: ["bash", "-c", "[ $(pdfinfo report.pdf | grep Pages | awk '{print $2}') -ge 5 ]"] # score from stdout - type: code-grader command: ["bash", "-c", "echo 0.75"] Closes #1210 Co-Authored-By: Claude Sonnet 4.6 --- .../src/evaluation/graders/code-grader.ts | 93 +++++++++++++--- packages/core/src/evaluation/graders/index.ts | 2 - packages/core/src/evaluation/graders/shell.ts | 104 ------------------ .../src/evaluation/loaders/grader-parser.ts | 43 -------- .../evaluation/registry/builtin-graders.ts | 3 - packages/core/src/evaluation/types.ts | 52 +-------- .../graders/code-grader-plain-text.test.ts | 86 +++++++++++++++ .../test/evaluation/graders/shell.test.ts | 81 -------------- .../evaluation/loaders/grader-parser.test.ts | 54 --------- packages/eval/src/assertion.ts | 1 - 10 files changed, 164 insertions(+), 355 deletions(-) delete mode 100644 packages/core/src/evaluation/graders/shell.ts create mode 100644 packages/core/test/evaluation/graders/code-grader-plain-text.test.ts delete mode 100644 packages/core/test/evaluation/graders/shell.test.ts diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 3895a2ffb..20166e533 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -212,6 +212,8 @@ export class CodeGrader implements Grader { try { let stdout: string; + let exitCode = 0; + let execStderr = ''; if (context.dockerConfig) { // Docker execution mode: run grader inside a container const { DockerWorkspaceProvider } = await import('../workspace/docker-workspace.js'); @@ -221,27 +223,49 @@ export class CodeGrader implements Grader { stdin: inputPayload, repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos), }); - if (result.exitCode !== 0) { - const trimmedErr = result.stderr.trim(); - throw new Error( - trimmedErr.length > 0 - ? 
`Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` - : `Code evaluator exited with code ${result.exitCode}`, - ); - } + exitCode = result.exitCode; stdout = result.stdout.trim(); + execStderr = result.stderr; } else { - stdout = await executeScript( + const result = await runScriptRaw( this.command, inputPayload, this.agentTimeoutMs, this.cwd, env, ); + exitCode = result.exitCode; + stdout = result.stdout.trim(); + execStderr = result.stderr; } - const parsed = parseJsonSafe(stdout); - const score = clampScore(typeof parsed?.score === 'number' ? parsed.score : 0); - const assertions: AssertionEntry[] = Array.isArray(parsed?.assertions) + // Non-zero exit with JSON stdout, or with stderr output, is treated as an error + // (script signaled failure through the protocol or wrote an error message). + // Non-zero exit with plain stdout and no stderr uses the exit-code convention + // (score 0 = fail) — so one-liners like `[ "$pages" -ge 5 ]` work cleanly. + const looksLikeJson = stdout.startsWith('{') || stdout.startsWith('['); + const hasStderr = execStderr.trim().length > 0; + if (exitCode !== 0 && (looksLikeJson || hasStderr)) { + const trimmedErr = formatStderr(execStderr); + throw new Error( + trimmedErr.length > 0 + ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` + : `Code evaluator exited with code ${exitCode}`, + ); + } + const rawParsed = parseJsonSafe(stdout); + // Only treat stdout as the JSON protocol if it parsed as an object (not a bare + // boolean, number, or string). Plain scalars fall through to parsePlainScore. + const parsed = + rawParsed != null && typeof rawParsed === 'object' && !Array.isArray(rawParsed) + ? rawParsed + : undefined; + // Plain-text fallback: when stdout is not a JSON object, interpret as a simple score. + // Supports exit-code convention (empty stdout = pass/fail by exit code), boolean + // strings, and numeric scores — so short shell one-liners work without JSON protocol. 
+ const score = parsed != null + ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) + : parsePlainScore(stdout, exitCode); + const assertions: AssertionEntry[] = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions .filter( (a: unknown): a is { text: string; passed: boolean; evidence?: string } => @@ -325,6 +349,19 @@ export class CodeGrader implements Grader { } } +/** Run a script and return raw stdout/stderr/exitCode without throwing. */ +async function runScriptRaw( + scriptPath: readonly string[] | string, + input: string, + agentTimeoutMs?: number, + cwd?: string, + env?: Record, +): Promise<{ stdout: string; stderr: string; exitCode: number }> { + return typeof scriptPath === 'string' + ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) + : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }); +} + export async function executeScript( scriptPath: readonly string[] | string, input: string, @@ -332,10 +369,13 @@ export async function executeScript( cwd?: string, env?: Record, ): Promise { - const { stdout, stderr, exitCode } = - typeof scriptPath === 'string' - ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) - : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }); + const { stdout, stderr, exitCode } = await runScriptRaw( + scriptPath, + input, + agentTimeoutMs, + cwd, + env, + ); if (exitCode !== 0) { const trimmedErr = formatStderr(stderr); @@ -349,6 +389,27 @@ export async function executeScript( return stdout.trim(); } +/** + * Interpret plain-text (non-JSON) stdout as a score. 
+ * + * | stdout (trimmed, lowercase) | score | + * |---|---| + * | empty string | 1 if exit 0, 0 if exit non-zero | + * | "true", "pass", "1" | 1 | + * | "false", "fail", "0" | 0 | + * | numeric string | clamped float | + * | anything else | 1 if exit 0, 0 if exit non-zero | + */ +function parsePlainScore(stdout: string, exitCode: number): number { + const t = stdout.trim().toLowerCase(); + if (t === '' || t === 'true' || t === 'pass') return exitCode === 0 ? 1 : 0; + if (t === '1') return 1; + if (t === 'false' || t === 'fail' || t === '0') return 0; + const n = Number(t); + if (!Number.isNaN(n)) return clampScore(n); + return exitCode === 0 ? 1 : 0; +} + function formatStderr(stderr: string): string { const trimmed = stderr.trim(); const maxLength = 2000; diff --git a/packages/core/src/evaluation/graders/index.ts b/packages/core/src/evaluation/graders/index.ts index 171c394d1..8a1a28800 100644 --- a/packages/core/src/evaluation/graders/index.ts +++ b/packages/core/src/evaluation/graders/index.ts @@ -58,8 +58,6 @@ export type { LlmGraderOptions } from './llm-grader.js'; export { formatToolCalls } from './format-tool-calls.js'; -export { ShellGrader } from './shell.js'; - export { SkillTriggerGrader } from './skill-trigger.js'; export { assembleLlmGraderPrompt } from './llm-grader-prompt.js'; diff --git a/packages/core/src/evaluation/graders/shell.ts b/packages/core/src/evaluation/graders/shell.ts deleted file mode 100644 index 27673033b..000000000 --- a/packages/core/src/evaluation/graders/shell.ts +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Shell grader: runs a shell command and checks its stdout. - * - * Pass/fail logic: - * - No `expected`: passes when exit code is 0. - * - `expected`, no `operator`: trims stdout and compares as exact string. - * - `expected` + `operator`: parses stdout and expected as floats, compares numerically. - * - * The command runs in the workspace directory when available. 
- * - * To add a new comparison operator: extend `ShellOperator` in types.ts and add a - * case to `compareNumeric` below. - */ - -import { execShellWithStdin } from '../../runtime/exec.js'; -import type { ShellGraderConfig, ShellOperator } from '../types.js'; -import { scoreToVerdict } from './scoring.js'; -import type { EvaluationContext, EvaluationScore, Grader } from './types.js'; - -function compareNumeric(actual: number, operator: ShellOperator, expected: number): boolean { - switch (operator) { - case '>': - return actual > expected; - case '<': - return actual < expected; - case '>=': - return actual >= expected; - case '<=': - return actual <= expected; - case '==': - return actual === expected; - case '!=': - return actual !== expected; - } -} - -export class ShellGrader implements Grader { - readonly kind = 'shell'; - - constructor(private readonly config: ShellGraderConfig) {} - - async evaluate(context: EvaluationContext): Promise { - const { command, expected, operator } = this.config; - const cwd = context.workspacePath; - - let result: { stdout: string; stderr: string; exitCode: number }; - try { - result = await execShellWithStdin(command, '', { cwd }); - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - return { - score: 0, - verdict: 'fail', - assertions: [{ text: `Shell command error: ${msg}`, passed: false }], - expectedAspectCount: 1, - }; - } - - const stdout = result.stdout.trim(); - - let passed: boolean; - let assertionText: string; - - if (expected === undefined) { - passed = result.exitCode === 0; - assertionText = passed - ? 
'Command exited with code 0' - : `Command exited with code ${result.exitCode}`; - } else if (operator !== undefined) { - const actualNum = Number.parseFloat(stdout); - const expectedNum = Number.parseFloat(expected); - if (Number.isNaN(actualNum) || Number.isNaN(expectedNum)) { - return { - score: 0, - verdict: 'fail', - assertions: [ - { - text: `Cannot compare numerically: stdout="${stdout}", expected="${expected}"`, - passed: false, - }, - ], - expectedAspectCount: 1, - }; - } - passed = compareNumeric(actualNum, operator, expectedNum); - assertionText = passed - ? `${actualNum} ${operator} ${expectedNum} (passed)` - : `${actualNum} ${operator} ${expectedNum} (failed)`; - } else { - passed = stdout === expected; - assertionText = passed - ? `stdout "${stdout}" equals expected "${expected}"` - : `stdout "${stdout}" does not equal expected "${expected}"`; - } - - const score = passed ? 1 : 0; - return { - score, - verdict: scoreToVerdict(score), - assertions: [{ text: assertionText, passed }], - expectedAspectCount: 1, - }; - } -} diff --git a/packages/core/src/evaluation/loaders/grader-parser.ts b/packages/core/src/evaluation/loaders/grader-parser.ts index 6fb8322db..7bdd9d01a 100644 --- a/packages/core/src/evaluation/loaders/grader-parser.ts +++ b/packages/core/src/evaluation/loaders/grader-parser.ts @@ -1371,45 +1371,6 @@ async function parseGraderList( continue; } - if (typeValue === 'shell') { - const command = asString(rawEvaluator.command); - if (!command) { - logWarning(`Skipping shell evaluator '${name}' in '${evalId}': missing command`); - continue; - } - const expected = asString(rawEvaluator.expected); - const rawOperator = asString(rawEvaluator.operator); - const validOperators = ['>', '<', '>=', '<=', '==', '!='] as const; - const operator = validOperators.includes(rawOperator as (typeof validOperators)[number]) - ? 
(rawOperator as (typeof validOperators)[number]) - : undefined; - if (rawOperator && !operator) { - logWarning( - `Skipping shell evaluator '${name}' in '${evalId}': invalid operator "${rawOperator}". Valid: ${validOperators.join(', ')}`, - ); - continue; - } - const weight = validateWeight(rawEvaluator.weight, name, evalId); - const { required, min_score } = parseRequiredAndMinScore( - rawEvaluator.required, - (rawEvaluator as Record).min_score as JsonValue | undefined, - name, - evalId, - ); - evaluators.push({ - name, - type: 'shell', - command, - ...(expected !== undefined ? { expected } : {}), - ...(operator !== undefined ? { operator } : {}), - ...(weight !== undefined ? { weight } : {}), - ...(required !== undefined ? { required } : {}), - ...(min_score !== undefined ? { min_score } : {}), - ...(negate !== undefined ? { negate } : {}), - }); - continue; - } - const graderTarget = rawEvaluator.target; let graderTargetName: string | undefined; if (graderTarget !== undefined) { @@ -1773,10 +1734,6 @@ function generateAssertionName(typeValue: string, rawEvaluator: JsonObject): str return 'is-json'; case 'equals': return value ? `equals-${value}` : 'equals'; - case 'shell': { - const cmd = asString(rawEvaluator.command); - return cmd ? 
`shell-${cmd.slice(0, 30)}` : 'shell'; - } case 'rubrics': return 'rubrics'; default: diff --git a/packages/core/src/evaluation/registry/builtin-graders.ts b/packages/core/src/evaluation/registry/builtin-graders.ts index d08f4a625..b24eb20b0 100644 --- a/packages/core/src/evaluation/registry/builtin-graders.ts +++ b/packages/core/src/evaluation/registry/builtin-graders.ts @@ -15,7 +15,6 @@ import { type Grader, LatencyGrader, LlmGrader, - ShellGrader, SkillTriggerGrader, TokenUsageGrader, ToolTrajectoryGrader, @@ -55,7 +54,6 @@ import type { LatencyGraderConfig, LlmGraderConfig, RegexGraderConfig, - ShellGraderConfig, SkillTriggerGraderConfig, StartsWithGraderConfig, TokenUsageGraderConfig, @@ -429,7 +427,6 @@ export function createBuiltinRegistry(): GraderRegistry { .register('regex', regexFactory) .register('is-json', isJsonFactory) .register('equals', equalsFactory) - .register('shell', (config) => new ShellGrader(config as ShellGraderConfig)) .register('inline-assert', (config) => { // biome-ignore lint/suspicious/noExplicitAny: symbol key access requires any const fn = (config as any)[INLINE_ASSERT_FN] as diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index e30ddbf6b..2027810a8 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -188,7 +188,6 @@ const GRADER_KIND_VALUES = [ 'equals', 'rubrics', 'inline-assert', - 'shell', ] as const; export type GraderKind = (typeof GRADER_KIND_VALUES)[number]; @@ -890,54 +889,6 @@ export type InlineAssertEvaluatorConfig = { readonly negate?: boolean; }; -/** - * Numeric comparison operators for shell assertions. - * When set, the shell command stdout is compared numerically against `expected`. - */ -export type ShellOperator = '>' | '<' | '>=' | '<=' | '==' | '!='; - -/** - * Configuration for the shell grader. - * Runs a shell command and checks its stdout against an expected value. 
- * - * - With no `expected`: passes when the command exits with code 0. - * - With `expected` and no `operator`: trims stdout and compares exactly. - * - With `expected` and `operator`: parses both sides as floats and compares numerically. - * - * The command runs in the workspace directory (if available). - * - * @example Exact match - * ```yaml - * - type: shell - * command: "wc -l output.txt | awk '{print $1}'" - * expected: "42" - * ``` - * - * @example Numeric comparison - * ```yaml - * - type: shell - * command: "pdfinfo report.pdf | grep Pages | awk '{print $2}'" - * operator: ">=" - * expected: "5" - * ``` - */ -export type ShellGraderConfig = { - readonly name: string; - readonly type: 'shell'; - /** Shell command to execute */ - readonly command: string; - /** Expected stdout value. If omitted, only the exit code (0 = pass) is checked. */ - readonly expected?: string; - /** Numeric comparison operator. When set, both stdout and expected are parsed as floats. */ - readonly operator?: ShellOperator; - readonly weight?: number; - readonly required?: boolean | number; - /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */ - readonly min_score?: number; - /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */ - readonly negate?: boolean; -}; - export type GraderConfig = | CodeGraderConfig | LlmGraderConfig @@ -961,8 +912,7 @@ export type GraderConfig = | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig - | InlineAssertEvaluatorConfig - | ShellGraderConfig; + | InlineAssertEvaluatorConfig; /** * A single turn in a multi-turn conversation evaluation. 
diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts new file mode 100644 index 000000000..09c79eb42 --- /dev/null +++ b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts @@ -0,0 +1,86 @@ +/** + * Tests for code-grader plain-text fallback. + * + * When a code-grader script emits non-JSON stdout, the grader interprets it + * as a simple score instead of requiring the full JSON protocol. This lets + * shell one-liners work without a JSON wrapper. + */ + +import { describe, expect, it } from 'vitest'; +import { CodeGrader } from '../../../src/evaluation/graders/code-grader.js'; +import type { EvaluationContext } from '../../../src/evaluation/graders/types.js'; + +const ctx = { candidate: '', evalCase: { id: 'test', input: [] } } as unknown as EvaluationContext; + +const grader = (cmd: string) => + new CodeGrader({ + command: ['bash', '-c', cmd], + agentTimeoutMs: 10_000, + }); + +describe('code-grader plain-text fallback', () => { + it('exit 0 with empty stdout → score 1', async () => { + const result = await grader('true').evaluate(ctx); + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('exit 1 with empty stdout → score 0', async () => { + const result = await grader('false').evaluate(ctx); + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + }); + + it('stdout "PASS" → score 1', async () => { + const result = await grader('echo PASS').evaluate(ctx); + expect(result.score).toBe(1); + }); + + it('stdout "FAIL" → score 0', async () => { + const result = await grader('echo FAIL').evaluate(ctx); + expect(result.score).toBe(0); + }); + + it('stdout "true" → score 1', async () => { + const result = await grader('echo true').evaluate(ctx); + expect(result.score).toBe(1); + }); + + it('stdout "false" → score 0', async () => { + const result = await grader('echo false').evaluate(ctx); + 
expect(result.score).toBe(0); + }); + + it('stdout numeric string → score as float', async () => { + const result = await grader('echo 0.75').evaluate(ctx); + expect(result.score).toBe(0.75); + }); + + it('stdout numeric "1" → score 1', async () => { + const result = await grader('echo 1').evaluate(ctx); + expect(result.score).toBe(1); + }); + + it('stdout numeric "0" → score 0', async () => { + const result = await grader('echo 0').evaluate(ctx); + expect(result.score).toBe(0); + }); + + it('exit-code numeric comparison: [ 14 -ge 5 ] → score 1', async () => { + const result = await grader('pages=14; [ "$pages" -ge 5 ]').evaluate(ctx); + expect(result.score).toBe(1); + }); + + it('exit-code numeric comparison: [ 3 -ge 10 ] → score 0', async () => { + const result = await grader('pages=3; [ "$pages" -ge 10 ]').evaluate(ctx); + expect(result.score).toBe(0); + }); + + it('JSON protocol still works (score from JSON)', async () => { + const result = await grader( + `echo '{"score":0.6,"assertions":[{"text":"ok","passed":true}]}'`, + ).evaluate(ctx); + expect(result.score).toBe(0.6); + expect(result.assertions).toHaveLength(1); + }); +}); diff --git a/packages/core/test/evaluation/graders/shell.test.ts b/packages/core/test/evaluation/graders/shell.test.ts deleted file mode 100644 index 5d977e2ef..000000000 --- a/packages/core/test/evaluation/graders/shell.test.ts +++ /dev/null @@ -1,81 +0,0 @@ -import { describe, expect, it } from 'vitest'; -import { ShellGrader } from '../../../src/evaluation/graders/shell.js'; -import type { EvaluationContext } from '../../../src/evaluation/graders/types.js'; -import type { ShellGraderConfig } from '../../../src/evaluation/types.js'; - -const mockContext = (workspacePath?: string): EvaluationContext => - ({ - candidate: '', - workspacePath, - evalCase: { id: 'test', input: [] }, - }) as unknown as EvaluationContext; - -const grader = (extra: Partial = {}) => - new ShellGrader({ name: 'test-shell', type: 'shell', command: 'echo 5', 
...extra }); - -describe('ShellGrader', () => { - it('passes when command exits 0 and no expected', async () => { - const result = await grader({ command: 'true' }).evaluate(mockContext()); - expect(result.score).toBe(1); - expect(result.verdict).toBe('pass'); - }); - - it('fails when command exits non-zero and no expected', async () => { - const result = await grader({ command: 'false' }).evaluate(mockContext()); - expect(result.score).toBe(0); - expect(result.verdict).toBe('fail'); - }); - - it('passes on exact string match', async () => { - const result = await grader({ command: 'echo hello', expected: 'hello' }).evaluate( - mockContext(), - ); - expect(result.score).toBe(1); - }); - - it('fails on string mismatch', async () => { - const result = await grader({ command: 'echo hello', expected: 'world' }).evaluate( - mockContext(), - ); - expect(result.score).toBe(0); - expect(result.assertions[0].text).toContain('does not equal'); - }); - - it.each([ - ['>', 10, 5, 1], - ['>', 5, 10, 0], - ['<', 3, 10, 1], - ['>=', 5, 5, 1], - ['<=', 5, 5, 1], - ['==', 7, 7, 1], - ['!=', 7, 5, 1], - ['!=', 5, 5, 0], - ] as const)( - 'numeric operator %s: actual=%d, expected=%d → score=%d', - async (op, actual, expected, score) => { - const result = await grader({ - command: `echo ${actual}`, - operator: op, - expected: String(expected), - }).evaluate(mockContext()); - expect(result.score).toBe(score); - }, - ); - - it('fails with clear message when stdout is not a number for numeric comparison', async () => { - const result = await grader({ - command: 'echo notanumber', - operator: '>=', - expected: '5', - }).evaluate(mockContext()); - expect(result.score).toBe(0); - expect(result.assertions[0].text).toContain('Cannot compare numerically'); - }); - - it('returns score 0 when command errors', async () => { - const result = await grader({ command: 'nonexistent_command_xyz_abc_987' }).evaluate( - mockContext(), - ); - expect(result.score).toBe(0); - }); -}); diff --git 
a/packages/core/test/evaluation/loaders/grader-parser.test.ts b/packages/core/test/evaluation/loaders/grader-parser.test.ts index 8e75db73c..de1984798 100644 --- a/packages/core/test/evaluation/loaders/grader-parser.test.ts +++ b/packages/core/test/evaluation/loaders/grader-parser.test.ts @@ -14,7 +14,6 @@ import type { LatencyGraderConfig, LlmGraderConfig, RegexGraderConfig, - ShellGraderConfig, } from '../../../src/evaluation/types.js'; describe('parseGraders - deterministic assertion types', () => { @@ -212,59 +211,6 @@ describe('parseGraders - deterministic assertion types', () => { expect(evaluators).toBeUndefined(); }); - it('parses type: shell with command only', async () => { - const evaluators = await parseGraders( - { evaluators: [{ name: 'check-exit', type: 'shell', command: 'echo hi' }] }, - undefined, - [tempDir], - 'test-1', - ); - expect(evaluators).toHaveLength(1); - const config = evaluators?.[0] as ShellGraderConfig; - expect(config.type).toBe('shell'); - expect(config.command).toBe('echo hi'); - expect(config.expected).toBeUndefined(); - expect(config.operator).toBeUndefined(); - }); - - it('parses type: shell with expected and operator', async () => { - const evaluators = await parseGraders( - { - evaluators: [ - { name: 'check-pages', type: 'shell', command: 'echo 14', expected: '5', operator: '>=' }, - ], - }, - undefined, - [tempDir], - 'test-1', - ); - expect(evaluators).toHaveLength(1); - const config = evaluators?.[0] as ShellGraderConfig; - expect(config.type).toBe('shell'); - expect(config.expected).toBe('5'); - expect(config.operator).toBe('>='); - }); - - it('skips shell evaluator with invalid operator', async () => { - const evaluators = await parseGraders( - { evaluators: [{ name: 'bad-op', type: 'shell', command: 'echo 1', operator: '???' 
}] }, - undefined, - [tempDir], - 'test-1', - ); - expect(evaluators).toBeUndefined(); - }); - - it('skips shell evaluator with missing command', async () => { - const evaluators = await parseGraders( - { evaluators: [{ name: 'no-cmd', type: 'shell' }] }, - undefined, - [tempDir], - 'test-1', - ); - expect(evaluators).toBeUndefined(); - }); - it('parses type: rubrics with criteria as llm-grader', async () => { const evaluators = await parseGraders( { diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index 658a31488..1d654f329 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -59,7 +59,6 @@ export type AssertionType = | 'equals' | 'regex' | 'is-json' - | 'shell' // legacy snake_case aliases (still accepted) | 'llm_grader' | 'code_grader' From 54b20329651bf5471ac95506db33f5f6fd5d1d8b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 3 May 2026 06:23:54 +0200 Subject: [PATCH 4/6] style: fix biome formatting in code-grader Co-Authored-By: Claude Sonnet 4.6 --- .../src/evaluation/graders/code-grader.ts | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 20166e533..22b6c1f8d 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -262,23 +262,25 @@ export class CodeGrader implements Grader { // Plain-text fallback: when stdout is not a JSON object, interpret as a simple score. // Supports exit-code convention (empty stdout = pass/fail by exit code), boolean // strings, and numeric scores — so short shell one-liners work without JSON protocol. - const score = parsed != null - ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) - : parsePlainScore(stdout, exitCode); - const assertions: AssertionEntry[] = parsed != null && Array.isArray(parsed?.assertions) - ? 
parsed.assertions - .filter( - (a: unknown): a is { text: string; passed: boolean; evidence?: string } => - typeof a === 'object' && - a !== null && - typeof (a as Record).text === 'string', - ) - .map((a) => ({ - text: String(a.text), - passed: Boolean(a.passed), - ...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}), - })) - : []; + const score = + parsed != null + ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) + : parsePlainScore(stdout, exitCode); + const assertions: AssertionEntry[] = + parsed != null && Array.isArray(parsed?.assertions) + ? parsed.assertions + .filter( + (a: unknown): a is { text: string; passed: boolean; evidence?: string } => + typeof a === 'object' && + a !== null && + typeof (a as Record).text === 'string', + ) + .map((a) => ({ + text: String(a.text), + passed: Boolean(a.passed), + ...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}), + })) + : []; // Capture optional structured details from code judge output const details = parsed?.details && typeof parsed.details === 'object' && !Array.isArray(parsed.details) From 888bb737f04b80052cce9bc7007efab816dee829 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 4 May 2026 04:33:47 +0200 Subject: [PATCH 5/6] refactor: simplify code-grader plain-text fallback to exit-code + assertion text Replace the string/numeric score interpretation with a clean two-convention model: - Exit code: 0 = score 1 (pass), non-zero = score 0 (fail) - Stdout: becomes the assertion text (human-readable context for the result) - Stderr on non-zero exit: still surfaces as an error For numeric scores or multi-aspect results, use the JSON protocol. This removes the "0"/"1"/numeric string ambiguity and aligns with how Unix tooling (bats, make, shell builtins) already signals pass/fail. Updates docs and tests to reflect the new model. 
Co-Authored-By: Claude Sonnet 4.6 --- .../docs/docs/graders/code-graders.mdx | 45 +++++++++++- .../src/evaluation/graders/code-grader.ts | 44 +++--------- .../graders/code-grader-plain-text.test.ts | 69 ++++++++----------- 3 files changed, 84 insertions(+), 74 deletions(-) diff --git a/apps/web/src/content/docs/docs/graders/code-graders.mdx b/apps/web/src/content/docs/docs/graders/code-graders.mdx index 1bec26b3b..ac969174d 100644 --- a/apps/web/src/content/docs/docs/graders/code-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/code-graders.mdx @@ -9,7 +9,7 @@ Code graders are scripts that evaluate agent responses deterministically. Write ## Contract -Code graders communicate via stdin/stdout JSON: +Code graders receive eval context via stdin JSON and return a result via stdout. **Input (stdin):** ```json @@ -19,8 +19,12 @@ Code graders communicate via stdin/stdout JSON: "output": "The answer is 42.", "expected_output": "42" } +``` + +### JSON output (full protocol) + +Emit a JSON object for numeric scores or multi-aspect results: -**Output (stdout):** ```json { "score": 1.0, @@ -35,6 +39,43 @@ Code graders communicate via stdin/stdout JSON: | `score` | `number` | 0.0 to 1.0 | | `assertions` | `Array<{ text, passed, evidence? }>` | Per-aspect results with verdict and optional evidence | +### Plain-text output (exit-code convention) + +For simple pass/fail checks, skip the JSON protocol entirely. 
The exit code determines the score and stdout becomes the assertion text: + +| Exit code | Score | Verdict | +|-----------|-------|---------| +| 0 | 1.0 | pass | +| non-zero (no stderr) | 0.0 | fail | + +```bash +#!/bin/bash +# check-pages.sh — passes when PDF has at least 5 pages +pages=$(pdfinfo report.pdf | grep Pages | awk '{print $2}') +if [ "$pages" -ge 5 ]; then + echo "PDF has $pages pages (≥5 required)" +else + echo "PDF has only $pages pages (<5 required)" + exit 1 +fi +``` + +```yaml +assertions: + - type: code-grader + command: [bash, scripts/check-pages.sh] +``` + +Silent one-liners work too — stdout is optional: + +```yaml +assertions: + - type: code-grader + command: ["bash", "-c", "[ $(wc -l < output.txt) -ge 10 ]"] +``` + +Scripts that write to stderr and exit non-zero surface as execution errors rather than quality failures. + ## Python Example ```python diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 22b6c1f8d..5c16ba38d 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -240,8 +240,8 @@ export class CodeGrader implements Grader { } // Non-zero exit with JSON stdout, or with stderr output, is treated as an error // (script signaled failure through the protocol or wrote an error message). - // Non-zero exit with plain stdout and no stderr uses the exit-code convention - // (score 0 = fail) — so one-liners like `[ "$pages" -ge 5 ]` work cleanly. + // Non-zero exit with plain stdout and no stderr uses the exit-code convention — + // score 0 (fail), stdout becomes the assertion text. 
const looksLikeJson = stdout.startsWith('{') || stdout.startsWith('['); const hasStderr = execStderr.trim().length > 0; if (exitCode !== 0 && (looksLikeJson || hasStderr)) { @@ -253,19 +253,16 @@ export class CodeGrader implements Grader { ); } const rawParsed = parseJsonSafe(stdout); - // Only treat stdout as the JSON protocol if it parsed as an object (not a bare - // boolean, number, or string). Plain scalars fall through to parsePlainScore. + // Only treat stdout as the JSON protocol if it parsed as a plain object. + // Bare JSON scalars (numbers, booleans, strings) fall through to the plain-text path. const parsed = rawParsed != null && typeof rawParsed === 'object' && !Array.isArray(rawParsed) ? rawParsed : undefined; - // Plain-text fallback: when stdout is not a JSON object, interpret as a simple score. - // Supports exit-code convention (empty stdout = pass/fail by exit code), boolean - // strings, and numeric scores — so short shell one-liners work without JSON protocol. - const score = - parsed != null - ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) - : parsePlainScore(stdout, exitCode); + // Plain-text fallback: exit code is pass/fail, stdout is the assertion text. + // For numeric scores or multi-aspect results, use the JSON protocol instead. + const passed = exitCode === 0; + const score = parsed != null ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) : (passed ? 1 : 0); const assertions: AssertionEntry[] = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions @@ -280,7 +277,9 @@ export class CodeGrader implements Grader { passed: Boolean(a.passed), ...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}), })) - : []; + : parsed == null + ? [{ text: stdout.trim() || (passed ? 
'exit 0' : `exit ${exitCode}`), passed }] + : []; // Capture optional structured details from code judge output const details = parsed?.details && typeof parsed.details === 'object' && !Array.isArray(parsed.details) @@ -391,27 +390,6 @@ export async function executeScript( return stdout.trim(); } -/** - * Interpret plain-text (non-JSON) stdout as a score. - * - * | stdout (trimmed, lowercase) | score | - * |---|---| - * | empty string | 1 if exit 0, 0 if exit non-zero | - * | "true", "pass", "1" | 1 | - * | "false", "fail", "0" | 0 | - * | numeric string | clamped float | - * | anything else | 1 if exit 0, 0 if exit non-zero | - */ -function parsePlainScore(stdout: string, exitCode: number): number { - const t = stdout.trim().toLowerCase(); - if (t === '' || t === 'true' || t === 'pass') return exitCode === 0 ? 1 : 0; - if (t === '1') return 1; - if (t === 'false' || t === 'fail' || t === '0') return 0; - const n = Number(t); - if (!Number.isNaN(n)) return clampScore(n); - return exitCode === 0 ? 1 : 0; -} - function formatStderr(stderr: string): string { const trimmed = stderr.trim(); const maxLength = 2000; diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts index 09c79eb42..13e9b2cbe 100644 --- a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts +++ b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts @@ -1,9 +1,9 @@ /** * Tests for code-grader plain-text fallback. * - * When a code-grader script emits non-JSON stdout, the grader interprets it - * as a simple score instead of requiring the full JSON protocol. This lets - * shell one-liners work without a JSON wrapper. + * When a script emits non-JSON stdout, the grader uses the exit code as + * pass/fail (0 = score 1, non-zero = score 0) and stdout as the assertion + * text. For numeric scores or multi-aspect results, use the JSON protocol. 
*/ import { describe, expect, it } from 'vitest'; @@ -13,57 +13,41 @@ import type { EvaluationContext } from '../../../src/evaluation/graders/types.js const ctx = { candidate: '', evalCase: { id: 'test', input: [] } } as unknown as EvaluationContext; const grader = (cmd: string) => - new CodeGrader({ - command: ['bash', '-c', cmd], - agentTimeoutMs: 10_000, - }); + new CodeGrader({ command: ['bash', '-c', cmd], agentTimeoutMs: 10_000 }); describe('code-grader plain-text fallback', () => { - it('exit 0 with empty stdout → score 1', async () => { + it('exit 0 with empty stdout → score 1, assertion text "exit 0"', async () => { const result = await grader('true').evaluate(ctx); expect(result.score).toBe(1); expect(result.verdict).toBe('pass'); + expect(result.assertions[0]).toMatchObject({ text: 'exit 0', passed: true }); }); - it('exit 1 with empty stdout → score 0', async () => { + it('exit 1 with empty stdout → score 0, assertion text "exit 1"', async () => { const result = await grader('false').evaluate(ctx); expect(result.score).toBe(0); expect(result.verdict).toBe('fail'); + expect(result.assertions[0]).toMatchObject({ text: 'exit 1', passed: false }); }); - it('stdout "PASS" → score 1', async () => { - const result = await grader('echo PASS').evaluate(ctx); - expect(result.score).toBe(1); - }); - - it('stdout "FAIL" → score 0', async () => { - const result = await grader('echo FAIL').evaluate(ctx); - expect(result.score).toBe(0); - }); - - it('stdout "true" → score 1', async () => { - const result = await grader('echo true').evaluate(ctx); - expect(result.score).toBe(1); - }); - - it('stdout "false" → score 0', async () => { - const result = await grader('echo false').evaluate(ctx); - expect(result.score).toBe(0); - }); - - it('stdout numeric string → score as float', async () => { - const result = await grader('echo 0.75').evaluate(ctx); - expect(result.score).toBe(0.75); - }); - - it('stdout numeric "1" → score 1', async () => { - const result = await 
grader('echo 1').evaluate(ctx); + it('exit 0 with stdout → score 1, stdout is assertion text', async () => { + const result = await grader('echo "PDF has 14 pages (≥5 required)"').evaluate(ctx); expect(result.score).toBe(1); + expect(result.assertions[0]).toMatchObject({ + text: 'PDF has 14 pages (≥5 required)', + passed: true, + }); }); - it('stdout numeric "0" → score 0', async () => { - const result = await grader('echo 0').evaluate(ctx); + it('exit 1 with stdout → score 0, stdout is assertion text', async () => { + const result = await grader( + 'echo "PDF has 3 pages (<5 required)"; exit 1', + ).evaluate(ctx); expect(result.score).toBe(0); + expect(result.assertions[0]).toMatchObject({ + text: 'PDF has 3 pages (<5 required)', + passed: false, + }); }); it('exit-code numeric comparison: [ 14 -ge 5 ] → score 1', async () => { @@ -76,11 +60,18 @@ describe('code-grader plain-text fallback', () => { expect(result.score).toBe(0); }); - it('JSON protocol still works (score from JSON)', async () => { + it('JSON protocol still works (score + assertions)', async () => { const result = await grader( `echo '{"score":0.6,"assertions":[{"text":"ok","passed":true}]}'`, ).evaluate(ctx); expect(result.score).toBe(0.6); expect(result.assertions).toHaveLength(1); + expect(result.assertions[0].text).toBe('ok'); + }); + + it('script with stderr on non-zero exit → surfaces as error assertion', async () => { + const result = await grader('echo "bad" >&2; exit 1').evaluate(ctx); + expect(result.score).toBe(0); + expect(result.assertions[0].text).toContain('exited with code'); }); }); From a5287c67fcc8e7d6a47985155082f89cce405a38 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 4 May 2026 04:37:43 +0200 Subject: [PATCH 6/6] style: fix biome formatting Co-Authored-By: Claude Sonnet 4.6 --- packages/core/src/evaluation/graders/code-grader.ts | 7 ++++++- .../test/evaluation/graders/code-grader-plain-text.test.ts | 4 +--- 2 files changed, 7 insertions(+), 4 deletions(-) diff 
--git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 5c16ba38d..62e9a7f37 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -262,7 +262,12 @@ export class CodeGrader implements Grader { // Plain-text fallback: exit code is pass/fail, stdout is the assertion text. // For numeric scores or multi-aspect results, use the JSON protocol instead. const passed = exitCode === 0; - const score = parsed != null ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) : (passed ? 1 : 0); + const score = + parsed != null + ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) + : passed + ? 1 + : 0; const assertions: AssertionEntry[] = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts index 13e9b2cbe..27d863b4c 100644 --- a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts +++ b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts @@ -40,9 +40,7 @@ describe('code-grader plain-text fallback', () => { }); it('exit 1 with stdout → score 0, stdout is assertion text', async () => { - const result = await grader( - 'echo "PDF has 3 pages (<5 required)"; exit 1', - ).evaluate(ctx); + const result = await grader('echo "PDF has 3 pages (<5 required)"; exit 1').evaluate(ctx); expect(result.score).toBe(0); expect(result.assertions[0]).toMatchObject({ text: 'PDF has 3 pages (<5 required)',