diff --git a/apps/web/src/content/docs/docs/graders/code-graders.mdx b/apps/web/src/content/docs/docs/graders/code-graders.mdx index 1bec26b3..ac969174 100644 --- a/apps/web/src/content/docs/docs/graders/code-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/code-graders.mdx @@ -9,7 +9,7 @@ Code graders are scripts that evaluate agent responses deterministically. Write ## Contract -Code graders communicate via stdin/stdout JSON: +Code graders receive eval context via stdin JSON and return a result via stdout. **Input (stdin):** ```json @@ -19,8 +19,12 @@ Code graders communicate via stdin/stdout JSON: "output": "The answer is 42.", "expected_output": "42" } +``` + +### JSON output (full protocol) + +Emit a JSON object for numeric scores or multi-aspect results: -**Output (stdout):** ```json { "score": 1.0, @@ -35,6 +39,43 @@ Code graders communicate via stdin/stdout JSON: | `score` | `number` | 0.0 to 1.0 | | `assertions` | `Array<{ text, passed, evidence? }>` | Per-aspect results with verdict and optional evidence | +### Plain-text output (exit-code convention) + +For simple pass/fail checks, skip the JSON protocol entirely. The exit code determines the score and stdout becomes the assertion text: + +| Exit code | Score | Verdict | +|-----------|-------|---------| +| 0 | 1.0 | pass | +| non-zero (no stderr) | 0.0 | fail | + +```bash +#!/bin/bash +# check-pages.sh — passes when PDF has at least 5 pages +pages=$(pdfinfo report.pdf | grep Pages | awk '{print $2}') +if [ "$pages" -ge 5 ]; then + echo "PDF has $pages pages (≥5 required)" +else + echo "PDF has only $pages pages (<5 required)" + exit 1 +fi +``` + +```yaml +assertions: + - type: code-grader + command: [bash, scripts/check-pages.sh] +``` + +Silent one-liners work too — stdout is optional: + +```yaml +assertions: + - type: code-grader + command: ["bash", "-c", "[ $(wc -l < output.txt) -ge 10 ]"] +``` + +Scripts that write to stderr and exit non-zero surface as execution errors rather than quality failures. + ## Python Example ```python diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 3895a2ff..62e9a7f3 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -212,6 +212,8 @@ export class CodeGrader implements Grader { try { let stdout: string; + let exitCode = 0; + let execStderr = ''; if (context.dockerConfig) { // Docker execution mode: run grader inside a container const { DockerWorkspaceProvider } = await import('../workspace/docker-workspace.js'); @@ -221,40 +223,68 @@ export class CodeGrader implements Grader { stdin: inputPayload, repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos), }); - if (result.exitCode !== 0) { - const trimmedErr = result.stderr.trim(); - throw new Error( - trimmedErr.length > 0 - ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` - : `Code evaluator exited with code ${result.exitCode}`, - ); - } + exitCode = result.exitCode; stdout = result.stdout.trim(); + execStderr = result.stderr; } else { - stdout = await executeScript( + const result = await runScriptRaw( this.command, inputPayload, this.agentTimeoutMs, this.cwd, env, ); + exitCode = result.exitCode; + stdout = result.stdout.trim(); + execStderr = result.stderr; + } + // Non-zero exit with JSON stdout, or with stderr output, is treated as an error + // (script signaled failure through the protocol or wrote an error message). 
+    // Non-zero exit with plain stdout and no stderr uses the exit-code convention —
+    // score 0 (fail), stdout becomes the assertion text.
+    const looksLikeJson = stdout.startsWith('{') || stdout.startsWith('[');
+    const hasStderr = execStderr.trim().length > 0;
+    if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
+      const trimmedErr = formatStderr(execStderr);
+      throw new Error(
+        trimmedErr.length > 0
+          ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}`
+          : `Code evaluator exited with code ${exitCode}`,
+      );
     }
-    const parsed = parseJsonSafe(stdout);
-    const score = clampScore(typeof parsed?.score === 'number' ? parsed.score : 0);
-    const assertions: AssertionEntry[] = Array.isArray(parsed?.assertions)
-      ? parsed.assertions
-          .filter(
-            (a: unknown): a is { text: string; passed: boolean; evidence?: string } =>
-              typeof a === 'object' &&
-              a !== null &&
-              typeof (a as Record<string, unknown>).text === 'string',
-          )
-          .map((a) => ({
-            text: String(a.text),
-            passed: Boolean(a.passed),
-            ...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}),
-          }))
-      : [];
+    const rawParsed = parseJsonSafe(stdout);
+    // Only treat stdout as the JSON protocol if it parsed as a plain object.
+    // Bare JSON scalars (numbers, booleans, strings) fall through to the plain-text path.
+    const parsed =
+      rawParsed != null && typeof rawParsed === 'object' && !Array.isArray(rawParsed)
+        ? rawParsed
+        : undefined;
+    // Plain-text fallback: exit code is pass/fail, stdout is the assertion text.
+    // For numeric scores or multi-aspect results, use the JSON protocol instead.
+    const passed = exitCode === 0;
+    const score =
+      parsed != null
+        ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0)
+        : passed
+          ? 1
+          : 0;
+    const assertions: AssertionEntry[] =
+      parsed != null && Array.isArray(parsed?.assertions)
+        ? parsed.assertions
+            .filter(
+              (a: unknown): a is { text: string; passed: boolean; evidence?: string } =>
+                typeof a === 'object' &&
+                a !== null &&
+                typeof (a as Record<string, unknown>).text === 'string',
+            )
+            .map((a) => ({
+              text: String(a.text),
+              passed: Boolean(a.passed),
+              ...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}),
+            }))
+        : parsed == null
+          ? [{ text: stdout.trim() || (passed ? 'exit 0' : `exit ${exitCode}`), passed }]
+          : [];
     // Capture optional structured details from code judge output
     const details =
       parsed?.details && typeof parsed.details === 'object' && !Array.isArray(parsed.details)
@@ -325,6 +355,19 @@ export class CodeGrader implements Grader {
   }
 }
 
+/** Run a script and return raw stdout/stderr/exitCode without throwing. */
+async function runScriptRaw(
+  scriptPath: readonly string[] | string,
+  input: string,
+  agentTimeoutMs?: number,
+  cwd?: string,
+  env?: Record<string, string>,
+): Promise<{ stdout: string; stderr: string; exitCode: number }> {
+  return typeof scriptPath === 'string'
+    ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env })
+    : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+}
+
 export async function executeScript(
   scriptPath: readonly string[] | string,
   input: string,
@@ -332,10 +375,13 @@ export async function executeScript(
   cwd?: string,
   env?: Record<string, string>,
 ): Promise<string> {
-  const { stdout, stderr, exitCode } =
-    typeof scriptPath === 'string'
-      ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env })
-      : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+  const { stdout, stderr, exitCode } = await runScriptRaw(
+    scriptPath,
+    input,
+    agentTimeoutMs,
+    cwd,
+    env,
+  );
 
   if (exitCode !== 0) {
     const trimmedErr = formatStderr(stderr);
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index 8d37c8b9..e3da97d6 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -958,6 +958,20 @@ export async function runEvaluation(
     setupLog('Docker image pull complete');
   }
 
+  // Run preflight environment checks (fail fast before any hooks or test cases)
+  if (suiteWorkspace?.env) {
+    try {
+      await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? undefined, setupLog);
+      setupLog('preflight checks passed');
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      if (sharedWorkspacePath && !useStaticWorkspace) {
+        await cleanupWorkspace(sharedWorkspacePath).catch(() => {});
+      }
+      throw new Error(message);
+    }
+  }
+
   // Execute before_all (runs ONCE before first test per workspace)
   const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
   const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
@@ -3924,3 +3938,45 @@ function computeWeightedMean(
   return totalWeight > 0 ? weightedSum / totalWeight : 0;
 }
+
+/**
+ * Run preflight environment checks for workspace.env config.
+ * Fails fast if any required command or Python module is missing.
+ * Called once before before_all hooks, so long evals abort immediately on missing deps.
+ */
+async function runPreflightChecks(
+  env: import('./types.js').WorkspaceEnvConfig,
+  cwd: string | undefined,
+  log: (msg: string) => void,
+): Promise<void> {
+  const execFileAsync = promisify(execFile);
+  const missing: string[] = [];
+
+  for (const cmd of env.required_commands ?? []) {
+    log(`preflight: checking command "${cmd}"`);
+    try {
+      if (process.platform === 'win32') {
+        await execFileAsync('where', [cmd], { cwd });
+      } else {
+        await execFileAsync('sh', ['-c', `command -v ${cmd}`], { cwd });
+      }
+    } catch {
+      missing.push(`command: ${cmd}`);
+    }
+  }
+
+  for (const mod of env.required_python_modules ?? []) {
+    log(`preflight: checking Python module "${mod}"`);
+    try {
+      await execFileAsync('python3', ['-c', `import ${mod}`], { cwd });
+    } catch {
+      missing.push(`python module: ${mod}`);
+    }
+  }
+
+  if (missing.length > 0) {
+    throw new Error(
+      `Preflight checks failed — missing dependencies:\n${missing.map((m) => ` • ${m}`).join('\n')}\n\nInstall the missing dependencies before running this eval.`,
+    );
+  }
+}
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 53828126..2027810a 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -339,6 +339,25 @@ export type DockerWorkspaceConfig = {
   readonly cpus?: number;
 };
 
+/**
+ * Preflight environment requirements for the workspace.
+ * Checked once before before_all hooks run. Fails fast if anything is missing.
+ *
+ * @example
+ * ```yaml
+ * workspace:
+ *   env:
+ *     required_commands: [ffmpeg, pandoc]
+ *     required_python_modules: [PIL, openai]
+ * ```
+ */
+export type WorkspaceEnvConfig = {
+  /** Shell commands that must be present in PATH (checked via `command -v`) */
+  readonly required_commands?: readonly string[];
+  /** Python modules that must be importable (checked via `python3 -c "import <module>"`) */
+  readonly required_python_modules?: readonly string[];
+};
+
 export type WorkspaceConfig = {
   /** Template directory or .code-workspace file. Directories are copied to temp workspace.
    * .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
@@ -359,6 +378,8 @@
    * Used as default cwd for hook commands so that file-referenced templates resolve
    * relative paths from their own directory, not the eval file's directory. */
   readonly workspaceFileDir?: string;
+  /** Preflight environment requirements. Checked before before_all hooks run. */
+  readonly env?: WorkspaceEnvConfig;
 };
 
 export type CodeGraderConfig = {
diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts
index 7d9fc74f..524b6654 100644
--- a/packages/core/src/evaluation/validation/targets-validator.ts
+++ b/packages/core/src/evaluation/validation/targets-validator.ts
@@ -1,12 +1,12 @@
 import { readFile } from 'node:fs/promises';
 import path from 'node:path';
 
+import { interpolateEnv } from '../interpolation.js';
 import {
   CLI_PLACEHOLDERS,
   COMMON_TARGET_SETTINGS,
   findDeprecatedCamelCaseTargetWarnings,
 } from '../providers/targets.js';
-import { interpolateEnv } from '../interpolation.js';
 import { KNOWN_PROVIDERS, PROVIDER_ALIASES } from '../providers/types.js';
 import { parseYamlValue } from '../yaml-loader.js';
 import type { ValidationError, ValidationResult } from './types.js';
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 8aa397c4..ba25e993 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -54,6 +54,7 @@ import type {
   TrialsConfig,
   TurnFailurePolicy,
   WorkspaceConfig,
+  WorkspaceEnvConfig,
   WorkspaceHookConfig,
   WorkspaceHooksConfig,
   WorkspaceScriptConfig,
@@ -853,8 +854,9 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi
   const mode = explicitMode ?? (workspacePath ? 'static' : undefined);
 
   const docker = parseDockerWorkspaceConfig(obj.docker);
+  const env = parseWorkspaceEnvConfig(obj.env);
 
-  if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
+  if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
     return undefined;
 
   return {
@@ -865,6 +867,26 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi
     ...(mode !== undefined && { mode }),
     ...(workspacePath !== undefined && { path: workspacePath }),
     ...(docker !== undefined && { docker }),
+    ...(env !== undefined && { env }),
+  };
+}
+
+function parseWorkspaceEnvConfig(raw: unknown): WorkspaceEnvConfig | undefined {
+  if (!isJsonObject(raw)) return undefined;
+  const obj = raw as Record<string, unknown>;
+
+  const required_commands = Array.isArray(obj.required_commands)
+    ? (obj.required_commands.filter((c) => typeof c === 'string') as string[])
+    : undefined;
+  const required_python_modules = Array.isArray(obj.required_python_modules)
+    ? (obj.required_python_modules.filter((m) => typeof m === 'string') as string[])
+    : undefined;
+
+  if (!required_commands?.length && !required_python_modules?.length) return undefined;
+
+  return {
+    ...(required_commands?.length && { required_commands }),
+    ...(required_python_modules?.length && { required_python_modules }),
   };
 }
diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts
new file mode 100644
index 00000000..27d863b4
--- /dev/null
+++ b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts
@@ -0,0 +1,75 @@
+/**
+ * Tests for code-grader plain-text fallback.
+ *
+ * When a script emits non-JSON stdout, the grader uses the exit code as
+ * pass/fail (0 = score 1, non-zero = score 0) and stdout as the assertion
+ * text. For numeric scores or multi-aspect results, use the JSON protocol.
+ */
+
+import { describe, expect, it } from 'vitest';
+import { CodeGrader } from '../../../src/evaluation/graders/code-grader.js';
+import type { EvaluationContext } from '../../../src/evaluation/graders/types.js';
+
+const ctx = { candidate: '', evalCase: { id: 'test', input: [] } } as unknown as EvaluationContext;
+
+const grader = (cmd: string) =>
+  new CodeGrader({ command: ['bash', '-c', cmd], agentTimeoutMs: 10_000 });
+
+describe('code-grader plain-text fallback', () => {
+  it('exit 0 with empty stdout → score 1, assertion text "exit 0"', async () => {
+    const result = await grader('true').evaluate(ctx);
+    expect(result.score).toBe(1);
+    expect(result.verdict).toBe('pass');
+    expect(result.assertions[0]).toMatchObject({ text: 'exit 0', passed: true });
+  });
+
+  it('exit 1 with empty stdout → score 0, assertion text "exit 1"', async () => {
+    const result = await grader('false').evaluate(ctx);
+    expect(result.score).toBe(0);
+    expect(result.verdict).toBe('fail');
+    expect(result.assertions[0]).toMatchObject({ text: 'exit 1', passed: false });
+  });
+
+  it('exit 0 with stdout → score 1, stdout is assertion text', async () => {
+    const result = await grader('echo "PDF has 14 pages (≥5 required)"').evaluate(ctx);
+    expect(result.score).toBe(1);
+    expect(result.assertions[0]).toMatchObject({
+      text: 'PDF has 14 pages (≥5 required)',
+      passed: true,
+    });
+  });
+
+  it('exit 1 with stdout → score 0, stdout is assertion text', async () => {
+    const result = await grader('echo "PDF has 3 pages (<5 required)"; exit 1').evaluate(ctx);
+    expect(result.score).toBe(0);
+    expect(result.assertions[0]).toMatchObject({
+      text: 'PDF has 3 pages (<5 required)',
+      passed: false,
+    });
+  });
+
+  it('exit-code numeric comparison: [ 14 -ge 5 ] → score 1', async () => {
+    const result = await grader('pages=14; [ "$pages" -ge 5 ]').evaluate(ctx);
+    expect(result.score).toBe(1);
+  });
+
+  it('exit-code numeric comparison: [ 3 -ge 10 ] → score 0', async () => {
+    const result = await grader('pages=3; [ "$pages" -ge 10 ]').evaluate(ctx);
+    expect(result.score).toBe(0);
+  });
+
+  it('JSON protocol still works (score + assertions)', async () => {
+    const result = await grader(
+      `echo '{"score":0.6,"assertions":[{"text":"ok","passed":true}]}'`,
+    ).evaluate(ctx);
+    expect(result.score).toBe(0.6);
+    expect(result.assertions).toHaveLength(1);
+    expect(result.assertions[0].text).toBe('ok');
+  });
+
+  it('script with stderr on non-zero exit → surfaces as error assertion', async () => {
+    const result = await grader('echo "bad" >&2; exit 1').evaluate(ctx);
+    expect(result.score).toBe(0);
+    expect(result.assertions[0].text).toContain('exited with code');
+  });
+});