Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 43 additions & 2 deletions apps/web/src/content/docs/docs/graders/code-graders.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Code graders are scripts that evaluate agent responses deterministically. Write

## Contract

Code graders communicate via stdin/stdout JSON:
Code graders receive eval context via stdin JSON and return a result via stdout.

**Input (stdin):**
```json
Expand All @@ -19,8 +19,12 @@ Code graders communicate via stdin/stdout JSON:
"output": "The answer is 42.",
"expected_output": "42"
}
```

### JSON output (full protocol)

Emit a JSON object for numeric scores or multi-aspect results:

**Output (stdout):**
```json
{
"score": 1.0,
Expand All @@ -35,6 +39,43 @@ Code graders communicate via stdin/stdout JSON:
| `score` | `number` | 0.0 to 1.0 |
| `assertions` | `Array<{ text, passed, evidence? }>` | Per-aspect results with verdict and optional evidence |

### Plain-text output (exit-code convention)

For simple pass/fail checks, skip the JSON protocol entirely. The exit code determines the score and stdout becomes the assertion text:

| Exit code | Score | Verdict |
|-----------|-------|---------|
| 0 | 1.0 | pass |
| non-zero (no stderr) | 0.0 | fail |

```bash
#!/bin/bash
# check-pages.sh — passes when PDF has at least 5 pages
pages=$(pdfinfo report.pdf | grep Pages | awk '{print $2}')
if [ "$pages" -ge 5 ]; then
echo "PDF has $pages pages (≥5 required)"
else
echo "PDF has only $pages pages (<5 required)"
exit 1
fi
```

```yaml
assertions:
- type: code-grader
command: [bash, scripts/check-pages.sh]
```

Silent one-liners work too — stdout is optional. When stdout is empty, the assertion text falls back to `exit <code>` (for example, `exit 0`):

```yaml
assertions:
- type: code-grader
command: ["bash", "-c", "[ $(wc -l < output.txt) -ge 10 ]"]
```

Scripts that exit non-zero and either write to stderr or print JSON-looking stdout (output starting with `{` or `[`) surface as execution errors rather than quality failures.

## Python Example

```python
Expand Down
104 changes: 75 additions & 29 deletions packages/core/src/evaluation/graders/code-grader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ export class CodeGrader implements Grader {

try {
let stdout: string;
let exitCode = 0;
let execStderr = '';
if (context.dockerConfig) {
// Docker execution mode: run grader inside a container
const { DockerWorkspaceProvider } = await import('../workspace/docker-workspace.js');
Expand All @@ -221,40 +223,68 @@ export class CodeGrader implements Grader {
stdin: inputPayload,
repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos),
});
if (result.exitCode !== 0) {
const trimmedErr = result.stderr.trim();
throw new Error(
trimmedErr.length > 0
? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}`
: `Code evaluator exited with code ${result.exitCode}`,
);
}
exitCode = result.exitCode;
stdout = result.stdout.trim();
execStderr = result.stderr;
} else {
stdout = await executeScript(
const result = await runScriptRaw(
this.command,
inputPayload,
this.agentTimeoutMs,
this.cwd,
env,
);
exitCode = result.exitCode;
stdout = result.stdout.trim();
execStderr = result.stderr;
}
// Non-zero exit with JSON stdout, or with stderr output, is treated as an error
// (script signaled failure through the protocol or wrote an error message).
// Non-zero exit with plain stdout and no stderr uses the exit-code convention —
// score 0 (fail), stdout becomes the assertion text.
const looksLikeJson = stdout.startsWith('{') || stdout.startsWith('[');
const hasStderr = execStderr.trim().length > 0;
if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
const trimmedErr = formatStderr(execStderr);
throw new Error(
trimmedErr.length > 0
? `Code evaluator exited with code ${exitCode}: ${trimmedErr}`
: `Code evaluator exited with code ${exitCode}`,
);
}
const parsed = parseJsonSafe(stdout);
const score = clampScore(typeof parsed?.score === 'number' ? parsed.score : 0);
const assertions: AssertionEntry[] = Array.isArray(parsed?.assertions)
? parsed.assertions
.filter(
(a: unknown): a is { text: string; passed: boolean; evidence?: string } =>
typeof a === 'object' &&
a !== null &&
typeof (a as Record<string, unknown>).text === 'string',
)
.map((a) => ({
text: String(a.text),
passed: Boolean(a.passed),
...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}),
}))
: [];
const rawParsed = parseJsonSafe(stdout);
// Only treat stdout as the JSON protocol if it parsed as a plain object.
// Bare JSON scalars (numbers, booleans, strings) fall through to the plain-text path.
const parsed =
rawParsed != null && typeof rawParsed === 'object' && !Array.isArray(rawParsed)
? rawParsed
: undefined;
// Plain-text fallback: exit code is pass/fail, stdout is the assertion text.
// For numeric scores or multi-aspect results, use the JSON protocol instead.
const passed = exitCode === 0;
const score =
parsed != null
? clampScore(typeof parsed.score === 'number' ? parsed.score : 0)
: passed
? 1
: 0;
const assertions: AssertionEntry[] =
parsed != null && Array.isArray(parsed?.assertions)
? parsed.assertions
.filter(
(a: unknown): a is { text: string; passed: boolean; evidence?: string } =>
typeof a === 'object' &&
a !== null &&
typeof (a as Record<string, unknown>).text === 'string',
)
.map((a) => ({
text: String(a.text),
passed: Boolean(a.passed),
...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}),
}))
: parsed == null
? [{ text: stdout.trim() || (passed ? 'exit 0' : `exit ${exitCode}`), passed }]
: [];
// Capture optional structured details from code judge output
const details =
parsed?.details && typeof parsed.details === 'object' && !Array.isArray(parsed.details)
Expand Down Expand Up @@ -325,17 +355,33 @@ export class CodeGrader implements Grader {
}
}

/**
 * Launch a grader command with `input` on stdin and hand back the raw process result.
 *
 * The exit code is returned as-is rather than inspected here, so callers decide
 * how a non-zero exit should be interpreted.
 *
 * @param scriptPath - A single string (run through `execShellWithStdin`) or an
 *   argv-style array (run through `execFileWithStdin`).
 * @param input - Payload written to the child's stdin.
 * @param agentTimeoutMs - Optional timeout, forwarded as `timeoutMs`.
 * @param cwd - Optional working directory for the child process.
 * @param env - Optional environment variables forwarded to the child process.
 * @returns The child's stdout, stderr and exit code.
 */
async function runScriptRaw(
  scriptPath: readonly string[] | string,
  input: string,
  agentTimeoutMs?: number,
  cwd?: string,
  env?: Record<string, string>,
): Promise<{ stdout: string; stderr: string; exitCode: number }> {
  const execOptions = { cwd, timeoutMs: agentTimeoutMs, env };

  if (typeof scriptPath === 'string') {
    return execShellWithStdin(scriptPath, input, execOptions);
  }
  return execFileWithStdin(scriptPath, input, execOptions);
}

export async function executeScript(
scriptPath: readonly string[] | string,
input: string,
agentTimeoutMs?: number,
cwd?: string,
env?: Record<string, string>,
): Promise<string> {
const { stdout, stderr, exitCode } =
typeof scriptPath === 'string'
? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env })
: await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
const { stdout, stderr, exitCode } = await runScriptRaw(
scriptPath,
input,
agentTimeoutMs,
cwd,
env,
);

if (exitCode !== 0) {
const trimmedErr = formatStderr(stderr);
Expand Down
56 changes: 56 additions & 0 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,20 @@ export async function runEvaluation(
setupLog('Docker image pull complete');
}

// Run preflight environment checks (fail fast before any hooks or test cases)
if (suiteWorkspace?.env) {
try {
await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? undefined, setupLog);
setupLog('preflight checks passed');
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
if (sharedWorkspacePath && !useStaticWorkspace) {
await cleanupWorkspace(sharedWorkspacePath).catch(() => {});
}
throw new Error(message);
}
}

// Execute before_all (runs ONCE before first test per workspace)
const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
Expand Down Expand Up @@ -3924,3 +3938,45 @@ function computeWeightedMean(

return totalWeight > 0 ? weightedSum / totalWeight : 0;
}

/**
 * Run preflight environment checks for workspace.env config.
 * Fails fast if any required command or Python module is missing.
 * Called once before before_all hooks, so long evals abort immediately on missing deps.
 *
 * Names from the config are handed to the child processes as separate argv
 * entries and are never spliced into a shell or Python source string. An entry
 * such as `missing; true` is therefore looked up literally (and reported as
 * missing) instead of being executed as code.
 *
 * @param env - Preflight requirements (`required_commands`, `required_python_modules`).
 * @param cwd - Working directory for the probe processes; `undefined` uses the current one.
 * @param log - Sink for progress messages, one per checked dependency.
 * @throws Error listing every missing dependency when at least one check fails.
 */
async function runPreflightChecks(
  env: import('./types.js').WorkspaceEnvConfig,
  cwd: string | undefined,
  log: (msg: string) => void,
): Promise<void> {
  const execFileAsync = promisify(execFile);
  const missing: string[] = [];

  for (const cmd of env.required_commands ?? []) {
    log(`preflight: checking command "${cmd}"`);
    try {
      if (process.platform === 'win32') {
        await execFileAsync('where', [cmd], { cwd });
      } else {
        // The script text is constant; the name arrives as positional parameter $1
        // (the extra 'sh' fills $0), so shell metacharacters in it are inert.
        await execFileAsync('sh', ['-c', 'command -v "$1"', 'sh', cmd], { cwd });
      }
    } catch {
      missing.push(`command: ${cmd}`);
    }
  }

  for (const mod of env.required_python_modules ?? []) {
    log(`preflight: checking Python module "${mod}"`);
    try {
      // The module name is read from sys.argv instead of being embedded in the
      // Python source, so it cannot inject additional statements.
      await execFileAsync(
        'python3',
        ['-c', 'import importlib, sys; importlib.import_module(sys.argv[1])', mod],
        { cwd },
      );
    } catch {
      missing.push(`python module: ${mod}`);
    }
  }

  if (missing.length > 0) {
    throw new Error(
      `Preflight checks failed — missing dependencies:\n${missing.map((m) => ` • ${m}`).join('\n')}\n\nInstall the missing dependencies before running this eval.`,
    );
  }
}
21 changes: 21 additions & 0 deletions packages/core/src/evaluation/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,25 @@ export type DockerWorkspaceConfig = {
readonly cpus?: number;
};

/**
 * Preflight environment requirements for the workspace.
 *
 * The requirements are verified once, before the `before_all` hooks run, and the
 * evaluation aborts with a list of everything that is missing. Both fields are
 * optional. The YAML parser keeps only string entries and drops the whole `env`
 * section when neither list has any entries.
 *
 * @example
 * ```yaml
 * workspace:
 *   env:
 *     required_commands: [ffmpeg, pandoc]
 *     required_python_modules: [PIL, openai]
 * ```
 */
export type WorkspaceEnvConfig = {
/**
 * Executable names that must be resolvable on PATH.
 * Looked up with `command -v` through `sh`, or with `where` on Windows.
 */
readonly required_commands?: readonly string[];
/**
 * Python module names that must be importable by the `python3` interpreter
 * found on PATH.
 */
readonly required_python_modules?: readonly string[];
};

export type WorkspaceConfig = {
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
Expand All @@ -359,6 +378,8 @@ export type WorkspaceConfig = {
* Used as default cwd for hook commands so that file-referenced templates resolve
* relative paths from their own directory, not the eval file's directory. */
readonly workspaceFileDir?: string;
/** Preflight environment requirements. Checked before before_all hooks run. */
readonly env?: WorkspaceEnvConfig;
};

export type CodeGraderConfig = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import { readFile } from 'node:fs/promises';
import path from 'node:path';

import { interpolateEnv } from '../interpolation.js';
import {
CLI_PLACEHOLDERS,
COMMON_TARGET_SETTINGS,
findDeprecatedCamelCaseTargetWarnings,
} from '../providers/targets.js';
import { interpolateEnv } from '../interpolation.js';
import { KNOWN_PROVIDERS, PROVIDER_ALIASES } from '../providers/types.js';
import { parseYamlValue } from '../yaml-loader.js';
import type { ValidationError, ValidationResult } from './types.js';
Expand Down
24 changes: 23 additions & 1 deletion packages/core/src/evaluation/yaml-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ import type {
TrialsConfig,
TurnFailurePolicy,
WorkspaceConfig,
WorkspaceEnvConfig,
WorkspaceHookConfig,
WorkspaceHooksConfig,
WorkspaceScriptConfig,
Expand Down Expand Up @@ -853,8 +854,9 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi
const mode = explicitMode ?? (workspacePath ? 'static' : undefined);

const docker = parseDockerWorkspaceConfig(obj.docker);
const env = parseWorkspaceEnvConfig(obj.env);

if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
return undefined;

return {
Expand All @@ -865,6 +867,26 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi
...(mode !== undefined && { mode }),
...(workspacePath !== undefined && { path: workspacePath }),
...(docker !== undefined && { docker }),
...(env !== undefined && { env }),
};
}

/**
 * Parse the `env` section of a workspace config into a `WorkspaceEnvConfig`.
 *
 * Only string entries of `required_commands` and `required_python_modules` are
 * kept. A list that is absent, not an array, or empty after filtering is left
 * out of the result.
 *
 * @param raw - The untyped value found under `workspace.env`.
 * @returns The parsed config, or `undefined` when `raw` is not an object or
 *   neither list contributes any entries.
 */
function parseWorkspaceEnvConfig(raw: unknown): WorkspaceEnvConfig | undefined {
  if (!isJsonObject(raw)) {
    return undefined;
  }
  const source = raw as Record<string, unknown>;

  // Treat "not an array" the same as "no usable entries".
  const stringsOnly = (value: unknown): string[] =>
    Array.isArray(value)
      ? value.filter((entry): entry is string => typeof entry === 'string')
      : [];

  const commands = stringsOnly(source.required_commands);
  const pythonModules = stringsOnly(source.required_python_modules);

  if (commands.length === 0 && pythonModules.length === 0) {
    return undefined;
  }

  const parsed: { required_commands?: string[]; required_python_modules?: string[] } = {};
  if (commands.length > 0) {
    parsed.required_commands = commands;
  }
  if (pythonModules.length > 0) {
    parsed.required_python_modules = pythonModules;
  }
  return parsed;
}

Expand Down
Loading
Loading