diff --git a/examples/features/code-grader-sdk/scripts/verify-attachments.ts b/examples/features/code-grader-sdk/scripts/verify-attachments.ts index 2fec360b1..430b7a66a 100755 --- a/examples/features/code-grader-sdk/scripts/verify-attachments.ts +++ b/examples/features/code-grader-sdk/scripts/verify-attachments.ts @@ -58,11 +58,5 @@ export default defineCodeGrader(({ expectedOutput, output, inputFiles }) => { } } - const passed = assertions.filter((a) => a.passed).length; - const score = assertions.length === 0 ? 0 : passed / assertions.length; - - return { - score, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/copilot-log-eval/graders/transcript-quality.ts b/examples/features/copilot-log-eval/graders/transcript-quality.ts index 87c9f329a..b9ea5d000 100644 --- a/examples/features/copilot-log-eval/graders/transcript-quality.ts +++ b/examples/features/copilot-log-eval/graders/transcript-quality.ts @@ -91,8 +91,5 @@ export default defineCodeGrader(({ output }) => { }); } - const passed = assertions.filter((a) => a.passed).length; - const score = assertions.length > 0 ? passed / assertions.length : 0; - - return { score, assertions }; + return { assertions }; }); diff --git a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts index 5004381de..58f67833e 100644 --- a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts +++ b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts @@ -37,10 +37,5 @@ export default defineCodeGrader(({ output }) => { assertions.push({ text: 'Answer does not mention France', passed: false }); } - const passed = assertions.filter((a) => a.passed).length; - const total = assertions.length; - return { - score: total > 0 ? passed / total : 0, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts index da054ff5d..6f939ba44 100644 --- a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts +++ b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts @@ -37,10 +37,5 @@ export default defineCodeGrader(({ output }) => { assertions.push({ text: `Answer has ${wordCount} words (> 50, too verbose)`, passed: false }); } - const passed = assertions.filter((a) => a.passed).length; - const total = assertions.length; - return { - score: total > 0 ? passed / total : 0, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/execution-metrics/scripts/check-metrics-present.ts b/examples/features/execution-metrics/scripts/check-metrics-present.ts index 1aacdea13..c85926439 100644 --- a/examples/features/execution-metrics/scripts/check-metrics-present.ts +++ b/examples/features/execution-metrics/scripts/check-metrics-present.ts @@ -17,10 +17,7 @@ export default defineCodeGrader(({ trace, tokenUsage, costUsd, durationMs }) => const assertions: Array<{ text: string; passed: boolean }> = []; if (!trace) { - return { - score: 0, - assertions: [{ text: 'No trace provided', passed: false }], - }; + return { assertions: [{ text: 'No trace provided', passed: false }] }; } // Check for tokenUsage @@ -47,11 +44,5 @@ export default defineCodeGrader(({ trace, tokenUsage, costUsd, durationMs }) => assertions.push({ text: 'durationMs not present', passed: false }); } - const passed = assertions.filter((a) => a.passed).length; - const score = passed / assertions.length; - - return { - score, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/file-changes-with-repos/scripts/check-file-changes.ts b/examples/features/file-changes-with-repos/scripts/check-file-changes.ts index b198b7a35..a150c3fdf 100644 --- a/examples/features/file-changes-with-repos/scripts/check-file-changes.ts +++ b/examples/features/file-changes-with-repos/scripts/check-file-changes.ts @@ -22,7 +22,7 @@ if (!fileChanges || fileChanges.trim().length === 0) { passed: false, evidence: 'file_changes is empty — workspace not configured or file tracking failed', }); - console.log(JSON.stringify({ score: 0, assertions })); + console.log(JSON.stringify({ assertions })); process.exit(0); } @@ -56,5 +56,4 @@ assertions.push({ evidence: hasAddFn ? undefined : 'add() function not found in diff', }); -const passed = assertions.filter((a) => a.passed).length; -console.log(JSON.stringify({ score: passed / assertions.length, assertions })); +console.log(JSON.stringify({ assertions })); diff --git a/examples/features/functional-grading/scripts/functional-check.ts b/examples/features/functional-grading/scripts/functional-check.ts index 8952f9e9c..b4c933b1e 100644 --- a/examples/features/functional-grading/scripts/functional-check.ts +++ b/examples/features/functional-grading/scripts/functional-check.ts @@ -19,7 +19,6 @@ const workspacePath: string | null = input.workspace_path; if (!workspacePath) { console.log( JSON.stringify({ - score: 0, assertions: [ { text: 'workspace_path not provided — cannot run functional checks', @@ -63,13 +62,4 @@ if (compiled) { runStage('tests', 'npm', ['test']); } -const passed = assertions.filter((a) => a.passed).length; -const total = assertions.length; -const score = total > 0 ? passed / total : 0; - -console.log( - JSON.stringify({ - score, - assertions, - }), -); +console.log(JSON.stringify({ assertions })); diff --git a/examples/features/import-claude/graders/transcript-quality.ts b/examples/features/import-claude/graders/transcript-quality.ts index 91e61ad04..9dc7b0ef5 100644 --- a/examples/features/import-claude/graders/transcript-quality.ts +++ b/examples/features/import-claude/graders/transcript-quality.ts @@ -28,9 +28,5 @@ export default defineCodeGrader(({ output }) => { passed: emptyAssistant.length === 0, }); - const passed = assertions.filter((a) => a.passed).length; - return { - score: assertions.length > 0 ? passed / assertions.length : 0, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/workspace-artifact/scripts/check-csv-artifact.ts b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts index 43809cdea..7836489ee 100644 --- a/examples/features/workspace-artifact/scripts/check-csv-artifact.ts +++ b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts @@ -25,7 +25,7 @@ if (!fileChanges || fileChanges.trim().length === 0) { passed: false, evidence: 'file_changes is empty — workspace snapshot or git baseline may not be configured', }); - console.log(JSON.stringify({ score: 0, assertions })); + console.log(JSON.stringify({ assertions })); process.exit(0); } @@ -65,7 +65,4 @@ assertions.push({ evidence: hasDataRow ? undefined : 'No data rows found after the header', }); -const passed = assertions.filter((a) => a.passed).length; -const score = passed / assertions.length; - -console.log(JSON.stringify({ score, assertions })); +console.log(JSON.stringify({ assertions })); diff --git a/examples/showcase/cross-repo-sync/scripts/validate-sync.ts b/examples/showcase/cross-repo-sync/scripts/validate-sync.ts index 59bb8bcbc..4512bfa19 100644 --- a/examples/showcase/cross-repo-sync/scripts/validate-sync.ts +++ b/examples/showcase/cross-repo-sync/scripts/validate-sync.ts @@ -26,10 +26,7 @@ defineCodeGrader(({ fileChanges, config }) => { if (!fileChanges) { assertions.push({ text: 'No file changes captured', passed: false }); - return { - score: 0, - assertions, - }; + return { assertions }; } // Parse diff blocks @@ -57,12 +54,7 @@ defineCodeGrader(({ fileChanges, config }) => { } } - const passed = assertions.filter((a) => a.passed).length; - const total = assertions.length; - const score = total > 0 ? passed / total : 0; - return { - score, assertions, details: { files_checked: expectedFiles.length, diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 62e9a7f37..b672ab32d 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -262,12 +262,6 @@ export class CodeGrader implements Grader { // Plain-text fallback: exit code is pass/fail, stdout is the assertion text. // For numeric scores or multi-aspect results, use the JSON protocol instead. const passed = exitCode === 0; - const score = - parsed != null - ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) - : passed - ? 1 - : 0; const assertions: AssertionEntry[] = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions @@ -285,6 +279,19 @@ export class CodeGrader implements Grader { : parsed == null ? [{ text: stdout.trim() || (passed ? 'exit 0' : `exit ${exitCode}`), passed }] : []; + // When the script omits `score` but returns `assertions`, derive score as passing/total. + const score = + parsed != null + ? clampScore( + typeof parsed.score === 'number' + ? parsed.score + : assertions.length > 0 + ? assertions.filter((a) => a.passed).length / assertions.length + : 0, + ) + : passed + ? 1 + : 0; // Capture optional structured details from code judge output const details = parsed?.details && typeof parsed.details === 'object' && !Array.isArray(parsed.details) diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts index 27d863b4c..86f9b8709 100644 --- a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts +++ b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts @@ -67,6 +67,34 @@ describe('code-grader plain-text fallback', () => { expect(result.assertions[0].text).toBe('ok'); }); + it('assertions without score → derived as passing/total', async () => { + const result = await grader( + `echo '{"assertions":[{"text":"a","passed":true},{"text":"b","passed":false},{"text":"c","passed":true}]}'`, + ).evaluate(ctx); + expect(result.score).toBeCloseTo(2 / 3); + expect(result.assertions).toHaveLength(3); + }); + + it('assertions all passing without score → score 1', async () => { + const result = await grader( + `echo '{"assertions":[{"text":"a","passed":true},{"text":"b","passed":true}]}'`, + ).evaluate(ctx); + expect(result.score).toBe(1); + }); + + it('assertions all failing without score → score 0', async () => { + const result = await grader(`echo '{"assertions":[{"text":"a","passed":false}]}'`).evaluate( + ctx, + ); + expect(result.score).toBe(0); + }); + + it('empty assertions array without score → score 0', async () => { + const result = await grader(`echo '{"assertions":[]}'`).evaluate(ctx); + expect(result.score).toBe(0); + expect(result.assertions).toHaveLength(0); + }); + it('script with stderr on non-zero exit → surfaces as error assertion', async () => { const result = await grader('echo "bad" >&2; exit 1').evaluate(ctx); expect(result.score).toBe(0);