Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,5 @@ export default defineCodeGrader(({ expectedOutput, output, inputFiles }) => {
}
}

const passed = assertions.filter((a) => a.passed).length;
const score = assertions.length === 0 ? 0 : passed / assertions.length;

return {
score,
assertions,
};
return { assertions };
});
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,5 @@ export default defineCodeGrader(({ output }) => {
});
}

const passed = assertions.filter((a) => a.passed).length;
const score = assertions.length > 0 ? passed / assertions.length : 0;

return { score, assertions };
return { assertions };
});
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,5 @@ export default defineCodeGrader(({ output }) => {
assertions.push({ text: 'Answer does not mention France', passed: false });
}

const passed = assertions.filter((a) => a.passed).length;
const total = assertions.length;
return {
score: total > 0 ? passed / total : 0,
assertions,
};
return { assertions };
});
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,5 @@ export default defineCodeGrader(({ output }) => {
assertions.push({ text: `Answer has ${wordCount} words (> 50, too verbose)`, passed: false });
}

const passed = assertions.filter((a) => a.passed).length;
const total = assertions.length;
return {
score: total > 0 ? passed / total : 0,
assertions,
};
return { assertions };
});
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ export default defineCodeGrader(({ trace, tokenUsage, costUsd, durationMs }) =>
const assertions: Array<{ text: string; passed: boolean }> = [];

if (!trace) {
return {
score: 0,
assertions: [{ text: 'No trace provided', passed: false }],
};
return { assertions: [{ text: 'No trace provided', passed: false }] };
}

// Check for tokenUsage
Expand All @@ -47,11 +44,5 @@ export default defineCodeGrader(({ trace, tokenUsage, costUsd, durationMs }) =>
assertions.push({ text: 'durationMs not present', passed: false });
}

const passed = assertions.filter((a) => a.passed).length;
const score = passed / assertions.length;

return {
score,
assertions,
};
return { assertions };
});
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ if (!fileChanges || fileChanges.trim().length === 0) {
passed: false,
evidence: 'file_changes is empty — workspace not configured or file tracking failed',
});
console.log(JSON.stringify({ score: 0, assertions }));
console.log(JSON.stringify({ assertions }));
process.exit(0);
}

Expand Down Expand Up @@ -56,5 +56,4 @@ assertions.push({
evidence: hasAddFn ? undefined : 'add() function not found in diff',
});

const passed = assertions.filter((a) => a.passed).length;
console.log(JSON.stringify({ score: passed / assertions.length, assertions }));
console.log(JSON.stringify({ assertions }));
12 changes: 1 addition & 11 deletions examples/features/functional-grading/scripts/functional-check.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ const workspacePath: string | null = input.workspace_path;
if (!workspacePath) {
console.log(
JSON.stringify({
score: 0,
assertions: [
{
text: 'workspace_path not provided — cannot run functional checks',
Expand Down Expand Up @@ -63,13 +62,4 @@ if (compiled) {
runStage('tests', 'npm', ['test']);
}

const passed = assertions.filter((a) => a.passed).length;
const total = assertions.length;
const score = total > 0 ? passed / total : 0;

console.log(
JSON.stringify({
score,
assertions,
}),
);
console.log(JSON.stringify({ assertions }));
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,5 @@ export default defineCodeGrader(({ output }) => {
passed: emptyAssistant.length === 0,
});

const passed = assertions.filter((a) => a.passed).length;
return {
score: assertions.length > 0 ? passed / assertions.length : 0,
assertions,
};
return { assertions };
});
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ if (!fileChanges || fileChanges.trim().length === 0) {
passed: false,
evidence: 'file_changes is empty — workspace snapshot or git baseline may not be configured',
});
console.log(JSON.stringify({ score: 0, assertions }));
console.log(JSON.stringify({ assertions }));
process.exit(0);
}

Expand Down Expand Up @@ -65,7 +65,4 @@ assertions.push({
evidence: hasDataRow ? undefined : 'No data rows found after the header',
});

const passed = assertions.filter((a) => a.passed).length;
const score = passed / assertions.length;

console.log(JSON.stringify({ score, assertions }));
console.log(JSON.stringify({ assertions }));
10 changes: 1 addition & 9 deletions examples/showcase/cross-repo-sync/scripts/validate-sync.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,7 @@ defineCodeGrader(({ fileChanges, config }) => {

if (!fileChanges) {
assertions.push({ text: 'No file changes captured', passed: false });
return {
score: 0,
assertions,
};
return { assertions };
}

// Parse diff blocks
Expand Down Expand Up @@ -57,12 +54,7 @@ defineCodeGrader(({ fileChanges, config }) => {
}
}

const passed = assertions.filter((a) => a.passed).length;
const total = assertions.length;
const score = total > 0 ? passed / total : 0;

return {
score,
assertions,
details: {
files_checked: expectedFiles.length,
Expand Down
19 changes: 13 additions & 6 deletions packages/core/src/evaluation/graders/code-grader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -262,12 +262,6 @@ export class CodeGrader implements Grader {
// Plain-text fallback: exit code is pass/fail, stdout is the assertion text.
// For numeric scores or multi-aspect results, use the JSON protocol instead.
const passed = exitCode === 0;
const score =
parsed != null
? clampScore(typeof parsed.score === 'number' ? parsed.score : 0)
: passed
? 1
: 0;
const assertions: AssertionEntry[] =
parsed != null && Array.isArray(parsed?.assertions)
? parsed.assertions
Expand All @@ -285,6 +279,19 @@ export class CodeGrader implements Grader {
: parsed == null
? [{ text: stdout.trim() || (passed ? 'exit 0' : `exit ${exitCode}`), passed }]
: [];
// When the script omits `score` but returns `assertions`, derive score as passing/total.
const score =
parsed != null
? clampScore(
typeof parsed.score === 'number'
? parsed.score
: assertions.length > 0
? assertions.filter((a) => a.passed).length / assertions.length
: 0,
)
: passed
? 1
: 0;
// Capture optional structured details from code judge output
const details =
parsed?.details && typeof parsed.details === 'object' && !Array.isArray(parsed.details)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,34 @@ describe('code-grader plain-text fallback', () => {
expect(result.assertions[0].text).toBe('ok');
});

// The echoed JSON carries three assertions (two with passed:true) and no `score` field,
// so the expected score is 2/3. toBeCloseTo is used because 2/3 is a floating-point value.
// NOTE(review): `grader(...)` and `ctx` are defined outside this view — presumably the
// string is run as the grader script's shell command; confirm against the test setup.
it('assertions without score → derived as passing/total', async () => {
const result = await grader(
`echo '{"assertions":[{"text":"a","passed":true},{"text":"b","passed":false},{"text":"c","passed":true}]}'`,
).evaluate(ctx);
expect(result.score).toBeCloseTo(2 / 3);
// The assertions from the script output are expected to come back unchanged in count.
expect(result.assertions).toHaveLength(3);
});

// Both echoed assertions have passed:true and no `score` field is present,
// so the expected score is exactly 1.
it('assertions all passing without score → score 1', async () => {
const result = await grader(
`echo '{"assertions":[{"text":"a","passed":true},{"text":"b","passed":true}]}'`,
).evaluate(ctx);
expect(result.score).toBe(1);
});

// The single echoed assertion has passed:false and no `score` field is present,
// so the expected score is exactly 0.
it('assertions all failing without score → score 0', async () => {
const result = await grader(`echo '{"assertions":[{"text":"a","passed":false}]}'`).evaluate(
ctx,
);
expect(result.score).toBe(0);
});

// Edge case: an empty `assertions` array with no `score` field. The test expects a
// score of 0 and an empty assertions list in the result.
it('empty assertions array without score → score 0', async () => {
const result = await grader(`echo '{"assertions":[]}'`).evaluate(ctx);
expect(result.score).toBe(0);
expect(result.assertions).toHaveLength(0);
});

it('script with stderr on non-zero exit → surfaces as error assertion', async () => {
const result = await grader('echo "bad" >&2; exit 1').evaluate(ctx);
expect(result.score).toBe(0);
Expand Down
Loading