diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e98a15..ea589d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,35 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres AgentOps that ships this change. ### Fixed +- **Doctor regression check no longer flags the previous PR run as "current" + in CI.** The results-history loader (`agent/sources/results_history.py`) + was reading the wrong fields from `results.json` and excluding + `.agentops/results/latest/` from the candidate list. Four coordinated + schema-alignment fixes restore correctness: + 1. `_summarize` now reads top-level `aggregate_metrics` first (the field + the orchestrator actually writes, per `core/results.py`), then falls + back to legacy `metrics`/`run_metrics`. Previously the loader looked + only at the legacy fields, so every freshly-written local + `RunSummary` had `metrics = {}` and the regression check could never + see the current run's metrics. + 2. `_summarize` now reads `summary.overall_passed` first when deriving + the `run_pass` flag, then falls back to the legacy `summary.run_pass` + / `metrics.run_pass` shapes. + 3. `_summarize` now orders runs by `timestamp` → `finished_at` → + `started_at` → `created_at` → `summary.timestamp`. The previous list + omitted `finished_at`/`started_at`, which are the two fields + `results.json` actually contains, so every loaded run defaulted to + epoch-zero ordering. + 4. `_collect_local_runs` now includes `.agentops/results/latest/` when it + is the only local results directory. In CI, generated workflows run + `agentops eval run --output .agentops/results/latest` and write + nowhere else; the old loader unconditionally skipped `latest/` for + dev-mode dedup, so in CI `local_runs` was always empty. With cloud + listing trailing behind by seconds (eventual consistency), the + regression check would then compute `latest = previous_run` and + blame the just-completed candidate's coherence/groundedness on the + prior PR. 
Dev-mode dedup is preserved: when a timestamped sibling + exists, `latest/` is still skipped. - **Prompt-agent deploy: `stage` no longer fails with `Required properties ["kind"] are not present` against `azure-ai-projects` 2.x.** `_copy_definition` previously called `.copy()` on the typed `PromptAgentDefinition` returned by `get_version`. In SDK 1.x that diff --git a/src/agentops/agent/sources/results_history.py b/src/agentops/agent/sources/results_history.py index 966fb13..72c5f3e 100644 --- a/src/agentops/agent/sources/results_history.py +++ b/src/agentops/agent/sources/results_history.py @@ -74,7 +74,12 @@ def _summarize(path: Path) -> Optional[RunSummary]: if not isinstance(data, dict): return None - metrics_raw = data.get("metrics") or data.get("run_metrics") or {} + metrics_raw = ( + data.get("aggregate_metrics") + or data.get("metrics") + or data.get("run_metrics") + or {} + ) metrics: Dict[str, float] = {} if isinstance(metrics_raw, dict): for key, value in metrics_raw.items(): @@ -85,9 +90,11 @@ def _summarize(path: Path) -> Optional[RunSummary]: summary = data.get("summary") or {} run_pass: Optional[bool] = None - if isinstance(summary, dict) and "run_pass" in summary: + if isinstance(summary, dict) and "overall_passed" in summary: + run_pass = bool(summary["overall_passed"]) + elif isinstance(summary, dict) and "run_pass" in summary: run_pass = bool(summary["run_pass"]) - elif "run_pass" in metrics_raw: + elif isinstance(metrics_raw, dict) and "run_pass" in metrics_raw: try: run_pass = bool(float(metrics_raw["run_pass"])) except (TypeError, ValueError): @@ -105,6 +112,8 @@ def _summarize(path: Path) -> Optional[RunSummary]: timestamp_raw = ( data.get("timestamp") + or data.get("finished_at") + or data.get("started_at") or data.get("created_at") or (summary.get("timestamp") if isinstance(summary, dict) else None) ) @@ -184,14 +193,23 @@ def _collect_local_runs( return [] candidates: List[Path] = [] + latest_target: Optional[Path] = None for child in 
base.iterdir(): if not child.is_dir(): continue + target = child / "results.json" + if not target.is_file(): + continue if child.name == "latest": + latest_target = target continue - target = child / "results.json" - if target.is_file(): - candidates.append(target) + candidates.append(target) + # CI workflows write directly to `.agentops/results/latest/` with no + # timestamped sibling. Include the `latest/` entry only when it is the + # sole local result so dev-mode runs (which already have a timestamped + # sibling) are not double-counted. + if not candidates and latest_target is not None: + candidates.append(latest_target) summaries: List[RunSummary] = [] for path in candidates: diff --git a/tests/unit/test_agent_results_history.py b/tests/unit/test_agent_results_history.py index 769f0ae..4584976 100644 --- a/tests/unit/test_agent_results_history.py +++ b/tests/unit/test_agent_results_history.py @@ -70,6 +70,147 @@ def test_collect_results_history_disabled(tmp_path: Path) -> None: assert history.diagnostics["status"] == "disabled" +def test_collect_results_history_loads_latest_only_in_ci(tmp_path: Path) -> None: + """CI writes directly to .agentops/results/latest/ without a sibling dir. + + The loader must include `latest/` when no other timestamped dirs exist so + the regression check sees the just-completed run as `latest` instead of + falling back to a stale Foundry-listing entry. 
+ """ + workspace = tmp_path + results = workspace / ".agentops" / "results" + latest_dir = results / "latest" + latest_dir.mkdir(parents=True) + payload = { + "version": 1, + "started_at": "2026-05-31T12:54:00+00:00", + "finished_at": "2026-05-31T12:54:04+00:00", + "aggregate_metrics": { + "coherence": 5.0, + "similarity": 5.0, + }, + "summary": { + "overall_passed": True, + "items_total": 3, + "items_passed_all": 3, + }, + } + (latest_dir / "results.json").write_text(json.dumps(payload), encoding="utf-8") + + config = ResultsHistorySourceConfig( + enabled=True, path=".agentops/results", lookback_runs=10 + ) + history = collect_results_history(workspace, config) + + assert len(history.runs) == 1 + run = history.runs[0] + assert run.run_id == "latest" + assert run.metrics == {"coherence": 5.0, "similarity": 5.0} + assert run.run_pass is True + assert run.items_total == 3 + assert run.timestamp is not None + assert run.timestamp.year == 2026 + + +def test_collect_results_history_prefers_timestamped_over_latest(tmp_path: Path) -> None: + """In dev mode both `latest/` and a timestamped dir exist (same run). + + The loader must skip `latest/` so the regression check does not see the + same run under two different keys. + """ + workspace = tmp_path + results = workspace / ".agentops" / "results" + # Timestamped dir wins; `latest/` is its sibling pointer. 
+ _write_run(results, "2026-05-31-12-54", "2026-05-31T12:54:00Z", {"coherence": 5.0}) + _write_run(results, "latest", "2026-05-31T12:54:00Z", {"coherence": 5.0}) + + config = ResultsHistorySourceConfig( + enabled=True, path=".agentops/results", lookback_runs=10 + ) + history = collect_results_history(workspace, config) + + assert [r.run_id for r in history.runs] == ["2026-05-31-12-54"] + + +def test_collect_results_history_reads_aggregate_metrics_field(tmp_path: Path) -> None: + """The orchestrator writes `aggregate_metrics`, not `metrics`/`run_metrics`.""" + workspace = tmp_path + results = workspace / ".agentops" / "results" + run_dir = results / "run-1" + run_dir.mkdir(parents=True) + payload = { + "version": 1, + "started_at": "2026-05-30T10:00:00+00:00", + "finished_at": "2026-05-30T10:00:30+00:00", + "aggregate_metrics": {"coherence": 4.5, "fluency": 4.0}, + "summary": { + "overall_passed": True, + "items_total": 2, + "items_passed_all": 2, + }, + } + (run_dir / "results.json").write_text(json.dumps(payload), encoding="utf-8") + + config = ResultsHistorySourceConfig( + enabled=True, path=".agentops/results", lookback_runs=10 + ) + history = collect_results_history(workspace, config) + + assert len(history.runs) == 1 + assert history.runs[0].metrics == {"coherence": 4.5, "fluency": 4.0} + assert history.runs[0].run_pass is True + + +def test_collect_results_history_orders_by_finished_at(tmp_path: Path) -> None: + """`finished_at` should be preferred over `started_at` for ordering.""" + workspace = tmp_path + results = workspace / ".agentops" / "results" + + # run-a started later but finished earlier (e.g., shorter run). 
+ run_a = results / "run-a" + run_a.mkdir(parents=True) + (run_a / "results.json").write_text( + json.dumps( + { + "started_at": "2026-05-30T10:05:00+00:00", + "finished_at": "2026-05-30T10:05:10+00:00", + "aggregate_metrics": {"coherence": 4.0}, + "summary": { + "overall_passed": True, + "items_total": 1, + "items_passed_all": 1, + }, + } + ), + encoding="utf-8", + ) + # run-b started earlier but finished later (long-running). + run_b = results / "run-b" + run_b.mkdir(parents=True) + (run_b / "results.json").write_text( + json.dumps( + { + "started_at": "2026-05-30T10:00:00+00:00", + "finished_at": "2026-05-30T11:00:00+00:00", + "aggregate_metrics": {"coherence": 3.0}, + "summary": { + "overall_passed": True, + "items_total": 1, + "items_passed_all": 1, + }, + } + ), + encoding="utf-8", + ) + + config = ResultsHistorySourceConfig( + enabled=True, path=".agentops/results", lookback_runs=10 + ) + history = collect_results_history(workspace, config) + + assert [r.run_id for r in history.runs] == ["run-a", "run-b"] + + def test_collect_results_history_falls_back_to_foundry_cloud( tmp_path: Path, monkeypatch ) -> None: