Azure · placerda · May 31, 2026 · May 31, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,6 +41,35 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres
   AgentOps that ships this change.
 
 ### Fixed
+- **Doctor regression check no longer flags the previous PR run as "current"
+  in CI.** The results-history loader (`agent/sources/results_history.py`)
+  was reading the wrong fields from `results.json` and excluding
+  `.agentops/results/latest/` from the candidate list. Four coordinated
+  fixes (three schema-alignment, one run-discovery) restore correctness:
+  1. `_summarize` now reads top-level `aggregate_metrics` first (the field
+     the orchestrator actually writes, per `core/results.py`), then falls
+     back to legacy `metrics`/`run_metrics`. Previously the loader looked
+     only at the legacy fields, so every freshly-written local
+     `RunSummary` had `metrics = {}` and the regression check could never
+     see the current run's metrics.
+  2. `_summarize` now reads `summary.overall_passed` first when deriving
+     the `run_pass` flag, then falls back to the legacy `summary.run_pass`
+     / `metrics.run_pass` shapes.
+  3. `_summarize` now takes each run's ordering timestamp from the first
+     non-empty of `timestamp` → `finished_at` → `started_at` →
+     `created_at` → `summary.timestamp`. The previous chain omitted
+     `finished_at`/`started_at`, which are the two fields `results.json`
+     actually contains, so every loaded run defaulted to epoch-zero ordering.
+  4. `_collect_local_runs` now includes `.agentops/results/latest/` when it
+     is the only local results directory. In CI, generated workflows run
+     `agentops eval run --output .agentops/results/latest` and write
+     nowhere else; the old loader unconditionally skipped `latest/` for
+     dev-mode dedup, so in CI `local_runs` was always empty. With cloud
+     listing trailing behind by seconds (eventual consistency), the
+     regression check would then compute `latest = previous_run` and
+     report the prior PR's coherence/groundedness scores as though they
+     belonged to the just-completed candidate. Dev-mode dedup is preserved:
+     when a timestamped sibling exists, `latest/` is still skipped.
 - **Prompt-agent deploy: `stage` no longer fails with `Required properties ["kind"] are not present` against `azure-ai-projects` 2.x.**
   `_copy_definition` previously called `.copy()` on the typed
   `PromptAgentDefinition` returned by `get_version`. In SDK 1.x that

diff --git a/src/agentops/agent/sources/results_history.py b/src/agentops/agent/sources/results_history.py
@@ -74,7 +74,12 @@ def _summarize(path: Path) -> Optional[RunSummary]:
     if not isinstance(data, dict):
         return None
 
-    metrics_raw = data.get("metrics") or data.get("run_metrics") or {}
+    metrics_raw = (
+        data.get("aggregate_metrics")
+        or data.get("metrics")
+        or data.get("run_metrics")
+        or {}
+    )
     metrics: Dict[str, float] = {}
     if isinstance(metrics_raw, dict):
         for key, value in metrics_raw.items():
@@ -85,9 +90,11 @@ def _summarize(path: Path) -> Optional[RunSummary]:
 
     summary = data.get("summary") or {}
     run_pass: Optional[bool] = None
-    if isinstance(summary, dict) and "run_pass" in summary:
+    if isinstance(summary, dict) and "overall_passed" in summary:
+        run_pass = bool(summary["overall_passed"])
+    elif isinstance(summary, dict) and "run_pass" in summary:
         run_pass = bool(summary["run_pass"])
-    elif "run_pass" in metrics_raw:
+    elif isinstance(metrics_raw, dict) and "run_pass" in metrics_raw:
         try:
             run_pass = bool(float(metrics_raw["run_pass"]))
         except (TypeError, ValueError):
@@ -105,6 +112,8 @@ def _summarize(path: Path) -> Optional[RunSummary]:
 
     timestamp_raw = (
         data.get("timestamp")
+        or data.get("finished_at")
+        or data.get("started_at")
         or data.get("created_at")
         or (summary.get("timestamp") if isinstance(summary, dict) else None)
     )
@@ -184,14 +193,23 @@ def _collect_local_runs(
         return []
 
     candidates: List[Path] = []
+    latest_target: Optional[Path] = None
     for child in base.iterdir():
         if not child.is_dir():
             continue
+        target = child / "results.json"
+        if not target.is_file():
+            continue
         if child.name == "latest":
+            latest_target = target
             continue
-        target = child / "results.json"
-        if target.is_file():
-            candidates.append(target)
+        candidates.append(target)
+    # CI workflows write directly to `.agentops/results/latest/` with no
+    # timestamped sibling. Include the `latest/` entry only when it is the
+    # sole local result so dev-mode runs (which already have a timestamped
+    # sibling) are not double-counted.
+    if not candidates and latest_target is not None:
+        candidates.append(latest_target)
 
     summaries: List[RunSummary] = []
     for path in candidates:

diff --git a/tests/unit/test_agent_results_history.py b/tests/unit/test_agent_results_history.py
@@ -70,6 +70,147 @@ def test_collect_results_history_disabled(tmp_path: Path) -> None:
     assert history.diagnostics["status"] == "disabled"
 
 
+def test_collect_results_history_loads_latest_only_in_ci(tmp_path: Path) -> None:
+    """CI writes directly to .agentops/results/latest/ without a sibling dir.
+
+    The loader must include `latest/` when no other timestamped dirs exist so
+    the regression check sees the just-completed run as `latest` instead of
+    falling back to a stale Foundry-listing entry.
+    """
+    workspace = tmp_path
+    results = workspace / ".agentops" / "results"
+    latest_dir = results / "latest"
+    latest_dir.mkdir(parents=True)
+    payload = {
+        "version": 1,
+        "started_at": "2026-05-31T12:54:00+00:00",
+        "finished_at": "2026-05-31T12:54:04+00:00",
+        "aggregate_metrics": {
+            "coherence": 5.0,
+            "similarity": 5.0,
+        },
+        "summary": {
+            "overall_passed": True,
+            "items_total": 3,
+            "items_passed_all": 3,
+        },
+    }
+    (latest_dir / "results.json").write_text(json.dumps(payload), encoding="utf-8")
+
+    config = ResultsHistorySourceConfig(
+        enabled=True, path=".agentops/results", lookback_runs=10
+    )
+    history = collect_results_history(workspace, config)
+
+    assert len(history.runs) == 1
+    run = history.runs[0]
+    assert run.run_id == "latest"
+    assert run.metrics == {"coherence": 5.0, "similarity": 5.0}
+    assert run.run_pass is True
+    assert run.items_total == 3
+    assert run.timestamp is not None
+    assert run.timestamp.year == 2026
+
+
+def test_collect_results_history_prefers_timestamped_over_latest(tmp_path: Path) -> None:
+    """In dev mode both `latest/` and a timestamped dir exist (same run).
+
+    The loader must skip `latest/` so the regression check does not see the
+    same run under two different keys.
+    """
+    workspace = tmp_path
+    results = workspace / ".agentops" / "results"
+    # Timestamped dir wins; `latest/` is its sibling pointer.
+    _write_run(results, "2026-05-31-12-54", "2026-05-31T12:54:00Z", {"coherence": 5.0})
+    _write_run(results, "latest", "2026-05-31T12:54:00Z", {"coherence": 5.0})
+
+    config = ResultsHistorySourceConfig(
+        enabled=True, path=".agentops/results", lookback_runs=10
+    )
+    history = collect_results_history(workspace, config)
+
+    assert [r.run_id for r in history.runs] == ["2026-05-31-12-54"]
+
+
+def test_collect_results_history_reads_aggregate_metrics_field(tmp_path: Path) -> None:
+    """The orchestrator writes `aggregate_metrics`, not `metrics`/`run_metrics`."""
+    workspace = tmp_path
+    results = workspace / ".agentops" / "results"
+    run_dir = results / "run-1"
+    run_dir.mkdir(parents=True)
+    payload = {
+        "version": 1,
+        "started_at": "2026-05-30T10:00:00+00:00",
+        "finished_at": "2026-05-30T10:00:30+00:00",
+        "aggregate_metrics": {"coherence": 4.5, "fluency": 4.0},
+        "summary": {
+            "overall_passed": True,
+            "items_total": 2,
+            "items_passed_all": 2,
+        },
+    }
+    (run_dir / "results.json").write_text(json.dumps(payload), encoding="utf-8")
+
+    config = ResultsHistorySourceConfig(
+        enabled=True, path=".agentops/results", lookback_runs=10
+    )
+    history = collect_results_history(workspace, config)
+
+    assert len(history.runs) == 1
+    assert history.runs[0].metrics == {"coherence": 4.5, "fluency": 4.0}
+    assert history.runs[0].run_pass is True
+
+
+def test_collect_results_history_orders_by_finished_at(tmp_path: Path) -> None:
+    """`finished_at` should be preferred over `started_at` for ordering."""
+    workspace = tmp_path
+    results = workspace / ".agentops" / "results"
+
+    # run-a started later but finished earlier (e.g., shorter run).
+    run_a = results / "run-a"
+    run_a.mkdir(parents=True)
+    (run_a / "results.json").write_text(
+        json.dumps(
+            {
+                "started_at": "2026-05-30T10:05:00+00:00",
+                "finished_at": "2026-05-30T10:05:10+00:00",
+                "aggregate_metrics": {"coherence": 4.0},
+                "summary": {
+                    "overall_passed": True,
+                    "items_total": 1,
+                    "items_passed_all": 1,
+                },
+            }
+        ),
+        encoding="utf-8",
+    )
+    # run-b started earlier but finished later (long-running).
+    run_b = results / "run-b"
+    run_b.mkdir(parents=True)
+    (run_b / "results.json").write_text(
+        json.dumps(
+            {
+                "started_at": "2026-05-30T10:00:00+00:00",
+                "finished_at": "2026-05-30T11:00:00+00:00",
+                "aggregate_metrics": {"coherence": 3.0},
+                "summary": {
+                    "overall_passed": True,
+                    "items_total": 1,
+                    "items_passed_all": 1,
+                },
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    config = ResultsHistorySourceConfig(
+        enabled=True, path=".agentops/results", lookback_runs=10
+    )
+    history = collect_results_history(workspace, config)
+
+    assert [r.run_id for r in history.runs] == ["run-a", "run-b"]
+
+
 def test_collect_results_history_falls_back_to_foundry_cloud(
     tmp_path: Path, monkeypatch
 ) -> None: