Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,35 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres
AgentOps that ships this change.

### Fixed
- **Doctor regression check no longer flags the previous PR run as "current"
in CI.** The results-history loader (`agent/sources/results_history.py`)
was reading the wrong fields from `results.json` and excluding
`.agentops/results/latest/` from the candidate list. Four coordinated
fixes (three schema alignments plus one candidate-list change) restore
correctness:
1. `_summarize` now reads top-level `aggregate_metrics` first (the field
the orchestrator actually writes, per `core/results.py`), then falls
back to legacy `metrics`/`run_metrics`. Previously the loader looked
only at the legacy fields, so every freshly-written local
`RunSummary` had `metrics = {}` and the regression check could never
see the current run's metrics.
2. `_summarize` now reads `summary.overall_passed` first when deriving
the `run_pass` flag, then falls back to the legacy `summary.run_pass`
/ `metrics.run_pass` shapes.
3. `_summarize` now orders runs by `timestamp` → `finished_at` →
`started_at` → `created_at` → `summary.timestamp`. The previous list
omitted `finished_at`/`started_at`, which are the two fields
`results.json` actually contains, so every loaded run defaulted to
epoch-zero ordering.
4. `_collect_local_runs` now includes `.agentops/results/latest/` when it
is the only local results directory. In CI, generated workflows run
`agentops eval run --output .agentops/results/latest` and write
nowhere else; the old loader unconditionally skipped `latest/` for
dev-mode dedup, so in CI `local_runs` was always empty. With cloud
listing trailing behind by seconds (eventual consistency), the
regression check would then compute `latest = previous_run` and
blame the just-completed candidate's coherence/groundedness on the
prior PR. Dev-mode dedup is preserved: when a timestamped sibling
exists, `latest/` is still skipped.
- **Prompt-agent deploy: `stage` no longer fails with `Required properties ["kind"] are not present` against `azure-ai-projects` 2.x.**
`_copy_definition` previously called `.copy()` on the typed
`PromptAgentDefinition` returned by `get_version`. In SDK 1.x that
Expand Down
30 changes: 24 additions & 6 deletions src/agentops/agent/sources/results_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,12 @@ def _summarize(path: Path) -> Optional[RunSummary]:
if not isinstance(data, dict):
return None

metrics_raw = data.get("metrics") or data.get("run_metrics") or {}
metrics_raw = (
data.get("aggregate_metrics")
or data.get("metrics")
or data.get("run_metrics")
or {}
)
metrics: Dict[str, float] = {}
if isinstance(metrics_raw, dict):
for key, value in metrics_raw.items():
Expand All @@ -85,9 +90,11 @@ def _summarize(path: Path) -> Optional[RunSummary]:

summary = data.get("summary") or {}
run_pass: Optional[bool] = None
if isinstance(summary, dict) and "run_pass" in summary:
if isinstance(summary, dict) and "overall_passed" in summary:
run_pass = bool(summary["overall_passed"])
elif isinstance(summary, dict) and "run_pass" in summary:
run_pass = bool(summary["run_pass"])
elif "run_pass" in metrics_raw:
elif isinstance(metrics_raw, dict) and "run_pass" in metrics_raw:
try:
run_pass = bool(float(metrics_raw["run_pass"]))
except (TypeError, ValueError):
Expand All @@ -105,6 +112,8 @@ def _summarize(path: Path) -> Optional[RunSummary]:

timestamp_raw = (
data.get("timestamp")
or data.get("finished_at")
or data.get("started_at")
or data.get("created_at")
or (summary.get("timestamp") if isinstance(summary, dict) else None)
)
Expand Down Expand Up @@ -184,14 +193,23 @@ def _collect_local_runs(
return []

candidates: List[Path] = []
latest_target: Optional[Path] = None
for child in base.iterdir():
if not child.is_dir():
continue
target = child / "results.json"
if not target.is_file():
continue
if child.name == "latest":
latest_target = target
continue
target = child / "results.json"
if target.is_file():
candidates.append(target)
candidates.append(target)
# CI workflows write directly to `.agentops/results/latest/` with no
# timestamped sibling. Include the `latest/` entry only when it is the
# sole local result so dev-mode runs (which already have a timestamped
# sibling) are not double-counted.
if not candidates and latest_target is not None:
candidates.append(latest_target)

summaries: List[RunSummary] = []
for path in candidates:
Expand Down
141 changes: 141 additions & 0 deletions tests/unit/test_agent_results_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,147 @@ def test_collect_results_history_disabled(tmp_path: Path) -> None:
assert history.diagnostics["status"] == "disabled"


def test_collect_results_history_loads_latest_only_in_ci(tmp_path: Path) -> None:
    """A lone `latest/` directory must be loaded as the single local run.

    CI writes straight into `.agentops/results/latest/` and creates no
    timestamped sibling. With nothing else to pick from, the loader has to
    surface `latest/` so the regression check treats the just-completed run
    as `latest` rather than a stale Foundry-listing entry.
    """
    ci_output_dir = tmp_path / ".agentops" / "results" / "latest"
    ci_output_dir.mkdir(parents=True)
    results_file = ci_output_dir / "results.json"
    results_file.write_text(
        json.dumps(
            {
                "version": 1,
                "started_at": "2026-05-31T12:54:00+00:00",
                "finished_at": "2026-05-31T12:54:04+00:00",
                "aggregate_metrics": {
                    "coherence": 5.0,
                    "similarity": 5.0,
                },
                "summary": {
                    "overall_passed": True,
                    "items_total": 3,
                    "items_passed_all": 3,
                },
            }
        ),
        encoding="utf-8",
    )

    history = collect_results_history(
        tmp_path,
        ResultsHistorySourceConfig(
            enabled=True, path=".agentops/results", lookback_runs=10
        ),
    )

    assert len(history.runs) == 1
    loaded = history.runs[0]
    # Identity and metrics come from the `latest/` payload written above.
    assert loaded.run_id == "latest"
    assert loaded.metrics == {"coherence": 5.0, "similarity": 5.0}
    assert loaded.run_pass is True
    assert loaded.items_total == 3
    # The payload has no `timestamp` key, only `started_at`/`finished_at`.
    assert loaded.timestamp is not None
    assert loaded.timestamp.year == 2026


def test_collect_results_history_prefers_timestamped_over_latest(tmp_path: Path) -> None:
    """`latest/` is dropped when a timestamped sibling directory exists.

    Dev mode leaves both a timestamped directory and `latest/` for the same
    run; only the timestamped entry may appear in the history so the run is
    not reported under two different keys.
    """
    results_root = tmp_path / ".agentops" / "results"
    # Same timestamp and metrics under both names: the timestamped dir and
    # its `latest/` sibling describe one and the same run.
    for run_name in ("2026-05-31-12-54", "latest"):
        _write_run(results_root, run_name, "2026-05-31T12:54:00Z", {"coherence": 5.0})

    history = collect_results_history(
        tmp_path,
        ResultsHistorySourceConfig(
            enabled=True, path=".agentops/results", lookback_runs=10
        ),
    )

    loaded_ids = [entry.run_id for entry in history.runs]
    assert loaded_ids == ["2026-05-31-12-54"]


def test_collect_results_history_reads_aggregate_metrics_field(tmp_path: Path) -> None:
    """Metrics are taken from `aggregate_metrics`, not `metrics`/`run_metrics`."""
    run_dir = tmp_path / ".agentops" / "results" / "run-1"
    run_dir.mkdir(parents=True)
    results_file = run_dir / "results.json"
    results_file.write_text(
        json.dumps(
            {
                "version": 1,
                "started_at": "2026-05-30T10:00:00+00:00",
                "finished_at": "2026-05-30T10:00:30+00:00",
                "aggregate_metrics": {"coherence": 4.5, "fluency": 4.0},
                "summary": {
                    "overall_passed": True,
                    "items_total": 2,
                    "items_passed_all": 2,
                },
            }
        ),
        encoding="utf-8",
    )

    history = collect_results_history(
        tmp_path,
        ResultsHistorySourceConfig(
            enabled=True, path=".agentops/results", lookback_runs=10
        ),
    )

    assert len(history.runs) == 1
    loaded = history.runs[0]
    assert loaded.metrics == {"coherence": 4.5, "fluency": 4.0}
    # The pass flag is supplied only through `summary.overall_passed` here.
    assert loaded.run_pass is True


def test_collect_results_history_orders_by_finished_at(tmp_path: Path) -> None:
    """Runs are ordered by `finished_at` in preference to `started_at`."""
    results_root = tmp_path / ".agentops" / "results"

    # (run id, started_at, finished_at, coherence)
    # run-a started later but finished earlier (short run); run-b started
    # earlier but finished later (long run). The two fields therefore imply
    # opposite orderings, so the assertion below pins which one is used.
    run_specs = (
        ("run-a", "2026-05-30T10:05:00+00:00", "2026-05-30T10:05:10+00:00", 4.0),
        ("run-b", "2026-05-30T10:00:00+00:00", "2026-05-30T11:00:00+00:00", 3.0),
    )
    for run_name, started, finished, coherence in run_specs:
        run_dir = results_root / run_name
        run_dir.mkdir(parents=True)
        document = {
            "started_at": started,
            "finished_at": finished,
            "aggregate_metrics": {"coherence": coherence},
            "summary": {
                "overall_passed": True,
                "items_total": 1,
                "items_passed_all": 1,
            },
        }
        (run_dir / "results.json").write_text(json.dumps(document), encoding="utf-8")

    history = collect_results_history(
        tmp_path,
        ResultsHistorySourceConfig(
            enabled=True, path=".agentops/results", lookback_runs=10
        ),
    )

    ordered_ids = [entry.run_id for entry in history.runs]
    assert ordered_ids == ["run-a", "run-b"]


def test_collect_results_history_falls_back_to_foundry_cloud(
tmp_path: Path, monkeypatch
) -> None:
Expand Down
Loading