Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ Dockerfile.local
CLAUDE.local.md
.zed
tsconfig.tsbuildinfo

# BTX spec cache
py/src/braintrust/btx/.spec-cache/
12 changes: 12 additions & 0 deletions py/noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def _pinned_python_version():
CONTRIB_DIR = "braintrust/contrib"
DEVSERVER_DIR = "braintrust/devserver"
TYPE_TESTS_DIR = "braintrust/type_tests"
# Directory holding the BTX cross-language spec tests (the path exercised by
# the test_btx_openai session).
BTX_DIR = "braintrust/btx"


SILENT_INSTALLS = True
Expand Down Expand Up @@ -195,6 +196,16 @@ def test_openai_http2_streaming(session, version):
_run_tests(session, f"{INTEGRATION_DIR}/openai/test_openai_http2.py", version=version)


@nox.session()
@nox.parametrize("version", OPENAI_VERSIONS, ids=OPENAI_VERSIONS)
def test_btx_openai(session, version):
    """Run the BTX cross-language LLM-span spec tests (OpenAI provider).

    Parametrized over every entry in OPENAI_VERSIONS; each run installs the
    shared test dependencies plus the requested ``openai`` release and then
    runs the tests under BTX_DIR with the provider/client selected through
    the BTX_PROVIDER / BTX_CLIENT environment variables.
    """
    _install_test_deps(session)
    _install_matrix_dep(session, "openai", version)
    # NOTE(review): presumably needed to parse the YAML spec files — confirm.
    session.install("pyyaml")
    _run_tests(
        session,
        # Use the shared constant rather than repeating the literal path so
        # this session and _run_core_tests cannot drift apart.
        BTX_DIR,
        version=version,
        env={"BTX_PROVIDER": "openai", "BTX_CLIENT": "openai"},
    )


OPENAI_AGENTS_VERSIONS = _get_matrix_versions("openai-agents")


Expand Down Expand Up @@ -560,6 +571,7 @@ def _run_core_tests(session):
CONTRIB_DIR,
DEVSERVER_DIR,
TYPE_TESTS_DIR,
BTX_DIR,
],
)

Expand Down
2 changes: 2 additions & 0 deletions py/src/braintrust/btx/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# BTX: Cross-language SDK spec tests for the Braintrust Python SDK.
# Specs are fetched from braintrustdata/braintrust-spec at a pinned ref.
299 changes: 299 additions & 0 deletions py/src/braintrust/btx/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
"""pytest configuration for BTX tests.

Dual-mode operation:
VCR off (--disable-vcr):
- Real provider API calls
- Spans sent to Braintrust backend via real logger
- Spans fetched back via BTQL for validation

VCR on (default, cassettes present):
- Provider HTTP replayed from cassettes
- Spans captured in memory via _internal_with_memory_background_logger
- No Braintrust backend calls needed

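The VCR mode is detected from the pytest-vcr --disable-vcr option already
present in the test session. Note that --vcr-record=all makes real provider
calls but still goes through VCR (to record cassettes) and captures spans in
memory, so it counts as "VCR on".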
"""

from __future__ import annotations

import os
import sys
import tarfile
import tempfile
import urllib.request
from pathlib import Path

import pytest
from braintrust import logger
from braintrust.test_helpers import init_test_logger


# Directory containing this conftest (the btx package directory).
_BTX_DIR = Path(__file__).parent
# Text file holding the pinned braintrust-spec ref (read by _read_spec_ref).
_SPEC_REF_FILE = _BTX_DIR / "spec-ref.txt"
# Local download cache; each fetched ref gets its own sub-directory.
_SPEC_CACHE_DIR = _BTX_DIR / ".spec-cache"

# Project name passed to init_test_logger when spans are captured in memory.
_TEST_PROJECT = "btx-test-project"

# Stash key: spec root path, set by pytest_configure before collection
_spec_root_key = pytest.StashKey[Path]()
# Stash key: whether VCR is disabled (live mode)
_vcr_off_key = pytest.StashKey[bool]()


# ---------------------------------------------------------------------------
# Spec fetching — before collection
# ---------------------------------------------------------------------------


def _read_spec_ref() -> str:
    """Return the pinned spec ref from spec-ref.txt, without surrounding whitespace."""
    raw_ref = _SPEC_REF_FILE.read_text()
    return raw_ref.strip()


def _fetch_spec_if_needed(ref: str) -> Path:
    """Download braintrust-spec@ref into the local cache; skip if already present.

    Pure Python implementation — no bash or curl required, works on all
    platforms including Windows.

    Race-condition safe: extracts into a temporary sibling directory and then
    renames it into the final cache_dir. If two processes race, one wins the
    rename and the other detects that the final directory already exists and
    uses it.

    Args:
        ref: Git ref (commit, tag or branch) of braintrustdata/braintrust-spec.

    Returns:
        Path to ``<cache>/<ref>/test/llm_span``.

    Raises:
        FileNotFoundError: The archive is empty, or it does not contain the
            expected ``test/llm_span`` directory.
        tarfile.TarError: The archive is malformed or contains an unsafe path.
        OSError: Download or filesystem failure (includes urllib errors).
    """
    import shutil

    cache_dir = _SPEC_CACHE_DIR / ref
    llm_span_root = cache_dir / "test" / "llm_span"

    if llm_span_root.exists():
        return llm_span_root

    _SPEC_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    print(f"\n[btx] Fetching braintrust-spec@{ref} ...")

    url = f"https://github.com/braintrustdata/braintrust-spec/archive/{ref}.tar.gz"

    # Extract into a unique temp directory next to the final cache_dir so that
    # the eventual rename stays on one filesystem (no cross-device move).
    tmp_dir = Path(tempfile.mkdtemp(dir=_SPEC_CACHE_DIR, prefix=f"{ref}.tmp."))
    # Use mkstemp (not deprecated mktemp) to atomically create the temp tarball.
    tmp_tar_fd, tmp_tar_str = tempfile.mkstemp(suffix=".tar.gz", dir=_SPEC_CACHE_DIR)
    os.close(tmp_tar_fd)
    tmp_tar = Path(tmp_tar_str)

    # Extraction filters exist on 3.12+ and on patched 3.8-3.11 releases, so
    # detect the feature itself instead of comparing version numbers.
    has_data_filter = hasattr(tarfile, "data_filter")

    try:
        urllib.request.urlretrieve(url, tmp_tar)

        with tarfile.open(tmp_tar, "r:gz") as tar:
            members = tar.getmembers()
            if not members:
                raise FileNotFoundError(f"braintrust-spec archive is empty: {url}")

            # Strip the top-level directory (e.g. "braintrust-spec-af0e006/")
            top = members[0].name.split("/")[0] + "/"
            for member in members:
                # Skips the top-level directory entry itself and anything that
                # unexpectedly lives outside of it (never slice such names).
                if not member.name.startswith(top):
                    continue
                member.name = member.name[len(top) :]
                if not member.name:
                    continue
                if has_data_filter:
                    tar.extract(member, tmp_dir, filter="data")
                else:
                    # No filter support: apply a minimal path-traversal check
                    # ourselves before extracting.
                    if os.path.isabs(member.name) or ".." in member.name.split("/"):
                        raise tarfile.TarError(f"Unsafe path in braintrust-spec archive: {member.name!r}")
                    tar.extract(member, tmp_dir)  # noqa: S202

        # If another process already won the race the rename fails (the exact
        # OSError subclass is platform dependent); our tmp_dir is then
        # redundant and the existing cache_dir is used instead.
        try:
            tmp_dir.rename(cache_dir)
        except OSError:
            if not llm_span_root.exists():
                raise
    finally:
        # Runs on success and failure alike; after a successful rename tmp_dir
        # no longer exists and rmtree is a no-op.
        tmp_tar.unlink(missing_ok=True)
        shutil.rmtree(tmp_dir, ignore_errors=True)

    if not llm_span_root.exists():
        raise FileNotFoundError(f"Expected llm_span dir not found after fetch: {llm_span_root}")

    print(f"[btx] Spec cached at {llm_span_root}")
    return llm_span_root


def pytest_configure(config: pytest.Config) -> None:
    """Resolve the spec root and record the VCR mode before collection starts.

    The spec root comes from the BTX_SPEC_ROOT environment variable when it is
    set; otherwise the pinned ref is read and fetched into the local cache.
    The resolved path is stored in the config stash and exported back to
    BTX_SPEC_ROOT.
    """
    # --- spec root ---
    override = os.environ.get("BTX_SPEC_ROOT")
    spec_root = Path(override) if override else _fetch_spec_if_needed(_read_spec_ref())

    os.environ["BTX_SPEC_ROOT"] = str(spec_root)
    config.stash[_spec_root_key] = spec_root

    # --- VCR mode ---
    # Live mode (real API calls, validation via BTQL) is selected only by
    # --disable-vcr. With --vcr-record=all real API calls are made as well,
    # but VCR stays active to record cassettes and spans are captured
    # in memory, so that case is NOT treated as "VCR off".
    disable_vcr = config.getoption("--disable-vcr", default=False, skip=True)
    config.stash[_vcr_off_key] = bool(disable_vcr)


# ---------------------------------------------------------------------------
# VCR configuration
# ---------------------------------------------------------------------------

# Response headers to drop before writing cassettes. These carry sensitive or
# ephemeral values (session cookies, org/project IDs, per-request trace IDs)
# that should never be committed to source control.
_SCRUB_RESPONSE_HEADERS = {
"set-cookie",
"openai-organization",
"openai-project",
"x-request-id",
"cf-ray",
"cf-cache-status",
"alt-svc",
}


def _scrub_response_headers(response: dict) -> dict:
"""Strip sensitive/ephemeral headers from responses before cassette write."""
response["headers"] = {
k: v for k, v in response.get("headers", {}).items() if k.lower() not in _SCRUB_RESPONSE_HEADERS
}
return response


@pytest.fixture(scope="session")
def vcr_config() -> dict:
    """Build the VCR configuration used for the whole session.

    record_mode is "none" when either the CI or GITHUB_ACTIONS environment
    variable is set to a non-empty value, and "once" otherwise.
    """
    running_in_ci = bool(os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS"))

    # Credentials and session data that must not end up in cassettes.
    secret_headers = [
        "authorization",
        "Authorization",
        "x-api-key",
        "api-key",
        "openai-organization",
        "openai-api-key",
        "x-goog-api-key",
        "x-bt-auth-token",
        "cookie",
        "Cookie",
    ]
    # Stainless SDK telemetry headers — version-specific, not part of the
    # request semantics; strip so cassettes survive SDK version bumps.
    sdk_telemetry_headers = [
        "user-agent",
        "User-Agent",
        "x-stainless-arch",
        "x-stainless-async",
        "x-stainless-lang",
        "x-stainless-os",
        "x-stainless-package-version",
        "x-stainless-runtime",
        "x-stainless-runtime-version",
        "x-stainless-read-timeout",
        "x-stainless-retry-count",
    ]

    return {
        "record_mode": "none" if running_in_ci else "once",
        "decode_compressed_response": True,
        # The request payload (model, messages, etc.) decides which recorded
        # response applies, so the body is part of the match. Volatile
        # per-version metadata lives in headers and is removed through
        # filter_headers rather than by dropping "body" from match_on.
        "match_on": ["method", "scheme", "host", "port", "path", "query", "body"],
        "filter_headers": secret_headers + sdk_telemetry_headers,
        "before_record_response": _scrub_response_headers,
    }


def _btx_cassette_path(provider: str, spec_name: str) -> str:
    """Build the cassette file path for *spec_name* under *provider*.

    Cassettes are stored alongside the provider's integration cassettes so
    they follow the same version matrix as that provider's other tests:
        integrations/<provider>/cassettes/<version>/btx/<spec_name>.yaml

    The path is meant to be absolute so that pytest-vcr bypasses
    vcr_cassette_dir and reads/writes the cassette at exactly this location.
    The cassette's parent directory is created if it does not exist yet.
    """
    from braintrust.integrations.conftest import _versioned_cassette_dir

    provider_cassettes = _BTX_DIR.parent / "integrations" / provider / "cassettes"
    versioned_root = _versioned_cassette_dir(str(provider_cassettes))
    cassette_file = Path(versioned_root) / "btx" / f"{spec_name}.yaml"
    # spec_name may itself contain sub-directories, hence parent of the file.
    cassette_file.parent.mkdir(parents=True, exist_ok=True)
    return str(cassette_file)


@pytest.fixture
def vcr_cassette_name(request: pytest.FixtureRequest) -> str:
    """Derive the cassette name/path from the current test's parametrize ID.

    IDs have the form '<provider>/<spec_name>' (e.g. 'openai/completions') and
    are mapped to the provider's integration cassette directory:
        integrations/<provider>/cassettes/<version>/btx/<spec_name>.yaml

    When the ID contains no '/', the ID itself (or the bare node name for a
    non-parametrized test) is returned unchanged.
    """
    node_name = request.node.name  # e.g. "test_btx_spec[openai/completions]"

    # Take the text between the first "[" and the closing "]".
    _, bracket, after_bracket = node_name.partition("[")
    if bracket and node_name.endswith("]"):
        spec_id = after_bracket[:-1]
    else:
        spec_id = node_name

    provider, slash, spec_name = spec_id.partition("/")
    if not slash:
        return spec_id
    return _btx_cassette_path(provider, spec_name)


# ---------------------------------------------------------------------------
# Mode-aware fixtures
# ---------------------------------------------------------------------------


@pytest.fixture(scope="session")
def btx_vcr_off(request: pytest.FixtureRequest) -> bool:
    """Whether the session runs in live mode (VCR disabled); False if unset."""
    stash = request.config.stash
    return stash.get(_vcr_off_key, False)


@pytest.fixture(scope="session")
def btx_spec_root(request: pytest.FixtureRequest) -> Path:
    """Path of the llm_span spec root that pytest_configure stored in the stash."""
    stash = request.config.stash
    return stash[_spec_root_key]


@pytest.fixture(scope="session")
def btx_project_id(btx_vcr_off: bool) -> str | None:
    """Resolve the Braintrust project ID once per session (live mode only).

    Returns None when VCR is on. In live mode the ID is taken from
    BRAINTRUST_PROJECT_ID or BRAINTRUST_DEFAULT_PROJECT_ID when either is set;
    otherwise it is looked up by project name (BRAINTRUST_PROJECT, then
    BRAINTRUST_DEFAULT_PROJECT_NAME, then "python-unit-test"). The value is
    the same for every test case, so it is resolved once per session instead
    of once per parametrized test.
    """
    if not btx_vcr_off:
        return None

    for env_var in ("BRAINTRUST_PROJECT_ID", "BRAINTRUST_DEFAULT_PROJECT_ID"):
        configured_id = os.environ.get(env_var)
        if configured_id:
            return configured_id

    # Imported lazily: only needed when the ID must be looked up by name.
    from .span_fetcher import fetch_project_id

    project_name = os.environ.get("BRAINTRUST_PROJECT")
    if not project_name:
        project_name = os.environ.get("BRAINTRUST_DEFAULT_PROJECT_NAME", "python-unit-test")
    return fetch_project_id(project_name)


@pytest.fixture
def memory_logger(btx_vcr_off):
    """Yield an in-memory background logger, or None in live mode.

    VCR on:  the test logger is initialised for _TEST_PROJECT and spans are
             captured by the in-memory background logger that is yielded.
    VCR off: None is yielded; spans go to the real Braintrust backend.
    """
    if btx_vcr_off:
        yield None
        return

    init_test_logger(_TEST_PROJECT)
    with logger._internal_with_memory_background_logger() as captured:
        yield captured
Loading