`` returns a large result list of
+ ``<span class="prdname">`` entries, each inside an anchor carrying the
+ chip's ``id``. We return the id of the row whose name matches ``name``
+ exactly (variant-safe) — this disambiguates plain SKUs that the fuzzy
+ ``cpu.php`` search would otherwise redirect to a popular sibling.
+ """
+ resp = client.get(LOOKUP, params={"cpu": search_query(name)})
+ if resp.status_code != 200:
+ return None
+ soup = BeautifulSoup(resp.text, "html.parser")
+ want = normalize_name(name)
+ for span in soup.select("span.prdname"):
+ anchor = span.find_parent("a", href=True)
+ if anchor is None:
+ continue
+ href = anchor["href"]
+ if not isinstance(href, str):
+ continue
+ m = _ID_RE.search(href)
+ if m and normalize_name(span.get_text(" ", strip=True)) == want:
+ return m.group(1)
+ return None
+
+
+def _fetch_by(client: httpx.Client, name: str, params: dict[str, str]) -> PassMarkResult | None:
+    """Fetch one page from ``BASE`` and keep it only if it is the requested chip.
+
+    Returns None when the server answers 404, when ``_extract`` finds nothing
+    to parse, or when the served heading does not match ``name``. Any other
+    error status is raised by ``raise_for_status()``.
+    """
+    response = client.get(BASE, params=params)
+    if response.status_code == 404:
+        return None
+    response.raise_for_status()
+
+    extracted = _extract(response.text)
+    if extracted is None:
+        return None
+
+    page_heading, mark, single_thread = extracted
+    if heading_matches(name, page_heading):
+        return PassMarkResult(
+            page_name=page_heading,
+            cpu_mark=mark,
+            single_thread=single_thread,
+            source_url=str(response.url),
+        )
+    return None
+
+
+def fetch_scores(
+    client: httpx.Client,
+    name: str,
+    *,
+    id_override: str | None = None,
+    auto_resolve: bool = True,
+) -> PassMarkResult | None:
+    """Fetch variant-confirmed scores for ``name``.
+
+    Order: (1) ``id_override`` if given — its result is final, there is no
+    fallback to the later steps; (2) fuzzy name search — kept only if the
+    served heading matches exactly; (3) ``auto_resolve`` via the lookup list
+    to find the exact id, then the canonical id page. Returns None when no
+    step yields an exact-variant match (caller flags for review).
+    """
+    query = search_query(name)
+    if id_override:
+        return _fetch_by(client, name, {"id": id_override, "cpu": query})
+    direct = _fetch_by(client, name, {"cpu": query})
+    if direct is not None:
+        return direct
+    if not auto_resolve:
+        return None
+    resolved = resolve_id(client, name)
+    if resolved is None:
+        return None
+    # Send the same normalized query as the two requests above; the raw
+    # ``name`` used to be passed here, unlike every other call site.
+    return _fetch_by(client, name, {"id": resolved, "cpu": query})
+
+
+def make_client(*, timeout: float = 30.0) -> httpx.Client:
+    """Build the HTTP client: ``BROWSER_HEADERS``, redirects followed.
+
+    ``timeout`` is handed to ``httpx.Client`` unchanged.
+    """
+    client = httpx.Client(
+        headers=BROWSER_HEADERS,
+        timeout=timeout,
+        follow_redirects=True,
+    )
+    return client
diff --git a/app/ingest/sources/spec2006.py b/app/ingest/sources/spec2006.py
new file mode 100644
index 0000000..3412c82
--- /dev/null
+++ b/app/ingest/sources/spec2006.py
@@ -0,0 +1,112 @@
+"""spec.org SPEC CPU2006 → specint2006 / specfp2006 (bulk result tables).
+
+SPEC publishes every CINT2006 / CFP2006 *speed* result as one giant static
+table (``cint2006.html`` / ``cfp2006.html``, ~11k rows each). Each row is a
+single system submission; the processor sits in the final parenthesised group
+of the "System Name" column (e.g. ``ACTINA SOLAR 220 X3 (Intel Xeon X5650)``,
+sometimes with a ``, 2.30 GHz`` tail), and the last two cells are the Base and
+Peak scores.
+
+Like the cgdirector source these are *bulk tables*: each page is fetched once,
+cached, and matched by exact normalized name (variant-safe — "i5-2400" never
+matches "i5-2400S"). A chip appears in many submissions with differing scores
+(different system / RAM / compiler); we keep the **maximum Base** result — the
+best published baseline configuration, deterministic and verifiable from the
+cited page. We use the *speed* metric (one copy), which is a per-CPU figure and
+does not inflate with socket/core count the way the rate metric would.
+
+SPEC CPU2006 was retired in 2018, so coverage is old desktop + server (Xeon,
+Opteron, POWER) and stops before the 2017+ generation. Never fabricates.
+"""
+
+from __future__ import annotations
+
+import re
+
+import httpx
+
+from .passmark import normalize_name
+
+CINT_URL = "https://www.spec.org/cpu2006/results/cint2006.html"
+CFP_URL = "https://www.spec.org/cpu2006/results/cfp2006.html"
+# Both metrics are reachable from this canonical results index.
+RESULTS_INDEX = "https://www.spec.org/cpu2006/results/"
+
+# Strip a trailing clock annotation inside the processor parens, e.g.
+# "Intel Xeon E5-2670 v3, 2.30 GHz" -> "Intel Xeon E5-2670 v3".
+_CLOCK_TAIL = re.compile(r",\s*[\d.]+\s*[GM]Hz\s*$", re.IGNORECASE)
+_PAREN = re.compile(r"\(([^()]*)\)")
+
+_caches: dict[str, dict[str, float]] = {}
+
+
+def _processor_from_system(system_name: str) -> str | None:
+    """Pull the CPU model out of a SPEC "System Name" cell.
+
+    The last parenthesised group is taken as the processor and a trailing
+    ", X GHz" / ", X MHz" annotation is removed. Returns None when the cell
+    has no parenthesised group or the group is empty after cleaning.
+    """
+    matches = _PAREN.findall(system_name)
+    if matches:
+        cleaned = _CLOCK_TAIL.sub("", matches[-1]).strip()
+        if cleaned:
+            return cleaned
+    return None
+
+
+def _load(client: httpx.Client, url: str) -> dict[str, float]:
+    """Return ``{normalized_processor: max_base_score}`` for a results page.
+
+    Each URL is fetched at most once per process. A non-200 response is
+    remembered as an empty table until ``reset_cache()`` is called. An
+    exception from the request or the parser propagates and caches nothing,
+    so the next call retries instead of silently reporting "no data".
+    """
+    cached = _caches.get(url)
+    if cached is not None:
+        return cached
+    # Fetch before touching the cache: registering the empty table first
+    # would leave it cached forever if this call raised.
+    resp = client.get(url)
+    table: dict[str, float] = {}
+    if resp.status_code != 200:
+        _caches[url] = table
+        return table
+    import math
+
+    # The whole document is parsed with BeautifulSoup; on the ~11k-row page
+    # this is slow and memory-hungry, but it only happens once per URL.
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(resp.text, "html.parser")
+    for tr in soup.find_all("tr"):
+        cells = [c.get_text(" ", strip=True) for c in tr.find_all("td")]
+        if len(cells) < 9:  # header / section rows have fewer / no <td> cells
+            continue
+        proc = _processor_from_system(cells[1])
+        if not proc:
+            continue
+        # NOTE(review): assumes the Base score is the 8th cell — confirm
+        # against the live table layout.
+        try:
+            base = float(cells[7])
+        except ValueError:
+            continue
+        # float() accepts "nan"/"inf"; neither is a usable score.
+        if not math.isfinite(base) or base <= 0:
+            continue
+        key = normalize_name(proc)
+        if not key:
+            continue
+        # Keep the maximum Base seen for this processor (base is always > 0).
+        if base > table.get(key, 0.0):
+            table[key] = base
+    _caches[url] = table
+    return table
+
+
+def reset_cache() -> None:
+    """Clear module caches (tests / re-runs).
+
+    After this call the next ``_load`` for any URL fetches the page again.
+    """
+    _caches.clear()
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, float], str] | None:
+    """SPEC CPU2006 resolver: ``({specint2006?, specfp2006?}, url)`` or None.
+
+    Looks ``name`` up in the CINT table, then the CFP table, and returns
+    whichever scores exist together with the results-index URL.
+    ``id_override`` is accepted but not used by this source.
+    """
+    key = normalize_name(name)
+    if not key:
+        return None
+    found: dict[str, float] = {}
+    for field, page_url in (("specint2006", CINT_URL), ("specfp2006", CFP_URL)):
+        score = _load(client, page_url).get(key)
+        if score is not None:
+            found[field] = score
+    return (found, RESULTS_INDEX) if found else None
diff --git a/app/ingest/sources/technical_city.py b/app/ingest/sources/technical_city.py
new file mode 100644
index 0000000..2668387
--- /dev/null
+++ b/app/ingest/sources/technical_city.py
@@ -0,0 +1,114 @@
+"""technical.city CPU pages → legacy Cinebench scores (R15 / R10 / R11.5).
+
+Fills the legacy Cinebench fields that PassMark's site doesn't carry. Uses
+explicit per-CPU URLs (``/en/cpu/<slug>``) — no fuzzy search — and confirms the
+page heading matches the requested chip. Matching is vendor-insensitive because
+technical.city drops the "AMD"/"Intel" prefix ("Ryzen 7 5800X: specs and
+benchmarks"). Each benchmark sits in a ``div.tab`` (``<h4>`` label) whose
+``.item`` for the page's own CPU holds the value in ``<em class="avarage">``.
+A field stays absent when the page doesn't list it (older chips have no R15).
+
+Variant-safe: a wrong slug 404s or serves a different chip, which the heading
+check rejects. Never fabricates.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .passmark import normalize_name
+
+BASE = "https://technical.city/en/cpu/{slug}"
+_VENDOR_RE = re.compile(r"^(amd|intel)\s+", re.IGNORECASE)
+_NUM_RE = re.compile(r"\d[\d,]*\.?\d*")
+
+
+@dataclass(frozen=True)
+class LegacyResult:
+    """Legacy Cinebench scores parsed from one technical.city CPU page."""
+
+    page_name: str  # page heading, i.e. the <h1> text before the first ":"
+    scores: dict[str, float]  # field name -> int|float
+    source_url: str  # final URL of the response the scores were read from
+
+
+def slug(name: str) -> str:
+    """Dataset name → technical.city URL slug (drops vendor + codename).
+
+    Parenthesised groups are removed first, then a leading "AMD"/"Intel",
+    and every remaining whitespace run becomes a single "-".
+    """
+    without_parens = re.sub(r"\s*\([^)]*\)", "", name)
+    bare = _VENDOR_RE.sub("", without_parens)
+    return re.sub(r"\s+", "-", bare.strip())
+
+
+def _key(name: str) -> str:
+    """Vendor-insensitive comparable key (technical.city omits the vendor).
+
+    Parenthesised groups and a leading "AMD"/"Intel" are stripped before
+    ``normalize_name`` is applied.
+    """
+    without_parens = re.sub(r"\s*\([^)]*\)", "", name)
+    without_vendor = _VENDOR_RE.sub("", without_parens)
+    return normalize_name(without_vendor)
+
+
+def _field_for(label: str) -> str | None:
+    """Map a benchmark section heading to a schema field, or None.
+
+    The heading must mention "single" or "multi" ("single" wins if both
+    appear) and one of the Cinebench versions 11.5, (R)10 or (R)15.
+    """
+    low = label.lower()
+    suffix = next((s for s in ("single", "multi") if s in low), None)
+    if suffix is None:
+        return None
+    # "11.5" is tested first, matching the original precedence.
+    if "11.5" in low:
+        return f"cinebench_r11_5_{suffix}"
+    if re.search(r"\br?10\b", low):
+        return f"cinebench_r10_{suffix}"
+    if re.search(r"\br?15\b", low):
+        return f"cinebench_r15_{suffix}"
+    return None
+
+
+def _value(text: str, *, decimal: bool) -> float | int | None:
+    """Parse the first number in ``text``; commas are thousands separators.
+
+    Returns a float when ``decimal`` is true, otherwise the value truncated
+    to int. Returns None when ``text`` contains no number.
+    """
+    found = _NUM_RE.search(text)
+    if found is None:
+        return None
+    number = float(found.group(0).replace(",", ""))
+    if decimal:
+        return number
+    return int(number)
+
+
+def fetch_legacy(client: httpx.Client, name: str) -> LegacyResult | None:
+    """Fetch variant-confirmed legacy Cinebench scores for ``name``.
+
+    Returns None when the page is not served with status 200, has no
+    ``<h1>``, its heading does not match ``name``, or it lists none of the
+    recognised benchmarks.
+    """
+    response = client.get(BASE.format(slug=slug(name)))
+    if response.status_code != 200:
+        return None
+    page = BeautifulSoup(response.text, "html.parser")
+    title = page.find("h1")
+    if title is None:
+        return None
+    heading = title.get_text(" ", strip=True).split(":", 1)[0].strip()
+    if _key(heading) != _key(name):
+        return None
+    # The heading gate confirms page identity; within each benchmark tab the
+    # page's own CPU is the first value row (technical.city renders it as
+    # "this CPU vs others"), so the first ``.item em.avarage`` is taken. That
+    # row's own label may be a short form ("i9-14900K") and is not re-checked.
+    collected: dict[str, float] = {}
+    for tab in page.select("div.tab"):
+        label = tab.find("h4")
+        if label is None:
+            continue
+        field = _field_for(label.get_text(" ", strip=True))
+        # First tab wins when two tabs map to the same field.
+        if field is None or field in collected:
+            continue
+        cell = tab.select_one(".item em.avarage")  # "avarage" (sic) is the selector used
+        if cell is None:
+            continue
+        parsed = _value(cell.get_text(" ", strip=True), decimal="r11_5" in field)
+        if parsed is not None:
+            collected[field] = parsed
+    if collected:
+        return LegacyResult(page_name=heading, scores=collected, source_url=str(response.url))
+    return None
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, float], str] | None:
+    """Generic resolver: ``(scores_dict, source_url)`` or None (for enrich runner).
+
+    Thin adapter over ``fetch_legacy``; ``id_override`` is accepted but unused.
+    """
+    result = fetch_legacy(client, name)
+    if not result:
+        return None
+    return (result.scores, result.source_url)
diff --git a/app/ingest/sources/topcpu.py b/app/ingest/sources/topcpu.py
new file mode 100644
index 0000000..1a001e8
--- /dev/null
+++ b/app/ingest/sources/topcpu.py
@@ -0,0 +1,165 @@
+"""topcpu.net → CPU benchmark scores + GPU Time Spy (open static ranking pages).
+
+topcpu.net publishes per-benchmark ranking pages where each row is an
+``<input data-cmp value="NAME">`` comparison checkbox with a sibling
+``span.font-bold`` score. The same parser serves every page; only the URL and
+the name-normalizer differ (CPU vs GPU).
+
+GPU: ``timespy_score`` = 3DMark Time Spy *graphics* score (GPU-only sub-score,
+e.g. RTX 4090 ≈ 36 328, not the CPU-influenced overall).
+
+CPU: fills the families our other sources leave thin/capped — Cinebench 2024
+(cgdirector charts only had ~30), PassMark (cpubenchmark's public lookup caps at
+~644), Geekbench 6 and Cinebench R23. Values are the same scale as our existing
+sources (cross-checked: 14900K CB2024 2130 vs 2211, PassMark 61 120 vs 58 335,
+GB6 22 637 vs 21 000, R23 38 497 vs 40 500 — normal cross-aggregator variance).
+
+Bulk tables: each page fetched once, cached, matched by an exact variant-safe
+normalized key (``normalize_name`` for CPUs keeps K/KF/X suffixes distinct;
+``normalize_gpu`` for GPUs keeps Ti/XT/Laptop distinct). Fill-only-nulls upstream
+means existing source-of-record values are never overwritten. Never fabricates.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Callable
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .blender import normalize_gpu
+from .passmark import normalize_name
+
+_EN = "https://www.topcpu.net/en/"
+TIMESPY_URL = _EN + "gpu-r/3dmark-time-spy"
+URL = TIMESPY_URL # back-compat: GPU Time Spy is the original single page
+CPU_INDEX_URL = _EN + "cpu-r/"
+
+# (multi_url, multi_field, single_url, single_field) per CPU benchmark family.
+_CPU_FAMILIES: list[tuple[str, str, str, str]] = [
+ (_EN + "cpu-r/cinebench-2024-multi-core", "cinebench_2024_multi",
+ _EN + "cpu-r/cinebench-2024-single-core", "cinebench_2024_single"),
+ (_EN + "cpu-r/passmark-cpu-multi-core", "passmark_cpu_mark",
+ _EN + "cpu-r/passmark-cpu-single-core", "passmark_single"),
+ (_EN + "cpu-r/geekbench-6-multi-core", "geekbench_multi",
+ _EN + "cpu-r/geekbench-6-single-core", "geekbench_single"),
+ (_EN + "cpu-r/cinebench-r23-multi-core", "cinebench_r23_multi",
+ _EN + "cpu-r/cinebench-r23-single-core", "cinebench_r23_single"),
+]
+
+# (url, field, is_float) for the extra GPU benchmark dimensions.
+_GPU_FAMILIES: list[tuple[str, str, bool]] = [
+ (_EN + "gpu-r/3dmark-time-spy-extreme", "timespy_extreme_score", False),
+ (_EN + "gpu-r/3dmark-speed-way", "speedway_score", False),
+ (_EN + "gpu-r/octanebench", "octanebench_score", False),
+ (_EN + "gpu-r/fp32-float", "fp32_tflops", True),
+]
+
+_BOLD = re.compile(r"font-bold")
+_DIGITS = re.compile(r"[^0-9]")
+_NUM = re.compile(r"[\d,]+\.?\d*")
+
+# Cached normalized score maps, keyed by (url, normalizer name).
+_caches: dict[str, dict[str, float]] = {}
+
+
+def _load_map(
+    client: httpx.Client,
+    url: str,
+    normalizer: Callable[[str], str],
+    *,
+    as_float: bool = False,
+) -> dict[str, float]:
+    """Return ``{normalized_name: score}`` for one ranking page.
+
+    Results are cached per ``(url, normalizer.__name__)``. ``as_float`` is
+    not part of the cache key, so a URL must always be loaded with the same
+    value. A non-200 response is remembered as an empty map until
+    ``reset_cache()``. An exception from the request or the parser
+    propagates and caches nothing, so the next call retries.
+
+    With ``as_float`` the first number in the score text is parsed as a
+    float (commas removed); otherwise every non-digit is stripped and the
+    remainder is read as an int.
+    """
+    ckey = f"{url}|{normalizer.__name__}"
+    cached = _caches.get(ckey)
+    if cached is not None:
+        return cached
+    # Fetch before touching the cache: registering the empty map first would
+    # leave it cached forever if this call raised.
+    resp = client.get(url)
+    table: dict[str, float] = {}
+    if resp.status_code != 200:
+        _caches[ckey] = table
+        return table
+    soup = BeautifulSoup(resp.text, "html.parser")
+    for inp in soup.select("input[data-cmp]"):
+        name = inp.get("value")
+        row = inp.parent
+        if not isinstance(name, str) or not name or row is None:
+            continue
+        bold = row.find("span", class_=_BOLD)
+        if bold is None:
+            continue
+        text = bold.get_text(strip=True)
+        value: float | None
+        if as_float:
+            m = _NUM.search(text)
+            value = float(m.group(0).replace(",", "")) if m else None
+        else:
+            digits = _DIGITS.sub("", text)
+            value = int(digits) if digits else None
+        if value is None:
+            continue
+        key = normalizer(name)
+        if key:
+            # First occurrence wins (page is sorted best-first).
+            table.setdefault(key, value)
+    _caches[ckey] = table
+    return table
+
+
+def reset_cache() -> None:
+    """Clear module caches (tests / re-runs).
+
+    After this call the next ``_load_map`` for any page fetches it again.
+    """
+    _caches.clear()
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """GPU Time Spy resolver: ``({"timespy_score": score}, url)`` or None.
+
+    ``id_override`` is accepted but not used by this source.
+    """
+    time_spy = _load_map(client, TIMESPY_URL, normalize_gpu)
+    score = time_spy.get(normalize_gpu(name))
+    if score is not None:
+        return {"timespy_score": int(score)}, TIMESPY_URL
+    return None
+
+
+def resolve_cpu(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """CPU resolver: fills any of the four families present, or None.
+
+    For each family the multi-core page is consulted before the single-core
+    page. ``id_override`` is accepted but not used by this source.
+    """
+    key = normalize_name(name)
+    if not key:
+        return None
+    found: dict[str, int] = {}
+    for multi_url, multi_field, single_url, single_field in _CPU_FAMILIES:
+        for page_url, field in ((multi_url, multi_field), (single_url, single_field)):
+            hit = _load_map(client, page_url, normalize_name).get(key)
+            if hit is not None:
+                found[field] = int(hit)
+    return (found, CPU_INDEX_URL) if found else None
+
+
+def resolve_gpu(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, float], str] | None:
+    """GPU breadth resolver: Time Spy Extreme / Speed Way / OctaneBench / FP32.
+
+    Returns ``(scores, url)`` with whichever of those fields the pages list
+    for ``name``, or None when none do. ``id_override`` is accepted but unused.
+
+    WARNING: topcpu publishes unreliable *estimated* 3DMark/Octane scores for
+    pre-DX12 cards that can't actually run them (e.g. Radeon HD 5670 "Time Spy"
+    3897 — physically impossible; contradicts its PassMark G3D). The same applies
+    to ``resolve`` (Time Spy). When enriching, GUARD on DX12 capability
+    (release year >= 2011 / GCN/Kepler+) before writing timespy*/speedway/
+    octanebench — only fp32_tflops (a spec) is era-safe. See
+    TechAPI/.claude/benchmark_fill_progress.md pt.7.
+    """
+    key = normalize_gpu(name)
+    if not key:
+        return None
+    found: dict[str, float] = {}
+    for page_url, field, as_float in _GPU_FAMILIES:
+        hit = _load_map(client, page_url, normalize_gpu, as_float=as_float).get(key)
+        if hit is None:
+            continue
+        found[field] = hit
+    if found:
+        return found, CPU_INDEX_URL.replace("cpu-r", "gpu-r")
+    return None
diff --git a/app/ingest/sources/videocardbenchmark.py b/app/ingest/sources/videocardbenchmark.py
new file mode 100644
index 0000000..a78da8f
--- /dev/null
+++ b/app/ingest/sources/videocardbenchmark.py
@@ -0,0 +1,62 @@
+"""videocardbenchmark.net → passmark_g3d_mark (PassMark G3D Mark, GPU).
+
+PassMark's GPU database is the GPU analogue of cpubenchmark.net. Its
+``gpu_list.php`` page is one big HTML table covering ~the entire history of
+discrete GPUs — modern RTX/RX down to GeForce 256, Voodoo and Matrox — so unlike
+Blender/Time Spy (which only test ~2014+ cards) it can fill the legacy GPUs.
+
+Each row is ``<tr id="gpuNNN"><td>NAME</td><td>G3D</td>…</tr>``. Bulk
+table: fetched once, cached, matched by exact ``normalize_gpu`` key (variant-safe
+— RTX 4070 ≠ 4070 Ti). ToS: per-name lookup + attribution, no bulk re-publishing
+of the chart. Never fabricates — an unlisted GPU stays null.
+"""
+
+from __future__ import annotations
+
+import re
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .blender import normalize_gpu
+
+URL = "https://www.videocardbenchmark.net/gpu_list.php"
+_DIGITS = re.compile(r"[^0-9]")
+
+_cache: dict[str, int] = {}
+
+
+def _load(client: httpx.Client) -> dict[str, int]:
+    """Return ``{normalized_gpu_name: g3d_mark}`` from the GPU list page.
+
+    A non-empty cache is returned as is. While the cache is empty (nothing
+    fetched yet, a non-200 answer, or no parsable rows) every call requests
+    the page again. The first row seen for a key wins.
+    """
+    if _cache:
+        return _cache
+    response = client.get(URL)
+    if response.status_code != 200:
+        return _cache
+    document = BeautifulSoup(response.text, "html.parser")
+    for row in document.select('tr[id^="gpu"]'):
+        columns = row.find_all("td")
+        if len(columns) < 2:
+            continue
+        gpu_name = columns[0].get_text(" ", strip=True)
+        score_digits = _DIGITS.sub("", columns[1].get_text())
+        if not (gpu_name and score_digits):
+            continue
+        key = normalize_gpu(gpu_name)
+        if key:
+            _cache.setdefault(key, int(score_digits))
+    return _cache
+
+
+def reset_cache() -> None:
+    """Clear module cache (tests / re-runs).
+
+    An empty cache makes the next ``_load`` fetch the page again.
+    """
+    _cache.clear()
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """PassMark G3D resolver: ``({"passmark_g3d_mark": score}, url)`` or None.
+
+    ``id_override`` is accepted but not used by this source.
+    """
+    score = _load(client).get(normalize_gpu(name))
+    if score is not None:
+        return {"passmark_g3d_mark": score}, URL
+    return None
diff --git a/app/models/cpu.py b/app/models/cpu.py
index 22d4683..f7b07bc 100644
--- a/app/models/cpu.py
+++ b/app/models/cpu.py
@@ -51,10 +51,33 @@ class CPU(SQLModel, table=True):
memory_support: str | None = None # "DDR5-5600"
# Benchmarks (raw, algorithm input only — ADR-006)
+ # Modern (current generation)
cinebench_r23_single: int | None = None
cinebench_r23_multi: int | None = None
+ # Cinebench 2024 — Maxon's current release (superseded R23, Redshift engine);
+ # much smaller scale (single ~100-140, multi ~hundreds-thousands).
+ cinebench_2024_single: int | None = None
+ cinebench_2024_multi: int | None = None
geekbench_single: int | None = None
geekbench_multi: int | None = None
+ # Legacy benchmark programs — added per maintainer request to score pre-R23 CPUs.
+ # Cinebench R15/R10 are integer scores; R11.5 reports small decimals (e.g. 1.52).
+ cinebench_r15_single: int | None = None
+ cinebench_r15_multi: int | None = None
+ cinebench_r11_5_single: float | None = None
+ cinebench_r11_5_multi: float | None = None
+ cinebench_r10_single: int | None = None
+ cinebench_r10_multi: int | None = None
+ # PassMark CPU Mark — single-thread rating + overall mark.
+ passmark_single: int | None = None
+ passmark_cpu_mark: int | None = None
+ # SPEC CPU2006 base *speed* scores, not the rate metric (workstation/server era).
+ specint2006: float | None = None
+ specfp2006: float | None = None
+ # Classic synthetics for 1990s–2000s parts.
+ dhrystone_mips: float | None = None
+ whetstone_mflops: float | None = None
+ superpi_1m_sec: float | None = None # SuperPI 1M time in seconds (lower is better)
# Meta
msrp_usd: int | None = None
diff --git a/app/models/gpu.py b/app/models/gpu.py
index 912a4cb..3997b14 100644
--- a/app/models/gpu.py
+++ b/app/models/gpu.py
@@ -44,6 +44,11 @@ class DiscreteGPU(SQLModel, table=True):
# Benchmarks (open licenses only)
blender_score: float | None = None
timespy_score: int | None = None
+ passmark_g3d_mark: int | None = None # PassMark G3D Mark (videocardbenchmark.net)
+ timespy_extreme_score: int | None = None # 3DMark Time Spy Extreme (4K)
+ speedway_score: int | None = None # 3DMark Speed Way (DX12 Ultimate / ray tracing)
+ octanebench_score: int | None = None # OctaneBench (OctaneRender, NVIDIA/CUDA)
+ fp32_tflops: float | None = None # Peak FP32 compute throughput
# Meta
verified: bool = False
diff --git a/integrity_check.py b/integrity_check.py
new file mode 100644
index 0000000..3da4690
--- /dev/null
+++ b/integrity_check.py
@@ -0,0 +1,137 @@
+"""One-off data-integrity scan for TechAPI CPU+GPU (structural + benchmark anomaly).
+
+Complements app/validate.py (schema) with: duplicate detection, slug/file match,
+Ryzen line vs model tier-digit consistency, single>multi sanity, era-vs-score
+outliers, and CROSS-SOURCE correlation outliers (the key wrong-variant
+contamination detector). A verified-without-source check is not implemented
+yet. Read-only; prints flagged items for human review.
+
+Usage::
+
+ python integrity_check.py [DATA_ROOT] [--strict]
+
+By default it prints every flagged item and exits 0 (human-review mode). With
+``--strict`` it additionally exits non-zero when any *hard* anomaly is found —
+unambiguous corruption that must block the weekly refresh PR: duplicate slugs
+or names, slug/filename mismatches, and physically-impossible single>multi
+benchmarks.
+The statistical cross-source/era outliers stay advisory (a heterogeneous catalog
+of server + desktop + mobile parts legitimately produces many ratio outliers), so
+they are printed for review but never fail the gate.
+"""
+from __future__ import annotations
+import os, json, math, re, statistics, sys
+
+# Em-dash etc. in section headers must not crash on legacy consoles (e.g. cp949).
+try:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[union-attr]
+except Exception:
+    # Best effort only: if reconfigure() is unavailable or fails, keep the
+    # stream's current encoding.
+    pass
+
+_argv = sys.argv[1:]
+STRICT = "--strict" in _argv
+# Every argument that does not start with "-" is positional; the first is DATA_ROOT.
+_positional = [a for a in _argv if not a.startswith("-")]
+# NOTE(review): the fallback is a machine-specific absolute Windows path, so on
+# any other machine DATA_ROOT has to be passed explicitly.
+ROOT = _positional[0] if _positional else r"C:\Users\29\Desktop\TechAPI\data"
+
+# Hard anomalies block the weekly gate under --strict; soft ones are review-only.
+HARD: list[str] = []
+def hard(msg: str) -> None:
+    """Record ``msg`` in ``HARD`` and print it."""
+    HARD.append(msg)
+    print(msg)
+
+def load(comp):
+    """Load every record JSON below ``ROOT/comp``.
+
+    Walks the directory tree recursively and returns a list of
+    ``(path, filename_without_.json, parsed_json)`` tuples. Files whose name
+    starts with "_" are skipped. A file that is not valid JSON raises.
+    """
+    recs = []
+    for dp, _, fs in os.walk(os.path.join(ROOT, comp)):
+        for fn in fs:
+            if fn.endswith(".json") and not fn.startswith("_"):
+                p = os.path.join(dp, fn)
+                # Close each file as soon as it is parsed instead of leaking
+                # one open handle per record.
+                with open(p, encoding="utf-8") as fh:
+                    recs.append((p, fn[:-5], json.load(fh)))
+    return recs
+
+def mad_outliers(pairs, lo=0.34, hi=3.0, *, k=4.0):
+    """Flag ``log(a/b)`` outliers that lie more than ``k``·MAD from the median.
+
+    ``pairs`` is a list of ``(label, a, b)``. Pairs where either value is
+    missing, zero or negative are ignored (the log ratio is undefined).
+    Fewer than 8 usable pairs yields ``[]``. Returns ``(label, a/b rounded
+    to 2 places)`` for each outlier, in input order.
+
+    ``lo`` and ``hi`` are unused and kept only so existing calls still work.
+    ``k`` defaults to 4.0, the multiplier this check has always applied.
+    """
+    rs = [(l, math.log(a / b)) for l, a, b in pairs if a and b and a > 0 and b > 0]
+    if len(rs) < 8:
+        return []
+    med = statistics.median(r for _, r in rs)
+    # A zero MAD (all ratios equal) would flag everything; use a tiny floor.
+    mad = statistics.median(abs(r - med) for _, r in rs) or 1e-9
+    return [(l, round(math.exp(r), 2)) for l, r in rs if abs(r - med) > k * mad]
+
+# Print a "### <title>" header preceded by a blank line.
+def section(t): print(f"\n### {t}")
+
+cpus = load("cpu"); gpus = load("gpu")
+print(f"loaded CPU={len(cpus)} GPU={len(gpus)}")
+
+# --- 1. duplicates + slug/file (a verified-no-source check is NOT implemented yet) ---
+section("structural")
+for comp, recs in (("cpu", cpus), ("gpu", gpus)):
+    # slug / name -> list of filenames (without ".json") that declare it
+    slugs, names = {}, {}
+    for p, fn, d in recs:
+        slugs.setdefault(d.get("slug"), []).append(fn)
+        names.setdefault(d.get("name"), []).append(fn)
+        if d.get("slug") != fn:
+            hard(f" [{comp}] slug!=file: {fn} slug={d.get('slug')}")
+    # Records with no slug / name are grouped under None and can be reported too.
+    for s, fl in slugs.items():
+        if len(fl) > 1: hard(f" [{comp}] DUP slug {s}: {fl}")
+    for n, fl in names.items():
+        if len(fl) > 1: hard(f" [{comp}] DUP name {n!r}: {fl}")
+
+# --- 2. AMD Ryzen line vs DESKTOP model tier-digit (2nd digit); APU/mobile excepted ---
+section("CPU name/tier consistency (desktop mainstream only)")
+# Tier digits missing from the map are not checked.
+TIERMAP = {"6": "5", "7": "7", "8": "7", "9": "9"}  # 2nd model digit -> expected line
+for p, fn, d in cpus:
+    n = d.get("name", "")
+    # mainstream desktop: 4-digit model, no G/U/H/HS/HX (APU/mobile) suffix
+    m = re.match(r"AMD Ryzen (\d) (\d)(\d)\d\d(X3D|X|XT)?$", n)
+    if m:
+        line, _gen, tier = m.group(1), m.group(2), m.group(3)
+        exp = TIERMAP.get(tier)
+        # Advisory only: printed, never added to HARD.
+        if exp and exp != line:
+            print(f" [tier] {n!r}: line Ryzen {line} but tier-digit {tier} → expect Ryzen {exp}")
+
+# --- 3. benchmark sanity: single>multi (consistent-scale benches) ---
+section("CPU single>multi (cinebench/geekbench — should be multi>=single)")
+for p, fn, d in cpus:
+    for s, mu in [("cinebench_r23_single","cinebench_r23_multi"),
+                  ("geekbench_single","geekbench_multi"),
+                  ("cinebench_2024_single","cinebench_2024_multi")]:
+        a, b = d.get(s), d.get(mu)
+        # Only flagged when "threads" is > 1; a missing "threads" counts as 1
+        # and is therefore never flagged.
+        if a and b and a > b and (d.get("threads") or 1) > 1:
+            hard(f" {d['name']!r}: {s}={a} > {mu}={b}")
+
+# --- 4. era vs score (catch wrong-variant: old chip w/ modern score) ---
+section("CPU era-vs-score outliers")
+for p, fn, d in cpus:
+    y = (d.get("release_date") or "")[:4]
+    # Skip records without a usable 4-digit release year. The old "0"
+    # placeholder sorted below every threshold, so each undated chip with a
+    # score was reported as "too high for era".
+    if len(y) != 4 or not y.isdigit():
+        continue
+    pm = d.get("passmark_cpu_mark"); r23 = d.get("cinebench_r23_multi")
+    # 4-digit year strings compare correctly as text.
+    if y < "2006" and pm and pm > 1500:
+        print(f" {d['name']!r} ({y}): passmark {pm} too high for era")
+    if y < "2011" and r23 and r23 > 3000:
+        print(f" {d['name']!r} ({y}): r23 {r23} too high for era")
+
+# --- 5. cross-source correlation outliers (KEY contamination detector) ---
+section("CPU cross-source ratio outliers (possible wrong-variant)")
+def collect(recs, fa, fb):
+    # (name, fa value, fb value) for every record where both fields are truthy
+    return [(d["name"], d[fa], d[fb]) for p, fn, d in recs if d.get(fa) and d.get(fb)]
+for fa, fb in [("passmark_cpu_mark","cinebench_r23_multi"),
+               ("passmark_cpu_mark","geekbench_multi"),
+               ("cinebench_r23_multi","geekbench_multi"),
+               ("cinebench_2024_multi","cinebench_r23_multi")]:
+    out = mad_outliers(collect(cpus, fa, fb))
+    # Advisory only: printed, never added to HARD.
+    for label, ratio in out:
+        print(f" [{fa}/{fb}] {label!r}: ratio={ratio}")
+
+# --- 6. GPU cross-source + sanity ---
+section("GPU cross-source ratio outliers + sanity")
+# Same ratio-outlier test as for CPUs, over GPU field pairs; advisory only.
+for fa, fb in [("passmark_g3d_mark","timespy_score"),
+               ("timespy_score","blender_score"),
+               ("fp32_tflops","timespy_score"),
+               ("passmark_g3d_mark","fp32_tflops")]:
+    for label, ratio in mad_outliers(collect(gpus, fa, fb)):
+        print(f" [{fa}/{fb}] {label!r}: ratio={ratio}")
+
+print("\n(no lines under a section = clean)")
+
+# Only --strict turns hard anomalies into a non-zero exit status.
+if STRICT and HARD:
+    print(f"\n❌ integrity gate: {len(HARD)} hard anomaly(ies) — blocking refresh.")
+    sys.exit(1)
+if STRICT:
+    print("\n✅ integrity gate: no hard anomalies.")
diff --git a/passmark_ids.json b/passmark_ids.json
new file mode 100644
index 0000000..7e45324
--- /dev/null
+++ b/passmark_ids.json
@@ -0,0 +1,8 @@
+{
+ "Intel Core i7-11700": "3947",
+ "Intel Core i9-11900": "4245",
+ "Intel Core i5-12500": "4675",
+ "Intel Core i5-12600": "4688",
+ "AMD Ryzen 7 3800X": "3499",
+ "Intel Processor N100": "5157"
+}
diff --git a/tests/unit/test_bulk_benchmark_sources.py b/tests/unit/test_bulk_benchmark_sources.py
new file mode 100644
index 0000000..38b8cf3
--- /dev/null
+++ b/tests/unit/test_bulk_benchmark_sources.py
@@ -0,0 +1,100 @@
+"""Bulk-table benchmark sources (cgdirector R23, notebookcheck R15/R23) — no network."""
+
+from __future__ import annotations
+
+from app.ingest.sources import cgdirector, notebookcheck
+
+
+class _Resp:
+    """Minimal stand-in for an HTTP response: always status 200, fixed text."""
+
+    status_code = 200
+
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+
+class _Client:
+    """Fake HTTP client that answers every ``get`` with the same HTML text."""
+
+    def __init__(self, text: str) -> None:
+        self._text = text
+
+    def get(self, url):  # noqa: ANN001
+        # The URL is ignored; no network access happens.
+        return _Resp(self._text)
+
+
+CG_HTML = """
+
+ | CPU Name | Cores | Ghz | Single Score | Multi Score |
+ | AMD Ryzen 7 5800X | 8 | 4.7 | 1593 | 11201 |
+ | Intel Core i7 14700K | 20 | 5.6 | 2228 | 33572 |
+
+"""
+
+
+def test_cgdirector_parses_and_matches_exact() -> None:
+    """R23 rows resolve by name; a chip missing from the table gives None."""
+    cgdirector.reset_cache()
+    client = _Client(CG_HTML)
+    assert cgdirector.resolve(client, "AMD Ryzen 7 5800X") == (
+        {"cinebench_r23_single": 1593, "cinebench_r23_multi": 11201},
+        cgdirector.R23_URL,
+    )
+    # dash vs space in source name still matches
+    out = cgdirector.resolve(client, "Intel Core i7-14700K")
+    assert out and out[0]["cinebench_r23_multi"] == 33572
+    # absent chip
+    assert cgdirector.resolve(client, "AMD Ryzen 5 9999X") is None
+
+
+CB2024_HTML = """
+
+ | CPU Name | Single Score | Multi Score |
+ | AMD Ryzen 7 5800X | 98 | 861 |
+ | Intel Core i9 14900K | 139 | 2211 |
+
+"""
+
+
+def test_cgdirector_cinebench_2024() -> None:
+    """``resolve_2024`` returns both Cinebench 2024 fields and the CB2024 URL."""
+    cgdirector.reset_cache()
+    out = cgdirector.resolve_2024(_Client(CB2024_HTML), "AMD Ryzen 7 5800X")
+    assert out == (
+        {"cinebench_2024_single": 98, "cinebench_2024_multi": 861},
+        cgdirector.CB2024_URL,
+    )
+
+
+NBC_HTML = """
+
+ | Model | Cores / Threads |
+ Cinebench R15 CPU Single 64Bit | Cinebench R15 CPU Multi 64Bit |
+ Cinebench R23 Single Core | Cinebench R23 Multi Core |
+ Geekbench 6.6 Multi-Core |
+ | AMD Ryzen 7 5800X | 8/16 |
+ 265.5 n2 | 2608.5 n2 | 1574.5 n2 | 15476 n2 | 10035 |
+ | Intel Core i7-1165G7 | 4/8 |
+ 218 n5 | 850 n5 | 1458 n5 | 5216 n5 | 5000 |
+
+"""
+
+
+def test_notebookcheck_extracts_r15_and_r23_only() -> None:
+    """``resolve`` yields the R15/R23 columns as ints and ignores Geekbench."""
+    notebookcheck.reset_cache()
+    client = _Client(NBC_HTML)
+    out = notebookcheck.resolve(client, "AMD Ryzen 7 5800X")
+    assert out is not None
+    scores, url = out
+    assert url == notebookcheck.URL
+    # R15 + R23 captured (rounded ints); Geekbench column NOT taken.
+    # Expected values: 265.5 -> 266, 2608.5 -> 2608, 1574.5 -> 1574.
+    assert scores == {
+        "cinebench_r15_single": 266,
+        "cinebench_r15_multi": 2608,
+        "cinebench_r23_single": 1574,
+        "cinebench_r23_multi": 15476,
+    }
+    assert notebookcheck.resolve(client, "Intel Core i7-1165G7")[0]["cinebench_r23_multi"] == 5216
+    assert notebookcheck.resolve(client, "Nonexistent CPU 1") is None
+
+
+def test_notebookcheck_geekbench_is_gb6_only() -> None:
+    """``resolve_geekbench`` returns only the GB6 multi-core value from the table."""
+    notebookcheck.reset_cache()
+    out = notebookcheck.resolve_geekbench(_Client(NBC_HTML), "AMD Ryzen 7 5800X")
+    # NBC_HTML carries only a GB6 multi column → GB5.x must never leak in.
+    assert out is not None and out[0] == {"geekbench_multi": 10035}
diff --git a/tests/unit/test_gpu_sources.py b/tests/unit/test_gpu_sources.py
new file mode 100644
index 0000000..fc49f2a
--- /dev/null
+++ b/tests/unit/test_gpu_sources.py
@@ -0,0 +1,240 @@
+"""GPU benchmark sources — Blender (opendata) + Time Spy (topcpu). No network."""
+
+from __future__ import annotations
+
+import io
+import json
+import zipfile
+
+from app.ingest.sources import blender, topcpu, videocardbenchmark
+
+# --- shared GPU name normalization (variant safety) ---------------------------
+
+
+def test_normalize_gpu_matching_and_variants() -> None:
+    """normalize_gpu merges spelling noise but keeps distinct SKUs apart."""
+    norm = blender.normalize_gpu
+    equivalent = [
+        # Vendor-prefixed source name collapses onto our vendorless dataset name.
+        ("GeForce RTX 4070", "NVIDIA GeForce RTX 4070"),
+        ("Radeon RX 7900 XTX", "AMD Radeon RX 7900 XTX"),
+        ("Arc A770", "Intel Arc A770 Graphics"),
+        # Memory-size and OpenGL tails are dropped.
+        ("Radeon RX 580 8GB", "AMD Radeon RX 580"),
+        ("GeForce RTX 3070/PCIe/SSE2", "GeForce RTX 3070"),
+    ]
+    for left, right in equivalent:
+        assert norm(left) == norm(right)
+    # Variants stay distinct.
+    distinct = [
+        ("GeForce RTX 4070", "GeForce RTX 4070 Ti"),
+        ("GeForce RTX 4070 Ti", "GeForce RTX 4070 Ti Super"),
+        ("Radeon RX 7900 XT", "Radeon RX 7900 XTX"),
+    ]
+    for left, right in distinct:
+        assert norm(left) != norm(right)
+
+
+# --- Blender (opendata snapshot) ----------------------------------------------
+
+
+class _Resp:
+    """Response double: fixed 200 status plus the raw bytes it was given."""
+
+    status_code: int = 200
+
+    def __init__(self, content: bytes) -> None:
+        self.content = content
+
+
+class _ZipClient:
+    """Client double whose ``get`` answers every URL with the same bytes."""
+
+    def __init__(self, content: bytes) -> None:
+        self._content = content
+
+    def get(self, url):  # noqa: ANN001
+        payload = self._content
+        return _Resp(payload)
+
+
+def _submission(device: str, version: str, spms: list[float]) -> dict:
+    """Build one submission dict with an entry per samples-per-minute value.
+
+    Entry ``i`` is labelled with the ``i``-th of the three scene names, so more
+    than three values raises ``IndexError``. Every entry is tagged as OPTIX.
+    """
+    scene_labels = ("monster", "junkshop", "classroom")
+    entries = []
+    for idx, rate in enumerate(spms):
+        entries.append(
+            {
+                "blender_version": {"version": version},
+                "device_info": {
+                    "device_type": "OPTIX",
+                    "compute_devices": [{"name": device, "type": "OPTIX"}],
+                },
+                "scene": {"label": scene_labels[idx]},
+                "stats": {"samples_per_minute": rate},
+            }
+        )
+    return {"data": entries}
+
+
+def _zip_of(lines: list[dict]) -> bytes:
+    """Return zip bytes holding a licence stub and ``lines`` as JSON Lines."""
+    jsonl = "\n".join(json.dumps(entry) for entry in lines)
+    archive = io.BytesIO()
+    with zipfile.ZipFile(archive, "w") as bundle:
+        bundle.writestr("LICENSE.txt", "CC0")
+        bundle.writestr("opendata-test.jsonl", jsonl)
+    return archive.getvalue()
+
+
+def test_blender_median_of_scene_sums_pinned_version() -> None:
+    """The Blender score is the median of per-run scene sums, 3.6 runs excluded."""
+    blender.reset_cache()
+    device = "NVIDIA GeForce RTX 4080 SUPER"
+    # Two 4.5 runs → sums 9000 and 8000 → median 8500.
+    current_runs = [
+        _submission(device, "4.5.0", [4500, 2300, 2200]),  # sum 9000
+        _submission(device, "4.5.1", [4000, 2000, 2000]),  # sum 8000
+    ]
+    # A 3.6 run must be ignored (version pin).
+    stale_run = _submission(device, "3.6.0", [999, 999, 999])
+    # NOTE: every row built by _submission is OPTIX, so the device-type filter
+    # is not exercised by this fixture.
+    lines = [*current_runs, stale_run]
+    found = blender.resolve(_ZipClient(_zip_of(lines)), "GeForce RTX 4080 Super")
+    assert found is not None
+    scores, url = found
+    assert scores == {"blender_score": 8500.0}
+    assert url == blender.SNAPSHOT_URL
+    # Unknown GPU → None.
+    missing = blender.resolve(_ZipClient(_zip_of(lines)), "GeForce RTX 9999")
+    assert missing is None
+
+
+# --- Time Spy (topcpu ranking) ------------------------------------------------
+
+
+class _HtmlResp:
+    """Response double: fixed 200 status plus the page text it was given."""
+
+    status_code: int = 200
+
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+
+class _HtmlClient:
+    """Client double whose ``get`` answers every URL with the same page text."""
+
+    def __init__(self, text: str) -> None:
+        self._text = text
+
+    def get(self, url):  # noqa: ANN001
+        page = self._text
+        return _HtmlResp(page)
+
+
+# Canned topcpu ranking page served by _HtmlClient in the Time Spy test below.
+# NOTE(review): the literal is empty in this copy of the patch, yet that test
+# expects RTX 4090 → 36328 and RTX 4070 Ti → 22000; the row markup was
+# presumably lost in extraction — confirm against the committed file.
+TOPCPU_HTML = """
+
+
+"""
+
+
+def test_topcpu_parses_score_from_sibling_and_variant_safe() -> None:
+    """topcpu.resolve returns the Time Spy score for exact name matches only."""
+    topcpu.reset_cache()
+    client = _HtmlClient(TOPCPU_HTML)
+    assert topcpu.resolve(client, "GeForce RTX 4090") == (
+        {"timespy_score": 36328},
+        topcpu.URL,
+    )
+    # Variant safety: plain 4070 absent here → None (only 4070 Ti present).
+    assert topcpu.resolve(client, "GeForce RTX 4070") is None
+    # resolve() may return None, so narrow before indexing: a missing row then
+    # fails on a clear assertion instead of "NoneType is not subscriptable".
+    ti = topcpu.resolve(client, "GeForce RTX 4070 Ti")
+    assert ti is not None
+    assert ti[0]["timespy_score"] == 22000
+
+
+def _cpu_row(name: str, score: str) -> str:
+ """Return one ranking-row fragment used to build the CPU page fixtures."""
+ # Real topcpu rows carry the full vendor-prefixed name in the input value.
+ # NOTE(review): the f-string below is empty in this copy of the patch and
+ # uses neither parameter; the row markup was presumably stripped in
+ # extraction — confirm against the committed file.
+ return (
+ f''
+ )
+
+
+class _RoutingClient:
+    """Client double that picks a page by URL substring.
+
+    ``routes`` maps a URL fragment to page text. The first fragment (in mapping
+    order) contained in the requested URL wins; an unmatched URL gets an empty
+    page.
+    """
+
+    def __init__(self, routes: dict[str, str]) -> None:
+        self._routes = routes
+
+    def get(self, url):  # noqa: ANN001
+        body = next(
+            (page for fragment, page in self._routes.items() if fragment in url),
+            "",
+        )
+        return _HtmlResp(body)
+
+
+def test_topcpu_cpu_combines_multi_and_single_families() -> None:
+    """resolve_cpu merges the per-page scores of one CPU into a single dict."""
+    topcpu.reset_cache()
+    cpu = "Intel Core i9-14900K"
+    # URL fragment → score shown for the CPU on that ranking page.
+    page_scores = {
+        "cinebench-2024-multi-core": "2130",
+        "cinebench-2024-single-core": "139",
+        "passmark-cpu-multi-core": "61120",
+        "passmark-cpu-single-core": "4770",
+    }
+    routes = {
+        fragment: "" + _cpu_row(cpu, score) + " "
+        for fragment, score in page_scores.items()
+    }
+    client = _RoutingClient(routes)
+    result = topcpu.resolve_cpu(client, "Intel Core i9-14900K")
+    assert result is not None
+    scores, url = result
+    assert scores == {
+        "cinebench_2024_multi": 2130,
+        "cinebench_2024_single": 139,
+        "passmark_cpu_mark": 61120,
+        "passmark_single": 4770,
+    }
+    assert url == topcpu.CPU_INDEX_URL
+    # A CPU absent from every page → None.
+    assert topcpu.resolve_cpu(client, "AMD Ryzen 5 9999X") is None
+
+
+# --- PassMark GPU (videocardbenchmark) ----------------------------------------
+
+# Canned videocardbenchmark page served by _HtmlClient in the test below.
+# NOTE(review): the literal is empty in this copy of the patch, yet that test
+# expects scores for RTX 4090, GeForce 256 and RTX 3070 Ti; the markup was
+# presumably lost in extraction — confirm against the committed file.
+VCB_HTML = """
+
+"""
+
+
+def test_videocardbenchmark_parses_g3d_and_variant_safe() -> None:
+    """videocardbenchmark.resolve parses G3D marks for exact name matches only."""
+    videocardbenchmark.reset_cache()
+    client = _HtmlClient(VCB_HTML)
+    # Comma-formatted score parsed; legacy card covered.
+    assert videocardbenchmark.resolve(client, "GeForce RTX 4090") == (
+        {"passmark_g3d_mark": 38073},
+        videocardbenchmark.URL,
+    )
+    # Narrow the Optional before indexing, matching how the 3070 Ti lookup
+    # below is handled.
+    legacy = videocardbenchmark.resolve(client, "GeForce 256")
+    assert legacy is not None
+    assert legacy[0]["passmark_g3d_mark"] == 5
+    # Variant safety: plain 3070 absent (only 3070 Ti present) → None.
+    assert videocardbenchmark.resolve(client, "GeForce RTX 3070") is None
+    ti = videocardbenchmark.resolve(client, "GeForce RTX 3070 Ti")
+    assert ti is not None
+    assert ti[0]["passmark_g3d_mark"] == 23223
+
+
+def _gpu_row(name: str, score: str) -> str:
+ """Return one ranking-row fragment used to build the GPU page fixtures."""
+ # NOTE(review): the f-string below is empty in this copy of the patch and
+ # uses neither parameter; the row markup was presumably stripped in
+ # extraction — confirm against the committed file.
+ return (
+ f''
+ )
+
+
+def test_topcpu_gpu_breadth_int_and_float() -> None:
+    """resolve_gpu gathers the extra GPU metrics, keeping FP32 TFLOPS as a float."""
+    topcpu.reset_cache()
+    gpu = "GeForce RTX 4090"
+    # URL fragment → score shown for the GPU on that ranking page.
+    page_scores = {
+        "3dmark-time-spy-extreme": "19460",
+        "3dmark-speed-way": "10074",
+        "octanebench": "1274",
+        "fp32-float": "82.58",  # float metric
+    }
+    routes = {
+        fragment: "" + _gpu_row(gpu, score) + " "
+        for fragment, score in page_scores.items()
+    }
+    result = topcpu.resolve_gpu(_RoutingClient(routes), "GeForce RTX 4090")
+    assert result is not None
+    scores, url = result
+    assert scores == {
+        "timespy_extreme_score": 19460,
+        "speedway_score": 10074,
+        "octanebench_score": 1274,
+        "fp32_tflops": 82.58,  # parsed as float, not 8258
+    }
+    assert "gpu-r" in url
+    assert topcpu.resolve_gpu(_RoutingClient(routes), "Radeon RX 9999") is None
diff --git a/tests/unit/test_passmark_enrich.py b/tests/unit/test_passmark_enrich.py
new file mode 100644
index 0000000..014f64a
--- /dev/null
+++ b/tests/unit/test_passmark_enrich.py
@@ -0,0 +1,135 @@
+"""PassMark scraper variant-safety + enrichment unit tests (no network)."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from app.ingest import enrich as enrich_mod
+from app.ingest.sources import passmark
+from app.ingest.sources.passmark import (
+ PassMarkResult,
+ _extract,
+ heading_matches,
+ normalize_name,
+)
+
+
+def test_normalize_strips_clock_and_graphics_tails() -> None:
+    """Clock, iGPU and codename tails normalize to the same key as the bare name."""
+    decorated_to_plain = [
+        ("AMD Ryzen 7 5800X @ 3.80GHz", "AMD Ryzen 7 5800X"),
+        ("AMD Ryzen 5 4600G with Radeon Graphics", "AMD Ryzen 5 4600G"),
+        ("Intel Celeron G5905 (Comet Lake)", "Intel Celeron G5905"),
+    ]
+    for decorated, plain in decorated_to_plain:
+        assert normalize_name(decorated) == normalize_name(plain)
+
+
+def test_variants_stay_distinct() -> None:
+    """heading_matches rejects sibling SKUs but accepts a clock-suffixed exact name."""
+    siblings = [
+        ("AMD Ryzen 7 5800X", "AMD Ryzen 7 5800X3D"),
+        ("Intel Core i9-14900K", "Intel Core i9-14900KS"),
+        ("Intel Core i5-12400", "Intel Core i5-12400F"),
+        ("AMD Ryzen 9 5900X", "AMD Ryzen 9 5900XT"),
+    ]
+    # The whole point: fuzzy siblings must NOT compare equal.
+    for wanted, served in siblings:
+        assert not heading_matches(wanted, served)
+    # ...but a clock-suffixed exact match must.
+    assert heading_matches("Intel Core i9-13900K", "Intel Core i9-13900K @ 3.00GHz")
+
+
+def test_extract_reads_labels() -> None:
+ """_extract yields (heading, multithread rating, single-thread rating)."""
+ # NOTE(review): the page markup looks stripped in this copy of the patch;
+ # only the heading and the two labelled ratings remain visible.
+ html = """
+
+ AMD Ryzen 7 5800X @ 3.80GHz
+ Multithread Rating: 27,684
+ Single Thread Rating: 3,448
+
+ """
+ parsed = _extract(html)
+ assert parsed is not None
+ heading, mark, single = parsed
+ assert heading.startswith("AMD Ryzen 7 5800X")
+ # Thousands separators must be dropped when the ratings become ints.
+ assert (mark, single) == (27684, 3448)
+
+
+class _FakeResp:
+    """Response double exposing only ``text`` and ``status_code`` (default 200)."""
+
+    def __init__(self, text: str, status_code: int = 200) -> None:
+        self.status_code = status_code
+        self.text = text
+
+
+class _FakeClient:
+    """Client double: every ``get`` returns the same canned page.
+
+    Used to feed a lookup-results page to ``resolve_id``; the URL and query
+    parameters are accepted and ignored.
+    """
+
+    def __init__(self, text: str) -> None:
+        self._text = text
+
+    def get(self, url, params=None):  # noqa: ANN001
+        canned = self._text
+        return _FakeResp(canned)
+
+
+def test_resolve_id_picks_exact_variant() -> None:
+ """resolve_id returns the id of the exactly matching row, or None if absent."""
+ # Lookup list with several i5-2500 siblings; only the plain one must win.
+ # NOTE(review): the anchors carrying the ids asserted below (803, 804) are
+ # not visible in this copy of the patch — markup presumably stripped;
+ # confirm the literal against the committed file.
+ html = """
+
+ Intel Core i5-2500K @ 3.30GHz
+
+ Intel Core i5-2500 @ 3.30GHz
+
+ Intel Core i5-2500S @ 2.70GHz
+ """
+ assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-2500") == "803"
+ assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-2500K") == "804"
+ assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-9999") is None
+
+
+def test_enrich_fills_only_exact_match_nulls(tmp_path: Path, monkeypatch) -> None:
+    """enrich() writes fetched PassMark scores and the source URL into a null record."""
+    cpu_dir = tmp_path / "cpu" / "amd" / "2020" / "consumer"
+    cpu_dir.mkdir(parents=True)
+    rec = {
+        "slug": "ryzen-7-5800x",
+        "name": "AMD Ryzen 7 5800X",
+        "passmark_single": None,
+        "passmark_cpu_mark": None,
+        "source_urls": ["https://amd.com/x"],
+    }
+    path = cpu_dir / "ryzen-7-5800x.json"
+    path.write_text(json.dumps(rec), encoding="utf-8")
+
+    # Mirror the real fetch_scores keyword-only parameters (id_override and
+    # auto_resolve) so the stub accepts whichever ones the caller passes.
+    def fake_fetch(client, name, *, id_override=None, auto_resolve=True):  # noqa: ANN001
+        return PassMarkResult("AMD Ryzen 7 5800X", 27684, 3448, "https://cpubenchmark.net/x")
+
+    monkeypatch.setattr(enrich_mod, "fetch_scores", fake_fetch)
+    # Client factories are stubbed in both modules so no real client is built.
+    monkeypatch.setattr(passmark, "make_client", lambda **k: None)
+    monkeypatch.setattr(enrich_mod, "make_client", lambda **k: None)
+
+    result = enrich_mod.enrich(data_root=tmp_path, sleep=0)
+
+    assert len(result.filled) == 1
+    written = json.loads(path.read_text(encoding="utf-8"))
+    assert written["passmark_single"] == 3448
+    assert written["passmark_cpu_mark"] == 27684
+    assert "https://cpubenchmark.net/x" in written["source_urls"]
+
+
+def test_enrich_reports_unresolved_on_mismatch(tmp_path: Path, monkeypatch) -> None:
+    """When no scores come back, the record is reported unresolved and left as is."""
+    record = {
+        "slug": "core-i5-12400",
+        "name": "Intel Core i5-12400",
+        "passmark_single": None,
+        "passmark_cpu_mark": None,
+        "source_urls": [],
+    }
+    record_path = tmp_path / "cpu" / "intel" / "2024" / "consumer" / "core-i5-12400.json"
+    record_path.parent.mkdir(parents=True)
+    record_path.write_text(json.dumps(record), encoding="utf-8")
+    # Simulate fuzzy mismatch → client returns None.
+    monkeypatch.setattr(enrich_mod, "fetch_scores", lambda *a, **k: None)
+    monkeypatch.setattr(enrich_mod, "make_client", lambda **k: None)
+
+    outcome = enrich_mod.enrich(data_root=tmp_path, sleep=0)
+
+    assert outcome.filled == []
+    assert "Intel Core i5-12400" in outcome.unresolved
+    reread = json.loads(record_path.read_text(encoding="utf-8"))
+    assert reread["passmark_cpu_mark"] is None  # untouched
diff --git a/tests/unit/test_spec2006.py b/tests/unit/test_spec2006.py
new file mode 100644
index 0000000..4781560
--- /dev/null
+++ b/tests/unit/test_spec2006.py
@@ -0,0 +1,77 @@
+"""SPEC CPU2006 bulk-table source (specint2006 / specfp2006) — no network."""
+
+from __future__ import annotations
+
+from app.ingest.sources import spec2006
+
+
+class _Resp:
+    """Response double: fixed 200 status plus the page text it was given."""
+
+    status_code: int = 200
+
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+
+class _Client:
+    """Client double: URLs containing "cint" get the CINT page, all others the CFP page."""
+
+    def __init__(self, cint: str, cfp: str) -> None:
+        self._cint = cint
+        self._cfp = cfp
+
+    def get(self, url):  # noqa: ANN001
+        if "cint" in url:
+            page = self._cint
+        else:
+            page = self._cfp
+        return _Resp(page)
+
+
+def _row(system: str, base: str, peak: str = "0") -> str:
+ """Return one results-table row for ``system`` with the given base/peak scores."""
+ # 9 cells: sponsor, system(+links), autopar, cores, chips, c/chip, t/core, base, peak.
+ # NOTE(review): the cell markup looks stripped in this copy of the patch
+ # (only " | " separators remain) — confirm against the committed file.
+ return (
+ f" | | Sponsor | {system} HTML | CSV | Yes | "
+ f"4 | 1 | 4 | 1 | {base} | {peak} | "
+ )
+
+
+# Integer-results page fixture: a header followed by four _row() entries —
+# two i5-2500K submissions, the non-K i5-2500, and an Opteron 6276.
+# NOTE(review): the wrapping table markup looks stripped in this copy of the
+# patch — confirm against the committed file.
+CINT = (
+ ""
+ "| Test Sponsor | System Name | "
+ # i5-2500K appears twice — keep the MAX base (47.4, not 40.0).
+ + _row("Box A (Intel Core i5-2500K, 3.30 GHz)", "40.0")
+ + _row("Box B (Intel Core i5-2500K)", "47.4", "56.4")
+ # non-K sibling must stay distinct from the K SKU.
+ + _row("Box C (Intel Core i5-2500)", "42.7")
+ + _row("Server (AMD Opteron 6276)", "20.5")
+ + " "
+)
+
+# Floating-point results page fixture: a header and a single i5-2500K row, so
+# only that chip has fp data.
+# NOTE(review): the wrapping table markup looks stripped in this copy of the
+# patch — confirm against the committed file.
+CFP = (
+ ""
+ "| Test Sponsor | System Name | "
+ + _row("Box B (Intel Core i5-2500K)", "56.4")
+ + " "
+)
+
+
+def test_max_base_and_variant_safety() -> None:
+    """spec2006.resolve keeps the best base score per chip and never mixes sibling SKUs."""
+    spec2006.reset_cache()
+    client = _Client(CINT, CFP)
+    # Keeps the maximum base across submissions (Box B's 47.4 beats the
+    # clock-suffixed Box A's 40.0); pulls fp from the other page.
+    assert spec2006.resolve(client, "Intel Core i5-2500K") == (
+        {"specint2006": 47.4, "specfp2006": 56.4},
+        spec2006.RESULTS_INDEX,
+    )
+    # Non-K sibling resolves to its own row only (no fp data → int only).
+    assert spec2006.resolve(client, "Intel Core i5-2500") == (
+        {"specint2006": 42.7},
+        spec2006.RESULTS_INDEX,
+    )
+    # A chip with a single int submission resolves to that row's base score.
+    # Narrow the Optional before indexing so a missing row fails clearly.
+    opteron = spec2006.resolve(client, "AMD Opteron 6276")
+    assert opteron is not None
+    assert opteron[0] == {"specint2006": 20.5}
+    # Absent chip.
+    assert spec2006.resolve(client, "AMD Ryzen 9 9999X") is None
+
+
+def test_processor_extraction() -> None:
+    """_processor_from_system pulls the CPU name out of the parenthesised part."""
+    extract = spec2006._processor_from_system
+    expected_by_cell = {
+        "ACTINA 220 (Intel Xeon X5650) HTML | CSV": "Intel Xeon X5650",
+        "Box (Intel Xeon E5-2670 v3, 2.30 GHz) Config": "Intel Xeon E5-2670 v3",
+    }
+    for cell, processor in expected_by_cell.items():
+        assert extract(cell) == processor
+    assert extract("No parens here") is None
diff --git a/tests/unit/test_technical_city.py b/tests/unit/test_technical_city.py
new file mode 100644
index 0000000..fb469fc
--- /dev/null
+++ b/tests/unit/test_technical_city.py
@@ -0,0 +1,67 @@
+"""technical.city legacy-Cinebench source unit tests (no network)."""
+
+from __future__ import annotations
+
+from app.ingest.sources import technical_city as tc
+from app.ingest.sources.technical_city import _field_for, _value, slug
+
+
+def test_slug_drops_vendor_and_codename() -> None:
+    """slug() drops the vendor word and any codename, joining the rest with hyphens."""
+    expected_by_name = {
+        "AMD Ryzen 7 5800X": "Ryzen-7-5800X",
+        "Intel Core i9-14900K": "Core-i9-14900K",
+        "Intel Core i7-2600K (Sandy Bridge)": "Core-i7-2600K",
+        "Intel Core 2 Duo E8400": "Core-2-Duo-E8400",
+    }
+    for name, expected in expected_by_name.items():
+        assert slug(name) == expected
+
+
+def test_field_for_maps_versions() -> None:
+    """_field_for maps Cinebench labels to dataset fields and rejects other labels."""
+    mapped = {
+        "Cinebench 15 64-bit single-core": "cinebench_r15_single",
+        "Cinebench 15 64-bit multi-core": "cinebench_r15_multi",
+        "Cinebench R10 32-bit single-core": "cinebench_r10_single",
+        "Cinebench 11.5 64-bit multi-core": "cinebench_r11_5_multi",
+    }
+    for label, field in mapped.items():
+        assert _field_for(label) == field
+    # Not cinebench fields.
+    for label in ("Passmark", "GeekBench 5 Single-Core"):
+        assert _field_for(label) is None
+
+
+def test_value_parses_int_and_decimal_and_ignores_trailing() -> None:
+    """_value parses comma-grouped ints and decimals, ignoring trailing text."""
+    cases = [
+        ("2,609", False, 2609),
+        ("27684Samples: 24208", False, 27684),  # trailing noise ignored
+        ("3.09", True, 3.09),
+    ]
+    for raw, as_decimal, expected in cases:
+        assert _value(raw, decimal=as_decimal) == expected
+
+
+def test_fetch_legacy_parses_and_gates_on_heading() -> None:
+ """fetch_legacy returns legacy Cinebench scores only when the page heading matches."""
+ # NOTE(review): the score values asserted below (266, 2609, 3.09) are not
+ # visible in this literal — markup presumably stripped in this copy of the
+ # patch; confirm against the committed file.
+ html = """
+ Ryzen 7 5800X: specs and benchmarks
+ Cinebench 15 64-bit single-core
+
+ Cinebench 15 64-bit multi-core
+
+ Cinebench 11.5 64-bit single-core
+
+ """
+
+ # Class attributes stand in for the response fields; ``text`` is the page above.
+ class _Resp:
+ status_code = 200
+ text = html
+ url = "https://technical.city/en/cpu/Ryzen-7-5800X"
+
+ # Every GET returns the same canned response, whatever URL is requested.
+ class _Client:
+ def get(self, url): # noqa: ANN001
+ return _Resp()
+
+ # vendor-insensitive match: dataset name carries "AMD", page heading doesn't.
+ r = tc.fetch_legacy(_Client(), "AMD Ryzen 7 5800X")
+ assert r is not None
+ assert r.scores == {
+ "cinebench_r15_single": 266,
+ "cinebench_r15_multi": 2609,
+ "cinebench_r11_5_single": 3.09,
+ }
+
+ # Wrong chip on the page → rejected (variant-safety).
+ assert tc.fetch_legacy(_Client(), "AMD Ryzen 9 5950X") is None
|