Skip to content

Commit 7059e02

Browse files
committed
feat(ingest): rowspan-aware table grid + Wikipedia GPU source
Two quality wins for the weekly crawler:

1. Generalize Wikipedia table parsing into ``app/ingest/sources/wikitable.py``. The new grid materialiser respects ``rowspan`` and ``colspan``, so a table-wide ``Architecture`` cell shared across a generation is now correctly inherited by every SKU row underneath. Previously those rows fell back to the page-level architecture string ("Intel Core") and curators had to fix them by hand.

2. Add a GPU ingest source (``wikipedia_gpu.py``) covering the List_of_<vendor>_graphics_processing_units family for NVIDIA, AMD, and Intel.

New normalize helpers:
- parse_frequency_mhz (GPU base/boost clocks)
- parse_memory_gb (handles MB/GB units)
- parse_memory_bus_bit ("384-bit" → 384)
- parse_pcie_version ("PCIe 4.0 x16" → "4.0", "PCI-e Gen 5" → "5")
- guess_gpu_segment (consumer vs enterprise via name tokens)

GPU candidates require nine fields per the schema; most list-page rows miss a few, so they surface as drafts and stay out of PRs unless the user opts in with ``--include-drafts``.

Wire ``gpu`` into ``app.ingest`` CLI's ``--category`` choices.

Tests (+23) — grid rowspan/colspan handling, GPU row extraction with inherited architecture, GPU-specific value parsers.
1 parent 6ed2150 commit 7059e02

8 files changed

Lines changed: 605 additions & 74 deletions

File tree

app/ingest/__main__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,11 @@
2121
from .pipeline import run
2222
from .sources.base import IngestCandidate, IngestSource
2323
from .sources.wikipedia_cpu import WikipediaCpuIngest
24+
from .sources.wikipedia_gpu import WikipediaGpuIngest
2425

2526
SOURCES_BY_CATEGORY: dict[str, list[IngestSource]] = {
2627
"cpu": [WikipediaCpuIngest()],
28+
"gpu": [WikipediaGpuIngest()],
2729
}
2830

2931

app/ingest/normalize.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,12 @@
1212
from datetime import date
1313

1414
_FREQ_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(GHz|MHz)\b", re.IGNORECASE)
15-
_INT_RE = re.compile(r"(\d{1,4})")
15+
_FREQ_MHZ_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(MHz|GHz)\b", re.IGNORECASE)
16+
_INT_RE = re.compile(r"(\d{1,5})")
1617
_CACHE_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(MB|KB|GB)\b", re.IGNORECASE)
18+
_MEMORY_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(GB|MB)\b", re.IGNORECASE)
19+
_BUS_RE = re.compile(r"(\d{2,4})\s*-?\s*bit\b", re.IGNORECASE)
20+
_PCIE_RE = re.compile(r"PCI[-\s]?[Ee]?\s*(?:Gen\s*)?(\d(?:\.\d)?)", re.IGNORECASE)
1721
_TDP_RE = re.compile(r"(\d{1,4})(?:\s*/\s*\d{1,4})?\s*W\b", re.IGNORECASE)
1822

1923
_ISO_DATE_RE = re.compile(r"^(\d{4})-(\d{2})-(\d{2})$")
@@ -140,6 +144,59 @@ def _safe_date(year: int, month: int, day: int) -> date | None:
140144
return None
141145

142146

147+
def parse_frequency_mhz(text: str) -> int | None:
    """Parse a clock frequency and return it in whole megahertz.

    ``"1500 MHz"`` → ``1500``; ``"2.5 GHz"`` → ``2500``;
    ``"2.01 GHz"`` → ``2010``.

    Args:
        text: Free-form cell text. The first ``<number> MHz|GHz`` occurrence
            (case-insensitive) is used.

    Returns:
        The frequency in MHz, or ``None`` when *text* is empty or contains
        no recognisable frequency.
    """
    if not text:
        return None
    match = _FREQ_MHZ_RE.search(text)
    if match is None:
        return None
    value = float(match.group(1))
    if match.group(2).lower() == "ghz":
        # Round instead of truncating: 2.01 * 1000 evaluates to
        # 2009.9999999999998 in binary floating point, so ``int()`` would
        # report 2009 MHz for a 2.01 GHz part.
        return round(value * 1000)
    return int(value)
157+
158+
159+
def parse_memory_gb(text: str) -> float | None:
    """Convert a memory-size string to gigabytes.

    ``"24 GB"`` → ``24.0``; ``"4096 MB"`` → ``4.0``. Megabyte values are
    divided by 1024 and rounded to three decimals. Returns ``None`` when
    *text* is empty or holds no ``GB``/``MB`` quantity.
    """
    found = _MEMORY_RE.search(text) if text else None
    if found is None:
        return None
    amount, unit = found.groups()
    size = float(amount)
    if unit.lower() == "gb":
        return size
    return round(size / 1024, 3)
168+
169+
170+
def parse_memory_bus_bit(text: str) -> int | None:
    """Extract a memory bus width in bits.

    ``"384-bit"`` → ``384``; ``"128 bit"`` → ``128``. Returns ``None`` for
    empty input or when no ``<digits>[-]bit`` token is present.
    """
    if not text:
        return None
    found = _BUS_RE.search(text)
    if found is None:
        return None
    return int(found.group(1))
176+
177+
178+
def parse_pcie_version(text: str) -> str | None:
    """Pull the PCIe generation out of an interface description.

    ``"PCIe 4.0 x16"`` → ``"4.0"``; ``"PCI-e Gen 5"`` → ``"5"``. The version
    is returned exactly as written in *text* (no normalisation of ``"5"`` vs
    ``"5.0"``). Returns ``None`` for empty input or when nothing matches.

    NOTE(review): the current ``_PCIE_RE`` allows only a single optional
    ``e`` after ``PCI``, so the spelled-out ``"PCI Express 4.0"`` form is
    not recognised.
    """
    hit = _PCIE_RE.search(text) if text else None
    return None if hit is None else hit.group(1)
184+
185+
186+
def guess_gpu_segment(name: str) -> str:
    """Classify a GPU product name as ``"enterprise"`` or ``"consumer"``.

    The name is lower-cased and searched for any of a fixed list of
    enterprise product-line markers (plain substring test); names with no
    marker are reported as ``"consumer"``.
    """
    haystack = name.lower()
    for marker in (
        "quadro", "tesla", "a100", "h100", "h200", "b100", "b200",
        "instinct", "mi300", "mi325", "mi350",
        "data center", "datacenter", "professional", "radeon pro",
        "rtx 6000", "rtx 5000", "rtx 4500", "rtx 4000",
    ):
        if marker in haystack:
            return "enterprise"
    return "consumer"
198+
199+
143200
def guess_cpu_segment(name: str) -> str:
144201
"""Heuristic CPU segment classifier.
145202

app/ingest/sources/wikipedia_cpu.py

Lines changed: 18 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,11 @@
22
33
Each ``List_of_<vendor>_<family>_processors`` page on Wikipedia is a series
44
of ``table.wikitable`` blocks; rows are individual SKUs and columns map to
5-
schema fields. Column headers vary subtly between pages, so we match by
5+
schema fields. Header text varies subtly between pages, so we match by
66
loose keywords (``"cores"``, ``"base"``, ``"tdp"`` …) rather than position.
7-
8-
Required output fields (per the validator): ``slug``, ``name``,
9-
``manufacturer``, ``release_date``, ``segment``, ``architecture``,
10-
``cores``, ``threads``. Anything missing collapses into
11-
``IngestCandidate.missing_fields``; the pipeline skips drafts unless a
12-
``--include-drafts`` flag is set.
7+
Rows whose first column is provided via ``rowspan`` on the preceding row
8+
(e.g. an ``Architecture`` cell shared across a generation) are materialised
9+
by the shared grid parser.
1310
"""
1411

1512
from __future__ import annotations
@@ -32,20 +29,26 @@
3229
parse_tdp_w,
3330
)
3431
from .base import IngestCandidate
32+
from .wikitable import parse_table
3533

3634
# (manufacturer, page, architecture-fallback). Architecture is overridden
37-
# per-table when a preceding ``<h2>``/``<h3>`` provides a better label.
35+
# per-row when the table has an explicit ``Architecture`` / ``Codename``
36+
# column, and per-table from the preceding section heading otherwise.
3837
PAGES: list[tuple[str, str, str]] = [
3938
("intel", "List_of_Intel_Core_processors", "Intel Core"),
4039
("intel", "List_of_Intel_Xeon_processors", "Intel Xeon"),
40+
("intel", "List_of_Intel_Atom_processors", "Intel Atom"),
4141
("amd", "List_of_AMD_Ryzen_processors", "AMD Ryzen"),
4242
("amd", "List_of_AMD_Epyc_processors", "AMD EPYC"),
43+
("amd", "List_of_AMD_Threadripper_processors", "AMD Threadripper"),
4344
]
4445

45-
# Lowercased header tokens → canonical field name. Only the first match wins
46-
# per row (so a "Cores/Threads" column maps to ``cores`` via the first hit).
46+
# Lowercased header tokens → canonical field name. Order matters: the first
47+
# matching fragment per cell wins (so a "Cores/Threads" column maps to
48+
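# ``cores`` rather than ``threads``).
# NOTE(review): "codename", "code name" and "core name" all contain "name",
# which is listed under ``model``; if the matcher still scans the rules in
# dict order with substring tests (as the removed ``_match_header`` did),
# those headers resolve to ``model`` and never to ``architecture``. Confirm,
# then reorder the rules or drop the bare "name" token.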
4749
HEADER_RULES: dict[str, list[str]] = {
4850
"model": ["model", "processor", "cpu", "name"],
51+
"architecture": ["architecture", "codename", "code name", "core name"],
4952
"cores": ["cores", "core"],
5053
"threads": ["threads", "thread"],
5154
"base_clock": ["base", "freq", "clock"],
@@ -88,85 +91,27 @@ def _extract(
8891
soup = BeautifulSoup(html, "html.parser")
8992
source_url = f"https://en.wikipedia.org/wiki/{page}"
9093
for table in soup.select("table.wikitable"):
91-
headers = _table_headers(table)
92-
if not headers or "model" not in headers.values():
93-
continue
94-
architecture = _nearest_section_label(table) or fallback_arch
95-
for row in table.select("tr"):
96-
cells = row.find_all(["td"])
97-
if not cells:
98-
continue
99-
row_text = _row_by_field(cells, headers)
100-
model = row_text.get("model")
101-
if not model:
102-
continue
94+
section_label = _nearest_section_label(table) or fallback_arch
95+
for row in parse_table(table, HEADER_RULES):
96+
model = row.cells.get("model", "")
10397
slug = slugify(model, manufacturer=manufacturer)
10498
if len(slug) < 4 or not any(ch.isdigit() for ch in slug):
10599
continue
100+
architecture = row.cells.get("architecture") or section_label
106101
yield _build_candidate(
107102
manufacturer=manufacturer,
108103
architecture=architecture,
109104
model=model,
110105
slug=slug,
111-
row=row_text,
106+
row=row.cells,
112107
source_url=source_url,
113108
)
114109

115110

116-
def _table_headers(table: Tag) -> dict[int, str]:
117-
"""Map column index → canonical field name based on header text."""
118-
header_row = table.find("tr")
119-
if header_row is None:
120-
return {}
121-
out: dict[int, str] = {}
122-
index = 0
123-
for cell in header_row.find_all(["th", "td"]):
124-
if not isinstance(cell, Tag):
125-
continue
126-
text = cell.get_text(" ", strip=True).lower()
127-
canonical = _match_header(text)
128-
if canonical is not None:
129-
out[index] = canonical
130-
index += _colspan(cell)
131-
return out
132-
133-
134-
def _match_header(text: str) -> str | None:
135-
for canonical, tokens in HEADER_RULES.items():
136-
for token in tokens:
137-
if token in text:
138-
return canonical
139-
return None
140-
141-
142-
def _row_by_field(cells: list[Tag], headers: dict[int, str]) -> dict[str, str]:
143-
result: dict[str, str] = {}
144-
index = 0
145-
for cell in cells:
146-
canonical = headers.get(index)
147-
if canonical is not None and canonical not in result:
148-
result[canonical] = cell.get_text(" ", strip=True)
149-
index += _colspan(cell)
150-
return result
151-
152-
153-
def _colspan(cell: Tag) -> int:
154-
raw = cell.attrs.get("colspan")
155-
if isinstance(raw, list):
156-
raw = raw[0] if raw else None
157-
if raw is None:
158-
return 1
159-
try:
160-
return int(raw)
161-
except (TypeError, ValueError):
162-
return 1
163-
164-
165111
def _nearest_section_label(table: Tag) -> str | None:
166112
for prev in table.find_all_previous(["h2", "h3", "h4"]):
167113
text = prev.get_text(" ", strip=True)
168114
if text and "edit" not in text.lower():
169-
# Wikipedia headings sometimes end with "[edit]" pre-strip.
170115
return text.split("[")[0].strip() or None
171116
return None
172117

0 commit comments

Comments
 (0)