GetTechAPI
diff --git a/app/ingest/__main__.py b/app/ingest/__main__.py
Lines changed: 2 additions & 0 deletions
diff --git a/app/ingest/normalize.py b/app/ingest/normalize.py
Lines changed: 58 additions & 1 deletion
diff --git a/app/ingest/sources/wikipedia_cpu.py b/app/ingest/sources/wikipedia_cpu.py
Lines changed: 18 additions & 73 deletions
@@ -21,9 +21,11 @@
 from .pipeline import run
 from .sources.base import IngestCandidate, IngestSource
 from .sources.wikipedia_cpu import WikipediaCpuIngest
+from .sources.wikipedia_gpu import WikipediaGpuIngest
 
 SOURCES_BY_CATEGORY: dict[str, list[IngestSource]] = {
     "cpu": [WikipediaCpuIngest()],
+    "gpu": [WikipediaGpuIngest()],
 }
 
 
 
@@ -12,8 +12,12 @@
 from datetime import date
 
 _FREQ_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(GHz|MHz)\b", re.IGNORECASE)
-_INT_RE = re.compile(r"(\d{1,4})")
+# Accepts the same ``<number> GHz|MHz`` text as ``_FREQ_RE`` above; only the
+# order of the unit alternatives differs.
+# NOTE(review): consider reusing ``_FREQ_RE`` so the two cannot drift apart.
+_FREQ_MHZ_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(MHz|GHz)\b", re.IGNORECASE)
+# Captures a run of one to five digits.
+_INT_RE = re.compile(r"(\d{1,5})")
 _CACHE_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(MB|KB|GB)\b", re.IGNORECASE)
+# ``<number> GB|MB`` memory size such as "24 GB"; read by ``parse_memory_gb``.
+_MEMORY_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(GB|MB)\b", re.IGNORECASE)
+# Bus width such as "384-bit" or "128 bit": two to four digits before "bit".
+_BUS_RE = re.compile(r"(\d{2,4})\s*-?\s*bit\b", re.IGNORECASE)
+# PCI Express generation such as "PCIe 4.0" or "PCI-e Gen 5"; captures a single
+# digit with an optional ".<digit>" suffix. ``[Ee]`` is redundant here because
+# the pattern is already compiled with ``re.IGNORECASE``.
+_PCIE_RE = re.compile(r"PCI[-\s]?[Ee]?\s*(?:Gen\s*)?(\d(?:\.\d)?)", re.IGNORECASE)
 _TDP_RE = re.compile(r"(\d{1,4})(?:\s*/\s*\d{1,4})?\s*W\b", re.IGNORECASE)
 
 _ISO_DATE_RE = re.compile(r"^(\d{4})-(\d{2})-(\d{2})$")
@@ -140,6 +144,59 @@ def _safe_date(year: int, month: int, day: int) -> date | None:
         return None
 
 
+def parse_frequency_mhz(text: str) -> int | None:
+    """Parse a clock frequency from free text and return it in whole MHz.
+
+    ``"1500 MHz"`` → ``1500``; ``"2.5 GHz"`` → ``2500``.
+
+    Only the first ``<number> MHz|GHz`` occurrence in *text* is used, and the
+    result is rounded to the nearest megahertz. Returns ``None`` when *text*
+    is empty or contains no frequency.
+    """
+    if not text:
+        return None
+    match = _FREQ_MHZ_RE.search(text)
+    if match is None:
+        return None
+    value = float(match.group(1))
+    if match.group(2).lower() == "ghz":
+        value *= 1000
+    # Round rather than truncate: ``1.005 * 1000`` evaluates to
+    # ``1004.9999999999999`` in binary floating point, which ``int()`` would
+    # silently turn into 1004 MHz.
+    return round(value)
+
+
+def parse_memory_gb(text: str) -> float | None:
+    """Extract a memory size from *text*, expressed in gigabytes.
+
+    ``"24 GB"`` → ``24.0``; ``"4096 MB"`` → ``4.0``.
+
+    Megabyte figures are divided by 1024 and rounded to three decimals;
+    gigabyte figures are returned as parsed. Empty input, or text without a
+    ``GB``/``MB`` figure, yields ``None``.
+    """
+    found = _MEMORY_RE.search(text) if text else None
+    if found is None:
+        return None
+    amount, unit = found.groups()
+    size = float(amount)
+    if unit.lower() != "gb":
+        # The pattern only admits GB or MB, so anything else here is MB.
+        size = round(size / 1024, 3)
+    return size
+
+
+def parse_memory_bus_bit(text: str) -> int | None:
+    """Pull a memory-bus width, in bits, out of *text*.
+
+    ``"384-bit"`` → ``384``; ``"128 bit"`` → ``128``. Returns ``None`` for
+    empty input or when no ``<digits> bit`` figure is present.
+    """
+    if text:
+        hit = _BUS_RE.search(text)
+        if hit is not None:
+            return int(hit.group(1))
+    return None
+
+
+def parse_pcie_version(text: str) -> str | None:
+    """Return the PCIe generation mentioned in *text*, kept as a string.
+
+    ``"PCIe 4.0 x16"`` → ``"4.0"``; ``"PCI-e Gen 5"`` → ``"5"``. The captured
+    digits are returned exactly as written. Empty input or text without a
+    recognisable PCIe version gives ``None``.
+    """
+    hit = _PCIE_RE.search(text) if text else None
+    if hit is None:
+        return None
+    return hit.group(1)
+
+
+def guess_gpu_segment(name: str) -> str:
+    """Classify a GPU product name as ``"enterprise"`` or ``"consumer"``.
+
+    The name is lower-cased and scanned for a fixed list of substrings that
+    mark workstation or data-centre parts. Any hit means ``"enterprise"``;
+    everything else, including an empty name, is ``"consumer"``.
+    """
+    haystack = name.lower()
+    markers = (
+        "quadro",
+        "tesla",
+        "a100",
+        "h100",
+        "h200",
+        "b100",
+        "b200",
+        "instinct",
+        "mi300",
+        "mi325",
+        "mi350",
+        "data center",
+        "datacenter",
+        "professional",
+        "radeon pro",
+        "rtx 6000",
+        "rtx 5000",
+        "rtx 4500",
+        "rtx 4000",
+    )
+    for marker in markers:
+        if marker in haystack:
+            return "enterprise"
+    return "consumer"
+
+
 def guess_cpu_segment(name: str) -> str:
     """Heuristic CPU segment classifier.
 
 
@@ -2,14 +2,11 @@
 
 Each ``List_of_<vendor>_<family>_processors`` page on Wikipedia is a series
 of ``table.wikitable`` blocks; rows are individual SKUs and columns map to
-schema fields. Column headers vary subtly between pages, so we match by
+schema fields. Header text varies subtly between pages, so we match by
 loose keywords (``"cores"``, ``"base"``, ``"tdp"`` …) rather than position.
-
-Required output fields (per the validator): ``slug``, ``name``,
-``manufacturer``, ``release_date``, ``segment``, ``architecture``,
-``cores``, ``threads``. Anything missing collapses into
-``IngestCandidate.missing_fields``; the pipeline skips drafts unless a
-``--include-drafts`` flag is set.
+Rows whose first column is provided via ``rowspan`` on the preceding row
+(e.g. an ``Architecture`` cell shared across a generation) are materialised
+by the shared grid parser.
 """
 
 from __future__ import annotations
@@ -32,20 +29,26 @@
     parse_tdp_w,
 )
 from .base import IngestCandidate
+from .wikitable import parse_table
 
 # (manufacturer, page, architecture-fallback). Architecture is overridden
-# per-table when a preceding ``<h2>``/``<h3>`` provides a better label.
+# per-row when the table has an explicit ``Architecture`` / ``Codename``
+# column, and per-table from the preceding section heading otherwise.
 PAGES: list[tuple[str, str, str]] = [
     ("intel", "List_of_Intel_Core_processors", "Intel Core"),
     ("intel", "List_of_Intel_Xeon_processors", "Intel Xeon"),
+    ("intel", "List_of_Intel_Atom_processors", "Intel Atom"),
     ("amd", "List_of_AMD_Ryzen_processors", "AMD Ryzen"),
     ("amd", "List_of_AMD_Epyc_processors", "AMD EPYC"),
+    ("amd", "List_of_AMD_Threadripper_processors", "AMD Threadripper"),
 ]
 
-# Lowercased header tokens → canonical field name. Only the first match wins
-# per row (so a "Cores/Threads" column maps to ``cores`` via the first hit).
+# Lowercased header tokens → canonical field name. Order matters: the first
+# matching fragment per cell wins (so a "Cores/Threads" column maps to
+# ``cores`` rather than ``threads``).
 HEADER_RULES: dict[str, list[str]] = {
+    # Must come before "model": the tokens "codename", "code name" and
+    # "core name" all contain the model token "name", so with first-match-wins
+    # ordering a later "architecture" entry could never claim those headers.
+    # (Assumes the shared table parser tries rules in declaration order; confirm.)
+    "architecture": ["architecture", "codename", "code name", "core name"],
     "model": ["model", "processor", "cpu", "name"],
     "cores": ["cores", "core"],
     "threads": ["threads", "thread"],
     "base_clock": ["base", "freq", "clock"],
@@ -88,85 +91,27 @@ def _extract(
         soup = BeautifulSoup(html, "html.parser")
         source_url = f"https://en.wikipedia.org/wiki/{page}"
         for table in soup.select("table.wikitable"):
-            headers = _table_headers(table)
-            if not headers or "model" not in headers.values():
-                continue
-            architecture = _nearest_section_label(table) or fallback_arch
-            for row in table.select("tr"):
-                cells = row.find_all(["td"])
-                if not cells:
-                    continue
-                row_text = _row_by_field(cells, headers)
-                model = row_text.get("model")
-                if not model:
-                    continue
+            section_label = _nearest_section_label(table) or fallback_arch
+            for row in parse_table(table, HEADER_RULES):
+                model = row.cells.get("model", "")
                 slug = slugify(model, manufacturer=manufacturer)
                 if len(slug) < 4 or not any(ch.isdigit() for ch in slug):
                     continue
+                architecture = row.cells.get("architecture") or section_label
                 yield _build_candidate(
                     manufacturer=manufacturer,
                     architecture=architecture,
                     model=model,
                     slug=slug,
-                    row=row_text,
+                    row=row.cells,
                     source_url=source_url,
                 )
 
 
-def _table_headers(table: Tag) -> dict[int, str]:
-    """Map column index → canonical field name based on header text."""
-    header_row = table.find("tr")
-    if header_row is None:
-        return {}
-    out: dict[int, str] = {}
-    index = 0
-    for cell in header_row.find_all(["th", "td"]):
-        if not isinstance(cell, Tag):
-            continue
-        text = cell.get_text(" ", strip=True).lower()
-        canonical = _match_header(text)
-        if canonical is not None:
-            out[index] = canonical
-        index += _colspan(cell)
-    return out
-
-
-def _match_header(text: str) -> str | None:
-    for canonical, tokens in HEADER_RULES.items():
-        for token in tokens:
-            if token in text:
-                return canonical
-    return None
-
-
-def _row_by_field(cells: list[Tag], headers: dict[int, str]) -> dict[str, str]:
-    result: dict[str, str] = {}
-    index = 0
-    for cell in cells:
-        canonical = headers.get(index)
-        if canonical is not None and canonical not in result:
-            result[canonical] = cell.get_text(" ", strip=True)
-        index += _colspan(cell)
-    return result
-
-
-def _colspan(cell: Tag) -> int:
-    raw = cell.attrs.get("colspan")
-    if isinstance(raw, list):
-        raw = raw[0] if raw else None
-    if raw is None:
-        return 1
-    try:
-        return int(raw)
-    except (TypeError, ValueError):
-        return 1
-
-
 def _nearest_section_label(table: Tag) -> str | None:
+    """Return a section label taken from a heading that precedes *table*.
+
+    Iterates over the ``<h2>``/``<h3>``/``<h4>`` elements yielded by
+    ``find_all_previous`` and stops at the first one whose text is non-empty
+    and does not contain ``"edit"`` (case-insensitive). The text before the
+    first ``"["`` is returned, or ``None`` when that is empty or when no
+    heading qualifies.
+    """
+    # NOTE(review): a heading whose text still carries an "[edit]" marker is
+    # skipped outright by the "edit" test below, so the split on "[" never
+    # sees it; any heading merely containing "edit" is skipped too. Confirm
+    # this is the intended filter.
     for prev in table.find_all_previous(["h2", "h3", "h4"]):
         text = prev.get_text(" ", strip=True)
         if text and "edit" not in text.lower():
-            # Wikipedia headings sometimes end with "[edit]" pre-strip.
             return text.split("[")[0].strip() or None
     return None
Original file line number	Diff line number	Diff line change
`@@ -21,9 +21,11 @@`
`21`	`21`	`from .pipeline import run`
`22`	`22`	`from .sources.base import IngestCandidate, IngestSource`
`23`	`23`	`from .sources.wikipedia_cpu import WikipediaCpuIngest`
	`24`	`+from .sources.wikipedia_gpu import WikipediaGpuIngest`
`24`	`25`
`25`	`26`	`SOURCES_BY_CATEGORY: dict[str, list[IngestSource]] = {`
`26`	`27`	`"cpu": [WikipediaCpuIngest()],`
	`28`	`+ "gpu": [WikipediaGpuIngest()],`
`27`	`29`	`}`
`28`	`30`
`29`	`31`