feat(ingest): smartphone source with curated-SoC foreign-key gating

Seungpyo1007 · Seungpyo1007 · commit e17eaccb4b04 · 2026-05-30T09:41:53.000+09:00
Adds the third ingest category. Each per-brand Wikipedia smartphone list
page yields IngestCandidates; a row's SoC text is normalized to a slug
and looked up against the curated TechAPI SoC catalog (validator enforces
the smartphone.soc FK). Phones whose SoC is not yet curated still emit a
candidate but with soc=null and the FK in ``missing_fields``, so they
collapse to drafts and stay out of the PR unless --include-drafts.

New normalize helpers
- parse_ram_gb            ("6/8/12 GB" → 12, picks the flagship config)
- parse_battery_mah       ("5,000 mAh" → 5000, range-checks 500..12000)
- parse_weight_g          ("232 g" → 232, range-checks 50..500)
- guess_os                ("Android 14, One UI 6.1" → "Android 14")
- soc_text_to_slug        ("Qualcomm Snapdragon 8 Elite for Galaxy" →
                           "snapdragon-8-elite"; trims vendor prefix +
                           "for X" / "Mobile Platform" / parens suffix;
                           defined in sources/wikipedia_smartphone.py
                           rather than normalize.py)

The OS resolver intentionally ranks the underlying OS (Android, iOS) above
OEM skins (One UI, OxygenOS, HyperOS) so phones whose row prints both
record the OS, not the skin.

CLI + workflow ``--category`` now accepts ``smartphone``.

Tests (+ 30): RAM/battery/weight parsers (range checks), OS guesser
priority + brand fallback, SoC slug normalization, full Wikipedia row
extraction with both known and unknown SoCs.
diff --git a/.github/workflows/weekly-ingest.yml b/.github/workflows/weekly-ingest.yml
@@ -10,7 +10,7 @@ on:
       category:
         description: "Category to ingest"
         type: choice
-        options: [cpu, gpu]
+        options: [cpu, gpu, smartphone]
         default: cpu
       limit:
         description: "Max candidates per source"
diff --git a/app/ingest/__main__.py b/app/ingest/__main__.py
@@ -22,10 +22,12 @@
 from .sources.base import IngestCandidate, IngestSource
 from .sources.wikipedia_cpu import WikipediaCpuIngest
 from .sources.wikipedia_gpu import WikipediaGpuIngest
+from .sources.wikipedia_smartphone import WikipediaSmartphoneIngest
 
 SOURCES_BY_CATEGORY: dict[str, list[IngestSource]] = {
     "cpu": [WikipediaCpuIngest()],
     "gpu": [WikipediaGpuIngest()],
+    "smartphone": [WikipediaSmartphoneIngest()],
 }
 
 
diff --git a/app/ingest/normalize.py b/app/ingest/normalize.py
@@ -19,6 +19,10 @@
 _BUS_RE = re.compile(r"(\d{2,4})\s*-?\s*bit\b", re.IGNORECASE)
 _PCIE_RE = re.compile(r"PCI[-\s]?[Ee]?\s*(?:Gen\s*)?(\d(?:\.\d)?)", re.IGNORECASE)
 _TDP_RE = re.compile(r"(\d{1,4})(?:\s*/\s*\d{1,4})?\s*W\b", re.IGNORECASE)
+# Smartphone spec patterns. The leading negative lookbehinds stop a match from
+# starting in the middle of a longer number, so an implausible reading such as
+# "1232 g" is rejected outright instead of being truncated to "232".
+_RAM_RE = re.compile(r"(?<![\d.])(\d{1,3}(?:\.\d+)?)\s*(GB|MB)\b", re.IGNORECASE)
+_BATTERY_RE = re.compile(r"(?<!\d)(\d{3,5})\s*m\s*A\s*h\b", re.IGNORECASE)
+# Case-sensitive on purpose: a lowercase "g" keeps "GB" capacities out.
+_WEIGHT_RE = re.compile(r"(?<![\d.])(\d{1,3}(?:\.\d+)?)\s*g\b")
+# A standalone one- or two-digit number with an optional minor part.
+_OS_VERSION_RE = re.compile(r"\b(\d{1,2}(?:\.\d+)?)\b")
 
 _ISO_DATE_RE = re.compile(r"^(\d{4})-(\d{2})-(\d{2})$")
 _NUMERIC_DATE_RE = re.compile(r"^(\d{4})/(\d{2})/(\d{2})$")
@@ -197,6 +201,80 @@ def guess_gpu_segment(name: str) -> str:
     return "consumer"
 
 
+def parse_ram_gb(text: str) -> int | None:
+    """Return the largest whole-GB RAM size named in ``text``.
+
+    ``"8 GB"`` → ``8``; ``"512 MB"`` → ``None`` (sub-GB ignored).
+
+    Slash-separated option lists that share one trailing unit are read in
+    full, so ``"6/8/12 GB"`` and ``"12/8/6 GB"`` both yield ``12`` to reflect
+    the flagship configuration regardless of the order the page uses.
+    Fractional sizes are truncated. Returns ``None`` for empty input or when
+    no value of at least 1 GB is present.
+    """
+    if not text:
+        return None
+    sizes: list[int] = []
+    # One or more "/"-separated numbers followed by a single shared unit. The
+    # lookbehind keeps a match from starting inside a longer number.
+    for match in re.finditer(
+        r"(?<![\d.])(\d{1,3}(?:\.\d+)?(?:\s*/\s*\d{1,3}(?:\.\d+)?)*)\s*(GB|MB)\b",
+        text,
+        flags=re.IGNORECASE,
+    ):
+        if match.group(2).lower() != "gb":
+            continue
+        for part in match.group(1).split("/"):
+            amount = float(part)
+            if amount >= 1:
+                sizes.append(int(amount))
+    return max(sizes, default=None)
+
+
+def parse_battery_mah(text: str) -> int | None:
+    """Extract a battery capacity in mAh, e.g. ``"5,000 mAh"`` → ``5000``.
+
+    Commas are dropped before matching. Only the first capacity found is
+    considered, and it is discarded (``None``) unless it lies within
+    500–12000 mAh inclusive.
+    """
+    if text:
+        found = _BATTERY_RE.search(text.replace(",", ""))
+        if found is not None:
+            capacity = int(found.group(1))
+            if 500 <= capacity <= 12000:
+                return capacity
+    return None
+
+
+def parse_weight_g(text: str) -> int | None:
+    """Extract a weight in whole grams, e.g. ``"232 g"`` → ``232``.
+
+    Commas are removed before matching and a fractional reading is
+    truncated. Returns ``None`` when no weight is found or when the first
+    one found lies outside 50–500 g inclusive.
+    """
+    found = _WEIGHT_RE.search(text.replace(",", "")) if text else None
+    if found is None:
+        return None
+    grams = int(float(found.group(1)))
+    if grams < 50 or grams > 500:
+        return None
+    return grams
+
+
+def guess_os(text: str, *, brand: str = "") -> str | None:
+    """Best-effort OS string from a Wikipedia smartphone row.
+
+    Recognizes iPadOS/iOS/Android/HarmonyOS/Windows (Phone) first and the
+    OEM skins HyperOS/OxygenOS/One UI second. The version is the number that
+    directly follows the recognized name, so ``"One UI 6.1 (Android 14)"``
+    yields ``"Android 14"`` rather than borrowing the skin's ``6.1``. A name
+    with no adjacent version is returned bare.
+
+    When the text is non-empty but names no known OS, falls back to
+    ``"iOS"`` for the Apple brand and ``"Android"`` for any other non-empty
+    brand. Returns ``None`` for empty text, or when nothing is recognized
+    and no brand is given.
+    """
+    if not text:
+        return None
+    lowered = text.lower()
+    # OS names first; OEM-skin names are fallback because rows usually print
+    # the OS *and* the skin together (e.g. "Android 14, One UI 6.1") and the
+    # underlying OS is the schema-relevant value.
+    for token, label in (
+        ("ipados", "iPadOS"),
+        ("ios", "iOS"),
+        ("android", "Android"),
+        ("harmonyos", "HarmonyOS"),
+        ("windows phone", "Windows Phone"),
+        ("windows", "Windows"),
+        ("hyperos", "HyperOS"),
+        ("oxygenos", "OxygenOS"),
+        ("oneui", "One UI"),
+        ("one ui", "One UI"),
+    ):
+        # The name must start a word (so "KaiOS" is not read as iOS), and the
+        # optional version must sit right after it.
+        match = re.search(
+            rf"(?<![a-z]){re.escape(token)}(?:\s*(\d{{1,2}}(?:\.\d+)?)\b)?",
+            lowered,
+        )
+        if match:
+            version = match.group(1)
+            return f"{label} {version}" if version else label
+    if brand.lower() == "apple":
+        return "iOS"
+    if brand:
+        return "Android"
+    return None
+
+
 def guess_cpu_segment(name: str) -> str:
     """Heuristic CPU segment classifier.
 
diff --git a/app/ingest/sources/wikipedia_smartphone.py b/app/ingest/sources/wikipedia_smartphone.py
@@ -0,0 +1,197 @@
+"""Wikipedia smartphone list pages → ``IngestCandidate`` rows.
+
+Each major OEM has a ``List of <brand> smartphones`` (or equivalent) page on
+Wikipedia. We parse those and look up each row's SoC name against the curated
+TechAPI SoC catalog. Every row yields a candidate, but ``soc`` is only filled
+in when the SoC is already curated; otherwise it is left ``None`` and listed
+in ``missing_fields`` — the validator enforces ``smartphone.soc`` foreign-key
+integrity so unknown SoCs would otherwise tank the whole dataset.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from collections.abc import Iterator
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from app.coverage.curated import curated_slugs
+from app.coverage.normalize import slugify
+from app.coverage.sources.wikipedia import fetch_wikipedia_html
+
+from ..normalize import (
+    guess_os,
+    parse_battery_mah,
+    parse_date,
+    parse_ram_gb,
+    parse_weight_g,
+)
+from .base import IngestCandidate
+from .wikitable import parse_table
+
+PAGES: list[tuple[str, str]] = [  # (brand slug, Wikipedia page title) pairs
+    ("samsung", "List_of_Samsung_Galaxy_smartphones"),
+    ("apple", "List_of_iPhone_models"),
+    ("google", "Pixel_(smartphone)"),
+    ("oneplus", "List_of_OnePlus_products"),
+    ("xiaomi", "List_of_Xiaomi_smartphones"),
+]
+
+HEADER_RULES: dict[str, list[str]] = {  # field key -> header hints for parse_table
+    "model": ["model", "name"],
+    "release_date": ["released", "release", "launched", "launch", "date"],
+    "soc": ["soc", "chipset", "processor", "platform"],
+    "ram": ["ram", "memory"],
+    "battery": ["battery"],
+    "weight": ["weight", "mass"],
+    "os": ["os", "operating system", "software"],
+    "display": ["display", "screen"],  # not read by _build_candidate
+    "camera": ["camera", "rear"],  # not read by _build_candidate
+}
+
+# Vendor-name prefixes we trim before slugifying the SoC text. We keep the
+# product family (Snapdragon / Dimensity / Exynos / Tensor / Kirin) and drop
+# only the company name. Apple chips have no separate family word, so
+# "Apple A17 Pro" is reduced to the bare model, expected slug ``a17-pro``.
+_SOC_VENDOR_PREFIXES = (
+    "qualcomm",
+    "mediatek",
+    "samsung",
+    "huawei",
+    "google",
+    "apple",
+)
+
+
+class WikipediaSmartphoneIngest:
+    """Per-row ingestion from Wikipedia per-brand smartphone list pages.
+
+    Every ``table.wikitable`` on each configured page is parsed with
+    ``HEADER_RULES``; each row whose model slug has at least three
+    characters becomes one ``IngestCandidate``.
+    """
+
+    category = "smartphone"
+    name = "wikipedia-smartphone-ingest"
+    description = "Wikipedia: per-row extraction from brand-specific smartphone list pages."
+
+    def __init__(self, pages: list[tuple[str, str]] | None = None) -> None:
+        """Store the ``(brand, page)`` pairs to scan; defaults to ``PAGES``."""
+        self._pages = pages if pages is not None else PAGES
+        # Deferred: filled in by ``fetch()`` (and re-read on every call) so
+        # tests can monkeypatch TECHAPI_DATA_DIR before the source touches disk.
+        self._known_socs: set[str] | None = None
+        self._known_brands: set[str] | None = None
+
+    def fetch(self, *, limit: int | None = None) -> Iterator[IngestCandidate]:
+        """Yield candidates from every configured page, at most ``limit``.
+
+        Brands missing from the curated brand index are skipped. A page that
+        fails to download is logged and skipped so the remaining brands are
+        still processed. ``limit=None`` means unbounded; a ``limit`` of zero
+        or less yields nothing.
+        """
+        self._refresh_curated_indexes()
+        if limit is not None and limit <= 0:
+            return
+        known_brands = self._known_brands
+        known_socs = self._known_socs or set()
+        emitted = 0
+        for brand, page in self._pages:
+            if known_brands is not None and brand not in known_brands:
+                continue
+            try:
+                html = fetch_wikipedia_html(page)
+            except Exception:
+                # Best effort: one unreachable page must not abort the other
+                # brands, but the failure should leave a trace.
+                logging.getLogger(__name__).warning(
+                    "Skipping Wikipedia page %r for brand %r: fetch failed",
+                    page,
+                    brand,
+                    exc_info=True,
+                )
+                continue
+            for candidate in self._extract(html, brand, page, known_socs):
+                yield candidate
+                emitted += 1
+                if limit is not None and emitted >= limit:
+                    return
+
+    def _refresh_curated_indexes(self) -> None:
+        """Reload the curated SoC and brand slug sets."""
+        self._known_socs = curated_slugs("soc")
+        self._known_brands = curated_slugs("brand")
+
+    @staticmethod
+    def _extract(
+        html: str, brand: str, page: str, known_socs: set[str]
+    ) -> Iterator[IngestCandidate]:
+        """Yield one candidate per usable row of every wikitable in ``html``.
+
+        Rows whose model slug is shorter than three characters are dropped.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+        source_url = f"https://en.wikipedia.org/wiki/{page}"
+        for table in soup.select("table.wikitable"):
+            for row in parse_table(table, HEADER_RULES):
+                model = row.cells.get("model", "")
+                slug = slugify(model, manufacturer=brand)
+                if len(slug) < 3:
+                    continue
+                yield _build_candidate(
+                    brand=brand,
+                    model=model,
+                    slug=slug,
+                    row=row.cells,
+                    source_url=source_url,
+                    known_socs=known_socs,
+                )
+
+
+def soc_text_to_slug(text: str) -> str:
+    """Heuristically map an SoC table cell to a TechAPI SoC slug.
+
+    The shape of the text varies wildly — ``"Snapdragon 8 Gen 3"``,
+    ``"Qualcomm Snapdragon 8 Gen 3 Mobile Platform"``,
+    ``"Apple A17 Pro"`` — so we drop common promotional suffixes, strip one
+    leading vendor name, then run the standard slugifier.
+
+    Returns ``""`` for empty input.
+    """
+    if not text:
+        return ""
+    # Wikipedia often appends " for Galaxy" / " (Mobile Platform)" etc.
+    cleaned = re.sub(r"\s+for\s+\w+\b", "", text, flags=re.IGNORECASE)
+    cleaned = re.sub(r"\s+\(.+?\)\s*$", "", cleaned)
+    cleaned = re.sub(r"\s+mobile\s+platform\b", "", cleaned, flags=re.IGNORECASE)
+    # Collapse every whitespace run (including non-breaking spaces) to one
+    # space and trim the ends, so the prefix test and the slice below work on
+    # the same string and "Qualcomm\xa0Snapdragon" is still recognized.
+    cleaned = " ".join(cleaned.split())
+    lowered = cleaned.lower()
+    for prefix in _SOC_VENDOR_PREFIXES:
+        if lowered.startswith(prefix + " "):
+            cleaned = cleaned[len(prefix) + 1 :]
+            break
+    return slugify(cleaned)
+
+
+def _build_candidate(
+    *,
+    brand: str,
+    model: str,
+    slug: str,
+    row: dict[str, str],
+    source_url: str,
+    known_socs: set[str],
+) -> IngestCandidate:
+    """Assemble one smartphone ``IngestCandidate`` from a parsed table row.
+
+    ``row`` maps field keys (``release_date``, ``soc``, ``ram``, ``battery``,
+    ``weight``, ``os``) to raw cell text; absent keys are treated as empty.
+    The SoC is recorded only when its slug is in ``known_socs``; otherwise
+    ``soc`` is ``None``. Every required field that ends up ``None`` or empty
+    is reported in the candidate's ``missing_fields``.
+    """
+    release_date = parse_date(row.get("release_date", ""))
+    soc_text = row.get("soc", "")
+    soc_slug = soc_text_to_slug(soc_text)
+    soc_value = soc_slug if soc_slug in known_socs else None
+
+    ram_gb = parse_ram_gb(row.get("ram", ""))
+    battery = parse_battery_mah(row.get("battery", ""))
+    weight = parse_weight_g(row.get("weight", ""))
+    os_value = guess_os(row.get("os", ""), brand=brand)
+
+    # Prepend the brand only when the model does not already start with it.
+    # Both sides are lowercased so the test also holds for a brand that is
+    # not passed in lowercase.
+    if model.lower().startswith(brand.lower()):
+        name = model
+    else:
+        name = f"{brand.title()} {model}"
+
+    record: dict[str, object | None] = {
+        "slug": slug,
+        "name": name,
+        "brand": brand,
+        "soc": soc_value,
+        "release_date": release_date.isoformat() if release_date else None,
+        "ram_gb": ram_gb,
+        "battery_mah": battery,
+        "weight_g": weight,
+        "os": os_value,
+        "msrp_usd": None,
+        "verified": False,
+        "source_urls": [source_url],
+    }
+
+    required = ("soc", "release_date", "ram_gb", "battery_mah", "weight_g", "os")
+    missing = tuple(field for field in required if record.get(field) in (None, ""))
+
+    output_path = Path("smartphone") / brand / f"{slug}.json"
+
+    return IngestCandidate(
+        category="smartphone",
+        manufacturer=brand,
+        slug=slug,
+        record=record,
+        source_url=source_url,
+        output_path=output_path,
+        missing_fields=missing,
+    )
diff --git a/tests/unit/test_ingest_smartphone_parsers.py b/tests/unit/test_ingest_smartphone_parsers.py
diff --git a/tests/unit/test_ingest_wikipedia_smartphone.py b/tests/unit/test_ingest_wikipedia_smartphone.py