Skip to content

Commit e17eacc

Browse files
committed
feat(ingest): smartphone source with curated-SoC foreign-key gating
Adds the third ingest category. Each per-brand Wikipedia smartphone list page yields IngestCandidates; a row's SoC text is normalized to a slug and looked up against the curated TechAPI SoC catalog (validator enforces the smartphone.soc FK). Phones whose SoC is not yet curated still emit a candidate but with soc=null and the FK in ``missing_fields``, so they collapse to drafts and stay out of the PR unless --include-drafts. New normalize helpers - parse_ram_gb ("6/8/12 GB" → 12, picks the flagship config) - parse_battery_mah ("5,000 mAh" → 5000, range-checks 500..12000) - parse_weight_g ("232 g" → 232, range-checks 50..500) - guess_os ("Android 14, One UI 6.1" → "Android 14") - soc_text_to_slug ("Qualcomm Snapdragon 8 Elite for Galaxy" → "snapdragon-8-elite"; trims vendor prefix + "for X" / "Mobile Platform" / parens suffix) The OS resolver intentionally ranks the underlying OS (Android, iOS) above OEM skins (One UI, OxygenOS, HyperOS) so phones whose row prints both record the OS, not the skin. CLI + workflow ``--category`` now accepts ``smartphone``. Tests (+ 30): RAM/battery/weight parsers (range checks), OS guesser priority + brand fallback, SoC slug normalization, full Wikipedia row extraction with both known and unknown SoCs.
1 parent 51f4354 commit e17eacc

6 files changed

Lines changed: 466 additions & 1 deletion

File tree

.github/workflows/weekly-ingest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ on:
1010
category:
1111
description: "Category to ingest"
1212
type: choice
13-
options: [cpu, gpu]
13+
options: [cpu, gpu, smartphone]
1414
default: cpu
1515
limit:
1616
description: "Max candidates per source"

app/ingest/__main__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@
2222
from .sources.base import IngestCandidate, IngestSource
2323
from .sources.wikipedia_cpu import WikipediaCpuIngest
2424
from .sources.wikipedia_gpu import WikipediaGpuIngest
25+
from .sources.wikipedia_smartphone import WikipediaSmartphoneIngest
2526

2627
# Registry of ingest sources keyed by category name. Every value is a list of
# source instances; the instances are created once, when this module is
# imported.
# NOTE(review): presumably the ``--category`` CLI/workflow option selects the
# key — confirm against the argument parsing further down this file.
SOURCES_BY_CATEGORY: dict[str, list[IngestSource]] = {
    "cpu": [WikipediaCpuIngest()],
    "gpu": [WikipediaGpuIngest()],
    "smartphone": [WikipediaSmartphoneIngest()],
}
3032

3133

app/ingest/normalize.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919
_BUS_RE = re.compile(r"(\d{2,4})\s*-?\s*bit\b", re.IGNORECASE)
2020
_PCIE_RE = re.compile(r"PCI[-\s]?[Ee]?\s*(?:Gen\s*)?(\d(?:\.\d)?)", re.IGNORECASE)
2121
_TDP_RE = re.compile(r"(\d{1,4})(?:\s*/\s*\d{1,4})?\s*W\b", re.IGNORECASE)
22+
_RAM_RE = re.compile(r"(\d{1,3}(?:\.\d+)?)\s*(GB|MB)\b", re.IGNORECASE)
23+
# Battery capacity such as "5000 mAh" / "4400mAh". The look-behind keeps a
# match from starting inside a longer digit run, so "105000 mAh" is rejected
# outright instead of being read as "05000 mAh".
_BATTERY_RE = re.compile(r"(?<!\d)(\d{3,5})\s*m\s*A\s*h\b", re.IGNORECASE)
24+
# Weight in grams such as "232 g" / "171.5 g". The look-behind keeps a match
# from starting inside a longer number, so "1232 g" is rejected outright
# instead of being read as "232 g". Deliberately case-sensitive: only a
# lower-case "g" is treated as the gram unit.
_WEIGHT_RE = re.compile(r"(?<![\d.])(\d{1,3}(?:\.\d+)?)\s*g\b")
25+
_OS_VERSION_RE = re.compile(r"\b(\d{1,2}(?:\.\d+)?)\b")
2226

2327
_ISO_DATE_RE = re.compile(r"^(\d{4})-(\d{2})-(\d{2})$")
2428
_NUMERIC_DATE_RE = re.compile(r"^(\d{4})/(\d{2})/(\d{2})$")
@@ -197,6 +201,80 @@ def guess_gpu_segment(name: str) -> str:
197201
return "consumer"
198202

199203

204+
def parse_ram_gb(text: str) -> int | None:
205+
"""``"8 GB"`` → ``8``; ``"512 MB"`` → ``None`` (sub-GB ignored).
206+
207+
Picks the largest value when the text lists multiple options like
208+
``"6/8/12 GB"`` to reflect the flagship configuration.
209+
"""
210+
if not text:
211+
return None
212+
values: list[int] = []
213+
for match in _RAM_RE.finditer(text):
214+
amount = float(match.group(1))
215+
unit = match.group(2).lower()
216+
if unit == "gb" and amount >= 1:
217+
values.append(int(amount))
218+
return max(values) if values else None
219+
220+
221+
def parse_battery_mah(text: str) -> int | None:
    """Extract a battery capacity in mAh from free-form cell text.

    ``"5,000 mAh"`` → ``5000``. Commas are removed before matching, and the
    first capacity found is the one that counts; a value outside
    [500, 12000] yields ``None``, as does empty or unmatched text.
    """
    if not text:
        return None
    found = _BATTERY_RE.search(text.replace(",", ""))
    if found is None:
        return None
    capacity = int(found.group(1))
    if capacity < 500 or capacity > 12000:
        return None
    return capacity
230+
231+
232+
def parse_weight_g(text: str) -> int | None:
    """Extract a weight in whole grams from free-form cell text.

    ``"232 g"`` → ``232``. Commas are removed before matching and fractional
    grams are truncated; a value outside [50, 500] yields ``None``, as does
    empty or unmatched text.
    """
    if not text:
        return None
    found = _WEIGHT_RE.search(text.replace(",", ""))
    if found is None:
        return None
    grams = int(float(found.group(1)))
    in_range = 50 <= grams <= 500
    return grams if in_range else None
241+
242+
243+
def guess_os(text: str, *, brand: str = "") -> str | None:
244+
"""Best-effort OS string from a Wikipedia smartphone row.
245+
246+
Recognizes Android/iOS/iPadOS/HarmonyOS/Windows; falls back to inferring
247+
``"iOS"`` for Apple brand, ``"Android"`` for everyone else, when the
248+
text is non-empty but does not name an OS.
249+
"""
250+
if not text:
251+
return None
252+
lowered = text.lower()
253+
# OS names first; OEM-skin names are fallback because rows usually print
254+
# the OS *and* the skin together (e.g. "Android 14, One UI 6.1") and the
255+
# underlying OS is the schema-relevant value.
256+
for token, label in (
257+
("ipados", "iPadOS"),
258+
("ios", "iOS"),
259+
("android", "Android"),
260+
("harmonyos", "HarmonyOS"),
261+
("windows phone", "Windows Phone"),
262+
("windows", "Windows"),
263+
("hyperos", "HyperOS"),
264+
("oxygenos", "OxygenOS"),
265+
("oneui", "One UI"),
266+
("one ui", "One UI"),
267+
):
268+
if token in lowered:
269+
version = _OS_VERSION_RE.search(text)
270+
return f"{label} {version.group(1)}" if version else label
271+
if brand == "apple":
272+
return "iOS"
273+
if brand:
274+
return "Android"
275+
return None
276+
277+
200278
def guess_cpu_segment(name: str) -> str:
201279
"""Heuristic CPU segment classifier.
202280
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
"""Wikipedia smartphone list pages → ``IngestCandidate`` rows.
2+
3+
Each major OEM has a ``List of <brand> smartphones`` (or equivalent) page on
4+
Wikipedia. We parse those, look up the SoC name against the curated TechAPI
5+
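SoC catalog, and emit a candidate for every row. ``soc`` is filled in only
6+
when the slug is already curated; otherwise it is left ``None`` and listed in
7+
``missing_fields`` — the validator enforces ``smartphone.soc`` foreign-key
integrity, so writing an unknown SoC slug would otherwise tank the whole dataset.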
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import re
13+
from collections.abc import Iterator
14+
from pathlib import Path
15+
16+
from bs4 import BeautifulSoup
17+
18+
from app.coverage.curated import curated_slugs
19+
from app.coverage.normalize import slugify
20+
from app.coverage.sources.wikipedia import fetch_wikipedia_html
21+
22+
from ..normalize import (
23+
guess_os,
24+
parse_battery_mah,
25+
parse_date,
26+
parse_ram_gb,
27+
parse_weight_g,
28+
)
29+
from .base import IngestCandidate
30+
from .wikitable import parse_table
31+
32+
# Default ``(brand, Wikipedia page title)`` pairs to scan. The brand is checked
# against the curated brand slugs and becomes the candidate's ``brand`` /
# ``manufacturer``; the page title is appended to
# ``https://en.wikipedia.org/wiki/`` to form the candidate's source URL.
# NOTE(review): ``List_of_OnePlus_products`` presumably lists non-phone
# products too — confirm such rows are filtered out somewhere downstream.
PAGES: list[tuple[str, str]] = [
    ("samsung", "List_of_Samsung_Galaxy_smartphones"),
    ("apple", "List_of_iPhone_models"),
    ("google", "Pixel_(smartphone)"),
    ("oneplus", "List_of_OnePlus_products"),
    ("xiaomi", "List_of_Xiaomi_smartphones"),
]
39+
40+
# Logical column name → header keywords, handed to ``parse_table`` for every
# ``table.wikitable`` on a page. Presumably a column is picked when its header
# text matches one of the keywords — confirm the exact matching rule in
# ``wikitable``.
# Only "model", "release_date", "soc", "ram", "battery", "weight" and "os" are
# read back when a candidate is built; "display" and "camera" are mapped but
# not consumed in this module.
HEADER_RULES: dict[str, list[str]] = {
    "model": ["model", "name"],
    "release_date": ["released", "release", "launched", "launch", "date"],
    "soc": ["soc", "chipset", "processor", "platform"],
    "ram": ["ram", "memory"],
    "battery": ["battery"],
    "weight": ["weight", "mass"],
    "os": ["os", "operating system", "software"],
    "display": ["display", "screen"],
    "camera": ["camera", "rear"],
}
51+
52+
# Vendor-name prefixes trimmed from the SoC text before it is slugified. Only
# the leading company name is dropped; the product family that follows
# (Snapdragon / Dimensity / Exynos / Tensor / Kirin) is kept. Apple is listed
# as well: its chips carry no separate family word, so "Apple A17 Pro" slugs
# as ``a17-pro``.
_SOC_VENDOR_PREFIXES = (
    "qualcomm",
    "mediatek",
    "samsung",
    "huawei",
    "google",
    "apple",
)
64+
65+
66+
class WikipediaSmartphoneIngest:
    """Per-row ingestion from Wikipedia per-brand smartphone list pages.

    Each configured ``(brand, page)`` pair is fetched, every
    ``table.wikitable`` on the page is parsed, and one ``IngestCandidate``
    is produced per usable row. Pages whose brand is not in the curated
    brand slugs are skipped without being fetched.
    """

    category = "smartphone"
    name = "wikipedia-smartphone-ingest"
    description = "Wikipedia: per-row extraction from brand-specific smartphone list pages."

    def __init__(self, pages: list[tuple[str, str]] | None = None) -> None:
        """Store the page list; ``None`` selects the module-level ``PAGES``."""
        self._pages = pages if pages is not None else PAGES
        # Left unset here and (re)loaded at the start of every ``fetch()`` so
        # tests can monkeypatch TECHAPI_DATA_DIR before the source touches
        # disk.
        self._known_socs: set[str] | None = None
        self._known_brands: set[str] | None = None

    def fetch(self, *, limit: int | None = None) -> Iterator[IngestCandidate]:
        """Yield candidates from all configured pages, in page order.

        Args:
            limit: Maximum number of candidates to yield across all pages;
                ``None`` means unlimited. Zero or a negative value yields
                nothing.

        Yields:
            One ``IngestCandidate`` per extracted table row.
        """
        # Function-scope import keeps this block self-contained.
        import logging

        if limit is not None and limit <= 0:
            # Without this guard the first candidate would be yielded before
            # the limit is ever compared.
            return
        self._refresh_curated_indexes()
        emitted = 0
        for brand, page in self._pages:
            if self._known_brands is not None and brand not in self._known_brands:
                continue
            try:
                html = fetch_wikipedia_html(page)
            except Exception:
                # Best effort: one unreachable page must not abort the other
                # brands, but the failure should not vanish silently either.
                logging.getLogger(__name__).warning(
                    "Skipping Wikipedia page %r for brand %r: fetch failed",
                    page,
                    brand,
                    exc_info=True,
                )
                continue
            for candidate in self._extract(html, brand, page, self._known_socs or set()):
                yield candidate
                emitted += 1
                if limit is not None and emitted >= limit:
                    return

    def _refresh_curated_indexes(self) -> None:
        """Reload the curated SoC and brand slug sets."""
        self._known_socs = curated_slugs("soc")
        self._known_brands = curated_slugs("brand")

    @staticmethod
    def _extract(
        html: str, brand: str, page: str, known_socs: set[str]
    ) -> Iterator[IngestCandidate]:
        """Yield one candidate per parsed row of every wikitable in ``html``.

        Rows whose model slug is shorter than three characters are skipped.
        """
        soup = BeautifulSoup(html, "html.parser")
        source_url = f"https://en.wikipedia.org/wiki/{page}"
        for table in soup.select("table.wikitable"):
            for row in parse_table(table, HEADER_RULES):
                model = row.cells.get("model", "")
                slug = slugify(model, manufacturer=brand)
                if len(slug) < 3:
                    continue
                yield _build_candidate(
                    brand=brand,
                    model=model,
                    slug=slug,
                    row=row.cells,
                    source_url=source_url,
                    known_socs=known_socs,
                )
120+
121+
122+
def soc_text_to_slug(text: str) -> str:
    """Heuristically map an SoC table cell to a TechAPI SoC slug.

    The shape of the text varies wildly — ``"Snapdragon 8 Gen 3"``,
    ``"Qualcomm Snapdragon 8 Gen 3 Mobile Platform"``,
    ``"Apple A17 Pro"`` — so we drop common promotional suffixes, strip one
    leading vendor prefix, then run the standard slugifier.

    Args:
        text: Raw SoC cell text.

    Returns:
        The slugified SoC name, or ``""`` for empty input.
    """
    if not text:
        return ""
    # Wikipedia often appends " for Galaxy" / " (Mobile Platform)" etc.
    cleaned = re.sub(r"\s+for\s+\w+\b", "", text, flags=re.IGNORECASE)
    cleaned = re.sub(r"\s+\(.+?\)\s*$", "", cleaned)
    cleaned = re.sub(r"\s+mobile\s+platform\b", "", cleaned, flags=re.IGNORECASE)
    # Strip *before* locating the prefix: testing a stripped copy while
    # slicing the unstripped text cuts at the wrong offset whenever the cell
    # starts with whitespace.
    cleaned = cleaned.strip()
    lowered = cleaned.lower()
    for prefix in _SOC_VENDOR_PREFIXES:
        if lowered.startswith(prefix + " "):
            # ``lstrip`` also absorbs a run of several separator spaces.
            cleaned = cleaned[len(prefix):].lstrip()
            break
    return slugify(cleaned)
142+
143+
144+
def _build_candidate(
    *,
    brand: str,
    model: str,
    slug: str,
    row: dict[str, str],
    source_url: str,
    known_socs: set[str],
) -> IngestCandidate:
    """Assemble one unverified smartphone candidate from a parsed table row.

    Args:
        brand: Brand slug; stored as ``brand``/``manufacturer`` and used as
            the output sub-directory.
        model: Model text from the row, used to build the display name.
        slug: Record slug; also the output file stem.
        row: Logical column name → cell text for this row.
        source_url: Page URL recorded as the record's only source.
        known_socs: Curated SoC slugs; only a member may be written to
            ``soc``.

    Returns:
        An ``IngestCandidate`` whose ``missing_fields`` names every required
        field that could not be filled (``soc`` included when the SoC is not
        curated).
    """
    release_date = parse_date(row.get("release_date", ""))
    soc_slug = soc_text_to_slug(row.get("soc", ""))
    # An uncurated slug is recorded as missing rather than written out.
    soc_value = soc_slug if soc_slug in known_socs else None

    ram_gb = parse_ram_gb(row.get("ram", ""))
    battery = parse_battery_mah(row.get("battery", ""))
    weight = parse_weight_g(row.get("weight", ""))
    os_value = guess_os(row.get("os", ""), brand=brand)

    # Prefix the brand unless the model text already leads with it. Both
    # sides are lower-cased; comparing a lower-cased model against
    # ``brand.upper()`` could never match.
    # NOTE(review): ``str.title()`` renders "oneplus" as "Oneplus" — confirm
    # whether display names need a curated spelling.
    already_branded = model.lower().startswith(brand.lower())
    name = model if already_branded else f"{brand.title()} {model}"

    record: dict[str, object | None] = {
        "slug": slug,
        "name": name,
        "brand": brand,
        "soc": soc_value,
        "release_date": release_date.isoformat() if release_date else None,
        "ram_gb": ram_gb,
        "battery_mah": battery,
        "weight_g": weight,
        "os": os_value,
        "msrp_usd": None,
        "verified": False,
        "source_urls": [source_url],
    }

    required = ("soc", "release_date", "ram_gb", "battery_mah", "weight_g", "os")
    missing = tuple(field for field in required if record.get(field) in (None, ""))

    output_path = Path("smartphone") / brand / f"{slug}.json"

    return IngestCandidate(
        category="smartphone",
        manufacturer=brand,
        slug=slug,
        record=record,
        source_url=source_url,
        output_path=output_path,
        missing_fields=missing,
    )

0 commit comments

Comments
 (0)