Skip to content

Commit 6ed2150

Browse files
committed
fix(ingest): mypy-friendly colspan helper + safer workflow YAML
- wikipedia_cpu.py: extract a typed ``_colspan(Tag) -> int`` helper so that the return type of bs4's ``Tag.get`` (str | AttributeValueList | None) no longer trips strict mypy at the two call sites.
- weekly-ingest.yml: move the ``inputs.*`` references into job-level ``env`` so that schedule-triggered runs (where ``inputs`` is undefined) don't fail expression validation. The token check is now a shell guard inside the PR step rather than a step-level ``if:`` that reads ``secrets.X``.
1 parent 08f6e3a commit 6ed2150

2 files changed

Lines changed: 35 additions & 20 deletions

File tree

.github/workflows/weekly-ingest.yml

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,15 @@ permissions:
2727
jobs:
2828
ingest:
2929
runs-on: ubuntu-latest
30+
env:
31+
CATEGORY: ${{ inputs.category || 'cpu' }}
32+
LIMIT: ${{ inputs.limit || '50' }}
33+
INCLUDE_DRAFTS: ${{ inputs.include_drafts || 'false' }}
34+
TECHAPI_PR_TOKEN: ${{ secrets.TECHAPI_PR_TOKEN }}
3035
steps:
3136
- uses: actions/checkout@v4
3237

33-
# Use the PAT when present so we can push to the TechAPI fork later;
38+
# Use the PAT when present so we can push to TechAPI later;
3439
# fall back to the default token for read-only test runs.
3540
- uses: actions/checkout@v4
3641
with:
@@ -49,16 +54,17 @@ jobs:
4954
- name: Run ingest
5055
env:
5156
TECHAPI_DATA_DIR: ${{ github.workspace }}/TechAPI/data
52-
INGEST_CATEGORY: ${{ inputs.category || 'cpu' }}
53-
INGEST_LIMIT: ${{ inputs.limit || '50' }}
54-
INGEST_DRAFTS: ${{ inputs.include_drafts && '--include-drafts' || '' }}
5557
run: |
58+
DRAFTS_FLAG=""
59+
if [ "$INCLUDE_DRAFTS" = "true" ]; then
60+
DRAFTS_FLAG="--include-drafts"
61+
fi
5662
python -m app.ingest \
57-
--category "$INGEST_CATEGORY" \
58-
--limit "$INGEST_LIMIT" \
63+
--category "$CATEGORY" \
64+
--limit "$LIMIT" \
5965
--data-root TechAPI/data \
6066
--summary ingest-summary.md \
61-
$INGEST_DRAFTS
67+
$DRAFTS_FLAG
6268
6369
- name: Upload summary artifact
6470
uses: actions/upload-artifact@v4
@@ -77,13 +83,15 @@ jobs:
7783
fi
7884
7985
- name: Open PR against TechAPI
80-
if: ${{ steps.changes.outputs.has_changes == 'true' && secrets.TECHAPI_PR_TOKEN != '' }}
86+
if: steps.changes.outputs.has_changes == 'true'
8187
env:
8288
GH_TOKEN: ${{ secrets.TECHAPI_PR_TOKEN }}
83-
CATEGORY: ${{ inputs.category || 'cpu' }}
84-
IS_DRAFT: ${{ inputs.include_drafts && 'true' || 'false' }}
8589
run: |
8690
set -euo pipefail
91+
if [ -z "${GH_TOKEN:-}" ]; then
92+
echo "::warning::Ingest produced additions but TECHAPI_PR_TOKEN is unset; skipping PR. Summary attached as artifact."
93+
exit 0
94+
fi
8795
cd TechAPI
8896
BRANCH="ingest/${CATEGORY}-$(date -u +%Y%m%d-%H%M%S)"
8997
git config user.name "techengine-bot"
@@ -93,7 +101,7 @@ jobs:
93101
git commit -m "feat(data/${CATEGORY}): weekly ingest"
94102
git push -u origin "$BRANCH"
95103
DRAFT_FLAG=""
96-
if [ "$IS_DRAFT" = "true" ]; then
104+
if [ "$INCLUDE_DRAFTS" = "true" ]; then
97105
DRAFT_FLAG="--draft"
98106
fi
99107
gh pr create \
@@ -102,8 +110,3 @@ jobs:
102110
--base main \
103111
--head "$BRANCH" \
104112
$DRAFT_FLAG
105-
106-
- name: Note when PR token is missing
107-
if: ${{ steps.changes.outputs.has_changes == 'true' && secrets.TECHAPI_PR_TOKEN == '' }}
108-
run: |
109-
echo "::warning::Ingest produced additions but TECHAPI_PR_TOKEN is unset; skipping PR. Summary attached as artifact."

app/ingest/sources/wikipedia_cpu.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,12 +121,13 @@ def _table_headers(table: Tag) -> dict[int, str]:
121121
out: dict[int, str] = {}
122122
index = 0
123123
for cell in header_row.find_all(["th", "td"]):
124+
if not isinstance(cell, Tag):
125+
continue
124126
text = cell.get_text(" ", strip=True).lower()
125127
canonical = _match_header(text)
126128
if canonical is not None:
127129
out[index] = canonical
128-
colspan = int(cell.get("colspan", 1) or 1)
129-
index += colspan
130+
index += _colspan(cell)
130131
return out
131132

132133

@@ -145,11 +146,22 @@ def _row_by_field(cells: list[Tag], headers: dict[int, str]) -> dict[str, str]:
145146
canonical = headers.get(index)
146147
if canonical is not None and canonical not in result:
147148
result[canonical] = cell.get_text(" ", strip=True)
148-
colspan = int(cell.get("colspan", 1) or 1)
149-
index += colspan
149+
index += _colspan(cell)
150150
return result
151151

152152

153+
def _colspan(cell: Tag) -> int:
    """Return how many table columns *cell* spans, never less than 1.

    The ``colspan`` attribute is read straight from ``cell.attrs``. A missing
    attribute, an empty list value, or a value that cannot be parsed as an
    integer all count as a span of 1. Zero and negative values are also
    treated as 1, so a caller that advances a column index by the result
    always moves forward by at least one column.

    Args:
        cell: The ``<td>``/``<th>`` element whose ``colspan`` is wanted.

    Returns:
        The parsed span, floored at 1.
    """
    raw = cell.attrs.get("colspan")
    if isinstance(raw, list):
        # The attribute value can arrive as a list; only its first entry is
        # used, and an empty list is handled like a missing attribute.
        raw = raw[0] if raw else None
    if raw is None:
        return 1
    try:
        span = int(raw)
    except (TypeError, ValueError):
        return 1
    # colspan="0" or a negative number is invalid markup; honouring it would
    # stall or rewind the running column index and misalign later cells.
    return max(span, 1)
163+
164+
153165
def _nearest_section_label(table: Tag) -> str | None:
154166
for prev in table.find_all_previous(["h2", "h3", "h4"]):
155167
text = prev.get_text(" ", strip=True)

0 commit comments

Comments
 (0)