-
Notifications
You must be signed in to change notification settings - Fork 0
167 lines (151 loc) · 5.95 KB
/
weekly-refresh.yml
File metadata and controls
167 lines (151 loc) · 5.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
name: weekly-refresh

# Weekly automated data refresh:
#   1. live-scrape benchmark sources into a TechAPI checkout
#   2. gate on FULL-dataset integrity (schema + cross-source anomalies)
#   3. regenerate the static v1 dump + openapi.json
#   4. open a dated refresh PR against the public TechAPI repo
#
# TechEngine owns collection/validation/dump; TechAPI owns data/site/deploy.
#
# Token model: TechAPI is public, so the checkout uses the default GITHUB_TOKEN
# (read-only) as a fallback — that lets the collect→validate→dump path run on
# every scheduled or manually dispatched run even when no PAT is configured.
# Only the cross-repo PR needs write access, so just that step is guarded by
# `secrets.TECHAPI_TOKEN`. Add the PAT (TechAPI Contents:write + Pull
# requests:write) as TECHAPI_TOKEN to enable PRs.
on:
  schedule:
    - cron: "0 6 * * 1"  # Mondays 06:00 UTC
  workflow_dispatch:
    inputs:
      sleep:
        description: "Seconds between scrape requests (politeness)"
        type: string
        default: "1.0"

permissions:
  contents: read

# One refresh at a time; an in-progress run is never cancelled by a newer one.
concurrency:
  group: weekly-refresh
  cancel-in-progress: false

jobs:
  refresh:
    # NOTE(review): no `timeout-minutes` is set, so a hung scrape is only
    # stopped by the platform's default job limit — consider adding a cap once
    # the typical run duration is known.
    runs-on: ubuntu-latest
    env:
      # `inputs.sleep` is empty on scheduled runs, so fall back to the default.
      SLEEP: ${{ inputs.sleep || '1.0' }}
      # Exposed as env so step-level `if:` conditions can test whether the PAT
      # is configured (see the two steps guarded on env.TECHAPI_TOKEN below).
      TECHAPI_TOKEN: ${{ secrets.TECHAPI_TOKEN }}
      # Validate/seed/dump all read the data tree from this env var.
      TECHAPI_DATA_DIR: ${{ github.workspace }}/techapi/data
    steps:
      - name: Checkout TechEngine
        uses: actions/checkout@v4

      # Read-only with the default token when no PAT is set; the PAT (when
      # present) lets peter-evans push the refresh branch back later.
      - name: Checkout TechAPI
        uses: actions/checkout@v4
        with:
          repository: GetTechAPI/TechAPI
          path: techapi
          token: ${{ secrets.TECHAPI_TOKEN || secrets.GITHUB_TOKEN }}

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: pip

      - name: Install TechEngine
        run: pip install -e .

      # The UTC date names the artifacts, the refresh branch and the PR title.
      - name: Compute refresh date
        id: meta
        run: echo "date=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

      # --- 1. Live collection (per-source; a flaky scrape must not sink the run) ---
      - name: Enrich benchmarks (all sources)
        run: |
          set -uo pipefail
          # Enrich one (component, source) pair. The `if !` absorbs a non-zero
          # exit so a failing source only emits a warning and the loop goes on.
          run_enrich() {
            comp="$1"; src="$2"
            echo "::group::enrich ${comp}/${src}"
            if ! python -m app.ingest.enrich \
              --source "$src" --component "$comp" \
              --data-root ./techapi/data --sleep "$SLEEP" \
              --summary "enrich-${comp}-${src}.md"; then
              echo "::warning::enrich source '${src}' (${comp}) failed; skipping"
            fi
            echo "::endgroup::"
          }
          for s in passmark cinebench-legacy cinebench-r23 cinebench-2024 \
                   cinebench-nbc geekbench-nbc spec-cpu2006 topcpu-cpu; do
            run_enrich cpu "$s"
          done
          for s in blender timespy passmark-gpu topcpu-gpu; do
            run_enrich gpu "$s"
          done

      # --- 2. Integrity gate over the WHOLE dataset (new + existing) ---
      # Either failure stops the job before the dump/PR, so contaminated data
      # can never reach a refresh PR.
      - name: Validate (schema / range / slug / FK)
        run: python -m app.validate

      - name: Integrity check (cross-source anomalies, strict gate)
        run: python integrity_check.py ./techapi/data --strict

      # --- 3. Static dump → site/public (what the Astro site fetches at runtime) ---
      - name: Generate static dump
        run: python -m app.dump --output ./techapi/site/public

      # --- PR body: per-source enrich summaries + gate result ---
      # Only reached when both gate steps succeeded, hence the fixed "passed".
      - name: Build PR body
        run: |
          {
            echo "# Weekly data refresh — ${{ steps.meta.outputs.date }}"
            echo
            echo "Automated live re-scrape + full-dataset integrity gate + static dump."
            echo
            echo "## Validation"
            echo "- \`app.validate\` (schema/range/slug/FK): **passed**"
            echo "- \`integrity_check.py --strict\` (cross-source anomaly gate): **passed**"
            echo
            echo "## Enrichment summaries"
            for f in enrich-*.md; do
              # Skip the unexpanded glob when no summary file was produced.
              [ -f "$f" ] || continue
              echo
              echo "<details><summary>$f</summary>"
              echo
              cat "$f"
              echo
              echo "</details>"
            done
          } > pr-body.md

      # Runs even after a failed gate so the enrich summaries stay inspectable.
      - name: Upload run artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: refresh-${{ steps.meta.outputs.date }}
          path: |
            enrich-*.md
            pr-body.md
          if-no-files-found: ignore

      # Fallback when no PAT: keep the regenerated dump so the work isn't lost.
      - name: Upload dump artifact (no-token fallback)
        if: env.TECHAPI_TOKEN == ''
        uses: actions/upload-artifact@v4
        with:
          name: dump-${{ steps.meta.outputs.date }}
          path: |
            techapi/site/public/v1
            techapi/site/public/openapi.json
          if-no-files-found: ignore

      # --- 4. Dated branch + auto PR against TechAPI (only with a PAT) ---
      - name: Create refresh PR
        if: env.TECHAPI_TOKEN != ''
        uses: peter-evans/create-pull-request@v6
        with:
          path: ./techapi
          token: ${{ secrets.TECHAPI_TOKEN }}
          branch: refresh/${{ steps.meta.outputs.date }}
          base: main
          add-paths: |
            data
            site/public/v1
            site/public/openapi.json
          commit-message: "chore(data): weekly refresh ${{ steps.meta.outputs.date }}"
          title: "chore(data): weekly refresh ${{ steps.meta.outputs.date }}"
          body-path: pr-body.md
          committer: TechEngineBot <289859915+TechEngineBot@users.noreply.github.com>
          author: TechEngineBot <289859915+TechEngineBot@users.noreply.github.com>
          delete-branch: true