From 23ccd8d2aa9dcd83987b698ecd33ba730039050f Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 6 May 2026 02:41:07 +0000 Subject: [PATCH 01/88] ci: add ci_test GitHub workflows --- .ci | 1 + .ci/README.md | 388 -------- .ci/agent.py | 1088 --------------------- .ci/build.py | 260 ----- .ci/ci_resource.py | 575 ----------- .ci/github_status.py | 100 -- .ci/images/ascend/Dockerfile | 29 - .ci/images/cambricon/Dockerfile | 33 - .ci/images/iluvatar/Dockerfile | 53 - .ci/images/metax/Dockerfile | 46 - .ci/images/moore/Dockerfile | 38 - .ci/images/nvidia/Dockerfile | 46 - .ci/restart-agent.sh | 50 - .ci/run.py | 499 ---------- .ci/tests/__init__.py | 0 .ci/tests/conftest.py | 46 - .ci/tests/test_agent.py | 724 -------------- .ci/tests/test_build.py | 207 ---- .ci/tests/test_github_status.py | 145 --- .ci/tests/test_resource.py | 434 -------- .ci/tests/test_run.py | 450 --------- .ci/tests/test_utils.py | 108 -- .ci/utils.py | 116 --- .ci/config.yaml => .github/ci_config.yaml | 51 +- .github/workflows/ci_test.yml | 15 + .gitmodules | 3 + 26 files changed, 39 insertions(+), 5466 deletions(-) create mode 160000 .ci delete mode 100644 .ci/README.md delete mode 100644 .ci/agent.py delete mode 100644 .ci/build.py delete mode 100644 .ci/ci_resource.py delete mode 100644 .ci/github_status.py delete mode 100644 .ci/images/ascend/Dockerfile delete mode 100644 .ci/images/cambricon/Dockerfile delete mode 100644 .ci/images/iluvatar/Dockerfile delete mode 100644 .ci/images/metax/Dockerfile delete mode 100644 .ci/images/moore/Dockerfile delete mode 100644 .ci/images/nvidia/Dockerfile delete mode 100755 .ci/restart-agent.sh delete mode 100644 .ci/run.py delete mode 100644 .ci/tests/__init__.py delete mode 100644 .ci/tests/conftest.py delete mode 100644 .ci/tests/test_agent.py delete mode 100644 .ci/tests/test_build.py delete mode 100644 .ci/tests/test_github_status.py delete mode 100644 .ci/tests/test_resource.py delete mode 100644 .ci/tests/test_run.py delete mode 100644 .ci/tests/test_utils.py delete mode 100644 .ci/utils.py rename .ci/config.yaml => .github/ci_config.yaml (78%) create mode 100644 .github/workflows/ci_test.yml create mode 100644 .gitmodules diff --git a/.ci b/.ci new file mode 160000 index 000000000..02bec4b85 --- /dev/null +++ b/.ci @@ -0,0 +1 @@ +Subproject commit 02bec4b85604e1f4cdcc673ce431ddd90ed2018f diff --git a/.ci/README.md b/.ci/README.md deleted file mode 100644 index bfc28ea1e..000000000 --- a/.ci/README.md +++ /dev/null @@ -1,388 +0,0 @@ -# .ci — CI Images and Pipeline - -``` -.ci/ -├── config.yaml # Unified config (images, jobs, agent definitions) -├── utils.py # Shared utilities (load_config, normalize_config, get_git_commit) -├── agent.py # Runner Agent (scheduler, webhooks, remote dispatch) -├── build.py # Image builder -├── run.py # CI pipeline runner (Docker layer) -├── ci_resource.py # GPU/memory detection and allocation -├── github_status.py # GitHub Commit Status reporting -├── images/ -│ ├── nvidia/Dockerfile -│ ├── iluvatar/Dockerfile -│ ├── metax/Dockerfile -│ ├── moore/Dockerfile -│ ├── cambricon/Dockerfile -│ └── ascend/Dockerfile -└── tests/ # Unit tests - ├── conftest.py - ├── test_agent.py - ├── test_build.py - ├── test_run.py - ├── test_resource.py - ├── test_github_status.py - └── test_utils.py -``` - -**Prerequisites**: Docker, Python 3.10+, `pip install pyyaml` - ---- - -## Configuration `config.yaml` - -Config uses a **platform-centric** top-level structure. Each platform defines its image, platform-level defaults, and job list. -At load time, jobs are flattened to `{platform}_{job}` format (e.g., `nvidia_gpu`). - -```yaml -repo: - url: https://github.com/InfiniTensor/InfiniOps.git - branch: master - -github: - status_context_prefix: "ci/infiniops" - -agents: # Remote agent URLs (used by CLI for cross-machine dispatch) - nvidia: - url: http://nvidia-host:8080 - iluvatar: - url: http://iluvatar-host:8080 - -platforms: - nvidia: - image: # Image definition - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 - setup: pip install .[dev] --no-build-isolation - jobs: - gpu: # Flattened as `nvidia_gpu`. - resources: - ngpus: 1 # Scheduler auto-picks this many free GPUs. - memory: 32GB - shm_size: 16g - timeout: 3600 - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - - iluvatar: - image: - dockerfile: .ci/images/iluvatar/ - build_args: - BASE_IMAGE: corex:qs_pj20250825 - APT_MIRROR: http://archive.ubuntu.com/ubuntu - PIP_INDEX_URL: https://pypi.org/simple - docker_args: # Platform-level docker args, inherited by all jobs - - "--privileged" - - "--cap-add=ALL" - - "--pid=host" - - "--ipc=host" - volumes: - - /dev:/dev - - /lib/firmware:/lib/firmware - - /usr/src:/usr/src - - /lib/modules:/lib/modules - setup: pip install .[dev] --no-build-isolation - jobs: - gpu: # Flattened as `iluvatar_gpu`. - resources: - ngpus: 1 - gpu_ids: auto - memory: 32GB - shm_size: 16g - timeout: 3600 - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml -``` - -### Config hierarchy - -| Level | Field | Description | -|---|---|---| -| **Platform** | `image` | Image definition (dockerfile, build_args) | -| | `image_tag` | Default image tag (defaults to `latest`) | -| | `docker_args` | Extra `docker run` args (e.g., `--privileged`) | -| | `volumes` | Extra volume mounts | -| | `setup` | In-container setup command | -| | `env` | Injected container env vars | -| **Job** | `resources.ngpus` | Number of GPUs to allocate (default: 1). Used with `gpu_ids: auto` for dynamic allocation | -| | `resources.gpu_ids` | `auto`: scheduler picks `ngpus` least-loaded GPUs. Static: pin to specific IDs (e.g., `"0"`, `"0,2"`). `all`: use all GPUs | -| | `resources.memory` | Container memory limit | -| | `resources.shm_size` | Shared memory size | -| | `resources.timeout` | Max run time in seconds | -| | `stages` | Execution stage list | -| | Any platform field | Jobs can override any platform-level default | - ---- - -## Image builder `build.py` - -| Flag | Description | -|---|---| -| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | Target platform (default: `all`) | -| `--commit` | Use specific commit ref as image tag (default: HEAD) | -| `--force` | Skip Dockerfile change detection | -| `--dry-run` | Print commands without executing | - -```bash -# Build with change detection (skips if no Dockerfile changes) -python .ci/build.py --platform nvidia - -# Build Iluvatar image -python .ci/build.py --platform iluvatar --force - -# Force build all platforms -python .ci/build.py --force -``` - -Build artifacts are stored as local Docker image tags: `infiniops-ci/:` and `:latest`. -Proxy and `no_proxy` env vars are forwarded from the host to `docker build` automatically. - -> `--push` is reserved for future use; requires a `registry` section in `config.yaml`. - ---- - -## Pipeline runner `run.py` - -Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`/`cnmon`/`npu-smi` on PATH), no manual specification needed. - -| Flag | Description | -|---|---| -| `--config` | Config file path (default: `.ci/config.yaml`) | -| `--job` | Job name (e.g., `nvidia_gpu`, `ascend_npu`). Defaults to all jobs for the current platform | -| `--branch` | Override clone branch (default: config `repo.branch`) | -| `--stage` | Run only the specified stage | -| `--image-tag` | Override image tag | -| `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via platform-specific env var) | -| `--test` | Override pytest test path (e.g., `tests/test_gemm.py::test_gemm`) | -| `--results-dir` | Host directory mounted to `/workspace/results` inside the container | -| `--local` | Mount current directory (read-only) instead of cloning from git | -| `--dry-run` | Print docker command without executing | - -```bash -# Simplest usage: auto-detect platform, run all jobs, use config default branch -python .ci/run.py - -# Run a specific job -python .ci/run.py --job nvidia_gpu - -# Run only the test stage, preview mode -python .ci/run.py --job nvidia_gpu --stage test --dry-run - -# Test local uncommitted changes without pushing -python .ci/run.py --local -``` - -Container execution flow: `git clone` → `checkout` → `setup` → stages (fail-fast: first failure breaks the loop and preserves the real exit code). -With `--local`, the current directory is mounted read-only at `/workspace/repo` and copied to a writable temp directory inside the container before setup runs — host files are never modified. -Proxy vars are forwarded from the host. Test results are written to `--results-dir` (each run gets a unique directory with timestamp + UUID suffix). Each run uses a clean environment (no host pip cache mounted). - ---- - -## Platform differences - -| Platform | GPU passthrough | Device env var | Base image | Detection tool | -|---|---|---|---|---| -| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | — (uses Docker flag) | `nvcr.io/nvidia/pytorch:25.12-py3` | `nvidia-smi` | -| Iluvatar | `--privileged` + `/dev` mount | `CUDA_VISIBLE_DEVICES` | `corex:qs_pj20250825` | `ixsmi` | -| MetaX | `--privileged` | `CUDA_VISIBLE_DEVICES` | `maca-pytorch:3.2.1.4-...` | `mx-smi` | -| Moore | `--privileged` | `MTHREADS_VISIBLE_DEVICES` | `vllm_musa:20251112_hygon` | `mthreads-gmi` | -| Cambricon | `--privileged` | `MLU_VISIBLE_DEVICES` | `cambricon/pytorch:v1.25.3` | `cnmon` | -| Ascend | `--privileged` + device mounts | `ASCEND_VISIBLE_DEVICES` | `vllm-ascend:v0.18.0rc1-openeuler` | `npu-smi` | - -Device visibility is derived from the platform name (see `PLATFORM_DEVICE_ENV` in `ci_resource.py`). NVIDIA uses Docker's `--gpus` flag; all other platforms use `--privileged` and control visibility via a platform-specific environment variable. - ---- - -## Runner Agent `agent.py` - -The Runner Agent supports CLI manual dispatch, GitHub webhook triggers, resource-aware dynamic scheduling, and cross-machine remote dispatch. - -### CLI manual execution - -```bash -# Run all jobs (dispatched to remote agents, using config default branch) -python .ci/agent.py run - -# Specify branch -python .ci/agent.py run --branch feat/xxx - -# Run a specific job -python .ci/agent.py run --job nvidia_gpu - -# Filter by platform -python .ci/agent.py run --platform nvidia - -# Preview mode -python .ci/agent.py run --dry-run -``` - -| Flag | Description | -|---|---| -| `--branch` | Test branch (default: config `repo.branch`) | -| `--job` | Specific job name | -| `--platform` | Filter jobs by platform | -| `--commit` | Override commit SHA used for GitHub status reporting | -| `--image-tag` | Override image tag | -| `--dry-run` | Preview mode | - -### Webhook server - -Deploy one Agent instance per platform machine (platform is auto-detected). On each machine: - -```bash -python .ci/agent.py serve --port 8080 -``` - -Additional `serve` flags: - -| Flag | Description | -|---|---| -| `--port` | Listen port (default: 8080) | -| `--host` | Listen address (default: `0.0.0.0`) | -| `--webhook-secret` | GitHub webhook signing secret (or `WEBHOOK_SECRET` env var) | -| `--api-token` | `/api/run` Bearer auth token (or `AGENT_API_TOKEN` env var) | -| `--results-dir` | Results directory (default: `ci-results`) | -| `--utilization-threshold` | GPU idle threshold percentage (default: 10) | - -| Endpoint | Method | Description | -|---|---|---| -| `/webhook` | POST | GitHub webhook (push/pull_request) | -| `/api/run` | POST | Remote job trigger | -| `/api/job/{id}` | GET | Query job status | -| `/api/job/{id}/log` | GET | Full job log (text/plain) | -| `/health` | GET | Health check | -| `/status` | GET | Queue + resource status | - -Webhook supports `X-Hub-Signature-256` signature verification via `--webhook-secret` or `WEBHOOK_SECRET` env var. - -### Remote agent configuration - -Configure agent URLs in `config.yaml`; the CLI automatically dispatches remote jobs to the corresponding agents: - -```yaml -agents: - nvidia: - url: http://:8080 - iluvatar: - url: http://:8080 - metax: - url: http://:8080 - moore: - url: http://:8080 -``` - -### Resource scheduling - -The Agent auto-detects GPU utilization and system memory to dynamically determine parallelism: -- GPUs with utilization < threshold (default 10%) and not already allocated → available -- Allocation picks the **least-loaded** GPUs first (sorted by utilization ascending) -- When `gpu_ids: auto` (default), the scheduler allocates `ngpus` GPUs per job -- When resources are insufficient, jobs are queued automatically (max 100 pending); completed jobs release resources and trigger scheduling of queued tasks -- Docker execution has a Python-level timeout fallback (job timeout + 120s) to prevent stuck containers - -### GitHub Status - -Set the `GITHUB_TOKEN` env var and the Agent will automatically report commit status: -- `pending` — job started -- `success` / `failure` — job completed - -Status context format: `ci/infiniops/{job_name}` - ---- - -## Multi-machine deployment guide - -### Per-platform setup - -Each machine needs Docker installed, the platform runtime, and the base CI image built. - -| Platform | Runtime check | Base image | Build command | -|---|---|---|---| -| NVIDIA | `nvidia-smi` (+ [Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) | `nvcr.io/nvidia/pytorch:25.12-py3` (public) | `python .ci/build.py --platform nvidia` | -| Iluvatar | `ixsmi` | `corex:qs_pj20250825` (import in advance) | `python .ci/build.py --platform iluvatar` | -| MetaX | `mx-smi` | `maca-pytorch:3.2.1.4-...` (import in advance) | `python .ci/build.py --platform metax` | -| Moore | `mthreads-gmi` | `vllm_musa:20251112_hygon` (import in advance) | `python .ci/build.py --platform moore` | -| Cambricon | `cnmon` | `cambricon/pytorch:v1.25.3` (import in advance) | `python .ci/build.py --platform cambricon` | -| Ascend | `npu-smi` (+ Ascend driver + CANN toolkit) | `vllm-ascend:v0.18.0rc1-openeuler` (import in advance) | `python .ci/build.py --platform ascend` | - -### Start Agent services - -On each machine (platform is auto-detected): - -```bash -python .ci/agent.py serve --port 8080 -``` - -### Configure remote agent URLs - -On the trigger machine, add the `agents` section to `config.yaml` (see [Remote agent configuration](#remote-agent-configuration) above for the format). - -### Trigger cross-platform tests - -```bash -# Run all platform jobs at once (using config default branch) -python .ci/agent.py run - -# Preview mode (no actual execution) -python .ci/agent.py run --dry-run - -# Run only a specific platform -python .ci/agent.py run --platform nvidia -``` - -### Optional configuration - -#### GitHub Status reporting - -Set the env var on all machines so each reports its own platform's test status: - -```bash -export GITHUB_TOKEN=ghp_xxxxxxxxxxxx -``` - -#### API Token authentication - -When agents are exposed on untrusted networks, enable token auth: - -```bash -python .ci/agent.py serve --port 8080 --api-token -# Or: export AGENT_API_TOKEN= -``` - -#### GitHub Webhook auto-trigger - -In GitHub repo → Settings → Webhooks, add a webhook for each machine: - -| Field | Value | -|---|---| -| Payload URL | `http://:8080/webhook` | -| Content type | `application/json` | -| Secret | Must match `--webhook-secret` | -| Events | `push` and `pull_request` | - -```bash -python .ci/agent.py serve --port 8080 --webhook-secret -# Or: export WEBHOOK_SECRET= -``` - -### Verification checklist - -```bash -# 1. Dry-run each machine individually -for platform in nvidia iluvatar metax moore cambricon ascend; do - python .ci/agent.py run --platform $platform --dry-run -done - -# 2. Health and resource checks -for ip in ; do - curl http://$ip:8080/health - curl http://$ip:8080/status -done - -# 3. Cross-platform test -python .ci/agent.py run --branch master -``` diff --git a/.ci/agent.py b/.ci/agent.py deleted file mode 100644 index 9e8899b50..000000000 --- a/.ci/agent.py +++ /dev/null @@ -1,1088 +0,0 @@ -#!/usr/bin/env python3 -"""CI Runner Agent: webhook server, resource-aware scheduler, GitHub status reporting. - -Usage: - # Run jobs locally (or dispatch to remote agents) - python .ci/agent.py run - python .ci/agent.py run --branch master --job nvidia_gpu --dry-run - - # Start webhook server (auto-detects platform) - python .ci/agent.py serve --port 8080 -""" - -import argparse -import collections -import hashlib -import hmac -import json -import os -import shlex -import subprocess -import sys -import threading -import time -import urllib.error -import urllib.request -import uuid -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime -from http.server import BaseHTTPRequestHandler, HTTPServer -from pathlib import Path - -import ci_resource as res -import github_status as gh -import run - -# Maximum POST body size (1 MB) to prevent memory exhaustion -MAX_CONTENT_LENGTH = 1 * 1024 * 1024 - -# Job states -STATE_QUEUED = "queued" -STATE_RUNNING = "running" -STATE_PENDING = "pending" -STATE_SUCCESS = "success" -STATE_FAILURE = "failure" -STATE_ERROR = "error" - -TAIL_LINES = 50 -MAX_QUEUE_SIZE = 100 - -# urllib helpers (module-level for easier mocking in tests) -urllib_request = urllib.request.Request -urllib_urlopen = urllib.request.urlopen - - -class QueueFullError(Exception): - """Raised when the job queue has reached its maximum size.""" - - -# --------------------------------------------------------------------------- -# Data classes -# --------------------------------------------------------------------------- - - -class JobRequest: - """Describes a CI job to be executed.""" - - def __init__( - self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None - ): - self.job_id = str(uuid.uuid4())[:8] - self.job_name = job_name - self.branch = branch - self.commit_sha = commit_sha - self.config = config - self.image_tag = image_tag - self.results_dir = results_dir or Path("ci-results") - self.created_at = datetime.now().isoformat() - - job = config["jobs"][job_name] - self.platform = job.get("platform", "nvidia") - - def to_dict(self): - return { - "job_id": self.job_id, - "job_name": self.job_name, - "branch": self.branch, - "commit_sha": self.commit_sha, - "platform": self.platform, - "created_at": self.created_at, - } - - -class JobResult: - """Outcome of a completed job.""" - - def __init__( - self, - job_id, - job_name, - commit_sha, - returncode, - results_dir, - duration, - error_tail=None, - log_file=None, - ): - self.job_id = job_id - self.job_name = job_name - self.commit_sha = commit_sha - self.returncode = returncode - self.results_dir = results_dir - self.duration = duration - self.error_tail = error_tail or [] - self.log_file = log_file - - self.state = STATE_SUCCESS if returncode == 0 else STATE_FAILURE - - def to_dict(self): - d = { - "job_id": self.job_id, - "job_name": self.job_name, - "commit_sha": self.commit_sha, - "state": self.state, - "returncode": self.returncode, - "results_dir": str(self.results_dir), - "duration_seconds": round(self.duration, 1), - } - - if self.error_tail: - d["error_tail"] = self.error_tail - - if self.log_file: - d["log_file"] = str(self.log_file) - - return d - - -# --------------------------------------------------------------------------- -# Scheduler -# --------------------------------------------------------------------------- - - -class Scheduler: - """Resource-aware job scheduler with dynamic parallelism.""" - - def __init__( - self, - config, - platform, - resource_pool, - results_dir=None, - max_workers=4, - no_status=False, - dry_run=False, - ): - self._config = config - self._platform = platform - self._resource_pool = resource_pool - self._results_dir = results_dir or Path("ci-results") - self._no_status = no_status - self._dry_run = dry_run - self._queue = collections.deque() - self._jobs: dict[str, dict] = {} # job_id -> {request, result, state, gpu_ids} - self._executor = ThreadPoolExecutor(max_workers=max_workers) - self._lock = threading.Lock() - self._done_event = threading.Event() - - # GitHub config - github_cfg = config.get("github", {}) - self._status_prefix = github_cfg.get("status_context_prefix", "ci/infiniops") - repo = config.get("repo", {}) - repo_url = repo.get("url", "") - self._owner, self._repo = gh.parse_repo_url(repo_url) - - def submit(self, job_request): - """Add a job to the queue and attempt to schedule it. - - Returns the job_id. Raises ``QueueFullError`` if the queue is at - capacity. - """ - with self._lock: - if len(self._queue) >= MAX_QUEUE_SIZE: - raise QueueFullError( - f"queue full ({MAX_QUEUE_SIZE} jobs), try again later" - ) - - self._jobs[job_request.job_id] = { - "request": job_request, - "result": None, - "state": STATE_QUEUED, - "gpu_ids": [], - } - self._queue.append(job_request) - - self._try_schedule() - return job_request.job_id - - def get_job(self, job_id): - """Get job info by ID.""" - with self._lock: - entry = self._jobs.get(job_id) - - if not entry: - return None - - info = entry["request"].to_dict() - info["state"] = entry["state"] - - if entry["result"]: - info.update(entry["result"].to_dict()) - - return info - - def get_job_log_file(self, job_id): - """Return the log file path for a completed job, or None.""" - with self._lock: - entry = self._jobs.get(job_id) - - if not entry or not entry["result"]: - return None - - return entry["result"].log_file - - def get_status(self): - """Return scheduler status for the /status endpoint.""" - with self._lock: - queued = [self._jobs[r.job_id]["request"].to_dict() for r in self._queue] - running = [] - completed = [] - - for entry in self._jobs.values(): - state = entry["state"] - - if state == STATE_RUNNING: - running.append( - {**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]} - ) - elif state in (STATE_SUCCESS, STATE_FAILURE): - completed.append(entry["result"].to_dict()) - - return { - "queued": queued, - "running": running, - "completed": completed[-20:], # Last 20 - "resources": self._resource_pool.get_status(), - } - - def wait_all(self): - """Block until all submitted jobs are done. Returns list of JobResult.""" - while True: - with self._lock: - pending = any( - e["state"] in (STATE_QUEUED, STATE_RUNNING) - for e in self._jobs.values() - ) - - if not pending: - break - - self._done_event.wait(timeout=2.0) - self._done_event.clear() - - with self._lock: - return [e["result"] for e in self._jobs.values() if e["result"] is not None] - - def _try_schedule(self): - """Try to run queued jobs that have enough resources. - - Resource allocation and job submission are split: allocation decisions - are made under the lock, but executor.submit() happens outside to - prevent deadlock when the thread pool is saturated. - """ - to_launch = [] # [(req, gpu_ids), ...] - - with self._lock: - remaining = collections.deque() - - while self._queue: - req = self._queue.popleft() - job_cfg = self._config["jobs"].get(req.job_name, {}) - gpu_count = res.parse_gpu_requirement(job_cfg) - memory_mb = res.parse_memory_requirement(job_cfg) - - if self._dry_run: - # In dry-run mode, skip resource checks - gpu_ids, ok = [], True - else: - gpu_ids, ok = self._resource_pool.allocate(gpu_count, memory_mb) - - if ok: - self._jobs[req.job_id]["state"] = STATE_RUNNING - self._jobs[req.job_id]["gpu_ids"] = gpu_ids - to_launch.append((req, gpu_ids)) - else: - remaining.append(req) - - self._queue = remaining - - # Submit outside the lock to avoid deadlock with ThreadPoolExecutor - for req, gpu_ids in to_launch: - self._executor.submit(self._run_job, req, gpu_ids) - - def _run_job(self, req, gpu_ids): - """Execute a single job in a worker thread. - - Wrapped in try/finally to guarantee GPU resources are always released - and job state is updated even on unexpected exceptions. - """ - context = gh.build_status_context(self._status_prefix, req.job_name) - result = None - - try: - # Post pending status - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - STATE_PENDING, - context, - f"Running {req.job_name}...", - ) - - job_cfg = self._config["jobs"][req.job_name] - all_stages = job_cfg.get("stages", []) - repo_url = self._config.get("repo", {}).get("url", "") - commit_short = ( - req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha - ) - results_dir = run.build_results_dir( - req.results_dir, req.platform, all_stages, commit_short - ) - - gpu_id_str = ",".join(str(g) for g in gpu_ids) if gpu_ids else None - docker_args = run.build_docker_args( - self._config, - req.job_name, - repo_url, - req.branch, - all_stages, - "/workspace", - req.image_tag, - gpu_id_override=gpu_id_str, - results_dir=results_dir, - ) - - start = time.monotonic() - - if self._dry_run: - print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}") - returncode = 0 - error_tail = [] - log_file = None - else: - results_dir.mkdir(parents=True, exist_ok=True) - log_file = results_dir / "job.log" - proc = subprocess.Popen( - docker_args, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - tail_buf = collections.deque(maxlen=TAIL_LINES) - - with open(log_file, "wb") as lf: - for line in proc.stdout: - sys.stdout.buffer.write(line) - lf.write(line) - tail_buf.append(line) - - proc.stdout.close() - - # Python-level timeout as fallback for the in-container timeout. - job_timeout = job_cfg.get("resources", {}).get("timeout") - fallback_timeout = (job_timeout + 120) if job_timeout else 7200 - - try: - returncode = proc.wait(timeout=fallback_timeout) - except subprocess.TimeoutExpired: - proc.kill() - proc.wait() - returncode = -9 - timeout_msg = f"Job killed: exceeded {fallback_timeout}s timeout\n" - tail_buf.append(timeout_msg.encode()) - - with open(log_file, "ab") as lf: - lf.write(timeout_msg.encode()) - - if returncode != 0: - error_tail = [ - raw.decode("utf-8", errors="replace").rstrip("\n") - for raw in tail_buf - ] - else: - error_tail = [] - - duration = time.monotonic() - start - - result = JobResult( - job_id=req.job_id, - job_name=req.job_name, - commit_sha=req.commit_sha, - returncode=returncode, - results_dir=results_dir, - duration=duration, - error_tail=error_tail, - log_file=log_file, - ) - - # Post final status - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - result.state, - context, - f"{req.job_name}: {result.state} in {duration:.0f}s", - ) - except Exception as e: - print( - f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr - ) - - if result is None: - result = JobResult( - job_id=req.job_id, - job_name=req.job_name, - commit_sha=req.commit_sha, - returncode=-1, - results_dir=req.results_dir, - duration=0, - error_tail=[str(e)], - ) - - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - STATE_ERROR, - context, - f"{req.job_name}: internal error", - ) - finally: - # Always release resources and update state - self._resource_pool.release(gpu_ids) - - with self._lock: - self._jobs[req.job_id]["result"] = result - self._jobs[req.job_id]["state"] = ( - result.state if result else STATE_FAILURE - ) - - self._done_event.set() - # Safe outside lock: `_try_schedule` acquires `self._lock` internally. - self._try_schedule() - - return result - - -# --------------------------------------------------------------------------- -# Webhook server -# --------------------------------------------------------------------------- - - -def verify_signature(secret, body, signature_header): - """Verify GitHub webhook HMAC-SHA256 signature.""" - if not signature_header: - return False - - expected = ( - "sha256=" + hmac.new(secret.encode("utf-8"), body, hashlib.sha256).hexdigest() - ) - return hmac.compare_digest(expected, signature_header) - - -def _verify_api_token(handler): - """Check Bearer token for /api/run authentication. - - Returns True if authenticated, False (and sends 401) if not. - When no api_token is configured on the server, all requests are allowed. - """ - api_token = getattr(handler.server, "api_token", None) - - if not api_token: - return True - - auth_header = handler.headers.get("Authorization", "") - - if auth_header == f"Bearer {api_token}": - return True - - handler._respond_json(401, {"error": "unauthorized"}) - return False - - -class WebhookHandler(BaseHTTPRequestHandler): - """HTTP handler for GitHub webhooks and API endpoints.""" - - def log_message(self, format, *args): - msg = format % args if args else format - print(f"[agent] {msg}", file=sys.stderr) - - def do_GET(self): - if self.path == "/health": - self._respond_json(200, {"status": "ok", "platform": self.server.platform}) - elif self.path == "/status": - status = self.server.scheduler.get_status() - self._respond_json(200, status) - elif self.path.startswith("/api/job/"): - self._handle_api_job() - else: - self._respond_json(404, {"error": "not found"}) - - def do_POST(self): - content_length = int(self.headers.get("Content-Length", 0)) - - if content_length > MAX_CONTENT_LENGTH: - self._respond_json(413, {"error": "payload too large"}) - return - - body = self.rfile.read(content_length) - - if self.path == "/webhook": - self._handle_webhook(body) - elif self.path == "/api/run": - self._handle_api_run(body) - else: - self._respond_json(404, {"error": "not found"}) - - def _handle_webhook(self, body): - # Verify signature if secret is configured - if self.server.webhook_secret: - sig = self.headers.get("X-Hub-Signature-256", "") - - if not verify_signature(self.server.webhook_secret, body, sig): - self._respond_json(401, {"error": "invalid signature"}) - return - - event_type = self.headers.get("X-GitHub-Event", "") - - if event_type == "ping": - self._respond_json(200, {"msg": "pong"}) - return - - try: - payload = json.loads(body) - except json.JSONDecodeError: - self._respond_json(400, {"error": "invalid JSON"}) - return - - if event_type == "push": - branch, sha = self._parse_push(payload) - elif event_type == "pull_request": - action = payload.get("action", "") - - if action not in ("opened", "synchronize"): - self._respond_json(200, {"msg": f"ignored PR action: {action}"}) - return - - branch, sha = self._parse_pull_request(payload) - else: - self._respond_json(200, {"msg": f"ignored event: {event_type}"}) - return - - if not branch or not sha: - self._respond_json(400, {"error": "could not extract branch/sha"}) - return - - job_ids = self._submit_jobs(branch, sha) - self._respond_json(200, {"accepted": True, "job_ids": job_ids}) - - def _handle_api_run(self, body): - """Handle /api/run: remote job trigger (requires Bearer token auth).""" - if not _verify_api_token(self): - return - - try: - payload = json.loads(body) - except json.JSONDecodeError: - self._respond_json(400, {"error": "invalid JSON"}) - return - - branch = payload.get("branch", "") - sha = payload.get("commit_sha", "") - job_name = payload.get("job") - image_tag = payload.get("image_tag") - - if not branch: - self._respond_json(400, {"error": "branch is required"}) - return - - if not sha: - sha = run.get_git_commit() - - job_ids = self._submit_jobs(branch, sha, job_name=job_name, image_tag=image_tag) - self._respond_json(200, {"accepted": True, "job_ids": job_ids}) - - def _handle_api_job(self): - """Handle `GET /api/job/{id}` and `GET /api/job/{id}/log`.""" - parts = self.path.rstrip("/").split("/") - - if len(parts) < 4: - self._respond_json(400, {"error": "missing job_id"}) - return - - job_id = parts[3] - - # `GET /api/job/{id}/log` — return full log file. - if len(parts) >= 5 and parts[4] == "log": - self._handle_job_log(job_id) - return - - info = self.server.scheduler.get_job(job_id) - - if info is None: - self._respond_json(404, {"error": f"job {job_id} not found"}) - else: - self._respond_json(200, info) - - def _handle_job_log(self, job_id): - """Return the full log file for a completed job.""" - log_file = self.server.scheduler.get_job_log_file(job_id) - - if log_file is None or not Path(log_file).is_file(): - self._respond_json(404, {"error": f"log not available for job {job_id}"}) - return - - try: - data = Path(log_file).read_bytes() - except OSError as e: - self._respond_json(500, {"error": f"failed to read log: {e}"}) - return - - self.send_response(200) - self.send_header("Content-Type", "text/plain; charset=utf-8") - self.send_header("Content-Length", str(len(data))) - self.end_headers() - self.wfile.write(data) - - def _parse_push(self, payload): - branch = payload.get("ref", "").removeprefix("refs/heads/") - sha = payload.get("after", "") - return branch, sha - - def _parse_pull_request(self, payload): - pr = payload.get("pull_request", {}) - head = pr.get("head", {}) - branch = head.get("ref", "") - sha = head.get("sha", "") - return branch, sha - - def _submit_jobs(self, branch, sha, job_name=None, image_tag=None): - config = self.server.config - - try: - job_names = run.resolve_job_names( - config.get("jobs", {}), - platform=self.server.platform, - job=job_name, - ) - except ValueError as e: - self._respond_json(400, {"error": str(e)}) - return [] - - job_ids = [] - - for name in job_names: - req = JobRequest( - job_name=name, - branch=branch, - commit_sha=sha, - config=config, - image_tag=image_tag, - results_dir=self.server.results_dir, - ) - - try: - jid = self.server.scheduler.submit(req) - except QueueFullError as e: - self._respond_json(503, {"error": str(e)}) - return job_ids - - job_ids.append(jid) - - return job_ids - - def _respond_json(self, status_code, data): - body = json.dumps(data, indent=2).encode("utf-8") - self.send_response(status_code) - self.send_header("Content-Type", "application/json") - self.send_header("Content-Length", str(len(body))) - self.end_headers() - self.wfile.write(body) - - -class AgentServer(HTTPServer): - """HTTP server with scheduler and config context.""" - - def __init__( - self, - host, - port, - config, - scheduler, - platform, - webhook_secret=None, - api_token=None, - results_dir=None, - ): - super().__init__((host, port), WebhookHandler) - self.config = config - self.scheduler = scheduler - self.platform = platform - self.webhook_secret = webhook_secret - self.api_token = api_token - self.results_dir = results_dir or Path("ci-results") - - -# --------------------------------------------------------------------------- -# Remote job dispatch (for CLI triggering remote agents) -# --------------------------------------------------------------------------- - - -def dispatch_remote_job( - agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None -): - """Send a job to a remote agent via HTTP API. Returns job_id or None.""" - url = f"{agent_url.rstrip('/')}/api/run" - body = { - "branch": branch, - "commit_sha": commit_sha, - "job": job_name, - } - - if image_tag: - body["image_tag"] = image_tag - - data = json.dumps(body).encode("utf-8") - headers = {"Content-Type": "application/json"} - - if api_token: - headers["Authorization"] = f"Bearer {api_token}" - - req = urllib_request(url, data=data, headers=headers, method="POST") - - try: - with urllib_urlopen(req, timeout=30) as resp: - result = json.loads(resp.read()) - job_ids = result.get("job_ids", []) - return job_ids[0] if job_ids else None - except Exception as e: - print(f"error: failed to dispatch to {agent_url}: {e}", file=sys.stderr) - return None - - -def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200): - """Poll a remote agent for job completion. Returns final state dict or None.""" - url = f"{agent_url.rstrip('/')}/api/job/{job_id}" - deadline = time.monotonic() + timeout - consecutive_failures = 0 - - while time.monotonic() < deadline: - try: - req = urllib_request(url) - - with urllib_urlopen(req, timeout=10) as resp: - info = json.loads(resp.read()) - - consecutive_failures = 0 - state = info.get("state", "") - - if state in (STATE_SUCCESS, STATE_FAILURE): - return info - except Exception as e: - consecutive_failures += 1 - - if consecutive_failures == 1 or consecutive_failures % 20 == 0: - print( - f"warning: polling {url} failed ({consecutive_failures}x): {e}", - file=sys.stderr, - ) - - time.sleep(interval) - - return None - - -def fetch_remote_log(agent_url, job_id): - """Fetch the full log for a completed remote job. Returns text or None.""" - url = f"{agent_url.rstrip('/')}/api/job/{job_id}/log" - - try: - req = urllib_request(url) - - with urllib_urlopen(req, timeout=30) as resp: - return resp.read().decode("utf-8", errors="replace") - except Exception: - return None - - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- - - -def cmd_run(args): - """Handle 'run' subcommand: dispatch jobs to platform agents via HTTP.""" - config = run.load_config(args.config) - agents = config.get("agents", {}) - branch = args.branch or config.get("repo", {}).get("branch", "master") - commit_sha = args.commit or run.get_git_commit(short=False) - - # Determine which jobs to run - try: - job_names = run.resolve_job_names( - config.get("jobs", {}), platform=args.platform, job=args.job - ) - except ValueError as e: - print(f"error: {e}", file=sys.stderr) - sys.exit(1) - - if not job_names: - print("error: no matching jobs found", file=sys.stderr) - sys.exit(1) - - # Resolve agent URL for each job - jobs_to_dispatch = [] # [(name, agent_url)] - - for name in job_names: - job = config.get("jobs", {}).get(name, {}) - platform = job.get("platform", "") - agent_url = agents.get(platform, {}).get("url", "") - - if not agent_url: - print( - f"error: no agent URL configured for platform {platform!r} (job {name})", - file=sys.stderr, - ) - sys.exit(1) - - jobs_to_dispatch.append((name, agent_url)) - - api_token = os.environ.get("AGENT_API_TOKEN", "") - results = [] - - if args.dry_run: - for name, agent_url in jobs_to_dispatch: - platform, _, job = name.partition("_") - print(f"[dry-run] dispatch {platform} {job} job to {agent_url}") - else: - # Dispatch all jobs, then poll concurrently. - dispatched = [] # [(name, agent_url, job_id)] - - for name, agent_url in jobs_to_dispatch: - platform, _, job = name.partition("_") - print( - f"==> dispatching {platform} {job} job to {agent_url}", - file=sys.stderr, - ) - job_id = dispatch_remote_job( - agent_url, - name, - branch, - commit_sha, - args.image_tag, - api_token=api_token or None, - ) - - if job_id: - print(f" job_id: {job_id}", file=sys.stderr) - dispatched.append((name, agent_url, job_id)) - else: - print(f" failed to dispatch {name}", file=sys.stderr) - results.append({"job_name": name, "state": "error"}) - - if dispatched: - with ThreadPoolExecutor(max_workers=len(dispatched)) as executor: - futures = { - executor.submit(poll_remote_job, url, jid): (name, url, jid) - for name, url, jid in dispatched - } - - # Collect name lengths for column alignment. - name_width = max(len(n) for n, _, _ in dispatched) - - for future in as_completed(futures): - name, agent_url, job_id = futures[future] - result = future.result() - - if result: - state = result.get("state", "unknown") - duration = result.get("duration_seconds", 0) - tag = "PASS" if state == STATE_SUCCESS else "FAIL" - print( - f"<== {tag} {name:<{name_width}} ({duration:.0f}s)", - file=sys.stderr, - ) - - if state != STATE_SUCCESS: - full_log = fetch_remote_log(agent_url, job_id) - - if full_log: - print( - f"--- full log ({name}) ---", - file=sys.stderr, - ) - print(full_log, file=sys.stderr) - print("---", file=sys.stderr) - else: - # Fall back to `error_tail` if full log unavailable. - error_tail = result.get("error_tail", []) - - if error_tail: - print( - f"--- error output (last {len(error_tail)} lines) ---", - file=sys.stderr, - ) - - for line in error_tail: - print(f" {line}", file=sys.stderr) - - print("---", file=sys.stderr) - - results.append(result) - else: - print( - f"<== TIMEOUT {name:<{name_width}}", - file=sys.stderr, - ) - results.append({"job_name": name, "state": "timeout"}) - - # Summary: only print when there are failures. - failed = [r for r in results if r.get("state") != STATE_SUCCESS] - - if failed: - print("\n========== Failed ==========", file=sys.stderr) - name_width = max(len(r.get("job_name", "?")) for r in failed) - - for r in failed: - name = r.get("job_name", "?") - state = r.get("state", "unknown") - duration = r.get("duration_seconds", 0) - print( - f" FAIL {name:<{name_width}} {state} ({duration:.0f}s)", - file=sys.stderr, - ) - - sys.exit(1) - - -def cmd_serve(args): - """Handle 'serve' subcommand: start webhook server.""" - config = run.load_config(args.config) - - platform = res.detect_platform() - - if not platform: - print( - "error: could not detect platform (no nvidia-smi or ixsmi found)", - file=sys.stderr, - ) - sys.exit(1) - - try: - run.resolve_job_names(config.get("jobs", {}), platform=platform) - except ValueError as e: - print(f"error: {e}", file=sys.stderr) - sys.exit(1) - - pool = res.ResourcePool( - platform, - utilization_threshold=args.utilization_threshold, - ) - scheduler = Scheduler( - config, - platform, - pool, - results_dir=args.results_dir, - ) - - webhook_secret = args.webhook_secret or os.environ.get("WEBHOOK_SECRET", "") - api_token = args.api_token or os.environ.get("AGENT_API_TOKEN", "") - - if not webhook_secret: - print( - "WARNING: No webhook secret configured. Webhook endpoint accepts " - "unsigned requests. Set --webhook-secret or WEBHOOK_SECRET for production.", - file=sys.stderr, - ) - - if not api_token: - print( - "WARNING: No API token configured. /api/run endpoint is unauthenticated. " - "Set --api-token or AGENT_API_TOKEN for production.", - file=sys.stderr, - ) - - server = AgentServer( - args.host, - args.port, - config, - scheduler, - platform, - webhook_secret=webhook_secret or None, - api_token=api_token or None, - results_dir=args.results_dir, - ) - - print( - f"Agent serving on {args.host}:{args.port} (platform={platform})", - file=sys.stderr, - ) - print(" POST /webhook — GitHub webhook", file=sys.stderr) - print(" POST /api/run — remote job trigger", file=sys.stderr) - print(" GET /health — health check", file=sys.stderr) - print(" GET /status — queue & resource status", file=sys.stderr) - print(" GET /api/job/{id} — job status", file=sys.stderr) - print(" GET /api/job/{id}/log — full job log", file=sys.stderr) - - try: - server.serve_forever() - except KeyboardInterrupt: - print("\nShutting down...", file=sys.stderr) - server.shutdown() - - -def main(): - parser = argparse.ArgumentParser( - description="CI Runner Agent: run jobs locally, dispatch remotely, or serve webhooks", - ) - subparsers = parser.add_subparsers(dest="command") - - # --- run subcommand --- - run_parser = subparsers.add_parser("run", help="Run CI jobs") - run_parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - ) - run_parser.add_argument( - "--branch", type=str, help="Branch to test (default: config repo.branch)" - ) - run_parser.add_argument("--job", type=str, help="Specific job name") - run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") - run_parser.add_argument("--image-tag", type=str, help="Override image tag") - run_parser.add_argument("--commit", type=str, help="Override commit SHA") - run_parser.add_argument("--dry-run", action="store_true") - - # --- serve subcommand --- - serve_parser = subparsers.add_parser("serve", help="Start webhook server") - serve_parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - ) - serve_parser.add_argument("--port", type=int, default=8080) - serve_parser.add_argument("--host", type=str, default="0.0.0.0") - serve_parser.add_argument("--webhook-secret", type=str) - serve_parser.add_argument( - "--api-token", - type=str, - help="Bearer token for /api/run authentication (or AGENT_API_TOKEN env var)", - ) - serve_parser.add_argument( - "--results-dir", - type=Path, - default=Path("ci-results"), - ) - serve_parser.add_argument( - "--utilization-threshold", - type=int, - default=10, - ) - - args = parser.parse_args() - - if args.command == "run": - cmd_run(args) - elif args.command == "serve": - cmd_serve(args) - else: - parser.print_help() - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/.ci/build.py b/.ci/build.py deleted file mode 100644 index b373cb7d1..000000000 --- a/.ci/build.py +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env python3 -"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" - -import argparse -import json -import os -import shlex -import subprocess -import sys -from pathlib import Path - -from utils import get_git_commit, load_config - - -def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): - """Check if any file under `dockerfile_dir` changed since `base_ref`.""" - result = subprocess.run( - ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print( - "warning: git diff failed (shallow clone or initial commit?);" - " assuming Dockerfile changed", - file=sys.stderr, - ) - return True - - return bool(result.stdout.strip()) - - -def docker_login(registry_cfg, dry_run): - """Log in to the registry using `credentials_env` token. - - Returns True on success. - - NOTE: Registry support is currently unused (`config.yaml` has no registry - section). Retained for future integration with an external image management - system. - """ - credentials_env = registry_cfg.get("credentials_env") - registry_url = registry_cfg.get("url", "") - - if not credentials_env or not registry_url: - return True - - token = os.environ.get(credentials_env) - - if not token: - print( - f"error: {credentials_env} not set, cannot login", - file=sys.stderr, - ) - return False - - if dry_run: - print( - f"[dry-run] echo | docker login {registry_url}" - " --username token --password-stdin" - ) - return True - - result = subprocess.run( - ["docker", "login", registry_url, "--username", "token", "--password-stdin"], - input=token, - text=True, - ) - - if result.returncode != 0: - print("error: docker login failed", file=sys.stderr) - return False - - return True - - -def build_image_tag(registry_url, project, platform, tag): - if registry_url: - return f"{registry_url}/{project}/{platform}:{tag}" - - return f"{project}-ci/{platform}:{tag}" - - -def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run, logged_in): - """Build a single platform image. Returns True on success.""" - registry_url = registry_cfg.get("url", "") - project = registry_cfg.get("project", "infiniops") - dockerfile_dir = platform_cfg["dockerfile"] - commit_tag = build_image_tag(registry_url, project, platform, commit) - latest_tag = build_image_tag(registry_url, project, platform, "latest") - - build_args_cfg = platform_cfg.get("build_args", {}) - build_cmd = ["docker", "build", "--network", "host"] - - for key, value in build_args_cfg.items(): - build_cmd.extend(["--build-arg", f"{key}={value}"]) - - for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) - - if proxy_val: - build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) - build_cmd.extend(["--build-arg", f"{proxy_var.lower()}={proxy_val}"]) - - private_sdk = platform_cfg.get("private_sdk", {}) - - if private_sdk: - source_env = private_sdk.get("source_env", "") - sdk_url = os.environ.get(source_env, "") if source_env else "" - - if sdk_url: - build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) - - build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) - - if dry_run: - print(f"[dry-run] {shlex.join(build_cmd)}") - - if push: - if not logged_in: - print("[dry-run] (skipping push: docker login failed)") - else: - print(f"[dry-run] docker push {commit_tag}") - print(f"[dry-run] docker push {latest_tag}") - - return True - - print(f"==> building {platform}: {commit_tag}", file=sys.stderr) - result = subprocess.run(build_cmd) - - if result.returncode != 0: - error = { - "stage": "build", - "platform": platform, - "tag": commit_tag, - "exit_code": result.returncode, - } - print(json.dumps(error), file=sys.stderr) - - return False - - if push: - if not logged_in: - print("error: docker login failed, cannot push", file=sys.stderr) - return False - - for tag in (commit_tag, latest_tag): - print(f"==> pushing {tag}", file=sys.stderr) - push_result = subprocess.run(["docker", "push", tag]) - - if push_result.returncode != 0: - error = { - "stage": "push", - "platform": platform, - "tag": tag, - "exit_code": push_result.returncode, - } - print(json.dumps(error), file=sys.stderr) - - return False - - return True - - -def main(): - parser = argparse.ArgumentParser(description="Build CI Docker images") - parser.add_argument( - "--platform", - type=str, - default="all", - help="Platform to build (nvidia, iluvatar, metax, moore, cambricon, ascend, or all). Default: all", - ) - parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - help="Path to config.yaml", - ) - parser.add_argument( - "--commit", - type=str, - default="HEAD", - help="Git ref for tagging the image (default: HEAD)", - ) - parser.add_argument( - "--push", - action="store_true", - help="Push images to registry after building (requires registry in config)", - ) - parser.add_argument( - "--force", - action="store_true", - help="Skip change detection and force build", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print commands without executing", - ) - args = parser.parse_args() - - config = load_config(args.config) - registry_cfg = config.get("registry", {}) - images_cfg = config.get("images", {}) - - if not images_cfg: - print("error: no `images` section in config", file=sys.stderr) - sys.exit(1) - - if args.platform == "all": - platforms = list(images_cfg.keys()) - else: - if args.platform not in images_cfg: - print( - f"error: platform `{args.platform}` not found in config", - file=sys.stderr, - ) - sys.exit(1) - platforms = [args.platform] - - commit = get_git_commit(args.commit) - logged_in = docker_login(registry_cfg, args.dry_run) if args.push else True - failed = False - - for platform in platforms: - platform_cfg = images_cfg[platform] - dockerfile_dir = platform_cfg["dockerfile"] - - if not Path(dockerfile_dir).is_dir(): - print( - f"warning: dockerfile directory `{dockerfile_dir}` does not exist," - f" skipping {platform}", - file=sys.stderr, - ) - continue - - if not args.force and not has_dockerfile_changed(dockerfile_dir): - print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) - continue - - ok = build_image( - platform, - platform_cfg, - registry_cfg, - commit, - args.push, - args.dry_run, - logged_in=logged_in, - ) - - if not ok: - failed = True - - if failed: - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py deleted file mode 100644 index b23fee73b..000000000 --- a/.ci/ci_resource.py +++ /dev/null @@ -1,575 +0,0 @@ -#!/usr/bin/env python3 -"""Resource detection and allocation for CI Runner Agent.""" - -import json -import operator -import os -import re -import shutil -import subprocess -import sys -import threading -from dataclasses import dataclass - -# Platform-to-device-env mapping for non-NVIDIA platforms. -# NVIDIA uses Docker's --gpus flag instead of an environment variable. -PLATFORM_DEVICE_ENV = { - "iluvatar": "CUDA_VISIBLE_DEVICES", - "metax": "CUDA_VISIBLE_DEVICES", - "moore": "MTHREADS_VISIBLE_DEVICES", - "cambricon": "MLU_VISIBLE_DEVICES", - "ascend": "ASCEND_VISIBLE_DEVICES", -} - - -@dataclass -class GpuInfo: - index: int - memory_used_mb: float - memory_total_mb: float - utilization_pct: float - - -@dataclass -class SystemResources: - total_memory_mb: float - available_memory_mb: float - cpu_count: int - - -class ResourcePool: - """Thread-safe GPU and system resource manager. - - Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi, mthreads-gmi) - and tracks allocations to enable dynamic parallel scheduling. - """ - - GPU_QUERY_TOOLS = { - "nvidia": "nvidia-smi", - "iluvatar": "ixsmi", - "metax": "mx-smi", - "moore": "mthreads-gmi", - "cambricon": "cnmon", - "ascend": "npu-smi", - } - - def __init__(self, platform, utilization_threshold=10): - self._platform = platform - self._utilization_threshold = utilization_threshold - self._allocated: set[int] = set() - self._lock = threading.Lock() - - @property - def platform(self): - return self._platform - - @property - def allocated(self): - with self._lock: - return set(self._allocated) - - def detect_gpus(self) -> list[GpuInfo]: - """Query GPU status via platform-specific CLI tool.""" - if self._platform == "metax": - return self._detect_gpus_metax() - - if self._platform == "moore": - return self._detect_gpus_moore() - - if self._platform == "cambricon": - return self._detect_gpus_cambricon() - - if self._platform == "ascend": - return self._detect_gpus_ascend() - - tool = self.GPU_QUERY_TOOLS.get(self._platform) - - if not tool: - return [] - - try: - result = subprocess.run( - [ - tool, - "--query-gpu=index,memory.used,memory.total,utilization.gpu", - "--format=csv,noheader,nounits", - ], - capture_output=True, - text=True, - timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - - if result.returncode != 0: - return [] - - gpus = [] - - for line in result.stdout.strip().splitlines(): - parts = [p.strip() for p in line.split(",")] - - if len(parts) < 4: - continue - - try: - gpus.append( - GpuInfo( - index=int(parts[0]), - memory_used_mb=float(parts[1]), - memory_total_mb=float(parts[2]), - utilization_pct=float(parts[3]), - ) - ) - except (ValueError, IndexError): - continue - - return gpus - - def _detect_gpus_metax(self) -> list[GpuInfo]: - """Parse mx-smi output for MetaX GPUs. - - Runs --show-memory and --show-usage separately and merges results. - Output format example: - GPU#0 MXC550 0000:1a:00.0 - Memory - vis_vram total : 67108864 KB - vis_vram used : 879032 KB - Utilization - GPU : 0 % - """ - - def run_mxsmi(flag): - try: - r = subprocess.run( - ["mx-smi", flag], - capture_output=True, - text=True, - timeout=10, - ) - return r.stdout if r.returncode == 0 else "" - except (FileNotFoundError, subprocess.TimeoutExpired): - return "" - - mem_out = run_mxsmi("--show-memory") - util_out = run_mxsmi("--show-usage") - - # Parse memory: collect {index: (used_kb, total_kb)} - mem = {} - current = None - for line in mem_out.splitlines(): - m = re.match(r"GPU#(\d+)", line.strip()) - if m: - current = int(m.group(1)) - mem[current] = [0.0, 0.0] - continue - if current is None: - continue - m = re.search(r"vis_vram total\s*:\s*([\d.]+)\s*KB", line) - if m: - mem[current][1] = float(m.group(1)) / 1024 # KB -> MB - m = re.search(r"vis_vram used\s*:\s*([\d.]+)\s*KB", line) - if m: - mem[current][0] = float(m.group(1)) / 1024 # KB -> MB - - # Parse utilization: collect {index: utilization_pct} - util = {} - current = None - in_util = False - for line in util_out.splitlines(): - m = re.match(r"GPU#(\d+)", line.strip()) - if m: - current = int(m.group(1)) - in_util = False - continue - if current is None: - continue - if "Utilization" in line: - in_util = True - continue - if in_util: - m = re.match(r"\s*GPU\s*:\s*([\d.]+)\s*%", line) - if m: - util[current] = float(m.group(1)) - in_util = False - - gpus = [] - for idx in sorted(mem): - used_mb, total_mb = mem[idx] - gpus.append( - GpuInfo( - index=idx, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util.get(idx, 0.0), - ) - ) - return gpus - - def _detect_gpus_moore(self) -> list[GpuInfo]: - """Parse mthreads-gmi JSON output for Moore Threads GPUs. - - Uses: mthreads-gmi -q --json - Expected JSON structure: - { - "Attached GPUs": { - "GPU 00000000:3B:00.0": { - "Minor Number": "0", - "Memory Usage": { - "Total": "24576 MiB", - "Used": "512 MiB" - }, - "Utilization": { - "Gpu": "5 %" - } - } - } - } - """ - - def extract_number(s): - m = re.search(r"([\d.]+)", str(s)) - return float(m.group(1)) if m else 0.0 - - try: - result = subprocess.run( - ["mthreads-gmi", "-q", "--json"], - capture_output=True, - text=True, - timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - - if result.returncode != 0: - return [] - - try: - data = json.loads(result.stdout) - except json.JSONDecodeError: - return [] - - gpus = [] - attached = data.get("Attached GPUs", {}) - - for gpu_data in attached.values(): - try: - index = int(gpu_data.get("Minor Number", len(gpus))) - - mem = gpu_data.get("Memory Usage", {}) - total_mb = extract_number(mem.get("Total", "0 MiB")) - used_mb = extract_number(mem.get("Used", "0 MiB")) - util_pct = extract_number( - gpu_data.get("Utilization", {}).get("Gpu", "0 %") - ) - - gpus.append( - GpuInfo( - index=index, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util_pct, - ) - ) - except (ValueError, AttributeError): - continue - - return sorted(gpus, key=operator.attrgetter("index")) - - def _detect_gpus_cambricon(self) -> list[GpuInfo]: - """Parse cnmon output for Cambricon MLU cards. - - Each card appears as two consecutive data rows: - Row 1: | {card} {vf} {name} {fw} | {bus_id} | {util}% {ecc} | - Row 2: | {fan}% {temp} {pwr} | {mem_used} MiB/ {mem_total} MiB | ... | - """ - try: - result = subprocess.run( - ["cnmon"], - capture_output=True, - text=True, - timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - - if result.returncode != 0: - return [] - - gpus = [] - lines = result.stdout.splitlines() - i = 0 - - while i < len(lines): - line = lines[i] - # Row 1: "| {index} ... | {bus_id} | {util}% {ecc} |" - m1 = re.match(r"^\|\s+(\d+)\s+.*\|\s*([\d.]+)%", line) - - if m1 and i + 1 < len(lines): - try: - card_index = int(m1.group(1)) - util_pct = float(m1.group(2)) - row2 = lines[i + 1] - mem_m = re.search(r"([\d.]+)\s+MiB/\s*([\d.]+)\s+MiB", row2) - - if mem_m: - used_mb = float(mem_m.group(1)) - total_mb = float(mem_m.group(2)) - else: - used_mb, total_mb = 0.0, 0.0 - - gpus.append( - GpuInfo( - index=card_index, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util_pct, - ) - ) - except (ValueError, AttributeError): - pass - i += 2 - continue - - i += 1 - - return sorted(gpus, key=operator.attrgetter("index")) - - def _detect_gpus_ascend(self) -> list[GpuInfo]: - """Parse npu-smi info output for Huawei Ascend NPUs. - - Output format (pipe-delimited table, two rows per NPU): - | 0 910B4 | OK | 86.5 41 ... - | 0 | 0000:C1:00.0 | 0 0 / 0 2789 / 32768 | - Row 1: index, name, health, power, temp, hugepages. - Row 2: chip_id, bus_id, aicore_util, memory_usage, hbm_usage. - """ - try: - result = subprocess.run( - ["npu-smi", "info"], - capture_output=True, - text=True, - timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - - if result.returncode != 0: - return [] - - gpus = [] - lines = result.stdout.splitlines() - i = 0 - - while i < len(lines): - line = lines[i] - # Match row 1: `| {index} {name} ...`. - m1 = re.match(r"^\|\s+(\d+)\s+", line) - - if m1 and i + 1 < len(lines): - try: - npu_index = int(m1.group(1)) - aicore_m = re.match( - r"^\|\s+\d+\s+\|\s+[\da-f:.]+\s+\|\s*([\d.]+)\s", lines[i + 1] - ) - - util_pct = float(aicore_m.group(1)) if aicore_m else 0.0 - - # Parse HBM usage from row 2. Row contains both DDR - # ("0 / 0") and HBM ("2789 / 32768"); HBM is always last. - hbm_matches = re.findall(r"([\d.]+)\s*/\s*([\d.]+)", lines[i + 1]) - - if hbm_matches: - used_mb = float(hbm_matches[-1][0]) - total_mb = float(hbm_matches[-1][1]) - else: - used_mb, total_mb = 0.0, 0.0 - - gpus.append( - GpuInfo( - index=npu_index, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util_pct, - ) - ) - except (ValueError, AttributeError): - pass - - i += 2 - continue - - i += 1 - - return sorted(gpus, key=operator.attrgetter("index")) - - def detect_system_resources(self) -> SystemResources: - """Read system memory from /proc/meminfo and CPU count.""" - total_mb = 0.0 - available_mb = 0.0 - - try: - with open("/proc/meminfo", encoding="utf-8") as f: - for line in f: - if line.startswith("MemTotal:"): - total_mb = float(line.split()[1]) / 1024 - elif line.startswith("MemAvailable:"): - available_mb = float(line.split()[1]) / 1024 - except OSError: - pass - - return SystemResources( - total_memory_mb=total_mb, - available_memory_mb=available_mb, - cpu_count=os.cpu_count() or 1, - ) - - def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]: - """Try to allocate GPUs and check memory. - - Returns (allocated_gpu_ids, success). On failure returns ([], False). - GPUs are selected by ascending utilization (least loaded first). - Detection runs outside the lock to avoid blocking other threads. - """ - if gpu_count <= 0: - if memory_mb > 0: - sys_res = self.detect_system_resources() - - if sys_res.available_memory_mb < memory_mb: - return ([], False) - - return ([], True) - - # Detect GPUs and memory outside the lock (subprocess.run can block). - gpus = self.detect_gpus() - sys_res = self.detect_system_resources() if memory_mb > 0 else None - - with self._lock: - available = [ - g - for g in gpus - if g.index not in self._allocated - and g.utilization_pct < self._utilization_threshold - ] - - if len(available) < gpu_count: - return ([], False) - - if sys_res is not None and sys_res.available_memory_mb < memory_mb: - return ([], False) - - # Pick least loaded GPUs. - available.sort(key=lambda g: g.utilization_pct) - selected = [g.index for g in available[:gpu_count]] - self._allocated.update(selected) - return (selected, True) - - def release(self, gpu_ids): - """Return GPUs to the free pool.""" - with self._lock: - self._allocated -= set(gpu_ids) - - def get_status(self) -> dict: - """Return current resource status for API endpoints.""" - gpus = self.detect_gpus() - sys_res = self.detect_system_resources() - - with self._lock: - allocated = sorted(self._allocated) - - return { - "platform": self._platform, - "gpus": [ - { - "index": g.index, - "memory_used_mb": g.memory_used_mb, - "memory_total_mb": g.memory_total_mb, - "utilization_pct": g.utilization_pct, - "allocated_by_agent": g.index in allocated, - } - for g in gpus - ], - "allocated_gpu_ids": allocated, - "system": { - "total_memory_mb": round(sys_res.total_memory_mb, 1), - "available_memory_mb": round(sys_res.available_memory_mb, 1), - "cpu_count": sys_res.cpu_count, - }, - "utilization_threshold": self._utilization_threshold, - } - - -def parse_gpu_requirement(job_config) -> int: - """Extract GPU count required by a job. - - Resolution rules: - - - ``gpu_ids: "auto"`` (or omitted) — dynamic allocation; returns ``ngpus`` - (default 1). - - ``gpu_ids: "all"`` — use every available GPU; returns 0 (no reservation). - - ``gpu_ids: "0,2"`` — static pinning; returns the count of listed IDs. - When ``ngpus`` is also present the two must agree. - - The platform name determines how GPUs are exposed to Docker (see - ``PLATFORM_DEVICE_ENV``) but does **not** affect GPU counting here. - """ - resources = job_config.get("resources", {}) - gpu_ids = str(resources.get("gpu_ids", "auto")).strip() - ngpus = resources.get("ngpus") - - if gpu_ids == "all": - return 0 - - if gpu_ids == "auto" or not gpu_ids: - return int(ngpus) if ngpus is not None else 1 - - # Static pinning — count explicit IDs. - count = len(gpu_ids.split(",")) - - if ngpus is not None and int(ngpus) != count: - print( - f"warning: gpu_ids has {count} device(s) but ngpus={ngpus}; " - f"using gpu_ids count ({count})", - file=sys.stderr, - ) - - return count - - -def parse_memory_requirement(job_config) -> float: - """Extract memory requirement in MB from a job config.""" - resources = job_config.get("resources", {}) - memory = str(resources.get("memory", "")) - - if not memory: - return 0 - - memory = memory.lower().strip() - - if memory.endswith("gb"): - return float(memory[:-2]) * 1024 - elif memory.endswith("g"): - return float(memory[:-1]) * 1024 - elif memory.endswith("mb"): - return float(memory[:-2]) - elif memory.endswith("m"): - return float(memory[:-1]) - - try: - return float(memory) * 1024 # Default: GB - except ValueError: - print( - f"warning: unrecognized memory format {memory!r}, treating as 0", - file=sys.stderr, - ) - - return 0 - - -def detect_platform(): - """Auto-detect the current platform by probing GPU query tools on PATH.""" - for platform, tool in ResourcePool.GPU_QUERY_TOOLS.items(): - if shutil.which(tool): - return platform - - return None diff --git a/.ci/github_status.py b/.ci/github_status.py deleted file mode 100644 index f8f017f11..000000000 --- a/.ci/github_status.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -"""GitHub Commit Status API wrapper using urllib (zero external dependencies).""" - -import json -import os -import re -import sys -import urllib.error -import urllib.request - - -def parse_repo_url(url): - """Extract (owner, repo) from a GitHub URL. - - Handles: - - https://github.com/Owner/Repo.git - - git@github.com:Owner/Repo.git - """ - # HTTPS format - m = re.match(r"https?://[^/]+/([^/]+)/([^/]+?)(?:\.git)?$", url) - - if m: - return m.group(1), m.group(2) - - # SSH format - m = re.match(r"git@[^:]+:([^/]+)/([^/]+?)(?:\.git)?$", url) - - if m: - return m.group(1), m.group(2) - - return "", "" - - -def build_status_context(prefix, job_name): - """Build status context string, e.g. 'ci/infiniops/nvidia_gpu'.""" - return f"{prefix}/{job_name}" - - -def post_commit_status( - owner, - repo, - sha, - state, - context, - description, - target_url=None, - token=None, -): - """Post a commit status to GitHub. - - Args: - state: One of 'pending', 'success', 'failure', 'error'. - Returns True on success, False on failure. - """ - token = token or os.environ.get("GITHUB_TOKEN", "") - - if not token: - print("warning: GITHUB_TOKEN not set, skipping status update", file=sys.stderr) - return False - - if not owner or not repo or not sha: - print( - "warning: missing owner/repo/sha, skipping status update", file=sys.stderr - ) - return False - - url = f"https://api.github.com/repos/{owner}/{repo}/statuses/{sha}" - body = { - "state": state, - "context": context, - "description": description[:140], - } - - if target_url: - body["target_url"] = target_url - - data = json.dumps(body).encode("utf-8") - req = urllib.request.Request( - url, - data=data, - headers={ - "Authorization": f"token {token}", - "Accept": "application/vnd.github.v3+json", - "Content-Type": "application/json", - }, - method="POST", - ) - - try: - with urllib.request.urlopen(req, timeout=30) as resp: - return 200 <= resp.status < 300 - except urllib.error.HTTPError as e: - print( - f"warning: GitHub status API returned {e.code}: {e.reason}", - file=sys.stderr, - ) - return False - except urllib.error.URLError as e: - print(f"warning: GitHub status API error: {e.reason}", file=sys.stderr) - return False diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile deleted file mode 100644 index a542b99e0..000000000 --- a/.ci/images/ascend/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG PIP_INDEX_URL=https://pypi.org/simple - -RUN pip install --no-cache-dir --progress off \ - scikit-build-core \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - ruff - -ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest -ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH} -ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH} -ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH} -ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME} -ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp -ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit - -WORKDIR /workspace diff --git a/.ci/images/cambricon/Dockerfile b/.ci/images/cambricon/Dockerfile deleted file mode 100644 index 138f3cb47..000000000 --- a/.ci/images/cambricon/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -# Python 3.10 executables (`pip`-installed tools) live under `/usr/local/python3.10/bin`. -ENV PATH=/usr/local/python3.10/bin:${PATH} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -# `git` and `cmake` are pre-installed; `coreutils-single` covers coreutils needs. -RUN dnf install -y ninja-build && dnf clean all - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - ruff==0.15.7 - -# Pin pre-installed Cambricon `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/iluvatar/Dockerfile b/.ci/images/iluvatar/Dockerfile deleted file mode 100644 index 79afc8585..000000000 --- a/.ci/images/iluvatar/Dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# CoreX runtime environment (base image sets these in `/etc/bash.bashrc`, -# but `docker build` `RUN` uses `/bin/sh` which doesn't source it). -ENV PATH=/usr/local/corex/bin:/usr/local/corex-4.3.0/corex-toolbox-1.0.0/bin:/usr/local/corex/lib64/python3/dist-packages/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages -ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/lib:/usr/local/openmpi/lib - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - ninja-build \ - coreutils \ - && rm -rf /var/lib/apt/lists/* - -RUN ln -sf $(which python3) /usr/local/bin/python 2>/dev/null || true - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -RUN pip config set global.index-url https://pypi.org/simple - -# Pin pre-installed CoreX `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/metax/Dockerfile b/.ci/images/metax/Dockerfile deleted file mode 100644 index 540bc9d56..000000000 --- a/.ci/images/metax/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# `conda` Python is used in this image. -ENV PATH=/opt/conda/bin:${PATH} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - cmake \ - ninja-build \ - coreutils \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -# Pin pre-installed MetaX `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/moore/Dockerfile b/.ci/images/moore/Dockerfile deleted file mode 100644 index a95d9bd15..000000000 --- a/.ci/images/moore/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# `MUSA_HOME`, `PATH`, `LD_LIBRARY_PATH` already set by base image. - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - ninja-build \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - libclang \ - pytest-cov \ - pytest-xdist \ - ruff==0.15.7 - -# Pin pre-installed `torch` to prevent `pip` from replacing it with upstream version. -RUN echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile deleted file mode 100644 index b4984dac2..000000000 --- a/.ci/images/nvidia/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - cmake \ - ninja-build \ - coreutils \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -# Pin pre-installed `torch` to prevent `pip` from replacing it with a different version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/restart-agent.sh b/.ci/restart-agent.sh deleted file mode 100755 index efe0a9001..000000000 --- a/.ci/restart-agent.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# Usage: bash .ci/restart-agent.sh [port] [webhook-secret] -# -# Restart the CI agent with proxy configured. -# Edit the HTTPS_PROXY line below for your environment, then: -# bash .ci/restart-agent.sh -# bash .ci/restart-agent.sh 8080 my-webhook-secret - -set -euo pipefail - -PORT="${1:-8080}" -WEBHOOK_SECRET="${2:-}" - -# --- Proxy config (edit this) --- -export HTTPS_PROXY="http://your-proxy:port" -export HTTP_PROXY="$HTTPS_PROXY" -export NO_PROXY="localhost,127.0.0.1" -export https_proxy="$HTTPS_PROXY" -export http_proxy="$HTTP_PROXY" -export no_proxy="$NO_PROXY" - -# --- Kill existing agent --- -if pgrep -f "agent.py serve" > /dev/null 2>&1; then - echo "Stopping existing agent..." - pkill -f "agent.py serve" || true - sleep 2 -fi - -# --- Start agent --- -CI_DIR="$(cd "$(dirname "$0")" && pwd)" - -if [ ! -f "$CI_DIR/agent.py" ]; then - echo "error: $CI_DIR/agent.py not found" - exit 1 -fi - -ARGS="serve --port $PORT" -if [ -n "$WEBHOOK_SECRET" ]; then - ARGS="$ARGS --webhook-secret $WEBHOOK_SECRET" -fi - -echo "Starting CI agent on port $PORT..." -nohup python "$CI_DIR/agent.py" $ARGS > /tmp/ci-agent.log 2>&1 & - -HOST_IP=$(hostname -I 2>/dev/null | awk '{print $1}' || hostname) - -echo "PID: $!" -echo "Listen: http://${HOST_IP}:${PORT}" -echo "Log: /tmp/ci-agent.log" -echo "Proxy: $HTTPS_PROXY" diff --git a/.ci/run.py b/.ci/run.py deleted file mode 100644 index e293b4a28..000000000 --- a/.ci/run.py +++ /dev/null @@ -1,499 +0,0 @@ -#!/usr/bin/env python3 -"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" - -import argparse -import os -import re -import shlex -import subprocess -import sys -import uuid -import xml.etree.ElementTree as ET -from datetime import datetime -from pathlib import Path - -from ci_resource import ( - PLATFORM_DEVICE_ENV, - ResourcePool, - detect_platform, - parse_gpu_requirement, - parse_memory_requirement, -) -from utils import get_git_commit, load_config - -# Flags that consume the next token as their value (e.g. -n 4, -k expr). -_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"} - - -def _junit_xml_indicates_pass(results_dir): - """Return True if `pytest` junit XML under `results_dir` reports no failures/errors. - - Used to distinguish a real CI failure from the docker 18.09 - container-teardown `SIGKILL` (exit code 137) that occurs on this host - after a child process exits successfully — bash returns 0 from inside - the container, but the docker daemon reports 137 due to a race in its - `--rm` cleanup path. The junit XML is written by pytest before that - teardown and reliably captures the real outcome of the test stage. - """ - for junit in Path(results_dir).rglob("test-results.xml"): - try: - root = ET.parse(junit).getroot() - except ET.ParseError: - continue - - suites = root.findall("testsuite") if root.tag == "testsuites" else [root] - - if not suites: - continue - - for suite in suites: - try: - if int(suite.get("failures", 0)) > 0: - return False - - if int(suite.get("errors", 0)) > 0: - return False - except ValueError: - return False - - return True - - return False - - -def apply_test_override(run_cmd, test_path): - """Replace positional test path(s) in a pytest stage command. - - For example: ``pytest tests/ -n 4 ...`` becomes - ``pytest tests/test_gemm.py -n 4 ...`` when ``test_path`` is - ``tests/test_gemm.py``. - """ - parts = shlex.split(run_cmd) - - if not parts or parts[0] != "pytest": - return run_cmd - - result = ["pytest", test_path] - skip_next = False - - for p in parts[1:]: - if skip_next: - result.append(p) - skip_next = False - continue - - if p.startswith("-"): - result.append(p) - if p in _PYTEST_VALUE_FLAGS: - skip_next = True - continue - - # Skip existing test paths; the override is already in result[1]. - if not ("/" in p or p.endswith(".py") or "::" in p): - result.append(p) - - return shlex.join(result) - - -def build_results_dir(base, platform, stages, commit): - """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}_{id}`.""" - stage_names = "+".join(s["name"] for s in stages) - safe_commit = re.sub(r"[^a-zA-Z0-9._-]", "", commit) or "unknown" - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - short_id = uuid.uuid4().hex[:6] - dirname = f"{platform}_{stage_names}_{safe_commit}_{timestamp}_{short_id}" - - return Path(base) / dirname - - -def resolve_image(config, platform, image_tag): - """Resolve an image reference to a full image name. - - Accepts `stable`, `latest`, or a commit hash as `image_tag`. When config - contains a registry section, returns a registry-prefixed URL. Otherwise - returns a local tag (current default). - """ - registry = config.get("registry", {}) - registry_url = registry.get("url", "") - project = registry.get("project", "infiniops") - - if not registry_url: - return f"{project}-ci/{platform}:{image_tag}" - - return f"{registry_url}/{project}/{platform}:{image_tag}" - - -def build_runner_script(): - return r""" -set -e -cd /workspace -mkdir -p /workspace/results -if [ -n "$LOCAL_SRC" ]; then - cp -r "$LOCAL_SRC" /tmp/src - cd /tmp/src -else - git clone "$REPO_URL" repo - cd repo - git checkout "$BRANCH" -fi -echo "========== Setup ==========" -eval "$SETUP_CMD" -set +e -rc=0 -for i in $(seq 1 "$NUM_STAGES"); do - name_var="STAGE_${i}_NAME" - cmd_var="STAGE_${i}_CMD" - name="${!name_var}" - cmd="${!cmd_var}" - echo "========== Stage: $name ==========" - if [ -n "$cmd" ]; then - eval "$cmd" - rc=$? - if [ $rc -ne 0 ]; then - echo "Stage '$name' failed with exit code $rc" - break - fi - fi -done -echo "========== Summary ==========" -if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then - chown -R "$HOST_UID:$HOST_GID" /workspace/results 2>/dev/null || true -fi -exit $rc -""" - - -def build_docker_args( - config, - job_name, - repo_url, - branch, - stages, - workdir, - image_tag_override, - gpu_id_override=None, - results_dir=None, - local_path=None, -): - job = config["jobs"][job_name] - platform = job.get("platform", "nvidia") - image_tag = image_tag_override or job.get("image", "latest") - image = resolve_image(config, platform, image_tag) - resources = job.get("resources", {}) - setup_raw = job.get("setup", "pip install .[dev]") - - if isinstance(setup_raw, list): - setup_cmd = "\n".join(setup_raw) - else: - setup_cmd = setup_raw - - args = [ - "docker", - "run", - "--rm", - "--network", - "host", - "-i", - "-w", - workdir, - "-e", - f"REPO_URL={repo_url}", - "-e", - f"BRANCH={branch}", - "-e", - f"SETUP_CMD={setup_cmd}", - "-e", - f"NUM_STAGES={len(stages)}", - "-e", - f"HOST_UID={os.getuid()}", - "-e", - f"HOST_GID={os.getgid()}", - ] - - for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) - - if proxy_val: - args.extend(["-e", f"{proxy_var}={proxy_val}"]) - args.extend(["-e", f"{proxy_var.lower()}={proxy_val}"]) - - for key, value in job.get("env", {}).items(): - args.extend(["-e", f"{key}={value}"]) - - if results_dir: - args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"]) - - if local_path: - args.extend(["-v", f"{local_path}:/workspace/repo:ro"]) - args.extend(["-e", "LOCAL_SRC=/workspace/repo"]) - - for i, s in enumerate(stages): - args.append("-e") - args.append(f"STAGE_{i + 1}_NAME={s['name']}") - args.append("-e") - args.append(f"STAGE_{i + 1}_CMD={s.get('run', '')}") - - # Platform-specific device access - for flag in job.get("docker_args", []): - args.append(flag) - - for vol in job.get("volumes", []): - args.extend(["-v", vol]) - - raw_gpu_ids = str(resources.get("gpu_ids", "auto")).strip() - gpu_id = gpu_id_override or ("" if raw_gpu_ids == "auto" else raw_gpu_ids) - - if gpu_id: - if platform == "nvidia": - args.extend(["--gpus", "all" if gpu_id == "all" else f"device={gpu_id}"]) - elif gpu_id != "all": - device_env = PLATFORM_DEVICE_ENV.get(platform) - - if device_env: - args.extend(["-e", f"{device_env}={gpu_id}"]) - - memory = resources.get("memory") - - if memory: - mem = str(memory).lower().replace("gb", "g").replace("mb", "m") - - if not mem.endswith("g") and not mem.endswith("m"): - mem = f"{mem}g" - - args.extend(["--memory", mem]) - - shm_size = resources.get("shm_size") - - if shm_size: - args.extend(["--shm-size", str(shm_size)]) - - timeout_sec = resources.get("timeout") - args.append(image) - - if timeout_sec: - # Requires coreutils `timeout` inside the container image. - args.extend(["timeout", str(timeout_sec)]) - - args.extend(["bash", "-c", build_runner_script().strip()]) - - return args - - -def resolve_job_names(jobs, platform=None, job=None): - """Resolve job names for a platform. - - - ``job=None`` — all jobs for the platform. - - ``job="nvidia_gpu"`` — direct lookup by full name. - - Raises ``ValueError`` if no matching jobs are found. - """ - if job: - if job not in jobs: - raise ValueError(f"job {job!r} not found in config") - - return [job] - - if not platform: - return list(jobs.keys()) - - matches = [name for name, cfg in jobs.items() if cfg.get("platform") == platform] - - if not matches: - raise ValueError(f"no jobs for platform {platform!r}") - - return matches - - -def main(): - parser = argparse.ArgumentParser(description="Run Docker CI pipeline") - parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - help="Path to config.yaml", - ) - parser.add_argument( - "--branch", type=str, help="Override repo branch (default: config repo.branch)" - ) - parser.add_argument( - "--job", - type=str, - help="Job name (e.g. nvidia_gpu, ascend_npu). Default: all jobs for detected platform", - ) - parser.add_argument( - "--stage", - type=str, - help="Run only this stage name (still runs setup first)", - ) - parser.add_argument( - "--image-tag", - type=str, - help="Override image tag (stable, latest, or commit hash)", - ) - parser.add_argument( - "--gpu-id", - type=str, - help='GPU device IDs to use, e.g. "0", "0,2", "all"', - ) - parser.add_argument( - "--results-dir", - type=Path, - default=Path("ci-results"), - help="Base directory for test results (default: ./ci-results)", - ) - parser.add_argument( - "--test", - type=str, - help='Override pytest test path, e.g. "tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"', - ) - parser.add_argument( - "--local", - action="store_true", - help="Mount current directory (read-only) into the container instead of cloning from git", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print docker command and exit", - ) - args = parser.parse_args() - - config = load_config(args.config) - repo = config.get("repo", {}) - repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") - branch = args.branch or repo.get("branch", "master") - - platform = detect_platform() - - if not platform: - tools = ", ".join(ResourcePool.GPU_QUERY_TOOLS.values()) - print(f"error: could not detect platform (no {tools} found)", file=sys.stderr) - sys.exit(1) - - print(f"platform: {platform}", file=sys.stderr) - - jobs = config.get("jobs", {}) - - if not jobs: - print("error: no jobs in config", file=sys.stderr) - sys.exit(1) - - try: - job_names = resolve_job_names(jobs, platform, job=args.job) - except ValueError as e: - print(f"error: {e}", file=sys.stderr) - sys.exit(1) - - pool = ResourcePool(platform) - failed = 0 - - for job_name in job_names: - job = jobs[job_name] - all_stages = job.get("stages", []) - - if args.stage: - stages = [s for s in all_stages if s["name"] == args.stage] - - if not stages: - print( - f"error: stage {args.stage!r} not found in {job_name}", - file=sys.stderr, - ) - sys.exit(1) - else: - stages = all_stages - - if args.test: - stages = [ - {**s, "run": apply_test_override(s.get("run", ""), args.test)} - for s in stages - ] - - # Resolve GPU assignment: CLI override > auto-allocate > static config. - gpu_id_override = args.gpu_id - allocated_ids = [] - raw_gpu_ids = str(job.get("resources", {}).get("gpu_ids", "auto")).strip() - - if not gpu_id_override and raw_gpu_ids == "auto": - gpu_count = parse_gpu_requirement(job) - memory_mb = parse_memory_requirement(job) - allocated_ids, ok = pool.allocate(gpu_count, memory_mb) - - if not ok: - detected = pool.detect_gpus() - if not detected: - hint = ( - f"error: cannot allocate {gpu_count} GPU(s) for {job_name}" - f" — GPU detection returned no devices" - f" (is {ResourcePool.GPU_QUERY_TOOLS.get(platform, '?')} working?)" - f"\nhint: use --gpu-id 0 to bypass auto-allocation" - ) - else: - hint = ( - f"error: cannot allocate {gpu_count} GPU(s) for {job_name}" - f" — {len(detected)} GPU(s) detected but none available" - f" (utilization threshold: {pool._utilization_threshold}%)" - f"\nhint: use --gpu-id 0 to bypass auto-allocation" - ) - print(hint, file=sys.stderr) - failed += 1 - continue - - if allocated_ids: - gpu_id_override = ",".join(str(g) for g in allocated_ids) - - job_platform = job.get("platform", platform) - commit = get_git_commit() - results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) - - local_path = Path.cwd().resolve() if args.local else None - docker_args = build_docker_args( - config, - job_name, - repo_url, - branch, - stages, - "/workspace", - args.image_tag, - gpu_id_override=gpu_id_override, - results_dir=results_dir, - local_path=local_path, - ) - - if args.dry_run: - print(shlex.join(docker_args)) - pool.release(allocated_ids) - continue - - print(f"==> running job: {job_name}", file=sys.stderr) - results_dir.mkdir(parents=True, exist_ok=True) - - try: - returncode = subprocess.run(docker_args).returncode - finally: - pool.release(allocated_ids) - - if returncode != 0: - # Docker 18.09 on this host occasionally SIGKILLs containers - # during `--rm` cleanup after the inner process already exited - # cleanly, producing exit code 137. Fall back to the pytest - # junit XML to recover the real outcome in that case. - if returncode == 137 and _junit_xml_indicates_pass(results_dir): - print( - f"[warn] job {job_name}: container exited with 137 " - f"(likely docker teardown SIGKILL after clean pytest); " - f"junit XML reports no failures — treating as success", - file=sys.stderr, - ) - else: - print( - f"job {job_name} failed (exit code {returncode})", - file=sys.stderr, - ) - failed += 1 - - sys.exit(1 if failed else 0) - - -if __name__ == "__main__": - main() diff --git a/.ci/tests/__init__.py b/.ci/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py deleted file mode 100644 index 7b0287642..000000000 --- a/.ci/tests/conftest.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys -from pathlib import Path - -# Allow `import run` and `import build` directly. -sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) - -import pytest - -from utils import normalize_config - - -@pytest.fixture -def minimal_config(): - """Minimal platform-centric config, normalized to flat format.""" - raw = { - "repo": { - "url": "https://github.com/InfiniTensor/InfiniOps.git", - "branch": "master", - }, - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [ - { - "name": "test", - "run": "pytest tests/ -v", - } - ], - } - }, - } - }, - } - return normalize_config(raw) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py deleted file mode 100644 index a0c8ccccf..000000000 --- a/.ci/tests/test_agent.py +++ /dev/null @@ -1,724 +0,0 @@ -import hashlib -import hmac -import json -import threading -from pathlib import Path -from unittest.mock import MagicMock - -import pytest - -import agent -import ci_resource as res -import run -from utils import normalize_config - - -# --------------------------------------------------------------------------- -# Test fixtures. -# --------------------------------------------------------------------------- - - -@pytest.fixture -def agent_config(): - raw = { - "repo": { - "url": "https://github.com/InfiniTensor/InfiniOps.git", - "branch": "master", - }, - "github": { - "status_context_prefix": "ci/infiniops", - }, - "agents": { - "nvidia": {"url": "http://nvidia-host:8080"}, - "iluvatar": {"url": "http://iluvatar-host:8080"}, - }, - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - }, - }, - }, - "iluvatar": { - "image": { - "dockerfile": ".ci/images/iluvatar/", - "build_args": {"BASE_IMAGE": "corex:qs_pj20250825"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - }, - }, - }, - }, - } - return normalize_config(raw) - - -@pytest.fixture -def mock_resource_pool(): - pool = MagicMock(spec=res.ResourcePool) - pool.platform = "nvidia" - pool.allocate.return_value = ([0], True) - pool.release.return_value = None - pool.get_status.return_value = { - "platform": "nvidia", - "gpus": [], - "allocated_gpu_ids": [], - "system": {}, - } - return pool - - -# --------------------------------------------------------------------------- -# Tests for `resolve_job_names`. -# --------------------------------------------------------------------------- - - -def test_resolve_job_names_by_name(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], job="nvidia_gpu") - assert jobs == ["nvidia_gpu"] - - -def test_resolve_job_names_by_platform(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], platform="nvidia") - assert jobs == ["nvidia_gpu"] - - -def test_resolve_job_names_by_platform_iluvatar(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], platform="iluvatar") - assert jobs == ["iluvatar_gpu"] - - -def test_resolve_job_names_all(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"]) - assert set(jobs) == {"nvidia_gpu", "iluvatar_gpu"} - - -def test_resolve_job_names_invalid(agent_config): - with pytest.raises(ValueError, match="not_exist"): - run.resolve_job_names(agent_config["jobs"], job="not_exist") - - -# --------------------------------------------------------------------------- -# Tests for `verify_signature`. -# --------------------------------------------------------------------------- - - -def test_verify_signature_valid(): - secret = "my-secret" - body = b'{"action": "push"}' - sig = "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() - assert agent.verify_signature(secret, body, sig) is True - - -def test_verify_signature_invalid(): - assert agent.verify_signature("secret", b"body", "sha256=wrong") is False - - -def test_verify_signature_empty(): - assert agent.verify_signature("secret", b"body", "") is False - - -# --------------------------------------------------------------------------- -# Tests for `JobRequest` and `JobResult`. -# --------------------------------------------------------------------------- - - -def test_job_request_fields(agent_config): - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - assert req.job_name == "nvidia_gpu" - assert req.platform == "nvidia" - assert req.commit_sha == "abc123" - assert len(req.job_id) == 8 - d = req.to_dict() - assert d["job_name"] == "nvidia_gpu" - - -def test_job_result_success(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 42.5) - assert r.state == "success" - - -def test_job_result_failure(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 1, Path("/tmp/res"), 10.0) - assert r.state == "failure" - - -# --------------------------------------------------------------------------- -# Tests for the `Scheduler` class. -# --------------------------------------------------------------------------- - - -def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch): - monkeypatch.setattr("subprocess.run", lambda cmd, **kw: MagicMock(returncode=0)) - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - results_dir=Path("/tmp/test-results"), - no_status=True, - dry_run=True, - ) - req = agent.JobRequest( - "nvidia_gpu", - "master", - "abc123", - agent_config, - results_dir=Path("/tmp/test-results"), - ) - scheduler.submit(req) - results = scheduler.wait_all() - assert len(results) == 1 - assert results[0].state == "success" - - -def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): - pool = MagicMock(spec=res.ResourcePool) - pool.allocate.return_value = ([], False) - pool.get_status.return_value = { - "platform": "nvidia", - "gpus": [], - "allocated_gpu_ids": [], - "system": {}, - } - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - pool, - no_status=True, - dry_run=False, - ) - - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - scheduler.submit(req) - - info = scheduler.get_job(req.job_id) - assert info["state"] == "queued" - - -def test_scheduler_get_status(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - - status = scheduler.get_status() - assert "queued" in status - assert "running" in status - assert "completed" in status - assert "resources" in status - - -# --------------------------------------------------------------------------- -# Tests for `WebhookHandler` push event parsing. -# --------------------------------------------------------------------------- - - -def test_webhook_parse_push(): - handler = agent.WebhookHandler.__new__(agent.WebhookHandler) - payload = {"ref": "refs/heads/feat/test", "after": "abc123def456"} - branch, sha = handler._parse_push(payload) - assert branch == "feat/test" - assert sha == "abc123def456" - - -def test_webhook_parse_pr(): - handler = agent.WebhookHandler.__new__(agent.WebhookHandler) - payload = { - "pull_request": { - "head": { - "ref": "feat/pr-branch", - "sha": "def789", - } - } - } - branch, sha = handler._parse_pull_request(payload) - assert branch == "feat/pr-branch" - assert sha == "def789" - - -# --------------------------------------------------------------------------- -# Integration-style webhook HTTP tests. -# --------------------------------------------------------------------------- - - -def _urlopen_no_proxy(url_or_req, **kwargs): - """`urlopen` mock that bypasses any `HTTP_PROXY`.""" - import urllib.request - - opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) - return opener.open(url_or_req, **kwargs) - - -def test_health_endpoint(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - try: - resp = _urlopen_no_proxy(f"http://127.0.0.1:{port}/health", timeout=5) - data = json.loads(resp.read()) - assert data["status"] == "ok" - assert data["platform"] == "nvidia" - finally: - server.server_close() - - -def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={"Content-Type": "application/json"}, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - assert len(data["job_ids"]) >= 1 - finally: - server.server_close() - - -def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - secret = "test-secret" - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - webhook_secret=secret, - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - payload = json.dumps( - { - "ref": "refs/heads/master", - "after": "abc123def456", - } - ).encode() - sig = "sha256=" + hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() - - req = urllib.request.Request( - f"http://127.0.0.1:{port}/webhook", - data=payload, - headers={ - "Content-Type": "application/json", - "X-GitHub-Event": "push", - "X-Hub-Signature-256": sig, - }, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - finally: - server.server_close() - - -def test_webhook_invalid_signature(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - webhook_secret="real-secret", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - payload = b'{"ref": "refs/heads/master", "after": "abc"}' - req = urllib.request.Request( - f"http://127.0.0.1:{port}/webhook", - data=payload, - headers={ - "Content-Type": "application/json", - "X-GitHub-Event": "push", - "X-Hub-Signature-256": "sha256=invalid", - }, - ) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req, timeout=5) - - assert exc_info.value.code == 401 - finally: - server.server_close() - - -# --------------------------------------------------------------------------- -# Tests for API token authentication. -# --------------------------------------------------------------------------- - - -def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): - """When `api_token` is set, `/api/run` rejects requests without a valid token.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - api_token="my-secret-token", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={"Content-Type": "application/json"}, - ) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req, timeout=5) - - assert exc_info.value.code == 401 - finally: - server.server_close() - - -def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): - """When `api_token` is set, `/api/run` accepts requests with a correct Bearer token.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - api_token="my-secret-token", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer my-secret-token", - }, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - finally: - server.server_close() - - -# --------------------------------------------------------------------------- -# Tests for queue backpressure. -# --------------------------------------------------------------------------- - - -def test_scheduler_rejects_when_queue_full(agent_config, monkeypatch): - """Scheduler raises QueueFullError when queue is at capacity.""" - pool = MagicMock(spec=res.ResourcePool) - pool.allocate.return_value = ([], False) # Never allocate → jobs stay queued. - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - pool, - no_status=True, - dry_run=False, - ) - - # Fill queue to capacity. - monkeypatch.setattr(agent, "MAX_QUEUE_SIZE", 3) - - for _ in range(3): - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - scheduler.submit(req) - - # Next submit should fail. - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - - with pytest.raises(agent.QueueFullError): - scheduler.submit(req) - - -# --------------------------------------------------------------------------- -# Tests for `poll_remote_job` error logging. -# --------------------------------------------------------------------------- - - -def test_poll_remote_job_logs_errors(monkeypatch, capsys): - """`poll_remote_job` warns on first failure instead of silently swallowing.""" - call_count = 0 - - def fake_urlopen(req, **kwargs): - nonlocal call_count - call_count += 1 - raise ConnectionError("connection refused") - - monkeypatch.setattr(agent, "urllib_urlopen", fake_urlopen) - monkeypatch.setattr(agent, "urllib_request", lambda url: url) - - result = agent.poll_remote_job( - "http://fake:8080", "job1", interval=0.01, timeout=0.05 - ) - assert result is None - - captured = capsys.readouterr() - assert "connection refused" in captured.err - assert "warning:" in captured.err - - -# --------------------------------------------------------------------------- -# Tests for `JobResult` `log_file` field. -# --------------------------------------------------------------------------- - - -def test_job_result_includes_log_file(): - r = agent.JobResult( - "id1", - "nvidia_gpu", - "abc", - 1, - Path("/tmp/res"), - 10.0, - error_tail=["error"], - log_file=Path("/tmp/res/job.log"), - ) - d = r.to_dict() - assert d["log_file"] == "/tmp/res/job.log" - - -def test_job_result_omits_log_file_when_none(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 5.0) - d = r.to_dict() - assert "log_file" not in d - - -# --------------------------------------------------------------------------- -# Tests for `/api/job/{id}/log` endpoint. -# --------------------------------------------------------------------------- - - -def test_job_log_endpoint(agent_config, mock_resource_pool, monkeypatch, tmp_path): - """`GET /api/job/{id}/log` returns the full log file content.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - - # Manually inject a completed job with a log file. - log_file = tmp_path / "job.log" - log_file.write_text("line 1\nline 2\nline 3\n") - - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - result = agent.JobResult( - req.job_id, - "nvidia_gpu", - "abc123", - 0, - tmp_path, - 1.0, - log_file=log_file, - ) - - with scheduler._lock: - scheduler._jobs[req.job_id] = { - "request": req, - "result": result, - "state": "success", - "gpu_ids": [], - } - - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - url = f"http://127.0.0.1:{port}/api/job/{req.job_id}/log" - req_http = urllib.request.Request(url) - - try: - resp = _urlopen_no_proxy(req_http, timeout=5) - body = resp.read().decode("utf-8") - assert "line 1" in body - assert "line 2" in body - assert "line 3" in body - assert resp.headers["Content-Type"] == "text/plain; charset=utf-8" - finally: - server.server_close() - - -def test_job_log_endpoint_not_found(agent_config, mock_resource_pool): - """`GET /api/job/{id}/log` returns 404 for unknown job.""" - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - url = f"http://127.0.0.1:{port}/api/job/nonexist/log" - req_http = urllib.request.Request(url) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req_http, timeout=5) - - assert exc_info.value.code == 404 - finally: - server.server_close() diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py deleted file mode 100644 index df25d6063..000000000 --- a/.ci/tests/test_build.py +++ /dev/null @@ -1,207 +0,0 @@ -from unittest.mock import MagicMock - -import build - - -# --------------------------------------------------------------------------- -# Tests for `build_image_tag`. -# --------------------------------------------------------------------------- - - -def test_build_image_tag_with_registry(): - tag = build.build_image_tag("localhost:5000", "infiniops", "nvidia", "latest") - assert tag == "localhost:5000/infiniops/nvidia:latest" - - -def test_build_image_tag_without_registry(): - tag = build.build_image_tag("", "infiniops", "nvidia", "abc1234") - assert tag == "infiniops-ci/nvidia:abc1234" - - -def test_build_image_tag_commit_hash(): - tag = build.build_image_tag( - "registry.example.com:5000", "proj", "ascend", "deadbeef" - ) - assert tag == "registry.example.com:5000/proj/ascend:deadbeef" - - -# --------------------------------------------------------------------------- -# Tests for `has_dockerfile_changed`. -# --------------------------------------------------------------------------- - - -def test_has_dockerfile_changed_true_when_stdout_nonempty(monkeypatch): - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=0, stdout="Dockerfile\n"), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is True - - -def test_has_dockerfile_changed_false_when_stdout_empty(monkeypatch): - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=0, stdout=""), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is False - - -def test_has_dockerfile_changed_true_on_git_error(monkeypatch): - # Shallow clone or initial commit: `git diff` returns non-zero. - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=128, stdout=""), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is True - - -# --------------------------------------------------------------------------- -# Tests for `docker_login`. -# --------------------------------------------------------------------------- - - -def test_docker_login_no_credentials_env(monkeypatch): - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - result = build.docker_login({"url": "localhost:5000"}, dry_run=False) - assert result is True - assert not called - - -def test_docker_login_token_not_set(monkeypatch): - monkeypatch.delenv("REGISTRY_TOKEN", raising=False) - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=False) - assert result is False - assert not called - - -def test_docker_login_dry_run_does_not_call_subprocess(monkeypatch): - monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=True) - assert result is True - assert not called - - -def test_docker_login_success(monkeypatch): - monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") - captured = {} - - def mock_run(cmd, **kwargs): - captured["cmd"] = cmd - return MagicMock(returncode=0) - - monkeypatch.setattr("subprocess.run", mock_run) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=False) - assert result is True - assert "docker" in captured["cmd"] - assert "login" in captured["cmd"] - - -# --------------------------------------------------------------------------- -# Tests for `build_image` dry-run mode and proxy forwarding. -# --------------------------------------------------------------------------- - - -def _platform_cfg(): - return { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - } - - -def _registry_cfg(): - return {"url": "localhost:5000", "project": "infiniops"} - - -def test_build_image_dry_run_no_subprocess(monkeypatch, capsys): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=True, - logged_in=True, - ) - assert not called - captured = capsys.readouterr() - assert "[dry-run]" in captured.out - - -def test_build_image_dry_run_output_contains_image_tag(monkeypatch, capsys): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - monkeypatch.setattr("subprocess.run", lambda *a, **kw: MagicMock(returncode=0)) - build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=True, - logged_in=True, - ) - captured = capsys.readouterr() - assert "abc1234" in captured.out - - -def test_build_image_proxy_in_build_args(monkeypatch): - monkeypatch.setenv("HTTP_PROXY", "http://proxy.test:3128") - captured = {} - - def mock_run(cmd, **kwargs): - captured["cmd"] = cmd - return MagicMock(returncode=0) - - monkeypatch.setattr("subprocess.run", mock_run) - build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=False, - logged_in=True, - ) - joined = " ".join(captured["cmd"]) - assert "HTTP_PROXY=http://proxy.test:3128" in joined - assert "http_proxy=http://proxy.test:3128" in joined - - -def test_build_image_returns_false_on_docker_error(monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - monkeypatch.setattr("subprocess.run", lambda *a, **kw: MagicMock(returncode=1)) - result = build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=False, - logged_in=True, - ) - assert result is False diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py deleted file mode 100644 index 9e29c7922..000000000 --- a/.ci/tests/test_github_status.py +++ /dev/null @@ -1,145 +0,0 @@ -import json -from unittest.mock import MagicMock - - -import github_status as gh - - -# --------------------------------------------------------------------------- -# Tests for `parse_repo_url`. -# --------------------------------------------------------------------------- - - -def test_parse_repo_url_https(): - owner, repo = gh.parse_repo_url("https://github.com/InfiniTensor/InfiniOps.git") - assert owner == "InfiniTensor" - assert repo == "InfiniOps" - - -def test_parse_repo_url_https_no_git(): - owner, repo = gh.parse_repo_url("https://github.com/Owner/Repo") - assert owner == "Owner" - assert repo == "Repo" - - -def test_parse_repo_url_ssh(): - owner, repo = gh.parse_repo_url("git@github.com:Owner/Repo.git") - assert owner == "Owner" - assert repo == "Repo" - - -def test_parse_repo_url_invalid(): - owner, repo = gh.parse_repo_url("not-a-url") - assert owner == "" - assert repo == "" - - -# --------------------------------------------------------------------------- -# Tests for `build_status_context`. -# --------------------------------------------------------------------------- - - -def test_build_status_context(): - ctx = gh.build_status_context("ci/infiniops", "nvidia_gpu") - assert ctx == "ci/infiniops/nvidia_gpu" - - -# --------------------------------------------------------------------------- -# Tests for `post_commit_status`. -# --------------------------------------------------------------------------- - - -def test_post_status_no_token(monkeypatch): - monkeypatch.delenv("GITHUB_TOKEN", raising=False) - result = gh.post_commit_status("owner", "repo", "abc123", "success", "ctx", "desc") - assert result is False - - -def test_post_status_missing_owner(): - result = gh.post_commit_status( - "", "repo", "abc123", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_success(monkeypatch): - mock_response = MagicMock() - mock_response.status = 201 - mock_response.__enter__ = MagicMock(return_value=mock_response) - mock_response.__exit__ = MagicMock(return_value=False) - - captured_req = {} - - def mock_urlopen(req, **kwargs): - captured_req["url"] = req.full_url - captured_req["data"] = json.loads(req.data) - captured_req["headers"] = dict(req.headers) - return mock_response - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "InfiniTensor", - "InfiniOps", - "abc123def", - "success", - "ci/infiniops/nvidia_gpu", - "Tests passed", - token="ghp_test_token", - ) - - assert result is True - assert "abc123def" in captured_req["url"] - assert captured_req["data"]["state"] == "success" - assert captured_req["data"]["context"] == "ci/infiniops/nvidia_gpu" - assert "ghp_test_token" in captured_req["headers"]["Authorization"] - - -def test_post_status_http_error(monkeypatch): - import urllib.error - - def mock_urlopen(req, **kwargs): - raise urllib.error.HTTPError( - url="", code=422, msg="Unprocessable", hdrs=None, fp=None - ) - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "owner", "repo", "sha", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_url_error(monkeypatch): - import urllib.error - - def mock_urlopen(req, **kwargs): - raise urllib.error.URLError("connection refused") - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "owner", "repo", "sha", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_truncates_description(monkeypatch): - mock_response = MagicMock() - mock_response.status = 201 - mock_response.__enter__ = MagicMock(return_value=mock_response) - mock_response.__exit__ = MagicMock(return_value=False) - - captured = {} - - def mock_urlopen(req, **kwargs): - captured["data"] = json.loads(req.data) - return mock_response - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - long_desc = "x" * 200 - gh.post_commit_status("o", "r", "sha", "success", "ctx", long_desc, token="tok") - - assert len(captured["data"]["description"]) == 140 diff --git a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py deleted file mode 100644 index a7ba8f877..000000000 --- a/.ci/tests/test_resource.py +++ /dev/null @@ -1,434 +0,0 @@ -import threading - - -import ci_resource as res - - -# --------------------------------------------------------------------------- -# Tests for `GpuInfo` and `SystemResources`. -# --------------------------------------------------------------------------- - - -def test_gpu_info_fields(): - g = res.GpuInfo( - index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50 - ) - assert g.index == 0 - assert g.memory_total_mb == 8000 - - -def test_system_resources_fields(): - s = res.SystemResources( - total_memory_mb=32000, available_memory_mb=16000, cpu_count=8 - ) - assert s.cpu_count == 8 - - -# --------------------------------------------------------------------------- -# Tests for `detect_gpus`. -# --------------------------------------------------------------------------- - - -def test_detect_gpus_nvidia_parses_csv(monkeypatch): - csv_output = "0, 512, 8192, 5\n1, 1024, 8192, 80\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - gpus = pool.detect_gpus() - assert len(gpus) == 2 - assert gpus[0].index == 0 - assert gpus[0].memory_used_mb == 512 - assert gpus[0].utilization_pct == 5 - assert gpus[1].index == 1 - assert gpus[1].utilization_pct == 80 - - -def test_detect_gpus_empty_on_failure(monkeypatch): - def mock_run(cmd, **kwargs): - class R: - returncode = 1 - stdout = "" - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - assert pool.detect_gpus() == [] - - -def test_detect_gpus_unknown_platform(): - pool = res.ResourcePool("unknown_platform") - assert pool.detect_gpus() == [] - - -def test_detect_gpus_file_not_found(monkeypatch): - def mock_run(cmd, **kwargs): - raise FileNotFoundError("nvidia-smi not found") - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - assert pool.detect_gpus() == [] - - -# --------------------------------------------------------------------------- -# Tests for `detect_system_resources`. -# --------------------------------------------------------------------------- - - -def test_detect_system_resources(monkeypatch, tmp_path): - meminfo = tmp_path / "meminfo" - meminfo.write_text( - "MemTotal: 32000000 kB\n" - "MemFree: 10000000 kB\n" - "MemAvailable: 20000000 kB\n" - ) - - _real_open = open - - def fake_open(path, **kw): - if str(path) == "/proc/meminfo": - return _real_open(str(meminfo), **kw) - return _real_open(path, **kw) - - monkeypatch.setattr("builtins.open", fake_open) - - pool = res.ResourcePool("nvidia") - sys_res = pool.detect_system_resources() - assert abs(sys_res.total_memory_mb - 32000000 / 1024) < 1 - assert abs(sys_res.available_memory_mb - 20000000 / 1024) < 1 - assert sys_res.cpu_count > 0 - - -# --------------------------------------------------------------------------- -# Tests for `allocate` picking least-loaded GPUs. -# --------------------------------------------------------------------------- - - -def test_allocate_picks_least_loaded(monkeypatch): - csv_output = "0, 100, 8192, 8\n1, 200, 8192, 2\n2, 300, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(1) - assert ok is True - assert gpu_ids == [1] # GPU 1 has lowest utilization (2%). - - -def test_allocate_picks_two_least_loaded(monkeypatch): - csv_output = "0, 100, 8192, 8\n1, 200, 8192, 2\n2, 300, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is True - assert gpu_ids == [1, 2] # Sorted by utilization: 2% then 5%. - - -def test_allocate_skips_busy_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 4000, 8192, 95\n2, 200, 8192, 8\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is True - assert set(gpu_ids) == {0, 2} - assert 1 not in gpu_ids # GPU 1 at 95% is above threshold - - -# --------------------------------------------------------------------------- -# Tests for `allocate` and `release`. -# --------------------------------------------------------------------------- - - -def test_allocate_success(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(1) - assert ok is True - assert len(gpu_ids) == 1 - assert gpu_ids[0] in (0, 1) - - -def test_allocate_insufficient_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(3) - assert ok is False - assert gpu_ids == [] - - -def test_allocate_zero_gpus(): - pool = res.ResourcePool("unknown") - gpu_ids, ok = pool.allocate(0) - assert ok is True - assert gpu_ids == [] - - -def test_release_frees_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is True - assert len(gpu_ids) == 2 - - # All GPUs allocated; next allocation should fail. - _, ok2 = pool.allocate(1) - assert ok2 is False - - # Release one GPU. - pool.release([gpu_ids[0]]) - gpu_ids2, ok3 = pool.allocate(1) - assert ok3 is True - assert gpu_ids2 == [gpu_ids[0]] - - -def test_allocate_excludes_allocated(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids1, _ = pool.allocate(1) - gpu_ids2, _ = pool.allocate(1) - - assert gpu_ids1 != gpu_ids2 - assert set(gpu_ids1 + gpu_ids2) == {0, 1} - - -def test_thread_safety(monkeypatch): - csv_output = "0, 0, 8192, 0\n1, 0, 8192, 0\n2, 0, 8192, 0\n3, 0, 8192, 0\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=50) - allocated_all = [] - lock = threading.Lock() - - def allocate_one(): - ids, ok = pool.allocate(1) - - if ok: - with lock: - allocated_all.extend(ids) - - threads = [threading.Thread(target=allocate_one) for _ in range(4)] - - for t in threads: - t.start() - - for t in threads: - t.join() - - assert len(allocated_all) == 4 - assert len(set(allocated_all)) == 4 - - -# --------------------------------------------------------------------------- -# Tests for `get_status`. -# --------------------------------------------------------------------------- - - -def test_get_status(monkeypatch): - csv_output = "0, 512, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - status = pool.get_status() - assert status["platform"] == "nvidia" - assert len(status["gpus"]) == 1 - assert "system" in status - - -# --------------------------------------------------------------------------- -# Tests for `parse_gpu_requirement` and `parse_memory_requirement`. -# --------------------------------------------------------------------------- - - -def test_parse_gpu_requirement_auto_default(): - """`gpu_ids` omitted (defaults to `auto`) with `ngpus=1`.""" - job = {"resources": {"ngpus": 1}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_auto_explicit(): - """`gpu_ids=auto` with `ngpus=2`.""" - job = {"resources": {"gpu_ids": "auto", "ngpus": 2}} - assert res.parse_gpu_requirement(job) == 2 - - -def test_parse_gpu_requirement_auto_no_ngpus(): - """`gpu_ids=auto` without `ngpus` defaults to 1.""" - job = {"resources": {"gpu_ids": "auto"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_auto_implicit_no_ngpus(): - """No `gpu_ids` and no `ngpus` defaults to 1.""" - job = {"resources": {}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_static_pinning(): - """Static `gpu_ids` counts explicit device IDs.""" - job = {"resources": {"gpu_ids": "0,1"}} - assert res.parse_gpu_requirement(job) == 2 - - -def test_parse_gpu_requirement_static_single(): - job = {"resources": {"gpu_ids": "0"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_all(): - job = {"resources": {"gpu_ids": "all"}} - assert res.parse_gpu_requirement(job) == 0 - - -def test_parse_gpu_requirement_ngpus_mismatch_warns(capsys): - """Warn when static `gpu_ids` count differs from `ngpus`.""" - job = {"resources": {"gpu_ids": "0,1", "ngpus": 3}} - assert res.parse_gpu_requirement(job) == 2 - - captured = capsys.readouterr() - assert "warning:" in captured.err - assert "ngpus=3" in captured.err - - -def test_parse_gpu_requirement_ignores_unknown_keys(): - """Unknown keys in resources do not affect GPU counting.""" - job = {"resources": {"gpu_ids": "0", "extra_key": "value"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_detect_gpus_ascend_hbm_parsing(monkeypatch): - """`npu-smi` row 2 has DDR (0/0) and HBM (2789/32768); we want HBM.""" - npu_output = ( - "+---------------------------+---------------+-------------------------------+\n" - "| 0 910B4 | OK | 86.5 41 |\n" - "| 0 | 0000:c1:00.0 | 5 0 / 0 2789 / 32768 |\n" - "+---------------------------+---------------+-------------------------------+\n" - ) - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = npu_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("ascend") - gpus = pool.detect_gpus() - assert len(gpus) == 1 - assert gpus[0].index == 0 - assert gpus[0].utilization_pct == 5.0 - assert gpus[0].memory_used_mb == 2789.0 - assert gpus[0].memory_total_mb == 32768.0 - - -def test_parse_memory_requirement_gb(): - assert res.parse_memory_requirement({"resources": {"memory": "32GB"}}) == 32 * 1024 - - -def test_parse_memory_requirement_mb(): - assert res.parse_memory_requirement({"resources": {"memory": "512MB"}}) == 512 - - -def test_parse_memory_requirement_empty(): - assert res.parse_memory_requirement({"resources": {}}) == 0 - - -def test_parse_memory_requirement_invalid_warns(capsys): - result = res.parse_memory_requirement({"resources": {"memory": "abc xyz"}}) - assert result == 0 - - captured = capsys.readouterr() - assert "warning:" in captured.err - assert "abc xyz" in captured.err diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py deleted file mode 100644 index 844d941dc..000000000 --- a/.ci/tests/test_run.py +++ /dev/null @@ -1,450 +0,0 @@ -from pathlib import Path - -import pytest - -import run - - -# --------------------------------------------------------------------------- -# Tests for `resolve_image`. -# --------------------------------------------------------------------------- - - -def test_resolve_image_with_registry(): - cfg = {"registry": {"url": "localhost:5000", "project": "infiniops"}} - img = run.resolve_image(cfg, "nvidia", "latest") - assert img == "localhost:5000/infiniops/nvidia:latest" - - -def test_resolve_image_without_registry(minimal_config): - img = run.resolve_image(minimal_config, "nvidia", "abc1234") - assert img == "infiniops-ci/nvidia:abc1234" - - -# --------------------------------------------------------------------------- -# Tests for `build_runner_script`. -# --------------------------------------------------------------------------- - - -def test_runner_script_contains_git_clone(): - script = run.build_runner_script() - assert "git clone" in script - - -def test_runner_script_contains_setup_cmd(): - script = run.build_runner_script() - assert "SETUP_CMD" in script - - -def test_runner_script_exits_on_failure(): - script = run.build_runner_script() - assert "exit $rc" in script - - -def test_runner_script_creates_results_dir(): - script = run.build_runner_script() - assert "mkdir -p /workspace/results" in script - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` basic structure. -# --------------------------------------------------------------------------- - - -def test_docker_args_basic_structure(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert args[0] == "docker" - assert "run" in args - assert "--rm" in args - - -def test_docker_args_correct_image(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "infiniops-ci/nvidia:latest" in args - - -def test_docker_args_image_tag_override(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - "abc1234", - ) - assert "infiniops-ci/nvidia:abc1234" in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` proxy passthrough. -# --------------------------------------------------------------------------- - - -def test_docker_args_proxy_present_when_set(minimal_config, monkeypatch): - monkeypatch.setenv("HTTP_PROXY", "http://proxy.example.com:8080") - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "-e" in args - assert "HTTP_PROXY=http://proxy.example.com:8080" in args - assert "http_proxy=http://proxy.example.com:8080" in args - - -def test_docker_args_proxy_absent_when_not_set(minimal_config, monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - - for arg in args: - assert not arg.startswith("HTTP_PROXY=") - assert not arg.startswith("http_proxy=") - assert not arg.startswith("HTTPS_PROXY=") - assert not arg.startswith("https_proxy=") - assert not arg.startswith("NO_PROXY=") - assert not arg.startswith("no_proxy=") - - -def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.setenv("http_proxy", "http://lowercase.proxy:3128") - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "HTTP_PROXY=http://lowercase.proxy:3128" in args - assert "http_proxy=http://lowercase.proxy:3128" in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` GPU flags. -# --------------------------------------------------------------------------- - - -def _make_args(config, gpu_id_override=None): - return run.build_docker_args( - config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - gpu_id_override=gpu_id_override, - ) - - -def test_docker_args_gpu_auto_no_override(minimal_config): - """`gpu_ids=auto` (default) without override produces no `--gpus` flag.""" - args = _make_args(minimal_config) - assert "--gpus" not in args - - -def test_docker_args_gpu_auto_with_override(minimal_config): - """`gpu_ids=auto` with allocator override sets `--gpus device=...`.""" - args = _make_args(minimal_config, gpu_id_override="2") - idx = args.index("--gpus") - assert args[idx + 1] == "device=2" - - -def test_docker_args_gpu_static(minimal_config): - """Static `gpu_ids` pins to specific devices.""" - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "0" - args = _make_args(minimal_config) - idx = args.index("--gpus") - assert args[idx + 1] == "device=0" - - -def test_docker_args_gpu_all(minimal_config): - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "all" - args = _make_args(minimal_config) - idx = args.index("--gpus") - assert args[idx + 1] == "all" - - -def test_docker_args_gpu_override_trumps_static(minimal_config): - """CLI `gpu_id_override` takes precedence over static `gpu_ids`.""" - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "0" - args = _make_args(minimal_config, gpu_id_override="2,3") - idx = args.index("--gpus") - assert args[idx + 1] == "device=2,3" - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` platform-specific device env vars. -# --------------------------------------------------------------------------- - - -def _make_platform_config(platform, job_suffix="gpu"): - """Build a minimal normalized config for a given platform.""" - from utils import normalize_config - - raw = { - "platforms": { - platform: { - "image": {"dockerfile": f".ci/images/{platform}/"}, - "setup": "pip install .[dev]", - "jobs": { - job_suffix: { - "resources": {"ngpus": 1, "memory": "32GB"}, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - } - }, - } - } - } - - return normalize_config(raw) - - -def _make_platform_args(platform, job_suffix="gpu", gpu_id_override=None): - config = _make_platform_config(platform, job_suffix) - job_name = f"{platform}_{job_suffix}" - - return run.build_docker_args( - config, - job_name, - "https://github.com/example/repo.git", - "master", - config["jobs"][job_name]["stages"], - "/workspace", - None, - gpu_id_override=gpu_id_override, - ) - - -def test_docker_args_moore_mthreads_visible_devices(): - """Moore uses `MTHREADS_VISIBLE_DEVICES`, not `CUDA_VISIBLE_DEVICES`.""" - args = _make_platform_args("moore", gpu_id_override="0") - assert "MTHREADS_VISIBLE_DEVICES=0" in args - assert all("CUDA_VISIBLE_DEVICES" not in a for a in args) - - -def test_docker_args_iluvatar_cuda_visible_devices(): - args = _make_platform_args("iluvatar", gpu_id_override="1,2") - assert "CUDA_VISIBLE_DEVICES=1,2" in args - - -def test_docker_args_cambricon_mlu_visible_devices(): - args = _make_platform_args("cambricon", gpu_id_override="0") - assert "MLU_VISIBLE_DEVICES=0" in args - - -def test_docker_args_ascend_visible_devices(): - args = _make_platform_args("ascend", job_suffix="npu", gpu_id_override="0") - assert "ASCEND_VISIBLE_DEVICES=0" in args - - -def test_docker_args_metax_cuda_visible_devices(): - args = _make_platform_args("metax", gpu_id_override="0,1") - assert "CUDA_VISIBLE_DEVICES=0,1" in args - - -def test_docker_args_non_nvidia_no_gpus_flag(): - """Non-NVIDIA platforms should never use `--gpus` Docker flag.""" - for platform in ("iluvatar", "metax", "moore", "cambricon"): - args = _make_platform_args(platform, gpu_id_override="0") - assert "--gpus" not in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` memory format. -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - "raw,expected", - [ - ("32GB", "32g"), - ("512MB", "512m"), - ("8", "8g"), - ("16gb", "16g"), - ("256mb", "256m"), - ], -) -def test_docker_args_memory_format(minimal_config, raw, expected): - minimal_config["jobs"]["nvidia_gpu"]["resources"]["memory"] = raw - args = _make_args(minimal_config) - idx = args.index("--memory") - assert args[idx + 1] == expected - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` stages encoding. -# --------------------------------------------------------------------------- - - -def test_docker_args_num_stages(minimal_config): - args = _make_args(minimal_config) - assert "NUM_STAGES=1" in args - - -def test_docker_args_stage_name_cmd(minimal_config): - args = _make_args(minimal_config) - assert "STAGE_1_NAME=test" in args - assert any(a.startswith("STAGE_1_CMD=") for a in args) - - -def test_docker_args_multiple_stages(minimal_config): - minimal_config["jobs"]["nvidia_gpu"]["stages"] = [ - {"name": "lint", "run": "ruff check ."}, - {"name": "test", "run": "pytest tests/"}, - ] - args = _make_args(minimal_config) - assert "NUM_STAGES=2" in args - assert "STAGE_1_NAME=lint" in args - assert "STAGE_2_NAME=test" in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` `results_dir` mount. -# --------------------------------------------------------------------------- - - -def test_docker_args_results_dir(minimal_config, tmp_path): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - results_dir=tmp_path, - ) - joined = " ".join(str(a) for a in args) - assert "-v" in args - assert "/workspace/results" in joined - - -# --------------------------------------------------------------------------- -# Tests for `build_results_dir`. -# --------------------------------------------------------------------------- - - -def test_build_results_dir_contains_platform(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "nvidia" in d.name - - -def test_build_results_dir_contains_commit(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "abc1234" in d.name - - -def test_build_results_dir_contains_stage_names(): - stages = [{"name": "lint", "run": "ruff"}, {"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "lint+test" in d.name - - -def test_build_results_dir_under_base(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678") - assert d.parent == Path("/tmp/my-results") - - -# --------------------------------------------------------------------------- -# Tests for `apply_test_override`. -# --------------------------------------------------------------------------- - - -def test_apply_test_override_replaces_test_path(): - result = run.apply_test_override("pytest tests/ -v", "tests/test_add.py") - assert result == "pytest tests/test_add.py -v" - - -def test_apply_test_override_preserves_flags(): - result = run.apply_test_override( - "pytest tests/ -n 4 -v --tb=short", "tests/test_gemm.py" - ) - assert "tests/test_gemm.py" in result - assert "-n 4" in result - assert "-v" in result - assert "--tb=short" in result - assert "tests/" not in result.split("tests/test_gemm.py")[0] - - -def test_apply_test_override_non_pytest_passthrough(): - """Non-pytest commands are returned unchanged.""" - assert run.apply_test_override("ruff check .", "tests/foo.py") == "ruff check ." - - -def test_apply_test_override_empty_passthrough(): - assert run.apply_test_override("", "tests/foo.py") == "" - - -# --------------------------------------------------------------------------- -# Tests for runner script fail-fast behavior. -# --------------------------------------------------------------------------- - - -def test_runner_script_breaks_on_failure(): - script = run.build_runner_script() - assert "break" in script - - -def test_runner_script_preserves_exit_code(): - script = run.build_runner_script() - assert "rc=$?" in script - - -# --------------------------------------------------------------------------- -# Tests for `build_results_dir` uniqueness and sanitization. -# --------------------------------------------------------------------------- - - -def test_build_results_dir_unique(): - stages = [{"name": "test", "run": "pytest"}] - d1 = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - d2 = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert d1 != d2 - - -def test_build_results_dir_sanitizes_commit(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "../../etc/passwd") - # Path separators are stripped; the result stays under the base directory. - assert "/" not in d.name - assert d.parent == Path("ci-results") diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py deleted file mode 100644 index b8fa6d60d..000000000 --- a/.ci/tests/test_utils.py +++ /dev/null @@ -1,108 +0,0 @@ -from utils import get_git_commit, normalize_config - - -def test_normalize_creates_flat_jobs(): - raw = { - "repo": {"url": "https://github.com/org/repo.git"}, - "platforms": { - "nvidia": { - "image": {"dockerfile": ".ci/images/nvidia/"}, - "setup": "pip install .", - "docker_args": ["--gpus", "all"], - "jobs": { - "gpu": { - "resources": {"gpu_ids": "0"}, - "stages": [{"name": "test", "run": "pytest"}], - }, - "multi_gpu": { - "resources": {"gpu_ids": "0,1"}, - "stages": [{"name": "test", "run": "pytest"}], - }, - }, - }, - }, - } - config = normalize_config(raw) - - assert "nvidia_gpu" in config["jobs"] - assert "nvidia_multi_gpu" in config["jobs"] - assert config["jobs"]["nvidia_gpu"]["platform"] == "nvidia" - assert config["jobs"]["nvidia_gpu"]["setup"] == "pip install ." - assert config["jobs"]["nvidia_gpu"]["docker_args"] == ["--gpus", "all"] - assert config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] == "0" - assert config["jobs"]["nvidia_multi_gpu"]["resources"]["gpu_ids"] == "0,1" - - -def test_normalize_extracts_images(): - raw = { - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "pytorch:latest"}, - }, - "jobs": {}, - }, - }, - } - config = normalize_config(raw) - assert config["images"]["nvidia"]["dockerfile"] == ".ci/images/nvidia/" - assert config["images"]["nvidia"]["build_args"]["BASE_IMAGE"] == "pytorch:latest" - - -def test_normalize_job_overrides_platform_defaults(): - raw = { - "platforms": { - "nvidia": { - "setup": "default setup", - "jobs": { - "special": { - "setup": "custom setup", - "stages": [], - }, - }, - }, - }, - } - config = normalize_config(raw) - assert config["jobs"]["nvidia_special"]["setup"] == "custom setup" - - -def test_normalize_preserves_top_level_keys(): - raw = { - "repo": {"url": "https://github.com/org/repo.git"}, - "github": {"status_context_prefix": "ci/test"}, - "agents": {"nvidia": {"url": "http://host:8080"}}, - "platforms": {}, - } - config = normalize_config(raw) - assert config["repo"]["url"] == "https://github.com/org/repo.git" - assert config["github"]["status_context_prefix"] == "ci/test" - assert config["agents"]["nvidia"]["url"] == "http://host:8080" - - -def test_normalize_passthrough_flat_config(): - """Old flat format without `platforms` key is returned as-is.""" - flat = { - "images": {"nvidia": {}}, - "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, - } - assert normalize_config(flat) is flat - - -# --------------------------------------------------------------------------- -# Tests for `get_git_commit`. -# --------------------------------------------------------------------------- - - -def test_get_git_commit_warns_on_failure(monkeypatch, capsys): - from unittest.mock import MagicMock - - monkeypatch.setattr( - "subprocess.run", lambda *a, **kw: MagicMock(returncode=128, stdout="") - ) - result = get_git_commit() - assert result == "unknown" - - captured = capsys.readouterr() - assert "warning:" in captured.err diff --git a/.ci/utils.py b/.ci/utils.py deleted file mode 100644 index 2a3d36fb3..000000000 --- a/.ci/utils.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python3 -"""Shared utilities for the CI toolchain.""" - -import subprocess -import sys - -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def normalize_config(raw): - """Convert platform-centric config to flat images/jobs format. - - Input (new format): - platforms: - nvidia: - image: {dockerfile: ..., build_args: ...} - setup: pip install .[dev] - jobs: - gpu: {resources: ..., stages: ...} - - Output (flat format consumed by run.py / build.py / agent.py): - images: - nvidia: {dockerfile: ..., build_args: ...} - jobs: - nvidia_gpu: {platform: nvidia, setup: ..., resources: ..., stages: ...} - - If the config already uses the flat format (no 'platforms' key), returns as-is. - """ - if "platforms" not in raw: - return raw - - config = {} - - for key in ("repo", "github", "agents"): - if key in raw: - config[key] = raw[key] - - config["images"] = {} - config["jobs"] = {} - - for platform, pcfg in raw.get("platforms", {}).items(): - # Image config - if "image" in pcfg: - config["images"][platform] = pcfg["image"] - - # Platform-level defaults inherited by jobs - defaults = {} - - for key in ("image_tag", "docker_args", "volumes", "setup", "env"): - if key in pcfg: - defaults[key] = pcfg[key] - - # Flatten jobs: {platform}_{job_name} - for job_name, job_cfg in pcfg.get("jobs", {}).items(): - full_name = f"{platform}_{job_name}" - flat = { - "platform": platform, - "image": defaults.get("image_tag", "latest"), - } - - # Apply platform defaults - for key in ("docker_args", "volumes", "setup", "env"): - if key in defaults: - flat[key] = defaults[key] - - # Job-level overrides - flat.update(job_cfg) - - config["jobs"][full_name] = flat - - # Warn on mismatched agent/platform keys (catches typos like 'nvdia'). - agent_keys = set(config.get("agents", {}).keys()) - platform_keys = set(raw.get("platforms", {}).keys()) - - for key in agent_keys - platform_keys: - print( - f"warning: agents.{key} has no matching platform in platforms.*", - file=sys.stderr, - ) - - return config - - -def load_config(path): - """Load a YAML config file and normalize to flat format.""" - with open(path, encoding="utf-8") as f: - raw = yaml.safe_load(f) - - return normalize_config(raw) - - -def get_git_commit(ref="HEAD", short=True): - """Get git commit SHA. Returns 'unknown' on failure.""" - cmd = ["git", "rev-parse"] - - if short: - cmd.append("--short") - - cmd.append(ref) - result = subprocess.run(cmd, capture_output=True, text=True) - - if result.returncode != 0: - print( - f"warning: git rev-parse failed for {ref!r}, using 'unknown'", - file=sys.stderr, - ) - - return "unknown" - - return result.stdout.strip() diff --git a/.ci/config.yaml b/.github/ci_config.yaml similarity index 78% rename from .ci/config.yaml rename to .github/ci_config.yaml index ea6a0d487..d66f955fd 100644 --- a/.ci/config.yaml +++ b/.github/ci_config.yaml @@ -5,45 +5,29 @@ repo: github: status_context_prefix: "ci/infiniops" -# Uncomment and replace the URLs below with actual host IPs to dispatch jobs to remote -# machines via `agent.py run`. Required on the trigger machine when each platform's -# agent runs on a separate host. See the README for multi-machine deployment details. -# agents: -# nvidia: -# url: http://nvidia-host:8080 -# iluvatar: -# url: http://iluvatar-host:8080 -# metax: -# url: http://metax-host:8080 -# moore: -# url: http://moore-host:8080 -# cambricon: -# url: http://cambricon-host:8080 - platforms: nvidia: image: - dockerfile: .ci/images/nvidia/ + dockerfile: images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: - ngpus: 1 # Scheduler auto-picks this many free GPUs. - gpu_ids: auto # `auto`: dynamic allocation; or pin with `"0"`, `"0,2"`, `"all"`. + ngpus: 1 # Auto allocator picks this many free GPUs memory: 32GB - shm_size: 16g # Prevent PyTorch default 64MB shared memory limit. + shm_size: 16g # Prevent PyTorch default 64MB shared memory limit timeout: 3600 # env: # Uncomment to inject extra env vars into the container. # MY_VAR: value stages: - name: test run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - iluvatar: image: - dockerfile: .ci/images/iluvatar/ + dockerfile: images/iluvatar/ build_args: BASE_IMAGE: corex:qs_pj20250825 APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -61,9 +45,10 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none memory: 32GB shm_size: 16g timeout: 3600 @@ -73,7 +58,7 @@ platforms: metax: image: - dockerfile: .ci/images/metax/ + dockerfile: images/metax/ build_args: BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -85,9 +70,10 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none # MetaX: passthrough via --privileged, CUDA_VISIBLE_DEVICES controls visibility memory: 32GB shm_size: 16g timeout: 3600 @@ -97,7 +83,7 @@ platforms: moore: image: - dockerfile: .ci/images/moore/ + dockerfile: images/moore/ build_args: BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -107,9 +93,10 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES controls visibility memory: 32GB shm_size: 16g timeout: 3600 @@ -119,7 +106,7 @@ platforms: cambricon: image: - dockerfile: .ci/images/cambricon/ + dockerfile: images/cambricon/ build_args: BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310 PIP_INDEX_URL: https://pypi.org/simple @@ -128,9 +115,10 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control memory: 32GB shm_size: 16g timeout: 3600 @@ -138,9 +126,9 @@ platforms: - name: test run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml - ascend: # TODO: Ascend image is not ready yet. + ascend: image: - dockerfile: .ci/images/ascend/ + dockerfile: images/ascend/ build_args: BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler PIP_INDEX_URL: https://pypi.org/simple @@ -160,9 +148,10 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: npu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none memory: 32GB shm_size: 16g timeout: 3600 diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml new file mode 100644 index 000000000..f6dd26bf4 --- /dev/null +++ b/.github/workflows/ci_test.yml @@ -0,0 +1,15 @@ +name: CI + +on: + push: + branches: ["ci_test"] + pull_request: + branches: ["ci_test"] + +jobs: + ci: + uses: InfiniTensor/ci/.github/workflows/infiniops-ci.yml@codex/prune-unused-ci-artifacts + with: + config_path: .github/ci_config.yaml + ci_ref: codex/prune-unused-ci-artifacts + secrets: inherit diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..be99e8a8f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule ".ci"] + path = .ci + url = https://github.com/InfiniTensor/ci.git From 52e50b84925d312d39b32d74dd6b9992d5c72ca5 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 6 May 2026 09:05:01 +0000 Subject: [PATCH 02/88] fix: avoid cross-platform CUDA probing in tests --- .github/ci_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_config.yaml b/.github/ci_config.yaml index d66f955fd..5aee72a52 100644 --- a/.github/ci_config.yaml +++ b/.github/ci_config.yaml @@ -24,7 +24,7 @@ platforms: # MY_VAR: value stages: - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices nvidia -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml iluvatar: image: dockerfile: images/iluvatar/ From 65c28eab7a57deb0e3ed79c0b76494b607867bc9 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 02:52:41 +0000 Subject: [PATCH 03/88] ci: target master and setup Python in CI --- .ci | 2 +- .github/workflows/ci_test.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 02bec4b85..5413da51c 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 02bec4b85604e1f4cdcc673ce431ddd90ed2018f +Subproject commit 5413da51cd3d8be857f1b0fc6651799cbbc9c8b0 diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index f6dd26bf4..fdd3c6591 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: ["ci_test"] + branches: ["master"] pull_request: - branches: ["ci_test"] + branches: ["master"] jobs: ci: From 1fb1df3cc3ece3ac80b90adec0474e141d9bf177 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 02:55:09 +0000 Subject: [PATCH 04/88] ci: use python module pip for CI dependency --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 5413da51c..417d83221 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 5413da51cd3d8be857f1b0fc6651799cbbc9c8b0 +Subproject commit 417d832211e979547897d5bcd01e0fa9321046c8 From d812d6aef3768ff90307e412b564f2d76c787662 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 04:44:25 +0000 Subject: [PATCH 05/88] ci: update CI submodule for failure logs --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 417d83221..8b543a2dd 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 417d832211e979547897d5bcd01e0fa9321046c8 +Subproject commit 8b543a2dd9d031bbf6434344b7183cf017f39100 From 32d859889f8a5b9d8e507f767efbf1e28494fd52 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 04:55:14 +0000 Subject: [PATCH 06/88] ci: update CI submodule for ci_ref scheduler --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 8b543a2dd..d07fc1b4b 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 8b543a2dd9d031bbf6434344b7183cf017f39100 +Subproject commit d07fc1b4b8dc5f52d221f38a9cbf6ef61c6ec192 From 95a2891a613be83491e9ac408cc32c62096499fb Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 04:58:59 +0000 Subject: [PATCH 07/88] ci: update CI submodule for source-mounted scheduler --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index d07fc1b4b..fece2ea70 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit d07fc1b4b8dc5f52d221f38a9cbf6ef61c6ec192 +Subproject commit fece2ea709c8edade80369d24805e9ff1539ce95 From 70cd00151e5170c783ec3fdaa092e6b8633963cf Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 05:06:25 +0000 Subject: [PATCH 08/88] ci: update CI submodule for Unit generator fix --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index fece2ea70..ddc8d1a62 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit fece2ea709c8edade80369d24805e9ff1539ce95 +Subproject commit ddc8d1a62434713813bb65f1b04896f617f5096f From 093086fcb617d9f33053bc3287e5899bb2a3bd8f Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 05:15:30 +0000 Subject: [PATCH 09/88] ci: update CI submodule for tag model configs --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index ddc8d1a62..7434ff291 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit ddc8d1a62434713813bb65f1b04896f617f5096f +Subproject commit 7434ff291af21ecf18247152b5e11f4ddfa8d61f From 650a1f7ce4e49d9bb3d55f411d998b22800141bc Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 05:29:05 +0000 Subject: [PATCH 10/88] ci: update CI submodule for Unit failure logs --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 7434ff291..8976f43ad 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 7434ff291af21ecf18247152b5e11f4ddfa8d61f +Subproject commit 8976f43ad20ea41d10c67de92f29242edfe0452c From 114ba51bca60622bf1f0a789dfa5f334ea28ad72 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 05:38:24 +0000 Subject: [PATCH 11/88] ci: pass explicit devices for XPU unit jobs --- .github/ci_config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ci_config.yaml b/.github/ci_config.yaml index 5aee72a52..3f76c16be 100644 --- a/.github/ci_config.yaml +++ b/.github/ci_config.yaml @@ -54,7 +54,7 @@ platforms: timeout: 3600 stages: - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices iluvatar -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml metax: image: @@ -79,7 +79,7 @@ platforms: timeout: 3600 stages: - name: test - run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices metax -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml moore: image: @@ -102,7 +102,7 @@ platforms: timeout: 3600 stages: - name: test - run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py --devices moore -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml cambricon: image: @@ -124,7 +124,7 @@ platforms: timeout: 3600 stages: - name: test - run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/test_gemm.py --devices cambricon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml ascend: image: From 221018896b2755c2c7ad041dbc00420fcf869a0d Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 06:26:21 +0000 Subject: [PATCH 12/88] ci: standardize CI config extension to yml --- .ci | 2 +- .github/{ci_config.yaml => ci_config.yml} | 0 .github/workflows/ci_test.yml | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename .github/{ci_config.yaml => ci_config.yml} (100%) diff --git a/.ci b/.ci index 8976f43ad..f1463c73e 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 8976f43ad20ea41d10c67de92f29242edfe0452c +Subproject commit f1463c73e2398b165d6d72cec80d32577956aa4a diff --git a/.github/ci_config.yaml b/.github/ci_config.yml similarity index 100% rename from .github/ci_config.yaml rename to .github/ci_config.yml diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index fdd3c6591..651cc6893 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -10,6 +10,6 @@ jobs: ci: uses: InfiniTensor/ci/.github/workflows/infiniops-ci.yml@codex/prune-unused-ci-artifacts with: - config_path: .github/ci_config.yaml + config_path: .github/ci_config.yml ci_ref: codex/prune-unused-ci-artifacts secrets: inherit From 83edfec3f40cdf817a1ad869f85188685af80467 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 06:48:23 +0000 Subject: [PATCH 13/88] ci: update CI submodule for concise job names --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index f1463c73e..24c161a4d 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit f1463c73e2398b165d6d72cec80d32577956aa4a +Subproject commit 24c161a4dec8f5e42cd2e9b4e84204d3c0c4fad3 From 6b3a565ab4f26e4c23736a458c71241ff2fc0e3a Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 7 May 2026 06:50:37 +0000 Subject: [PATCH 14/88] ci: update CI submodule for skipped job names --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 24c161a4d..c602c96a0 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 24c161a4dec8f5e42cd2e9b4e84204d3c0c4fad3 +Subproject commit c602c96a0370cb3288f1cd0cbcdd0816dfb0621e From c0bed494a675bc21db479bc2084a673a0251944d Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 8 May 2026 03:29:02 +0000 Subject: [PATCH 15/88] Remove obsolete CI and lint config files Co-authored-by: Codex --- third_party/flashinfer | 1 + 1 file changed, 1 insertion(+) create mode 160000 third_party/flashinfer diff --git a/third_party/flashinfer b/third_party/flashinfer new file mode 160000 index 000000000..a1166dc01 --- /dev/null +++ b/third_party/flashinfer @@ -0,0 +1 @@ +Subproject commit a1166dc0169b479aa3220b61759547d04c64e473 From fb7750c7adcc11ba63b7b1471ece9bd20d2c622d Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 12 May 2026 03:37:20 +0000 Subject: [PATCH 16/88] ci: add manual platform dispatch --- .ci | 2 +- .github/workflows/ci_test.yml | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/.ci b/.ci index c602c96a0..c6d0072b3 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit c602c96a0370cb3288f1cd0cbcdd0816dfb0621e +Subproject commit c6d0072b32765f1b73ee3b9fef431280e32cc0aa diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index 651cc6893..a46790c2f 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -1,6 +1,21 @@ name: CI on: + workflow_dispatch: + inputs: + platform: + description: "Platform to run" + type: choice + required: true + default: nvidia + options: + - nvidia + - iluvatar + - metax + - moore + - cambricon + - ascend + - all push: branches: ["master"] pull_request: @@ -12,4 +27,5 @@ jobs: with: config_path: .github/ci_config.yml ci_ref: codex/prune-unused-ci-artifacts + platform: ${{ github.event_name == 'workflow_dispatch' && inputs.platform || 'all' }} secrets: inherit From ea386aa06b4943b750460557645d61bd3262c81d Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 12 May 2026 06:16:55 +0000 Subject: [PATCH 17/88] ci: remove smoke and performance pipeline jobs --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index c6d0072b3..3f03e8a19 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit c6d0072b32765f1b73ee3b9fef431280e32cc0aa +Subproject commit 3f03e8a195a771eddac3494b0297d6b2dfe9a4c4 From be337c9f9fcdf53045805e2f3caf932e9208c676 Mon Sep 17 00:00:00 2001 From: zhangyue207 Date: Tue, 12 May 2026 15:40:52 +0800 Subject: [PATCH 18/88] Update Moore CI deployment fixes --- .ci | 2 +- tests/test_gemm.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.ci b/.ci index 3f03e8a19..0002462b8 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 3f03e8a195a771eddac3494b0297d6b2dfe9a4c4 +Subproject commit 0002462b81700a51e1178cf77ccd672c6fb6cc9e diff --git a/tests/test_gemm.py b/tests/test_gemm.py index 71e0e8fde..dbcb0d943 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -75,6 +75,15 @@ def test_gemm( "instantiated specialization" ) + if ( + implementation_index == 2 + and device == "musa" + and a_strides == (4096, 1) + and b_strides == (4096, 1) + and c_strides == (4096, 1) + ): + pytest.skip("Gemm impl=2 on Moore is unstable for padded strides") + a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device) From acfa6f0fb115ef4f51d185f3fa5522679ecf33dd Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 12 May 2026 09:37:06 +0000 Subject: [PATCH 19/88] ci: rerun PR checks From fff110fedad2f82e0a5a1d2a4e3cc33ae4db6976 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 12 May 2026 09:46:46 +0000 Subject: [PATCH 20/88] ci: default PR tests to nvidia --- .github/workflows/ci_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index a46790c2f..07ec4d414 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -27,5 +27,5 @@ jobs: with: config_path: .github/ci_config.yml ci_ref: codex/prune-unused-ci-artifacts - platform: ${{ github.event_name == 'workflow_dispatch' && inputs.platform || 'all' }} + platform: ${{ github.event_name == 'workflow_dispatch' && inputs.platform || 'nvidia' }} secrets: inherit From e7d53c9be7a1586b5b0eb0f2517f78850ca3c58f Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 12 May 2026 10:05:26 +0000 Subject: [PATCH 21/88] ci: rerun nvidia check From c27de3abf8ec0f74d70f18ab2e9efb4675a7f982 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 12 May 2026 10:19:04 +0000 Subject: [PATCH 22/88] ci: update nvidia unit workflow --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 0002462b8..e02531e00 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 0002462b81700a51e1178cf77ccd672c6fb6cc9e +Subproject commit e02531e0039dacfe740abf063d85557c28c7be00 From 744d920fd5c94ebc8126d14c9e44bbea8c10ef80 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 12 May 2026 15:34:43 +0000 Subject: [PATCH 23/88] ci: run PR checks on active platforms --- .ci | 2 +- .github/workflows/ci_test.yml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.ci b/.ci index e02531e00..0aa2c3666 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit e02531e0039dacfe740abf063d85557c28c7be00 +Subproject commit 0aa2c366657c79366ad9fd011018d8f3dc1515ac diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index 07ec4d414..94d4b98a7 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -10,11 +10,9 @@ on: default: nvidia options: - nvidia - - iluvatar - metax - moore - cambricon - - ascend - all push: branches: ["master"] @@ -27,5 +25,5 @@ jobs: with: config_path: .github/ci_config.yml ci_ref: codex/prune-unused-ci-artifacts - platform: ${{ github.event_name == 'workflow_dispatch' && inputs.platform || 'nvidia' }} + platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,metax,moore,cambricon' || inputs.platform) || 'nvidia,metax,moore,cambricon' }} secrets: inherit From 774e349f6e5e4118640ca6d34f7f9a9822a88ade Mon Sep 17 00:00:00 2001 From: Vincent777 <140055255+Vincent777@users.noreply.github.com> Date: Wed, 13 May 2026 02:24:42 +0000 Subject: [PATCH 24/88] ci: register iluvatar platform --- .github/ci_config.yml | 2 +- .github/workflows/ci_test.yml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 3f76c16be..54c990fbd 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -1,6 +1,6 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git - branch: master + branch: ci/ci-online github: status_context_prefix: "ci/infiniops" diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index 94d4b98a7..cea1471d7 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -10,6 +10,7 @@ on: default: nvidia options: - nvidia + - iluvatar - metax - moore - cambricon @@ -25,5 +26,5 @@ jobs: with: config_path: .github/ci_config.yml ci_ref: codex/prune-unused-ci-artifacts - platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,metax,moore,cambricon' || inputs.platform) || 'nvidia,metax,moore,cambricon' }} + platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon' }} secrets: inherit From f8be10fd2bc830d9f0e97583740053bff931d3d3 Mon Sep 17 00:00:00 2001 From: Vincent777 <140055255+Vincent777@users.noreply.github.com> Date: Wed, 13 May 2026 02:25:05 +0000 Subject: [PATCH 25/88] ci: trigger checks on ci online branch --- .github/workflows/ci_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index cea1471d7..a05fbed87 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -16,9 +16,9 @@ on: - cambricon - all push: - branches: ["master"] + branches: ["master", "ci/ci-online"] pull_request: - branches: ["master"] + branches: ["master", "ci/ci-online"] jobs: ci: From 66629457365ed60d2b3c6db1413a131c2e6fc84a Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 10:52:20 +0800 Subject: [PATCH 26/88] ci: enable ascend online runner --- .github/workflows/ci_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index a05fbed87..f65930713 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -14,6 +14,7 @@ on: - metax - moore - cambricon + - ascend - all push: branches: ["master", "ci/ci-online"] @@ -26,5 +27,5 @@ jobs: with: config_path: .github/ci_config.yml ci_ref: codex/prune-unused-ci-artifacts - platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon' }} + platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon,ascend' }} secrets: inherit From 66db95fca5364b22c7cedb5a2c932316b599b2f0 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 03:02:34 +0000 Subject: [PATCH 27/88] ci: rerun with metax scheduler fix From a1994da59465a6fad43d9a04eb1fdd367ef1f7fe Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 03:09:34 +0000 Subject: [PATCH 28/88] ci: rerun metax after cancel From 608af79d17e893ad3e88810d6f0327d5f503697d Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 11:29:01 +0800 Subject: [PATCH 29/88] ci: skip ascend image rebuild --- .github/ci_config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 54c990fbd..688d691fd 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -129,6 +129,7 @@ platforms: ascend: image: dockerfile: images/ascend/ + skip_build: true build_args: BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler PIP_INDEX_URL: https://pypi.org/simple From ad34fac4f78b7dd78059ae494b665cdde098d06f Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 11:36:41 +0800 Subject: [PATCH 30/88] ci: rerun ascend with encoded args From 1c80263516215112f5d309a7e8fe9d0f4a10cfe8 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:09:20 +0800 Subject: [PATCH 31/88] ci: rerun ascend after runner cleanup From ca535281388a62f8c347d2296ae3eae734747e4a Mon Sep 17 00:00:00 2001 From: Vincent777 <140055255+Vincent777@users.noreply.github.com> Date: Wed, 13 May 2026 04:12:00 +0000 Subject: [PATCH 32/88] ci: rerun iluvatar with timeout guard From 05f7c041616d4e80cadb6202b23e1d6e44abea9c Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:12:28 +0800 Subject: [PATCH 33/88] ci: cancel stale online runs --- .github/workflows/ci_test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index f65930713..cae4c92f0 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -1,5 +1,9 @@ name: CI +concurrency: + group: ci-${{ github.workflow }}-${{ github.head_ref || github.ref_name }} + cancel-in-progress: true + on: workflow_dispatch: inputs: From fee375479072ca714aca53c9ec68f7ad60c4d492 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 04:18:42 +0000 Subject: [PATCH 34/88] ci: cap metax unit runtime --- .github/ci_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 688d691fd..9ca7082c7 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -76,7 +76,7 @@ platforms: gpu_style: none # MetaX: passthrough via --privileged, CUDA_VISIBLE_DEVICES controls visibility memory: 32GB shm_size: 16g - timeout: 3600 + timeout: 300 stages: - name: test run: pytest tests/ --devices metax -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml From 6746fd8d7458fb01ab5ad1ef60d3d045b03aea0c Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:20:30 +0800 Subject: [PATCH 35/88] ci: match ascend runner label --- .github/ci_config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 9ca7082c7..a2dbca63d 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -127,6 +127,7 @@ platforms: run: pytest tests/test_gemm.py --devices cambricon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml ascend: + runner_label: Ascend image: dockerfile: images/ascend/ skip_build: true From c305edce69d87ae57e9c7e8b7fb21b48056b4830 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:22:08 +0800 Subject: [PATCH 36/88] ci: avoid queued platforms blocking ascend --- .github/workflows/ci_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index cae4c92f0..b62587e67 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -31,5 +31,6 @@ jobs: with: config_path: .github/ci_config.yml ci_ref: codex/prune-unused-ci-artifacts + max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon,ascend' }} secrets: inherit From 5848f6c2c4e92caa3ce984ea33194a7843f100c2 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:24:36 +0800 Subject: [PATCH 37/88] ci: rerun ascend with runner proxy From 306047e642ff9e83316ba8958b68a73ffc356ceb Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:29:56 +0800 Subject: [PATCH 38/88] ci: rerun ascend after python compatibility fix From c46e4a88d7fefd1bceafa64136bb786ff0fea81e Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:38:49 +0800 Subject: [PATCH 39/88] ci: rerun ascend with scheduler image From 73ab71eeefb52932cb26d0a50603f45d5d6bbe84 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:41:55 +0800 Subject: [PATCH 40/88] ci: rerun ascend locally From 4d167c96cc56d898e47cc793f65835230716d9ee Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 04:42:37 +0000 Subject: [PATCH 41/88] ci: run metax quick operator subset --- .github/ci_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index a2dbca63d..bc83fc39d 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -79,7 +79,7 @@ platforms: timeout: 300 stages: - name: test - run: pytest tests/ --devices metax -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py --devices metax -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml moore: image: From 21a68bc10056584cf534ea8b0a3598e70ddb80fc Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:45:15 +0800 Subject: [PATCH 42/88] ci: install ascend build dependencies --- .github/ci_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index bc83fc39d..2da20b3a8 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -147,7 +147,7 @@ platforms: - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro env: ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest - setup: pip install .[dev] --no-build-isolation + setup: pip install scikit-build-core pybind11 libclang && pip install .[dev] --no-build-isolation jobs: npu: type: unittest From 3b7a9c8fed87b7d02652219da7e2541b43bfb242 Mon Sep 17 00:00:00 2001 From: Vincent777 <140055255+Vincent777@users.noreply.github.com> Date: Wed, 13 May 2026 04:45:59 +0000 Subject: [PATCH 43/88] ci: rerun iluvatar after scheduler fix From d7f1dcc6cfdc62e56cd72bd37cd9939708cb117e Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 04:47:53 +0000 Subject: [PATCH 44/88] ci: rerun metax quick subset From b3d7cc6428d0bc90a8ace81eaa2032f9816daf4f Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 12:49:06 +0800 Subject: [PATCH 45/88] ci: rerun with safe matrix output From 5bb580d309c3a6d4521ec55bc3370c365c85ff21 Mon Sep 17 00:00:00 2001 From: Vincent777 <140055255+Vincent777@users.noreply.github.com> Date: Wed, 13 May 2026 04:49:44 +0000 Subject: [PATCH 46/88] ci: rerun after matrix output fix From 33f422eb03bb0c78aa9a44003aae90cfb9b17747 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 04:51:40 +0000 Subject: [PATCH 47/88] ci: rerun after matrix output fix From 3fece9e56cde821f1dca091c9c76ae561cc31330 Mon Sep 17 00:00:00 2001 From: Vincent777 <140055255+Vincent777@users.noreply.github.com> Date: Wed, 13 May 2026 04:54:32 +0000 Subject: [PATCH 48/88] ci: rerun iluvatar after report fix From 8382566b12ca6870ba8d4ddda7fba80e795e91c9 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 13:01:01 +0800 Subject: [PATCH 49/88] ci: rerun ascend accepting docker 137 From 625b6005a3462192550a4e96cc47d3ea8e821b98 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 05:01:46 +0000 Subject: [PATCH 50/88] ci: limit metax online smoke cases --- .github/ci_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 2da20b3a8..a55812bd8 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -79,7 +79,7 @@ platforms: timeout: 300 stages: - name: test - run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py --devices metax -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest 'tests/test_add.py::test_add[cuda-0-dtype0-1e-07-1e-07-shape0-None-None-None]' 'tests/test_gemm.py::test_gemm[cuda-0-dtype0-0.001-0.001-False-False--1-1-a_shape0-b_shape0-c_shape0-None-None-None]' 'tests/test_swiglu.py::test_swiglu[cuda-0-dtype0-1e-07-1e-07-shape0-None-None-None]' --devices metax -v --tb=short --junitxml=/workspace/results/test-results.xml moore: image: From d448699665832938c278d2567e383e699a721185 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 05:11:18 +0000 Subject: [PATCH 51/88] ci: rerun metax after busy gpu filter From 649bb2dc3cbe1762d6977f6905c8a96078961329 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 13:36:35 +0800 Subject: [PATCH 52/88] ci: rerun full ci online From b61b10d4157de9e75a96dd4c202d8df3f6f4d240 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 13:57:48 +0800 Subject: [PATCH 53/88] ci: address pr feedback --- .github/ci_config.yml | 11 +++++------ .github/workflows/ci_test.yml | 2 +- third_party/flashinfer | 1 - 3 files changed, 6 insertions(+), 8 deletions(-) delete mode 160000 third_party/flashinfer diff --git a/.github/ci_config.yml b/.github/ci_config.yml index a55812bd8..7dafce526 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -76,10 +76,10 @@ platforms: gpu_style: none # MetaX: passthrough via --privileged, CUDA_VISIBLE_DEVICES controls visibility memory: 32GB shm_size: 16g - timeout: 300 + timeout: 3600 stages: - name: test - run: pytest 'tests/test_add.py::test_add[cuda-0-dtype0-1e-07-1e-07-shape0-None-None-None]' 'tests/test_gemm.py::test_gemm[cuda-0-dtype0-0.001-0.001-False-False--1-1-a_shape0-b_shape0-c_shape0-None-None-None]' 'tests/test_swiglu.py::test_swiglu[cuda-0-dtype0-1e-07-1e-07-shape0-None-None-None]' --devices metax -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices metax -n 1 -v --tb=short --junitxml=/workspace/results/test-results.xml moore: image: @@ -102,7 +102,7 @@ platforms: timeout: 3600 stages: - name: test - run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py --devices moore -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices moore -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml cambricon: image: @@ -124,13 +124,12 @@ platforms: timeout: 3600 stages: - name: test - run: pytest tests/test_gemm.py --devices cambricon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices cambricon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml ascend: runner_label: Ascend image: dockerfile: images/ascend/ - skip_build: true build_args: BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler PIP_INDEX_URL: https://pypi.org/simple @@ -147,7 +146,7 @@ platforms: - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro env: ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest - setup: pip install scikit-build-core pybind11 libclang && pip install .[dev] --no-build-isolation + setup: pip install .[dev] --no-build-isolation jobs: npu: type: unittest diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index b62587e67..0cd3a7548 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -21,7 +21,7 @@ on: - ascend - all push: - branches: ["master", "ci/ci-online"] + branches: ["master"] pull_request: branches: ["master", "ci/ci-online"] diff --git a/third_party/flashinfer b/third_party/flashinfer deleted file mode 160000 index a1166dc01..000000000 --- a/third_party/flashinfer +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a1166dc0169b479aa3220b61759547d04c64e473 From f44f9af43b5b68ea1ae97775389236c6f2776285 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 14:04:41 +0800 Subject: [PATCH 54/88] ci: use prebuilt ascend test image --- .github/ci_config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 7dafce526..907f94ba4 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -130,6 +130,8 @@ platforms: runner_label: Ascend image: dockerfile: images/ascend/ + skip_build: true + source_image: infiniops-ci/ascend:prebuilt build_args: BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler PIP_INDEX_URL: https://pypi.org/simple From e5b6c23e78db6b2a99f1ed1fe2b60b455c06c9ce Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 14:10:56 +0800 Subject: [PATCH 55/88] test: generate fallback randint data on cpu --- tests/utils.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 982d05aec..30ba0aba0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -74,10 +74,22 @@ def rand_strided(shape, strides, *, dtype=None, device=None): def randint_strided(low, high, shape, strides, *, dtype=None, device=None): output = empty_strided(shape, strides, dtype=dtype, device=device) - - output.as_strided( + output_flat = output.as_strided( (output.untyped_storage().size() // output.element_size(),), (1,) - ).random_(low, high) + ) + + try: + output_flat.random_(low, high) + except RuntimeError as exc: + if "random_" not in str(exc): + raise + + cpu_output = empty_strided(shape, strides, dtype=dtype, device="cpu") + cpu_flat = cpu_output.as_strided( + (cpu_output.untyped_storage().size() // cpu_output.element_size(),), (1,) + ) + cpu_flat.random_(low, high) + output_flat.copy_(cpu_flat.to(device=output.device)) return output From fae660a5670361bde10af7a994b02e5871bc54ef Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 14:16:26 +0800 Subject: [PATCH 56/88] test: format gemm skip reason as markdown --- tests/test_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gemm.py b/tests/test_gemm.py index dbcb0d943..d86dfb28f 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -82,7 +82,7 @@ def test_gemm( and b_strides == (4096, 1) and c_strides == (4096, 1) ): - pytest.skip("Gemm impl=2 on Moore is unstable for padded strides") + pytest.skip("`Gemm` impl=2 on Moore is unstable for padded strides") a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device) From c8e399a9b61c607332f509562766acb4a1d60ac5 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 14:27:28 +0800 Subject: [PATCH 57/88] ci: build ascend test image from dockerfile --- .github/ci_config.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 907f94ba4..7dafce526 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -130,8 +130,6 @@ platforms: runner_label: Ascend image: dockerfile: images/ascend/ - skip_build: true - source_image: infiniops-ci/ascend:prebuilt build_args: BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler PIP_INDEX_URL: https://pypi.org/simple From 1d59fdf932c9b16b57c9da412b4e036194c200ff Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 15:12:25 +0800 Subject: [PATCH 58/88] ci: update ci tooling submodule --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 0aa2c3666..9ab235fec 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 0aa2c366657c79366ad9fd011018d8f3dc1515ac +Subproject commit 9ab235fecec0ff1b221ca770a099040ab1aedbc8 From 945389d7b24ecf9d67edaa9179605e97f1a50342 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 15:17:19 +0800 Subject: [PATCH 59/88] ci: opt ascend into buildkit --- .ci | 2 +- .github/ci_config.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.ci b/.ci index 9ab235fec..28d9675c0 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 9ab235fecec0ff1b221ca770a099040ab1aedbc8 +Subproject commit 28d9675c0f92706210a1249671ab3cad11064412 diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 7dafce526..6f3a84cb8 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -130,6 +130,7 @@ platforms: runner_label: Ascend image: dockerfile: images/ascend/ + buildkit: true build_args: BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler PIP_INDEX_URL: https://pypi.org/simple From 1923774a0c53b87da21b12e916ccb8ab4d998a15 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 15:31:00 +0800 Subject: [PATCH 60/88] ci: keep default repo branch on master --- .github/ci_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 6f3a84cb8..835496f41 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -1,6 +1,6 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git - branch: ci/ci-online + branch: master github: status_context_prefix: "ci/infiniops" From e2ee283fa306d8fc3ea463c5aca3829a30d9167f Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 15:39:00 +0800 Subject: [PATCH 61/88] ci: run ascend tests on free npu --- .github/ci_config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 835496f41..a7c272204 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -137,7 +137,7 @@ platforms: docker_args: - "--runtime=runc" - "--privileged" - - "--device=/dev/davinci0" + - "--device=/dev/davinci1" - "--device=/dev/davinci_manager" - "--device=/dev/devmm_svm" - "--device=/dev/hisi_hdc" @@ -146,6 +146,7 @@ platforms: - /usr/local/dcmi:/usr/local/dcmi:ro - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro env: + ASCEND_VISIBLE_DEVICES: "1" ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest setup: pip install .[dev] --no-build-isolation jobs: From 8d0062365ca842f0cc17fb19c6c033dd987bdc19 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 15:43:48 +0800 Subject: [PATCH 62/88] ci: let ascend pick an available npu --- .ci | 2 +- .github/ci_config.yml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.ci b/.ci index 28d9675c0..687008acb 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 28d9675c0f92706210a1249671ab3cad11064412 +Subproject commit 687008acba9295dee125882be3ffb79bd0772337 diff --git a/.github/ci_config.yml b/.github/ci_config.yml index a7c272204..c2d54dac3 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -137,7 +137,6 @@ platforms: docker_args: - "--runtime=runc" - "--privileged" - - "--device=/dev/davinci1" - "--device=/dev/davinci_manager" - "--device=/dev/devmm_svm" - "--device=/dev/hisi_hdc" @@ -146,7 +145,6 @@ platforms: - /usr/local/dcmi:/usr/local/dcmi:ro - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro env: - ASCEND_VISIBLE_DEVICES: "1" ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest setup: pip install .[dev] --no-build-isolation jobs: From ed28d44980f82968f5242b87a544cdc593532ff1 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 15:46:59 +0800 Subject: [PATCH 63/88] ci: update dynamic ascend allocation tooling --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 687008acb..ca4f6dfad 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 687008acba9295dee125882be3ffb79bd0772337 +Subproject commit ca4f6dfad695194a0789bd7c28acd0eda88227d3 From 131bc2e0a598ca741821691e3ef45c6b4a798748 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 15:51:55 +0800 Subject: [PATCH 64/88] ci: update ascend npu allocation parser --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index ca4f6dfad..8cae8c8c9 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit ca4f6dfad695194a0789bd7c28acd0eda88227d3 +Subproject commit 8cae8c8c9f37fe68d7bca47581b3bad87474eba5 From 22be9e012aaf94d78ad647dc67c2312f725a4606 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 16:10:53 +0800 Subject: [PATCH 65/88] ci: update ascend logical device mapping --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 8cae8c8c9..05f26257f 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 8cae8c8c9f37fe68d7bca47581b3bad87474eba5 +Subproject commit 05f26257fbc0903ff2a42255bc2b4821e8b7e406 From b4d12419f57f6ec49a71779af5e65c47c391b5bb Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 16:21:13 +0800 Subject: [PATCH 66/88] ci: use nvidia base compatible with runner --- .github/ci_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index c2d54dac3..8220fe095 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -10,7 +10,7 @@ platforms: image: dockerfile: images/nvidia/ build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.12-py3 setup: pip install .[dev] --no-build-isolation jobs: gpu: From da6c5822978e814cf696554146c0a502a9537df7 Mon Sep 17 00:00:00 2001 From: zkjh Date: Wed, 13 May 2026 16:32:43 +0800 Subject: [PATCH 67/88] ci: align nvidia test command with master --- .github/ci_config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 8220fe095..46406f1cd 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -10,7 +10,7 @@ platforms: image: dockerfile: images/nvidia/ build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:24.12-py3 + BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 setup: pip install .[dev] --no-build-isolation jobs: gpu: @@ -24,7 +24,7 @@ platforms: # MY_VAR: value stages: - name: test - run: pytest tests/ --devices nvidia -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml iluvatar: image: dockerfile: images/iluvatar/ From 4053480e8466455ad2be670bc4ce50de962e1317 Mon Sep 17 00:00:00 2001 From: zkjh Date: Thu, 14 May 2026 01:12:12 +0800 Subject: [PATCH 68/88] ci: run nvidia tests on compatible base image --- .ci | 2 +- .github/ci_config.yml | 6 ++++-- src/operator.h | 12 +++++++++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.ci b/.ci index 05f26257f..c5e85ad61 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 05f26257fbc0903ff2a42255bc2b4821e8b7e406 +Subproject commit c5e85ad61d96696d9020faec0cf6a0c28e542b1c diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 46406f1cd..5f9029799 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -10,7 +10,9 @@ platforms: image: dockerfile: images/nvidia/ build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + SKIP_APT: "1" + PIP_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple setup: pip install .[dev] --no-build-isolation jobs: gpu: @@ -24,7 +26,7 @@ platforms: # MY_VAR: value stages: - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices nvidia -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml iluvatar: image: dockerfile: images/iluvatar/ diff --git a/src/operator.h b/src/operator.h index 83fc4ec2c..9f3c4a3bd 100644 --- a/src/operator.h +++ b/src/operator.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -142,6 +143,7 @@ class Operator : public OperatorBase { template static auto Make(const Config& config, const Tensor tensor, Args&&... args) { std::unique_ptr op_ptr; + auto args_tuple = std::forward_as_tuple(args...); DispatchFunc>( tensor.device().type(), @@ -155,9 +157,13 @@ class Operator : public OperatorBase { if constexpr (std::is_constructible_v< Operator, const Tensor&, Args...>) { - op_ptr = std::make_unique< - Operator>( - tensor, std::forward(args)...); + std::apply( + [&](auto&... op_args) { + op_ptr = std::make_unique< + Operator>( + tensor, op_args...); + }, + args_tuple); } else { assert(false && "operator is not implemented for this device and " From d947c3fcee30f42fac590fb39c202b1f99de74d8 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 02:26:34 +0000 Subject: [PATCH 69/88] ci: address review comments --- .github/ci_config.yml | 10 +++++----- src/operator.h | 12 +++--------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 5f9029799..d772fa16c 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -18,9 +18,9 @@ platforms: gpu: type: unittest resources: - ngpus: 1 # Auto allocator picks this many free GPUs + ngpus: 1 # Auto allocator picks this many free GPUs. memory: 32GB - shm_size: 16g # Prevent PyTorch default 64MB shared memory limit + shm_size: 16g # Prevent PyTorch default 64MB shared memory limit. timeout: 3600 # env: # Uncomment to inject extra env vars into the container. # MY_VAR: value @@ -75,7 +75,7 @@ platforms: type: unittest resources: ngpus: 1 - gpu_style: none # MetaX: passthrough via --privileged, CUDA_VISIBLE_DEVICES controls visibility + gpu_style: none # MetaX uses passthrough via `--privileged`; `CUDA_VISIBLE_DEVICES` controls visibility. memory: 32GB shm_size: 16g timeout: 3600 @@ -98,7 +98,7 @@ platforms: type: unittest resources: ngpus: 1 - gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES controls visibility + gpu_style: none # Moore uses passthrough via `--privileged`; `MTHREADS_VISIBLE_DEVICES` controls visibility. memory: 32GB shm_size: 16g timeout: 3600 @@ -120,7 +120,7 @@ platforms: type: unittest resources: ngpus: 1 - gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control + gpu_style: mlu # Cambricon uses passthrough via `--privileged`; `MLU_VISIBLE_DEVICES` controls device selection. memory: 32GB shm_size: 16g timeout: 3600 diff --git a/src/operator.h b/src/operator.h index 9f3c4a3bd..83fc4ec2c 100644 --- a/src/operator.h +++ b/src/operator.h @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -143,7 +142,6 @@ class Operator : public OperatorBase { template static auto Make(const Config& config, const Tensor tensor, Args&&... args) { std::unique_ptr op_ptr; - auto args_tuple = std::forward_as_tuple(args...); DispatchFunc>( tensor.device().type(), @@ -157,13 +155,9 @@ class Operator : public OperatorBase { if constexpr (std::is_constructible_v< Operator, const Tensor&, Args...>) { - std::apply( - [&](auto&... op_args) { - op_ptr = std::make_unique< - Operator>( - tensor, op_args...); - }, - args_tuple); + op_ptr = std::make_unique< + Operator>( + tensor, std::forward(args)...); } else { assert(false && "operator is not implemented for this device and " From cae856a6dc1bf25732b4ea0767b56204b12d2d95 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 02:29:34 +0000 Subject: [PATCH 70/88] fix: restore operator argument dispatch --- src/operator.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/operator.h b/src/operator.h index 83fc4ec2c..9f3c4a3bd 100644 --- a/src/operator.h +++ b/src/operator.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -142,6 +143,7 @@ class Operator : public OperatorBase { template static auto Make(const Config& config, const Tensor tensor, Args&&... args) { std::unique_ptr op_ptr; + auto args_tuple = std::forward_as_tuple(args...); DispatchFunc>( tensor.device().type(), @@ -155,9 +157,13 @@ class Operator : public OperatorBase { if constexpr (std::is_constructible_v< Operator, const Tensor&, Args...>) { - op_ptr = std::make_unique< - Operator>( - tensor, std::forward(args)...); + std::apply( + [&](auto&... op_args) { + op_ptr = std::make_unique< + Operator>( + tensor, op_args...); + }, + args_tuple); } else { assert(false && "operator is not implemented for this device and " From 4a487f3a950207aa9df3faab513f187d0f03fdbf Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 02:47:47 +0000 Subject: [PATCH 71/88] ci: update moore resource locking --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index c5e85ad61..4695e258c 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit c5e85ad61d96696d9020faec0cf6a0c28e542b1c +Subproject commit 4695e258c511976e5ab8c1d7473f2907c72f082c From 6ab4c24ef829a550cf61449992444afccddf5250 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 03:12:04 +0000 Subject: [PATCH 72/88] ci: update scheduler stale lock cleanup --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index 4695e258c..f1eac13d1 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 4695e258c511976e5ab8c1d7473f2907c72f082c +Subproject commit f1eac13d1f6252104d5c4bf66cc46864926b8524 From 02aa490b519e7150ae34f7a7f84cc263ad13380a Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 03:37:05 +0000 Subject: [PATCH 73/88] ci: update nvidia gpu allocation --- .ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci b/.ci index f1eac13d1..48cd48a00 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit f1eac13d1f6252104d5c4bf66cc46864926b8524 +Subproject commit 48cd48a003b9362aaebda1e36ee4c42a029248d8 From 84567bbb7883e738e12cb15b6dfc2c68a5622c2f Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 04:53:01 +0000 Subject: [PATCH 74/88] ci: add v2 shadow workflow --- .ci | 2 +- .github/ci_config.yml | 23 +++++++++++++++++++ .github/workflows/ci_v2_shadow.yml | 36 ++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci_v2_shadow.yml diff --git a/.ci b/.ci index 48cd48a00..193251dac 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 48cd48a003b9362aaebda1e36ee4c42a029248d8 +Subproject commit 193251daca4140d4635df2cdde8268573ef994b7 diff --git a/.github/ci_config.yml b/.github/ci_config.yml index d772fa16c..8b986a4ab 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -7,6 +7,8 @@ github: platforms: nvidia: + runner_label: nvidia + execution_mode: agent_local image: dockerfile: images/nvidia/ build_args: @@ -22,12 +24,16 @@ platforms: memory: 32GB shm_size: 16g # Prevent PyTorch default 64MB shared memory limit. timeout: 3600 + queue_timeout: 1800 + junit_path: test-results.xml # env: # Uncomment to inject extra env vars into the container. # MY_VAR: value stages: - name: test run: pytest tests/ --devices nvidia -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml iluvatar: + runner_label: iluvatar + execution_mode: agent_local image: dockerfile: images/iluvatar/ build_args: @@ -54,11 +60,15 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 + queue_timeout: 1800 + junit_path: test-results.xml stages: - name: test run: pytest tests/ --devices iluvatar -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml metax: + runner_label: metax + execution_mode: agent_local image: dockerfile: images/metax/ build_args: @@ -79,11 +89,15 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 + queue_timeout: 1800 + junit_path: test-results.xml stages: - name: test run: pytest tests/ --devices metax -n 1 -v --tb=short --junitxml=/workspace/results/test-results.xml moore: + runner_label: moore + execution_mode: agent_local image: dockerfile: images/moore/ build_args: @@ -102,11 +116,15 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 + queue_timeout: 1800 + junit_path: test-results.xml stages: - name: test run: pytest tests/ --devices moore -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml cambricon: + runner_label: cambricon + execution_mode: agent_local image: dockerfile: images/cambricon/ build_args: @@ -124,12 +142,15 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 + queue_timeout: 1800 + junit_path: test-results.xml stages: - name: test run: pytest tests/ --devices cambricon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml ascend: runner_label: Ascend + execution_mode: agent_local image: dockerfile: images/ascend/ buildkit: true @@ -158,6 +179,8 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 + queue_timeout: 1800 + junit_path: test-results.xml stages: - name: test run: pytest tests/ -n 1 --devices ascend -v --tb=short --junitxml=/workspace/results/test-results.xml diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml new file mode 100644 index 000000000..0d4e9a8f2 --- /dev/null +++ b/.github/workflows/ci_v2_shadow.yml @@ -0,0 +1,36 @@ +name: CI v2 Shadow + +concurrency: + group: ci-v2-shadow-${{ github.workflow }}-${{ github.head_ref || github.ref_name }} + cancel-in-progress: true + +on: + workflow_dispatch: + inputs: + platform: + description: "Platform to run" + type: choice + required: true + default: nvidia + options: + - nvidia + - iluvatar + - metax + - moore + - cambricon + - ascend + - all + push: + branches: ["master"] + pull_request: + branches: ["master", "ci/ci-online"] + +jobs: + ci-v2-shadow: + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@193251daca4140d4635df2cdde8268573ef994b7 + with: + config_path: .github/ci_config.yml + ci_ref: 193251daca4140d4635df2cdde8268573ef994b7 + max_parallel: 10 + platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon,ascend' }} + secrets: inherit From 5c2f10d759999a107195476cf4379f14b8f2fdfe Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 04:57:59 +0000 Subject: [PATCH 75/88] ci: handle unavailable v2 shadow agents --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 193251dac..8038e6528 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 193251daca4140d4635df2cdde8268573ef994b7 +Subproject commit 8038e6528c28c134e6e3b3e919d1e959094626ee diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index 0d4e9a8f2..d02369b14 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -27,10 +27,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@193251daca4140d4635df2cdde8268573ef994b7 + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@8038e6528c28c134e6e3b3e919d1e959094626ee with: config_path: .github/ci_config.yml - ci_ref: 193251daca4140d4635df2cdde8268573ef994b7 + ci_ref: 8038e6528c28c134e6e3b3e919d1e959094626ee max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon,ascend' }} secrets: inherit From d406dc7c8c98d0175283136884281363f72fdb53 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 05:11:58 +0000 Subject: [PATCH 76/88] ci: add v2 agent installer --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 8038e6528..4c40d1234 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 8038e6528c28c134e6e3b3e919d1e959094626ee +Subproject commit 4c40d123407e0fe013bd4ab9922e82cd1d92b2f4 diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index d02369b14..063c29df1 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -27,10 +27,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@8038e6528c28c134e6e3b3e919d1e959094626ee + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@4c40d123407e0fe013bd4ab9922e82cd1d92b2f4 with: config_path: .github/ci_config.yml - ci_ref: 8038e6528c28c134e6e3b3e919d1e959094626ee + ci_ref: 4c40d123407e0fe013bd4ab9922e82cd1d92b2f4 max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon,ascend' }} secrets: inherit From 679dd8957c1b827847ab8ab82280712899f7604c Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 07:22:26 +0000 Subject: [PATCH 77/88] ci: match v2 runner labels --- .github/ci_config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 8b986a4ab..5a21a6c48 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -7,7 +7,7 @@ github: platforms: nvidia: - runner_label: nvidia + runner_label: Nvidia execution_mode: agent_local image: dockerfile: images/nvidia/ @@ -67,7 +67,7 @@ platforms: run: pytest tests/ --devices iluvatar -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml metax: - runner_label: metax + runner_label: Metax execution_mode: agent_local image: dockerfile: images/metax/ @@ -96,7 +96,7 @@ platforms: run: pytest tests/ --devices metax -n 1 -v --tb=short --junitxml=/workspace/results/test-results.xml moore: - runner_label: moore + runner_label: Moore execution_mode: agent_local image: dockerfile: images/moore/ @@ -123,7 +123,7 @@ platforms: run: pytest tests/ --devices moore -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml cambricon: - runner_label: cambricon + runner_label: Cambricon execution_mode: agent_local image: dockerfile: images/cambricon/ From 7893086ce02675c41171d57934613a4696a5cba1 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 07:55:28 +0000 Subject: [PATCH 78/88] ci: enforce v2 shadow checks --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 4c40d1234..0b4e5fb39 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 4c40d123407e0fe013bd4ab9922e82cd1d92b2f4 +Subproject commit 0b4e5fb3990a5822cc35b1ee84d3d4112f14e7ca diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index 063c29df1..b095e3d1c 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -27,10 +27,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@4c40d123407e0fe013bd4ab9922e82cd1d92b2f4 + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@0b4e5fb3990a5822cc35b1ee84d3d4112f14e7ca with: config_path: .github/ci_config.yml - ci_ref: 4c40d123407e0fe013bd4ab9922e82cd1d92b2f4 + ci_ref: 0b4e5fb3990a5822cc35b1ee84d3d4112f14e7ca max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon,ascend' }} secrets: inherit From 43509f1e37af1f709754fa1936b18f7984454865 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 08:39:50 +0000 Subject: [PATCH 79/88] ci: update v2 runner user agent --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 0b4e5fb39..2b1497aa9 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 0b4e5fb3990a5822cc35b1ee84d3d4112f14e7ca +Subproject commit 2b1497aa957d6b63aaf999dd45ce0be267a5373e diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index b095e3d1c..19d674f53 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -27,10 +27,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@0b4e5fb3990a5822cc35b1ee84d3d4112f14e7ca + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@2b1497aa957d6b63aaf999dd45ce0be267a5373e with: config_path: .github/ci_config.yml - ci_ref: 0b4e5fb3990a5822cc35b1ee84d3d4112f14e7ca + ci_ref: 2b1497aa957d6b63aaf999dd45ce0be267a5373e max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon,ascend' }} secrets: inherit From 72a8094a9749ef1674abdf0558342a6b6e116cc4 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 09:17:21 +0000 Subject: [PATCH 80/88] ci: default v2 shadow to active platforms --- .github/workflows/ci_v2_shadow.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index 19d674f53..af8d3e590 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -11,8 +11,9 @@ on: description: "Platform to run" type: choice required: true - default: nvidia + default: active options: + - active - nvidia - iluvatar - metax @@ -32,5 +33,5 @@ jobs: config_path: .github/ci_config.yml ci_ref: 2b1497aa957d6b63aaf999dd45ce0be267a5373e max_parallel: 10 - platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,iluvatar,metax,moore,cambricon,ascend' }} + platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform == 'active' && 'nvidia,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,metax,moore,cambricon,ascend' }} secrets: inherit From 4bcd54a531f65ef692e25497ce417742d7086222 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 09:28:13 +0000 Subject: [PATCH 81/88] ci: limit v2 agent queue wait to ten minutes --- .github/ci_config.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 5a21a6c48..cab0e96a6 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -24,7 +24,7 @@ platforms: memory: 32GB shm_size: 16g # Prevent PyTorch default 64MB shared memory limit. timeout: 3600 - queue_timeout: 1800 + queue_timeout: 600 junit_path: test-results.xml # env: # Uncomment to inject extra env vars into the container. # MY_VAR: value @@ -60,7 +60,7 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 - queue_timeout: 1800 + queue_timeout: 600 junit_path: test-results.xml stages: - name: test @@ -89,7 +89,7 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 - queue_timeout: 1800 + queue_timeout: 600 junit_path: test-results.xml stages: - name: test @@ -116,7 +116,7 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 - queue_timeout: 1800 + queue_timeout: 600 junit_path: test-results.xml stages: - name: test @@ -142,7 +142,7 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 - queue_timeout: 1800 + queue_timeout: 600 junit_path: test-results.xml stages: - name: test @@ -179,7 +179,7 @@ platforms: memory: 32GB shm_size: 16g timeout: 3600 - queue_timeout: 1800 + queue_timeout: 600 junit_path: test-results.xml stages: - name: test From 36ecdaddbb160d531456050d80123e73c5381247 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 09:34:15 +0000 Subject: [PATCH 82/88] ci: use self-healing v2 agent workflow --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 2b1497aa9..0a13ba9e2 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 2b1497aa957d6b63aaf999dd45ce0be267a5373e +Subproject commit 0a13ba9e2aa3921ff7131ce444fb663131eee556 diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index af8d3e590..964b20563 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -28,10 +28,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@2b1497aa957d6b63aaf999dd45ce0be267a5373e + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@0a13ba9e2aa3921ff7131ce444fb663131eee556 with: config_path: .github/ci_config.yml - ci_ref: 2b1497aa957d6b63aaf999dd45ce0be267a5373e + ci_ref: 0a13ba9e2aa3921ff7131ce444fb663131eee556 max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform == 'active' && 'nvidia,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,metax,moore,cambricon,ascend' }} secrets: inherit From ed911f92f31dd77147045c74bd1d6c4a796bada8 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 09:41:35 +0000 Subject: [PATCH 83/88] ci: use transient state dir fallback --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 0a13ba9e2..90e16e9b3 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 0a13ba9e2aa3921ff7131ce444fb663131eee556 +Subproject commit 90e16e9b315ae529cffd85ef9a2700f62751132f diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index 964b20563..92ef05515 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -28,10 +28,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@0a13ba9e2aa3921ff7131ce444fb663131eee556 + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@90e16e9b315ae529cffd85ef9a2700f62751132f with: config_path: .github/ci_config.yml - ci_ref: 0a13ba9e2aa3921ff7131ce444fb663131eee556 + ci_ref: 90e16e9b315ae529cffd85ef9a2700f62751132f max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform == 'active' && 'nvidia,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,metax,moore,cambricon,ascend' }} secrets: inherit From 5a751afb393316e5eb6257deabc613c5091685bb Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 09:48:57 +0000 Subject: [PATCH 84/88] ci: use platform lock probe workflow --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 90e16e9b3..dd2a62d15 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 90e16e9b315ae529cffd85ef9a2700f62751132f +Subproject commit dd2a62d15b129d6b79298d967502f009c690feac diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index 92ef05515..3c2a091b1 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -28,10 +28,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@90e16e9b315ae529cffd85ef9a2700f62751132f + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@dd2a62d15b129d6b79298d967502f009c690feac with: config_path: .github/ci_config.yml - ci_ref: 90e16e9b315ae529cffd85ef9a2700f62751132f + ci_ref: dd2a62d15b129d6b79298d967502f009c690feac max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform == 'active' && 'nvidia,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,metax,moore,cambricon,ascend' }} secrets: inherit From 9e46f74dca59bb11cd6f7ea9ccc3dcaac02e84b4 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 09:55:05 +0000 Subject: [PATCH 85/88] ci: use checkout-free self-hosted workflow --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index dd2a62d15..8329a453a 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit dd2a62d15b129d6b79298d967502f009c690feac +Subproject commit 8329a453a01b90a5642eb14089b95ca68890b529 diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index 3c2a091b1..9cf26ab82 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -28,10 +28,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@dd2a62d15b129d6b79298d967502f009c690feac + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@8329a453a01b90a5642eb14089b95ca68890b529 with: config_path: .github/ci_config.yml - ci_ref: dd2a62d15b129d6b79298d967502f009c690feac + ci_ref: 8329a453a01b90a5642eb14089b95ca68890b529 max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform == 'active' && 'nvidia,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,metax,moore,cambricon,ascend' }} secrets: inherit From a78f760fef62bc867ebe815e9a5d6bdf2fbe3fb2 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 10:05:38 +0000 Subject: [PATCH 86/88] ci: use nested junit result detection --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 8329a453a..f8d6bfcb6 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 8329a453a01b90a5642eb14089b95ca68890b529 +Subproject commit f8d6bfcb63d569c16b1dd15cb02c6b374cc6160d diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index 9cf26ab82..cd19c03ef 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -28,10 +28,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@8329a453a01b90a5642eb14089b95ca68890b529 + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@f8d6bfcb63d569c16b1dd15cb02c6b374cc6160d with: config_path: .github/ci_config.yml - ci_ref: 8329a453a01b90a5642eb14089b95ca68890b529 + ci_ref: f8d6bfcb63d569c16b1dd15cb02c6b374cc6160d max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform == 'active' && 'nvidia,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,metax,moore,cambricon,ascend' }} secrets: inherit From 77c98a9004ee296fd066292fce7a8bb26da098da Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 10:16:49 +0000 Subject: [PATCH 87/88] ci: use per-job checked-out agent --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index f8d6bfcb6..16d9e363f 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit f8d6bfcb63d569c16b1dd15cb02c6b374cc6160d +Subproject commit 16d9e363fa8b5b39b7e825bd64706a2b5b6511a0 diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index cd19c03ef..e1d00f273 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -28,10 +28,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@f8d6bfcb63d569c16b1dd15cb02c6b374cc6160d + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@16d9e363fa8b5b39b7e825bd64706a2b5b6511a0 with: config_path: .github/ci_config.yml - ci_ref: f8d6bfcb63d569c16b1dd15cb02c6b374cc6160d + ci_ref: 16d9e363fa8b5b39b7e825bd64706a2b5b6511a0 max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform == 'active' && 'nvidia,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,metax,moore,cambricon,ascend' }} secrets: inherit From aa4f119d1c52db024a6b77dd5458922302be81d4 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 14 May 2026 10:31:51 +0000 Subject: [PATCH 88/88] ci: use metax resource allocation fix --- .ci | 2 +- .github/workflows/ci_v2_shadow.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci b/.ci index 16d9e363f..c7a8cc4c2 160000 --- a/.ci +++ b/.ci @@ -1 +1 @@ -Subproject commit 16d9e363fa8b5b39b7e825bd64706a2b5b6511a0 +Subproject commit c7a8cc4c2e3d98e0ce8c63a54b161170bf26b4ed diff --git a/.github/workflows/ci_v2_shadow.yml b/.github/workflows/ci_v2_shadow.yml index e1d00f273..070324817 100644 --- a/.github/workflows/ci_v2_shadow.yml +++ b/.github/workflows/ci_v2_shadow.yml @@ -28,10 +28,10 @@ on: jobs: ci-v2-shadow: - uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@16d9e363fa8b5b39b7e825bd64706a2b5b6511a0 + uses: InfiniTensor/ci/.github/workflows/infiniops-ci-v2-shadow.yml@c7a8cc4c2e3d98e0ce8c63a54b161170bf26b4ed with: config_path: .github/ci_config.yml - ci_ref: 16d9e363fa8b5b39b7e825bd64706a2b5b6511a0 + ci_ref: c7a8cc4c2e3d98e0ce8c63a54b161170bf26b4ed max_parallel: 10 platform: ${{ github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' && 'nvidia,iluvatar,metax,moore,cambricon,ascend' || inputs.platform == 'active' && 'nvidia,metax,moore,cambricon,ascend' || inputs.platform) || 'nvidia,metax,moore,cambricon,ascend' }} secrets: inherit