diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ddfaa170..caa00ff3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,9 +15,9 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Run linter uses: golangci/golangci-lint-action@v8 with: - version: v2.1.5 + version: v2.12.2 diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 8949c76b..5dcc90bb 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -18,6 +18,7 @@ jobs: secrets: inherit publish-kustomize-bundles: + needs: publish-container-image permissions: id-token: write contents: read @@ -26,4 +27,6 @@ jobs: with: bundle-name: ghcr.io/datum-cloud/compute-kustomize bundle-path: config + image-name: ghcr.io/datum-cloud/compute + image-overlays: config/base/manager secrets: inherit diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 8429bf2d..9bede775 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Install the latest version of kind run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 834d33a0..462cbf3d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v5 with: - go-version: '~1.24.0' + go-version: '~1.25.0' - name: Running Tests run: | diff --git a/.gitignore b/.gitignore index 2b0c6e44..d5cc564d 100644 --- a/.gitignore +++ b/.gitignore @@ -14,8 +14,8 @@ # Output of the go coverage tool, specifically when used with LiteIDE *.out -# Dependency directories (remove the comment below to include it) -# vendor/ +# Dependency directories +vendor/ # Go workspace file go.work @@ -25,3 +25,6 @@ go.work.sum .env bin/ + +# Local e2e environment 
artefacts (Kind kubeconfigs, etc.) +tmp/ diff --git a/.golangci.yml b/.golangci.yml index a7246fbb..f5834e3c 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -35,6 +35,16 @@ linters: - dupl - lll path: internal/* + # field.ErrorList{} is the idiomatic Kubernetes validation init pattern; + # preallocating requires knowing the error count in advance which is not + # possible in recursive validation helpers. + - linters: + - prealloc + path: internal/validation/ + # Test helpers that build slices via append are clearer without prealloc. + - linters: + - prealloc + path: internal/controller/instancecontrol/ paths: - third_party$ - builtin$ diff --git a/Makefile b/Makefile index 61744a36..3d6a3e2e 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ KUSTOMIZE_VERSION ?= v5.5.0 CONTROLLER_TOOLS_VERSION ?= v0.16.4 DEFAULTER_GEN_VERSION ?= v0.32.3 ENVTEST_VERSION ?= release-0.19 -GOLANGCI_LINT_VERSION ?= v2.1.5 +GOLANGCI_LINT_VERSION ?= v2.12.2 # renovate: datasource=go depName=fybrik.io/crdoc CRDOC_VERSION ?= v0.6.4 diff --git a/Taskfile.yaml b/Taskfile.yaml new file mode 100644 index 00000000..bcfbb0f8 --- /dev/null +++ b/Taskfile.yaml @@ -0,0 +1,481 @@ +version: '3' + +# ─── Variables ────────────────────────────────────────────────────────────── + +vars: + # Karmada Helm chart version to install (karmada-charts/karmada) + KARMADA_VERSION: v1.16.0 + + # karmadactl CLI version for cluster registration + KARMADACTL_VERSION: v1.16.0 + + # Chainsaw version for e2e testing (kyverno/chainsaw) + CHAINSAW_VERSION: v0.2.15 + + # Local tool directory (mirrors Makefile convention) + LOCALBIN: '{{.ROOT_DIR}}/bin' + KARMADACTL: '{{.ROOT_DIR}}/bin/karmadactl' + CHAINSAW: '{{.ROOT_DIR}}/bin/chainsaw' + + # Kind cluster names + KIND_CONTROL_PLANE: compute-control-plane + KIND_POP_DFW: compute-pop-dfw + KIND_POP_ORD: compute-pop-ord + + # All cluster names (for CRD installation loops) + KIND_ALL_CLUSTERS: '{{.KIND_CONTROL_PLANE}} {{.KIND_POP_DFW}} {{.KIND_POP_ORD}}' + + # Working 
directory for e2e artefacts (gitignored) + E2E_DIR: '{{.ROOT_DIR}}/tmp/e2e' + KUBECONFIG_DIR: '{{.ROOT_DIR}}/tmp/e2e/kubeconfigs' + + # Fixed NodePort for the Karmada API server. + # The Kind management cluster is created with an extraPortMapping for this port + # so it is reachable at https://localhost:32443 from the developer's machine. + KARMADA_API_NODEPORT: "32443" + +# ─── Tasks ────────────────────────────────────────────────────────────────── + +tasks: + + default: + cmds: + - task --list + silent: true + + # ════════════════════════════════════════════════════════════════════════ + # e2e environment lifecycle + # ════════════════════════════════════════════════════════════════════════ + + e2e:up: + desc: "Create the full local Kind+Karmada e2e environment (idempotent)" + cmds: + - task: e2e:tools + - task: e2e:clusters:create + - task: e2e:karmada:install + - task: e2e:karmada:configure + - task: e2e:karmada:join-clusters + - task: e2e:crds:install + - cmd: | + echo "" + echo "╔══════════════════════════════════════════════════════════╗" + echo "║ e2e environment ready ║" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Control plane: {{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "║ Karmada API: {{.KUBECONFIG_DIR}}/karmada.yaml" + echo "║ POP DFW: {{.KUBECONFIG_DIR}}/pop-dfw.yaml" + echo "║ POP ORD: {{.KUBECONFIG_DIR}}/pop-ord.yaml" + echo "╠══════════════════════════════════════════════════════════╣" + echo "║ Export for kubectl: ║" + echo "║ export KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml" + echo "╚══════════════════════════════════════════════════════════╝" + silent: false + + e2e:down: + desc: "Tear down the local e2e environment" + cmds: + - kind delete cluster --name {{.KIND_CONTROL_PLANE}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_DFW}} 2>/dev/null || true + - kind delete cluster --name {{.KIND_POP_ORD}} 2>/dev/null || true + - rm -rf {{.E2E_DIR}} + - cmd: echo "✓ e2e environment torn 
down" + silent: false + + e2e:test: + desc: "Run Chainsaw e2e tests against the local Kind+Karmada environment" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + test/e2e/ \ + {{.CLI_ARGS}} + + e2e:test:filter: + desc: "Run a subset of e2e tests by name regex (e.g. task e2e:test:filter -- --include-test-regex federation)" + deps: [e2e:tools:chainsaw] + cmds: + - | + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.CHAINSAW}} test \ + --config test/e2e/chainsaw-config.yaml \ + {{.CLI_ARGS}} \ + test/e2e/ + + # ════════════════════════════════════════════════════════════════════════ + # Tool installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:tools: + desc: "Install e2e-specific tooling (karmadactl, chainsaw, helm repo)" + cmds: + - task: e2e:tools:karmadactl + - task: e2e:tools:chainsaw + - task: e2e:tools:helm-repo + + e2e:tools:karmadactl: + desc: "Download karmadactl {{.KARMADACTL_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! -f "{{.KARMADACTL}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/karmada-io/karmada/releases/download/{{.KARMADACTL_VERSION}}/karmadactl-${OS}-${ARCH}.tgz" + echo "Downloading karmadactl {{.KARMADACTL_VERSION}} (${OS}/${ARCH}) from ${URL}..." + curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} karmadactl + chmod +x {{.KARMADACTL}} + echo "karmadactl installed → {{.KARMADACTL}}" + else + echo "karmadactl already present at {{.KARMADACTL}}" + fi + status: + - test -f {{.KARMADACTL}} + + e2e:tools:chainsaw: + desc: "Download chainsaw {{.CHAINSAW_VERSION}}" + cmds: + - mkdir -p {{.LOCALBIN}} + - | + if [ ! 
-f "{{.CHAINSAW}}" ]; then + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') + URL="https://github.com/kyverno/chainsaw/releases/download/{{.CHAINSAW_VERSION}}/chainsaw_${OS}_${ARCH}.tar.gz" + echo "Downloading chainsaw {{.CHAINSAW_VERSION}} (${OS}/${ARCH}) from ${URL}..." + curl -sSfL "${URL}" | tar -xz -C {{.LOCALBIN}} chainsaw + chmod +x {{.CHAINSAW}} + echo "chainsaw installed → {{.CHAINSAW}}" + else + echo "chainsaw already present at {{.CHAINSAW}}" + fi + status: + - test -f {{.CHAINSAW}} + + e2e:tools:helm-repo: + desc: "Add/update karmada-charts Helm repository" + cmds: + - | + if ! helm repo list 2>/dev/null | grep -q karmada-charts; then + helm repo add karmada-charts https://raw.githubusercontent.com/karmada-io/karmada/master/charts + echo "Added karmada-charts Helm repository" + fi + helm repo update karmada-charts + status: + - helm repo list 2>/dev/null | grep -q karmada-charts + + # ════════════════════════════════════════════════════════════════════════ + # Kind cluster management + # ════════════════════════════════════════════════════════════════════════ + + e2e:clusters:create: + desc: "Create all Kind clusters (idempotent)" + cmds: + # Management / control-plane cell cluster — needs extraPortMappings for + # the Karmada API server NodePort so it is accessible at localhost:32443. + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_CONTROL_PLANE}}" + KIND_CONFIG: hack/e2e/kind-control-plane.yaml + # POP cell clusters — default Kind config is sufficient. 
+ - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + KIND_CONFIG: "" + - task: _e2e:cluster:create + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + KIND_CONFIG: "" + - mkdir -p {{.KUBECONFIG_DIR}} + - task: _e2e:kubeconfigs:export + + _e2e:cluster:create: + internal: true + cmds: + - | + if kind get clusters 2>/dev/null | grep -qx '{{.CLUSTER_NAME}}'; then + echo "Kind cluster '{{.CLUSTER_NAME}}' already exists — skipping" + else + echo "Creating Kind cluster '{{.CLUSTER_NAME}}'..." + CONFIG_FLAG="" + if [ -n "{{.KIND_CONFIG}}" ]; then + CONFIG_FLAG="--config {{.KIND_CONFIG}}" + fi + kind create cluster \ + --name {{.CLUSTER_NAME}} \ + $CONFIG_FLAG \ + --wait 90s + fi + + _e2e:kubeconfigs:export: + internal: true + desc: "Export Kind kubeconfigs and create Docker-IP variants for cross-cluster use" + cmds: + # Standard kubeconfigs (localhost-based, for developer kubectl use) + - kind export kubeconfig --name {{.KIND_CONTROL_PLANE}} --kubeconfig {{.KUBECONFIG_DIR}}/control-plane.yaml + - kind export kubeconfig --name {{.KIND_POP_DFW}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-dfw.yaml + - kind export kubeconfig --name {{.KIND_POP_ORD}} --kubeconfig {{.KUBECONFIG_DIR}}/pop-ord.yaml + # Docker-IP kubeconfigs (used by Karmada controller, running inside Docker, + # to reach POP cell API servers across the kind bridge network) + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml \ + {{.KIND_POP_DFW}} + - | + hack/e2e/make-internal-kubeconfig.sh \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord-internal.yaml \ + {{.KIND_POP_ORD}} + + # ════════════════════════════════════════════════════════════════════════ + # Karmada installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:install: + desc: "Install Karmada into the management cluster via Helm (idempotent)" + cmds: + - | + if kubectl 
--kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get ns karmada-system &>/dev/null; then + echo "Karmada already installed (karmada-system namespace exists)" + else + echo "Installing Karmada {{.KARMADA_VERSION}} via Helm..." + helm install karmada karmada-charts/karmada \ + --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + --namespace karmada-system \ + --create-namespace \ + --version {{.KARMADA_VERSION}} \ + --set apiServer.serviceType=NodePort \ + --set apiServer.nodePort={{.KARMADA_API_NODEPORT}} \ + --wait \ + --timeout 5m + echo "Karmada installed" + fi + - task: _e2e:karmada:build-kubeconfig + + e2e:karmada:configure: + desc: "Apply federation component config to the Karmada API server (idempotent)" + cmds: + - | + echo "Applying federation component to Karmada..." + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml apply \ + -k config/components/federation/ + echo "Federation component applied" + + _e2e:karmada:build-kubeconfig: + internal: true + desc: "Extract Karmada kubeconfig from secret and patch server to localhost:{{.KARMADA_API_NODEPORT}}" + cmds: + - | + echo "Building Karmada kubeconfig → {{.KUBECONFIG_DIR}}/karmada.yaml" + # Extract raw kubeconfig from the secret the Helm chart creates + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/control-plane.yaml \ + get secret karmada-kubeconfig \ + -n karmada-system \ + -o jsonpath='{.data.kubeconfig}' \ + | base64 -d > {{.KUBECONFIG_DIR}}/karmada-raw.yaml + # Rewrite the server address to the NodePort exposed on localhost + python3 - {{.KUBECONFIG_DIR}}/karmada-raw.yaml {{.KUBECONFIG_DIR}}/karmada.yaml 127.0.0.1 {{.KARMADA_API_NODEPORT}} << 'PYEOF' + import sys, yaml + + src, dst, host, port = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] + + with open(src) as f: + cfg = yaml.safe_load(f) + + for cluster in cfg.get('clusters', []): + old = cluster['cluster'].get('server', '') + cluster['cluster']['server'] = f'https://{host}:{port}' + # The cert is for the internal cluster IP, so skip 
TLS verification. + # This is a local dev-only environment. + cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + print(f" karmada server: {old} → https://{host}:{port}", file=sys.stderr) + + with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) + PYEOF + rm {{.KUBECONFIG_DIR}}/karmada-raw.yaml + + # ════════════════════════════════════════════════════════════════════════ + # POP cell cluster registration + # ════════════════════════════════════════════════════════════════════════ + + e2e:karmada:join-clusters: + desc: "Register POP cell clusters with Karmada and apply city-code labels" + cmds: + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_DFW}}" + CITY_CODE: dfw + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-dfw-internal.yaml" + - task: _e2e:karmada:join-cluster + vars: + CLUSTER_NAME: "{{.KIND_POP_ORD}}" + CITY_CODE: ord + EXTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord.yaml" + INTERNAL_KUBECONFIG: "{{.KUBECONFIG_DIR}}/pop-ord-internal.yaml" + + _e2e:karmada:join-cluster: + internal: true + cmds: + # ── Register with karmadactl join ────────────────────────────────── + # We pass the EXTERNAL kubeconfig (localhost-based) here so karmadactl + # can reach the member cluster from this macOS host to set up initial + # RBAC. The stored secret is patched below to the Docker-IP variant. + - | + if kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + get cluster {{.CLUSTER_NAME}} &>/dev/null; then + echo "Cluster '{{.CLUSTER_NAME}}' already registered in Karmada — skipping join" + else + echo "Joining '{{.CLUSTER_NAME}}' to Karmada..." 
+ {{.KARMADACTL}} join {{.CLUSTER_NAME}} \ + --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --cluster-kubeconfig={{.EXTERNAL_KUBECONFIG}} \ + --cluster-context=kind-{{.CLUSTER_NAME}} + echo "Cluster '{{.CLUSTER_NAME}}' registered" + fi + # ── Patch cluster secret → Docker-IP kubeconfig ─────────────────── + # The Karmada controller manager runs inside Docker; it cannot use + # localhost to reach POP cell API servers. We update the stored secret + # with a kubeconfig whose server address uses the Kind container IP so + # container-to-container communication works across the kind bridge. + - | + hack/e2e/patch-cluster-secret.sh \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.CLUSTER_NAME}} \ + {{.INTERNAL_KUBECONFIG}} + # ── Apply city-code label ────────────────────────────────────────── + - | + kubectl --kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + label cluster {{.CLUSTER_NAME}} \ + topology.datum.net/city-code={{.CITY_CODE}} \ + --overwrite + echo "Labeled cluster '{{.CLUSTER_NAME}}' with topology.datum.net/city-code={{.CITY_CODE}}" + + # ════════════════════════════════════════════════════════════════════════ + # CRD installation + # ════════════════════════════════════════════════════════════════════════ + + e2e:crds:install: + desc: "Install compute + NSO CRDs to all clusters" + cmds: + - task: _e2e:crds:compute + - task: _e2e:crds:nso + + _e2e:crds:compute: + internal: true + desc: "Apply compute CRDs to all clusters and the Karmada API server" + cmds: + # All three Kind clusters + the Karmada API server get the compute CRDs. + # The Karmada API server needs them so it can store and propagate + # WorkloadDeployment objects. + - | + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/karmada.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing compute CRDs → $(basename $KC .yaml)..." 
+ kubectl --kubeconfig="$KC" apply -k config/base/crd --server-side + done + + _e2e:crds:nso: + internal: true + desc: "Apply NSO CRDs to control-plane and POP cell clusters" + cmds: + # NSO CRDs (NetworkBinding, SubnetClaim, etc.) are installed on the + # control-plane as well as POP cells. The control-plane operator needs them + # so that Subnet/SubnetClaim informer watches can start without cache errors, + # even though NSO controllers themselves only run on POP cells. + - | + go mod download go.datum.net/network-services-operator + NSO_VERSION=$(go list -m -json go.datum.net/network-services-operator \ + | python3 -c "import sys, json; print(json.load(sys.stdin)['Version'])") + NSO_CRD_PATH="$(go env GOMODCACHE)/go.datum.net/network-services-operator@${NSO_VERSION}/config/crd" + echo "NSO CRDs from: ${NSO_CRD_PATH}" + for KC in \ + {{.KUBECONFIG_DIR}}/control-plane.yaml \ + {{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + {{.KUBECONFIG_DIR}}/pop-ord.yaml; do + echo "Installing NSO CRDs → $(basename $KC .yaml)..." + kubectl --kubeconfig="$KC" apply -k "${NSO_CRD_PATH}" --server-side + done + + # ════════════════════════════════════════════════════════════════════════ + # Operator lifecycle (background processes for federation e2e) + # ════════════════════════════════════════════════════════════════════════ + + e2e:operator:start: + desc: "Start management (control-plane) and cell (pop-dfw) operator instances in the background" + cmds: + - mkdir -p {{.E2E_DIR}}/logs {{.E2E_DIR}}/pids + - | + echo "Starting management operator (control-plane)..." + KUBECONFIG={{.KUBECONFIG_DIR}}/control-plane.yaml \ + go run ./cmd/main.go \ + --federation-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --enable-management-controllers=true \ + --leader-elect=false \ + --health-probe-bind-address=:9091 \ + > {{.E2E_DIR}}/logs/operator-management.log 2>&1 & + echo $! > {{.E2E_DIR}}/pids/operator-management.pid + echo "Management operator PID: $!" 
+ - | + echo "Waiting for management operator health check on :9091..." + deadline=$((SECONDS + 15)) + until curl -sf http://localhost:9091/healthz >/dev/null 2>&1; do + if [ $SECONDS -ge $deadline ]; then + echo "ERROR: management operator did not become healthy within 15s" + cat {{.E2E_DIR}}/logs/operator-management.log || true + exit 1 + fi + sleep 1 + done + echo "Management operator is healthy" + - | + echo "Starting cell operator (pop-dfw)..." + KUBECONFIG={{.KUBECONFIG_DIR}}/pop-dfw.yaml \ + go run ./cmd/main.go \ + --federation-kubeconfig={{.KUBECONFIG_DIR}}/karmada.yaml \ + --enable-cell-controllers=true \ + --leader-elect=false \ + --health-probe-bind-address=:9092 \ + > {{.E2E_DIR}}/logs/operator-cell-dfw.log 2>&1 & + echo $! > {{.E2E_DIR}}/pids/operator-cell-dfw.pid + echo "Cell operator PID: $!" + - | + echo "Waiting for cell operator health check on :9092..." + deadline=$((SECONDS + 15)) + until curl -sf http://localhost:9092/healthz >/dev/null 2>&1; do + if [ $SECONDS -ge $deadline ]; then + echo "ERROR: cell operator did not become healthy within 15s" + cat {{.E2E_DIR}}/logs/operator-cell-dfw.log || true + exit 1 + fi + sleep 1 + done + echo "Cell operator is healthy" + + e2e:operator:stop: + desc: "Stop background operator instances" + cmds: + - | + for PIDFILE in \ + {{.E2E_DIR}}/pids/operator-management.pid \ + {{.E2E_DIR}}/pids/operator-cell-dfw.pid; do + if [ -f "$PIDFILE" ]; then + PID=$(cat "$PIDFILE") + if kill -0 "$PID" 2>/dev/null; then + echo "Stopping PID $PID ($(basename $PIDFILE .pid))..." 
+ kill -TERM "$PID" || true + else + echo "Process $PID ($(basename $PIDFILE .pid)) is not running" + fi + rm -f "$PIDFILE" + else + echo "PID file not found: $PIDFILE" + fi + done diff --git a/api/v1alpha/instance_types.go b/api/v1alpha/instance_types.go index 57e7f560..cb1698b3 100644 --- a/api/v1alpha/instance_types.go +++ b/api/v1alpha/instance_types.go @@ -107,6 +107,26 @@ type SandboxContainer struct { // +kubebuilder:validation:Required Image string `json:"image"` + // Entrypoint array to run in the container image, overriding the image's + // ENTRYPOINT. Each element is a separate token, not a shell command — to run a + // shell command use: ["sh", "-c", "my command"]. + // + // If not provided, the container image's own ENTRYPOINT is used. + // + // +kubebuilder:validation:Optional + Command []string `json:"command,omitempty"` + + // Arguments to the entrypoint, overriding the image's CMD. Combined with + // Command: when Command is also set the resulting invocation is + // append(Command, Args...). When only Args is set it overrides CMD while + // preserving the image's ENTRYPOINT. + // + // If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + // are used unchanged. + // + // +kubebuilder:validation:Optional + Args []string `json:"args,omitempty"` + // List of environment variables to set in the container. // // +kubebuilder:validation:Optional @@ -400,6 +420,38 @@ const ( InstanceQuotaGrantedReasonQuotaExceeded = "QuotaExceeded" InstanceQuotaGrantedReasonValidationFailed = "ValidationFailed" InstanceProgrammedReasonPendingQuota = "PendingQuota" + + // InstanceQuotaGrantedReasonQuotaDisabled indicates quota enforcement is + // intentionally disabled: no credential path was configured. + InstanceQuotaGrantedReasonQuotaDisabled = "QuotaDisabled" + + // InstanceQuotaGrantedReasonBackendUnavailable indicates quota enforcement + // is configured but the Milo quota backend is unreachable (network error, + // TLS failure, 401/503). 
+ InstanceQuotaGrantedReasonBackendUnavailable = "QuotaBackendUnavailable" + + // InstanceQuotaGrantedReasonProjectNotFound indicates the Milo project + // referenced by this instance does not exist (404 on the project control plane). + InstanceQuotaGrantedReasonProjectNotFound = "QuotaProjectNotFound" + + // InstanceQuotaGrantedReasonNamespaceNotFound indicates the claim namespace + // does not exist on the Milo project control plane (FM-5). + InstanceQuotaGrantedReasonNamespaceNotFound = "QuotaNamespaceNotFound" + + // InstanceQuotaGrantedReasonMisconfigured indicates the ResourceClaim was + // rejected by the Milo admission plugin (403/422): ResourceRegistration absent + // or claimingRules mismatch. + InstanceQuotaGrantedReasonMisconfigured = "QuotaMisconfigured" + + // InstanceQuotaGrantedReasonProjectIDUnresolvable indicates the namespace + // label required to derive the Milo project ID is missing or unreadable. + InstanceQuotaGrantedReasonProjectIDUnresolvable = "QuotaProjectIDUnresolvable" + + // InstanceQuotaGrantedReasonNoBudget indicates the ResourceClaim exists and + // is pending because no AllowanceBucket has been configured for the project. + // This is distinct from PendingEvaluation (claim not yet created or first eval + // in progress) and from QuotaExceeded (explicitly denied). 
+ InstanceQuotaGrantedReasonNoBudget = "QuotaNoBudget" ) const ( @@ -453,6 +505,7 @@ type InstanceTemplateSpec struct { // +kubebuilder:printcolumn:name="Network IP",type=string,JSONPath=`.status.networkInterfaces[0].assignments.networkIP` // +kubebuilder:printcolumn:name="External IP",type=string,JSONPath=`.status.networkInterfaces[0].assignments.externalIP` // +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].message`,priority=1 +// +kubebuilder:printcolumn:name="Quota",type=string,JSONPath=`.status.conditions[?(@.type=="QuotaGranted")].reason`,priority=1 type Instance struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` diff --git a/api/v1alpha/labels.go b/api/v1alpha/labels.go index e1dac308..c75a1f26 100644 --- a/api/v1alpha/labels.go +++ b/api/v1alpha/labels.go @@ -7,4 +7,20 @@ const ( WorkloadDeploymentUIDLabel = LabelNamespace + "/workload-deployment-uid" InstanceIndexLabel = LabelNamespace + "/instance-index" + + // WorkloadDeploymentNameLabel carries the name of the WorkloadDeployment + // that owns an Instance. Stamped at creation and kept current on updates. + WorkloadDeploymentNameLabel = LabelNamespace + "/workload-deployment-name" + + // CityCodeLabel carries the city code of the WorkloadDeployment that owns + // an Instance, matching WorkloadDeploymentSpec.CityCode. + CityCodeLabel = LabelNamespace + "/city-code" + + // WorkloadNameLabel carries the name of the Workload that an Instance + // ultimately belongs to, sourced from WorkloadDeploymentSpec.WorkloadRef.Name. + WorkloadNameLabel = LabelNamespace + "/workload-name" + + // PlacementNameLabel carries the placement name from the Workload that drove + // this Instance's deployment, sourced from WorkloadDeploymentSpec.PlacementName. 
+ PlacementNameLabel = LabelNamespace + "/placement-name" ) diff --git a/api/v1alpha/zz_generated.deepcopy.go b/api/v1alpha/zz_generated.deepcopy.go index 8ecc1bae..926e222c 100644 --- a/api/v1alpha/zz_generated.deepcopy.go +++ b/api/v1alpha/zz_generated.deepcopy.go @@ -651,6 +651,16 @@ func (in *ResourceMetricSource) DeepCopy() *ResourceMetricSource { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SandboxContainer) DeepCopyInto(out *SandboxContainer) { *out = *in + if in.Command != nil { + in, out := &in.Command, &out.Command + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.Env != nil { in, out := &in.Env, &out.Env *out = make([]v1.EnvVar, len(*in)) diff --git a/cmd/main.go b/cmd/main.go index 3bb44bc9..01d3eddd 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -8,6 +8,8 @@ import ( "flag" "fmt" "os" + "strings" + "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. 
@@ -18,29 +20,42 @@ import ( "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcsingle "sigs.k8s.io/multicluster-runtime/providers/single" + karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1" + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/config" "go.datum.net/compute/internal/controller" + "go.datum.net/compute/internal/features" + quotametrics "go.datum.net/compute/internal/quota" computewebhook "go.datum.net/compute/internal/webhook" computev1alphawebhooks "go.datum.net/compute/internal/webhook/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" multiclusterproviders "go.miloapis.com/milo/pkg/multicluster-runtime" milomulticluster "go.miloapis.com/milo/pkg/multicluster-runtime/milo" + corev1 "k8s.io/api/core/v1" // +kubebuilder:scaffold:imports ) +// singleClusterName is the fixed cluster name that mcsingle.New registers. +// All single-mode wiring that references this cluster must use this constant. 
+const singleClusterName = "single" + var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") @@ -51,6 +66,11 @@ var ( gitCommit = "unknown" gitTreeState = "unknown" buildDate = "unknown" + + // federationRestConfig holds the REST config for the Karmada federation control + // plane. It is populated from --federation-kubeconfig when set, and is nil + // when the flag is omitted. + federationRestConfig *rest.Config ) func init() { @@ -61,22 +81,45 @@ func init() { utilruntime.Must(computev1alpha.AddToScheme(scheme)) utilruntime.Must(networkingv1alpha.AddToScheme(scheme)) utilruntime.Must(quotav1alpha1.AddToScheme(scheme)) + utilruntime.Must(karmadapolicyv1alpha1.Install(scheme)) + utilruntime.Must(karmadaclusterv1alpha1.Install(scheme)) // +kubebuilder:scaffold:scheme } +//nolint:gocyclo // main wires all controller paths; complexity is inherent to startup sequencing func main() { var enableLeaderElection bool var leaderElectionNamespace string var probeAddr string var serverConfigFile string + var federationKubeconfig string + var federationContext string + var enableManagementControllers bool + var enableCellControllers bool flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ "Enabling this will ensure there is only one active controller manager.") flag.StringVar(&leaderElectionNamespace, "leader-elect-namespace", "", "The namespace to use for leader election.") + flag.StringVar(&federationKubeconfig, "federation-kubeconfig", "", + "Path to the kubeconfig file for the Karmada federation control plane. "+ + "Required when --enable-management-controllers is set. "+ + "When omitted, federation features are disabled.") + flag.StringVar(&federationContext, "federation-context", "", + "Context to use from the federation kubeconfig. 
When omitted, the current context is used.") + flag.BoolVar(&enableManagementControllers, "enable-management-controllers", false, + "Enable management-plane controllers (WorkloadDeploymentFederator, InstanceProjector).") + flag.BoolVar(&enableCellControllers, "enable-cell-controllers", false, + "Enable cell controllers (WorkloadDeploymentReconciler, InstanceReconciler).") + + var featureGatesFlag string + flag.StringVar(&featureGatesFlag, "feature-gates", "", + "A set of key=value pairs that describe feature gates for the compute operator. "+ + "Example: --feature-gates=NetworkingIntegration=false. "+ + "Available features: NetworkingIntegration (default=true).") opts := zap.Options{ Development: true, @@ -87,8 +130,47 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() + if featureGatesFlag != "" { + if err := features.MutableFeatureGate.Set(featureGatesFlag); err != nil { + setupLog.Error(err, "unable to parse feature gates", "feature-gates", featureGatesFlag) + os.Exit(1) + } + } + setupLog.Info("feature gates", "NetworkingIntegration", features.FeatureGate.Enabled(features.NetworkingIntegration)) + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + // Load the federation (Karmada) control plane REST config when + // --federation-kubeconfig is provided. When the flag is omitted, + // federationRestConfig remains nil; management controllers will refuse to + // start if --enable-management-controllers is also set. 
+ if federationKubeconfig != "" { + loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + &clientcmd.ClientConfigLoadingRules{ExplicitPath: federationKubeconfig}, + &clientcmd.ConfigOverrides{CurrentContext: federationContext}, + ) + var err error + federationRestConfig, err = loader.ClientConfig() + if err != nil { + setupLog.Error(err, "unable to load federation kubeconfig", "path", federationKubeconfig) + os.Exit(1) + } + setupLog.Info("federation kubeconfig loaded", "path", federationKubeconfig) + } + + // Fail loud: management controllers require a federation kubeconfig. Silently + // skipping them when --enable-management-controllers is set would leave + // federation and instance projection broken with no visible signal — the same + // class of failure as the quota P1 issue. An operator who explicitly enables + // management controllers but omits --federation-kubeconfig has a misconfiguration + // that must surface immediately rather than at runtime. + if enableManagementControllers && federationRestConfig == nil { + setupLog.Error(nil, + "management controllers enabled but no federation kubeconfig configured", + "hint", "set --federation-kubeconfig") + os.Exit(1) + } + setupLog.Info("starting compute", "version", version, "gitCommit", gitCommit, @@ -96,24 +178,28 @@ func main() { "buildDate", buildDate, ) - var serverConfig config.WorkloadOperator - var configData []byte - if len(serverConfigFile) > 0 { - var err error - configData, err = os.ReadFile(serverConfigFile) - if err != nil { - setupLog.Error(fmt.Errorf("unable to read server config from %q", serverConfigFile), "") - os.Exit(1) - } - } - - if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { - setupLog.Error(err, "unable to decode server config") + serverConfig, err := loadServerConfig(serverConfigFile) + if err != nil { + setupLog.Error(err, "unable to load server config") os.Exit(1) } setupLog.Info("server config", "config", serverConfig) 
+ quotaRestConfig, err := serverConfig.Discovery.QuotaRestConfig() + if err != nil { + setupLog.Error(err, "unable to load quota REST config") + os.Exit(1) + } + if quotaRestConfig != nil { + setupLog.Info("quota REST config loaded", "path", serverConfig.Discovery.QuotaKubeconfigPath) + quotametrics.EnforcementEnabled.Set(1) + } else { + setupLog.Error(nil, "quota enforcement is DISABLED — workloads will schedule without quota accounting; "+ + "set quotaKubeconfigPath in server config to enable enforcement") + quotametrics.EnforcementEnabled.Set(0) + } + cfg := ctrl.GetConfigOrDie() deploymentCluster, err := cluster.New(cfg, func(o *cluster.Options) { @@ -124,7 +210,9 @@ func main() { os.Exit(1) } - runnables, provider, err := initializeClusterDiscovery(serverConfig, deploymentCluster, scheme) + runnables, provider, edgeClusterName, err := initializeClusterDiscovery( + serverConfig, deploymentCluster, scheme, + ) if err != nil { setupLog.Error(err, "unable to initialize cluster discovery") os.Exit(1) @@ -176,21 +264,65 @@ func main() { os.Exit(1) } - if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Workload") - os.Exit(1) + if enableManagementControllers { + if err = (&controller.WorkloadReconciler{}).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Workload") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") - os.Exit(1) + + // Build a single federation client shared across all controllers that need to + // read or write to the Karmada federation control plane. This is the hub that + // the management controllers federate through and that edge cells write back to. + // Nil when --federation-kubeconfig is not set (i.e. federation is disabled). 
+ var federationClient client.Client + if federationRestConfig != nil { + federationClient, err = client.New(federationRestConfig, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "unable to create federation client") + os.Exit(1) + } } - if err = (&controller.WorkloadDeploymentScheduler{}).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentScheduler") - os.Exit(1) + + if enableCellControllers { + if err = (&controller.WorkloadDeploymentReconciler{ + NetworkingEnabled: features.FeatureGate.Enabled(features.NetworkingIntegration), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment") + os.Exit(1) + } } - if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Instance") - os.Exit(1) + + if enableCellControllers { + clusterNameForProject := func(_ string) multicluster.ClusterName { + return multicluster.ClusterName(singleClusterName) + } + instanceReconciler := &controller.InstanceReconciler{FederationClient: federationClient} + err = instanceReconciler.SetupWithManager( + mgr, + quotaRestConfig, + singleModeProjectID(mgr), + singleModeProjectNamespace(mgr), + edgeClusterName, + clusterNameForProject, + ) + if err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Instance") + os.Exit(1) + } + } + + // WorkloadDeploymentFederator and InstanceProjector are management-plane + // controllers that run on the control-plane cluster. The fail-loud guard above + // ensures federationRestConfig is non-nil when enableManagementControllers is + // true; the nil check here is a defensive belt-and-suspenders guard. 
+ if enableManagementControllers && federationRestConfig != nil { + extra, err := setupManagementControllers(mgr, federationClient) + if err != nil { + setupLog.Error(err, "unable to set up management controllers") + os.Exit(1) + } + runnables = append(runnables, extra...) } if serverConfig.WebhookServer != nil { @@ -223,11 +355,6 @@ func main() { }) } - setupLog.Info("starting cluster discovery provider") - g.Go(func() error { - return ignoreCanceled(provider.Run(ctx, mgr)) - }) - setupLog.Info("starting multicluster manager") g.Go(func() error { return ignoreCanceled(mgr.Start(ctx)) @@ -239,51 +366,33 @@ func main() { } } -type runnableProvider interface { - multicluster.Provider - Run(context.Context, mcmanager.Manager) error -} - -// Needed until we contribute the patch in the following PR again (need to sign CLA): -// -// See: https://github.com/kubernetes-sigs/multicluster-runtime/pull/18 -type wrappedSingleClusterProvider struct { - multicluster.Provider - cluster cluster.Cluster -} - -func (p *wrappedSingleClusterProvider) Run(ctx context.Context, mgr mcmanager.Manager) error { - if err := mgr.Engage(ctx, "single", p.cluster); err != nil { - return err - } - return p.Provider.(runnableProvider).Run(ctx, mgr) -} - func initializeClusterDiscovery( serverConfig config.WorkloadOperator, deploymentCluster cluster.Cluster, scheme *runtime.Scheme, -) (runnables []manager.Runnable, provider runnableProvider, err error) { +) (runnables []manager.Runnable, provider multicluster.Provider, edgeClusterName string, err error) { runnables = append(runnables, deploymentCluster) switch serverConfig.Discovery.Mode { case multiclusterproviders.ProviderSingle: - provider = &wrappedSingleClusterProvider{ - Provider: mcsingle.New("single", deploymentCluster), - cluster: deploymentCluster, + provider = mcsingle.New(multicluster.ClusterName(singleClusterName), deploymentCluster) + edgeClusterName = serverConfig.Discovery.ClusterName + if edgeClusterName == "" { + edgeClusterName = 
singleClusterName } case multiclusterproviders.ProviderMilo: discoveryRestConfig, err := serverConfig.Discovery.DiscoveryRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get discovery rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get discovery rest config: %w", err) } projectRestConfig, err := serverConfig.Discovery.ProjectRestConfig() if err != nil { - return nil, nil, fmt.Errorf("unable to get project rest config: %w", err) + return nil, nil, "", fmt.Errorf("unable to get project rest config: %w", err) } discoveryManager, err := manager.New(discoveryRestConfig, manager.Options{ + Metrics: metricsserver.Options{BindAddress: "0"}, Client: client.Options{ Cache: &client.CacheOptions{ Unstructured: true, @@ -291,7 +400,7 @@ func initializeClusterDiscovery( }, }) if err != nil { - return nil, nil, fmt.Errorf("unable to set up overall controller manager: %w", err) + return nil, nil, "", fmt.Errorf("unable to set up overall controller manager: %w", err) } provider, err = milomulticluster.New(discoveryManager, milomulticluster.Options{ @@ -304,10 +413,11 @@ func initializeClusterDiscovery( ProjectRestConfig: projectRestConfig, }) if err != nil { - return nil, nil, fmt.Errorf("unable to create datum project provider: %w", err) + return nil, nil, "", fmt.Errorf("unable to create datum project provider: %w", err) } runnables = append(runnables, discoveryManager) + edgeClusterName = serverConfig.Discovery.ClusterName // case providers.ProviderKind: // provider = mckind.New(mckind.Options{ @@ -319,13 +429,29 @@ func initializeClusterDiscovery( // }) default: - return nil, nil, fmt.Errorf( + return nil, nil, "", fmt.Errorf( "unsupported cluster discovery mode %s", serverConfig.Discovery.Mode, ) } - return runnables, provider, nil + return runnables, provider, edgeClusterName, nil +} + +func loadServerConfig(path string) (config.WorkloadOperator, error) { + var serverConfig config.WorkloadOperator + var configData []byte + if len(path) 
> 0 { + var err error + configData, err = os.ReadFile(path) + if err != nil { + return serverConfig, fmt.Errorf("unable to read server config from %q: %w", path, err) + } + } + if err := runtime.DecodeInto(codecs.UniversalDecoder(), configData, &serverConfig); err != nil { + return serverConfig, fmt.Errorf("unable to decode server config: %w", err) + } + return serverConfig, nil } func ignoreCanceled(err error) error { @@ -334,3 +460,102 @@ func ignoreCanceled(err error) error { } return err } + +// setupManagementControllers wires the WorkloadDeploymentFederator and +// InstanceProjector onto mgr. It returns any additional Runnable objects that +// must be started alongside the main manager (the federation manager used by +// InstanceProjector). Called only when management controllers are enabled and +// a federation REST config is available. +func setupManagementControllers(mgr mcmanager.Manager, federationClient client.Client) ([]manager.Runnable, error) { + federator := &controller.WorkloadDeploymentFederator{FederationClient: federationClient} + if err := federator.SetupWithManager(mgr); err != nil { + return nil, fmt.Errorf("WorkloadDeploymentFederator: %w", err) + } + + // InstanceProjector runs in the management plane, watches Instances written + // back by POP-cell operators to the Karmada federation control plane, and + // projects them into the corresponding project namespaces via the multicluster manager. 
+ federationMgr, err := manager.New(federationRestConfig, manager.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: "0"}, + }) + if err != nil { + return nil, fmt.Errorf("federation manager for InstanceProjector: %w", err) + } + if err = (&controller.InstanceProjector{ + FederationClient: federationClient, + MCManager: mgr, + }).SetupWithManager(federationMgr); err != nil { + return nil, fmt.Errorf("InstanceProjector: %w", err) + } + + return []manager.Runnable{federationMgr}, nil +} + +// singleModeProjectID returns an InstanceProjectIDFunc for single-cell mode. +// It reads the upstream-cluster-name label on the edge namespace (e.g. +// "cluster-datum-cloud") and decodes it to the project ID ("datum-cloud"). +// This is the inverse of the "cluster-" encoding used by NSO's +// MappedNamespaceResourceStrategy when stamping cluster-scoped namespace labels. +// Returns ("", err) on transient API failures (triggers requeue with backoff). +// Returns ("", nil) when the label is absent (not yet propagated; quota skipped). +func singleModeProjectID(mgr mcmanager.Manager) controller.InstanceProjectIDFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + encoded := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if encoded == "" { + setupLog.Info("singleModeProjectID: upstream-cluster-name label missing", + "namespace", inst.Namespace) + return "", nil + } + projectID := strings.TrimPrefix(encoded, "cluster-") + return strings.ReplaceAll(projectID, "_", "/"), nil + } +} + +// singleModeProjectNamespace returns an InstanceProjectNamespaceFunc for +// single-cell mode. It reads the upstream-namespace label on the edge namespace +// (e.g. "ns-efdf8ca1-...") to find the in-project namespace ("default") where +// ResourceClaims must be created in the project control plane. 
+// Returns ("", err) on transient API failures (triggers requeue with backoff). +// Returns ("", nil) when the label is absent (not yet propagated; quota skipped). +func singleModeProjectNamespace(mgr mcmanager.Manager) controller.InstanceProjectNamespaceFunc { + return func(ctx context.Context, cn multicluster.ClusterName, inst *computev1alpha.Instance) (string, error) { + ns, err := readEdgeNamespace(ctx, mgr, cn, inst.Namespace) + if err != nil { + return "", err + } + projectNS := ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if projectNS == "" { + setupLog.Info("singleModeProjectNamespace: upstream-namespace label missing", + "namespace", inst.Namespace) + return "", nil + } + return projectNS, nil + } +} + +// readEdgeNamespace reads the edge namespace object via the uncached APIReader +// (no informer started, no cache sync required) with a short deadline. +// Returns a transient error on API failures so callers can requeue with backoff. +func readEdgeNamespace( + ctx context.Context, + mgr mcmanager.Manager, + clusterName multicluster.ClusterName, + namespace string, +) (corev1.Namespace, error) { + cl, err := mgr.GetCluster(ctx, clusterName) + if err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: getting cluster %q: %w", clusterName, err) + } + var ns corev1.Namespace + getCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + if err := cl.GetAPIReader().Get(getCtx, client.ObjectKey{Name: namespace}, &ns); err != nil { + return corev1.Namespace{}, fmt.Errorf("readEdgeNamespace: reading namespace %q: %w", namespace, err) + } + return ns, nil +} diff --git a/config/base/certmanager/certificate.yaml b/config/base/certmanager/certificate.yaml deleted file mode 100644 index 3b15b5b3..00000000 --- a/config/base/certmanager/certificate.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - labels: - app.kubernetes.io/name: compute - app.kubernetes.io/managed-by: 
kustomize - name: selfsigned-issuer -spec: - selfSigned: {} ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - labels: - app.kubernetes.io/name: compute - app.kubernetes.io/managed-by: kustomize - name: compute-serving-cert -spec: - # The Service name and namespace get substituted in by kustomize - # replacements in the consuming overlay. - dnsNames: - - SERVICE_NAME.SERVICE_NAMESPACE.svc - - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local - issuerRef: - kind: Issuer - name: selfsigned-issuer - secretName: compute-webhook-cert diff --git a/config/base/certmanager/kustomization.yaml b/config/base/certmanager/kustomization.yaml deleted file mode 100644 index bebea5a5..00000000 --- a/config/base/certmanager/kustomization.yaml +++ /dev/null @@ -1,5 +0,0 @@ -resources: -- certificate.yaml - -configurations: -- kustomizeconfig.yaml diff --git a/config/base/certmanager/kustomizeconfig.yaml b/config/base/certmanager/kustomizeconfig.yaml deleted file mode 100644 index cf6f89e8..00000000 --- a/config/base/certmanager/kustomizeconfig.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# This configuration is for teaching kustomize how to update name ref substitution -nameReference: -- kind: Issuer - group: cert-manager.io - fieldSpecs: - - kind: Certificate - group: cert-manager.io - path: spec/issuerRef/name diff --git a/config/base/crd/bases/compute.datumapis.com_instances.yaml b/config/base/crd/bases/compute.datumapis.com_instances.yaml index 8c86fb90..c9301561 100644 --- a/config/base/crd/bases/compute.datumapis.com_instances.yaml +++ b/config/base/crd/bases/compute.datumapis.com_instances.yaml @@ -35,6 +35,10 @@ spec: name: Message priority: 1 type: string + - jsonPath: .status.conditions[?(@.type=="QuotaGranted")].reason + name: Quota + priority: 1 + type: string name: v1alpha schema: openAPIV3Schema: @@ -262,6 +266,28 @@ spec: description: A list of containers to run within the sandbox. 
items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. + items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. + + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -272,8 +298,9 @@ spec: present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. + May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -332,6 +359,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. + During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. 
+ + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume mount + containing the env file. + type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests @@ -829,6 +893,11 @@ spec: reason: Pending status: Unknown type: Ready + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for quota evaluation + reason: PendingEvaluation + status: Unknown + type: QuotaGranted description: Status defines the current state of an Instance. properties: conditions: diff --git a/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml b/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml index 50c9458b..48a2501d 100644 --- a/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml +++ b/config/base/crd/bases/compute.datumapis.com_workloaddeployments.yaml @@ -375,6 +375,28 @@ spec: sandbox. items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. + items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. 
+ + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -385,8 +407,9 @@ spec: variable present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. + May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -448,6 +471,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. + During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. + + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume + mount containing the env file. 
+ type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests diff --git a/config/base/crd/bases/compute.datumapis.com_workloads.yaml b/config/base/crd/bases/compute.datumapis.com_workloads.yaml index edae1e1c..c452910f 100644 --- a/config/base/crd/bases/compute.datumapis.com_workloads.yaml +++ b/config/base/crd/bases/compute.datumapis.com_workloads.yaml @@ -385,6 +385,28 @@ spec: sandbox. items: properties: + args: + description: |- + Arguments to the entrypoint, overriding the image's CMD. Combined with + Command: when Command is also set the resulting invocation is + append(Command, Args...). When only Args is set it overrides CMD while + preserving the image's ENTRYPOINT. + + If neither Command nor Args is set, the image's own ENTRYPOINT and CMD + are used unchanged. + items: + type: string + type: array + command: + description: |- + Entrypoint array to run in the container image, overriding the image's + ENTRYPOINT. Each element is a separate token, not a shell command — to run a + shell command use: ["sh", "-c", "my command"]. + + If not provided, the container image's own ENTRYPOINT is used. + items: + type: string + type: array env: description: |- List of environment variables to set in the container. @@ -395,8 +417,9 @@ spec: variable present in a Container. properties: name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. + description: |- + Name of the environment variable. + May consist of any printable ASCII characters except '='. type: string value: description: |- @@ -458,6 +481,43 @@ spec: - fieldPath type: object x-kubernetes-map-type: atomic + fileKeyRef: + description: |- + FileKeyRef selects a key of the env file. + Requires the EnvFiles feature gate to be enabled. + properties: + key: + description: |- + The key within the env file. 
An invalid key will prevent the pod from starting. + The keys defined within a source may consist of any printable ASCII characters except '='. + During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. + type: string + optional: + default: false + description: |- + Specify whether the file or its key must be defined. If the file or key + does not exist, then the env var is not published. + If optional is set to true and the specified key does not exist, + the environment variable will not be set in the Pod's containers. + + If optional is set to false and the specified key does not exist, + an error will be returned during Pod creation. + type: boolean + path: + description: |- + The path within the volume from which to select the file. + Must be relative and may not contain the '..' path or start with '..'. + type: string + volumeName: + description: The name of the volume + mount containing the env file. + type: string + required: + - key + - path + - volumeName + type: object + x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests diff --git a/config/base/downstream-rbac/kustomization.yaml b/config/base/downstream-rbac/kustomization.yaml new file mode 100644 index 00000000..4c4dbe44 --- /dev/null +++ b/config/base/downstream-rbac/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - rbac.yaml diff --git a/config/base/downstream-rbac/rbac.yaml b/config/base/downstream-rbac/rbac.yaml new file mode 100644 index 00000000..1937ef02 --- /dev/null +++ b/config/base/downstream-rbac/rbac.yaml @@ -0,0 +1,35 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: compute-manager +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list", "watch", "create", "update", "patch"] + - apiGroups: ["compute.datumapis.com"] + resources: 
["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["policy.karmada.io"] + resources: ["propagationpolicies", "clusterpropagationpolicies"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["cluster.karmada.io"] + resources: ["clusters"] + verbs: ["get", "list", "watch"] + - apiGroups: ["work.karmada.io"] + resources: ["resourcebindings", "clusterresourcebindings"] + verbs: ["get", "list", "watch"] + - apiGroups: ["config.karmada.io"] + resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: compute-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: compute-manager +subjects: + - kind: User + name: system:serviceaccount:compute-system:compute-manager diff --git a/config/base/federation/kustomization.yaml b/config/base/federation/kustomization.yaml new file mode 100644 index 00000000..1261dac6 --- /dev/null +++ b/config/base/federation/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../crd/bases/compute.datumapis.com_instances.yaml + - ../crd/bases/compute.datumapis.com_workloaddeployments.yaml + - ../crd/bases/compute.datumapis.com_workloads.yaml + +components: + - ../../components/federation diff --git a/config/base/manager/manager.yaml b/config/base/manager/manager.yaml index 03028177..8ef18135 100644 --- a/config/base/manager/manager.yaml +++ b/config/base/manager/manager.yaml @@ -26,14 +26,33 @@ spec: seccompProfile: type: RuntimeDefault containers: - - command: + - name: manager + command: - /manager args: - - --leader-elect - - --health-probe-bind-address=:8081 - - --server-config=/config/config.yaml + - --leader-elect=$(LEADER_ELECT) + 
- --health-probe-bind-address=$(HEALTH_PROBE_BIND_ADDRESS) + - --server-config=$(SERVER_CONFIG) + - --federation-kubeconfig=$(FEDERATION_KUBECONFIG) + - --enable-management-controllers=$(ENABLE_MANAGEMENT_CONTROLLERS) + - --enable-cell-controllers=$(ENABLE_CELL_CONTROLLERS) + - --feature-gates=$(FEATURE_GATES) + env: + - name: LEADER_ELECT + value: "true" + - name: HEALTH_PROBE_BIND_ADDRESS + value: ":8081" + - name: SERVER_CONFIG + value: /config/config.yaml + - name: FEDERATION_KUBECONFIG + value: "" + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "false" + - name: ENABLE_CELL_CONTROLLERS + value: "false" + - name: FEATURE_GATES + value: "" image: ghcr.io/datum-cloud/compute:latest - name: manager ports: - containerPort: 9443 name: webhook-server @@ -66,20 +85,9 @@ spec: volumeMounts: - name: config mountPath: /config - - name: webhook-cert - mountPath: /tmp/k8s-webhook-server/serving-certs - readOnly: true - serviceAccountName: compute + serviceAccountName: compute-manager terminationGracePeriodSeconds: 10 volumes: - name: config configMap: name: compute-config - # Optional so the manager can run without admission webhooks: when - # `webhookServer:` is omitted from the server config, the binary - # skips the webhook server entirely and the missing Secret is fine. 
- - name: webhook-cert - secret: - secretName: compute-webhook-cert - defaultMode: 420 - optional: true diff --git a/config/base/manager/service_account.yaml b/config/base/manager/service_account.yaml index f8711deb..cc6bd6cc 100644 --- a/config/base/manager/service_account.yaml +++ b/config/base/manager/service_account.yaml @@ -4,4 +4,4 @@ metadata: labels: app.kubernetes.io/name: compute app.kubernetes.io/managed-by: kustomize - name: compute + name: compute-manager diff --git a/config/components/cell-controllers/kustomization.yaml b/config/components/cell-controllers/kustomization.yaml new file mode 100644 index 00000000..3f32da3b --- /dev/null +++ b/config/components/cell-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_CELL_CONTROLLERS + value: "true" diff --git a/config/components/controller_rbac/metrics_auth_role_binding.yaml b/config/components/controller_rbac/metrics_auth_role_binding.yaml index 1ea3d974..ada1a1de 100644 --- a/config/components/controller_rbac/metrics_auth_role_binding.yaml +++ b/config/components/controller_rbac/metrics_auth_role_binding.yaml @@ -8,4 +8,4 @@ roleRef: name: compute-metrics-auth-role subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/controller_rbac/role.yaml b/config/components/controller_rbac/role.yaml index 5d803d2c..e8721899 100644 --- a/config/components/controller_rbac/role.yaml +++ b/config/components/controller_rbac/role.yaml @@ -4,6 +4,13 @@ kind: ClusterRole metadata: name: compute rules: +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list - apiGroups: - compute.datumapis.com resources: @@ -36,3 +43,36 @@ rules: - get - patch - update +- 
apiGroups: + - networking.datumapis.com + resources: + - locations + - networkcontexts + - subnets + verbs: + - get + - list + - watch +- apiGroups: + - networking.datumapis.com + resources: + - networkbindings + - subnetclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - quota.miloapis.com + resources: + - resourceclaims + verbs: + - create + - delete + - get + - list + - watch diff --git a/config/components/controller_rbac/role_binding.yaml b/config/components/controller_rbac/role_binding.yaml index 6256bf3f..2f3e2676 100644 --- a/config/components/controller_rbac/role_binding.yaml +++ b/config/components/controller_rbac/role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/csi-webhook-cert/kustomization.yaml b/config/components/csi-webhook-cert/kustomization.yaml new file mode 100644 index 00000000..feade65a --- /dev/null +++ b/config/components/csi-webhook-cert/kustomization.yaml @@ -0,0 +1,32 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + # Add the CSI webhook cert volume and volumeMount to the manager Deployment. + # The issuer (csi.cert-manager.io/issuer-kind and csi.cert-manager.io/issuer-name) + # must be patched in by the consuming overlay or infra repo. 
+ - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + volumeMounts: + - name: webhook-server-tls + mountPath: /tmp/k8s-webhook-server/serving-certs + readOnly: true + volumes: + - name: webhook-server-tls + csi: + driver: csi.cert-manager.io + readOnly: true + volumeAttributes: + csi.cert-manager.io/fs-group: "65532" + csi.cert-manager.io/dns-names: compute-webhook.compute-system.svc,compute-webhook.compute-system.svc.cluster.local diff --git a/config/components/federation/kustomization.yaml b/config/components/federation/kustomization.yaml new file mode 100644 index 00000000..3ba207ff --- /dev/null +++ b/config/components/federation/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +resources: + - workloaddeployment-interpreter.yaml diff --git a/config/components/federation/workloaddeployment-interpreter.yaml b/config/components/federation/workloaddeployment-interpreter.yaml new file mode 100644 index 00000000..2743a63b --- /dev/null +++ b/config/components/federation/workloaddeployment-interpreter.yaml @@ -0,0 +1,28 @@ +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: workloaddeployment +spec: + target: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + customizations: + statusReflection: + luaScript: | + function ReflectStatus(observedObj) + if observedObj.status == nil then + return nil + end + return observedObj.status + end + statusAggregation: + luaScript: | + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if statusItems[1].status ~= nil then + desiredObj.status = statusItems[1].status + end + return desiredObj + end diff --git a/config/components/leader_election/leader_election_role_binding.yaml 
b/config/components/leader_election/leader_election_role_binding.yaml index a5fe9996..d6783c07 100644 --- a/config/components/leader_election/leader_election_role_binding.yaml +++ b/config/components/leader_election/leader_election_role_binding.yaml @@ -11,4 +11,4 @@ roleRef: name: compute-leader-election subjects: - kind: ServiceAccount - name: compute + name: compute-manager diff --git a/config/components/management-controllers/kustomization.yaml b/config/components/management-controllers/kustomization.yaml new file mode 100644 index 00000000..d1e29e7f --- /dev/null +++ b/config/components/management-controllers/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + env: + - name: ENABLE_MANAGEMENT_CONTROLLERS + value: "true" diff --git a/config/components/quota-credentials/kustomization.yaml b/config/components/quota-credentials/kustomization.yaml new file mode 100644 index 00000000..ffc9a6d8 --- /dev/null +++ b/config/components/quota-credentials/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: + - target: + kind: Deployment + name: compute-manager + patch: |- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: compute-manager + spec: + template: + spec: + containers: + - name: manager + volumeMounts: + - name: quota-credentials + mountPath: /etc/quota-credentials + readOnly: true + volumes: + - name: quota-credentials + secret: + secretName: compute-quota-credentials + optional: true diff --git a/config/components/service-catalog/service-configuration.yaml b/config/components/service-catalog/service-configuration.yaml index 202ac8af..8c29a50e 100644 --- a/config/components/service-catalog/service-configuration.yaml +++ 
b/config/components/service-catalog/service-configuration.yaml @@ -6,6 +6,9 @@ spec: serviceRef: name: compute phase: Published + locations: + supportedClasses: + - datum-managed monitoredResourceTypes: - type: compute.datumapis.com/Instance displayName: Compute Instance @@ -44,6 +47,26 @@ spec: description: Seconds the instance has been in a running state. kind: Cumulative unit: s + - name: compute.datumapis.com/workloads + displayName: Compute Workloads + description: Number of compute workloads. + kind: Gauge + unit: '{workload}' + - name: compute.datumapis.com/instances + displayName: Compute Instances + description: Number of compute instances. + kind: Gauge + unit: '{instance}' + - name: compute.datumapis.com/vcpus + displayName: Compute vCPUs + description: Number of vCPUs allocated across all instances. + kind: Gauge + unit: '{millicore}' + - name: compute.datumapis.com/memory + displayName: Compute Memory + description: Memory allocated across all instances. + kind: Gauge + unit: MiB billing: consumerDestinations: - monitoredResourceType: compute.datumapis.com/Instance @@ -53,13 +76,13 @@ spec: - compute.datumapis.com/instance/cpu-allocated - compute.datumapis.com/instance/memory-allocated - compute.datumapis.com/instance/uptime-seconds + quota: metricRules: - selector: apiGroup: compute.datumapis.com kind: Workload metricCosts: compute.datumapis.com/workloads: 1 - quota: limits: - name: compute-workloads metric: compute.datumapis.com/workloads diff --git a/config/overlays/cell/disable_webhook_patch.yaml b/config/overlays/cell/disable_webhook_patch.yaml new file mode 100644 index 00000000..85b57f09 --- /dev/null +++ b/config/overlays/cell/disable_webhook_patch.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + discovery: + quotaKubeconfigPath: /etc/quota-credentials/kubeconfig 
diff --git a/config/overlays/cell/kustomization.yaml b/config/overlays/cell/kustomization.yaml new file mode 100644 index 00000000..80925ee2 --- /dev/null +++ b/config/overlays/cell/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. +namespace: compute-system + +resources: + - ../../base/manager +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/cell-controllers + - ../../components/quota-credentials + +patches: +- path: disable_webhook_patch.yaml diff --git a/config/overlays/dev/config.yaml b/config/overlays/dev/config.yaml index 1d49a6c6..6ef2f00e 100644 --- a/config/overlays/dev/config.yaml +++ b/config/overlays/dev/config.yaml @@ -2,9 +2,4 @@ apiVersion: apiserver.config.datumapis.com/v1alpha1 kind: WorkloadOperator metricsServer: bindAddress: "0" - -webhookServer: - tls: - secretRef: - name: compute-webhook-cert - namespace: kube-system +webhookServer: {} diff --git a/config/overlays/dev/kustomization.yaml b/config/overlays/dev/kustomization.yaml index 7b076890..339cee0f 100644 --- a/config/overlays/dev/kustomization.yaml +++ b/config/overlays/dev/kustomization.yaml @@ -1,55 +1,29 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: compute-system + resources: - ../../base/crd - ../../base/webhook - - ../../base/certmanager + - webhook-cert.yaml -replacements: - - source: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPath: .metadata.namespace - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - 
.metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true - - source: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPath: .metadata.name - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true +patches: + # Wire cainjector to the dev cert so the API server can verify the webhook. + - patch: |- + apiVersion: admissionregistration.k8s.io/v1 + kind: MutatingWebhookConfiguration + metadata: + name: compute-mutating + annotations: + cert-manager.io/inject-ca-from: compute-system/compute-serving-cert + - patch: |- + apiVersion: admissionregistration.k8s.io/v1 + kind: ValidatingWebhookConfiguration + metadata: + name: compute-validating + annotations: + cert-manager.io/inject-ca-from: compute-system/compute-serving-cert transformers: - webhook_patch.yaml diff --git a/config/overlays/dev/webhook-cert.yaml b/config/overlays/dev/webhook-cert.yaml new file mode 100644 index 00000000..db7bf928 --- /dev/null +++ b/config/overlays/dev/webhook-cert.yaml @@ -0,0 +1,18 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned-issuer +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: compute-serving-cert +spec: + dnsNames: + - host.docker.internal + issuerRef: + kind: Issuer + name: selfsigned-issuer + secretName: compute-webhook-cert diff --git a/config/overlays/dev/webhook_patch.yaml b/config/overlays/dev/webhook_patch.yaml index 846649e3..bb302318 100644 --- a/config/overlays/dev/webhook_patch.yaml +++ b/config/overlays/dev/webhook_patch.yaml @@ -1,23 +1,6 @@ --- apiVersion: builtin kind: PatchTransformer 
-metadata: - name: webhook-cert-patch -patch: |- - - op: replace - path: /spec/dnsNames - value: ["host.docker.internal"] - - op: replace - path: /spec/secretName - value: compute-webhook-cert -target: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert ---- -apiVersion: builtin -kind: PatchTransformer metadata: name: mutatingwebhook-url-patch patch: |- diff --git a/config/overlays/management-plane/discovery_mode_patch.yaml b/config/overlays/management-plane/discovery_mode_patch.yaml new file mode 100644 index 00000000..97bf762c --- /dev/null +++ b/config/overlays/management-plane/discovery_mode_patch.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: compute-config +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: WorkloadOperator + metricsServer: + bindAddress: "0" + webhookServer: {} + discovery: + mode: milo diff --git a/config/overlays/management-plane/downstream_kubeconfig_patch.yaml b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml new file mode 100644 index 00000000..7b3b764b --- /dev/null +++ b/config/overlays/management-plane/downstream_kubeconfig_patch.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: compute-manager +spec: + template: + spec: + containers: + - name: manager + env: + - name: FEDERATION_KUBECONFIG + value: /etc/kubernetes/downstream/auth/downstream-kubeconfig.yaml + volumeMounts: + - name: downstream-kubeconfig + mountPath: /etc/kubernetes/downstream/auth + readOnly: true + - name: karmada-token + mountPath: /etc/kubernetes/karmada-token + readOnly: true + volumes: + - name: downstream-kubeconfig + configMap: + name: compute-downstream-kubeconfig + - name: karmada-token + projected: + sources: + - serviceAccountToken: + audience: https://karmada-apiserver.karmada-system.svc.cluster.local:5443 + path: token diff --git a/config/overlays/management-plane/kustomization.yaml 
b/config/overlays/management-plane/kustomization.yaml new file mode 100644 index 00000000..dae13c58 --- /dev/null +++ b/config/overlays/management-plane/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# All namespaced resources land here. Override via Flux's targetNamespace +# (or by editing this overlay) to install into a different namespace. +namespace: compute-system + +resources: + - ../../base/manager + - ../../base/webhook +components: + - ../../components/leader_election + - ../../components/controller_rbac + - ../../components/resource-metrics + - ../../components/high-availability + - ../../components/management-controllers + - ../../components/csi-webhook-cert + +patches: +- path: downstream_kubeconfig_patch.yaml +- path: discovery_mode_patch.yaml diff --git a/config/overlays/single-cluster/kustomization.yaml b/config/overlays/single-cluster/kustomization.yaml index 160b894d..4d72934e 100644 --- a/config/overlays/single-cluster/kustomization.yaml +++ b/config/overlays/single-cluster/kustomization.yaml @@ -9,100 +9,11 @@ resources: - ../../base/crd - ../../base/manager - ../../base/webhook - - ../../base/certmanager components: - ../../components/leader_election - ../../components/controller_rbac - ../../components/resource-metrics - ../../components/high-availability - -patches: -- path: webhookcainjection_patch.yaml - -replacements: -# Fill in SERVICE_NAME / SERVICE_NAMESPACE placeholders in the Certificate's -# dnsNames so the cert is issued for the actual webhook Service location. -- source: - kind: Service - version: v1 - name: compute-webhook - fieldPath: .metadata.name - targets: - - select: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPaths: - - .spec.dnsNames.0 - - .spec.dnsNames.1 - options: - delimiter: '.' 
- index: 0 - create: true -- source: - kind: Service - version: v1 - name: compute-webhook - fieldPath: .metadata.namespace - targets: - - select: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPaths: - - .spec.dnsNames.0 - - .spec.dnsNames.1 - options: - delimiter: '.' - index: 1 - create: true -# Wire the Certificate namespace + name into the cert-manager.io/inject-ca-from -# annotation on the webhook configurations so cainjector populates caBundle. -- source: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPath: .metadata.namespace - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true -- source: - kind: Certificate - group: cert-manager.io - version: v1 - name: compute-serving-cert - fieldPath: .metadata.name - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true + - ../../components/csi-webhook-cert + - ../../components/management-controllers + - ../../components/cell-controllers diff --git a/config/overlays/single-cluster/webhookcainjection_patch.yaml b/config/overlays/single-cluster/webhookcainjection_patch.yaml deleted file mode 100644 index 41718fb7..00000000 --- a/config/overlays/single-cluster/webhookcainjection_patch.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - labels: 
- app.kubernetes.io/name: compute - app.kubernetes.io/managed-by: kustomize - name: compute-mutating - annotations: - cert-manager.io/inject-ca-from: system/compute-serving-cert ---- -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - labels: - app.kubernetes.io/name: compute - app.kubernetes.io/managed-by: kustomize - name: compute-validating - annotations: - cert-manager.io/inject-ca-from: system/compute-serving-cert diff --git a/docs/enhancements/federated-deployment-scheduling.md b/docs/enhancements/federated-deployment-scheduling.md new file mode 100644 index 00000000..be2e0dde --- /dev/null +++ b/docs/enhancements/federated-deployment-scheduling.md @@ -0,0 +1,363 @@ +# Federated Deployment Scheduling + +**Issue:** [#85 — Define integration strategy with federated control plane for workload deployment scheduling](https://github.com/datum-cloud/compute/issues/85) +**Status:** Draft + +--- + +## Summary + +When you deploy a workload to a city location, Datum needs to route it to the right physical site and keep you informed of its status. Today that routing logic lives in a single place; this enhancement distributes it across a federation of regional clusters using Karmada. + +From a user perspective, nothing changes — you still specify city codes, and your workloads, deployments, and instances appear exactly where you'd expect them. Behind the scenes, a dedicated federation layer takes over scheduling, so deployments reach their target locations faster, scale decisions happen locally at each site without depending on a central coordinator, and the platform remains operational even when parts of the control plane are temporarily unavailable. + +--- + +## Terminology + +- **Project** — An isolated tenant environment where a user's resources (Workloads, Deployments, Instances) are created and visible. 
+- **Workload** — A user-defined application specification, including the container image, resource requirements, and target city locations. +- **WorkloadDeployment** — A per-city deployment intent derived from a Workload. Tracks how many replicas should be running and reports their current status. +- **Instance** — A single running replica of a WorkloadDeployment at a specific POP Cell. +- **POP Cell** — A physical point-of-presence site (e.g., DFW-01) where Instances actually run. Each city code maps to one POP Cell. +- **Control Plane Cell** — The central compute operator that coordinates between Projects and the Karmada federation layer. +- **Karmada** — An open-source multi-cluster orchestration system that distributes workloads across registered member clusters (POP Cells) and aggregates their status. +- **Karmada API Server** — The central federation API server managed by Karmada. WorkloadDeployments are written here so Karmada can propagate them to the correct POP Cell. +- **PropagationPolicy** — A Karmada resource that defines which clusters a resource should be sent to, based on label selectors. One is created per city code per project namespace. +- **Management Cluster** — The central Kubernetes cluster that hosts shared platform infrastructure. +- **NSO** — Network Services Operator — runs in each POP Cell to provision networking resources (NetworkBinding, SubnetClaim, Subnet) needed by Instances. +- **Milo** — Datum's shared platform library. Provides utilities like namespace mapping and multi-tenant client strategies used across services. +- **Scheduling Gate** — A hold placed on an Instance that prevents it from running until a specific condition is met (e.g., network ready, quota granted). + +--- + +## Overview + +The compute service must be adapted to work with the Karmada-based federated control plane +that replaces the single-platform-API-server MVP architecture. 
This document defines: + +- Which control plane each resource lives in +- How the compute operator's topology changes +- How `WorkloadDeploymentScheduler` is replaced by Karmada propagation +- How `Instance` information is surfaced back to the user's project + +### Design Constraints + +- The consumer-facing `Workload` and `WorkloadDeployment` API surface does not change. +- Karmada unavailability is an internal infrastructure concern — no user-visible conditions. +- Multi-cell-per-city is deferred; each city code maps to exactly one Karmada member cluster at launch. + +--- + +## Control Plane Topology + +``` +┌─────────────────────────────────────────────────────────┐ +│ Project (one per project, discovered via Milo) │ +│ │ +│ Workload (consumer write) │ +│ WorkloadDeployment (spec by operator, status by op.) │ +│ Instance (read-only projection by InstanceProjector) │ +└───────────────────┬─────────────────────────────────────┘ + │ read Workload + │ write WorkloadDeployment spec + status + │ write Instance projection + │ +┌───────────────────▼─────────────────────────────────────┐ +│ Control Plane Cell (compute operator) │ +│ │ +│ WorkloadReconciler ← watches projects │ +│ WorkloadDeploymentFederator ← syncs to Karmada │ +│ InstanceProjector ← mirrors to projects │ +└───────────────────┬─────────────────────────────────────┘ + │ write WorkloadDeployment + PropagationPolicy + │ read Instance (written back by POP cell) + │ write Instance projection to project + │ +┌───────────────────▼─────────────────────────────────────┐ +│ Karmada Federation API Server │ +│ │ +│ WorkloadDeployment (propagated to POP cells) │ +│ PropagationPolicy (one per city code per namespace) │ +│ Instance (written back by POP cell for visibility) │ +│ Cluster objects (one per POP cell, labeled by city) │ +└───────────────────┬─────────────────────────────────────┘ + │ Karmada propagates WorkloadDeployment + │ POP cell writes Instance back + │ 
+┌───────────────────▼─────────────────────────────────────┐ +│ POP Cell (e.g., DFW-01) [member cluster in Karmada] │ +│ │ +│ WorkloadDeployment (propagated by Karmada) │ +│ Instance (created locally) │ +│ NetworkBinding / SubnetClaim (created locally) │ +│ │ +│ WorkloadDeploymentReconciler ← creates Instances, │ +│ NetworkBinding, │ +│ SubnetClaim, gates │ +│ InstanceReconciler ← quota, status, │ +│ write-back to Karmada │ +│ NSO controllers ← NetworkBinding, │ +│ SubnetClaim, Subnet │ +└─────────────────────────────────────────────────────────┘ +``` + +--- + +## Resource Locations + +| Resource | Lives In | Written By | +|---|---|---| +| `Workload` | Project | Consumer | +| `WorkloadDeployment` (consumer-facing) | Project | `WorkloadReconciler` (spec), `WorkloadDeploymentFederator` (status) | +| `WorkloadDeployment` (federation intent) | Karmada API Server | `WorkloadDeploymentFederator` | +| `PropagationPolicy` | Karmada API Server | `WorkloadDeploymentFederator` (one per city code per namespace, lazy) | +| `Instance` (write-back) | Karmada API Server | `InstanceReconciler` (POP cell) | +| `Instance` (local execution) | POP Cell | `WorkloadDeploymentReconciler` (POP cell) | +| `Instance` (projection) | Project | `InstanceProjector` | +| `Location` | Project | `network-services-operator` | +| `NetworkBinding` | POP Cell | `WorkloadDeploymentReconciler`, reconciled by NSO (POP cell) | +| `SubnetClaim` | POP Cell | `WorkloadDeploymentReconciler`, reconciled by NSO (POP cell) | +| `ResourceClaim` (quota) | Project | `InstanceReconciler` (POP cell) | + +--- + +## Control Flow + +### Creation Path + +```mermaid +sequenceDiagram + actor Consumer + participant Project + participant CPC as Control Plane Cell + participant Karmada as Karmada API Server + participant POP as POP Cell + + Consumer->>Project: create Workload + + Project->>CPC: WorkloadReconciler watches Workload + CPC->>Project: query Locations for city codes + CPC->>Project: create WorkloadDeployment (spec 
only, per city) + + Project->>CPC: WorkloadDeploymentFederator watches WorkloadDeployment + CPC->>Karmada: create WorkloadDeployment (labeled with city code) + CPC->>Karmada: create PropagationPolicy (once per city code, lazy) + + Karmada->>POP: propagate WorkloadDeployment + + POP->>POP: WorkloadDeploymentReconciler creates Instances,\nNetworkBinding, SubnetClaim + POP->>POP: NSO reconciles NetworkBinding & SubnetClaim + POP->>POP: remove network SchedulingGate once networks ready + POP->>Karmada: aggregate WorkloadDeployment.status + + POP->>Project: InstanceReconciler creates ResourceClaim (quota) + Project-->>POP: quota granted → remove quota SchedulingGate + POP->>Karmada: write back Instance (for visibility) + + Karmada->>CPC: WorkloadDeploymentFederator reads aggregated status + CPC->>Project: write WorkloadDeployment.status + + Karmada->>CPC: InstanceProjector watches Instance write-backs + CPC->>Project: create read-only Instance projection + + Project->>CPC: WorkloadReconciler aggregates WorkloadDeployment.status + CPC->>Project: write Workload.status +``` + +### Deletion Path + +```mermaid +sequenceDiagram + actor Consumer + participant Project + participant CPC as Control Plane Cell + participant Karmada as Karmada API Server + participant POP as POP Cell + + Consumer->>Project: delete Workload + Project->>CPC: WorkloadReconciler watches deletion + CPC->>Project: delete child WorkloadDeployment objects + + Project->>CPC: WorkloadDeploymentFederator watches deletion + CPC->>Karmada: delete WorkloadDeployment + CPC->>Karmada: remove PropagationPolicy (if no remaining deployments for city) + + Karmada->>POP: remove propagated WorkloadDeployment + POP->>POP: WorkloadDeploymentReconciler deletes Instances,\nNetworkBinding, SubnetClaim + POP->>Karmada: InstanceReconciler removes write-back Instance + + Karmada->>CPC: InstanceProjector detects Instance removal + CPC->>Project: garbage-collect projected Instance objects +``` + +--- + +## Instance Visibility + 
+`Instance` objects must remain visible in the project because they are part of the consumer-facing API surface (network IPs, readiness conditions, etc.). + +Since instances are created locally in POP cells, the `InstanceReconciler` writes a corresponding `Instance` object to the Karmada API Server after each status update. This uses the `MappedNamespaceResourceStrategy` (promoted into Milo as part of this work), applying the `ns-` namespace convention and the `meta.datumapis.com/*` label tracking used throughout the platform. + +The `InstanceProjector` in the Control Plane Cell watches these Karmada-side `Instance` objects and mirrors them into the project as read-only projections. + +No changes are required to `WorkloadDeployment.status` — it remains aggregate counts only. + +### Projected Instance Fields + +| Field | Source | +|---|---| +| `metadata.name` | Karmada-side Instance name | +| `metadata.ownerReferences` | Owned by the project `WorkloadDeployment` — cascading deletion | +| `spec` | Copied from Karmada-side Instance spec | +| `status` | Copied from Karmada-side Instance status | + +--- + +## Operator Changes + +### `WorkloadReconciler` + +- **Unchanged**: Queries `Location` resources from the project; creates `WorkloadDeployment` objects in the project; aggregates `Workload.status`. + +### `WorkloadDeploymentScheduler` + +- **Removed entirely.** City code → cluster selection is handled by Karmada via `PropagationPolicy.placement.clusterAffinity.labelSelector`. + +### New: `WorkloadDeploymentFederator` + +A new controller in the Control Plane Cell: + +- Watches `WorkloadDeployment` in every project (via multicluster-runtime). +- On create/update: upserts a corresponding `WorkloadDeployment` (labeled with city code) in the Karmada API Server. +- Creates a `PropagationPolicy` per city code per project namespace lazily on first use. +- Reads aggregated `WorkloadDeployment.status` from the Karmada API Server and writes it to the project. 
+- On delete: removes the Karmada-side `WorkloadDeployment`. Removes the `PropagationPolicy` when no remaining deployment in the namespace targets that city code. + +### `WorkloadDeploymentReconciler` + +- **Runs in POP cell operators** — watches locally-propagated `WorkloadDeployment` objects. +- Unchanged behavior: creates `Instance`, `NetworkBinding`, `SubnetClaim` using existing stateful control logic. +- Manages `network` scheduling gate removal once NSO signals networks are ready. +- Updates local `WorkloadDeployment.status` with aggregate replica counts (Karmada aggregates this back natively). +- **Remove**: `WorkloadDeployment.status.location` (location is now implicit in `spec.cityCode`). + +### `InstanceReconciler` + +- **Runs in POP cell operators** alongside `WorkloadDeploymentReconciler`. +- Manages `ResourceClaim` in the project for quota (unchanged). +- Manages `quota` scheduling gate removal once quota is granted. +- **New**: After updating local `Instance.status`, writes a corresponding `Instance` to the Karmada API Server for visibility. +- Requires two injected kubeconfigs at POP cell registration: project (quota) and Karmada API Server (write-back). + +### New: `InstanceProjector` + +A new controller in the Control Plane Cell: + +- Watches `Instance` objects written back to the Karmada API Server. +- Creates/updates read-only `Instance` projections in the corresponding project, owned by the project `WorkloadDeployment`. +- Deletes projections when the Karmada-side `Instance` is removed. + +--- + +## Auto Scaling + +Auto scaling is not implemented at launch, but the federation architecture is designed to support it without the Control Plane Cell being in the critical path. + +### Model + +Scaling decisions run **locally in the POP cell**. 
The `WorkloadDeploymentReconciler` observes local instance metrics against the policy in the propagated `WorkloadDeployment`, creates or deletes `Instance` objects locally, and triggers `NetworkBinding`/`SubnetClaim` setup via local NSO — all without a round-trip to the Control Plane Cell. + +**Quota is the single upstream dependency.** A new `Instance` is immediately stamped with the `quota` scheduling gate and a `ResourceClaim` is created in the project. The instance queues pending authorization and starts running as soon as the grant arrives. The scaling *decision* is never blocked — only the *execution* of new instances. + +```mermaid +sequenceDiagram + participant POP as POP Cell + participant Project + participant Karmada as Karmada API Server + participant CPC as Control Plane Cell + + POP->>POP: WorkloadDeploymentReconciler observes\nmetrics vs. WorkloadDeployment policy + + alt Scale Up + POP->>POP: create new Instance (quota gate applied) + POP->>POP: create NetworkBinding & SubnetClaim + POP->>POP: NSO reconciles networking + POP->>POP: remove network SchedulingGate + POP->>Project: InstanceReconciler creates ResourceClaim + Project-->>POP: quota granted → remove quota SchedulingGate + Note over POP: Instance starts running + POP->>Karmada: write back Instance status + Karmada->>CPC: InstanceProjector mirrors to Project + else Scale Down + POP->>POP: delete Instance, NetworkBinding, SubnetClaim + POP->>Karmada: InstanceReconciler removes write-back Instance + Karmada->>CPC: InstanceProjector removes projection from Project + end + + POP->>Karmada: aggregate updated WorkloadDeployment.status + Karmada->>CPC: WorkloadDeploymentFederator reads aggregated status + CPC->>Project: write WorkloadDeployment.status +``` + +### Failure behavior + +If the Control Plane Cell or Karmada is temporarily unavailable: + +- Existing instances continue running unaffected. 
+- Local scaling decisions still happen — the `WorkloadDeploymentReconciler` continues to act on observed metrics. +- Scale-down is fully local and unaffected. +- Scale-up of new instances is gated on quota grants, which require the project to be reachable. + +--- + +## Multicluster-Runtime Configuration + +The Control Plane Cell operator connects to: + +| Connection | Purpose | Config | +|---|---|---| +| Karmada Federation API Server | Write `WorkloadDeployment`, `PropagationPolicy`; read Instance write-backs | Static kubeconfig | +| Projects | Read `Workload`; write `WorkloadDeployment` spec/status, `Instance` projections | Milo provider (unchanged) | + +POP cell operators connect to: + +| Connection | Purpose | Config | +|---|---|---| +| Local POP cell | All local resource management | In-cluster config | +| Project | Write `ResourceClaim` for quota | Milo provider (unchanged) | +| Karmada Federation API Server | Write `Instance` objects for visibility | Static kubeconfig | + +--- + +## Namespace Mapping + +Resources written to the Karmada API Server follow the `ns-` convention established by the network-services-operator's `MappedNamespaceResourceStrategy`. This avoids collisions when multiple projects federate into a single Karmada API Server. Namespaces are auto-created on demand. + +The `MappedNamespaceResourceStrategy` pattern will be promoted from NSO's `internal/downstreamclient/` into **Milo** as part of this work, making it available to both the compute service and POP cell operators without duplication. + +`PropagationPolicy` objects live in the same namespace as the `WorkloadDeployment` objects they govern (`ns-`). + +--- + +## Decisions + +### Namespace Mapping Convention + +Resources written to the Karmada API Server follow the `ns-` convention. Namespaces are auto-created on demand. `PropagationPolicy` resources live in the same namespace as the `WorkloadDeployment` objects they govern. 
+ +### Shared Downstream Client Library + +The `MappedNamespaceResourceStrategy` pattern will be promoted from NSO's `internal/downstreamclient/` into **Milo** as part of this work. Both the Control Plane Cell operator and POP cell operators will depend on the Milo-hosted version. + +### PropagationPolicy Scope + +One `PropagationPolicy` per city code per project namespace, using a `labelSelector` to match all `WorkloadDeployment` objects labeled with `topology.datum.net/city-code: <city-code>`. Created lazily on first use, deleted when no deployment in the namespace targets that city. + +### NSO in POP Cells + +`network-services-operator` runs in each POP cell to reconcile `NetworkBinding`, `SubnetClaim`, and `Subnet` resources created locally by `WorkloadDeploymentReconciler`. This keeps all networking setup local to the POP cell, eliminating any dependency on the Control Plane Cell for network provisioning. + +### Auto Scaling + +Auto scaling decisions are local to the POP cell. Quota is the single upstream dependency — new instances queue with a `quota` scheduling gate and start as soon as the grant arrives. The Control Plane Cell is not in the critical path for scaling latency or availability.
diff --git a/go.mod b/go.mod index 19fc0103..48bab65b 100644 --- a/go.mod +++ b/go.mod @@ -1,31 +1,34 @@ module go.datum.net/compute -go 1.24.0 - -toolchain go1.24.2 +go 1.25.0 require ( + github.com/go-logr/logr v1.4.3 github.com/google/go-cmp v0.7.0 - github.com/onsi/ginkgo/v2 v2.23.4 - github.com/onsi/gomega v1.37.0 + github.com/karmada-io/api v1.15.0 + github.com/onsi/ginkgo/v2 v2.27.2 + github.com/onsi/gomega v1.38.2 + github.com/prometheus/client_golang v1.23.2 github.com/stretchr/testify v1.11.1 - go.datum.net/network-services-operator v0.1.0 - go.miloapis.com/milo v0.24.11 - golang.org/x/crypto v0.39.0 - golang.org/x/sync v0.16.0 + go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 + go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 + golang.org/x/crypto v0.45.0 + golang.org/x/sync v0.18.0 google.golang.org/protobuf v1.36.11 - k8s.io/api v0.33.1 - k8s.io/apimachinery v0.33.2 - k8s.io/client-go v0.33.1 - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 - sigs.k8s.io/controller-runtime v0.21.0 - sigs.k8s.io/gateway-api v1.2.1 - sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 + k8s.io/api v0.35.0 + k8s.io/apimachinery v0.35.0 + k8s.io/client-go v0.35.0 + k8s.io/component-base v0.35.0 + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 + sigs.k8s.io/controller-runtime v0.23.3 + sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c + sigs.k8s.io/multicluster-runtime v0.23.3 ) require ( - cel.dev/expr v0.19.1 // indirect - github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + cel.dev/expr v0.24.0 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/antlr4-go/antlr/v4 v4.13.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect @@ -35,74 +38,70 @@ require ( github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect 
- github.com/fxamacker/cbor/v2 v2.8.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.1 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect - github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/cel-go v0.23.2 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/cel-go v0.26.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.64.0 // indirect - github.com/prometheus/procfs v0.16.1 // indirect - github.com/spf13/cobra v1.9.1 // indirect - github.com/spf13/pflag v1.0.7 // indirect - github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/prometheus/common v0.66.1 // indirect + 
github.com/prometheus/procfs v0.17.0 // indirect + github.com/spf13/cobra v1.10.0 // indirect + github.com/spf13/pflag v1.0.9 // indirect + github.com/stoewer/go-strcase v1.3.1 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect - go.opentelemetry.io/proto/otlp v1.4.0 // indirect - go.uber.org/automaxprocs v1.6.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect + go.opentelemetry.io/otel v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 // indirect + go.opentelemetry.io/otel/metric v1.37.0 // indirect + go.opentelemetry.io/otel/sdk v1.37.0 // indirect + go.opentelemetry.io/otel/trace v1.37.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/net v0.41.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 // indirect + golang.org/x/mod v0.29.0 // indirect + golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // 
indirect + golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.12.0 // indirect - golang.org/x/tools v0.33.0 // indirect + golang.org/x/tools v0.38.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/grpc v1.71.1 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0 // indirect + google.golang.org/grpc v1.74.2 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.1 // indirect - k8s.io/apiserver v0.33.1 // indirect - k8s.io/component-base v0.33.1 // indirect + k8s.io/apiextensions-apiserver v0.35.0 // indirect + k8s.io/apiserver v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect - sigs.k8s.io/yaml v1.5.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index c472bd8b..42a98554 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,9 @@ -cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= -cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= 
-github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= -github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= +cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= +github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -17,16 +19,22 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= -github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= +github.com/evanphx/json-patch v5.9.11+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 
h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= -github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= -github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -42,17 +50,16 @@ github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZ github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod 
h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= -github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/cel-go v0.26.0 h1:DPGjXackMpJWH680oGY4lZhYjIameYmR+/6RBdDGmaI= +github.com/google/cel-go v0.26.0/go.mod h1:A9O8OU9rdvrK5MQyrqfIxo1a0u4g3sF8KB6PUIaryMM= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -62,18 +69,18 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 
-github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+uBhcekkmy4IkffJww= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/karmada-io/api v1.15.0 h1:6Dx+Q36LaoPqKM4gduUuhSBQ3eKjKusjkvmggLpt9xs= +github.com/karmada-io/api v1.15.0/go.mod h1:wNbBEmXYkrRLSC2VgmXizIG12FW+/sAUF7UIz5WlYAU= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -84,42 +91,43 @@ github.com/kylelemons/godebug v1.1.0 
h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= -github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= 
-github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= -github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.64.0 h1:pdZeA+g617P7oGv1CzdTzyeShxAGrTBsolKNOLQPGO4= -github.com/prometheus/common v0.64.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= -github.com/rogpeppe/go-internal v1.13.1 
h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= -github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= -github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= -github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= +github.com/spf13/cobra v1.10.0 h1:a5/WeUlSDCvV5a45ljW2ZFtV0bTDpkfSAj3uqB6Sc+0= +github.com/spf13/cobra v1.10.0/go.mod h1:9dhySC7dnTtEiqzmqfkLj47BslqLCUPMXjG2lj/NgoE= +github.com/spf13/pflag v1.0.8/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.3.1 h1:iS0MdW+kVTxgMoE1LAZyMiYJFKlOzLooE4MxjirtkAs= +github.com/stoewer/go-strcase v1.3.1/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= 
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -129,160 +137,125 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod 
h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.datum.net/network-services-operator v0.1.0 h1:PAXOZ5DdJFgRoeVBPIXhqkCm6DxbP4tVOPcr3Y7h/So= -go.datum.net/network-services-operator v0.1.0/go.mod h1:uloVfxqE+8DgSiMB651X8UC9yECpXbwp/NBstofCceE= -go.miloapis.com/milo v0.1.0 h1:AYFVz1lfta/NbWSFSSKPtnkCA2rN+iegxlfQrDgEvYY= -go.miloapis.com/milo v0.1.0/go.mod h1:X+DpWOchv/Vm63mwHnboW00KRGsODY2bUTS/bBbK1+E= -go.miloapis.com/milo v0.24.11 h1:rByXDKbP4ZEN0I/z1C2RyUCyQi0NWrITLqoQILSAn2E= -go.miloapis.com/milo v0.24.11/go.mod h1:xOFYvUsvSZV3z6eow5YdB5C/qRQf2s/5/arcfJs5XPg= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359 h1:P3dePA6cCXKimZzE6d7Xxpj2rz54BxOHI8K8ic7VQ+c= +go.datum.net/network-services-operator v0.21.10-0.20260528021428-b0f2347f5359/go.mod h1:Nr0PsCodkTW31vWVxR9dhAP9w0y+WHUYeyrcRnchcIE= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42 h1:LSHyqLt/jus6iEMvo8pc731L+PyrTHP2bqfMMtHPSWc= +go.miloapis.com/milo v0.25.2-0.20260528192736-e4258524ad42/go.mod h1:p9O2kk194mvoL8rhqjwb+LWB+GIyY4vQqiTowwibVWo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= 
-go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= -go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= -go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= -go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0 h1:Ahq7pZmv87yiyn3jeFz/LekZmPLLdKejuO3NcK9MssM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0/go.mod h1:MJTqhM0im3mRLw1i8uGHnCvUEeS7VwRyxlLC78PA18M= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0 h1:m639+BofXTvcY1q8CGs4ItwQarYtJPOWmVobfM1HpVI= 
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.35.0/go.mod h1:LjReUci/F4BUyv+y4dwnq3h/26iNOeC3wAIqgvTIZVo= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto 
v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= +golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod 
h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.26.0 
h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= -golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/tools v0.38.0 
h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= -google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0 h1:0UOBWO4dC+e51ui0NFKSPbkHHiQ4TmrEfEZMLDyRmY8= 
+google.golang.org/genproto/googleapis/api v0.0.0-20250728155136-f173205681a0/go.mod h1:8ytArBbtOy2xfht+y2fqKd5DRDJRUQhqbyEnQ4bDChs= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0 h1:MAKi5q709QWfnkkpNQ0M12hYJ1+e8qYVDyowc4U1XZM= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250728155136-f173205681a0/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/grpc v1.74.2 h1:WoosgB65DlWVC9FqI82dGsZhWFNBSLjQ84bjROOpMu4= +google.golang.org/grpc v1.74.2/go.mod h1:CtQ+BGjaAIXHs/5YS3i473GqwBBa1zGQNevxdeBEXrM= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= -k8s.io/api v0.33.1/go.mod h1:87esjTn9DRSRTD4fWMXamiXxJhpOIREjWOSjsW1kEHw= -k8s.io/apiextensions-apiserver 
v0.33.1 h1:N7ccbSlRN6I2QBcXevB73PixX2dQNIW0ZRuguEE91zI= -k8s.io/apiextensions-apiserver v0.33.1/go.mod h1:uNQ52z1A1Gu75QSa+pFK5bcXc4hq7lpOXbweZgi4dqA= -k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= -k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.1 h1:yLgLUPDVC6tHbNcw5uE9mo1T6ELhJj7B0geifra3Qdo= -k8s.io/apiserver v0.33.1/go.mod h1:VMbE4ArWYLO01omz+k8hFjAdYfc3GVAYPrhP2tTKccs= -k8s.io/client-go v0.33.1 h1:ZZV/Ks2g92cyxWkRRnfUDsnhNn28eFpt26aGc8KbXF4= -k8s.io/client-go v0.33.1/go.mod h1:JAsUrl1ArO7uRVFWfcj6kOomSlCv+JpvIsp6usAGefA= -k8s.io/component-base v0.33.1 h1:EoJ0xA+wr77T+G8p6T3l4efT2oNwbqBVKR71E0tBIaI= -k8s.io/component-base v0.33.1/go.mod h1:guT/w/6piyPfTgq7gfvgetyXMIh10zuXA6cRRm3rDuY= +k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= +k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= +k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/apiserver v0.35.0 h1:CUGo5o+7hW9GcAEF3x3usT3fX4f9r8xmgQeCBDaOgX4= +k8s.io/apiserver v0.35.0/go.mod h1:QUy1U4+PrzbJaM3XGu2tQ7U9A4udRRo5cyxkFX0GEds= +k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= +k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= +k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= +k8s.io/component-base v0.35.0/go.mod h1:85SCX4UCa6SCFt6p3IKAPej7jSnF3L8EbfSyMZayJR0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a 
h1:ZV3Zr+/7s7aVbjNGICQt+ppKWsF1tehxggNfbM7XnG8= -k8s.io/kube-openapi v0.0.0-20250610211856-8b98d1ed966a/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= -sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= -sigs.k8s.io/gateway-api v1.2.1 h1:fZZ/+RyRb+Y5tGkwxFKuYuSRQHu9dZtbjenblleOLHM= -sigs.k8s.io/gateway-api v1.2.1/go.mod h1:EpNfEXNjiYfUJypf0eZ0P5iXA9ekSGWaS1WgPaM42X0= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8 h1:Pq69tTKfN8ADw8m8A3wUtP8wJ9SPQbbOsgapm3BZEPw= -sigs.k8s.io/multicluster-runtime v0.21.0-alpha.8/go.mod h1:CpBzLMLQKdm+UCchd2FiGPiDdCxM5dgCCPKuaQ6Fsv0= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= 
+sigs.k8s.io/controller-runtime v0.23.3/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c h1:GS4VnGRV90GEUjrgQ2GT5ii6yzWj3KtgUg+sVMdhs5c= +sigs.k8s.io/gateway-api v1.3.1-0.20250527223622-54df0a899c1c/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/multicluster-runtime v0.23.3 h1:vrzlXRzHTDsjspUAfoW2rCtr0agoI4q20p9x4Fz4png= +sigs.k8s.io/multicluster-runtime v0.23.3/go.mod h1:r/UA4GHgFoXCcR4tcvlZz7SiLx3l1kJKDuBAhILNIHs= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= -sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/hack/e2e/kind-control-plane.yaml b/hack/e2e/kind-control-plane.yaml new file mode 100644 index 00000000..47f3c63b --- /dev/null +++ b/hack/e2e/kind-control-plane.yaml @@ -0,0 +1,17 @@ +# Kind cluster configuration for the compute-control-plane management cluster. 
+# +# extraPortMappings exposes port 32443 on the macOS host so that the Karmada +# API server NodePort service (nodePort: 32443) is accessible at +# https://localhost:32443 without any additional port-forwarding. +# +# This matches KARMADA_API_NODEPORT in Taskfile.yaml. + +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + extraPortMappings: + - containerPort: 32443 # Karmada API server NodePort + hostPort: 32443 + protocol: TCP + listenAddress: "127.0.0.1" diff --git a/hack/e2e/make-internal-kubeconfig.sh b/hack/e2e/make-internal-kubeconfig.sh new file mode 100755 index 00000000..3303a5bd --- /dev/null +++ b/hack/e2e/make-internal-kubeconfig.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# make-internal-kubeconfig.sh +# +# Produces a kubeconfig variant that uses the Kind node's Docker container IP +# instead of localhost. This variant is stored in Karmada so the controller +# manager (running inside Docker) can reach member cluster API servers across +# the kind bridge network. +# +# Background: Kind maps each cluster's API server to a random localhost port +# on the developer machine. Inside Docker containers, "localhost" refers to the +# container's own loopback — not the host. We therefore swap the server address +# to the Kind control-plane container's Docker bridge IP (e.g. 172.18.0.x) and +# set insecure-skip-tls-verify because the node certificate does not include +# the Docker bridge IP in its SANs. +# +# Usage: +# hack/e2e/make-internal-kubeconfig.sh \ +# tmp/e2e/kubeconfigs/pop-dfw.yaml \ +# tmp/e2e/kubeconfigs/pop-dfw-internal.yaml \ +# compute-pop-dfw + +set -euo pipefail + +INPUT="${1:?usage: $0 <input-kubeconfig> <output-kubeconfig> <kind-cluster-name>}" +OUTPUT="${2:?usage: $0 <input-kubeconfig> <output-kubeconfig> <kind-cluster-name>}" +CLUSTER_NAME="${3:?usage: $0 <input-kubeconfig> <output-kubeconfig> <kind-cluster-name>}" + +CONTAINER_NAME="${CLUSTER_NAME}-control-plane" + +# Resolve the container's Docker bridge IP.
+DOCKER_IP=$(docker inspect \ + -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' \ + "${CONTAINER_NAME}" 2>/dev/null || true) + +if [ -z "${DOCKER_IP}" ]; then + echo "ERROR: Could not resolve Docker IP for container '${CONTAINER_NAME}'." >&2 + echo " Is the Kind cluster '${CLUSTER_NAME}' running?" >&2 + exit 1 +fi + +echo " ${CLUSTER_NAME}: Docker IP ${DOCKER_IP} → ${OUTPUT}" + +python3 - "${INPUT}" "${OUTPUT}" "${DOCKER_IP}" <<'PYEOF' +import sys, yaml + +src, dst, docker_ip = sys.argv[1], sys.argv[2], sys.argv[3] + +with open(src) as f: + cfg = yaml.safe_load(f) + +for cluster in cfg.get('clusters', []): + # Kind API server always listens on port 6443 inside the container. + cluster['cluster']['server'] = f'https://{docker_ip}:6443' + # The node cert only covers localhost / 127.0.0.1, not the bridge IP. + cluster['cluster']['insecure-skip-tls-verify'] = True + cluster['cluster'].pop('certificate-authority-data', None) + +with open(dst, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) +PYEOF diff --git a/hack/e2e/patch-cluster-secret.sh b/hack/e2e/patch-cluster-secret.sh new file mode 100755 index 00000000..e29ed383 --- /dev/null +++ b/hack/e2e/patch-cluster-secret.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# patch-cluster-secret.sh +# +# After "karmadactl join", Karmada stores the member cluster's kubeconfig in a +# Secret referenced by the Cluster object's spec.secretRef, and sets +# spec.apiEndpoint to the localhost address it resolved from the external +# kubeconfig. The Karmada controller manager runs inside Docker and cannot use +# localhost to reach POP cell API servers. +# +# This script: +# 1. Replaces the kubeconfig in the Secret with the Docker-IP variant so that +# the Karmada controller can make API calls to the member cluster. +# 2. Patches spec.apiEndpoint on the Cluster object so that health checks also +# use the Docker bridge IP instead of localhost. 
+# +# Usage: +# hack/e2e/patch-cluster-secret.sh \ +# tmp/e2e/kubeconfigs/karmada.yaml \ +# compute-pop-dfw \ +# tmp/e2e/kubeconfigs/pop-dfw-internal.yaml + +set -euo pipefail + +KARMADA_KUBECONFIG="${1:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}" +CLUSTER_NAME="${2:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}" +INTERNAL_KUBECONFIG="${3:?usage: $0 <karmada-kubeconfig> <cluster-name> <internal-kubeconfig>}" + +# ------------------------------------------------------------------ +# Read the Cluster object's secretRef (name + namespace) +# ------------------------------------------------------------------ +SECRET_NAME=$(kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + get cluster "${CLUSTER_NAME}" \ + -o jsonpath='{.spec.secretRef.name}' 2>/dev/null || true) + +if [ -z "${SECRET_NAME}" ]; then + echo "ERROR: Could not find spec.secretRef.name on cluster '${CLUSTER_NAME}'." >&2 + echo " Has karmadactl join completed successfully?" >&2 + exit 1 +fi + +SECRET_NAMESPACE=$(kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + get cluster "${CLUSTER_NAME}" \ + -o jsonpath='{.spec.secretRef.namespace}' 2>/dev/null || true) + +SECRET_NAMESPACE="${SECRET_NAMESPACE:-karmada-system}" + +echo " Patching secret ${SECRET_NAMESPACE}/${SECRET_NAME} with Docker-IP kubeconfig..."
+ +# ------------------------------------------------------------------ +# Replace the kubeconfig data in the secret +# ------------------------------------------------------------------ +kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + create secret generic "${SECRET_NAME}" \ + --namespace="${SECRET_NAMESPACE}" \ + --from-file=kubeconfig="${INTERNAL_KUBECONFIG}" \ + --dry-run=client -o yaml \ + | kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + apply -f - + +echo " Secret ${SECRET_NAMESPACE}/${SECRET_NAME} updated — Karmada controller will use Docker bridge IP" + +# ------------------------------------------------------------------ +# Extract the Docker-IP server URL from the internal kubeconfig and +# patch spec.apiEndpoint on the Cluster object so that Karmada's +# cluster-status controller uses the same reachable address for health +# checks. Without this patch the controller continues to probe the +# localhost address stored by karmadactl join and the cluster never +# transitions to Ready. +# ------------------------------------------------------------------ +DOCKER_SERVER=$(kubectl \ + --kubeconfig="${INTERNAL_KUBECONFIG}" \ + config view --minify -o jsonpath='{.clusters[0].cluster.server}') + +if [ -z "${DOCKER_SERVER}" ]; then + echo "ERROR: Could not read server URL from ${INTERNAL_KUBECONFIG}" >&2 + exit 1 +fi + +echo " Patching spec.apiEndpoint on cluster '${CLUSTER_NAME}' → ${DOCKER_SERVER}..." +kubectl \ + --kubeconfig="${KARMADA_KUBECONFIG}" \ + patch cluster "${CLUSTER_NAME}" \ + --type=merge \ + -p "{\"spec\":{\"apiEndpoint\":\"${DOCKER_SERVER}\"}}" + +echo " Cluster '${CLUSTER_NAME}' patched — health checks will now use Docker bridge IP" diff --git a/internal/config/config.go b/internal/config/config.go index dddb7926..4a6e8e76 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -229,6 +229,23 @@ type DiscoveryConfig struct { // template when connecting to project control planes. 
When not provided, // the operator will use the in-cluster config. ProjectKubeconfigPath string `json:"projectKubeconfigPath"` + + // ClusterName is the stable, unique name for this edge cluster. It is + // stamped onto ResourceClaim objects so that each edge controller can + // distinguish its own claims from those created by other edge controllers + // in the same project control planes. + // + // Required when Mode is "milo". Optional in single mode; defaults to "single". + ClusterName string `json:"clusterName"` + + // QuotaKubeconfigPath is the path to the kubeconfig file used when creating + // ResourceClaim objects against Milo project control planes. When set it + // takes precedence over ProjectKubeconfigPath for quota calls. When both are + // unset, quota accounting is disabled. + // + // Use this field in deployments (mode: single or mode: milo) that need to + // talk to api.datum.net for quota enforcement. + QuotaKubeconfigPath string `json:"quotaKubeconfigPath"` } func SetDefaults_DiscoveryConfig(obj *DiscoveryConfig) { @@ -253,6 +270,36 @@ func (c *DiscoveryConfig) ProjectRestConfig() (*rest.Config, error) { return clientcmd.BuildConfigFromFlags("", c.ProjectKubeconfigPath) } +// QuotaRestConfig returns the REST config for quota ResourceClaim management +// against Milo project control planes. QuotaKubeconfigPath is preferred; if +// unset, ProjectKubeconfigPath is used as a fallback. +// +// Returns (nil, nil) when no credential path is configured at all — this is +// the intentional opt-out case and the caller should disable quota enforcement. +// +// Returns (nil, error) when a credential path IS configured but the file does +// not exist on disk. This is a misconfiguration (Secret not mounted, wrong +// path) that must not silently disable enforcement; callers should treat this +// as a fatal startup error. 
+func (c *DiscoveryConfig) QuotaRestConfig() (*rest.Config, error) { + path := c.QuotaKubeconfigPath + if path == "" { + path = c.ProjectKubeconfigPath + } + if path == "" { + // No credential path configured: intentional opt-out. Caller logs and + // disables enforcement. + return nil, nil + } + if _, err := os.Stat(path); os.IsNotExist(err) { + // Path explicitly configured but file absent: operator intended enforcement + // but the credential is missing (unmounted Secret, wrong path). Fail loud. + return nil, fmt.Errorf("quota kubeconfig path %q is configured but file does not exist: "+ + "ensure the quota credential Secret is mounted correctly", path) + } + return clientcmd.BuildConfigFromFlags("", path) +} + func init() { SchemeBuilder.Register(&WorkloadOperator{}) } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 5f586932..bff584a6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -1,6 +1,8 @@ package config import ( + "os" + "path/filepath" "testing" "k8s.io/apimachinery/pkg/runtime" @@ -56,3 +58,68 @@ webhookServer: t.Error("TLS.CertDir was not defaulted") } } + +// TestQuotaRestConfig_NilWhenNoPath verifies that omitting quotaKubeconfigPath +// returns (nil, nil) — the intentional opt-out / enforcement-disabled case. +func TestQuotaRestConfig_NilWhenNoPath(t *testing.T) { + cfg := &DiscoveryConfig{} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() = non-nil, want nil (no path configured)") + } +} + +// TestQuotaRestConfig_ErrorWhenPathMissing verifies that explicitly setting a +// kubeconfig path that does not exist on disk returns a non-nil error (fail-loud). +// This reverses the old da63916 behavior of silently returning (nil, nil). 
+func TestQuotaRestConfig_ErrorWhenPathMissing(t *testing.T) { + cfg := &DiscoveryConfig{ + QuotaKubeconfigPath: "/nonexistent/path/quota.kubeconfig", + } + restCfg, err := cfg.QuotaRestConfig() + if err == nil { + t.Fatal("QuotaRestConfig() error = nil, want non-nil error when path is configured but file absent") + } + if restCfg != nil { + t.Errorf("QuotaRestConfig() returned non-nil config alongside error") + } +} + +// TestQuotaRestConfig_SuccessWhenFileExists verifies that a configured path +// pointing to an existing (though minimal) kubeconfig file succeeds. +func TestQuotaRestConfig_SuccessWhenFileExists(t *testing.T) { + // Write a minimal kubeconfig that clientcmd can parse. + dir := t.TempDir() + kubeconfigPath := filepath.Join(dir, "quota.kubeconfig") + minimalKubeconfig := []byte(`apiVersion: v1 +kind: Config +clusters: +- cluster: + server: https://localhost:1234 + name: test +contexts: +- context: + cluster: test + user: test + name: test +current-context: test +users: +- name: test + user: {} +`) + if err := os.WriteFile(kubeconfigPath, minimalKubeconfig, 0600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + cfg := &DiscoveryConfig{QuotaKubeconfigPath: kubeconfigPath} + restCfg, err := cfg.QuotaRestConfig() + if err != nil { + t.Fatalf("QuotaRestConfig() error = %v, want nil", err) + } + if restCfg == nil { + t.Error("QuotaRestConfig() = nil, want non-nil when file exists") + } +} diff --git a/internal/controller/indexers.go b/internal/controller/indexers.go index fb0ebe88..7d9e1ae1 100644 --- a/internal/controller/indexers.go +++ b/internal/controller/indexers.go @@ -15,7 +15,10 @@ import ( const ( deploymentWorkloadUIDIndex = "deploymentWorkloadUIDIndex" workloadNetworksIndex = "workloadNetworksIndex" - deploymentLocationIndex = "deploymentLocationIndex" + // deploymentCityCodeIndex indexes WorkloadDeployments by their Spec.CityCode + // so that SubnetClaim/Subnet watches can efficiently find the deployments + // that target the same city 
as a changed networking resource. + deploymentCityCodeIndex = "deploymentCityCodeIndex" ) func AddIndexers(ctx context.Context, mgr mcmanager.Manager) error { @@ -30,9 +33,10 @@ func addWorkloadDeploymentIndexers(ctx context.Context, mgr mcmanager.Manager) e return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentWorkloadUIDIndex, err) } - // Index workload deployments by location - if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentLocationIndex, deploymentLocationIndexFunc); err != nil { - return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentLocationIndex, err) + // Index workload deployments by city code so that SubnetClaim/Subnet watch + // handlers can efficiently find deployments targeting the same city. + if err := mgr.GetFieldIndexer().IndexField(ctx, &computev1alpha.WorkloadDeployment{}, deploymentCityCodeIndex, deploymentCityCodeIndexFunc); err != nil { + return fmt.Errorf("failed to add workload deployment indexer %q: %w", deploymentCityCodeIndex, err) } return nil @@ -44,18 +48,12 @@ func deploymentWorkloadUIDIndexFunc(o client.Object) []string { } } -func deploymentLocationIndexFunc(o client.Object) []string { +func deploymentCityCodeIndexFunc(o client.Object) []string { deployment := o.(*computev1alpha.WorkloadDeployment) - if deployment.Status.Location == nil { + if deployment.Spec.CityCode == "" { return nil } - - return []string{ - types.NamespacedName{ - Namespace: deployment.Status.Location.Namespace, - Name: deployment.Status.Location.Name, - }.String(), - } + return []string{deployment.Spec.CityCode} } func addWorkloadIndexers(ctx context.Context, mgr mcmanager.Manager) error { diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index e5bc3564..f11520a7 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -5,52 +5,154 @@ package controller import ( 
"context" "fmt" + "maps" "strings" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" - ctrlsource "sigs.k8s.io/controller-runtime/pkg/source" mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" + "go.miloapis.com/milo/pkg/downstreamclient" "go.datum.net/compute/internal/controller/instancecontrol" + quotametrics "go.datum.net/compute/internal/quota" ) -const instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" +const ( + // instanceQuotaFinalizer ensures the quota ResourceClaim is deleted when + // an Instance is removed. + instanceQuotaFinalizer = "quota.compute.datumapis.com/claim-cleanup" + + // instanceControllerFinalizer is registered with the finalizer framework and + // triggers downstream write-back cleanup on deletion. 
+ instanceControllerFinalizer = "compute.datumapis.com/instance-controller" + + // instanceQuotaClaimSourceLabel is stamped on ResourceClaim objects with the + // name of the edge cluster that created them. The claim watch predicate uses + // this label to filter out claims written by other edge controllers targeting + // the same project control planes. + instanceQuotaClaimSourceLabel = "compute.datumapis.com/source-cluster" + + // quotaResourceTypeInstances is the quota resource type for Instance count. + quotaResourceTypeInstances = "compute.datumapis.com/instances" + + // miloProjectAPIGroup is the API group for Milo resource-manager resources. + miloProjectAPIGroup = "resourcemanager.miloapis.com" + + // miloProjectKind is the Kind used for Milo Project resources. + miloProjectKind = "Project" + + // msgNotProgrammed is the human-readable message for the not-programmed state. + msgNotProgrammed = "Instance has not been programmed" + + // msgInstanceReady is the human-readable message for the ready state. + msgInstanceReady = "Instance is ready" + + // msgInstanceProgrammed is the human-readable message for the programmed state. + msgInstanceProgrammed = "Instance has been programmed" + + // msgInstanceRunning is the human-readable message for the running state. + msgInstanceRunning = "Instance is running" + + // reasonNetworkFailedToCreate is the reason code for network creation failure. + reasonNetworkFailedToCreate = "NetworkFailedToCreate" +) // clusterGetter is the subset of mcmanager.Manager used by InstanceReconciler. // Keeping it narrow allows unit tests to substitute a minimal fake. type clusterGetter interface { - GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) + GetCluster(ctx context.Context, clusterName multicluster.ClusterName) (cluster.Cluster, error) } +// InstanceProjectIDFunc derives the Milo project ID for a given Instance. +// In Milo mode the project ID equals the multicluster ClusterName. 
In +// single-cell mode it is decoded from the upstream-cluster-name namespace label. +// Returns ("", nil) when the instance has no project affiliation (skip quota). +// Returns ("", err) for transient failures that should trigger a requeue. +type InstanceProjectIDFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + +// InstanceProjectNamespaceFunc derives the in-project namespace where +// ResourceClaims for a given Instance should be created. In Milo mode this +// equals instance.Namespace. In single-cell mode it comes from the +// upstream-namespace namespace label. +// Returns ("", nil) when the instance has no project affiliation (skip quota). +// Returns ("", err) for transient failures that should trigger a requeue. +type InstanceProjectNamespaceFunc func( + ctx context.Context, + clusterName multicluster.ClusterName, + instance *computev1alpha.Instance, +) (string, error) + // InstanceReconciler reconciles an Instance object type InstanceReconciler struct { - mgr clusterGetter - managementCluster cluster.Cluster + mgr clusterGetter + scheme *runtime.Scheme + quotaClientManager *quotametrics.ProjectQuotaClientManager + edgeClusterName string + // recorder emits Kubernetes events on the Instance object for quota failure + // modes so operators can diagnose issues via `kubectl describe`. + recorder record.EventRecorder + // projectIDForInstance derives the Milo project ID used for quota + // ResourceClaim management. In Milo mode it returns string(clusterName); in + // single-cell mode it reads the upstream-cluster-name label from the edge + // namespace and decodes "cluster-<project-id>" → "<project-id>". + projectIDForInstance InstanceProjectIDFunc + // projectNamespaceForInstance derives the in-project namespace where + // ResourceClaims must be created.
In Milo mode the ResourceClaim lives in + // instance.Namespace (the project-level namespace); in single-cell mode the + // edge namespace is ns-{uid} which does not exist in the project control + // plane — the real namespace is the upstream-namespace label value (e.g. + // "default"). When nil, falls back to instance.Namespace. + projectNamespaceForInstance InstanceProjectNamespaceFunc + // clusterNameForProject maps a Milo project ID back to the multicluster + // ClusterName that owns that project's workloads. In Milo mode the + // ClusterName equals the project ID. In single-cell mode the only registered + // cluster is "single" regardless of project ID. When nil, falls back to + // multicluster.ClusterName(projectID), which is correct for Milo mode. + clusterNameForProject func(projectID string) multicluster.ClusterName + // FederationClient is an optional client pointing at the upstream + // Karmada/federation control plane (configured via --federation-kubeconfig). + // When non-nil, the reconciler writes a copy of each Instance back to the + // federation control plane so that the InstanceProjector (running in the + // management cluster) can aggregate status across all POP cells. Set to nil to + // disable federation write-back (e.g. in non-federation deployments). 
+ FederationClient client.Client + finalizers finalizer.Finalizers } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/finalizers,verbs=update // +kubebuilder:rbac:groups=quota.miloapis.com,resources=resourceclaims,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (_ ctrl.Result, err error) { logger := log.FromContext(ctx) @@ -69,29 +171,24 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, err } + // Run the finalizer framework first. This handles downstream write-back cleanup + // via the Finalize method registered below. + finalizationResult, err := r.finalizers.Finalize(ctx, &instance) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err) + } + if finalizationResult.Updated { + if err = cl.GetClient().Update(ctx, &instance); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err) + } + return ctrl.Result{}, nil + } + logger.Info("reconciling instance") defer logger.Info("reconcile complete") if !instance.DeletionTimestamp.IsZero() { - if controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { - claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) - var claim quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: instance.Namespace, Name: claimName}, &claim); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed getting resource claim for deletion: %w", err) - } - } else { - if err := r.managementCluster.GetClient().Delete(ctx, &claim); 
client.IgnoreNotFound(err) != nil { - return ctrl.Result{}, fmt.Errorf("failed deleting resource claim: %w", err) - } - } - - controllerutil.RemoveFinalizer(&instance, instanceQuotaFinalizer) - if err := cl.GetClient().Update(ctx, &instance); err != nil { - return ctrl.Result{}, fmt.Errorf("failed removing quota finalizer: %w", err) - } - } - return ctrl.Result{}, nil + return ctrl.Result{}, r.reconcileDeletion(ctx, cl.GetClient(), req.ClusterName, &instance) } if !controllerutil.ContainsFinalizer(&instance, instanceQuotaFinalizer) { @@ -102,94 +199,439 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ return ctrl.Result{}, nil } - grantedCondition, err := r.reconcileQuotaClaim(ctx, req.ClusterName, &instance) + statusChanged, quotaErr := r.reconcileQuotaCondition(ctx, req.ClusterName, &instance) + + // Even when reconcileQuotaCondition returns a transient error, persist any + // condition change first so the failure reason is visible on the Instance. + // We return the error afterwards so controller-runtime requeues with backoff. + readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling quota claim: %w", err) + return ctrl.Result{}, err } - statusChanged := false + if statusChanged || readyChanged { + if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { + return ctrl.Result{}, err + } + // Return with the quota error (nil or transient) so controller-runtime + // requeues with backoff on failures. On the success path (quotaErr==nil) + // we fall through to removeQuotaSchedulingGate below instead of returning + // early, so the gate is cleared in the same reconcile pass rather than + // waiting for a requeue that may never come (ResourceClaim is immutable + // and local Instances are not watched). 
+ if quotaErr != nil { + if err := r.writeBackToUpstream(ctx, req.ClusterName, &instance); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, quotaErr + } + } else if quotaErr != nil { + // No status change but quota evaluation failed — return error to requeue. + return ctrl.Result{}, quotaErr + } + + if err := r.removeQuotaSchedulingGate(ctx, cl.GetClient(), &instance); err != nil { + return ctrl.Result{}, err + } + + if err := r.writeBackToUpstream(ctx, req.ClusterName, &instance); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil +} + +// reconcileDeletion handles quota-claim cleanup when an Instance is being +// deleted. It removes the quota finalizer once the ResourceClaim is gone. +func (r *InstanceReconciler) reconcileDeletion(ctx context.Context, cl client.Client, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + if !controllerutil.ContainsFinalizer(instance, instanceQuotaFinalizer) { + return nil + } + + if r.quotaClientManager != nil { + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + return fmt.Errorf("resolving project ID during deletion: %w", err) + } + if projectID == "" { + // Cannot locate the claim without a project ID. Log at ERROR and emit an + // event so the operator is aware of the orphaned claim. Fall through to + // finalizer removal so the Instance is not permanently stuck in Terminating. + // The orphaned claim will count against project budget until Milo's TTL/GC + // removes it. 
+ log.FromContext(ctx).Error(nil, "project ID unresolvable during deletion; ResourceClaim may be orphaned — budget leak possible", + "instance", instance.Name, "namespace", instance.Namespace) + r.recorder.Event(instance, corev1.EventTypeWarning, + "QuotaClaimOrphaned", + "Skipping ResourceClaim cleanup: project ID could not be resolved; claim may be orphaned in Milo project control plane") + quotametrics.ClaimOrphanedTotal.Inc() + } else { + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + return fmt.Errorf("failed getting quota client for deletion: %w", err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + return fmt.Errorf("resolving project namespace during deletion: %w", err) + } + claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) + var claim quotav1alpha1.ResourceClaim + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: claimNamespace, Name: claimName}, &claim); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed getting resource claim for deletion: %w", err) + } + } else { + if err := projectClient.Delete(ctx, &claim); client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed deleting resource claim: %w", err) + } + } + } + } + + controllerutil.RemoveFinalizer(instance, instanceQuotaFinalizer) + if err := cl.Update(ctx, instance); err != nil { + return fmt.Errorf("failed removing quota finalizer: %w", err) + } + return nil +} + +// reconcileQuotaCondition reconciles the ResourceClaim and updates the +// InstanceQuotaGranted status condition. It returns (changed, err) where +// changed=true means a status update is required, and err non-nil means the +// reconciler should requeue (with backoff) in addition to writing the condition. 
+func (r *InstanceReconciler) reconcileQuotaCondition(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (bool, error) { + grantedCondition, claimErr := r.reconcileQuotaClaim(ctx, clusterName, instance) + + // reconcileQuotaClaim returns (condition, err). A non-nil error signals a + // transient infrastructure failure; a non-nil condition carries the reason to + // write. Both can be non-nil: write the condition AND requeue with backoff. switch { - case grantedCondition == nil || (grantedCondition.Status == metav1.ConditionFalse && grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason): - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition == nil && claimErr == nil: + // No claim yet and no error: labels not yet propagated. Stay PendingEvaluation. + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionUnknown, Reason: computev1alpha.InstanceQuotaGrantedReasonPendingEvaluation, Message: "Waiting for quota evaluation", ObservedGeneration: instance.Generation, + }), nil + + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionFalse && + grantedCondition.Reason == quotav1alpha1.ResourceClaimPendingReason: + // Claim exists but pending — no AllowanceBucket. Distinct from "evaluating". 
+ changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionUnknown, + Reason: computev1alpha.InstanceQuotaGrantedReasonNoBudget, + Message: "ResourceClaim is pending: no AllowanceBucket configured for this project", + ObservedGeneration: instance.Generation, + }) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonNoBudget, + "ResourceClaim pending: no AllowanceBucket configured for this project") + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonNoBudget).Inc() + return changed, claimErr + + case grantedCondition != nil && grantedCondition.Type == computev1alpha.InstanceQuotaGranted: + // reconcileQuotaClaim populated a structured failure condition. + changed := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: grantedCondition.Status, + Reason: grantedCondition.Reason, + Message: grantedCondition.Message, + ObservedGeneration: instance.Generation, }) + return changed, claimErr - case grantedCondition.Status == metav1.ConditionTrue: - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + case grantedCondition != nil && grantedCondition.Status == metav1.ConditionTrue: + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr - case grantedCondition.Status == metav1.ConditionFalse: + case grantedCondition != nil: // False, non-pending reason from ResourceClaim reason := computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded if grantedCondition.Reason == quotav1alpha1.ResourceClaimValidationFailedReason { reason = 
computev1alpha.InstanceQuotaGrantedReasonValidationFailed } - statusChanged = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + return apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceQuotaGranted, Status: metav1.ConditionFalse, Reason: reason, Message: grantedCondition.Message, ObservedGeneration: instance.Generation, - }) + }), claimErr + + default: // grantedCondition == nil && claimErr != nil — should not reach here + return false, claimErr } +} - readyChanged, err := r.reconcileInstanceReadyCondition(ctx, cl.GetClient(), &instance, r.checkForNetworkCreationFailure) +// removeQuotaSchedulingGate removes the quota scheduling gate from the +// Instance spec once QuotaGranted=True has been persisted to status. +// It guards on ObservedGeneration to prevent a stale True condition from +// generation N unblocking a generation N+1 instance before quota for the +// new spec has been evaluated. +func (r *InstanceReconciler) removeQuotaSchedulingGate(ctx context.Context, cl client.Client, instance *computev1alpha.Instance) error { + quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) + if quotaGrantedCond == nil || quotaGrantedCond.Status != metav1.ConditionTrue { + return nil + } + // Stale condition guard: only remove the gate if the condition reflects the + // current spec generation. A condition from an older generation means quota + // has not yet been evaluated for the current spec. 
+ if quotaGrantedCond.ObservedGeneration != instance.Generation { + return nil + } + if instance.Spec.Controller == nil { + return nil + } + + newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) + gateRemoved := false + for _, gate := range instance.Spec.Controller.SchedulingGates { + if gate.Name == instancecontrol.QuotaSchedulingGate.String() { + gateRemoved = true + continue + } + newGates = append(newGates, gate) + } + if !gateRemoved { + return nil + } + + patch := client.MergeFrom(instance.DeepCopy()) + instance.Spec.Controller.SchedulingGates = newGates + if err := cl.Patch(ctx, instance, patch); err != nil { + return fmt.Errorf("failed patching quota scheduling gate: %w", err) + } + return nil +} + +// Finalize removes the downstream write-back Instance when the local Instance is +// deleted. It is a no-op when downstream federation is disabled. +func (r *InstanceReconciler) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) { + if r.FederationClient == nil { + return finalizer.Result{}, nil + } + + instance := obj.(*computev1alpha.Instance) + + downstreamInstance := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(instance), downstreamInstance) + if apierrors.IsNotFound(err) { + // Already gone — nothing to do. 
+ return finalizer.Result{}, nil + } if err != nil { - return ctrl.Result{}, err + return finalizer.Result{}, fmt.Errorf("failed getting downstream instance for deletion: %w", err) } - if statusChanged || readyChanged { - if err := cl.GetClient().Status().Update(ctx, &instance); err != nil { - return ctrl.Result{}, err + if err := r.FederationClient.Delete(ctx, downstreamInstance); client.IgnoreNotFound(err) != nil { + return finalizer.Result{}, fmt.Errorf("failed deleting downstream write-back instance: %w", err) + } + + return finalizer.Result{}, nil +} + +// writeBackToUpstream copies the Instance spec and status to the upstream +// Karmada/federation control plane so that the InstanceProjector can aggregate +// state from all POP cells. It is a no-op when FederationClient is nil (federation disabled). +func (r *InstanceReconciler) writeBackToUpstream(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) error { + if r.FederationClient == nil { + return nil + } + + // Encode the POP-cell cluster name using the same convention as NSO's + // MappedNamespaceResourceStrategy: "cluster-" with "/" → "_". + // This is the fallback; the namespace label takes precedence when present. + encodedClusterName := "cluster-" + strings.ReplaceAll(string(clusterName), "/", "_") + + // Read the upstream project namespace name and cluster name from the namespace + // labels stamped by NSO's MappedNamespaceResourceStrategy. These carry the true + // project cluster name (e.g. "cluster-datum-cloud") and upstream namespace (e.g. + // "default"), which the InstanceProjector needs to find the right project cluster. 
+ upstreamNamespace := instance.Namespace // fallback: cell namespace (ns-) + var downstreamNS corev1.Namespace + if err := r.FederationClient.Get(ctx, client.ObjectKey{Name: instance.Namespace}, &downstreamNS); err == nil { + if v := downstreamNS.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]; v != "" { + upstreamNamespace = v + } + if v := downstreamNS.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]; v != "" { + encodedClusterName = v } - // Return after the status update so that the next reconcile sees the - // updated QuotaGranted condition before attempting spec changes. - return ctrl.Result{}, nil } - // Remove the quota scheduling gate once QuotaGranted=True is persisted. - quotaGrantedCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCond != nil && quotaGrantedCond.Status == metav1.ConditionTrue { - if instance.Spec.Controller != nil { - newGates := make([]computev1alpha.SchedulingGate, 0, len(instance.Spec.Controller.SchedulingGates)) - gateRemoved := false - for _, gate := range instance.Spec.Controller.SchedulingGates { - if gate.Name == instancecontrol.QuotaSchedulingGate.String() { - gateRemoved = true - continue - } - newGates = append(newGates, gate) - } - if gateRemoved { - patch := client.MergeFrom(instance.DeepCopy()) - instance.Spec.Controller.SchedulingGates = newGates - if err := cl.GetClient().Patch(ctx, &instance, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("failed patching quota scheduling gate: %w", err) - } - } + logger := log.FromContext(ctx) + missingLabels := []string{} + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + } { + if instance.Labels[key] == "" { + missingLabels = append(missingLabels, key) } } + if len(missingLabels) > 0 { + logger.Info("instance is missing linking labels for write-back; projection owner-ref will not be set", + 
"instance", instance.Name, "namespace", instance.Namespace, + "missingLabels", missingLabels) + } - return ctrl.Result{}, nil + writeBack := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name, + Namespace: instance.Namespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedClusterName, + downstreamclient.UpstreamOwnerNamespaceLabel: upstreamNamespace, + computev1alpha.WorkloadUIDLabel: instance.Labels[computev1alpha.WorkloadUIDLabel], + computev1alpha.WorkloadDeploymentUIDLabel: instance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + computev1alpha.InstanceIndexLabel: instance.Labels[computev1alpha.InstanceIndexLabel], + computev1alpha.WorkloadDeploymentNameLabel: instance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + computev1alpha.CityCodeLabel: instance.Labels[computev1alpha.CityCodeLabel], + computev1alpha.WorkloadNameLabel: instance.Labels[computev1alpha.WorkloadNameLabel], + computev1alpha.PlacementNameLabel: instance.Labels[computev1alpha.PlacementNameLabel], + }, + }, + Spec: instance.Spec, + } + + existing := &computev1alpha.Instance{} + err := r.FederationClient.Get(ctx, client.ObjectKeyFromObject(writeBack), existing) + if apierrors.IsNotFound(err) { + // Ensure the namespace exists in the downstream control plane before creating the Instance. 
+ ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: instance.Namespace}} + if err := r.FederationClient.Create(ctx, ns); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed ensuring downstream namespace: %w", err) + } + if err := r.FederationClient.Create(ctx, writeBack); err != nil { + return fmt.Errorf("failed creating downstream write-back instance: %w", err) + } + writeBack.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, writeBack); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status after create: %w", err) + } + return nil + } + if err != nil { + return fmt.Errorf("failed getting downstream instance: %w", err) + } + + // Build a comparable map containing only the keys this function owns so that + // Karmada-managed labels on the existing object do not cause spurious updates. + ownedLabels := make(map[string]string, len(writeBack.Labels)) + for k := range writeBack.Labels { + ownedLabels[k] = existing.Labels[k] + } + + // Update spec + labels only if owned keys differ. + if !apiequality.Semantic.DeepEqual(existing.Spec, instance.Spec) || + !apiequality.Semantic.DeepEqual(ownedLabels, writeBack.Labels) { + existing.Spec = instance.Spec + // Merge writeBack.Labels into existing.Labels. Only keys owned by + // writeBackToUpstream are written; any labels Karmada or other actors + // have placed on the downstream object are preserved. + if existing.Labels == nil { + existing.Labels = make(map[string]string) + } + maps.Copy(existing.Labels, writeBack.Labels) + if err := r.FederationClient.Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance: %w", err) + } + } + + // Update status only if it differs. 
+ if !apiequality.Semantic.DeepEqual(existing.Status, instance.Status) { + existing.Status = instance.Status + if err := r.FederationClient.Status().Update(ctx, existing); err != nil { + return fmt.Errorf("failed updating downstream write-back instance status: %w", err) + } + } + + return nil } -func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName string, instance *computev1alpha.Instance) (*metav1.Condition, error) { +// reconcileQuotaClaim attempts to create or observe a ResourceClaim for the +// given instance. It returns: +// - (nil, nil) — labels not yet propagated; caller sets PendingEvaluation +// - (condition, nil) — terminal condition (True/False/Unknown from claim or failure) +// - (condition, err) — condition to write + transient error to requeue with backoff +// +// The condition's Type field is always InstanceQuotaGranted when set by this function +// to distinguish it from ResourceClaim conditions returned directly. +func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (*metav1.Condition, error) { + if r.quotaClientManager == nil { + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, + Message: "Quota enforcement disabled: no credential configured", + }, nil + } + logger := log.FromContext(ctx) + projectID, err := r.resolveProjectID(ctx, clusterName, instance) + if err != nil { + // Transient: namespace API unreachable. Return structured condition + error. 
+ msg := fmt.Sprintf("Could not resolve project ID: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, fmt.Errorf("resolving project ID for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + if projectID == "" { + // Labels not yet propagated — bootstrap transient, not an error. + return nil, nil + } + + projectClient, err := r.quotaClientManager.ClientForProject(ctx, projectID, r.scheme) + if err != nil { + msg := fmt.Sprintf("Failed to build quota client for project %q: %v", projectID, err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting quota client for project %q: %w", projectID, err) + } + + claimNamespace, err := r.resolveProjectNamespace(ctx, clusterName, instance) + if err != nil { + msg := fmt.Sprintf("Could not resolve project namespace: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonProjectIDUnresolvable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonProjectIDUnresolvable, + Message: msg, + }, 
fmt.Errorf("resolving project namespace for instance %s/%s: %w", instance.Namespace, instance.Name, err) + } + if claimNamespace == "" { + return nil, nil + } + claimName := fmt.Sprintf("%s--%s", instance.Namespace, instance.Name) requests := []quotav1alpha1.ResourceRequest{ { - ResourceType: "compute.datumapis.com/instances", + ResourceType: quotaResourceTypeInstances, Amount: 1, }, } @@ -213,39 +655,99 @@ func (r *InstanceReconciler) reconcileQuotaClaim(ctx context.Context, clusterNam desired := "av1alpha1.ResourceClaim{ ObjectMeta: metav1.ObjectMeta{ Name: claimName, - Namespace: instance.Namespace, + Namespace: claimNamespace, + Labels: map[string]string{ + instanceQuotaClaimSourceLabel: r.edgeClusterName, + }, }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", - Name: clusterName, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instance.Name, - Namespace: instance.Namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, }, Requests: requests, }, } var existing quotav1alpha1.ResourceClaim - if err := r.managementCluster.GetClient().Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { - if !apierrors.IsNotFound(err) { - return nil, fmt.Errorf("failed getting resource claim: %w", err) - } - if err := r.managementCluster.GetClient().Create(ctx, desired); err != nil { - return nil, fmt.Errorf("failed creating resource claim: %w", err) + if err := projectClient.Get(ctx, client.ObjectKey{Namespace: desired.Namespace, Name: desired.Name}, &existing); err != nil { + if apierrors.IsNotFound(err) { + // Claim doesn't exist yet — attempt to create it. 
+ createErr := projectClient.Create(ctx, desired) + if createErr == nil { + return nil, nil + } + return r.classifyCreateError(instance, projectID, claimNamespace, createErr) } - return nil, nil + // GET itself failed — treat as backend unavailable. + msg := fmt.Sprintf("Quota backend unreachable getting ResourceClaim: %v", err) + r.recorder.Event(instance, corev1.EventTypeWarning, + computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(quotametrics.ReasonBackendUnavailable).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, + Message: msg, + }, fmt.Errorf("failed getting resource claim: %w", err) } grantedCondition := apimeta.FindStatusCondition(existing.Status.Conditions, quotav1alpha1.ResourceClaimGranted) return grantedCondition, nil } +// classifyCreateError maps a ResourceClaim creation error to a structured +// QuotaGranted condition with a specific reason, emits a Kubernetes event, and +// increments the appropriate metric counter. +func (r *InstanceReconciler) classifyCreateError( + instance *computev1alpha.Instance, + projectID, claimNamespace string, + err error, +) (*metav1.Condition, error) { + var reason, metricLabel, msg string + + switch { + case apierrors.IsNotFound(err): + // 404 on Create: either the project control plane path doesn't exist + // (project deleted) or the namespace doesn't exist yet. + if claimNamespace != "" { + // Namespace-level 404. 
+ reason = computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound + metricLabel = quotametrics.ReasonNamespaceNotFound + msg = fmt.Sprintf("Quota claim namespace %q not found on project %q control plane", claimNamespace, projectID) + } else { + reason = computev1alpha.InstanceQuotaGrantedReasonProjectNotFound + metricLabel = quotametrics.ReasonProjectNotFound + msg = fmt.Sprintf("Milo project %q not found", projectID) + } + case apierrors.IsForbidden(err) || apierrors.IsInvalid(err): + // 403/422: quota admission plugin rejected the claim. + reason = computev1alpha.InstanceQuotaGrantedReasonMisconfigured + metricLabel = quotametrics.ReasonMisconfigured + msg = fmt.Sprintf("Quota admission rejected ResourceClaim for project %q: %v", projectID, err) + default: + // Connectivity or server error — treat as backend unavailable. + reason = computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable + metricLabel = quotametrics.ReasonBackendUnavailable + msg = fmt.Sprintf("Quota backend unreachable creating ResourceClaim: %v", err) + } + + r.recorder.Event(instance, corev1.EventTypeWarning, reason, msg) + quotametrics.EvalFailuresTotal.WithLabelValues(metricLabel).Inc() + return &metav1.Condition{ + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: reason, + Message: msg, + }, fmt.Errorf("failed creating resource claim: %w", err) +} + func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores int64, memMiB int64, resolved bool) { rt := instance.Spec.Runtime if rt.Sandbox != nil { @@ -327,7 +829,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, ObservedGeneration: instance.Generation, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, } } else { readyCondition = readyCondition.DeepCopy() @@ -344,8 +846,9 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( return 
false, fmt.Errorf("failed checking for network creation failure: %w", err) } + readyCondition.Status = metav1.ConditionFalse if networkCreationFailure { - readyCondition.Reason = "NetworkFailedToCreate" + readyCondition.Reason = reasonNetworkFailedToCreate readyCondition.Message = networkCreationFailureMessage } else { readyCondition.Reason = computev1alpha.InstanceReadyReasonSchedulingGatesPresent @@ -360,12 +863,13 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue { logger.Info("instance is not programmed", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming if programmedCondition != nil && programmedCondition.Reason != pendingReason { readyCondition.Reason = programmedCondition.Reason } - readyCondition.Message = "Instance has not been programmed" + readyCondition.Message = msgNotProgrammed if programmedCondition != nil && programmedCondition.Status != metav1.ConditionUnknown { readyCondition.Message = programmedCondition.Message } @@ -379,6 +883,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( if runningCondition == nil || runningCondition.Status != metav1.ConditionTrue { logger.Info("instance is not running", "instance", instance.Name) + readyCondition.Status = metav1.ConditionFalse readyCondition.Reason = pendingReason if runningCondition != nil && runningCondition.Reason != pendingReason { readyCondition.Reason = runningCondition.Reason @@ -394,7 +899,7 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( readyCondition.Status = metav1.ConditionTrue readyCondition.Reason = computev1alpha.InstanceReadyReasonRunning - readyCondition.Message = "Instance is ready" + readyCondition.Message = msgInstanceReady return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } @@ -436,38 +941,118 @@ func (r 
*InstanceReconciler) checkForNetworkCreationFailure(ctx context.Context, return false, "", nil } +// resolveProjectID returns the Milo project ID to use for quota calls. +// When projectIDForInstance is set it delegates to that function; otherwise it +// falls back to string(clusterName), which is correct for Milo-mode deployments +// where the cluster name IS the project name. +// Returns ("", nil) to signal "no project, skip quota". Returns ("", err) for +// transient failures that should cause a reconcile requeue. +func (r *InstanceReconciler) resolveProjectID(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectIDForInstance != nil { + return r.projectIDForInstance(ctx, clusterName, instance) + } + return string(clusterName), nil +} + +// resolveProjectNamespace returns the namespace within the Milo project control +// plane where ResourceClaims for this instance should be created. +// When projectNamespaceForInstance is set it delegates to that function; +// otherwise it falls back to instance.Namespace, which is correct for +// Milo-mode deployments where the project-side namespace already matches the +// instance namespace. +// Returns ("", nil) to signal "no project, skip quota". Returns ("", err) for +// transient failures that should cause a reconcile requeue. +func (r *InstanceReconciler) resolveProjectNamespace(ctx context.Context, clusterName multicluster.ClusterName, instance *computev1alpha.Instance) (string, error) { + if r.projectNamespaceForInstance != nil { + return r.projectNamespaceForInstance(ctx, clusterName, instance) + } + return instance.Namespace, nil +} + +// resolveClusterNameForProject returns the multicluster ClusterName for the +// given project ID. 
When clusterNameForProject is set it delegates to that +// function; otherwise it falls back to multicluster.ClusterName(projectID), +// which is correct for Milo-mode deployments where the cluster name IS the +// project name. +func (r *InstanceReconciler) resolveClusterNameForProject(projectID string) multicluster.ClusterName { + if r.clusterNameForProject != nil { + return r.clusterNameForProject(projectID) + } + return multicluster.ClusterName(projectID) +} + // SetupWithManager sets up the controller with the Manager. -func (r *InstanceReconciler) SetupWithManager(mgr mcmanager.Manager, managementCluster cluster.Cluster) error { +// +// quotaRestConfig is the REST config used to reach Milo project control planes +// for ResourceClaim management. Pass nil to disable quota accounting. +// +// projectIDForInstance derives the Milo project ID for each reconcile request. +// In Milo mode pass nil (falls back to using ClusterName). In single-cell mode +// pass a function that returns instance.Namespace. +// +// clusterNameForProject maps a project ID back to the multicluster ClusterName. +// In Milo mode pass nil (falls back to ClusterName(projectID)). In single-cell +// mode pass a function that always returns "single". +func (r *InstanceReconciler) SetupWithManager( + mgr mcmanager.Manager, + quotaRestConfig *rest.Config, + projectIDForInstance InstanceProjectIDFunc, + projectNamespaceForInstance InstanceProjectNamespaceFunc, + edgeClusterName string, + clusterNameForProject func(projectID string) multicluster.ClusterName, +) error { r.mgr = mgr - r.managementCluster = managementCluster - - // Watch ResourceClaim objects on the management cluster directly, bypassing - // the multicluster clusterInjectingQueue which would overwrite ClusterName. - // Using ctrlsource.TypedKind lets the handler produce mcreconcile.Request - // values with the correct ClusterName taken from claim.Spec.ConsumerRef.Name. 
- claimSource := ctrlsource.TypedKind( - managementCluster.GetCache(), - &quotav1alpha1.ResourceClaim{}, - handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, claim *quotav1alpha1.ResourceClaim) []mcreconcile.Request { - if claim.Spec.ResourceRef.Kind != "Instance" || claim.Spec.ResourceRef.APIGroup != "compute.datumapis.com" { - return nil - } - return []mcreconcile.Request{ - { - Request: reconcile.Request{ - NamespacedName: types.NamespacedName{ - Name: claim.Spec.ResourceRef.Name, - Namespace: claim.Spec.ResourceRef.Namespace, - }, - }, - ClusterName: claim.Spec.ConsumerRef.Name, - }, - } - }), - ) + r.scheme = mgr.GetLocalManager().GetScheme() + //nolint:staticcheck // GetEventRecorder (new events API) has an incompatible Eventf + // signature (requires related object + action args) that would require migrating + // all emit sites. GetEventRecorderFor remains correct; migration is deferred. + r.recorder = mgr.GetLocalManager().GetEventRecorderFor("instance-controller") + r.edgeClusterName = edgeClusterName + r.projectIDForInstance = projectIDForInstance + r.projectNamespaceForInstance = projectNamespaceForInstance + r.clusterNameForProject = clusterNameForProject + if quotaRestConfig != nil { + if edgeClusterName == "" { + return fmt.Errorf("edgeClusterName must be set when quota enforcement is enabled; set discovery.clusterName in the server config") + } + r.quotaClientManager = quotametrics.New(quotaRestConfig) + } + + r.finalizers = finalizer.NewFinalizers() + if err := r.finalizers.Register(instanceControllerFinalizer, r); err != nil { + return fmt.Errorf("failed to register finalizer: %w", err) + } + + edgeClusterNameVal := r.edgeClusterName return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Instance{}, mcbuilder.WithEngageWithLocalCluster(false)). - WatchesRawSource(claimSource).
+ Watches( + &quotav1alpha1.ResourceClaim{}, + func(_ multicluster.ClusterName, _ cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc( + func(ctx context.Context, obj client.Object) []mcreconcile.Request { + claim := obj.(*quotav1alpha1.ResourceClaim) + if claim.Spec.ResourceRef.Name == "" { + return nil + } + return []mcreconcile.Request{ + { + Request: reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: claim.Spec.ResourceRef.Namespace, + Name: claim.Spec.ResourceRef.Name, + }, + }, + ClusterName: r.resolveClusterNameForProject(claim.Spec.ConsumerRef.Name), + }, + } + }, + ) + }, + mcbuilder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { + return obj.GetLabels()[instanceQuotaClaimSourceLabel] == edgeClusterNameVal + })), + ). Complete(r) } diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 1a15090b..31636c3f 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -3,7 +3,6 @@ package controller import ( "context" "fmt" - "net/http" "testing" "github.com/stretchr/testify/assert" @@ -12,50 +11,39 @@ import ( apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha
"go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/controller/instancecontrol" + "go.datum.net/compute/internal/quota" quotav1alpha1 "go.miloapis.com/milo/pkg/apis/quota/v1alpha1" ) -// fakeCluster implements cluster.Cluster for testing using a fake client. -type fakeCluster struct { - client client.Client - scheme *runtime.Scheme -} - -func (f *fakeCluster) GetHTTPClient() *http.Client { return nil } -func (f *fakeCluster) GetConfig() *rest.Config { return nil } -func (f *fakeCluster) GetCache() cache.Cache { return nil } -func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.scheme } -func (f *fakeCluster) GetClient() client.Client { return f.client } -func (f *fakeCluster) GetFieldIndexer() client.FieldIndexer { return nil } -func (f *fakeCluster) GetEventRecorderFor(string) record.EventRecorder { return nil } -func (f *fakeCluster) GetRESTMapper() apimeta.RESTMapper { return nil } -func (f *fakeCluster) GetAPIReader() client.Reader { return f.client } -func (f *fakeCluster) Start(context.Context) error { return nil } - -// fakeMCManager is a minimal multicluster manager that returns a single cluster. -type fakeMCManager struct { - clusters map[string]cluster.Cluster -} - -func (m *fakeMCManager) GetCluster(ctx context.Context, clusterName string) (cluster.Cluster, error) { - cl, ok := m.clusters[clusterName] - if !ok { - return nil, fmt.Errorf("cluster %q not found", clusterName) - } - return cl, nil -} +// Test constants for repeated string literals across controller package tests. 
+const ( + testInstanceName = "test-instance" + testReasonString = "TestReason" + testMessageString = "Test message" + testUIDString = "test-uid" + testInstanceType = "d1-standard-2" + testDefaultPlacement = "default" + testDefaultNamespace = "default" + testEdgeClusterName = "test-edge" + testComputeAPIVersion = "compute.datumapis.com/v1alpha" + testQuotaAPIGroup = "quota.miloapis.com" + testQuotaResource = "resourceclaims" + kindWorkloadDeploymentTest = "WorkloadDeployment" // mirrors kindWorkloadDeployment +) // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. func newTestScheme(t *testing.T) *runtime.Scheme { @@ -79,8 +67,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance without ready condition should create default", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, }, @@ -89,7 +77,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, ObservedGeneration: 1, }, }, @@ -97,8 +85,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates should set scheduling gates present", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -114,7 +102,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, 
ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -134,8 +122,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance with scheduling gates and network failure should set network failed", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Spec: computev1alpha.InstanceSpec{ @@ -153,7 +141,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "NetworkFailedToCreate", + Reason: reasonNetworkFailedToCreate, Message: "Network creation failed: timeout", ObservedGeneration: 1, }, @@ -162,8 +150,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance not programmed should set pending programming", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -171,8 +159,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -181,8 +169,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, @@ -190,8 +178,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance programmed but not running should wait for running", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: 
"default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -200,13 +188,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, }, }, }, @@ -215,8 +203,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { expectedCondition: &metav1.Condition{ Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, - Reason: "TestReason", - Message: "Test message", + Reason: testReasonString, + Message: testMessageString, ObservedGeneration: 1, }, }, @@ -224,8 +212,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "instance fully ready should set ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -234,13 +222,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Message: msgInstanceRunning, }, }, }, @@ -250,7 +238,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is 
ready", + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -258,8 +246,8 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { name: "no change when condition already matches", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -268,7 +256,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, LastTransitionTime: metav1.Now(), }, @@ -276,13 +264,13 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Message: msgInstanceRunning, }, }, }, @@ -292,7 +280,7 @@ func TestReconcileInstanceReadyCondition(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Message: msgInstanceReady, ObservedGeneration: 1, }, }, @@ -343,8 +331,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota denied blocks ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -360,14 +348,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: 
computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Message: msgInstanceRunning, LastTransitionTime: metav1.Now(), }, }, @@ -385,8 +373,8 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { name: "quota available does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -402,14 +390,14 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceProgrammed, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceProgrammedReasonProgrammed, - Message: "Instance has been programmed", + Message: msgInstanceProgrammed, LastTransitionTime: metav1.Now(), }, { Type: computev1alpha.InstanceRunning, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceRunningReasonRunning, - Message: "Instance is running", + Message: msgInstanceRunning, LastTransitionTime: metav1.Now(), }, }, @@ -420,15 +408,15 @@ func TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionTrue, Reason: computev1alpha.InstanceReadyReasonRunning, - Message: "Instance is ready", + Message: msgInstanceReady, }, }, { name: "quota pending unknown does not block ready condition", instance: &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-instance", - Namespace: "default", + Name: testInstanceName, + Namespace: testDefaultNamespace, Generation: 1, }, Status: computev1alpha.InstanceStatus{ @@ -448,7 +436,7 @@ func 
TestReconcileInstanceReadyConditionWithQuota(t *testing.T) { Type: computev1alpha.InstanceReady, Status: metav1.ConditionFalse, Reason: computev1alpha.InstanceProgrammedReasonPendingProgramming, - Message: "Instance has not been programmed", + Message: msgNotProgrammed, }, }, } @@ -501,25 +489,28 @@ func TestReconcileQuota(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: deploymentName, Namespace: namespace, - UID: "test-uid", + UID: testUIDString, }, } } // makeInstance creates a test Instance with an owner reference to the // deployment so that checkForNetworkCreationFailure can look it up. + // Both finalizers are pre-populated so that the finalizer framework does + // not need to add instanceControllerFinalizer on the first reconcile, + // which would cause an early return before quota logic runs. makeInstance := func(_ *runtime.Scheme, gates ...computev1alpha.SchedulingGate) *computev1alpha.Instance { return &computev1alpha.Instance{ ObjectMeta: metav1.ObjectMeta{ Name: instanceName, - Namespace: namespace, - Finalizers: []string{instanceQuotaFinalizer}, + Namespace: testDefaultNamespace, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, OwnerReferences: []metav1.OwnerReference{ { - APIVersion: "compute.datumapis.com/v1alpha", - Kind: "WorkloadDeployment", + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, Name: deploymentName, - UID: "test-uid", + UID: testUIDString, Controller: func() *bool { b := true; return &b }(), }, }, @@ -529,7 +520,7 @@ func TestReconcileQuota(t *testing.T) { SchedulingGates: gates, }, Runtime: computev1alpha.InstanceRuntimeSpec{ - Resources: computev1alpha.InstanceRuntimeResources{InstanceType: "d1-standard-2"}, + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, }, NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, }, @@ -544,18 +535,21 @@ func TestReconcileQuota(t *testing.T) { }, Spec: quotav1alpha1.ResourceClaimSpec{ ConsumerRef: 
quotav1alpha1.ConsumerRef{ - APIGroup: "resourcemanager.miloapis.com", - Kind: "Project", + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, Name: clusterName, }, + // ResourceRef points at the Project resource (cluster-scoped), not the + // Instance. The quota admission plugin validates against the + // ResourceRegistration's claimingResources, which only allows + // resourcemanager.miloapis.com/Project. ResourceRef: quotav1alpha1.UnversionedObjectReference{ - APIGroup: "compute.datumapis.com", - Kind: "Instance", - Name: instanceName, - Namespace: namespace, + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: clusterName, }, Requests: []quotav1alpha1.ResourceRequest{ - {ResourceType: "compute.datumapis.com/instances", Amount: 1}, + {ResourceType: quotaResourceTypeInstances, Amount: 1}, }, }, Status: quotav1alpha1.ResourceClaimStatus{ @@ -572,7 +566,7 @@ func TestReconcileQuota(t *testing.T) { } } - newReconciler := func(t *testing.T, projectObjs []client.Object, mgmtObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { + newReconciler := func(t *testing.T, projectObjs []client.Object, quotaObjs []client.Object) (*InstanceReconciler, client.Client, client.Client) { t.Helper() s := newTestScheme(t) @@ -582,26 +576,44 @@ func TestReconcileQuota(t *testing.T) { WithStatusSubresource(&computev1alpha.Instance{}). Build() - mgmtClient := fake.NewClientBuilder(). + quotaClient := fake.NewClientBuilder(). WithScheme(s). - WithObjects(mgmtObjs...). + WithObjects(quotaObjs...). WithStatusSubresource(&quotav1alpha1.ResourceClaim{}).
Build() mgr := &fakeMCManager{ clusters: map[string]cluster.Cluster{ - clusterName: &fakeCluster{client: projectClient, scheme: s}, + clusterName: newFakeCluster(projectClient), }, } + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + r := &InstanceReconciler{ - mgr: mgr, - managementCluster: &fakeCluster{client: mgmtClient, scheme: s}, + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + // Milo mode: project ID == ClusterName; claim namespace == instance.Namespace. + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + // nil → falls back to instance.Namespace, which is correct for Milo mode. + projectNamespaceForInstance: nil, } - return r, projectClient, mgmtClient + + // Initialize the finalizer registry so that r.finalizers.Finalize is not + // a nil-pointer dereference. SetupWithManager does this in production; in + // tests we replicate the same steps manually. + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + return r, projectClient, quotaClient } - t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True", func(t *testing.T) { + t.Run("quota granted flow: claim granted removes gate and sets QuotaGranted=True in single reconcile", func(t *testing.T) { s := newTestScheme(t) instance := makeInstance(s, computev1alpha.SchedulingGate{Name: instancecontrol.NetworkSchedulingGate.String()}, @@ -611,7 +623,10 @@ func TestReconcileQuota(t *testing.T) { r, projectClient, _ := newReconciler(t, []client.Object{instance, makeDeployment()}, []client.Object{claim}) - // First reconcile: sets QuotaGranted=True in status, returns early. + // Single reconcile: sets QuotaGranted=True in status AND removes the + // Quota scheduling gate in the same pass. 
The early-return-before-gate- + // removal bug required a second reconcile that never arrived because + // ResourceClaims are immutable and local Instances are not watched. _, err := r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -623,19 +638,13 @@ func TestReconcileQuota(t *testing.T) { assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) - // Second reconcile: status is already set, so removes the scheduling gate. - _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) - hasQuotaGate := false for _, g := range updated.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate must be removed in the same reconcile pass as the status update") }) t.Run("quota exceeded flow: conditions cascade to block Programmed/Running/Ready", func(t *testing.T) { @@ -709,7 +718,9 @@ func TestReconcileQuota(t *testing.T) { } require.NoError(t, mgmtClient.Status().Update(context.Background(), &existingClaim)) - // Second reconcile should see granted claim and update status. + // Second reconcile should see the granted claim, update status to + // QuotaGranted=True, AND remove the gate in the same pass (no third + // reconcile required). 
_, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) require.NoError(t, err) @@ -719,28 +730,41 @@ func TestReconcileQuota(t *testing.T) { require.NotNil(t, quotaCond) assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) - // Third reconcile removes the gate (status is already true, no more status write needed). - _, err = r.Reconcile(context.Background(), mcreconcile.Request{Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, ClusterName: clusterName}) - require.NoError(t, err) - - require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: instanceName}, &recovered)) hasQuotaGate := false for _, g := range recovered.Spec.Controller.SchedulingGates { if g.Name == instancecontrol.QuotaSchedulingGate.String() { hasQuotaGate = true } } - assert.False(t, hasQuotaGate, "QuotaSchedulingGate should have been removed after quota granted") + assert.False(t, hasQuotaGate, "QuotaSchedulingGate should be removed in the same reconcile pass that sets QuotaGranted=True") }) t.Run("deleted before grant: finalizer deletes claim and is removed", func(t *testing.T) { s := newTestScheme(t) now := metav1.Now() - instance := makeInstance(s, - computev1alpha.SchedulingGate{Name: instancecontrol.QuotaSchedulingGate.String()}, - ) - instance.DeletionTimestamp = &now + // Build the instance directly without instanceControllerFinalizer to + // represent the state after the Karmada finalizer has already been + // cleaned up; only the quota finalizer remains to be processed. 
+ instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + DeletionTimestamp: &now, + Finalizers: []string{instanceQuotaFinalizer}, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } claim := makeClaim(s, metav1.ConditionFalse, quotav1alpha1.ResourceClaimPendingReason) @@ -766,3 +790,783 @@ func TestReconcileQuota(t *testing.T) { } }) } + +// TestQuotaGateRemovedInSingleReconcile is a regression test for the bug where +// the Quota scheduling gate was never removed from an Instance after quota was +// granted. The root cause was an early return in the Reconcile function: when +// reconcileQuotaCondition set QuotaGranted=True (statusChanged=true), the code +// wrote the status update and returned before reaching removeQuotaSchedulingGate. +// Because ResourceClaims are immutable (no further transitions) and local +// Instances are not watched (WithEngageWithLocalCluster(false)), no requeue ever +// arrived — leaving the Quota gate stranded in spec.controller.schedulingGates +// and the projected Instance stuck "Pending (SchedulingGatesPresent)". +// +// The fix: on the success path (quotaErr==nil), fall through to +// removeQuotaSchedulingGate after persisting the status update, so gate removal +// happens in the same reconcile pass as the QuotaGranted=True status write. 
+func TestQuotaGateRemovedInSingleReconcile(t *testing.T) { + const ( + clusterName = "test-project" + namespace = "default" + instanceName = "my-instance" + deploymentName = "my-deployment" + ) + + claimName := namespace + "--" + instanceName + + tests := []struct { + name string + initialGates []computev1alpha.SchedulingGate + expectGateGone bool + }{ + { + name: "Quota gate only: removed in single reconcile when claim is granted", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "Quota gate plus Network gate: Quota removed, Network preserved", + initialGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.NetworkSchedulingGate.String()}, + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + expectGateGone: true, + }, + { + name: "No gates: no-op, reconcile completes cleanly", + initialGates: []computev1alpha.SchedulingGate{}, + expectGateGone: false, // no gate to begin with + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: namespace, + Generation: 1, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: tt.initialGates, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: 
deploymentName, Namespace: namespace, UID: testUIDString}, + } + + // ResourceClaim already in QuotaAvailable state — simulates the state + // that triggered the bug: claim already granted but gate still present. + claim := &quotav1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: clusterName, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota available", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource(&quotav1alpha1.ResourceClaim{}).
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + clusterName: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(clusterName, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Exactly one reconcile — must be sufficient to both set QuotaGranted=True + // and remove the Quota gate. No second reconcile should be required. + _, err := r.Reconcile(context.Background(), mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace, Name: instanceName}}, + ClusterName: clusterName, + }) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: namespace, Name: instanceName}, &updated)) + + // QuotaGranted condition must be set to True. + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be present") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Quota gate must be gone after the single reconcile. 
+ hasQuotaGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasQuotaGate = true + } + } + if tt.expectGateGone { + assert.False(t, hasQuotaGate, + "Quota gate must be removed in the same reconcile pass as the QuotaGranted=True status write; "+ + "a stranded gate leaves the projected Instance stuck Pending (SchedulingGatesPresent)") + } + + // Network gate (if present) must be preserved — only the Quota gate is + // cleared by InstanceReconciler; NetworkSchedulingGate is owned by + // WorkloadDeploymentReconciler. + for _, g := range updated.Spec.Controller.SchedulingGates { + assert.NotEqual(t, instancecontrol.QuotaSchedulingGate.String(), g.Name, + "Quota gate must not remain after granted claim") + } + }) + } +} + +// TestReconcileQuotaSingleMode verifies that in single-cell mode: +// - the project ID is decoded from the upstream-cluster-name label on the edge +// namespace (not taken from the always-"single" ClusterName) +// - the ResourceClaim is created in the in-project namespace (upstream-namespace +// label, e.g. "default"), not in the edge namespace (ns-abc123) +// - the ResourceRef points at resourcemanager.miloapis.com/Project, not Instance +func TestReconcileQuotaSingleMode(t *testing.T) { + const ( + instanceName = "my-instance" + edgeNS = "ns-abc123" // edge namespace (ns-{uid}) — does NOT exist in project CP + projectID = "datum-cloud" // decoded from "cluster-datum-cloud" + projectNS = "default" // upstream-namespace label value — where claims live + deploymentName = "my-deployment" + ) + + // Claim name uses the edge namespace prefix (stable identifier for the claim) + // but the claim object itself lives in projectNS. 
+ claimName := edgeNS + "--" + instanceName + + s := newTestScheme(t) + + instance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: edgeNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: deploymentName, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: deploymentName, Namespace: edgeNS, UID: "test-uid"}, + } + + // ResourceClaim lives in projectNS ("default"), not edgeNS ("ns-abc123"). + // ResourceRef points at the Project resource, matching the ResourceRegistration's + // claimingResources (resourcemanager.miloapis.com/Project only). 
+ claim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: projectNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: projectID, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "quota granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, deployment). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + // The quota client is keyed by projectID ("datum-cloud"), matching what + // projectIDForInstance returns after decoding "cluster-datum-cloud". + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(claim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). + Build() + + qm := quota.New(nil) + qm.StoreClient(projectID, quotaClient) + + const singleCluster = "single" + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + singleCluster: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: singleCluster, + // Single-cell mode: project ID decoded from upstream-cluster-name label. + // Simulates what cmd/main.go does for "cluster-datum-cloud" → "datum-cloud". + projectIDForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectID, nil + }, + // Single-cell mode: claim namespace comes from upstream-namespace label. 
+ // Simulates what cmd/main.go does by reading the edge namespace labels. + projectNamespaceForInstance: func(_ context.Context, _ multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return projectNS, nil + }, + // Single-cell mode: watch map func must always return "single". + clusterNameForProject: func(_ string) multicluster.ClusterName { + return singleCluster + }, + } + + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + req := mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: edgeNS, Name: instanceName}}, + ClusterName: singleCluster, + } + + _, err := r.Reconcile(context.Background(), req) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), types.NamespacedName{Namespace: edgeNS, Name: instanceName}, &updated)) + + quotaCond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, quotaCond, "QuotaGranted condition must be set") + assert.Equal(t, metav1.ConditionTrue, quotaCond.Status, "quota should be granted in single mode") + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, quotaCond.Reason) + + // Verify clusterNameForProject always returns "single" so the watch map func + // never enqueues an unknown cluster name. + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject(projectID)) + assert.Equal(t, multicluster.ClusterName(singleCluster), r.resolveClusterNameForProject("any-other-project")) + + // Verify resolveProjectNamespace returns the in-project namespace, not the edge namespace. 
+ resolvedNS, resolveErr := r.resolveProjectNamespace(context.Background(), singleCluster, instance) + require.NoError(t, resolveErr) + assert.Equal(t, projectNS, resolvedNS, "claim namespace must be the in-project namespace, not the edge namespace") +} + +// TestReconcileQuotaFailureModes verifies that infrastructure failures in the +// quota path set specific QuotaGranted=False conditions (fail-closed) rather +// than silently allowing workloads to schedule. +func TestReconcileQuotaFailureModes(t *testing.T) { + const ( + testProject = "test-project" + testNS = "default" + testInstance = "my-instance" + testDeployment = "my-deployment" + ) + + makeInstance := func() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstance, + Namespace: testNS, + Finalizers: []string{instanceQuotaFinalizer, instanceControllerFinalizer}, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: testComputeAPIVersion, + Kind: kindWorkloadDeploymentTest, + Name: testDeployment, + UID: testUIDString, + Controller: func() *bool { b := true; return &b }(), + }, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + }, + }, + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + NetworkInterfaces: []computev1alpha.InstanceNetworkInterface{}, + }, + } + } + + makeDeployment := func() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: testDeployment, Namespace: testNS, UID: testUIDString}, + } + } + + newReconcilerWithInterceptor := func( + t *testing.T, + funcs interceptor.Funcs, + fakeRecorder *record.FakeRecorder, + ) (*InstanceReconciler, client.Client) { + t.Helper() + s := newTestScheme(t) + + projectClient := 
fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithInterceptorFuncs(funcs). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + return r, projectClient + } + + reconcileReq := func() mcreconcile.Request { + return mcreconcile.Request{ + Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: testNS, Name: testInstance}}, + ClusterName: testProject, + } + } + + t.Run("FM-2: backend unreachable sets QuotaBackendUnavailable", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return fmt.Errorf("connection refused") + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + // Reconcile returns error for transient failures. 
+ require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable, cond.Reason) + + // Event should have been emitted. + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonBackendUnavailable) + default: + t.Error("expected a Warning event for backend unavailable, got none") + } + }) + + // FM-4/FM-5: 404 on Create maps to NamespaceNotFound when the claim namespace + // is known (the more common case for project-exists-but-namespace-absent), and + // to ProjectNotFound when the namespace itself is empty (project CP path missing). + t.Run("FM-5: 404 on Create with known namespace sets QuotaNamespaceNotFound", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + notFoundErr := apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return notFoundErr + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return notFoundErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, 
computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + // claimNamespace == testNS (non-empty) → NamespaceNotFound, not ProjectNotFound. + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound, cond.Reason, + "404 on Create with known namespace should map to NamespaceNotFound") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNamespaceNotFound) + default: + t.Error("expected a Warning event for namespace not found, got none") + } + }) + + t.Run("FM-6: 403 on Create sets QuotaMisconfigured", func(t *testing.T) { + fakeRecorder := record.NewFakeRecorder(10) + forbiddenErr := apierrors.NewForbidden( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim", + fmt.Errorf("ResourceRegistration not found")) + r, projectClient := newReconcilerWithInterceptor(t, interceptor.Funcs{ + Get: func(_ context.Context, _ client.WithWatch, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error { + return apierrors.NewNotFound( + schema.GroupResource{Group: testQuotaAPIGroup, Resource: testQuotaResource}, "claim") + }, + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return forbiddenErr + }, + }, fakeRecorder) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.Error(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonMisconfigured, cond.Reason, + "403 on Create should map to Misconfigured") + + select { + case event := <-fakeRecorder.Events: + 
assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonMisconfigured) + default: + t.Error("expected a Warning event for misconfigured quota, got none") + } + }) + + t.Run("FM-7: claim pending with no budget sets QuotaNoBudget", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + claimName := testNS + "--" + testInstance + pendingClaim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + ResourceRef: quotav1alpha1.UnversionedObjectReference{ + APIGroup: miloProjectAPIGroup, + Kind: miloProjectKind, + Name: testProject, + }, + Requests: []quotav1alpha1.ResourceRequest{ + {ResourceType: quotaResourceTypeInstances, Amount: 1}, + }, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionFalse, + Reason: quotav1alpha1.ResourceClaimPendingReason, + Message: "No AllowanceBucket configured", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(makeInstance(), makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(pendingClaim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err, "pending-no-budget is not a transient error — no requeue needed") + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionUnknown, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonNoBudget, cond.Reason, + "pending claim with no budget should use NoBudget reason, not PendingEvaluation") + + select { + case event := <-fakeRecorder.Events: + assert.Contains(t, event, computev1alpha.InstanceQuotaGrantedReasonNoBudget) + default: + t.Error("expected a Warning event for no budget, got none") + } + }) + + t.Run("quota disabled: quotaClientManager nil sets QuotaDisabled (not QuotaAvailable)", func(t *testing.T) { + s := newTestScheme(t) + instance := makeInstance() + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: nil, // explicitly disabled + edgeClusterName: testEdgeClusterName, + recorder: record.NewFakeRecorder(10), + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, computev1alpha.InstanceQuotaGrantedReasonQuotaDisabled, cond.Reason, + "intentionally disabled quota should use QuotaDisabled reason") + }) + + t.Run("observedGeneration guard: stale True condition does not remove gate for new generation", func(t *testing.T) { + s := newTestScheme(t) + fakeRecorder := record.NewFakeRecorder(10) + + // Instance at generation 2 with a stale QuotaGranted=True from generation 1. + instance := makeInstance() + instance.Generation = 2 + instance.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaAvailable, + Message: "quota granted (generation 1)", + ObservedGeneration: 1, // stale — does not match instance.Generation=2 + LastTransitionTime: metav1.Now(), + }, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(instance, makeDeployment()). 
+ WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + claimName := testNS + "--" + testInstance + grantedClaim := "av1alpha1.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: testNS}, + Spec: quotav1alpha1.ResourceClaimSpec{ + ConsumerRef: quotav1alpha1.ConsumerRef{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + ResourceRef: quotav1alpha1.UnversionedObjectReference{APIGroup: miloProjectAPIGroup, Kind: miloProjectKind, Name: testProject}, + Requests: []quotav1alpha1.ResourceRequest{{ResourceType: quotaResourceTypeInstances, Amount: 1}}, + }, + Status: quotav1alpha1.ResourceClaimStatus{ + Conditions: []metav1.Condition{ + { + Type: quotav1alpha1.ResourceClaimGranted, + Status: metav1.ConditionTrue, + Reason: quotav1alpha1.ResourceClaimGrantedReason, + Message: "granted", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + quotaClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(grantedClaim). + WithStatusSubresource("av1alpha1.ResourceClaim{}). + Build() + + mgr := &fakeMCManager{ + clusters: map[string]cluster.Cluster{ + testProject: newFakeCluster(projectClient), + }, + } + qm := quota.New(nil) + qm.StoreClient(testProject, quotaClient) + + r := &InstanceReconciler{ + mgr: mgr, + scheme: s, + quotaClientManager: qm, + edgeClusterName: testEdgeClusterName, + recorder: fakeRecorder, + projectIDForInstance: func(_ context.Context, cn multicluster.ClusterName, _ *computev1alpha.Instance) (string, error) { + return string(cn), nil + }, + } + r.finalizers = finalizer.NewFinalizers() + require.NoError(t, r.finalizers.Register(instanceControllerFinalizer, r)) + + // Single reconcile: reconcileQuotaCondition writes QuotaGranted=True with + // ObservedGeneration=2 into the in-memory instance, status is persisted, + // then removeQuotaSchedulingGate reads the in-memory condition (gen=2 == + // instance.Generation=2) and removes the gate — all in one pass. 
+ _, err := r.Reconcile(context.Background(), reconcileReq()) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Namespace: testNS, Name: testInstance}, &updated)) + + hasGate := false + for _, g := range updated.Spec.Controller.SchedulingGates { + if g.Name == instancecontrol.QuotaSchedulingGate.String() { + hasGate = true + } + } + assert.False(t, hasGate, "gate should be removed in the same reconcile that refreshes the condition to current generation") + + cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.InstanceQuotaGranted) + require.NotNil(t, cond) + assert.Equal(t, int64(2), cond.ObservedGeneration, "condition must reflect current generation") + }) +} diff --git a/internal/controller/instance_projector.go b/internal/controller/instance_projector.go new file mode 100644 index 00000000..fa0b69b6 --- /dev/null +++ b/internal/controller/instance_projector.go @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// InstanceProjector watches Instance objects written back to the upstream +// Karmada/management control plane by POP-cell InstanceReconcilers and creates +// read-only projections in the corresponding project namespace within each +// project cluster. 
+//
+// Namespace resolution: an upstream Instance lives in namespace
+// `ns-<uid>`. Reconcile does not scan for that UID; it reads the target
+// project namespace from the Instance's UpstreamOwnerNamespaceLabel.
+//
+// Ownership: each projected Instance is owned by the project WorkloadDeployment
+// so that it is garbage-collected via cascading deletion when the deployment is
+// removed from the project cluster.
+//
+// The controller is registered with a standard manager.Manager pointed at the
+// upstream Karmada control plane — NOT the multicluster-runtime manager — so
+// informer watches are scoped to the upstream control plane.
+type InstanceProjector struct {
+	// FederationClient reads Instance objects from the Karmada federation control
+	// plane (configured via --federation-kubeconfig). Must be set before
+	// SetupWithManager is called.
+	FederationClient client.Client
+
+	// MCManager provides access to project cluster clients via GetCluster.
+	MCManager mcmanager.Manager
+}
+
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=instances/status,verbs=get;update;patch
+
+func (r *InstanceProjector) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx).WithValues("instance", req.NamespacedName)
+
+	// 1. Fetch the Instance from the upstream Karmada control plane.
+	var downstreamInstance computev1alpha.Instance
+	if err := r.FederationClient.Get(ctx, req.NamespacedName, &downstreamInstance); err != nil {
+		if apierrors.IsNotFound(err) {
+			// Instance was deleted from the upstream control plane. Projections
+			// are owned by the project WorkloadDeployment, so cascading deletion
+			// handles cleanup.
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("failed getting upstream instance: %w", err)
+	}
+
+	// Only project Instances that carry the upstream tracking label; others were
+	// not written by our InstanceReconciler write-back logic.
+	encodedClusterName, ok := downstreamInstance.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]
+	if !ok {
+		logger.V(1).Info("skipping instance without upstream cluster label")
+		return ctrl.Result{}, nil
+	}
+
+	// 2. Resolve the project cluster name.
+	// The encoded form is "cluster-<name>" with "/" replaced by "_".
+	clusterName := strings.TrimPrefix(encodedClusterName, "cluster-")
+	clusterName = strings.ReplaceAll(clusterName, "_", "/")
+
+	// 3. Obtain the project cluster client.
+	projectCluster, err := r.MCManager.GetCluster(ctx, multicluster.ClusterName(clusterName))
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed getting project cluster %q: %w", clusterName, err)
+	}
+	projectClient := projectCluster.GetClient()
+
+	// 4. Resolve the target project namespace from the Instance label.
+	// The InstanceReconciler stamps UpstreamOwnerNamespaceLabel with the project
+	// namespace name (read from the upstream Karmada namespace label set by the federator),
+	// so we can resolve the target namespace directly without scanning.
+	targetNamespace := downstreamInstance.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]
+	if targetNamespace == "" {
+		logger.Info("Instance missing upstream-namespace label, requeueing",
+			"namespace", downstreamInstance.Namespace, "name", downstreamInstance.Name)
+		return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
+	}
+
+	// 5. Resolve the owning WorkloadDeployment by NAME in the project cluster.
+	//
+	// Core invariant: the ownerReference MUST be built from a project-cluster
+	// object obtained via projectClient.Get — never from any edge/Karmada
+	// identity. The WD name is stable across all planes (project cluster, Karmada,
+	// edge) and is the correct cross-plane identifier.
+	//
+	// Resolution order:
+	//   a) Read WorkloadDeploymentNameLabel from the downstream Instance (stamped by
+	//      the edge stateful control strategy).
+	//   b) If absent (Instances created before the label was introduced), fall back
+	//      to stripping the trailing "-<ordinal>" suffix from the Instance name.
+	wdName := downstreamInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel]
+	if wdName == "" {
+		wdName = wdNameFromInstanceName(downstreamInstance.Name)
+	}
+	if wdName == "" {
+		logger.Info("cannot resolve WorkloadDeployment name from Instance — skipping projection",
+			"instance", downstreamInstance.Name)
+		return ctrl.Result{}, nil
+	}
+
+	// Fetch the project-cluster WD directly by name. The returned object carries
+	// the project-cluster metadata.uid — the only UID that GC in the project
+	// cluster can act on.
+	var ownerWD computev1alpha.WorkloadDeployment
+	if err := projectClient.Get(ctx, client.ObjectKey{Namespace: targetNamespace, Name: wdName}, &ownerWD); err != nil {
+		if apierrors.IsNotFound(err) {
+			// Either a transient ordering race (Instance projected before
+			// WorkloadReconciler created the project WD) or the WD has been
+			// deleted. In both cases, do NOT create an ownerless projection.
+			// Requeue so the projection is created with a correct owner
+			// reference once the WD exists. The 5 s interval matches the
+			// existing upstream-namespace label requeue above.
+			logger.Info("project WorkloadDeployment not found — requeueing without creating projection",
+				"wdName", wdName, "namespace", targetNamespace)
+			return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("failed getting WorkloadDeployment %s/%s in project cluster %s: %w",
+			targetNamespace, wdName, clusterName, err)
+	}
+
+	// 6. Create or update the projection in the project namespace.
+	projection := &computev1alpha.Instance{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      downstreamInstance.Name,
+			Namespace: targetNamespace,
+		},
+	}
+
+	operationResult, err := controllerutil.CreateOrUpdate(ctx, projectClient, projection, func() error {
+		// Propagate upstream tracking labels so consumers can filter by origin.
+		if projection.Labels == nil {
+			projection.Labels = make(map[string]string)
+		}
+		for k, v := range downstreamInstance.Labels {
+			projection.Labels[k] = v
+		}
+
+		projection.Spec = downstreamInstance.Spec
+
+		// Attach an owner reference using the live project-cluster WD object.
+		// controllerutil.SetOwnerReference reads UID and GVK from ownerWD, which
+		// was fetched from projectClient — satisfying the core invariant.
+		return controllerutil.SetOwnerReference(&ownerWD, projection, projectCluster.GetScheme())
+	})
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed upserting Instance projection in %s/%s: %w", clusterName, targetNamespace, err)
+	}
+
+	logger.Info("reconciled Instance projection", "operation", operationResult, "namespace", targetNamespace, "cluster", clusterName)
+
+	// 7. Sync status — status is a separate subresource.
+	projection.Status = downstreamInstance.Status
+	if err := projectClient.Status().Update(ctx, projection); err != nil && !apierrors.IsNotFound(err) {
+		return ctrl.Result{}, fmt.Errorf("failed updating Instance projection status: %w", err)
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// wdNameFromInstanceName derives the WorkloadDeployment name from an Instance
+// name by stripping the trailing "-<ordinal>" suffix. Instance names follow the
+// convention "<wd-name>-<ordinal>" (e.g. "my-api-default-dfw-0"), which is
+// structurally enforced by the stateful control strategy. Returns empty string
+// if the name does not contain a numeric suffix (unrecognised format).
+//
+// This is used as a fallback when the WorkloadDeploymentNameLabel is absent on
+// Instances created before that label was introduced.
+func wdNameFromInstanceName(name string) string { + lastDash := strings.LastIndex(name, "-") + if lastDash <= 0 { + return "" + } + suffix := name[lastDash+1:] + for _, c := range suffix { + if c < '0' || c > '9' { + return "" + } + } + if len(suffix) == 0 { + return "" + } + return name[:lastDash] +} + +// SetupWithManager registers the InstanceProjector with upstreamMgr, a standard +// manager.Manager configured against the upstream Karmada/federation control plane +// REST config. FederationClient and MCManager must be set before calling this method. +func (r *InstanceProjector) SetupWithManager(upstreamMgr manager.Manager) error { + return ctrl.NewControllerManagedBy(upstreamMgr). + For(&computev1alpha.Instance{}). + Named("instance-projector"). + Complete(r) +} diff --git a/internal/controller/instance_projector_test.go b/internal/controller/instance_projector_test.go new file mode 100644 index 00000000..7dcc8168 --- /dev/null +++ b/internal/controller/instance_projector_test.go @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "maps" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Test constants ─────────────────────────────────────────────────────────── + +const ( + // projTestCluster is the project cluster name used in projector tests. + projTestCluster = "project-cluster" + + // projTestProjNS is the project namespace name. + projTestProjNS = "proj-namespace" + + // projTestProjNSUID is the project namespace UID embedded in the Karmada + // namespace name below. 
+ projTestProjNSUID = types.UID("deadbeef-1111-2222-3333-444455556666") + + // projTestKarmadaNS is the Karmada namespace derived from the UID above + // via the ns-<uid> convention. + projTestKarmadaNS = "ns-deadbeef-1111-2222-3333-444455556666" + + // projTestInstanceName is the name of the Karmada (and projected) Instance. + // Follows the "<wd-name>-<ordinal>" convention: "my-wd-0". + projTestInstanceName = "my-wd-0" + + // projTestWDUID is the UID of the owning WorkloadDeployment as it exists in + // the PROJECT cluster. This is the UID that owner references must use, since + // Kubernetes GC in the project cluster only knows this UID. + projTestWDUID = types.UID("project-wd-uid-9999-aaaa-bbbb-cccc") + + // projTestEdgeWDUID is the UID of the WorkloadDeployment as it exists on the + // EDGE/Karmada plane. Each plane mints its own UID, so this is intentionally + // distinct from projTestWDUID. The WorkloadDeploymentUIDLabel on downstream + // Instances carries this edge UID — NOT the project UID. + projTestEdgeWDUID = types.UID("edge-uid-0000-1111-2222-3333") + + // projTestWDName is the name of the owning WorkloadDeployment. The name is + // the same across all planes (project cluster, Karmada, edge) and is the + // correct cross-plane stable identifier. + projTestWDName = "my-wd" + + // projTestWorkloadUID is the UID of the owning Workload (carried via WorkloadUIDLabel). + projTestWorkloadUID = "wl-uid-1111-2222-3333-4444" + + // projTestInstanceIndex is the ordinal index of the instance (carried via InstanceIndexLabel). + projTestInstanceIndex = "0" +) + +// encodedCluster returns the value of the UpstreamOwnerClusterNameLabel for +// projTestCluster ("cluster-<name>"). +func encodedCluster() string { + return "cluster-" + projTestCluster +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +// projTestProjectNS builds the project cluster Namespace with the stable test UID. 
+func projTestProjectNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestProjNS, + UID: projTestProjNSUID, + }, + } +} + +// projTestWorkloadDeployment builds the project WorkloadDeployment that owns +// projected Instances. +func projTestWorkloadDeployment() *computev1alpha.WorkloadDeployment { + return &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestWDName, + Namespace: projTestProjNS, + UID: projTestWDUID, + }, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "LAX", + PlacementName: testDefaultPlacement, + WorkloadRef: computev1alpha.WorkloadReference{Name: "my-workload"}, + ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, + }, + } +} + +// projTestKarmadaInstance builds a Karmada Instance with the default labels +// needed for the InstanceProjector to act on it. Optional label overrides are +// applied last. +func projTestKarmadaInstance(labelOverrides map[string]string) *computev1alpha.Instance { + labels := map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + // WorkloadDeploymentUIDLabel carries the EDGE UID — intentionally distinct + // from projTestWDUID (the project-cluster WD UID). Owner references must + // never be built from this value. + computev1alpha.WorkloadDeploymentUIDLabel: string(projTestEdgeWDUID), + computev1alpha.WorkloadDeploymentNameLabel: projTestWDName, + computev1alpha.WorkloadUIDLabel: projTestWorkloadUID, + computev1alpha.InstanceIndexLabel: projTestInstanceIndex, + } + maps.Copy(labels, labelOverrides) + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + Labels: labels, + }, + Spec: computev1alpha.InstanceSpec{ + // Minimal valid spec — actual content is copied to the projection. 
+ }, + } +} + +// newTestProjector wires an InstanceProjector with the given downstream client and +// a project cluster that serves the supplied project client. +func newTestProjector(karmadaClient client.Client, projectClient client.Client) *InstanceProjector { + projectCluster := newFakeCluster(projectClient) + mgr := newFakeMCManager(projTestCluster, projectCluster) + return &InstanceProjector{ + FederationClient: karmadaClient, + MCManager: mgr, + } +} + +// projectorRequest builds a ctrl.Request for the test Instance in Karmada. +func projectorRequest() ctrl.Request { + return ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + }, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestInstanceProjector_Reconcile is the primary table-driven test. +func TestInstanceProjector_Reconcile(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + + // karmadaInstance is what exists in the Karmada API server. + // A nil value means the Instance does not exist (not-found path). + karmadaInstance *computev1alpha.Instance + + // projectObjs are pre-populated in the project cluster fake client. + projectObjs []client.Object + + // request overrides the default projectorRequest() when set. + request *ctrl.Request + + // wantProjection controls whether a projected Instance should appear. + wantProjection bool + + // wantOwnerRef controls whether the projected Instance should have an + // owner reference pointing to the project WorkloadDeployment. + wantOwnerRef bool + + // wantRequeue controls whether the reconcile result should request a requeue. + wantRequeue bool + + // wantErr controls whether the reconcile should return an error. 
+ wantErr bool + }{ + { + name: "happy path — instance projected with owner reference", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // Cross-plane UID regression test: the Karmada Instance carries the EDGE + // WD UID in WorkloadDeploymentUIDLabel (projTestEdgeWDUID), which is + // intentionally different from the project-cluster WD UID (projTestWDUID). + // The owner reference on the projection must use the project-cluster UID. + // This test fails if someone reintroduces UID-based matching against the + // edge/Karmada plane. + name: "WD name label present, edge UID differs from project UID — owner ref UID equals project WD UID", + karmadaInstance: projTestKarmadaInstance(nil), // carries projTestEdgeWDUID, not projTestWDUID + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), // UID is projTestWDUID + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // Fallback: when WorkloadDeploymentNameLabel is absent (Instances created + // before the label was introduced), the projector derives the WD name from + // the Instance name by stripping the trailing "-<ordinal>" suffix. + // Instance name "my-wd-0" → WD name "my-wd". + name: "WD name label absent, fallback name extraction from instance name — owner ref attached", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Blank the name label (override it with "") to exercise the fallback path. + computev1alpha.WorkloadDeploymentNameLabel: "", + }), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + { + // NotFound requeue: when the project WD does not yet exist (transient + // ordering race — Instance projected before WorkloadReconciler created + // the project WD), the projector must requeue and NOT create an ownerless + // projection. 
A projection must never be created without an owner reference. + name: "project WD not found — requeue, no ownerless projection created", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + // No WorkloadDeployment — simulates the transient ordering race. + }, + wantProjection: false, + wantRequeue: true, + }, + { + // Unresolvable WD name: both the label is absent and the Instance name has + // no numeric suffix to strip (unrecognised naming format). The projector + // should skip without error — no projection created, no requeue. + // The instance name "inst-no-ordinal" has no trailing numeric segment. + name: "WD name label absent and instance name yields no resolvable WD — skip, no projection", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: "inst-no-ordinal", + Namespace: projTestKarmadaNS, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: encodedCluster(), + downstreamclient.UpstreamOwnerNamespaceLabel: projTestProjNS, + // No WorkloadDeploymentNameLabel — no label, no numeric suffix. + computev1alpha.WorkloadUIDLabel: projTestWorkloadUID, + computev1alpha.InstanceIndexLabel: projTestInstanceIndex, + }, + }, + }, + request: &ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "inst-no-ordinal", + Namespace: projTestKarmadaNS, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantRequeue: false, + }, + { + name: "missing upstream-cluster-name label — skipped, no projection", + karmadaInstance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: projTestInstanceName, + Namespace: projTestKarmadaNS, + // Intentionally no UpstreamOwnerClusterNameLabel. 
+ Labels: map[string]string{ + "some-other-label": "value", + }, + }, + }, + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + name: "missing upstream-namespace label — requeue", + karmadaInstance: projTestKarmadaInstance(map[string]string{ + // Override: remove the upstream namespace label. + downstreamclient.UpstreamOwnerNamespaceLabel: "", + }), + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + wantRequeue: true, + }, + { + name: "karmada instance not found — no-op", + karmadaInstance: nil, // causes Get to return NotFound + projectObjs: []client.Object{projTestProjectNS()}, + wantProjection: false, + }, + { + // Verify that all linking labels (WorkloadUID, WorkloadDeploymentUID, + // WorkloadDeploymentNameLabel, InstanceIndex) survive from the Karmada + // write-back object through to the projection. + name: "all linking labels propagated from Karmada to projection", + karmadaInstance: projTestKarmadaInstance(nil), + projectObjs: []client.Object{ + projTestProjectNS(), + projTestWorkloadDeployment(), + }, + wantProjection: true, + wantOwnerRef: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // Build Karmada client. + var karmadaObjs []client.Object + if tt.karmadaInstance != nil { + karmadaObjs = append(karmadaObjs, tt.karmadaInstance) + } + karmadaClient := newKarmadaFakeClient(karmadaObjs...) + + // Build project client. + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(tt.projectObjs...). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + r := newTestProjector(karmadaClient, projectClient) + + req := projectorRequest() + if tt.request != nil { + req = *tt.request + } + result, err := r.Reconcile(context.Background(), req) + + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + + if tt.wantRequeue { + assert.NotZero(t, result.RequeueAfter, "expected RequeueAfter to be set") + } else { + assert.Equal(t, ctrl.Result{}, result) + } + + ctx := context.Background() + + // Check whether a projected Instance exists in the project namespace. + var projection computev1alpha.Instance + err = projectClient.Get(ctx, types.NamespacedName{ + Name: projTestInstanceName, + Namespace: projTestProjNS, + }, &projection) + + if !tt.wantProjection { + assert.True(t, isNotFound(err), + "expected no projection in project namespace, but found one (or unexpected error: %v)", err) + return + } + + require.NoError(t, err, "expected projection to exist in project namespace") + + // Labels should be copied from the Karmada instance. + if tt.karmadaInstance != nil { + for k, v := range tt.karmadaInstance.Labels { + assert.Equal(t, v, projection.Labels[k], + "projection label %q should match Karmada instance label", k) + } + } + + // Linking labels must survive from the Karmada instance to the projection + // so that the CLI can resolve Workload name, city, and instance ordinal. 
+ if tt.wantProjection && tt.karmadaInstance != nil { + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadUIDLabel], + projection.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + projection.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel], + projection.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be propagated to the projection") + assert.Equal(t, + tt.karmadaInstance.Labels[computev1alpha.InstanceIndexLabel], + projection.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated to the projection") + } + + // Owner reference check. + if tt.wantOwnerRef { + require.NotEmpty(t, projection.OwnerReferences, + "projected instance should have an owner reference to the WorkloadDeployment") + ownerRef := projection.OwnerReferences[0] + // Core invariant: owner ref UID must be the PROJECT-cluster WD UID. + assert.Equal(t, string(projTestWDUID), string(ownerRef.UID), + "owner reference UID must match the project-cluster WorkloadDeployment UID") + // Regression guard: the edge UID must NOT appear in the owner ref. + // If this assertion fails, someone reintroduced cross-plane UID matching. 
+ assert.NotEqual(t, string(projTestEdgeWDUID), string(ownerRef.UID), + "owner reference UID must NOT be the edge/Karmada WD UID") + assert.Equal(t, projTestWDName, ownerRef.Name, + "owner reference name should match the WorkloadDeployment name") + } else { + assert.Empty(t, projection.OwnerReferences, + "projected instance should have no owner reference when WD not found") + } + }) + } +} + +// TestInstanceProjector_SpecCopied verifies that the Instance spec is correctly +// propagated from the Karmada instance to the projection. +func TestInstanceProjector_SpecCopied(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + // Set a recognizable spec field we can assert against. + karmadaInst.Spec.Controller = &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{{Name: "test-gate"}}, + } + + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(projTestProjectNS(), projTestWorkloadDeployment()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + _, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) + + require.NotNil(t, projection.Spec.Controller) + require.Len(t, projection.Spec.Controller.SchedulingGates, 1) + assert.Equal(t, "test-gate", projection.Spec.Controller.SchedulingGates[0].Name) +} + +// TestInstanceProjector_NamespaceResolution verifies that the projector resolves +// the target project namespace directly from the UpstreamOwnerNamespaceLabel on +// the Karmada Instance, landing the projection in the correct namespace. 
+func TestInstanceProjector_NamespaceResolution(t *testing.T) { + t.Parallel() + + karmadaInst := projTestKarmadaInstance(nil) + projectClient := fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects( + projTestProjectNS(), + projTestWorkloadDeployment(), + ). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + karmadaClient := newKarmadaFakeClient(karmadaInst) + + r := newTestProjector(karmadaClient, projectClient) + result, err := r.Reconcile(context.Background(), projectorRequest()) + require.NoError(t, err) + assert.Equal(t, ctrl.Result{}, result) + + // Projection must land in the namespace named by the label. + var projection computev1alpha.Instance + require.NoError(t, projectClient.Get(context.Background(), + types.NamespacedName{Name: projTestInstanceName, Namespace: projTestProjNS}, + &projection)) +} + +// isNotFound reports whether err is a Kubernetes not-found error. A nil err +// (the Get succeeded, so the object exists) yields false. +// Used to distinguish "no projection created" from "projection exists" or an unrelated Get failure. +func isNotFound(err error) bool { + if err == nil { + return false // object exists — not the "not found" case + } + // err is non-nil here, so IgnoreNotFound returns nil only for a not-found error.
+ return client.IgnoreNotFound(err) == nil +} diff --git a/internal/controller/instance_writeback_test.go b/internal/controller/instance_writeback_test.go new file mode 100644 index 00000000..17c522f1 --- /dev/null +++ b/internal/controller/instance_writeback_test.go @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + "sync" + "testing" + + "github.com/go-logr/logr" + "github.com/go-logr/logr/funcr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// ─── Log capture helper ─────────────────────────────────────────────────────── + +// logEntry holds a single captured log line (message + formatted key-value pairs). +type logEntry struct { + msg string + kvs string // funcr renders key-value pairs as a single string +} + +// captureLogger returns a logr.Logger backed by an in-memory sink and a pointer +// to the slice of captured entries. Sink appends are mutex-guarded; read the slice only after logging has finished.
+func captureLogger() (logr.Logger, *[]logEntry) { + var mu sync.Mutex + var entries []logEntry + logger := funcr.New(func(prefix, args string) { + mu.Lock() + defer mu.Unlock() + entries = append(entries, logEntry{msg: prefix, kvs: args}) + }, funcr.Options{}) + return logger, &entries +} + +// ─── write-back test constants ──────────────────────────────────────────────── + +const ( + wbTestClusterName = "edge-cluster" + wbTestNamespace = "ns-proj-uid-1234" + wbTestInstanceName = "inst-0" + wbTestWorkloadUID = "wl-uid-aaaa-bbbb" + wbTestWDUID = "wd-uid-cccc-dddd" + wbTestInstanceIndex = "0" + wbTestUpstreamNS = "proj-namespace" + wbTestEncodedCluster = "cluster-" + wbTestClusterName + + // Four new self-describing labels. + wbTestWDName = "my-workload-deployment" + wbTestCityCode = "DFW" + wbTestWorkloadName = "my-workload" + wbTestPlacement = "us-central" +) + +// wbTestCellInstance builds a cell-side Instance with all seven owned labels +// pre-populated, as addInstanceControllerLabels would produce. 
+func wbTestCellInstance() *computev1alpha.Instance { + return &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + computev1alpha.WorkloadDeploymentNameLabel: wbTestWDName, + computev1alpha.CityCodeLabel: wbTestCityCode, + computev1alpha.WorkloadNameLabel: wbTestWorkloadName, + computev1alpha.PlacementNameLabel: wbTestPlacement, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceReady, + Status: metav1.ConditionTrue, + Reason: computev1alpha.InstanceReadyReasonRunning, + Message: "Instance is ready", + LastTransitionTime: metav1.Now(), + }, + }, + }, + } +} + +// wbTestDownstreamNS returns a Namespace object in the downstream (Karmada) +// control plane that carries the upstream routing labels, simulating the +// namespace stamped by NSO's MappedNamespaceResourceStrategy. +func wbTestDownstreamNS() *corev1.Namespace { + return &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + }, + }, + } +} + +// newWriteBackReconciler returns an InstanceReconciler whose FederationClient is set +// to federationClient; no other reconciler fields are populated.
+func newWriteBackReconciler(federationClient client.Client) *InstanceReconciler { + return &InstanceReconciler{ + FederationClient: federationClient, + } +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +// TestWriteBackToUpstream_CreatePath_AllLabels (Case A) verifies that the first +// write-back to an empty Karmada control plane creates an Instance with all five +// expected labels (two routing + three linking) and also writes the cell-side +// status via Status().Update. +func TestWriteBackToUpstream_CreatePath_AllLabels(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + // Verify the created Karmada Instance carries all five expected labels. 
+ var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestEncodedCluster, created.Labels[downstreamclient.UpstreamOwnerClusterNameLabel], + "UpstreamOwnerClusterNameLabel must be set") + assert.Equal(t, wbTestUpstreamNS, created.Labels[downstreamclient.UpstreamOwnerNamespaceLabel], + "UpstreamOwnerNamespaceLabel must be set") + assert.Equal(t, wbTestWorkloadUID, created.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestWDUID, created.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must be propagated from cell instance") + assert.Equal(t, wbTestInstanceIndex, created.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel must be propagated from cell instance") + + // Status must have been written via Status().Update after Create. + require.Len(t, created.Status.Conditions, 1, + "Status().Update must be called after Create; condition should be present") + assert.Equal(t, computev1alpha.InstanceReady, created.Status.Conditions[0].Type) + assert.Equal(t, metav1.ConditionTrue, created.Status.Conditions[0].Status) +} + +// TestWriteBackToUpstream_UpdatePath_LabelMerge (Case B) verifies that an +// existing Karmada Instance with a Karmada-managed label retains that label +// after the update path runs, while all five owned labels are written correctly. +func TestWriteBackToUpstream_UpdatePath_LabelMerge(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + // Pre-populate the Karmada control plane with an Instance that has the old + // two-label map plus a simulated Karmada-managed label. 
+ existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + // All five owned labels must be present with correct values. + assert.Equal(t, wbTestEncodedCluster, updated.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]) + assert.Equal(t, wbTestUpstreamNS, updated.Labels[downstreamclient.UpstreamOwnerNamespaceLabel]) + assert.Equal(t, wbTestWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel]) + assert.Equal(t, wbTestWDUID, updated.Labels[computev1alpha.WorkloadDeploymentUIDLabel]) + assert.Equal(t, wbTestInstanceIndex, updated.Labels[computev1alpha.InstanceIndexLabel]) + + // The Karmada-managed label must survive the merge (not be replaced/deleted). 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after merge; should not be overwritten") +} + +// TestWriteBackToUpstream_LabelChangeTriggerUpdate (Case C) verifies that +// a changed linking label on the cell instance causes the Karmada object to +// be updated with the new value. +func TestWriteBackToUpstream_LabelChangeTriggerUpdate(t *testing.T) { + t.Parallel() + + newWorkloadUID := "wl-uid-CHANGED" + + // Pre-populate with the five-label map from a previous write-back. + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + computev1alpha.WorkloadUIDLabel: wbTestWorkloadUID, + computev1alpha.WorkloadDeploymentUIDLabel: wbTestWDUID, + computev1alpha.InstanceIndexLabel: wbTestInstanceIndex, + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Modify the WorkloadUIDLabel on the cell instance. 
+ cellInstance := wbTestCellInstance() + cellInstance.Labels[computev1alpha.WorkloadUIDLabel] = newWorkloadUID + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, newWorkloadUID, updated.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel change on the cell instance must be reflected in the Karmada object") +} + +// TestWriteBackToUpstream_EmptyLinkingLabels_NonFatal (Case D) verifies that +// writeBackToUpstream completes without error when the cell-side Instance has +// no linking labels (e.g. during an early reconcile before +// addInstanceControllerLabels has run). The created Karmada object will carry +// empty string values for the three linking labels, and the RC-2 warning log +// must fire listing all three missing label keys. +func TestWriteBackToUpstream_EmptyLinkingLabels_NonFatal(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + + // Instance with nil Labels — simulates an early reconcile with no linking labels. + cellInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + // Inject a capturing logger so we can assert the RC-2 warning fires. 
+ capLogger, entries := captureLogger() + ctx := log.IntoContext(context.Background(), capLogger) + + // Must not return an error — empty labels are non-fatal. + err := r.writeBackToUpstream(ctx, multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + // The Karmada object should exist with empty string values for the linking labels. + var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, "", created.Labels[computev1alpha.WorkloadUIDLabel], + "WorkloadUIDLabel should be empty string when not set on cell instance") + assert.Equal(t, "", created.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel should be empty string when not set on cell instance") + assert.Equal(t, "", created.Labels[computev1alpha.InstanceIndexLabel], + "InstanceIndexLabel should be empty string when not set on cell instance") + + // Assert the RC-2 warning was emitted and named all three missing label keys. + // funcr encodes both the message and key-value pairs into the args string; + // we search across the full rendered output for each required substring. 
+ warnMsg := "instance is missing linking labels for write-back" + allRendered := func() string { + parts := make([]string, len(*entries)) + for i, e := range *entries { + parts[i] = fmt.Sprintf("%s %s", e.msg, e.kvs) + } + return strings.Join(parts, "\n") + }() + + assert.True(t, strings.Contains(allRendered, warnMsg), + "expected RC-2 warning %q to be logged; got:\n%s", warnMsg, allRendered) + for _, key := range []string{ + computev1alpha.WorkloadUIDLabel, + computev1alpha.WorkloadDeploymentUIDLabel, + computev1alpha.InstanceIndexLabel, + } { + assert.True(t, strings.Contains(allRendered, key), + "expected missing label key %q to appear in warning log; got:\n%s", key, allRendered) + } +} + +// TestWriteBackToUpstream_FourNewLabels_CreatePath verifies that all four new +// self-describing labels (WorkloadDeploymentName, CityCode, WorkloadName, +// PlacementName) are written to the Karmada object on the create path. +func TestWriteBackToUpstream_FourNewLabels_CreatePath(t *testing.T) { + t.Parallel() + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(wbTestDownstreamNS()). + WithStatusSubresource(&computev1alpha.Instance{}). 
+ Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var created computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &created)) + + assert.Equal(t, wbTestWDName, created.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestCityCode, created.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must propagate to Karmada object") + assert.Equal(t, wbTestWorkloadName, created.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must propagate to Karmada object") + assert.Equal(t, wbTestPlacement, created.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must propagate to Karmada object") +} + +// TestWriteBackToUpstream_FourNewLabels_UpdatePath verifies that all four new +// self-describing labels are written on the update path and existing Karmada- +// managed labels on the downstream object are preserved. +func TestWriteBackToUpstream_FourNewLabels_UpdatePath(t *testing.T) { + t.Parallel() + + karmadaManagedLabel := "karmada.io/managed" + + existingKarmadaInstance := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: wbTestInstanceName, + Namespace: wbTestNamespace, + Labels: map[string]string{ + downstreamclient.UpstreamOwnerClusterNameLabel: wbTestEncodedCluster, + downstreamclient.UpstreamOwnerNamespaceLabel: wbTestUpstreamNS, + karmadaManagedLabel: "true", + }, + }, + Spec: computev1alpha.InstanceSpec{ + Runtime: computev1alpha.InstanceRuntimeSpec{ + Resources: computev1alpha.InstanceRuntimeResources{InstanceType: testInstanceType}, + }, + }, + } + + s := newKarmadaScheme() + upstreamClient := fake.NewClientBuilder(). + WithScheme(s). 
+ WithObjects(wbTestDownstreamNS(), existingKarmadaInstance). + WithStatusSubresource(&computev1alpha.Instance{}). + Build() + + r := newWriteBackReconciler(upstreamClient) + cellInstance := wbTestCellInstance() + + err := r.writeBackToUpstream(context.Background(), multicluster.ClusterName(wbTestClusterName), cellInstance) + require.NoError(t, err) + + var updated computev1alpha.Instance + require.NoError(t, upstreamClient.Get(context.Background(), + types.NamespacedName{Namespace: wbTestNamespace, Name: wbTestInstanceName}, + &updated)) + + assert.Equal(t, wbTestWDName, updated.Labels[computev1alpha.WorkloadDeploymentNameLabel], + "WorkloadDeploymentNameLabel must be set on update path") + assert.Equal(t, wbTestCityCode, updated.Labels[computev1alpha.CityCodeLabel], + "CityCodeLabel must be set on update path") + assert.Equal(t, wbTestWorkloadName, updated.Labels[computev1alpha.WorkloadNameLabel], + "WorkloadNameLabel must be set on update path") + assert.Equal(t, wbTestPlacement, updated.Labels[computev1alpha.PlacementNameLabel], + "PlacementNameLabel must be set on update path") + + // Karmada-managed label must survive the merge. 
+ assert.Equal(t, "true", updated.Labels[karmadaManagedLabel], + "Karmada-managed label must be preserved after the update merge") +} diff --git a/internal/controller/instancecontrol/instancecontrol.go b/internal/controller/instancecontrol/instancecontrol.go index 6de9df99..d2c83692 100644 --- a/internal/controller/instancecontrol/instancecontrol.go +++ b/internal/controller/instancecontrol/instancecontrol.go @@ -26,10 +26,11 @@ type Strategy interface { type ActionType string const ( - ActionTypeCreate ActionType = "Create" - ActionTypeUpdate ActionType = "Update" - ActionTypeDelete ActionType = "Delete" - ActionTypeWait ActionType = "Wait" + ActionTypeCreate ActionType = "Create" + ActionTypeUpdate ActionType = "Update" + ActionTypeDelete ActionType = "Delete" + ActionTypeWait ActionType = "Wait" + ActionTypePatchLabels ActionType = "PatchLabels" ) type Action struct { @@ -104,3 +105,22 @@ func NewWaitAction(object client.Object) Action { fn: func(ctx context.Context, c client.Client) error { return nil }, } } + +// NewPatchLabelsAction returns an action that applies a metadata-only labels +// patch to the given object. It uses a MergeFrom patch so only the labels +// field is sent to the API server — the spec, template, and template-hash are +// never touched. This is intentionally separate from ActionTypeUpdate so that +// label backfill never participates in the ordered rolling-update flow. 
+func NewPatchLabelsAction(updated client.Object, base client.Object) Action {
+	// MergeFrom diffs updated against base, so the request carries only the
+	// fields that differ between the two objects. Callers are expected to pass
+	// two copies that differ in labels alone.
+	labelPatch := client.MergeFrom(base)
+
+	apply := func(ctx context.Context, c client.Client) error {
+		err := c.Patch(ctx, updated, labelPatch)
+		if err == nil {
+			return nil
+		}
+		return fmt.Errorf("failed to patch labels on %T %s: %w", updated, updated.GetName(), err)
+	}
+
+	return Action{
+		Object:     updated,
+		actionType: ActionTypePatchLabels,
+		fn:         apply,
+	}
+}
diff --git a/internal/controller/instancecontrol/stateful/stateful_control.go b/internal/controller/instancecontrol/stateful/stateful_control.go
index 566a652c..2d2e3073 100644
--- a/internal/controller/instancecontrol/stateful/stateful_control.go
+++ b/internal/controller/instancecontrol/stateful/stateful_control.go
@@ -15,13 +15,30 @@ import (
 	"go.datum.net/compute/internal/controller/instancecontrol"
 )
 
+// Options carries the optional switches of the stateful instance control
+// strategy.
+type Options struct {
+	// NetworkingEnabled decides whether newly created Instances receive the
+	// Network scheduling gate. Leave it false when the networking integration
+	// is disabled so Instances are not held waiting for a NetworkBinding.
+	// New() turns it on; NewWithOptions uses the value exactly as given, so a
+	// zero-value Options leaves it off.
+	NetworkingEnabled bool
+}
+
 // Behavior inspired by https://github.com/kubernetes/kubernetes/tree/master/pkg/controller/statefulset
 // Does not currently implement exact behavior.
 type statefulControl struct {
+	opts Options
 }
 
+// New builds the stateful strategy with the Network scheduling gate enabled.
 func New() instancecontrol.Strategy {
-	return &statefulControl{}
+	return NewWithOptions(Options{NetworkingEnabled: true})
+}
+
+// NewWithOptions returns a stateful instance control strategy with the given
+// options.
+func NewWithOptions(opts Options) instancecontrol.Strategy { + return &statefulControl{opts: opts} } func (c *statefulControl) GetActions( @@ -68,15 +85,25 @@ func (c *statefulControl) GetActions( }, Spec: deployment.Spec.Template.Spec, } + // Set Location best-effort: when Status.Location is nil (no matching + // Location object for the city code) Instance.Spec.Location stays nil and + // instance creation proceeds normally — this must not block scheduling. desiredInstances[i].Spec.Location = deployment.Status.Location // TODO(jreese) consider adding scheduling gates via mutating webhooks - desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ - TemplateHash: instanceTemplateHash, - SchedulingGates: []v1alpha.SchedulingGate{ + gates := []v1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + } + if c.opts.NetworkingEnabled { + // Prepend the Network gate so it is cleared first; quota is + // independent and evaluated in parallel by InstanceReconciler. + gates = append([]v1alpha.SchedulingGate{ {Name: instancecontrol.NetworkSchedulingGate.String()}, - {Name: instancecontrol.QuotaSchedulingGate.String()}, - }, + }, gates...) + } + desiredInstances[i].Spec.Controller = &v1alpha.InstanceController{ + TemplateHash: instanceTemplateHash, + SchedulingGates: gates, } addInstanceControllerLabels(desiredInstances[i], getInstanceOrdinal(desiredInstances[i].Name), deployment) @@ -114,10 +141,37 @@ func (c *statefulControl) GetActions( } } + // Backfill controller-managed labels on every existing instance, regardless + // of Ready state or template hash. This ensures newly-introduced labels + // (e.g. city-code, workload-name) are applied to pre-existing instances that + // were never touched by a rolling update. The patch is metadata-only and is + // emitted outside the ordered rollout decision so it never gates or reorders + // instance creation/updates. 
+ var patchLabelActions []instancecontrol.Action + for _, instance := range desiredInstances { + if instance.CreationTimestamp.IsZero() || !instance.DeletionTimestamp.IsZero() { + // Skip instances that don't exist yet or are being deleted. + continue + } + + desiredLabels := desiredControllerLabels(getInstanceOrdinal(instance.Name), deployment) + if labelsNeedBackfill(instance.Labels, desiredLabels) { + base := instance.DeepCopy() + patched := instance.DeepCopy() + for k, v := range desiredLabels { + if patched.Labels == nil { + patched.Labels = make(map[string]string) + } + patched.Labels[k] = v + } + patchLabelActions = append(patchLabelActions, instancecontrol.NewPatchLabelsAction(patched, base)) + } + } + slices.SortFunc(updateActions, descendingOrdinal) slices.SortFunc(deleteActions, descendingOrdinal) - actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)) + actions := make([]instancecontrol.Action, 0, len(createActions)+len(waitActions)+len(updateActions)+len(deleteActions)+len(patchLabelActions)) switch deployment.Spec.ScaleSettings.InstanceManagementPolicy { case v1alpha.OrderedReadyInstanceManagementPolicyType: @@ -144,6 +198,11 @@ func (c *statefulControl) GetActions( } + // Label-backfill actions are appended after the rollout ordering/skip logic + // so they are never affected by the "skip all but first" rule and never + // participate in rollout sequencing. + actions = append(actions, patchLabelActions...) 
+ return actions, nil } @@ -152,7 +211,37 @@ func addInstanceControllerLabels(instance *v1alpha.Instance, index int, deployme instance.Labels = map[string]string{} } - instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(index) - instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) - instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + for k, v := range desiredControllerLabels(index, deployment) { + instance.Labels[k] = v + } +} + +// desiredControllerLabels returns the full set of controller-managed labels +// that every instance should carry. Used both when stamping a new/updated +// instance and when checking whether an existing instance needs a backfill +// patch. +func desiredControllerLabels(index int, deployment *v1alpha.WorkloadDeployment) map[string]string { + return map[string]string{ + v1alpha.InstanceIndexLabel: strconv.Itoa(index), + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + // Self-describing labels for routing, filtering, and observability. + // Backfilled on every reconcile so they stay accurate even for instances + // that pre-date the labels or that were not reached by a rolling update. + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } +} + +// labelsNeedBackfill reports whether any of the desired controller-managed +// label key/value pairs are absent or incorrect on the current instance labels. 
+func labelsNeedBackfill(current map[string]string, desired map[string]string) bool {
+	for key, want := range desired {
+		// Use the comma-ok form: a plain current[key] lookup yields "" for a
+		// missing key, which would hide an absent label whose desired value is
+		// the empty string. Keys present only in current are ignored.
+		if got, present := current[key]; !present || got != want {
+			return true
+		}
+	}
+	return false
 }
diff --git a/internal/controller/instancecontrol/stateful/stateful_control_test.go b/internal/controller/instancecontrol/stateful/stateful_control_test.go
index d45b24b3..ffc04272 100644
--- a/internal/controller/instancecontrol/stateful/stateful_control_test.go
+++ b/internal/controller/instancecontrol/stateful/stateful_control_test.go
@@ -13,6 +13,8 @@ import (
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/utils/ptr"
 
+	networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha"
+
 	"go.datum.net/compute/api/v1alpha"
 	"go.datum.net/compute/internal/controller/instancecontrol"
 )
@@ -150,16 +152,407 @@ func TestScaleDownWithAllReadyInstances(t *testing.T) {
 	assert.False(t, actions[0].IsSkipped())
 }
 
+// TestNetworkingEnabledAddsNetworkGate verifies that when networking is enabled
+// (the default), newly created Instances receive both the Network and Quota
+// scheduling gates so that they are held until the network is provisioned.
+func TestNetworkingEnabledAddsNetworkGate(t *testing.T) {
+	ctx := context.Background()
+	// Networking switched on explicitly; this mirrors what New() configures.
+	control := NewWithOptions(Options{NetworkingEnabled: true})
+	deployment := getWorkloadDeployment("test-deploy-net-on", 1)
+
+	// No instances exist yet, so a single Create action is expected.
+	var currentInstances []v1alpha.Instance
+	actions, err := control.GetActions(ctx, scheme, deployment, currentInstances)
+
+	assert.NoError(t, err)
+	// testify's assert helpers are non-fatal and report failure through their
+	// bool result; stop here instead of indexing an empty slice below.
+	if !assert.Len(t, actions, 1) {
+		return
+	}
+	assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType())
+
+	instance, ok := actions[0].Object.(*v1alpha.Instance)
+	if !assert.True(t, ok) {
+		return // instance is nil when the type assertion fails
+	}
+	if !assert.NotNil(t, instance.Spec.Controller) {
+		return
+	}
+
+	// Flatten the gates to their names so membership can be asserted directly.
+	gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates))
+	for _, gate := range instance.Spec.Controller.SchedulingGates {
+		gateNames = append(gateNames, gate.Name)
+	}
+	assert.Contains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(),
+		"Network gate must be present when networking is enabled")
+	assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(),
+		"Quota gate must be present")
+}
+
+// TestNetworkingDisabledOmitsNetworkGate verifies that when networking is
+// disabled, newly created Instances do NOT receive the Network scheduling gate,
+// so they are not blocked on network provisioning. The Quota gate is still
+// added so quota enforcement remains active.
+func TestNetworkingDisabledOmitsNetworkGate(t *testing.T) {
+	ctx := context.Background()
+	// Networking switched off: only the Quota gate is expected on new Instances.
+	control := NewWithOptions(Options{NetworkingEnabled: false})
+	deployment := getWorkloadDeployment("test-deploy-net-off", 1)
+
+	// No instances exist yet, so a single Create action is expected.
+	var currentInstances []v1alpha.Instance
+	actions, err := control.GetActions(ctx, scheme, deployment, currentInstances)
+
+	assert.NoError(t, err)
+	// testify's assert helpers are non-fatal and report failure through their
+	// bool result; stop here instead of indexing an empty slice below.
+	if !assert.Len(t, actions, 1) {
+		return
+	}
+	assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType())
+
+	instance, ok := actions[0].Object.(*v1alpha.Instance)
+	if !assert.True(t, ok) {
+		return // instance is nil when the type assertion fails
+	}
+	if !assert.NotNil(t, instance.Spec.Controller) {
+		return
+	}
+
+	// Flatten the gates to their names so membership can be asserted directly.
+	gateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates))
+	for _, gate := range instance.Spec.Controller.SchedulingGates {
+		gateNames = append(gateNames, gate.Name)
+	}
+	assert.NotContains(t, gateNames, instancecontrol.NetworkSchedulingGate.String(),
+		"Network gate must NOT be present when networking is disabled")
+	assert.Contains(t, gateNames, instancecontrol.QuotaSchedulingGate.String(),
+		"Quota gate must still be present when networking is disabled")
+}
+
 // Add more test functions below for different scenarios.
 
+// TestInstanceLabels_FourNewLabelsStamped verifies that all four new
+// self-describing labels are stamped on newly created Instances, with values
+// sourced from the WorkloadDeployment spec.
+func TestInstanceLabels_FourNewLabelsStamped(t *testing.T) {
+	ctx := context.Background()
+	control := New()
+
+	deployment := getWorkloadDeployment("test-labels-deploy", 1)
+
+	// No instances exist yet, so a single Create action is expected.
+	var currentInstances []v1alpha.Instance
+	actions, err := control.GetActions(ctx, scheme, deployment, currentInstances)
+
+	assert.NoError(t, err)
+	// assert helpers are non-fatal; bail out on failure so the index and
+	// pointer accesses below cannot panic.
+	if !assert.Len(t, actions, 1) {
+		return
+	}
+	assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType())
+
+	instance, ok := actions[0].Object.(*v1alpha.Instance)
+	if !assert.True(t, ok) {
+		return // instance is nil when the type assertion fails
+	}
+
+	// Each label must mirror the deployment field it is derived from.
+	for _, tc := range []struct {
+		label string
+		want  string
+		msg   string
+	}{
+		{v1alpha.WorkloadDeploymentNameLabel, deployment.GetName(),
+			"WorkloadDeploymentNameLabel must equal deployment name"},
+		{v1alpha.CityCodeLabel, deployment.Spec.CityCode,
+			"CityCodeLabel must equal deployment.Spec.CityCode"},
+		{v1alpha.WorkloadNameLabel, deployment.Spec.WorkloadRef.Name,
+			"WorkloadNameLabel must equal deployment.Spec.WorkloadRef.Name"},
+		{v1alpha.PlacementNameLabel, deployment.Spec.PlacementName,
+			"PlacementNameLabel must equal deployment.Spec.PlacementName"},
+	} {
+		assert.Equal(t, tc.want, instance.Labels[tc.label], tc.msg)
+	}
+}
+
+// TestInstanceLabels_PropagatedOnUpdate verifies that an Update action produced
+// by a template change (rolling update path) carries the four self-describing
+// labels with values taken from the deployment.
+func TestInstanceLabels_PropagatedOnUpdate(t *testing.T) {
+	ctx := context.Background()
+	control := New()
+
+	deployment := getWorkloadDeployment("test-labels-update", 1)
+
+	// Build a ready existing instance.
+	currentInstances := []v1alpha.Instance{*getInstanceForDeployment(deployment, 0)}
+
+	// Trigger a rolling update by changing the image.
+	deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "updated-image"
+
+	actions, err := control.GetActions(ctx, scheme, deployment, currentInstances)
+
+	assert.NoError(t, err)
+	// assert helpers are non-fatal; bail out on failure so the index and
+	// pointer accesses below cannot panic.
+	if !assert.Len(t, actions, 1) {
+		return
+	}
+	assert.Equal(t, instancecontrol.ActionTypeUpdate, actions[0].ActionType())
+
+	instance, ok := actions[0].Object.(*v1alpha.Instance)
+	if !assert.True(t, ok) {
+		return // instance is nil when the type assertion fails
+	}
+
+	// Each label must mirror the deployment field it is derived from.
+	for _, tc := range []struct {
+		label string
+		want  string
+		msg   string
+	}{
+		{v1alpha.WorkloadDeploymentNameLabel, deployment.GetName(),
+			"WorkloadDeploymentNameLabel must be refreshed on update"},
+		{v1alpha.CityCodeLabel, deployment.Spec.CityCode,
+			"CityCodeLabel must be refreshed on update"},
+		{v1alpha.WorkloadNameLabel, deployment.Spec.WorkloadRef.Name,
+			"WorkloadNameLabel must be refreshed on update"},
+		{v1alpha.PlacementNameLabel, deployment.Spec.PlacementName,
+			"PlacementNameLabel must be refreshed on update"},
+	} {
+		assert.Equal(t, tc.want, instance.Labels[tc.label], tc.msg)
+	}
+}
+
+// TestInstanceLocation_SetWhenDeploymentStatusLocationPresent verifies that when
+// deployment.Status.Location is set, the new Instance receives it as Spec.Location.
+func TestInstanceLocation_SetWhenDeploymentStatusLocationPresent(t *testing.T) {
+	ctx := context.Background()
+	control := New()
+
+	// Give the deployment a resolved Location for GetActions to copy onto the
+	// Instance spec.
+	deployment := getWorkloadDeployment("test-location-set", 1)
+	deployment.Status.Location = &networkingv1alpha.LocationReference{
+		Name:      "loc-dfw-1",
+		Namespace: "networking-system",
+	}
+
+	var currentInstances []v1alpha.Instance
+	actions, err := control.GetActions(ctx, scheme, deployment, currentInstances)
+
+	assert.NoError(t, err)
+	// assert helpers are non-fatal; bail out on failure so the index and
+	// pointer accesses below cannot panic.
+	if !assert.Len(t, actions, 1) {
+		return
+	}
+
+	instance, ok := actions[0].Object.(*v1alpha.Instance)
+	if !assert.True(t, ok) {
+		return // instance is nil when the type assertion fails
+	}
+	if !assert.NotNil(t, instance.Spec.Location,
+		"Spec.Location must be set when deployment.Status.Location is non-nil") {
+		return
+	}
+	assert.Equal(t, "loc-dfw-1", instance.Spec.Location.Name)
+	assert.Equal(t, "networking-system", instance.Spec.Location.Namespace)
+}
+
+// TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent verifies that when
+// deployment.Status.Location is nil (no Location object matches the city code),
+// instance creation still succeeds and Spec.Location remains nil — no regression
+// on the "create instances regardless of Location" contract.
+func TestInstanceLocation_NilWhenDeploymentStatusLocationAbsent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-location-nil", 1) + // deployment.Status.Location is intentionally not set (nil) + + var currentInstances []v1alpha.Instance + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err, "instance creation must succeed even when Status.Location is nil") + assert.Len(t, actions, 1, "exactly one create action must be produced") + + instance, ok := actions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Nil(t, instance.Spec.Location, + "Spec.Location must remain nil when deployment.Status.Location is not set") + assert.Equal(t, instancecontrol.ActionTypeCreate, actions[0].ActionType(), + "action must be a Create, proving instance creation is not gated on Location") +} + +// TestLabelBackfill_NotReadyMatchingHash verifies that a not-Ready instance +// with an unchanged template hash receives a PatchLabels action when it is +// missing controller-managed labels. The action must not be a rollout Update, +// must not alter spec/template, and must not block subsequent instances. +func TestLabelBackfill_NotReadyMatchingHash(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-notready", 2) + + // Instance 0: not-Ready, correct template hash, but missing city-code/workload-name labels. + instance0 := getInstanceForDeployment(deployment, 0) + apimeta.SetStatusCondition(&instance0.Status.Conditions, metav1.Condition{ + Type: v1alpha.InstanceReady, + Status: metav1.ConditionFalse, + Reason: "NotReady", + Message: "Instance is not ready", + LastTransitionTime: metav1.Now(), + }) + // Simulate pre-existing instance that only has the index label (missing the newer labels). 
+ instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + } + + // Instance 1: needs to be created (nil in desiredInstances), so we only provide instance0. + currentInstances := []v1alpha.Instance{*instance0} + + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + // Collect actions by type. + var waitActions, createActions, updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeWait: + waitActions = append(waitActions, a) + case instancecontrol.ActionTypeCreate: + createActions = append(createActions, a) + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // The not-Ready instance must still produce a Wait (rollout is gated). + assert.Len(t, waitActions, 1, "not-Ready instance must still produce a Wait action") + assert.Equal(t, "test-backfill-notready-0", waitActions[0].Object.GetName()) + + // The missing instance-1 create is skipped (ordered policy, Wait is first). + assert.Len(t, createActions, 1, "instance-1 create action must be present") + assert.True(t, createActions[0].IsSkipped(), "create for instance-1 must be skipped while instance-0 is waiting") + + // No template Update actions must be produced. + assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash instance") + + // A PatchLabels action must be produced for instance-0. + assert.Len(t, patchActions, 1, "exactly one PatchLabels action for the label-drifted instance") + assert.Equal(t, "test-backfill-notready-0", patchActions[0].Object.GetName()) + assert.False(t, patchActions[0].IsSkipped(), "PatchLabels must not be skipped by the rollout skip-loop") + + // The patched object must carry all desired labels. 
+ patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.GetName(), patched.Labels[v1alpha.WorkloadDeploymentNameLabel]) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel]) + assert.Equal(t, deployment.Spec.WorkloadRef.Name, patched.Labels[v1alpha.WorkloadNameLabel]) + assert.Equal(t, deployment.Spec.PlacementName, patched.Labels[v1alpha.PlacementNameLabel]) + + // The patched object's spec and template-hash must be unchanged. + assert.Equal(t, instancecontrol.ComputeHash(deployment.Spec.Template), patched.Spec.Controller.TemplateHash, + "template hash must be unchanged by the label backfill") + assert.Equal(t, deployment.Spec.Template.Spec.Runtime, patched.Spec.Runtime, + "spec must be unchanged by the label backfill") +} + +// TestLabelBackfill_Idempotent verifies that an instance already carrying all +// correct controller-managed labels produces no PatchLabels action. +func TestLabelBackfill_Idempotent(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-idempotent", 1) + + // Instance already has all controller-managed labels set correctly. 
+ instance := getInstanceForDeployment(deployment, 0) + instance.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + for _, a := range actions { + assert.NotEqual(t, instancecontrol.ActionTypePatchLabels, a.ActionType(), + "no PatchLabels action must be produced when all labels are already correct") + } +} + +// TestLabelBackfill_ReadyInstanceCorrected verifies that a Ready instance with +// correct template hash but drifted labels receives a PatchLabels action +// without triggering a template rollout Update. +func TestLabelBackfill_ReadyInstanceCorrected(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-ready", 1) + + // Ready instance with matching hash but missing city-code label. + instance := getInstanceForDeployment(deployment, 0) + // Remove the city-code label to simulate drift. + delete(instance.Labels, v1alpha.CityCodeLabel) + + currentInstances := []v1alpha.Instance{*instance} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // No template Update must be produced — template hash matches. 
+ assert.Empty(t, updateActions, "no template Update must be produced for a matching-hash ready instance") + + // A PatchLabels action must be produced. + assert.Len(t, patchActions, 1, "PatchLabels action must be produced for the label-drifted ready instance") + patched, ok := patchActions[0].Object.(*v1alpha.Instance) + assert.True(t, ok) + assert.Equal(t, deployment.Spec.CityCode, patched.Labels[v1alpha.CityCodeLabel], + "city-code label must be corrected by the backfill") +} + +// TestLabelBackfill_DoesNotAffectRollingUpdate verifies that a genuine template +// change on a Ready instance still produces a normal ordered Update action and +// that the PatchLabels path does not interfere with or duplicate it. +func TestLabelBackfill_DoesNotAffectRollingUpdate(t *testing.T) { + ctx := context.Background() + control := New() + + deployment := getWorkloadDeployment("test-backfill-rolling", 2) + + // Two ready instances with all correct labels and matching current hash. + instance0 := getInstanceForDeployment(deployment, 0) + instance0.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "0", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + instance1 := getInstanceForDeployment(deployment, 1) + instance1.Labels = map[string]string{ + v1alpha.InstanceIndexLabel: "1", + v1alpha.WorkloadUIDLabel: string(deployment.Spec.WorkloadRef.UID), + v1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), + v1alpha.WorkloadDeploymentNameLabel: deployment.GetName(), + v1alpha.CityCodeLabel: deployment.Spec.CityCode, + v1alpha.WorkloadNameLabel: deployment.Spec.WorkloadRef.Name, + v1alpha.PlacementNameLabel: deployment.Spec.PlacementName, + } + + 
// Trigger a template change. + deployment.Spec.Template.Spec.Runtime.Sandbox.Containers[0].Image = "rolling-update-image" + + currentInstances := []v1alpha.Instance{*instance0, *instance1} + actions, err := control.GetActions(ctx, scheme, deployment, currentInstances) + + assert.NoError(t, err) + + var updateActions, patchActions []instancecontrol.Action + for _, a := range actions { + switch a.ActionType() { + case instancecontrol.ActionTypeUpdate: + updateActions = append(updateActions, a) + case instancecontrol.ActionTypePatchLabels: + patchActions = append(patchActions, a) + } + } + + // Two Update actions expected (one per instance), ordered highest-to-lowest. + assert.Len(t, updateActions, 2, "both instances must produce Update actions on template change") + assert.Equal(t, "test-backfill-rolling-1", updateActions[0].Object.GetName(), + "Update actions must be ordered highest ordinal first") + assert.Equal(t, "test-backfill-rolling-0", updateActions[1].Object.GetName()) + assert.False(t, updateActions[0].IsSkipped(), "first Update must be active") + assert.True(t, updateActions[1].IsSkipped(), "second Update must be skipped (ordered rollout)") + + // No PatchLabels — all labels are already correct. 
+ assert.Empty(t, patchActions, "no PatchLabels when all labels are already correct") +} + func getWorkloadDeployment(name string, minReplicas int32) *v1alpha.WorkloadDeployment { instance := getInstanceTemplate(name, 0) deployment := &v1alpha.WorkloadDeployment{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: "default", + UID: "test-wd-uid", }, Spec: v1alpha.WorkloadDeploymentSpec{ + WorkloadRef: v1alpha.WorkloadReference{ + Name: "test-workload", + UID: "test-workload-uid", + }, + PlacementName: "test-placement", + CityCode: "DFW", ScaleSettings: v1alpha.HorizontalScaleSettings{ MinReplicas: minReplicas, InstanceManagementPolicy: v1alpha.OrderedReadyInstanceManagementPolicyType, @@ -180,6 +573,20 @@ func getInstanceForDeployment(deployment *v1alpha.WorkloadDeployment, ordinal in TemplateHash: instancecontrol.ComputeHash(deployment.Spec.Template), } + // Stamp all controller-managed labels so that the label-backfill path is a + // no-op for instances built by this helper. Tests that specifically exercise + // label drift should manipulate the labels directly after calling this helper. 
+ if instance.Labels == nil { + instance.Labels = map[string]string{} + } + instance.Labels[v1alpha.InstanceIndexLabel] = strconv.Itoa(ordinal) + instance.Labels[v1alpha.WorkloadUIDLabel] = string(deployment.Spec.WorkloadRef.UID) + instance.Labels[v1alpha.WorkloadDeploymentUIDLabel] = string(deployment.GetUID()) + instance.Labels[v1alpha.WorkloadDeploymentNameLabel] = deployment.GetName() + instance.Labels[v1alpha.CityCodeLabel] = deployment.Spec.CityCode + instance.Labels[v1alpha.WorkloadNameLabel] = deployment.Spec.WorkloadRef.Name + instance.Labels[v1alpha.PlacementNameLabel] = deployment.Spec.PlacementName + return instance } diff --git a/internal/controller/testing_helpers_test.go b/internal/controller/testing_helpers_test.go new file mode 100644 index 00000000..cc3d3d9f --- /dev/null +++ b/internal/controller/testing_helpers_test.go @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1" + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// ─── Scheme helpers ─────────────────────────────────────────────────────────── + +// newProjectScheme builds a runtime.Scheme with the types needed by the project +// cluster (corev1 + compute). +func newProjectScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + return s +} + +// newKarmadaScheme builds a runtime.Scheme with the types needed by the Karmada +// API server (corev1 + compute + karmada policy). 
+func newKarmadaScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = corev1.AddToScheme(s) + _ = computev1alpha.AddToScheme(s) + _ = karmadapolicyv1alpha1.Install(s) + return s +} + +// newProjectFakeClient returns a fake client pre-populated with the given +// objects and the project scheme. +func newProjectFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newProjectScheme()). + WithObjects(objs...). + WithStatusSubresource(objs...). + Build() +} + +// newKarmadaFakeClient returns a fake client pre-populated with the given +// objects and the Karmada scheme. +func newKarmadaFakeClient(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(newKarmadaScheme()). + WithObjects(objs...). + Build() +} + +// ─── Fake cluster.Cluster ───────────────────────────────────────────────────── + +// fakeCluster is a minimal cluster.Cluster implementation for tests. +// Embeds the interface so only the methods we need are implemented. +type fakeCluster struct { + cluster.Cluster // nil embed — panics if unimplemented methods are called + cl client.Client +} + +func (f *fakeCluster) GetClient() client.Client { return f.cl } +func (f *fakeCluster) GetScheme() *runtime.Scheme { return f.cl.Scheme() } +func (f *fakeCluster) GetAPIReader() client.Reader { return f.cl } + +// newFakeCluster wraps a fake client in a fakeCluster. +func newFakeCluster(cl client.Client) *fakeCluster { + return &fakeCluster{cl: cl} +} + +// ─── Fake mcmanager.Manager ─────────────────────────────────────────────────── + +// fakeMCManager is a minimal mcmanager.Manager implementation that serves a +// fixed map of project clusters. Only GetCluster is implemented; all other +// Manager methods panic through the embedded nil interface. 
+type fakeMCManager struct { + mcmanager.Manager // nil embed — panics if unimplemented methods are called + clusters map[string]cluster.Cluster +} + +func (m *fakeMCManager) GetCluster(_ context.Context, name multicluster.ClusterName) (cluster.Cluster, error) { + if c, ok := m.clusters[string(name)]; ok { + return c, nil + } + return nil, fmt.Errorf("cluster %q not found in fake manager", name) +} + +// newFakeMCManager returns a fakeMCManager with a single named cluster. +func newFakeMCManager(clusterName string, cl cluster.Cluster) *fakeMCManager { + return &fakeMCManager{ + clusters: map[string]cluster.Cluster{clusterName: cl}, + } +} diff --git a/internal/controller/workload_controller.go b/internal/controller/workload_controller.go index 6e907b65..6ca92e03 100644 --- a/internal/controller/workload_controller.go +++ b/internal/controller/workload_controller.go @@ -26,13 +26,17 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) -const workloadControllerFinalizer = "compute.datumapis.com/workload-controller" +const ( + workloadControllerFinalizer = "compute.datumapis.com/workload-controller" + workloadConditionTypeAvailable = "Available" +) // WorkloadReconciler reconciles a Workload object type WorkloadReconciler struct { @@ -118,7 +122,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ if len(notFoundNetworks) > 0 { missingNetworks := strings.Join(notFoundNetworks.UnsortedList(), ", ") changed := apimeta.SetStatusCondition(&workload.Status.Conditions, metav1.Condition{ - Type: "Available", + Type: workloadConditionTypeAvailable, Status: metav1.ConditionFalse, 
Reason: "NetworkNotFound", Message: fmt.Sprintf("Unable to find networks: %s", missingNetworks), @@ -383,9 +387,9 @@ func (r *WorkloadReconciler) getDeploymentsForWorkload( existingDeployments.Insert(deployment.Name) } - var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := upstreamClient.List(ctx, &locations); err != nil { - return nil, nil, fmt.Errorf("failed to list locations: %w", err) + return nil, nil, fmt.Errorf("failed to list location bindings: %w", err) } if len(locations.Items) == 0 { @@ -463,7 +467,7 @@ func (r *WorkloadReconciler) SetupWithManager(mgr mcmanager.Manager) error { return mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.Workload{}, mcbuilder.WithEngageWithLocalCluster(false)). Owns(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Watches(&networkingv1alpha.Network{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + Watches(&networkingv1alpha.Network{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, network client.Object) []mcreconcile.Request { logger := log.FromContext(ctx) diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 50e21ef0..9b17266e 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -24,6 +24,7 @@ import ( mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" computev1alpha "go.datum.net/compute/api/v1alpha" @@ -37,11 +38,28 @@ import ( type 
WorkloadDeploymentReconciler struct { mgr mcmanager.Manager finalizers finalizer.Finalizers + // KarmadaClient is an optional client pointing at the Karmada control plane. + // When non-nil, the reconciler writes the WorkloadDeployment status back to + // the Karmada namespace after each reconcile so the WorkloadDeploymentFederator + // can aggregate it into the project-namespace object. Set to nil to disable. + KarmadaClient client.Client + + // NetworkingEnabled controls whether the networking integration with + // network-services-operator is active. When false, NetworkBinding creation is + // skipped, the Network scheduling gate is never added to Instances (and is + // actively removed if present), and the networking step is treated as + // immediately ready. Defaults to true. + NetworkingEnabled bool } // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=locations,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkbindings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=networkcontexts,verbs=get;list;watch +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnetclaims,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.datumapis.com,resources=subnets,verbs=get;list;watch func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) @@ -86,10 +104,6 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco logger.Info("reconciling deployment") defer 
logger.Info("reconcile complete") - if deployment.Status.Location == nil { - return ctrl.Result{}, nil - } - // Collect all instances for this deployment listOpts := client.MatchingLabels{ computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()), @@ -100,7 +114,9 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, fmt.Errorf("failed listing instances: %w", err) } - instanceControl := instancecontrolstateful.New() + instanceControl := instancecontrolstateful.NewWithOptions(instancecontrolstateful.Options{ + NetworkingEnabled: r.NetworkingEnabled, + }) actions, err := instanceControl.GetActions(ctx, cl.GetScheme(), &deployment, instances.Items) if err != nil { @@ -122,9 +138,26 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco } } - networkReady, err := r.reconcileNetworks(ctx, cl.GetClient(), &deployment) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + // When networking is disabled, bypass the entire network provisioning path. + // The Network scheduling gate is treated as cleared and no NetworkBindings + // are created. This lets Instances reach the runtime on cells where + // network-services-operator (VPC) is not yet available. + var networkReady bool + if !r.NetworkingEnabled { + networkReady = true + } else { + var resolvedLocation *networkingv1alpha.LocationReference + networkReady, resolvedLocation, err = r.reconcileNetworks(ctx, cl.GetClient(), &deployment) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling networks: %w", err) + } + // Persist the resolved Location to status so downstream components (e.g. + // the stateful instance control strategy) can propagate it to Instances. + // When no matching Location exists, resolvedLocation is nil and + // Status.Location remains nil — instance creation is not blocked. 
+ if resolvedLocation != nil { + deployment.Status.Location = resolvedLocation + } } // Networks are all ready with subnets ready to use, remove any scheduling @@ -143,59 +176,59 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco return ctrl.Result{}, err } - patchResult, err := controllerutil.CreateOrPatch(ctx, cl.GetClient(), &deployment, func() error { - deployment.Status.Replicas = int32(replicas) - deployment.Status.CurrentReplicas = int32(currentReplicas) - deployment.Status.DesiredReplicas = desiredReplicas - deployment.Status.ReadyReplicas = int32(readyReplicas) - - if quotaBlockedReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionFalse, - Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, - Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), - }) - } else { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentReplicasReady, - Status: metav1.ConditionTrue, - Reason: "ReplicasAvailable", - Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), - }) - } - - if readyReplicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionTrue, - Reason: "StableInstanceFound", - Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), - }) - } else if !networkReady { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningNetwork", - Message: "Network is being provisioned", - }) - } else if replicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: 
computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningInstances", - Message: "Instances are being provisioned", - }) - } + deployment.Status.Replicas = int32(replicas) + deployment.Status.CurrentReplicas = int32(currentReplicas) + deployment.Status.DesiredReplicas = desiredReplicas + deployment.Status.ReadyReplicas = int32(readyReplicas) + + if quotaBlockedReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas), + }) + } else { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentReplicasReady, + Status: metav1.ConditionTrue, + Reason: "ReplicasAvailable", + Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas), + }) + } - return nil - }) + if readyReplicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionTrue, + Reason: "StableInstanceFound", + Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), + }) + } else if !networkReady { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningNetwork", + Message: "Network is being provisioned", + }) + } else if replicas > 0 { + apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: "ProvisioningInstances", + Message: "Instances are being provisioned", + }) + } - if err != nil { + if err := 
cl.GetClient().Status().Update(ctx, &deployment); err != nil { return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err) } - logger.Info("deployment status processed", "operation_result", patchResult) + if err := r.writeStatusToKarmada(ctx, &deployment); err != nil { + return ctrl.Result{}, err + } + + logger.Info("deployment status updated") return ctrl.Result{}, nil } @@ -240,13 +273,70 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( return currentReplicas, readyReplicas, quotaBlockedReplicas, nil } +// writeStatusToKarmada copies the WorkloadDeployment status to the matching +// object in the Karmada namespace so the WorkloadDeploymentFederator can +// sync it back to the project-namespace object on the control plane. +// It is a no-op when KarmadaClient is nil. +func (r *WorkloadDeploymentReconciler) writeStatusToKarmada(ctx context.Context, deployment *computev1alpha.WorkloadDeployment) error { + if r.KarmadaClient == nil { + return nil + } + + var kd computev1alpha.WorkloadDeployment + if err := r.KarmadaClient.Get(ctx, client.ObjectKeyFromObject(deployment), &kd); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed getting Karmada WD for status writeback: %w", err) + } + + kd.Status = deployment.Status + // Use Update (not Patch) so all required status fields are present in the + // request body; MergeFrom omits unchanged zero-value int32 fields which + // would fail the CRD's required constraints on currentReplicas/readyReplicas. + if err := r.KarmadaClient.Status().Update(ctx, &kd); err != nil { + return fmt.Errorf("failed updating Karmada WD status: %w", err) + } + + return nil +} + +// reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all +// network interfaces on the deployment. It returns (networkReady, resolvedLocation, err). +// resolvedLocation is non-nil when a Location matching the deployment's city code +// was found; nil otherwise. 
Instance creation is never gated on resolvedLocation +// being non-nil — callers must treat a nil location as best-effort only. func (r *WorkloadDeploymentReconciler) reconcileNetworks( ctx context.Context, c client.Client, deployment *computev1alpha.WorkloadDeployment, -) (bool, error) { +) (bool, *networkingv1alpha.LocationReference, error) { logger := log.FromContext(ctx) + // Resolve the Location for this deployment's city code. With Karmada + // propagation the WorkloadDeployment lands in the cluster that serves the + // requested city, so the Location object for that city must exist locally. + var locationList networkingv1alpha.LocationList + if err := c.List(ctx, &locationList); err != nil { + return false, nil, fmt.Errorf("failed to list locations: %w", err) + } + + var locationRef *networkingv1alpha.LocationReference + for _, loc := range locationList.Items { + if cityCode, ok := loc.Spec.Topology["topology.datum.net/city-code"]; ok && cityCode == deployment.Spec.CityCode { + locationRef = &networkingv1alpha.LocationReference{ + Name: loc.Name, + Namespace: loc.Namespace, + } + break + } + } + + if locationRef == nil { + logger.Info("no location found for city code, waiting", "cityCode", deployment.Spec.CityCode) + return false, nil, nil + } + // First, ensure we have a NetworkBinding for each interface, and that the // binding is ready before we move on to create SubnetClaims. 
@@ -260,7 +350,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkBindingObjectKey, &networkBinding); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network binding: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network binding: %w", err) } if networkBinding.CreationTimestamp.IsZero() { @@ -271,16 +361,16 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( }, Spec: networkingv1alpha.NetworkBindingSpec{ Network: networkInterface.Network, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetControllerReference(deployment, &networkBinding, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on network binding: %w", err) + return false, nil, fmt.Errorf("failed to set controller on network binding: %w", err) } if err := c.Create(ctx, &networkBinding); err != nil { - return false, fmt.Errorf("failed creating network binding: %w", err) + return false, nil, fmt.Errorf("failed creating network binding: %w", err) } } @@ -293,7 +383,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( if !allNetworkBindingsReady { logger.Info("waiting for network bindings to be ready") - return false, nil + return false, locationRef, nil } // TODO(jreese): Currently this makes a SubnetClaim that will be used by @@ -312,12 +402,12 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.Get(ctx, networkContextObjectKey, &networkContext); client.IgnoreNotFound(err) != nil { - return false, fmt.Errorf("failed checking for existing network context: %w", err) + return false, nil, fmt.Errorf("failed checking for existing network context: %w", err) } if !apimeta.IsStatusConditionTrue(networkContext.Status.Conditions, networkingv1alpha.NetworkContextReady) { logger.Info("waiting for network context to be ready", "network_context", networkContext.Name) - return false, nil + return 
false, locationRef, nil } var subnetClaims networkingv1alpha.SubnetClaimList @@ -326,7 +416,7 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } if err := c.List(ctx, &subnetClaims, listOpts...); err != nil { - return false, fmt.Errorf("failed listing subnet claims: %w", err) + return false, nil, fmt.Errorf("failed listing subnet claims: %w", err) } var subnetClaim networkingv1alpha.SubnetClaim @@ -347,8 +437,8 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( } // If it's not the same location, don't consider the subnet claim. - if claim.Spec.Location.Namespace != deployment.Status.Location.Namespace || - claim.Spec.Location.Name != deployment.Status.Location.Name { + if claim.Spec.Location.Namespace != locationRef.Namespace || + claim.Spec.Location.Name != locationRef.Name { continue } @@ -371,28 +461,28 @@ func (r *WorkloadDeploymentReconciler) reconcileNetworks( NetworkContext: networkingv1alpha.LocalNetworkContextRef{ Name: networkContext.Name, }, - Location: *deployment.Status.Location, + Location: *locationRef, }, } if err := controllerutil.SetOwnerReference(&networkContext, &subnetClaim, c.Scheme()); err != nil { - return false, fmt.Errorf("failed to set controller on subnet claim: %w", err) + return false, nil, fmt.Errorf("failed to set controller on subnet claim: %w", err) } if err := c.Create(ctx, &subnetClaim); err != nil { - return false, fmt.Errorf("failed creating subnet claim: %w", err) + return false, nil, fmt.Errorf("failed creating subnet claim: %w", err) } logger.Info("created subnet claim", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } logger.Info("found subnet claim", "subnetClaim", subnetClaim.Name) if !apimeta.IsStatusConditionTrue(subnetClaim.Status.Conditions, "Ready") { logger.Info("waiting for subnet claim to be ready", "subnetClaim", subnetClaim.Name) - return false, nil + return false, locationRef, nil } var subnet networkingv1alpha.Subnet @@ -401,19 +491,19 @@ func (r 
*WorkloadDeploymentReconciler) reconcileNetworks( Name: subnetClaim.Status.SubnetRef.Name, } if err := c.Get(ctx, subnetObjectKey, &subnet); err != nil { - return false, fmt.Errorf("failed fetching subnet: %w", err) + return false, nil, fmt.Errorf("failed fetching subnet: %w", err) } if !apimeta.IsStatusConditionTrue(subnet.Status.Conditions, "Ready") { logger.Info("waiting for subnet to be ready", "subnet", subnet.Name) - return false, nil + return false, locationRef, nil } logger.Info("subnet is ready", "subnet", subnet.Name) } - return true, nil + return true, locationRef, nil } var errDeploymentHasInstances = errors.New("deployment has instances") @@ -468,47 +558,65 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager) e if err := r.finalizers.Register(workloadControllerFinalizer, r); err != nil { return fmt.Errorf("failed to register finalizer: %w", err) } - return mcbuilder.ControllerManagedBy(mgr). + + b := mcbuilder.ControllerManagedBy(mgr). For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)). - Owns(&computev1alpha.Instance{}). - Owns(&networkingv1alpha.NetworkBinding{}). - Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnetClaim := o.(*networkingv1alpha.SubnetClaim) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + Owns(&computev1alpha.Instance{}) + + // Only watch networking resources when the networking integration is enabled. + // On cells without network-services-operator these watches would log spurious + // errors for missing CRDs. + if r.NetworkingEnabled { + b = b. + Owns(&networkingv1alpha.NetworkBinding{}). 
+ Watches(&networkingv1alpha.SubnetClaim{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnetClaim := o.(*networkingv1alpha.SubnetClaim) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnetClaim.Spec.Location) + }) + }). + Watches(&networkingv1alpha.Subnet{}, func(clusterName multicluster.ClusterName, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { + return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { + subnet := o.(*networkingv1alpha.Subnet) + return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) + }) }) - }). - Watches(&networkingv1alpha.Subnet{}, func(clusterName string, cl cluster.Cluster) handler.TypedEventHandler[client.Object, mcreconcile.Request] { - return handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []mcreconcile.Request { - subnet := o.(*networkingv1alpha.Subnet) - return enqueueWorkloadDeploymentByLocation(ctx, mgr, clusterName, subnet.Spec.Location) - }) - }). 
- Complete(r) + } + + return b.Complete(r) } -func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName string, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { +func enqueueWorkloadDeploymentByLocation(ctx context.Context, mgr mcmanager.Manager, clusterName multicluster.ClusterName, locationRef networkingv1alpha.LocationReference) []mcreconcile.Request { logger := log.FromContext(ctx) - cluster, err := mgr.GetCluster(ctx, clusterName) + cl, err := mgr.GetCluster(ctx, clusterName) if err != nil { logger.Error(err, "failed to get cluster") return nil } - clusterClient := cluster.GetClient() + clusterClient := cl.GetClient() - locationName := (types.NamespacedName{ + // Resolve the Location to find its city code, then look up WorkloadDeployments + // that target the same city via the deploymentCityCodeIndex. + var location networkingv1alpha.Location + if err := clusterClient.Get(ctx, types.NamespacedName{ Namespace: locationRef.Namespace, Name: locationRef.Name, - }).String() - listOpts := client.MatchingFields{ - deploymentLocationIndex: locationName, + }, &location); err != nil { + logger.Error(err, "failed to get location for enqueue", "location", locationRef) + return nil } - var workloadDeployments computev1alpha.WorkloadDeploymentList + cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] + if !ok { + return nil + } - if err := clusterClient.List(ctx, &workloadDeployments, listOpts); err != nil { - logger.Error(err, "failed to list workloads") + var workloadDeployments computev1alpha.WorkloadDeploymentList + if err := clusterClient.List(ctx, &workloadDeployments, client.MatchingFields{ + deploymentCityCodeIndex: cityCode, + }); err != nil { + logger.Error(err, "failed to list workload deployments") return nil } diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go new file mode 100644 index 00000000..9c736cf0 --- 
/dev/null
+++ b/internal/controller/workloaddeployment_federator.go
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package controller
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/equality"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+	"sigs.k8s.io/controller-runtime/pkg/finalizer"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder"
+	mccontext "sigs.k8s.io/multicluster-runtime/pkg/context"
+	mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager"
+	mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile"
+
+	karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1"
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+	"go.miloapis.com/milo/pkg/downstreamclient"
+)
+
+const (
+	// federatorFinalizer is added to project-namespace WorkloadDeployments that
+	// have been federated to the downstream control plane. It ensures we clean up
+	// the downstream object and any orphaned PropagationPolicies before the project
+	// object is permanently deleted.
+	federatorFinalizer = "compute.datumapis.com/federator"
+
+	// cityCodeLabel is applied to WorkloadDeployments in the downstream namespace
+	// and is used by PropagationPolicy selectors to route them to the correct
+	// POP-cell clusters. Downstream Cluster objects are expected to carry this
+	// label with their city-code value.
+	cityCodeLabel = "topology.datum.net/city-code"
+
+	// kindWorkloadDeployment is the Kind string for WorkloadDeployment resources.
+	kindWorkloadDeployment = "WorkloadDeployment"
+)
+
+// WorkloadDeploymentFederator replicates WorkloadDeployments from project
+// namespaces into the downstream control plane so it can propagate them to the
+// appropriate POP-cell clusters.
+//
+// For each WorkloadDeployment the controller:
+//  1. Determines the downstream namespace via the "ns-{namespace UID}"
+//     convention (matching the MappedNamespaceResourceStrategy used by
+//     go.datum.net/network-services-operator; this logic will migrate to Milo
+//     once the shared library is promoted).
+//  2. Upserts a corresponding WorkloadDeployment in that downstream namespace,
+//     stamped with label topology.datum.net/city-code={spec.cityCode}.
+//  3. Lazily creates a PropagationPolicy per city code per downstream namespace
+//     that selects WorkloadDeployments by the city-code label and targets
+//     clusters carrying the same label. The PP is deleted once no deployments
+//     with that city code remain in the namespace.
+//  4. Reads the aggregated status from the downstream control plane and writes
+//     it back to the project-namespace object.
+//  5. On deletion: removes the downstream WorkloadDeployment and cleans up
+//     unused PropagationPolicies.
+type WorkloadDeploymentFederator struct {
+	// mgr resolves the per-request project cluster; it is set by SetupWithManager.
+	mgr mcmanager.Manager
+	// FederationClient is a client pointed at the Karmada federation control
+	// plane (the federation hub that the management controllers read and write
+	// through). The caller (cmd/main.go) constructs it from --federation-kubeconfig.
+	FederationClient client.Client
+	// finalizers holds the registered federatorFinalizer handler (this type's
+	// Finalize method).
+	finalizers finalizer.Finalizers
+}
+
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/finalizers,verbs=update
+// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list
+
+// Reconcile federates a single project-namespace WorkloadDeployment to the
+// downstream control plane. It is a no-op when FederationClient is nil, when
+// the object no longer exists, or when the object is being deleted and
+// finalization made no change. Any other failure is returned as an error.
+func (r *WorkloadDeploymentFederator) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) {
+	if r.FederationClient == nil {
+		return ctrl.Result{}, nil
+	}
+
+	logger := log.FromContext(ctx)
+
+	cl, err := r.mgr.GetCluster(ctx, req.ClusterName)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	// Finalize reads the cluster name back out of the context, so it must be
+	// stored before the finalizers run.
+	ctx = mccontext.WithCluster(ctx, req.ClusterName)
+
+	var deployment computev1alpha.WorkloadDeployment
+	if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
+	}
+
+	finalizationResult, err := r.finalizers.Finalize(ctx, &deployment)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to finalize: %w", err)
+	}
+	// When the finalizer list changed (added on first sight, removed after
+	// cleanup), persist it and stop; federation is not attempted in this pass.
+	if finalizationResult.Updated {
+		if err = cl.GetClient().Update(ctx, &deployment); err != nil {
+			return ctrl.Result{}, fmt.Errorf("failed to update based on finalization result: %w", err)
+		}
+		return ctrl.Result{}, nil
+	}
+
+	if !deployment.DeletionTimestamp.IsZero() {
+		return ctrl.Result{}, nil
+	}
+
+	logger.Info("federating deployment to downstream control plane")
+
+	// Determine the downstream namespace for this project namespace using the
+	// "ns-{namespace UID}" convention (MappedNamespaceResourceStrategy).
+	// Using strategy.GetClient() for writes ensures the downstream namespace is
+	// created with UpstreamOwnerNamespaceLabel so the InstanceProjector can
+	// resolve the target project namespace without scanning all namespaces.
+	strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(req.ClusterName), cl.GetClient(), r.FederationClient)
+	downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to determine downstream namespace: %w", err)
+	}
+
+	// Ensure the downstream namespace exists and carries the upstream tracking
+	// labels so the InstanceProjector can resolve the project namespace by label
+	// lookup instead of scanning all namespaces.
+	if err := r.ensureDownstreamNamespace(ctx, downstreamNS, deployment.Namespace, string(req.ClusterName)); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// Upsert the WorkloadDeployment in the downstream control plane via the
+	// strategy client so any future Create calls also go through
+	// ensureDownstreamNamespace automatically.
+	if err := r.upsertDownstreamDeployment(ctx, strategy.GetClient(), &deployment, downstreamNS); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// Lazily create the PropagationPolicy that targets clusters with the matching
+	// city-code label.
+	if err := r.ensurePropagationPolicy(ctx, downstreamNS, deployment.Spec.CityCode); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// Pull aggregated status from the downstream control plane back into the
+	// project namespace.
+	if err := r.syncStatusFromDownstream(ctx, cl.GetClient(), &deployment, downstreamNS); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	logger.Info("federation complete")
+	return ctrl.Result{}, nil
+}
+
+// Finalize removes the downstream WorkloadDeployment and, if no other
+// deployments with the same city code remain in the downstream namespace, deletes
+// the PropagationPolicy as well.
+func (r *WorkloadDeploymentFederator) Finalize(ctx context.Context, obj client.Object) (finalizer.Result, error) {
+	// Without a federation client nothing is written downstream by this
+	// controller, so there is nothing to clean up.
+	if r.FederationClient == nil {
+		return finalizer.Result{}, nil
+	}
+
+	// Use a checked assertion so that an unexpected object type surfaces as an
+	// error from the finalizer instead of panicking the controller process.
+	deployment, ok := obj.(*computev1alpha.WorkloadDeployment)
+	if !ok {
+		return finalizer.Result{}, fmt.Errorf("federator finalizer expected a WorkloadDeployment, got %T", obj)
+	}
+	logger := log.FromContext(ctx).WithValues(
+		"deployment", deployment.Name,
+		"namespace", deployment.Namespace,
+	)
+
+	// The cluster name is placed in the context by Reconcile; it is needed to
+	// resolve the project cluster whose namespace maps to the downstream one.
+	clusterName, ok := mccontext.ClusterFrom(ctx)
+	if !ok {
+		return finalizer.Result{}, fmt.Errorf("cluster name not found in context")
+	}
+
+	cl, err := r.mgr.GetCluster(ctx, clusterName)
+	if err != nil {
+		return finalizer.Result{}, err
+	}
+
+	strategy := downstreamclient.NewMappedNamespaceResourceStrategy(string(clusterName), cl.GetClient(), r.FederationClient)
+	downstreamNS, err := strategy.GetDownstreamNamespaceNameForUpstreamNamespace(ctx, deployment.Namespace)
+	if err != nil {
+		return finalizer.Result{}, fmt.Errorf("failed to determine downstream namespace during finalization: %w", err)
+	}
+
+	// Delete the downstream WorkloadDeployment. A missing object is treated as
+	// already cleaned up.
+	kd := &computev1alpha.WorkloadDeployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      deployment.Name,
+			Namespace: downstreamNS,
+		},
+	}
+	if err := r.FederationClient.Delete(ctx, kd); client.IgnoreNotFound(err) != nil {
+		return finalizer.Result{}, fmt.Errorf("failed to delete downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err)
+	}
+	logger.Info("deleted downstream WorkloadDeployment", "downstreamNamespace", downstreamNS)
+
+	// Clean up the PropagationPolicy if no other deployments with the same city
+	// code remain in this downstream namespace.
+	if err := r.cleanupPropagationPolicyIfUnused(ctx, downstreamNS, deployment.Spec.CityCode); err != nil {
+		return finalizer.Result{}, err
+	}
+
+	return finalizer.Result{}, nil
+}
+
+// ensureDownstreamNamespace creates or updates the downstream namespace, stamping
+// it with the upstream tracking labels that MappedNamespaceResourceStrategy uses.
+// This allows the InstanceProjector to resolve the project namespace name via a
+// direct label lookup rather than scanning all namespaces by UID.
+func (r *WorkloadDeploymentFederator) ensureDownstreamNamespace(ctx context.Context, name, upstreamNamespace, clusterName string) error {
+	// Label value identifying the owning project cluster: the cluster name with
+	// every "/" replaced by "_", prefixed with "cluster-".
+	ownerCluster := fmt.Sprintf("cluster-%s", strings.ReplaceAll(clusterName, "/", "_"))
+
+	downstream := &corev1.Namespace{}
+	downstream.Name = name
+
+	// stampLabels is applied to the namespace both when it is created and when
+	// an existing one is updated.
+	stampLabels := func() error {
+		if downstream.Labels == nil {
+			downstream.Labels = map[string]string{}
+		}
+		downstream.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] = ownerCluster
+		downstream.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = upstreamNamespace
+		return nil
+	}
+
+	if _, err := controllerutil.CreateOrUpdate(ctx, r.FederationClient, downstream, stampLabels); err != nil {
+		return fmt.Errorf("failed to ensure downstream namespace %q: %w", name, err)
+	}
+	return nil
+}
+
+// upsertDownstreamDeployment creates or updates the WorkloadDeployment in the
+// downstream namespace via the provided client (expected to be strategy.GetClient()
+// so the downstream namespace is created with upstream tracking labels).
+func (r *WorkloadDeploymentFederator) upsertDownstreamDeployment(
+	ctx context.Context,
+	downstreamClient client.Client,
+	deployment *computev1alpha.WorkloadDeployment,
+	downstreamNS string,
+) error {
+	mirror := &computev1alpha.WorkloadDeployment{}
+	mirror.Name = deployment.Name
+	mirror.Namespace = downstreamNS
+
+	// copyFromUpstream stamps the routing/tracking labels and mirrors the spec
+	// of the project-namespace deployment onto the downstream object.
+	copyFromUpstream := func() error {
+		if mirror.Labels == nil {
+			mirror.Labels = map[string]string{}
+		}
+		mirror.Labels[cityCodeLabel] = deployment.Spec.CityCode
+		mirror.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] = deployment.Namespace
+		mirror.Spec = deployment.Spec
+		return nil
+	}
+
+	outcome, err := controllerutil.CreateOrPatch(ctx, downstreamClient, mirror, copyFromUpstream)
+	if err != nil {
+		return fmt.Errorf("failed to upsert downstream deployment %s/%s: %w", downstreamNS, deployment.Name, err)
+	}
+
+	log.FromContext(ctx).Info("upserted downstream deployment", "result", outcome, "downstreamNamespace", downstreamNS)
+	return nil
+}
+
+// ensurePropagationPolicy upserts the per-city PropagationPolicy in the
+// downstream namespace. The policy selects every WorkloadDeployment labelled
+// with the given city code and places it on clusters that carry the same
+// city-code label. The policy spec is overwritten on every call.
+func (r *WorkloadDeploymentFederator) ensurePropagationPolicy(
+	ctx context.Context,
+	downstreamNS string,
+	cityCode string,
+) error {
+	policy := &karmadapolicyv1alpha1.PropagationPolicy{}
+	policy.Name = propagationPolicyNameFor(cityCode)
+	policy.Namespace = downstreamNS
+
+	// byCity builds a fresh selector on each call so the resource selector and
+	// the cluster affinity never share the same underlying map.
+	byCity := func() *metav1.LabelSelector {
+		return &metav1.LabelSelector{
+			MatchLabels: map[string]string{
+				cityCodeLabel: cityCode,
+			},
+		}
+	}
+
+	outcome, err := controllerutil.CreateOrPatch(ctx, r.FederationClient, policy, func() error {
+		// Selecting by label (rather than by individual resource names) means
+		// new deployments for this city are picked up without touching the
+		// policy again.
+		selector := karmadapolicyv1alpha1.ResourceSelector{
+			APIVersion:    computev1alpha.GroupVersion.String(),
+			Kind:          kindWorkloadDeployment,
+			LabelSelector: byCity(),
+		}
+
+		// Route to clusters that carry the same city-code label. POP-cell
+		// clusters registered with the downstream control plane must be
+		// labeled accordingly.
+		placement := karmadapolicyv1alpha1.Placement{
+			ClusterAffinity: &karmadapolicyv1alpha1.ClusterAffinity{
+				LabelSelector: byCity(),
+			},
+		}
+
+		policy.Spec = karmadapolicyv1alpha1.PropagationSpec{
+			ResourceSelectors: []karmadapolicyv1alpha1.ResourceSelector{selector},
+			Placement:         placement,
+		}
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("failed to upsert PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err)
+	}
+
+	log.FromContext(ctx).Info("upserted PropagationPolicy", "result", outcome, "cityCode", cityCode, "downstreamNamespace", downstreamNS)
+	return nil
+}
+
+// syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment
+// from the downstream namespace and writes it back to the project-namespace
+// object. It is a no-op when the downstream object does not yet exist.
+func (r *WorkloadDeploymentFederator) syncStatusFromDownstream(
+	ctx context.Context,
+	projectClient client.Client,
+	deployment *computev1alpha.WorkloadDeployment,
+	downstreamNS string,
+) error {
+	var kd computev1alpha.WorkloadDeployment
+	if err := r.FederationClient.Get(ctx, types.NamespacedName{
+		Name:      deployment.Name,
+		Namespace: downstreamNS,
+	}, &kd); err != nil {
+		if apierrors.IsNotFound(err) {
+			return nil
+		}
+		return fmt.Errorf("failed to get downstream deployment for status sync: %w", err)
+	}
+
+	// Skip the write when the project object already reflects the downstream
+	// status.
+	if equality.Semantic.DeepEqual(deployment.Status, kd.Status) {
+		return nil
+	}
+
+	deployment.Status = kd.Status
+	if err := projectClient.Status().Update(ctx, deployment); err != nil {
+		return fmt.Errorf("failed to write downstream status back to project deployment: %w", err)
+	}
+	return nil
+}
+
+// cleanupPropagationPolicyIfUnused deletes the PropagationPolicy for the given
+// city code if no live WorkloadDeployments with that city code remain in the
+// downstream namespace. Deployments that already carry a deletion timestamp
+// are ignored: the caller has just issued a delete for one of them, and if
+// that object is still held by a finalizer it must not keep the policy alive,
+// otherwise the policy would never be removed. A missing policy is not an
+// error.
+func (r *WorkloadDeploymentFederator) cleanupPropagationPolicyIfUnused(
+	ctx context.Context,
+	downstreamNS string,
+	cityCode string,
+) error {
+	var remaining computev1alpha.WorkloadDeploymentList
+	if err := r.FederationClient.List(ctx, &remaining,
+		client.InNamespace(downstreamNS),
+		client.MatchingLabels{cityCodeLabel: cityCode},
+	); err != nil {
+		return fmt.Errorf("failed to list remaining downstream deployments for city %q: %w", cityCode, err)
+	}
+
+	for i := range remaining.Items {
+		if remaining.Items[i].DeletionTimestamp.IsZero() {
+			// Other deployments still need this PropagationPolicy.
+			return nil
+		}
+	}
+
+	pp := &karmadapolicyv1alpha1.PropagationPolicy{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      propagationPolicyNameFor(cityCode),
+			Namespace: downstreamNS,
+		},
+	}
+	if err := r.FederationClient.Delete(ctx, pp); client.IgnoreNotFound(err) != nil {
+		return fmt.Errorf("failed to delete PropagationPolicy for city %q in %s: %w", cityCode, downstreamNS, err)
+	}
+
+	log.FromContext(ctx).Info("deleted PropagationPolicy (no more deployments for city)", "cityCode", cityCode, "downstreamNamespace", downstreamNS)
+	return nil
+}
+
+// SetupWithManager registers the controller with the multicluster manager.
+// It must only be called when FederationClient is non-nil.
+func (r *WorkloadDeploymentFederator) SetupWithManager(mgr mcmanager.Manager) error {
+	r.mgr = mgr
+	r.finalizers = finalizer.NewFinalizers()
+	if err := r.finalizers.Register(federatorFinalizer, r); err != nil {
+		return fmt.Errorf("failed to register federator finalizer: %w", err)
+	}
+	return mcbuilder.ControllerManagedBy(mgr).
+		For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)).
+		Named("workload-deployment-federator").
+		Complete(r)
+}
+
+// propagationPolicyNameFor returns the PropagationPolicy name for a given city
+// code. The name is stable and deterministic so that multiple reconciles of
+// different deployments sharing the same city code converge on the same policy.
+func propagationPolicyNameFor(cityCode string) string {
+	// Sanitize the city code to a valid Kubernetes name: lower-case, spaces → hyphens.
+	sanitized := strings.ToLower(strings.ReplaceAll(cityCode, " ", "-"))
+	return fmt.Sprintf("city-%s", sanitized)
+}
diff --git a/internal/controller/workloaddeployment_federator_test.go b/internal/controller/workloaddeployment_federator_test.go
new file mode 100644
index 00000000..2bd2169f
--- /dev/null
+++ b/internal/controller/workloaddeployment_federator_test.go
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package controller
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/finalizer"
+	mccontext "sigs.k8s.io/multicluster-runtime/pkg/context"
+	mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile"
+
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+)
+
+// ─── Shared test constants ────────────────────────────────────────────────────
+
+const (
+	testCluster      = "test-project-cluster"
+	testProjNS       = "my-project"
+	testProjNSUID    = types.UID("aabbccdd-0000-1111-2222-333344445555")
+	testKarmadaNSStr = "ns-aabbccdd-0000-1111-2222-333344445555"
+	testWDName       = "my-workload-deployment"
+	testCityCodeLAX  = "LAX"
+)
+
+// ─── Test helpers ─────────────────────────────────────────────────────────────
+
+// testProjectNamespace returns a corev1.Namespace for the project cluster with a
+// stable UID that matches testKarmadaNSStr.
+func testProjectNamespace() *corev1.Namespace {
+	return &corev1.Namespace{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: testProjNS,
+			UID:  testProjNSUID,
+		},
+	}
+}
+
+// testWorkloadDeployment returns a WorkloadDeployment with the given options.
+func testWorkloadDeployment(opts ...func(*computev1alpha.WorkloadDeployment)) *computev1alpha.WorkloadDeployment {
+	// Baseline fixture: an LAX deployment in the project namespace.
+	spec := computev1alpha.WorkloadDeploymentSpec{
+		CityCode:      testCityCodeLAX,
+		WorkloadRef:   computev1alpha.WorkloadReference{Name: "test-workload"},
+		PlacementName: testDefaultPlacement,
+		ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1},
+	}
+	fixture := &computev1alpha.WorkloadDeployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      testWDName,
+			Namespace: testProjNS,
+			UID:       "wd-uid-1111",
+		},
+		Spec: spec,
+	}
+
+	// Apply the caller's mutations in the order they were supplied.
+	for i := range opts {
+		opts[i](fixture)
+	}
+	return fixture
+}
+
+// withFinalizer appends federatorFinalizer to the deployment's finalizer list.
+func withFinalizer(wd *computev1alpha.WorkloadDeployment) {
+	wd.Finalizers = append(wd.Finalizers, federatorFinalizer)
+}
+
+// withDeletionTimestamp marks the deployment as being deleted by giving it a
+// DeletionTimestamp five seconds in the past.
+func withDeletionTimestamp(wd *computev1alpha.WorkloadDeployment) {
+	deletedAt := metav1.NewTime(time.Now().Add(-5 * time.Second))
+	wd.DeletionTimestamp = &deletedAt
+}
+
+// newTestFederator builds a WorkloadDeploymentFederator whose manager serves
+// the given project client under testCluster and whose FederationClient is the
+// given downstream client. The federator finalizer is registered up front so
+// Reconcile can add it and run deletions.
+func newTestFederator(projectClient client.Client, karmadaClient client.Client) *WorkloadDeploymentFederator {
+	federator := &WorkloadDeploymentFederator{
+		mgr:              newFakeMCManager(testCluster, newFakeCluster(projectClient)),
+		FederationClient: karmadaClient,
+	}
+
+	registry := finalizer.NewFinalizers()
+	if err := registry.Register(federatorFinalizer, federator); err != nil {
+		panic("failed to register test finalizer: " + err.Error())
+	}
+	federator.finalizers = registry
+	return federator
+}
+
+// reconcileRequest builds an mcreconcile.Request for the test WorkloadDeployment.
+func reconcileRequest() mcreconcile.Request {
+	key := types.NamespacedName{Namespace: testProjNS, Name: testWDName}
+	return mcreconcile.Request{
+		ClusterName: testCluster,
+		Request:     ctrl.Request{NamespacedName: key},
+	}
+}
+
+// ─── Unit tests ───────────────────────────────────────────────────────────────
+
+// TestPropagationPolicyNameFor checks that city codes are lower-cased, that
+// spaces become hyphens, and that the result carries the "city-" prefix.
+func TestPropagationPolicyNameFor(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		cityCode string
+		want     string
+	}{
+		{cityCode: "LAX", want: "city-lax"},
+		{cityCode: "lax", want: "city-lax"},
+		{cityCode: "New York", want: "city-new-york"},
+		{cityCode: "LOS ANGELES", want: "city-los-angeles"},
+		{cityCode: "SEA", want: "city-sea"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.cityCode, func(t *testing.T) {
+			t.Parallel()
+			assert.Equal(t, tc.want, propagationPolicyNameFor(tc.cityCode))
+		})
+	}
+}
+
+// TestWorkloadDeploymentFederator_NoFederationClient verifies that Reconcile
+// returns an empty result and no error when FederationClient is nil.
+func TestWorkloadDeploymentFederator_NoFederationClient(t *testing.T) {
+	t.Parallel()
+
+	projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment())
+	federator := newTestFederator(projectClient, nil)
+	federator.FederationClient = nil // explicitly nil
+
+	got, err := federator.Reconcile(context.Background(), reconcileRequest())
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, got)
+}
+
+// TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen verifies that the
+// first reconcile of a brand-new WorkloadDeployment adds the finalizer and
+// returns without federating (the finalizer update triggers a re-queue).
+func TestWorkloadDeploymentFederator_AddsFinalizerOnFirstSeen(t *testing.T) {
+	t.Parallel()
+
+	ctx := context.Background()
+
+	// The fixture starts without the federator finalizer.
+	projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment())
+	karmadaClient := newKarmadaFakeClient()
+	federator := newTestFederator(projectClient, karmadaClient)
+
+	got, err := federator.Reconcile(ctx, reconcileRequest())
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, got)
+
+	// The finalizer must have been persisted on the project-side object.
+	var stored computev1alpha.WorkloadDeployment
+	storedKey := types.NamespacedName{Name: testWDName, Namespace: testProjNS}
+	require.NoError(t, projectClient.Get(ctx, storedKey, &stored))
+	assert.Contains(t, stored.Finalizers, federatorFinalizer)
+
+	// Nothing is written to Karmada in this pass; federation happens on the
+	// next reconcile.
+	var downstream computev1alpha.WorkloadDeploymentList
+	require.NoError(t, karmadaClient.List(ctx, &downstream))
+	assert.Empty(t, downstream.Items, "no Karmada WD should be created on first-seen reconcile")
+}
+
+// TestWorkloadDeploymentFederator_FederatesToKarmada verifies that reconciling
+// a WorkloadDeployment that already carries the finalizer creates the Karmada
+// namespace, the mirrored WorkloadDeployment (with its city-code label) and
+// the per-city PropagationPolicy.
+func TestWorkloadDeploymentFederator_FederatesToKarmada(t *testing.T) {
+	t.Parallel()
+
+	ctx := context.Background()
+
+	projectClient := newProjectFakeClient(testProjectNamespace(), testWorkloadDeployment(withFinalizer))
+	karmadaClient := newKarmadaFakeClient()
+	federator := newTestFederator(projectClient, karmadaClient)
+
+	got, err := federator.Reconcile(ctx, reconcileRequest())
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, got)
+
+	// Karmada namespace must exist.
+	var downstreamNS corev1.Namespace
+	err = karmadaClient.Get(ctx, types.NamespacedName{Name: testKarmadaNSStr}, &downstreamNS)
+	require.NoError(t, err, "Karmada namespace %q should exist", testKarmadaNSStr)
+
+	// Karmada WorkloadDeployment must exist with the city-code label and the
+	// copied spec.
+	var mirrored computev1alpha.WorkloadDeployment
+	mirroredKey := types.NamespacedName{Name: testWDName, Namespace: testKarmadaNSStr}
+	err = karmadaClient.Get(ctx, mirroredKey, &mirrored)
+	require.NoError(t, err, "Karmada WorkloadDeployment should exist")
+	assert.Equal(t, testCityCodeLAX, mirrored.Labels[cityCodeLabel],
+		"city-code label should be set on Karmada WD")
+	assert.Equal(t, testCityCodeLAX, mirrored.Spec.CityCode,
+		"spec.cityCode should be copied from project WD")
+
+	// PropagationPolicy for the city code must exist.
+	policyName := propagationPolicyNameFor(testCityCodeLAX)
+	var policy karmadapolicyv1alpha1.PropagationPolicy
+	policyKey := types.NamespacedName{Name: policyName, Namespace: testKarmadaNSStr}
+	err = karmadaClient.Get(ctx, policyKey, &policy)
+	require.NoError(t, err, "PropagationPolicy %q should exist", policyName)
+
+	// The policy must select WorkloadDeployments by the city-code label.
+	require.Len(t, policy.Spec.ResourceSelectors, 1)
+	selector := policy.Spec.ResourceSelectors[0]
+	assert.Equal(t, computev1alpha.GroupVersion.String(), selector.APIVersion)
+	assert.Equal(t, "WorkloadDeployment", selector.Kind)
+	require.NotNil(t, selector.LabelSelector)
+	assert.Equal(t, testCityCodeLAX, selector.LabelSelector.MatchLabels[cityCodeLabel])
+
+	// The cluster affinity must target clusters carrying the same city-code.
+	affinity := policy.Spec.Placement.ClusterAffinity
+	require.NotNil(t, affinity)
+	require.NotNil(t, affinity.LabelSelector)
+	assert.Equal(t, testCityCodeLAX,
+		affinity.LabelSelector.MatchLabels[cityCodeLabel])
+}
+
+// TestWorkloadDeploymentFederator_Finalization covers the deletion scenarios:
+// cleanup of Karmada resources and conditional PropagationPolicy removal.
+func TestWorkloadDeploymentFederator_Finalization(t *testing.T) {
+	t.Parallel()
+
+	ppName := propagationPolicyNameFor(testCityCodeLAX)
+
+	tests := []struct {
+		name string
+		// karmadaExtra holds additional Karmada objects beyond the "own" WD and PP.
+		karmadaExtra []client.Object
+		// wantPPGone is true when the PropagationPolicy is expected to be deleted.
+		wantPPGone bool
+	}{
+		{
+			name:         "last WD for city — PropagationPolicy removed",
+			karmadaExtra: nil,
+			wantPPGone:   true,
+		},
+		{
+			name: "other WD for same city remains — PropagationPolicy kept",
+			karmadaExtra: []client.Object{
+				// A sibling WD in the same Karmada namespace with the same city-code.
+				&computev1alpha.WorkloadDeployment{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "other-deployment",
+						Namespace: testKarmadaNSStr,
+						Labels:    map[string]string{cityCodeLabel: testCityCodeLAX},
+					},
+					Spec: computev1alpha.WorkloadDeploymentSpec{
+						CityCode:      testCityCodeLAX,
+						PlacementName: "other",
+						WorkloadRef:   computev1alpha.WorkloadReference{Name: "other"},
+						ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1},
+					},
+				},
+			},
+			wantPPGone: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			// Project cluster: namespace + WD with finalizer and deletion timestamp.
+			wd := testWorkloadDeployment(withFinalizer, withDeletionTimestamp)
+			projectClient := newProjectFakeClient(testProjectNamespace(), wd)
+
+			// Karmada cluster: the mirrored WD + its PropagationPolicy + any extras.
+			karmadaWD := &computev1alpha.WorkloadDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      testWDName,
+					Namespace: testKarmadaNSStr,
+					Labels:    map[string]string{cityCodeLabel: testCityCodeLAX},
+				},
+				Spec: computev1alpha.WorkloadDeploymentSpec{
+					CityCode:      testCityCodeLAX,
+					PlacementName: testDefaultPlacement,
+					WorkloadRef:   computev1alpha.WorkloadReference{Name: "test-workload"},
+					ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1},
+				},
+			}
+			karmadaPP := &karmadapolicyv1alpha1.PropagationPolicy{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      ppName,
+					Namespace: testKarmadaNSStr,
+				},
+			}
+			karmadaObjs := []client.Object{
+				&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testKarmadaNSStr}},
+				karmadaWD,
+				karmadaPP,
+			}
+			karmadaObjs = append(karmadaObjs, tt.karmadaExtra...)
+			karmadaClient := newKarmadaFakeClient(karmadaObjs...)
+
+			r := newTestFederator(projectClient, karmadaClient)
+
+			// Reconciling the terminating WD is what invokes Finalize here; the
+			// test never calls Finalize directly.
+			result, err := r.Reconcile(context.Background(), reconcileRequest())
+			require.NoError(t, err)
+			assert.Equal(t, ctrl.Result{}, result)
+
+			ctx := context.Background()
+
+			// The Karmada-side WD must be gone.
+			var remainingWD computev1alpha.WorkloadDeployment
+			err = karmadaClient.Get(ctx, types.NamespacedName{
+				Name:      testWDName,
+				Namespace: testKarmadaNSStr,
+			}, &remainingWD)
+			assert.True(t, apierrors.IsNotFound(err),
+				"Karmada WD %q should be deleted after finalization", testWDName)
+
+			// PropagationPolicy presence depends on whether siblings remain.
+			var remainingPP karmadapolicyv1alpha1.PropagationPolicy
+			err = karmadaClient.Get(ctx, types.NamespacedName{
+				Name:      ppName,
+				Namespace: testKarmadaNSStr,
+			}, &remainingPP)
+			if tt.wantPPGone {
+				assert.True(t, apierrors.IsNotFound(err),
+					"PropagationPolicy should be deleted when no city siblings remain")
+			} else {
+				assert.NoError(t, err,
+					"PropagationPolicy should be kept when other city siblings remain")
+			}
+
+			// The project WD should be gone: once the federator finalizer is removed
+			// from an object that already has a DeletionTimestamp, the API server
+			// (and the fake client) garbage-collects the object.
+			var updatedWD computev1alpha.WorkloadDeployment
+			err = projectClient.Get(ctx,
+				types.NamespacedName{Name: testWDName, Namespace: testProjNS}, &updatedWD)
+			assert.True(t, apierrors.IsNotFound(err),
+				"project WD should be gone after finalizer removal (DeletionTimestamp + empty Finalizers = GC)")
+		})
+	}
+}
+
+// TestWorkloadDeploymentFederator_NotFound verifies that a missing
+// WorkloadDeployment is handled gracefully (no error, no action).
+func TestWorkloadDeploymentFederator_NotFound(t *testing.T) {
+	t.Parallel()
+
+	projectClient := newProjectFakeClient(testProjectNamespace()) // WD missing
+	karmadaClient := newKarmadaFakeClient()
+	r := newTestFederator(projectClient, karmadaClient)
+
+	result, err := r.Reconcile(context.Background(), reconcileRequest())
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, result)
+}
+
+// TestWorkloadDeploymentFederator_Finalize_DirectCall exercises the Finalize
+// method directly, ensuring the cluster name is required in context.
+func TestWorkloadDeploymentFederator_Finalize_DirectCall(t *testing.T) {
+	t.Parallel()
+
+	projectClient := newProjectFakeClient(testProjectNamespace())
+	karmadaClient := newKarmadaFakeClient()
+	r := newTestFederator(projectClient, karmadaClient)
+
+	// The WD is only passed to Finalize; it is not stored in either fake client.
+	wd := testWorkloadDeployment(withFinalizer)
+
+	// Without cluster in context → must return an error.
+	_, err := r.Finalize(context.Background(), wd)
+	require.Error(t, err, "Finalize without cluster context should fail")
+	assert.Contains(t, err.Error(), "cluster name not found")
+
+	// With cluster in context → must succeed (karmada client returns not-found, which is OK).
+	ctx := mccontext.WithCluster(context.Background(), testCluster)
+	result, err := r.Finalize(ctx, wd)
+	require.NoError(t, err)
+	assert.False(t, result.Updated)
+}
diff --git a/internal/controller/workloaddeployment_location_test.go b/internal/controller/workloaddeployment_location_test.go
new file mode 100644
index 00000000..ff996e73
--- /dev/null
+++ b/internal/controller/workloaddeployment_location_test.go
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package controller
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	computev1alpha "go.datum.net/compute/api/v1alpha"
+	networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha"
+)
+
+// newNetworkingScheme returns a scheme with compute + networkingv1alpha types.
+// Registration errors from AddToScheme are deliberately discarded.
+func newNetworkingScheme() *runtime.Scheme {
+	s := runtime.NewScheme()
+	_ = computev1alpha.AddToScheme(s)
+	_ = networkingv1alpha.AddToScheme(s)
+	return s
+}
+
+// TestReconcileNetworks_PersistsLocation_WhenLocationFound verifies that when a
+// Location object matching the deployment's city code exists in the cluster, the
+// resolved LocationReference is returned by reconcileNetworks and can be persisted
+// to deployment.Status.Location. Instance creation must not be blocked — the
+// function returns networkReady=false only because no NetworkInterfaces exist on
+// the deployment in this scenario (short-circuit before bindings), not because
+// Location was absent.
+func TestReconcileNetworks_PersistsLocation_WhenLocationFound(t *testing.T) {
+	t.Parallel()
+
+	const (
+		cityCode          = "DFW"
+		locationName      = "loc-dfw-1"
+		locationNamespace = "networking-system"
+	)
+
+	// A Location whose topology advertises the deployment's city code.
+	dfwLocation := &networkingv1alpha.Location{}
+	dfwLocation.Name = locationName
+	dfwLocation.Namespace = locationNamespace
+	dfwLocation.Spec = networkingv1alpha.LocationSpec{
+		Topology: map[string]string{
+			"topology.datum.net/city-code": cityCode,
+		},
+	}
+
+	fakeClient := fake.NewClientBuilder().
+		WithScheme(newNetworkingScheme()).
+		WithObjects(dfwLocation).
+		Build()
+
+	// The deployment declares no NetworkInterfaces; only the returned location
+	// is asserted on here.
+	deployment := &computev1alpha.WorkloadDeployment{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: "default"},
+		Spec:       computev1alpha.WorkloadDeploymentSpec{CityCode: cityCode},
+	}
+
+	reconciler := &WorkloadDeploymentReconciler{}
+	_, resolved, err := reconciler.reconcileNetworks(context.Background(), fakeClient, deployment)
+
+	require.NoError(t, err)
+	require.NotNil(t, resolved,
+		"resolved location must be non-nil when a matching Location object exists")
+	assert.Equal(t, locationName, resolved.Name)
+	assert.Equal(t, locationNamespace, resolved.Namespace)
+
+	// Mirror what the Reconcile loop does with the result: store it on Status.
+	deployment.Status.Location = resolved
+	assert.Equal(t, locationName, deployment.Status.Location.Name,
+		"Status.Location.Name must match the resolved Location object name")
+}
+
+// TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound verifies that
+// when no Location object in the cluster matches the deployment's city code,
+// reconcileNetworks returns (false, nil, nil) — no error and no resolved
+// location. The caller must treat nil location as best-effort and must NOT block
+// instance creation.
+func TestReconcileNetworks_ReturnsNilLocation_WhenNoLocationFound(t *testing.T) { + t.Parallel() + + s := newNetworkingScheme() + // Cluster has a Location for a DIFFERENT city code. + otherLocation := &networkingv1alpha.Location{ + ObjectMeta: metav1.ObjectMeta{Name: "loc-ord-1", Namespace: "networking-system"}, + Spec: networkingv1alpha.LocationSpec{ + Topology: map[string]string{ + "topology.datum.net/city-code": "ORD", + }, + }, + } + cl := fake.NewClientBuilder().WithScheme(s).WithObjects(otherLocation).Build() + + deployment := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{Name: "test-wd", Namespace: "default"}, + Spec: computev1alpha.WorkloadDeploymentSpec{ + CityCode: "DFW", // no matching Location + }, + } + + r := &WorkloadDeploymentReconciler{} + networkReady, resolvedLocation, err := r.reconcileNetworks(context.Background(), cl, deployment) + + require.NoError(t, err, "missing location must not cause an error") + assert.False(t, networkReady, "network is not ready when no location is found") + assert.Nil(t, resolvedLocation, + "resolved location must be nil when no matching Location object exists") + + // Status.Location remains nil — callers must not update it in this case. + // Confirm the deployment's Status.Location is unaffected (nil → nil). 
+ assert.Nil(t, deployment.Status.Location, + "Status.Location must remain nil when no Location matches the city code") +} diff --git a/internal/controller/workloaddeployment_scheduler.go b/internal/controller/workloaddeployment_scheduler.go deleted file mode 100644 index 041b0d64..00000000 --- a/internal/controller/workloaddeployment_scheduler.go +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package controller - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - apimeta "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" - mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" - mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" - mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" - mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" - - computev1alpha "go.datum.net/compute/api/v1alpha" - networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" -) - -// WorkloadDeploymentScheduler schedules a WorkloadDeployment -type WorkloadDeploymentScheduler struct { - mgr mcmanager.Manager -} - -func (r *WorkloadDeploymentScheduler) Reconcile(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - - cl, err := r.mgr.GetCluster(ctx, req.ClusterName) - if err != nil { - return ctrl.Result{}, err - } - - ctx = mccontext.WithCluster(ctx, req.ClusterName) - var deployment computev1alpha.WorkloadDeployment - if err := cl.GetClient().Get(ctx, req.NamespacedName, &deployment); err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - return ctrl.Result{}, err - } - - if !deployment.DeletionTimestamp.IsZero() { - return ctrl.Result{}, nil - } - - logger.Info("scheduling deployment") - defer 
logger.Info("scheduling complete") - - // TODO(jreese) improve! - // The first iteration of this scheduler will be very simple and only look for - // the first available location that is viable for the deployment. In the - // future, we could see a more advanced system similar to the Kubernetes - // scheduler itself. - - // Step 1: Get Locations - var locations networkingv1alpha.LocationList - if err := cl.GetClient().List(ctx, &locations); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list locations: %w", err) - } - - if len(locations.Items) == 0 { - // Should only be the case in new environments if workloads are created - // prior to location registration. - - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are registered with the system.", - }) - if changed { - // TODO(jreese) investigate kubevirt / other operators for better tracking - // of updates to the status. I seem to remember a "builder" of sorts that - // looked rather nice. 
- if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil - } - - // TODO(jreese) define standard Topology keys somewhere - - var selectedLocation *networkingv1alpha.Location - for _, location := range locations.Items { - cityCode, ok := location.Spec.Topology["topology.datum.net/city-code"] - if ok && cityCode == deployment.Spec.CityCode { - selectedLocation = &location - break - } - } - - if selectedLocation == nil { - changed := apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoCandidateLocations", - ObservedGeneration: deployment.Generation, - Message: "No locations are candidates for this deployment.", - }) - if changed { - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - } - } else { - deployment.Status.Location = &networkingv1alpha.LocationReference{ - Name: selectedLocation.Name, - Namespace: selectedLocation.Namespace, - } - - // TODO(jreese) make sure we don't run into update conflicts with the update - // of the spec then status here. Just can't remember if it's an issue. - - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "LocationAssigned", - ObservedGeneration: deployment.Generation, - Message: "Deployment has been assigned a location.", - }) - - if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update deployment status: %w", err) - } - - } - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. 
-func (r *WorkloadDeploymentScheduler) SetupWithManager(mgr mcmanager.Manager) error { - r.mgr = mgr - return mcbuilder.ControllerManagedBy(mgr). - For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithPredicates( - predicate.NewPredicateFuncs(func(object client.Object) bool { - // Don't process deployments that have been scheduled - o := object.(*computev1alpha.WorkloadDeployment) - return o.Status.Location == nil - }), - )). - Named("workload-deployment-scheduler"). - Complete(r) -} diff --git a/internal/features/features.go b/internal/features/features.go new file mode 100644 index 00000000..8db20f09 --- /dev/null +++ b/internal/features/features.go @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +// Package features defines the feature gates for the compute operator. Feature +// gates follow the Kubernetes component-base convention: each feature is +// declared as a Feature constant, registered with a FeatureSpec that includes +// its default enablement state, and toggled at runtime via the --feature-gates +// flag exposed by the binary. +// +// Usage in cmd/main.go: +// +// features.MutableFeatureGate.AddFlag(flag.CommandLine) +// +// Usage in controllers (read through the read-only FeatureGate view): +// +// if features.FeatureGate.Enabled(features.NetworkingIntegration) { ... } +package features + +import ( + "k8s.io/component-base/featuregate" +) + +const ( + // NetworkingIntegration controls whether the compute operator integrates with + // the network-services-operator (VPC) for NetworkBinding provisioning and the + // Network scheduling gate on Instances. + // + // When disabled: + // - No NetworkBinding objects are created. + // - The Network scheduling gate is not added to newly created Instances. + // - Any existing Network scheduling gate is actively removed. + // - The networking step is treated as immediately ready so Instances + // proceed to the runtime without a NetworkBinding.
+ // + // This flag exists so operators can run compute on edge/lab cells where + // VPC/NSO is not yet functional. The default is true (enabled) so that + // existing production deployments are unaffected. + // + // alpha: v0.1 + NetworkingIntegration featuregate.Feature = "NetworkingIntegration" +) + +// MutableFeatureGate is the mutable feature gate for the compute operator. +// Call MutableFeatureGate.AddFlag to register the --feature-gates flag before +// flag.Parse(). Controllers should read from FeatureGate (the read-only view) +// after startup. +var MutableFeatureGate featuregate.MutableFeatureGate = featuregate.NewFeatureGate() + +// FeatureGate is the read-only view of the compute operator feature gate. +// Use this in controllers and reconcilers rather than MutableFeatureGate to +// avoid accidental mutations after startup. +var FeatureGate featuregate.FeatureGate = MutableFeatureGate + +func init() { + if err := MutableFeatureGate.Add(map[featuregate.Feature]featuregate.FeatureSpec{ + NetworkingIntegration: {Default: true, PreRelease: featuregate.Alpha}, + }); err != nil { + panic(err) + } +} diff --git a/internal/features/features_test.go b/internal/features/features_test.go new file mode 100644 index 00000000..61687064 --- /dev/null +++ b/internal/features/features_test.go @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package features + +import ( + "testing" +) + +// TestNetworkingIntegration_DefaultEnabled verifies that the NetworkingIntegration +// feature gate defaults to enabled so that existing production deployments are +// unaffected when the flag is not set. +func TestNetworkingIntegration_DefaultEnabled(t *testing.T) { + // Use a fresh gate so this test is independent of any global state mutations. 
+ gate := MutableFeatureGate.DeepCopy() + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration default = false, want true") + } +} + +// TestNetworkingIntegration_CanBeDisabled verifies that setting +// NetworkingIntegration=false via the feature gate string disables the +// integration, allowing operators to run compute without VPC/NSO. +func TestNetworkingIntegration_CanBeDisabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=false"); err != nil { + t.Fatalf("Set(NetworkingIntegration=false): %v", err) + } + if gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = true after Set=false, want false") + } +} + +// TestNetworkingIntegration_ExplicitlyEnabled verifies that the gate can be +// explicitly set to true (round-trip). +func TestNetworkingIntegration_ExplicitlyEnabled(t *testing.T) { + gate := MutableFeatureGate.DeepCopy() + if err := gate.Set("NetworkingIntegration=true"); err != nil { + t.Fatalf("Set(NetworkingIntegration=true): %v", err) + } + if !gate.Enabled(NetworkingIntegration) { + t.Error("NetworkingIntegration = false after Set=true, want true") + } +} diff --git a/internal/quota/client.go b/internal/quota/client.go new file mode 100644 index 00000000..acef469c --- /dev/null +++ b/internal/quota/client.go @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package quota + +import ( + "context" + "fmt" + "net/url" + "sync" + + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ProjectQuotaClientManager builds and caches controller-runtime clients that +// target individual Milo project control planes. It is safe for concurrent use. 
+type ProjectQuotaClientManager struct { + baseRestConfig *rest.Config + clients sync.Map // key: projectID (string), value: client.Client +} + +// New returns a ProjectQuotaClientManager that derives per-project REST configs +// from baseRestConfig by rewriting the host path. +func New(baseRestConfig *rest.Config) *ProjectQuotaClientManager { + return &ProjectQuotaClientManager{baseRestConfig: baseRestConfig} +} + +// StoreClient pre-populates the cache with a pre-built client for projectID. +// This is intended for use in unit tests where a real REST server is unavailable. +func (m *ProjectQuotaClientManager) StoreClient(projectID string, cl client.Client) { + m.clients.Store(projectID, cl) +} + +// ClientForProject returns a client.Client targeting the Milo project control +// plane for projectID. Clients are cached per project, so the scheme passed on +// the first call for a project is the one every later caller receives; it must +// include all types callers intend to operate on, including quotav1alpha1. +func (m *ProjectQuotaClientManager) ClientForProject( + ctx context.Context, + projectID string, + scheme *runtime.Scheme, +) (client.Client, error) { + if v, ok := m.clients.Load(projectID); ok { + return v.(client.Client), nil + } + + cfg := rest.CopyConfig(m.baseRestConfig) + apiHost, err := url.Parse(cfg.Host) + if err != nil { + return nil, fmt.Errorf("failed to parse base host: %w", err) + } + apiHost.Path = fmt.Sprintf( + "/apis/resourcemanager.miloapis.com/v1alpha1/projects/%s/control-plane", + projectID, + ) + cfg.Host = apiHost.String() + + cl, err := client.New(cfg, client.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("failed to create client for project %q: %w", projectID, err) + } + // A concurrent caller may have cached a client meanwhile; LoadOrStore keeps the first one. + actual, _ := m.clients.LoadOrStore(projectID, cl) + return actual.(client.Client), nil +} diff --git a/internal/quota/metrics.go b/internal/quota/metrics.go new file mode 100644 index 00000000..5f1788cd --- /dev/null +++ b/internal/quota/metrics.go @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package quota + +import
( + "github.com/prometheus/client_golang/prometheus" + ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +// Values for the "reason" label of compute_quota_eval_failures_total. +const ( + ReasonBackendUnavailable = "backend_unavailable" + ReasonProjectNotFound = "project_not_found" + ReasonNamespaceNotFound = "namespace_not_found" + ReasonMisconfigured = "misconfigured" + ReasonProjectIDUnresolvable = "project_id_unresolvable" + ReasonNoBudget = "no_budget" +) + +var ( + // EnforcementEnabled is a gauge set to 1 when quota enforcement is active + // (a credential path is configured) and 0 when disabled (no path configured). + // This gives dashboards and alerting a stable signal rather than log scraping. + EnforcementEnabled = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "compute_quota_enforcement_enabled", + Help: "1 if quota enforcement is active, 0 if disabled (no credential configured).", + }) + + // EvalFailuresTotal counts quota evaluation failures by reason code. + // Incremented each time quota evaluation fails for a reason other than the + // normal quota-exceeded or quota-pending flow. + EvalFailuresTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "compute_quota_eval_failures_total", + Help: "Total quota evaluation failures by reason code.", + }, []string{"reason"}) + + // ClaimOrphanedTotal counts ResourceClaims orphaned during instance deletion + // because the project ID could not be resolved at deletion time.
+ ClaimOrphanedTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "compute_quota_claim_orphaned_total", + Help: "Total ResourceClaims orphaned because project ID could not be resolved at deletion.", + }) +) + +func init() { + ctrlmetrics.Registry.MustRegister( + EnforcementEnabled, + EvalFailuresTotal, + ClaimOrphanedTotal, + ) +} diff --git a/internal/validation/instance_validation.go b/internal/validation/instance_validation.go index 7f112822..faa5ba0b 100644 --- a/internal/validation/instance_validation.go +++ b/internal/validation/instance_validation.go @@ -17,6 +17,19 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +// Validation constants for well-known string literals used across multiple +// validation functions. +const ( + // diskTypePDStandard is the only currently supported disk type. + diskTypePDStandard = "pd-standard" + + // defaultImageName is the only currently supported disk populator image. + defaultImageName = "datumcloud/ubuntu-2204-lts" + + // defaultInstanceType is the only currently supported instance type.
+ defaultInstanceType = "datumcloud/d1-standard-2" +) + func validateInstanceTemplate( template computev1alpha.InstanceTemplateSpec, fieldPath *field.Path, @@ -97,6 +110,11 @@ func validateInstanceNetworkInterfaces( allErrs = append(allErrs, field.Invalid(networkNameField, networkInterface.Network, msg)) } + extra := make(map[string]authorizationv1.ExtraValue, len(opts.AdmissionRequest.UserInfo.Extra)) + for k, v := range opts.AdmissionRequest.UserInfo.Extra { + extra[k] = authorizationv1.ExtraValue(v) + } + review := authorizationv1.SubjectAccessReview{ Spec: authorizationv1.SubjectAccessReviewSpec{ ResourceAttributes: &authorizationv1.ResourceAttributes{ @@ -110,6 +128,7 @@ func validateInstanceNetworkInterfaces( User: opts.AdmissionRequest.UserInfo.Username, Groups: opts.AdmissionRequest.UserInfo.Groups, UID: opts.AdmissionRequest.UserInfo.UID, + Extra: extra, }, } @@ -258,8 +277,8 @@ func validateDiskVolumeSource(diskSource *computev1alpha.DiskTemplateVolumeSourc diskTemplateSpecField := diskTemplateField.Child("spec") // TODO(jrese) look up valid disk types - if diskTemplate.Spec.Type != "pd-standard" { - allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{"pd-standard"})) + if diskTemplate.Spec.Type != diskTypePDStandard { + allErrs = append(allErrs, field.NotSupported(diskTemplateSpecField.Child("type"), diskTemplate.Spec.Type, []string{diskTypePDStandard})) } populatorResourceRequests, errs := validateDiskPopulator(diskTemplate.Spec.Populator, diskTemplateField.Child("populator")) @@ -400,8 +419,8 @@ func validateDiskPopulator(populator *computev1alpha.DiskPopulator, fieldPath *f // TODO(jreese) look up image imagePopulator := populator.Image - if imagePopulator.Name != "datumcloud/ubuntu-2204-lts" { - allErrs = append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{"datumcloud/ubuntu-2204-lts"})) + if imagePopulator.Name != defaultImageName { + allErrs = 
append(allErrs, field.NotSupported(imageField.Child("name"), imagePopulator.Name, []string{defaultImageName})) } } } @@ -657,8 +676,8 @@ func validateInstanceRuntimeResources(resources computev1alpha.InstanceRuntimeRe allErrs := field.ErrorList{} // TODO(jreese) look up available instance types - if resources.InstanceType != "datumcloud/d1-standard-2" { - allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{"datumcloud/d1-standard-2"})) + if resources.InstanceType != defaultInstanceType { + allErrs = append(allErrs, field.NotSupported(fieldPath, resources.InstanceType, []string{defaultInstanceType})) } if resources.Requests != nil { diff --git a/internal/validation/workload_validation_test.go b/internal/validation/workload_validation_test.go index f73e4c9f..b4e70df7 100644 --- a/internal/validation/workload_validation_test.go +++ b/internal/validation/workload_validation_test.go @@ -23,6 +23,15 @@ import ( networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha" ) +// Test constants for repeated string literals. 
+const ( + testCPUResource = "cpu" + testVolName = "vol" + testDuplicateMountPath = "duplicate-mount-path" + testDefaultNamespace = "default" + testCityCodeDFW = "DFW" +) + func TestValidateWorkloads(t *testing.T) { scenarios := map[string]struct { workload *computev1alpha.Workload @@ -157,7 +166,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(50, resource.DecimalSI), AverageValue: resource.NewQuantity(50, resource.DecimalSI), @@ -181,7 +190,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ Value: resource.NewQuantity(-1, resource.DecimalSI), }, @@ -202,7 +211,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageValue: resource.NewQuantity(-1, resource.DecimalSI), }, @@ -223,7 +232,7 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Placements[0].ScaleSettings.Metrics = []computev1alpha.MetricSpec{ { Resource: &computev1alpha.ResourceMetricSource{ - Name: "cpu", + Name: testCPUResource, Target: computev1alpha.MetricTarget{ AverageUtilization: proto.Int32(0), }, @@ -336,16 +345,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + 
Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Gi"), @@ -369,16 +378,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("1Pi"), @@ -402,16 +411,16 @@ func TestValidateWorkloads(t *testing.T) { w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments = append( w.Spec.Template.Spec.Runtime.VirtualMachine.VolumeAttachments, computev1alpha.VolumeAttachment{ - Name: "vol", + Name: testVolName, }, ) w.Spec.Template.Spec.Volumes = append(w.Spec.Template.Spec.Volumes, computev1alpha.InstanceVolume{ - Name: "vol", + Name: testVolName, VolumeSource: computev1alpha.VolumeSource{ Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10.5Gi"), @@ -436,7 +445,7 @@ func 
TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -473,7 +482,7 @@ func TestValidateWorkloads(t *testing.T) { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Resources: &computev1alpha.DiskResourceRequirements{ Requests: k8scorev1.ResourceList{ k8scorev1.ResourceStorage: resource.MustParse("10Gi"), @@ -490,11 +499,11 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Runtime.Sandbox.Containers[0].VolumeAttachments = []computev1alpha.VolumeAttachment{ { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, MountPath: proto.String("/mount1"), }, { @@ -503,7 +512,7 @@ func TestValidateWorkloads(t *testing.T) { } w.Spec.Template.Spec.Volumes = []computev1alpha.InstanceVolume{ { - Name: "duplicate-mount-path", + Name: testDuplicateMountPath, VolumeSource: volumeSource, }, } @@ -540,7 +549,7 @@ func TestValidateWorkloads(t *testing.T) { interceptorFuncs: &interceptor.Funcs{ Create: func(ctx context.Context, client client.WithWatch, obj client.Object, opts ...client.CreateOption) error { if sar, ok := obj.(*authorizationv1.SubjectAccessReview); ok { - if sar.Spec.ResourceAttributes.Name == "default" && + if sar.Spec.ResourceAttributes.Name == testDefaultNamespace && sar.Spec.ResourceAttributes.Group == networkingv1alpha.GroupVersion.Group && sar.Spec.ResourceAttributes.Version == networkingv1alpha.GroupVersion.Version && sar.Spec.ResourceAttributes.Resource == "networks" { @@ -559,8 
+568,8 @@ func TestValidateWorkloads(t *testing.T) { initObjs := []client.Object{ &networkingv1alpha.Network{ ObjectMeta: metav1.ObjectMeta{ - Namespace: "default", - Name: "default", + Namespace: testDefaultNamespace, + Name: testDefaultNamespace, }, }, } @@ -606,7 +615,7 @@ func TestValidateWorkloads(t *testing.T) { ) if len(scenario.opts.ValidCityCodes) == 0 { - scenario.opts.ValidCityCodes = []string{"DFW"} + scenario.opts.ValidCityCodes = []string{testCityCodeDFW} } t.Run(name, func(t *testing.T) { @@ -645,7 +654,7 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, Sandbox: &computev1alpha.SandboxRuntime{ Containers: []computev1alpha.SandboxContainer{ @@ -661,7 +670,7 @@ func MakeSandboxWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, @@ -702,7 +711,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { }, Runtime: computev1alpha.InstanceRuntimeSpec{ Resources: computev1alpha.InstanceRuntimeResources{ - InstanceType: "datumcloud/d1-standard-2", + InstanceType: defaultInstanceType, }, VirtualMachine: &computev1alpha.VirtualMachineRuntime{ VolumeAttachments: []computev1alpha.VolumeAttachment{ @@ -719,10 +728,10 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Disk: &computev1alpha.DiskTemplateVolumeSource{ Template: &computev1alpha.DiskTemplateVolumeSourceTemplate{ Spec: computev1alpha.DiskSpec{ - Type: "pd-standard", + Type: diskTypePDStandard, Populator: &computev1alpha.DiskPopulator{ Image: &computev1alpha.ImageDiskPopulator{ - Name: "datumcloud/ubuntu-2204-lts", + 
Name: defaultImageName, }, }, }, @@ -736,7 +745,7 @@ func MakeVMWorkload(name string, tweaks ...Tweak) *computev1alpha.Workload { Placements: []computev1alpha.WorkloadPlacement{ { Name: "placement1", - CityCodes: []string{"DFW"}, + CityCodes: []string{testCityCodeDFW}, ScaleSettings: computev1alpha.HorizontalScaleSettings{ MinReplicas: 1, }, diff --git a/internal/webhook/v1alpha/workload_webhook.go b/internal/webhook/v1alpha/workload_webhook.go index e3f3735c..a8b94b38 100644 --- a/internal/webhook/v1alpha/workload_webhook.go +++ b/internal/webhook/v1alpha/workload_webhook.go @@ -6,12 +6,12 @@ import ( "fmt" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" computev1alpha "go.datum.net/compute/api/v1alpha" "go.datum.net/compute/internal/validation" @@ -27,8 +27,7 @@ func SetupWorkloadWebhookWithManager(mgr mcmanager.Manager) error { mgr: mgr, } - return ctrl.NewWebhookManagedBy(mgr.GetLocalManager()). - For(&computev1alpha.Workload{}). + return ctrl.NewWebhookManagedBy(mgr.GetLocalManager(), &computev1alpha.Workload{}). WithDefaulter(webhook). WithValidator(webhook). 
Complete() @@ -40,17 +39,11 @@ type workloadWebhook struct { mgr mcmanager.Manager } -var _ admission.CustomDefaulter = &workloadWebhook{} -var _ admission.CustomValidator = &workloadWebhook{} - -// Default implements webhook.Defaulter so a webhook will be registered for the type -func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return fmt.Errorf("unexpected type %T", obj) - } - _ = workload +var _ admission.Defaulter[*computev1alpha.Workload] = &workloadWebhook{} +var _ admission.Validator[*computev1alpha.Workload] = &workloadWebhook{} +// Default implements admission.Defaulter so a mutating webhook will be registered for the type. +func (r *workloadWebhook) Default(_ context.Context, _ *computev1alpha.Workload) error { // // TODO(jreese) review and test gateway defaulting / logic // if gw := workload.Spec.Gateway; gw != nil { // for i, tcpRoute := range gw.TCPRoutes { @@ -75,15 +68,10 @@ func (r *workloadWebhook) Default(ctx context.Context, obj runtime.Object) error // +kubebuilder:webhook:path=/validate-compute-datumapis-com-v1alpha-workload,mutating=false,failurePolicy=fail,sideEffects=None,groups=compute.datumapis.com,resources=workloads,verbs=create;update,versions=v1alpha,name=vworkload.kb.io,admissionReviewVersions=v1 -func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - +func (r *workloadWebhook) ValidateCreate(ctx context.Context, workload *computev1alpha.Workload) (admission.Warnings, error) { clusterName := computewebhook.ClusterNameFromContext(ctx) - cluster, err := r.mgr.GetCluster(ctx, clusterName) + cluster, err := r.mgr.GetCluster(ctx, multicluster.ClusterName(clusterName)) if err != nil { return nil, err } @@ -101,9 +89,9 @@ func (r *workloadWebhook) ValidateCreate(ctx 
context.Context, obj runtime.Object // that means for the scheduling phase, since there would not currently be // sufficient context to know who created the workload and what locations // are valid candidates based on that. Maybe an annotation, or spec field? - var locations networkingv1alpha.LocationList + var locations networkingv1alpha.LocationBindingList if err := clusterClient.List(ctx, &locations); err != nil { - return nil, fmt.Errorf("failed to list locations: %w", err) + return nil, fmt.Errorf("failed to list location bindings: %w", err) } validCityCodes := sets.Set[string]{} @@ -123,38 +111,18 @@ func (r *workloadWebhook) ValidateCreate(ctx context.Context, obj runtime.Object } if errs := validation.ValidateWorkloadCreate(workload, opts); len(errs) > 0 { - return nil, errors.NewInvalid(obj.GetObjectKind().GroupVersionKind().GroupKind(), workload.Name, errs) + return nil, errors.NewInvalid(workload.GroupVersionKind().GroupKind(), workload.Name, errs) } return nil, nil } -func (r *workloadWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { - oldworkload, ok := oldObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", oldObj) - } - - _ = oldworkload - - newworkload, ok := newObj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", newObj) - } - - _ = newworkload - +func (r *workloadWebhook) ValidateUpdate(_ context.Context, _, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object update. 
return nil, nil } -func (r *workloadWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - workload, ok := obj.(*computev1alpha.Workload) - if !ok { - return nil, fmt.Errorf("unexpected type %T", obj) - } - _ = workload - +func (r *workloadWebhook) ValidateDelete(_ context.Context, _ *computev1alpha.Workload) (admission.Warnings, error) { // TODO(user): fill in your validation logic upon object deletion. return nil, nil } diff --git a/test/e2e/chainsaw-config.yaml b/test/e2e/chainsaw-config.yaml new file mode 100644 index 00000000..cd3a9950 --- /dev/null +++ b/test/e2e/chainsaw-config.yaml @@ -0,0 +1,47 @@ +# Chainsaw global configuration for the compute federation e2e test suite. +# +# Prerequisites +# ───────────── +# Run `task e2e:up` to create the Kind clusters and populate kubeconfigs under +# tmp/e2e/kubeconfigs/ before running Chainsaw. +# +# Running +# ─────── +# From the repository root via Taskfile (recommended): +# +# task e2e:test +# +# Or directly: +# +# KUBECONFIG=tmp/e2e/kubeconfigs/control-plane.yaml \ +# chainsaw test --config test/e2e/chainsaw-config.yaml test/e2e/ +# +# The KUBECONFIG env var sets the "default" cluster (control-plane cell). +# Additional clusters (downstream, pop-dfw, pop-ord) are declared below and +# referenced by name in individual test steps via `cluster: downstream` etc. +# +# Kubeconfig paths below are relative to the working directory where Chainsaw is +# invoked (the project root), NOT relative to this config file's location. +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Configuration +metadata: + name: chainsaw +spec: + timeouts: + apply: 30s + assert: 60s + cleanup: 60s + delete: 30s + error: 30s + exec: 30s + clusters: + # Downstream control plane. WorkloadDeployments, PropagationPolicies, + # and Instance write-backs live here. 
+ downstream: + kubeconfig: tmp/e2e/kubeconfigs/downstream.yaml + # POP DFW cell — downstream member cluster labelled topology.datum.net/city-code=dfw. + pop-dfw: + kubeconfig: tmp/e2e/kubeconfigs/pop-dfw.yaml + # POP ORD cell — downstream member cluster labelled topology.datum.net/city-code=ord. + pop-ord: + kubeconfig: tmp/e2e/kubeconfigs/pop-ord.yaml diff --git a/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml new file mode 100644 index 00000000..aae65da1 --- /dev/null +++ b/test/e2e/deletion-cascade/assert-downstream-wd-exists.yaml @@ -0,0 +1,7 @@ +# Assert the WorkloadDeployment is present in the Karmada API server. +# Used both to confirm federation succeeded and as the target for the error: check. +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-cascade-wd diff --git a/test/e2e/deletion-cascade/chainsaw-test.yaml b/test/e2e/deletion-cascade/chainsaw-test.yaml new file mode 100644 index 00000000..03a11ea0 --- /dev/null +++ b/test/e2e/deletion-cascade/chainsaw-test.yaml @@ -0,0 +1,79 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: deletion-cascade +spec: + description: | + Verifies that deleting a WorkloadDeployment from the project namespace causes + the federator to remove the corresponding WorkloadDeployment from Karmada. + + The WorkloadDeploymentFederator adds a finalizer + (compute.datumapis.com/federator) to every project WD it manages. When the + project WD is deleted: + 1. The finalizer's Finalize method runs (blocking deletion until complete). + 2. It deletes the Karmada-side WorkloadDeployment. + 3. It removes the PropagationPolicy if no other WDs for the city remain. + 4. It removes the finalizer, allowing the project WD to be garbage-collected. + + This test validates: project WD deletion → Karmada WD deletion. 
+ + template: true + + steps: + - name: create-wd + description: Create a WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-federation + description: Wait for the WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-cascade-wd + + - name: delete-wd + description: Delete the WorkloadDeployment from the control-plane cluster. + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: test-cascade-wd + + - name: assert-downstream-wd-deleted + description: Confirm the Karmada copy is removed by the finalizer. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($downstreamNS) + name: test-cascade-wd + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/deletion-cascade/workload-deployment.yaml b/test/e2e/deletion-cascade/workload-deployment.yaml new file mode 100644 index 00000000..39d68a1d --- /dev/null +++ b/test/e2e/deletion-cascade/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-cascade-wd +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/env/README.md b/test/e2e/env/README.md new file mode 100644 index 00000000..671e705d --- /dev/null +++ b/test/e2e/env/README.md @@ -0,0 +1,251 @@ +# Local Kind + Karmada e2e Environment + +This document describes the local multi-cluster environment used for end-to-end +testing of the compute federation layer. 
+ +--- + +## Prerequisites + +| Tool | Minimum version | Install | +|------|----------------|---------| +| [Docker Desktop](https://www.docker.com/products/docker-desktop/) | 4.x | required for Kind | +| [kind](https://kind.sigs.k8s.io/) | v0.23+ | `brew install kind` | +| [kubectl](https://kubernetes.io/docs/tasks/tools/) | v1.28+ | `brew install kubernetes-cli` | +| [helm](https://helm.sh/) | v3.14+ | `brew install helm` | +| [task](https://taskfile.dev/) | v3 | `brew install go-task` | +| Python 3 | 3.9+ | pre-installed on macOS | +| go | 1.24+ | `brew install go` | + +`karmadactl` is downloaded automatically by `task e2e:up` into `./bin/`. + +--- + +## Cluster Topology + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ compute-control-plane (Kind cluster) │ +│ │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ karmada-system namespace │ │ +│ │ Karmada API Server ←── https://localhost:32443 │ │ +│ │ Karmada Controller Manager │ │ +│ │ Karmada Scheduler │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +│ │ +│ compute operator (WorkloadReconciler, Federator, InstanceProjector)│ +└──────────────────────────┬──────────────────────────────────────────┘ + │ Karmada propagates WorkloadDeployments + ┌────────────────┴─────────────────┐ + │ │ +┌─────────▼──────────┐ ┌──────────▼─────────┐ +│ compute-pop-dfw │ │ compute-pop-ord │ +│ (Kind cluster) │ │ (Kind cluster) │ +│ │ │ │ +│ city-code=dfw │ │ city-code=ord │ +│ Compute CRDs │ │ Compute CRDs │ +│ NSO CRDs │ │ NSO CRDs │ +└────────────────────┘ └────────────────────┘ +``` + +### What lives where + +| Resource | Cluster | +|----------|---------| +| `Workload`, `WorkloadDeployment` (consumer-facing) | Control Plane Cell | +| `WorkloadDeployment` (federation intent), `PropagationPolicy` | Karmada API Server | +| `WorkloadDeployment` (propagated), `Instance`, `NetworkBinding`, `SubnetClaim` | POP cells | +| `Instance` (write-back for 
visibility) | Karmada API Server | + +--- + +## Running the environment + +### Start + +```bash +task e2e:up +``` + +This is fully idempotent — running it twice will not fail. + +What it does, in order: + +1. Downloads `karmadactl v1.16.0` into `./bin/` (once). +2. Adds the `karmada-charts` Helm repository. +3. Creates Kind clusters `compute-control-plane`, `compute-pop-dfw`, + `compute-pop-ord` (skips any that already exist). +4. Exports kubeconfigs to `./tmp/e2e/kubeconfigs/`. +5. Installs Karmada v1.16.0 via the `karmada-charts/karmada` Helm chart into + `compute-control-plane`, with the API server exposed on NodePort 32443. +6. Registers `compute-pop-dfw` and `compute-pop-ord` as member clusters and + labels each with `topology.datum.net/city-code`. +7. Installs compute CRDs to all clusters and the Karmada API server. +8. Installs NSO CRDs to the POP cell clusters. + +### Stop + +```bash +task e2e:down +``` + +Deletes all three Kind clusters and removes `./tmp/e2e/`. + +--- + +## Kubeconfigs + +After `task e2e:up`: + +| File | Cluster | Use for | +|------|---------|---------| +| `tmp/e2e/kubeconfigs/control-plane.yaml` | `compute-control-plane` | kubectl, deploying the compute operator | +| `tmp/e2e/kubeconfigs/karmada.yaml` | Karmada API server | kubectl, karmadactl | +| `tmp/e2e/kubeconfigs/pop-dfw.yaml` | `compute-pop-dfw` | kubectl, inspecting POP cell state | +| `tmp/e2e/kubeconfigs/pop-ord.yaml` | `compute-pop-ord` | kubectl, inspecting POP cell state | + +The `-internal.yaml` variants use the Kind container's Docker bridge IP and are +intended for the Karmada controller running inside Docker — not for direct +developer use. + +### Quick check + +```bash +# Verify cluster list in Karmada +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get clusters + +# Expected output: +# NAME READY AGE +# compute-pop-dfw True ... +# compute-pop-ord True ... 
+ +# Verify city-code labels +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get clusters -L topology.datum.net/city-code +``` + +--- + +## Using the environment from e2e tests + +Import `go.datum.net/compute/test/e2e/env` in your test suite: + +```go +package myfeature_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + computev1alpha1 "go.datum.net/compute/api/v1alpha1" + + "go.datum.net/compute/test/e2e/env" +) + +var testEnv *env.Environment + +func TestMyFeature(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "MyFeature Suite") +} + +var _ = BeforeSuite(func() { + scheme := runtime.NewScheme() + Expect(corev1.AddToScheme(scheme)).To(Succeed()) + Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) + + var err error + testEnv, err = env.New(scheme) + Expect(err).NotTo(HaveOccurred()) +}) + +var _ = It("creates a workload and propagates it", func() { + // Control plane cluster client + cpClient := testEnv.ControlPlane.Client + + // Karmada API server client + karmadaClient := testEnv.Downstream.Client + + // POP DFW cluster client + dfwCell, err := testEnv.POPCell(env.CityCodeDFW) + Expect(err).NotTo(HaveOccurred()) + dfwClient := dfwCell.Client + + _ = cpClient + _ = karmadaClient + _ = dfwClient +}) +``` + +### Environment variable override + +Set `E2E_KUBECONFIG_DIR` to an absolute path to load kubeconfigs from a +different directory (useful in CI): + +```bash +E2E_KUBECONFIG_DIR=/path/to/kubeconfigs go test ./test/e2e/... +``` + +--- + +## Networking notes (macOS) + +On macOS with Docker Desktop, Kind clusters run as Docker containers.
The +container-to-container networking works as follows: + +| From | To | Address used | +|------|----|--------------| +| macOS host | Any Kind cluster API server | `localhost:<port>` | +| macOS host | Karmada API server | `https://localhost:32443` (NodePort) | +| Karmada controller (in Docker) | POP cell API servers | Docker bridge IP (`172.18.x.x:6443`) | + +The `-internal.yaml` kubeconfig variants use Docker bridge IPs with +`insecure-skip-tls-verify: true` because the node certificates do not include +bridge IPs in their SANs. This is acceptable for a local dev environment. + +--- + +## Troubleshooting + +### Karmada API server not reachable + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml get ns +``` + +If this times out, check: +1. The Kind cluster is running: `kind get clusters` +2. Port 32443 is mapped: `docker port compute-control-plane-control-plane` +3. The karmada-apiserver pod is running: + ```bash + kubectl --kubeconfig tmp/e2e/kubeconfigs/control-plane.yaml \ + get pods -n karmada-system + ``` + +### POP cluster shows NotReady in Karmada + +The Karmada controller manager uses the Docker bridge IP kubeconfig to reach +POP cells. Check: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + describe cluster compute-pop-dfw +``` + +Then verify the cluster secret contains the expected Docker IP: + +```bash +kubectl --kubeconfig tmp/e2e/kubeconfigs/karmada.yaml \ + get secret -n karmada-system | grep pop-dfw +``` + +### Start fresh + +```bash +task e2e:down && task e2e:up +``` diff --git a/test/e2e/env/env.go b/test/e2e/env/env.go new file mode 100644 index 00000000..7d2c59c6 --- /dev/null +++ b/test/e2e/env/env.go @@ -0,0 +1,233 @@ +// Package env provides helpers for connecting to the local Kind e2e environment +// created by "task e2e:up".
+// +// # Environment layout +// +// The environment consists of three Kind clusters and one downstream API server: +// +// - Control plane cell — hosts the compute operator (WorkloadReconciler, +// WorkloadDeploymentFederator, InstanceProjector). +// - Downstream control plane — the federation API server; WorkloadDeployments +// are written here so they can be propagated to POP cells. +// - POP DFW (compute-pop-dfw) — member cluster labelled city-code=dfw. +// - POP ORD (compute-pop-ord) — member cluster labelled city-code=ord. +// +// # Kubeconfig resolution +// +// Kubeconfigs are read from the directory at [DefaultKubeconfigDir] (relative +// to the repository root), unless overridden via the [EnvKubeconfigDir] +// environment variable. +// +// Expected files inside that directory: +// +// control-plane.yaml — management / control-plane cell +// downstream.yaml — downstream federation API server (https://localhost:32443) +// pop-dfw.yaml — POP DFW cell (standard Kind localhost-based kubeconfig) +// pop-ord.yaml — POP ORD cell (standard Kind localhost-based kubeconfig) +// +// # Typical usage in a Ginkgo suite +// +// var ( +// testEnv *env.Environment +// ) +// +// var _ = BeforeSuite(func() { +// scheme := runtime.NewScheme() +// Expect(computev1alpha1.AddToScheme(scheme)).To(Succeed()) +// Expect(corev1.AddToScheme(scheme)).To(Succeed()) +// +// var err error +// testEnv, err = env.New(scheme) +// Expect(err).NotTo(HaveOccurred()) +// }) +package env + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + + k8sruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +// Environment variable name that overrides the kubeconfig directory. +const EnvKubeconfigDir = "E2E_KUBECONFIG_DIR" + +// DefaultKubeconfigDir is the kubeconfig directory used when [EnvKubeconfigDir] +// is not set. 
It is resolved relative to the repository root (three directories +// above this source file). +const DefaultKubeconfigDir = "tmp/e2e/kubeconfigs" + +// City codes for the two POP cells created by "task e2e:up". +const ( + CityCodeDFW = "dfw" + CityCodeORD = "ord" +) + +// Environment holds a [ClusterAccess] for each cluster in the local e2e +// environment. All fields are populated by [New]; none are nil on success. +type Environment struct { + // ControlPlane is the management / control-plane cell cluster. + // The compute operator runs here (WorkloadReconciler, + // WorkloadDeploymentFederator, InstanceProjector). + ControlPlane *ClusterAccess + + // Downstream is the downstream control plane. + // WorkloadDeployments and PropagationPolicies live here. + Downstream *ClusterAccess + + // POPCells maps city-code strings (e.g. "dfw", "ord") to the + // corresponding POP cell cluster. Use [Environment.POPCell] for + // safe, error-returning access. + POPCells map[string]*ClusterAccess +} + +// ClusterAccess bundles a REST config and a controller-runtime Client for a +// single cluster. +type ClusterAccess struct { + // Config is the REST config used to build the client. + Config *rest.Config + + // Client is a controller-runtime client scoped to this cluster. + // The client is built with the scheme supplied to [New]. + Client ctrlclient.Client +} + +// New creates an [Environment] by loading kubeconfigs from the configured +// directory and building a controller-runtime client for each cluster using +// the provided scheme. +// +// The scheme should have all relevant types registered before calling New; +// for example compute types, networking types, and core Kubernetes types. 
+func New(scheme *k8sruntime.Scheme) (*Environment, error) { + dir := kubeconfigDir() + + controlPlane, err := loadCluster(filepath.Join(dir, "control-plane.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("control-plane cluster: %w", err) + } + + downstream, err := loadCluster(filepath.Join(dir, "downstream.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("downstream control plane: %w", err) + } + + popDFW, err := loadCluster(filepath.Join(dir, "pop-dfw.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP DFW cluster: %w", err) + } + + popORD, err := loadCluster(filepath.Join(dir, "pop-ord.yaml"), scheme) + if err != nil { + return nil, fmt.Errorf("POP ORD cluster: %w", err) + } + + return &Environment{ + ControlPlane: controlPlane, + Downstream: downstream, + POPCells: map[string]*ClusterAccess{ + CityCodeDFW: popDFW, + CityCodeORD: popORD, + }, + }, nil +} + +// POPCell returns the [ClusterAccess] for the POP cell with the given city +// code. It returns an error if no POP cell is registered for that code. +func (e *Environment) POPCell(cityCode string) (*ClusterAccess, error) { + ca, ok := e.POPCells[cityCode] + if !ok { + known := make([]string, 0, len(e.POPCells)) + for k := range e.POPCells { + known = append(known, k) + } + return nil, fmt.Errorf("no POP cell registered for city code %q (known: %v)", cityCode, known) + } + return ca, nil +} + +// MustPOPCell is like [Environment.POPCell] but panics on error. +// Useful in test setup where a missing POP cell is always a fatal misconfiguration. +func (e *Environment) MustPOPCell(cityCode string) *ClusterAccess { + ca, err := e.POPCell(cityCode) + if err != nil { + panic(err) + } + return ca +} + +// RESTConfigFor is a convenience function that returns a [rest.Config] for the +// named cluster without constructing a client. Useful when the caller needs to +// build a typed clientset directly. 
+func RESTConfigFor(kubeconfigPath string) (*rest.Config, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + return cfg, nil +} + +// KubeconfigPath returns the absolute path to the kubeconfig file for the +// named cluster. name must be one of "control-plane", "downstream", "pop-dfw", +// or "pop-ord". +func KubeconfigPath(name string) string { + return filepath.Join(kubeconfigDir(), name+".yaml") +} + +// ─── internal helpers ──────────────────────────────────────────────────────── + +func loadCluster(kubeconfigPath string, scheme *k8sruntime.Scheme) (*ClusterAccess, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("building REST config from %s: %w", kubeconfigPath, err) + } + + c, err := ctrlclient.New(cfg, ctrlclient.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("building client from %s: %w", kubeconfigPath, err) + } + + return &ClusterAccess{ + Config: cfg, + Client: c, + }, nil +} + +// kubeconfigDir returns the directory containing e2e kubeconfigs. +// It honours the E2E_KUBECONFIG_DIR environment variable, otherwise falls +// back to <repo-root>/tmp/e2e/kubeconfigs. +func kubeconfigDir() string { + if dir := os.Getenv(EnvKubeconfigDir); dir != "" { + return dir + } + return filepath.Join(repoRoot(), DefaultKubeconfigDir) +} + +// repoRoot walks up from this source file to find the repository root +// (identified by the presence of go.mod). +func repoRoot() string { + // Use the file path of this source file as a starting point so the helper + // works regardless of the caller's working directory. + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + // Fallback: assume tests are run from the repo root. + return "."
+ } + + dir := filepath.Dir(thisFile) + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + // Reached filesystem root without finding go.mod. + return "." + } + dir = parent + } +} diff --git a/test/e2e/full-federation/chainsaw-test.yaml b/test/e2e/full-federation/chainsaw-test.yaml new file mode 100644 index 00000000..020a2bc9 --- /dev/null +++ b/test/e2e/full-federation/chainsaw-test.yaml @@ -0,0 +1,150 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: full-federation +spec: + description: | + End-to-end federation chain test. + + Exercises the complete path from WorkloadDeployment creation through to + Instance projection on the control-plane cluster: + + 1. Create WorkloadDeployment on control-plane. + 2. WorkloadDeploymentFederator replicates it to Karmada (ns-<uid> namespace). + 3. Karmada PropagationPolicy routes the WD to pop-dfw. + 4. WorkloadDeploymentReconciler on pop-dfw creates Instance test-full-fed-wd-0. + 5. InstanceReconciler on pop-dfw writes Instance back to Karmada with + label meta.datumapis.com/upstream-cluster-name: cluster-single. + 6. InstanceProjector on control-plane creates a projection of the Instance + in the project namespace. + + Prerequisites: both operator instances must be running (task e2e:operator:start). + + template: true + + steps: + - name: create-workload-deployment + description: Create the WorkloadDeployment on the control-plane cluster. + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeploymentFederator replicated the WD to Karmada and status is synced back.
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + - assert: + # Wait for the cell operator to write status back to the Karmada WD. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-wd-on-pop-dfw + description: Assert Karmada propagated the WD to pop-dfw and the cell reconciler set status. + cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + # Karmada propagation can take longer than a local apply. + timeout: 60s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd + status: + replicas: 1 + desiredReplicas: 1 + + - name: assert-instance-on-pop-dfw + description: Assert WorkloadDeploymentReconciler created an Instance on pop-dfw with a Ready condition. 
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" + + - name: assert-instance-writeback-in-downstream + description: Assert InstanceReconciler wrote the Instance back to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($downstreamNS) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + + - name: assert-instance-projected-to-control-plane + description: Assert InstanceProjector created a projection with status on the control-plane. 
+ try: + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($namespace) + name: test-full-fed-wd-0 + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + (status.conditions[?type == 'Ready'] | [0]): + status: "Unknown" diff --git a/test/e2e/full-federation/workload-deployment.yaml b/test/e2e/full-federation/workload-deployment.yaml new file mode 100644 index 00000000..70b4cb94 --- /dev/null +++ b/test/e2e/full-federation/workload-deployment.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-full-fed-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/instance-projection/assert-downstream-wd.yaml b/test/e2e/instance-projection/assert-downstream-wd.yaml new file mode 100644 index 00000000..705d0893 --- /dev/null +++ b/test/e2e/instance-projection/assert-downstream-wd.yaml @@ -0,0 +1,6 @@ +# Assert the WorkloadDeployment is federated to Karmada (and the Karmada namespace created). +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-projector-wd diff --git a/test/e2e/instance-projection/assert-projected-instance.yaml b/test/e2e/instance-projection/assert-projected-instance.yaml new file mode 100644 index 00000000..0542194d --- /dev/null +++ b/test/e2e/instance-projection/assert-projected-instance.yaml @@ -0,0 +1,19 @@ +# Assert the InstanceProjector created a projection in the project namespace. 
+# +# The InstanceProjector (internal/controller/instance_projector.go): +# - Watches Instances in Karmada that carry upstreamClusterNameLabel +# - Strips "cluster-" prefix to get the cluster name ("single" in single-provider mode) +# - Finds the project namespace by matching ns-<uid> to namespace UIDs +# - Creates/updates the Instance projection in the project namespace +# - Sets an owner reference to the WorkloadDeployment for cascading deletion +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + # namespace is the Chainsaw test namespace (the project namespace on control-plane) + name: test-projected-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + ownerReferences: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + name: test-projector-wd diff --git a/test/e2e/instance-projection/chainsaw-test.yaml b/test/e2e/instance-projection/chainsaw-test.yaml new file mode 100644 index 00000000..16fa9f96 --- /dev/null +++ b/test/e2e/instance-projection/chainsaw-test.yaml @@ -0,0 +1,123 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-projection +spec: + description: | + Verifies that the InstanceProjector watches Instances written back to the + Karmada API server and creates corresponding read-only projections in the + project namespace on the control-plane cluster. + + Flow: + 1. Create a WorkloadDeployment → triggers federator → Karmada namespace created. + 2. Write an Instance to Karmada (simulating a POP-cell InstanceReconciler write-back). + 3. InstanceProjector detects the Karmada Instance and creates a projection in the + project namespace (the Chainsaw test namespace on the control-plane cluster). + 4. Assert the projection exists with the upstream tracking label and an owner + reference to the WorkloadDeployment (for cascading deletion).
+ + Cluster name label: "cluster-single" + The compute operator runs in single-provider mode for this e2e environment, + registering the control-plane cluster with the multicluster-runtime manager + under the name "single" (see cmd/main.go, wrappedSingleClusterProvider). + + template: true + + steps: + - name: create-wd + description: Create the WorkloadDeployment to trigger federation and namespace creation. + try: + - apply: + file: workload-deployment.yaml + + - name: wait-for-downstream-namespace + description: Wait for the federated WorkloadDeployment to appear in Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-projector-wd + + - name: write-instance-to-downstream + description: | + Write an Instance to Karmada simulating InstanceReconciler write-back. + Uses explicit control-plane kubeconfig to derive downstreamNS and WD UID. 
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get workloaddeployment test-projector-wd \ + --namespace "$NAMESPACE" \ + -o jsonpath='{.metadata.uid}' + outputs: + - name: wdUID + value: ($stdout) + - script: + env: + - name: KARMADA_NS + value: ($downstreamNS) + - name: WD_UID + value: ($wdUID) + content: | + kubectl apply -f - < is the multicluster-runtime cluster name registered by +# wrappedSingleClusterProvider (always "single" in single-cluster mode) +# - Label meta.datumapis.com/upstream-namespace = the POP-cell namespace +apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/chainsaw-test.yaml b/test/e2e/instance-writeback/chainsaw-test.yaml new file mode 100644 index 00000000..32dbbc5d --- /dev/null +++ b/test/e2e/instance-writeback/chainsaw-test.yaml @@ -0,0 +1,112 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: instance-writeback +spec: + description: | + Verifies that the InstanceReconciler running in a POP-cell cluster writes + Instance objects back to the Karmada API server after reconciling the Ready + condition for the first time. + + Write-back convention (internal/controller/instance_controller.go): + - The Instance is written to Karmada at the same namespace/name as the POP-cell Instance. + - Label meta.datumapis.com/upstream-cluster-name is set to + "cluster-<cluster-name>" (e.g. "cluster-compute-pop-dfw"; this test's assertion expects "cluster-single"). + - Label meta.datumapis.com/upstream-namespace records the originating namespace.
+ + Note: this test requires the compute operator (InstanceReconciler) to be running + in the DFW POP cell cluster. + + template: true + + steps: + - name: setup-namespaces + description: Create the Instance namespace in the DFW POP cell and Karmada. + try: + - script: + content: | + kubectl get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml apply -f - + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + create namespace "$INSTANCE_NS" \ + --dry-run=client -o yaml | \ + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml apply -f - + cleanup: + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/pop-dfw.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + - script: + env: + - name: INSTANCE_NS + value: ($instanceNS) + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/downstream.yaml \ + delete namespace "$INSTANCE_NS" --ignore-not-found + + - name: create-instance-on-pop-dfw + description: Create the Instance on the DFW POP cell cluster.
+ cluster: pop-dfw + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - apply: + file: instance-pop-dfw.yaml + cleanup: + - script: + content: | + INSTANCE_NS=$(kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}') + kubectl delete instance test-writeback-instance \ + --namespace "$INSTANCE_NS" --ignore-not-found + + - name: assert-instance-in-downstream + description: Wait for the InstanceReconciler to write back the Instance to Karmada. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: instanceNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: Instance + metadata: + namespace: ($instanceNS) + name: test-writeback-instance + labels: + meta.datumapis.com/upstream-cluster-name: cluster-single + meta.datumapis.com/upstream-namespace: ($instanceNS) diff --git a/test/e2e/instance-writeback/instance-pop-dfw.yaml b/test/e2e/instance-writeback/instance-pop-dfw.yaml new file mode 100644 index 00000000..250eb7d7 --- /dev/null +++ b/test/e2e/instance-writeback/instance-pop-dfw.yaml @@ -0,0 +1,15 @@ +# Instance created in the DFW POP cell. +# ($instanceNS) is the namespace derived from the Chainsaw test namespace UID, +# matching the ns-<uid> convention so the InstanceProjector can resolve it later.
+apiVersion: compute.datumapis.com/v1alpha +kind: Instance +metadata: + name: test-writeback-instance + namespace: ($instanceNS) +spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network diff --git a/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml new file mode 100644 index 00000000..77a817a5 --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/assert-pp-exists.yaml @@ -0,0 +1,6 @@ +# Asserts that the PropagationPolicy for city dfw exists in the Karmada namespace. NOTE(review): the steps in this directory's chainsaw-test.yaml assert the policy name city-dfw inline and do not reference this file; confirm whether workload-deployments-dfw is still the current name. +apiVersion: policy.karmada.io/v1alpha1 +kind: PropagationPolicy +metadata: + namespace: ($downstreamNS) + name: workload-deployments-dfw diff --git a/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml new file mode 100644 index 00000000..5678c398 --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/chainsaw-test.yaml @@ -0,0 +1,133 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: propagation-policy-lifecycle +spec: + description: | + Verifies the PropagationPolicy lifecycle managed by the WorkloadDeploymentFederator: + + - A PropagationPolicy (city-dfw) is lazily created when the first WorkloadDeployment + for city code "dfw" is federated to Karmada. + - The PropagationPolicy is RETAINED while at least one WorkloadDeployment for + that city code remains in the Karmada namespace. + - The PropagationPolicy is DELETED when the last deployment for the city is removed. + + The test creates two WDs (wd-alpha, wd-beta) both targeting cityCode=dfw, verifies + the PP appears, deletes wd-alpha and asserts the PP is still present, then deletes + wd-beta and waits for the PP to disappear. + + template: true + + steps: + - name: create-deployments + description: Create two WorkloadDeployments targeting dfw on the control-plane.
+ try: + - apply: + file: workload-deployment-alpha.yaml + - apply: + file: workload-deployment-beta.yaml + + - name: assert-policy-created + description: | + Assert both WDs are federated to Karmada and the PropagationPolicy exists. + Both WDs must be present in Karmada before proceeding to the deletion steps; + otherwise wd-alpha's finalizer could see an empty Karmada list and prematurely + delete the PP before wd-beta has been federated. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: wd-alpha + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: wd-beta + - assert: + timeout: 30s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + + - name: delete-alpha + description: Delete wd-alpha; wd-beta still targets dfw so the PP must be retained. + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: wd-alpha + + - name: assert-policy-retained + description: Assert the PropagationPolicy is still present after wd-alpha is deleted.
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - sleep: + duration: 8s + - assert: + timeout: 5s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + + - name: delete-beta + description: Delete wd-beta (the last WD for city dfw). + try: + - delete: + ref: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + namespace: ($namespace) + name: wd-beta + + - name: assert-policy-deleted + description: Wait for the PropagationPolicy to be removed once no WDs remain. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - wait: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + namespace: ($downstreamNS) + name: city-dfw + timeout: 30s + for: + deletion: {} diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml new file mode 100644 index 00000000..f9eb27fd --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-alpha.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: wd-alpha +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml
b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml new file mode 100644 index 00000000..fd1d65c1 --- /dev/null +++ b/test/e2e/propagation-policy-lifecycle/workload-deployment-beta.yaml @@ -0,0 +1,21 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: wd-beta +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: + name: test-network + + scaleSettings: + minReplicas: 1 diff --git a/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml new file mode 100644 index 00000000..98f8d0f1 --- /dev/null +++ b/test/e2e/workload-deployment-federation/assert-downstream-pp.yaml @@ -0,0 +1,20 @@ +# Assert the PropagationPolicy was created in the Karmada namespace. +# The name follows propagationPolicyNameFor("dfw") = "workload-deployments-dfw". NOTE(review): this directory's chainsaw-test.yaml asserts the name city-dfw inline and does not reference this file; confirm which name is current. +# ($downstreamNS) is substituted by Chainsaw's template engine. +apiVersion: policy.karmada.io/v1alpha1 +kind: PropagationPolicy +metadata: + namespace: ($downstreamNS) + name: workload-deployments-dfw +spec: + resourceSelectors: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw + placement: + clusterAffinity: + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml new file mode 100644 index 00000000..23c308ff --- /dev/null +++ b/test/e2e/workload-deployment-federation/assert-downstream-wd.yaml @@ -0,0 +1,9 @@ +# Assert the WorkloadDeployment exists in Karmada with the city-code label.
+# ($downstreamNS) is substituted by Chainsaw's template engine from the script binding. +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + namespace: ($downstreamNS) + name: test-federation-wd + labels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/chainsaw-test.yaml b/test/e2e/workload-deployment-federation/chainsaw-test.yaml new file mode 100644 index 00000000..302d89c4 --- /dev/null +++ b/test/e2e/workload-deployment-federation/chainsaw-test.yaml @@ -0,0 +1,84 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: workload-deployment-federation +spec: + description: | + Verifies that the WorkloadDeploymentFederator replicates a WorkloadDeployment + from the project namespace (control-plane cluster) to the Karmada API server + with the correct city-code label and PropagationPolicy. + + The federator follows the ns-<uid> convention for Karmada namespaces, + matching the MappedNamespaceResourceStrategy used by NSO. The test derives + the expected Karmada namespace dynamically from the Chainsaw test namespace UID. + + Verified: + - WorkloadDeployment exists in Karmada at ns-<uid> + - Karmada copy carries label topology.datum.net/city-code: dfw + - PropagationPolicy city-dfw exists in the Karmada namespace, + selecting WDs by city-code and routing them to matching POP-cell clusters. + + template: true + + steps: + - name: derive-ns-and-create-wd + description: Derive Karmada namespace and create the WorkloadDeployment. + try: + - apply: + file: workload-deployment.yaml + + - name: assert-wd-in-downstream + description: Assert WorkloadDeployment federated to Karmada with city-code label.
+ cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + metadata: + namespace: ($downstreamNS) + name: test-federation-wd + labels: + topology.datum.net/city-code: dfw + + - name: assert-propagation-policy-in-downstream + description: Assert PropagationPolicy created for city-dfw. + cluster: downstream + try: + - script: + content: | + kubectl --kubeconfig=../../../tmp/e2e/kubeconfigs/control-plane.yaml \ + get namespace "$NAMESPACE" \ + -o template='{{printf "ns-%s" .metadata.uid}}' + outputs: + - name: downstreamNS + value: ($stdout) + - assert: + timeout: 30s + resource: + apiVersion: policy.karmada.io/v1alpha1 + kind: PropagationPolicy + metadata: + namespace: ($downstreamNS) + name: city-dfw + spec: + resourceSelectors: + - apiVersion: compute.datumapis.com/v1alpha + kind: WorkloadDeployment + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw + placement: + clusterAffinity: + labelSelector: + matchLabels: + topology.datum.net/city-code: dfw diff --git a/test/e2e/workload-deployment-federation/workload-deployment.yaml b/test/e2e/workload-deployment-federation/workload-deployment.yaml new file mode 100644 index 00000000..0cd2347a --- /dev/null +++ b/test/e2e/workload-deployment-federation/workload-deployment.yaml @@ -0,0 +1,22 @@ +apiVersion: compute.datumapis.com/v1alpha +kind: WorkloadDeployment +metadata: + name: test-federation-wd + # namespace is injected by Chainsaw from ($namespace) +spec: + cityCode: dfw + placementName: default + workloadRef: + name: test-workload + uid: "00000000-0000-0000-0000-000000000001" + template: + spec: + runtime: + resources: + instanceType: datumcloud/d1-standard-2 + networkInterfaces: + - network: +
name: test-network + + scaleSettings: + minReplicas: 1