diff --git a/Makefile b/Makefile index 5e7c55d..a7b1e19 100644 --- a/Makefile +++ b/Makefile @@ -62,7 +62,39 @@ vet: ## Run go vet against code. .PHONY: test test: manifests generate fmt vet envtest ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./... -coverprofile cover.out + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /test/e2e) -coverprofile cover.out + +##@ E2E Testing + +KIND_CLUSTER_NAME ?= elx-nodegroup-e2e +# E2E_IMG is the controller image to load into the kind cluster. +# Override if you want to use a pre-built image: make test-e2e E2E_IMG=myregistry/myimage:tag +E2E_IMG ?= elx-nodegroup-controller:e2e + +.PHONY: kind-create +kind-create: ## Create a kind cluster for e2e testing (requires kind: https://kind.sigs.k8s.io) + kind create cluster --name $(KIND_CLUSTER_NAME) --config test/e2e/kind-config.yaml + +.PHONY: kind-delete +kind-delete: ## Delete the e2e kind cluster + kind delete cluster --name $(KIND_CLUSTER_NAME) + +.PHONY: kind-load-and-deploy +kind-load-and-deploy: docker-build ## Build the controller image, load it into the kind cluster, and deploy + $(CONTAINER_TOOL) tag $(IMG) $(E2E_IMG) + kind load docker-image $(E2E_IMG) --name $(KIND_CLUSTER_NAME) + $(KUSTOMIZE) build config/crd | kubectl --context kind-$(KIND_CLUSTER_NAME) apply -f - + cd config/manager && $(KUSTOMIZE) edit set image controller=$(E2E_IMG) + $(KUSTOMIZE) build config/default | kubectl --context kind-$(KIND_CLUSTER_NAME) apply -f - + kubectl --context kind-$(KIND_CLUSTER_NAME) -n elx-nodegroup-controller-system \ + wait --for=condition=available deployment/controller-manager --timeout=120s + +.PHONY: test-e2e +test-e2e: ## Run e2e tests against the cluster referenced by KUBECONFIG (controller must be running) + go test ./test/e2e/... -v -timeout 5m + +.PHONY: e2e-full +e2e-full: kind-create kind-load-and-deploy test-e2e kind-delete ## Full e2e lifecycle: create cluster, deploy, test, destroy ##@ Build diff --git a/README.md b/README.md index ddd40c0..d3075e0 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,243 @@ # elx-nodegroup-controller -A tiny controller for persisting labels and taints on a list of nodes. +A Kubernetes controller that automatically applies and persists labels and taints across groups of nodes. Define a `NodeGroup` resource and the controller will ensure all member nodes carry the specified labels and taints — even if they are manually removed or if nodes are replaced. +## Table of Contents -## Deploy controller and CRD +- [Overview](#overview) +- [How It Works](#how-it-works) +- [Installation](#installation) +- [Usage](#usage) + - [NodeGroup API Reference](#nodegroup-api-reference) + - [Examples](#examples) +- [Development](#development) +- [Architecture](#architecture) + +## Overview + +Managing labels and taints on Kubernetes nodes is often tedious and error-prone — especially in clusters with dynamic node provisioning where nodes come and go. The `elx-nodegroup-controller` solves this by introducing the `NodeGroup` custom resource, which acts as a declarative specification of which nodes should carry which labels and taints. + +Key features: + +- **Declarative**: Define the desired state once; the controller continuously reconciles it. +- **Persistent**: Labels and taints are automatically reapplied if manually removed. 
+- **Cleanup on deletion**: When a `NodeGroup` is deleted, the controller removes the labels and taints it previously applied. +- **Dynamic membership**: Nodes can be selected by exact name or by node group name patterns (useful for auto-scaled node groups). + +## How It Works + +1. You create a `NodeGroup` resource listing the nodes (by name or naming pattern) and the labels/taints you want applied. +2. The controller watches both `NodeGroup` and `Node` resources. +3. On each reconciliation loop, the controller ensures every member node has the specified labels and taints. +4. If a `NodeGroup` is deleted, the controller cleans up all labels and taints it originally applied via a finalizer before allowing the resource to be removed. + +## Installation + +### Prerequisites + +- Kubernetes cluster >= 1.24 +- `kubectl` configured to talk to your cluster +- `kustomize` (or use `kubectl apply -k`) + +### Deploy the Controller and CRD ```bash kustomize build config/default | kubectl apply -f - ``` -## Sample nodegroup manifest +This creates the following resources in the `elx-nodegroup-controller-system` namespace: + +| Resource | Name | +|----------|------| +| CRD | `nodegroups.k8s.elx.cloud` | +| Namespace | `elx-nodegroup-controller-system` | +| ServiceAccount | `controller-manager` | +| ClusterRole | `manager-role` | +| ClusterRoleBinding | `manager-rolebinding` | +| Deployment | `controller-manager` | + +### Uninstall + +```bash +kustomize build config/default | kubectl delete -f - +``` + +## Usage + +### NodeGroup API Reference + +``` +apiVersion: k8s.elx.cloud/v1alpha2 +kind: NodeGroup +``` + +`NodeGroup` is a cluster-scoped resource (no namespace required). + +#### Spec + +| Field | Type | Description | +|-------|------|-------------| +| `members` | `[]string` | Explicit list of Kubernetes node names to include in this group. | +| `nodeGroupNames` | `[]string` | Node naming patterns for dynamic membership. A node is included if any segment of its name (split on `-`) matches one of these values. | +| `labels` | `map[string]string` | Labels to apply to all member nodes. | +| `taints` | `[]corev1.Taint` | Taints to apply to all member nodes. Each taint has `key`, `value` (optional), and `effect` (`NoSchedule`, `PreferNoSchedule`, or `NoExecute`). | + +You can use `members`, `nodeGroupNames`, or both together — their results are combined. + +### Examples + +#### Apply labels and taints to specific nodes + +```yaml +apiVersion: k8s.elx.cloud/v1alpha2 +kind: NodeGroup +metadata: + name: compute-nodes +spec: + members: + - worker-node-1 + - worker-node-2 + labels: + workload-type: compute + environment: production + taints: + - key: workload-type + value: compute + effect: NoSchedule +``` + +#### Dynamic membership via node group name patterns + +Useful in clusters where auto-scaling creates nodes with predictable name prefixes. A node named `gpu-a100-abc123` would match the pattern `gpu` because `gpu` is one of the `-`-separated segments of the name. 
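+
+The matching rule is easy to express in code. A minimal sketch of that rule (hypothetical `matchesGroup` helper, not the controller's actual implementation):
+
+```go
+package main
+
+import (
+	"fmt"
+	"strings"
+)
+
+// matchesGroup reports whether nodeName contains group as one of its
+// "-"-separated segments.
+func matchesGroup(nodeName, group string) bool {
+	for _, segment := range strings.Split(nodeName, "-") {
+		if segment == group {
+			return true
+		}
+	}
+	return false
+}
+
+func main() {
+	fmt.Println(matchesGroup("gpu-a100-abc123", "gpu"))  // true
+	fmt.Println(matchesGroup("gpu2-a100-abc123", "gpu")) // false: "gpu2" is a different segment
+}
+```
+
+The manifest below applies this pattern: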
-```yml
+```yaml
 apiVersion: k8s.elx.cloud/v1alpha2
 kind: NodeGroup
 metadata:
-  name: nodegroup-sample
+  name: gpu-nodes
 spec:
-  members:
-  - node1 # Kubernetes node name
-  nodeGroupNames:
-  - node0 # Kubernetes nodegroup name, used for clusters with dynamic node naming
+  nodeGroupNames:
+  - gpu
   labels:
-    name: value
+    hardware: gpu
   taints:
-  - effect: "NoSchedule"
-    key: key
-    value: value
+  - key: nvidia.com/gpu
+    value: "true"
+    effect: NoSchedule
 ```
+
+#### Mixed: explicit members and dynamic patterns
+
+```yaml
+apiVersion: k8s.elx.cloud/v1alpha2
+kind: NodeGroup
+metadata:
+  name: spot-nodes
+spec:
+  members:
+  - specific-spot-node-1
+  nodeGroupNames:
+  - spot
+  labels:
+    capacity-type: spot
+  taints:
+  - key: spot-instance
+    value: "true"
+    effect: NoExecute
+```
+
+#### Common operations
+
+```bash
+# Create or update a NodeGroup
+kubectl apply -f nodegroup.yaml
+
+# List all NodeGroups in the cluster
+kubectl get nodegroups
+
+# Inspect a NodeGroup
+kubectl describe nodegroup gpu-nodes
+
+# Delete a NodeGroup (cleans up labels and taints from member nodes)
+kubectl delete nodegroup gpu-nodes
+```
+
+## Development
+
+### Prerequisites
+
+- Go 1.22+
+- Docker
+- `controller-gen`
+- `kustomize`
+- `envtest` (for running tests)
+
+### Build
+
+```bash
+make build
+```
+
+### Run tests
+
+```bash
+make test
+```
+
+### Generate CRD manifests and DeepCopy methods
+
+```bash
+make manifests
+make generate
+```
+
+### Build and push the container image
+
+```bash
+make docker-build docker-push IMG=<registry>/<image>:<tag>
+```
+
+### Deploy from a custom image
+
+```bash
+make deploy IMG=<registry>/<image>:<tag>
+```
+
+## Architecture
+
+The controller is built with [controller-runtime](https://github.com/kubernetes-sigs/controller-runtime) (Kubebuilder v3 layout).
+
+### Reconciliation flow
+
+```
+NodeGroup created/updated
+        │
+        ▼
+Build member list
+(spec.members + nodes matching spec.nodeGroupNames)
+        │
+        ▼
+NodeGroup being deleted?
+    │             │
+   Yes            No
+    │             │
+    ▼             ▼
+Remove         Add finalizer
+labels/taints  (if missing)
+from nodes        │
+    │             ▼
+    ▼          Apply labels to
+Remove         member nodes
+finalizer         │
+                  ▼
+               Apply taints to
+               member nodes
+```
+
+### Watching Nodes
+
+The controller also watches `Node` resources. When a node changes (e.g., it is recreated after being replaced by the autoscaler), the controller looks up which `NodeGroup`s list that node as a member and triggers reconciliation for each of them — ensuring labels and taints are immediately reapplied.
+
+### Finalizer
+
+The controller uses the `k8s.elx.cloud/finalizer` finalizer on each `NodeGroup`. This ensures the controller has the opportunity to clean up labels and taints from member nodes before Kubernetes removes the `NodeGroup` object.
diff --git a/controllers/helpers_test.go b/controllers/helpers_test.go
new file mode 100644
index 0000000..9f45bcf
--- /dev/null
+++ b/controllers/helpers_test.go
@@ -0,0 +1,391 @@
+/*
+Copyright 2022.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package controllers + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Unit tests for the pure helper functions in nodegroup_controller.go. +// These do not require a running Kubernetes API server. + +func TestStringIn(t *testing.T) { + tests := []struct { + name string + s string + a []string + expected bool + }{ + {"found at start", "a", []string{"a", "b", "c"}, true}, + {"found in middle", "b", []string{"a", "b", "c"}, true}, + {"found at end", "c", []string{"a", "b", "c"}, true}, + {"not found", "d", []string{"a", "b", "c"}, false}, + {"empty slice", "a", []string{}, false}, + {"empty string found", "", []string{"", "a"}, true}, + {"empty string not found", "", []string{"a", "b"}, false}, + {"single element match", "only", []string{"only"}, true}, + {"single element no match", "x", []string{"only"}, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := stringIn(tt.s, tt.a); got != tt.expected { + t.Errorf("stringIn(%q, %v) = %v, want %v", tt.s, tt.a, got, tt.expected) + } + }) + } +} + +func TestTaintPos(t *testing.T) { + taint1 := corev1.Taint{Key: "key1", Value: "val1", Effect: corev1.TaintEffectNoSchedule} + taint2 := corev1.Taint{Key: "key2", Value: "val2", Effect: corev1.TaintEffectNoExecute} + taint3 := corev1.Taint{Key: "key3", Value: "val3", Effect: corev1.TaintEffectPreferNoSchedule} + + tests := []struct { + name string + taint *corev1.Taint + taints []corev1.Taint + expected int + }{ + {"found at index 0", &taint1, []corev1.Taint{taint1, taint2}, 0}, + {"found at index 1", &taint2, []corev1.Taint{taint1, taint2}, 1}, + {"found at index 2", &taint3, []corev1.Taint{taint1, taint2, taint3}, 2}, + {"not found", &taint3, []corev1.Taint{taint1, taint2}, -1}, + {"empty taints list", &taint1, []corev1.Taint{}, -1}, + {"different effect, no match", &corev1.Taint{Key: "key1", Value: "val1", Effect: corev1.TaintEffectNoExecute}, []corev1.Taint{taint1}, -1}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := taintPos(tt.taint, tt.taints); got != tt.expected { + t.Errorf("taintPos() = %v, want %v", got, tt.expected) + } + }) + } +} + +func TestHasTaint(t *testing.T) { + present := corev1.Taint{Key: "k", Value: "v", Effect: corev1.TaintEffectNoSchedule} + absent := corev1.Taint{Key: "other", Value: "v", Effect: corev1.TaintEffectNoSchedule} + + nodeWith := &corev1.Node{ + Spec: corev1.NodeSpec{Taints: []corev1.Taint{present}}, + } + nodeWithout := &corev1.Node{} + + if !hasTaint(&present, nodeWith) { + t.Error("hasTaint returned false for a node that has the taint") + } + if hasTaint(&absent, nodeWith) { + t.Error("hasTaint returned true for a taint the node does not have") + } + if hasTaint(&present, nodeWithout) { + t.Error("hasTaint returned true for a node with no taints") + } +} + +func TestFinalizeNodeLabels(t *testing.T) { + tests := []struct { + name string + nodeLabels map[string]string + specLabels map[string]string + expectedUpdate bool + expectedLabels map[string]string + }{ + { + name: "removes label when value matches spec", + nodeLabels: map[string]string{"managed": "yes", "other": "kept"}, + specLabels: map[string]string{"managed": "yes"}, + expectedUpdate: true, + expectedLabels: map[string]string{"other": "kept"}, + }, + { + name: "does not remove label when value has been changed since NodeGroup applied it", + nodeLabels: map[string]string{"managed": "modified"}, + specLabels: map[string]string{"managed": "yes"}, + expectedUpdate: 
false, + expectedLabels: map[string]string{"managed": "modified"}, + }, + { + name: "does not remove label that is absent from node", + nodeLabels: map[string]string{"other": "value"}, + specLabels: map[string]string{"managed": "yes"}, + expectedUpdate: false, + expectedLabels: map[string]string{"other": "value"}, + }, + { + name: "removes multiple matching labels", + nodeLabels: map[string]string{"a": "1", "b": "2", "c": "3"}, + specLabels: map[string]string{"a": "1", "b": "2"}, + expectedUpdate: true, + expectedLabels: map[string]string{"c": "3"}, + }, + { + name: "handles nil node labels gracefully", + nodeLabels: nil, + specLabels: map[string]string{"managed": "yes"}, + expectedUpdate: false, + expectedLabels: nil, + }, + { + name: "handles empty spec labels", + nodeLabels: map[string]string{"key": "value"}, + specLabels: map[string]string{}, + expectedUpdate: false, + expectedLabels: map[string]string{"key": "value"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Labels: tt.nodeLabels}, + } + got := finalizeNodeLabels(node, tt.specLabels) + if got != tt.expectedUpdate { + t.Errorf("finalizeNodeLabels() needsUpdate = %v, want %v", got, tt.expectedUpdate) + } + if len(node.Labels) != len(tt.expectedLabels) { + t.Errorf("after finalize: labels = %v, want %v", node.Labels, tt.expectedLabels) + return + } + for k, v := range tt.expectedLabels { + if node.Labels[k] != v { + t.Errorf("after finalize: label[%s] = %q, want %q", k, node.Labels[k], v) + } + } + }) + } +} + +func TestReconcileNodeLabels(t *testing.T) { + tests := []struct { + name string + nodeLabels map[string]string + specLabels map[string]string + expectedUpdate bool + expectedLabels map[string]string + }{ + { + name: "adds new label to existing label map", + nodeLabels: map[string]string{"existing": "yes"}, + specLabels: map[string]string{"new": "label"}, + expectedUpdate: true, + expectedLabels: map[string]string{"existing": "yes", "new": "label"}, + }, + { + name: "adds label when node has nil labels map", + nodeLabels: nil, + specLabels: map[string]string{"key": "value"}, + expectedUpdate: true, + expectedLabels: map[string]string{"key": "value"}, + }, + { + name: "updates label with wrong value", + nodeLabels: map[string]string{"key": "oldvalue"}, + specLabels: map[string]string{"key": "newvalue"}, + expectedUpdate: true, + expectedLabels: map[string]string{"key": "newvalue"}, + }, + { + name: "no update when label already has correct value", + nodeLabels: map[string]string{"key": "value"}, + specLabels: map[string]string{"key": "value"}, + expectedUpdate: false, + expectedLabels: map[string]string{"key": "value"}, + }, + { + name: "no update when spec labels is empty", + nodeLabels: map[string]string{"key": "value"}, + specLabels: map[string]string{}, + expectedUpdate: false, + expectedLabels: map[string]string{"key": "value"}, + }, + { + name: "adds multiple labels", + nodeLabels: map[string]string{}, + specLabels: map[string]string{"a": "1", "b": "2"}, + expectedUpdate: true, + expectedLabels: map[string]string{"a": "1", "b": "2"}, + }, + { + name: "does not remove labels not in spec", + nodeLabels: map[string]string{"a": "1", "b": "2"}, + specLabels: map[string]string{"a": "1"}, + expectedUpdate: false, + expectedLabels: map[string]string{"a": "1", "b": "2"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "test-node", Labels: tt.nodeLabels}, + 
} + got := reconcileNodeLabels(node, tt.specLabels) + if got != tt.expectedUpdate { + t.Errorf("reconcileNodeLabels() needsUpdate = %v, want %v", got, tt.expectedUpdate) + } + for k, v := range tt.expectedLabels { + if node.Labels[k] != v { + t.Errorf("label[%s] = %q, want %q", k, node.Labels[k], v) + } + } + }) + } +} + +func TestFinalizeNodeTaints(t *testing.T) { + taint1 := corev1.Taint{Key: "key1", Value: "v1", Effect: corev1.TaintEffectNoSchedule} + taint2 := corev1.Taint{Key: "key2", Value: "v2", Effect: corev1.TaintEffectNoExecute} + taint3 := corev1.Taint{Key: "key3", Value: "v3", Effect: corev1.TaintEffectPreferNoSchedule} + + tests := []struct { + name string + nodeTaints []corev1.Taint + specTaints []corev1.Taint + expectedUpdate bool + remainingLen int + }{ + { + name: "removes single matching taint", + nodeTaints: []corev1.Taint{taint1, taint2}, + specTaints: []corev1.Taint{taint1}, + expectedUpdate: true, + remainingLen: 1, + }, + { + name: "removes all matching taints", + nodeTaints: []corev1.Taint{taint1, taint2}, + specTaints: []corev1.Taint{taint1, taint2}, + expectedUpdate: true, + remainingLen: 0, + }, + { + name: "does not remove taint not in spec", + nodeTaints: []corev1.Taint{taint1, taint2}, + specTaints: []corev1.Taint{taint3}, + expectedUpdate: false, + remainingLen: 2, + }, + { + name: "no update when node has no taints", + nodeTaints: []corev1.Taint{}, + specTaints: []corev1.Taint{taint1}, + expectedUpdate: false, + remainingLen: 0, + }, + { + name: "no update with empty spec taints", + nodeTaints: []corev1.Taint{taint1}, + specTaints: []corev1.Taint{}, + expectedUpdate: false, + remainingLen: 1, + }, + { + name: "preserves non-spec taints when removing spec taint", + nodeTaints: []corev1.Taint{taint1, taint2, taint3}, + specTaints: []corev1.Taint{taint2}, + expectedUpdate: true, + remainingLen: 2, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := &corev1.Node{ + Spec: corev1.NodeSpec{Taints: append([]corev1.Taint{}, tt.nodeTaints...)}, + } + got := finalizeNodeTaints(node, tt.specTaints) + if got != tt.expectedUpdate { + t.Errorf("finalizeNodeTaints() needsUpdate = %v, want %v", got, tt.expectedUpdate) + } + if len(node.Spec.Taints) != tt.remainingLen { + t.Errorf("remaining taints = %d, want %d (taints: %v)", len(node.Spec.Taints), tt.remainingLen, node.Spec.Taints) + } + }) + } +} + +func TestReconcileNodeTaints(t *testing.T) { + taint1 := corev1.Taint{Key: "key1", Value: "v1", Effect: corev1.TaintEffectNoSchedule} + taint2 := corev1.Taint{Key: "key2", Value: "v2", Effect: corev1.TaintEffectNoExecute} + + tests := []struct { + name string + nodeTaints []corev1.Taint + specTaints []corev1.Taint + expectedUpdate bool + expectedLen int + }{ + { + name: "adds a missing taint", + nodeTaints: []corev1.Taint{}, + specTaints: []corev1.Taint{taint1}, + expectedUpdate: true, + expectedLen: 1, + }, + { + name: "adds multiple missing taints", + nodeTaints: []corev1.Taint{}, + specTaints: []corev1.Taint{taint1, taint2}, + expectedUpdate: true, + expectedLen: 2, + }, + { + name: "no update when taint already present", + nodeTaints: []corev1.Taint{taint1}, + specTaints: []corev1.Taint{taint1}, + expectedUpdate: false, + expectedLen: 1, + }, + { + name: "adds only the missing taint when one is already present", + nodeTaints: []corev1.Taint{taint1}, + specTaints: []corev1.Taint{taint1, taint2}, + expectedUpdate: true, + expectedLen: 2, + }, + { + name: "no update with empty spec taints", + nodeTaints: []corev1.Taint{taint1}, + specTaints: 
[]corev1.Taint{}, + expectedUpdate: false, + expectedLen: 1, + }, + { + name: "no update when node has no taints and spec is empty", + nodeTaints: []corev1.Taint{}, + specTaints: []corev1.Taint{}, + expectedUpdate: false, + expectedLen: 0, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "test-node"}, + Spec: corev1.NodeSpec{Taints: append([]corev1.Taint{}, tt.nodeTaints...)}, + } + got := reconcileNodeTaints(node, tt.specTaints) + if got != tt.expectedUpdate { + t.Errorf("reconcileNodeTaints() needsUpdate = %v, want %v", got, tt.expectedUpdate) + } + if len(node.Spec.Taints) != tt.expectedLen { + t.Errorf("taint count = %d, want %d (taints: %v)", len(node.Spec.Taints), tt.expectedLen, node.Spec.Taints) + } + }) + } +} diff --git a/controllers/nodegroup_extra_test.go b/controllers/nodegroup_extra_test.go new file mode 100644 index 0000000..ffe2242 --- /dev/null +++ b/controllers/nodegroup_extra_test.go @@ -0,0 +1,458 @@ +/* +Copyright 2022. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/elastx/elx-nodegroup-controller/api/v1alpha2" +) + +// Additional integration tests that extend nodegroup_controller_test.go. +// Each Context manages its own nodes and NodeGroups to avoid conflicts with the base test suite. 
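+// Scenarios covered below: dynamic discovery via nodeGroupNames, updating a
+// drifted label value, nodes with nil label maps, preservation of unmanaged
+// labels during finalization, mixed explicit/dynamic membership, empty groups,
+// and combined label/taint cleanup.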
+ +var _ = Describe("NodeGroup controller - extended", func() { + const ( + timeout = time.Second * 15 + interval = time.Millisecond * 250 + ) + + Context("nodeGroupNames dynamic discovery", func() { + var ( + matchedNode1 corev1.Node + matchedNode2 corev1.Node + unmatchedNode corev1.Node + ng v1alpha2.NodeGroup + ) + + BeforeEach(func() { + // Nodes whose names contain "xgn" as a dash-separated segment + matchedNode1 = corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "extra-xgn-worker-a"}} + matchedNode2 = corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "extra-xgn-worker-b"}} + // Node that must NOT be matched + unmatchedNode = corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "extra-other-worker"}} + + for _, n := range []corev1.Node{matchedNode1, matchedNode2, unmatchedNode} { + node := n + Expect(k8sClient.Create(context.Background(), &node)).To(Succeed()) + } + + ng = v1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{Name: "extra-ng-xgn"}, + Spec: v1alpha2.NodeGroupSpec{ + NodeGroupNames: []string{"xgn"}, + Labels: map[string]string{"extra-group": "xgn"}, + }, + } + Expect(k8sClient.Create(context.Background(), &ng)).To(Succeed()) + }) + + AfterEach(func() { + ngFetched := &v1alpha2.NodeGroup{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, ngFetched); err == nil { + k8sClient.Delete(context.Background(), ngFetched) //nolint:errcheck + } + for _, name := range []string{matchedNode1.Name, matchedNode2.Name, unmatchedNode.Name} { + n := &corev1.Node{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: name}, n); err == nil { + k8sClient.Delete(context.Background(), n) //nolint:errcheck + } + } + }) + + It("applies labels to nodes matching nodeGroupNames and not to others", func() { + n := &corev1.Node{} + + By("waiting for matched nodes to receive the label") + for _, name := range []string{matchedNode1.Name, matchedNode2.Name} { + nodeName := name + Eventually(func() string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: nodeName}, n); err != nil { + return "" + } + return n.Labels["extra-group"] + }, timeout, interval).Should(Equal("xgn"), "node %s should have label extra-group=xgn", nodeName) + } + + By("verifying the unmatched node did not receive the label") + Consistently(func() string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: unmatchedNode.Name}, n); err != nil { + return "" + } + return n.Labels["extra-group"] + }, time.Second*2, interval).Should(BeEmpty(), "unmatched node should not have the label") + }) + + It("cleans up labels from matched nodes when NodeGroup is deleted", func() { + n := &corev1.Node{} + + By("waiting for labels to be applied") + Eventually(func() string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: matchedNode1.Name}, n); err != nil { + return "" + } + return n.Labels["extra-group"] + }, timeout, interval).Should(Equal("xgn")) + + By("deleting the NodeGroup") + ngFetched := &v1alpha2.NodeGroup{} + Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, ngFetched)).To(Succeed()) + Expect(k8sClient.Delete(context.Background(), ngFetched)).To(Succeed()) + + By("verifying labels are removed from matched nodes") + Eventually(func() bool { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: matchedNode1.Name}, n); err != nil { + return false + } + _, hasLabel := n.Labels["extra-group"] + return !hasLabel + }, timeout, interval).Should(BeTrue()) + }) + }) + + 
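+	// Verifies that a label whose value has drifted from the spec is
+	// overwritten, not just labels that are missing entirely.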
Context("label value update", func() { + var ( + node corev1.Node + ng v1alpha2.NodeGroup + ) + + BeforeEach(func() { + node = corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "extra-label-update-node", + Labels: map[string]string{"update-label": "old-value"}, + }, + } + Expect(k8sClient.Create(context.Background(), &node)).To(Succeed()) + + ng = v1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{Name: "extra-ng-label-update"}, + Spec: v1alpha2.NodeGroupSpec{ + Members: []string{node.Name}, + Labels: map[string]string{"update-label": "new-value"}, + }, + } + Expect(k8sClient.Create(context.Background(), &ng)).To(Succeed()) + }) + + AfterEach(func() { + ngFetched := &v1alpha2.NodeGroup{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, ngFetched); err == nil { + k8sClient.Delete(context.Background(), ngFetched) //nolint:errcheck + } + n := &corev1.Node{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err == nil { + k8sClient.Delete(context.Background(), n) //nolint:errcheck + } + }) + + It("updates an existing label to the value in the spec", func() { + n := &corev1.Node{} + Eventually(func() string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err != nil { + return "" + } + return n.Labels["update-label"] + }, timeout, interval).Should(Equal("new-value")) + }) + }) + + Context("node with nil labels", func() { + var ( + node corev1.Node + ng v1alpha2.NodeGroup + ) + + BeforeEach(func() { + // Create node without any labels (Labels field is nil) + node = corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "extra-nil-labels-node"}, + } + Expect(k8sClient.Create(context.Background(), &node)).To(Succeed()) + + ng = v1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{Name: "extra-ng-nil-labels"}, + Spec: v1alpha2.NodeGroupSpec{ + Members: []string{node.Name}, + Labels: map[string]string{"nil-test": "applied"}, + }, + } + Expect(k8sClient.Create(context.Background(), &ng)).To(Succeed()) + }) + + AfterEach(func() { + ngFetched := &v1alpha2.NodeGroup{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, ngFetched); err == nil { + k8sClient.Delete(context.Background(), ngFetched) //nolint:errcheck + } + n := &corev1.Node{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err == nil { + k8sClient.Delete(context.Background(), n) //nolint:errcheck + } + }) + + It("applies labels to a node that initially has no labels", func() { + n := &corev1.Node{} + Eventually(func() string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err != nil { + return "" + } + return n.Labels["nil-test"] + }, timeout, interval).Should(Equal("applied")) + }) + }) + + Context("finalization preserves labels not managed by the NodeGroup", func() { + var ( + node corev1.Node + ng v1alpha2.NodeGroup + ) + + BeforeEach(func() { + // Node has a pre-existing label that the NodeGroup does not manage + node = corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "extra-preserve-labels-node", + Labels: map[string]string{"pre-existing": "keep-me"}, + }, + } + Expect(k8sClient.Create(context.Background(), &node)).To(Succeed()) + + ng = v1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{Name: "extra-ng-preserve"}, + Spec: v1alpha2.NodeGroupSpec{ + Members: []string{node.Name}, + Labels: map[string]string{"managed": "yes"}, + }, + } + Expect(k8sClient.Create(context.Background(), 
&ng)).To(Succeed()) + }) + + AfterEach(func() { + n := &corev1.Node{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err == nil { + k8sClient.Delete(context.Background(), n) //nolint:errcheck + } + }) + + It("only removes managed labels during finalization, leaving pre-existing labels intact", func() { + n := &corev1.Node{} + + By("waiting for the managed label to be applied") + Eventually(func() string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err != nil { + return "" + } + return n.Labels["managed"] + }, timeout, interval).Should(Equal("yes")) + + By("deleting the NodeGroup") + ngFetched := &v1alpha2.NodeGroup{} + Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, ngFetched)).To(Succeed()) + Expect(k8sClient.Delete(context.Background(), ngFetched)).To(Succeed()) + + By("verifying the managed label is removed and pre-existing label is kept") + Eventually(func() map[string]string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err != nil { + return nil + } + return n.Labels + }, timeout, interval).Should(And( + HaveKey("pre-existing"), + Not(HaveKey("managed")), + )) + }) + }) + + Context("mixed members and nodeGroupNames", func() { + var ( + explicitNode corev1.Node + dynamicNode corev1.Node + nonMemberNode corev1.Node + ng v1alpha2.NodeGroup + ) + + BeforeEach(func() { + explicitNode = corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "extra-explicit-member"}} + dynamicNode = corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "extra-dyngrp-worker"}} + nonMemberNode = corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "extra-excluded-node"}} + + for _, n := range []corev1.Node{explicitNode, dynamicNode, nonMemberNode} { + node := n + Expect(k8sClient.Create(context.Background(), &node)).To(Succeed()) + } + + ng = v1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{Name: "extra-ng-mixed"}, + Spec: v1alpha2.NodeGroupSpec{ + Members: []string{explicitNode.Name}, + NodeGroupNames: []string{"dyngrp"}, + Labels: map[string]string{"mixed-test": "yes"}, + }, + } + Expect(k8sClient.Create(context.Background(), &ng)).To(Succeed()) + }) + + AfterEach(func() { + ngFetched := &v1alpha2.NodeGroup{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, ngFetched); err == nil { + k8sClient.Delete(context.Background(), ngFetched) //nolint:errcheck + } + for _, name := range []string{explicitNode.Name, dynamicNode.Name, nonMemberNode.Name} { + n := &corev1.Node{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: name}, n); err == nil { + k8sClient.Delete(context.Background(), n) //nolint:errcheck + } + } + }) + + It("applies labels to both explicit members and nodeGroupNames-matched nodes", func() { + n := &corev1.Node{} + + for _, name := range []string{explicitNode.Name, dynamicNode.Name} { + nodeName := name + Eventually(func() string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: nodeName}, n); err != nil { + return "" + } + return n.Labels["mixed-test"] + }, timeout, interval).Should(Equal("yes"), "node %s should have label", nodeName) + } + + Consistently(func() string { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: nonMemberNode.Name}, n); err != nil { + return "" + } + return n.Labels["mixed-test"] + }, time.Second*2, interval).Should(BeEmpty()) + }) + }) + + Context("NodeGroup with no members and no nodeGroupNames", func() { + 
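+		// An empty membership is legal: the reconciler should still succeed
+		// and attach its finalizer even though no node is selected.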
var ng v1alpha2.NodeGroup + + BeforeEach(func() { + ng = v1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{Name: "extra-ng-empty"}, + Spec: v1alpha2.NodeGroupSpec{ + Labels: map[string]string{"empty-group": "true"}, + }, + } + Expect(k8sClient.Create(context.Background(), &ng)).To(Succeed()) + }) + + AfterEach(func() { + ngFetched := &v1alpha2.NodeGroup{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, ngFetched); err == nil { + k8sClient.Delete(context.Background(), ngFetched) //nolint:errcheck + } + }) + + It("reconciles without error and adds a finalizer", func() { + fetched := &v1alpha2.NodeGroup{} + Eventually(func() bool { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, fetched); err != nil { + return false + } + return len(fetched.Finalizers) == 1 + }, timeout, interval).Should(BeTrue()) + }) + }) + + Context("finalization of both labels and taints", func() { + var ( + node corev1.Node + ng v1alpha2.NodeGroup + ) + taint := corev1.Taint{ + Key: "extra.elx.cloud/finalize-both", + Value: "yes", + Effect: corev1.TaintEffectNoSchedule, + } + + BeforeEach(func() { + node = corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "extra-finalize-both-node"}} + Expect(k8sClient.Create(context.Background(), &node)).To(Succeed()) + + ng = v1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{Name: "extra-ng-finalize-both"}, + Spec: v1alpha2.NodeGroupSpec{ + Members: []string{node.Name}, + Labels: map[string]string{"finalize-both": "yes"}, + Taints: []corev1.Taint{taint}, + }, + } + Expect(k8sClient.Create(context.Background(), &ng)).To(Succeed()) + }) + + AfterEach(func() { + n := &corev1.Node{} + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err == nil { + k8sClient.Delete(context.Background(), n) //nolint:errcheck + } + }) + + It("removes both labels and taints when the NodeGroup is deleted", func() { + n := &corev1.Node{} + + By("waiting for both label and taint to be applied") + Eventually(func() bool { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err != nil { + return false + } + if n.Labels["finalize-both"] != "yes" { + return false + } + for _, t := range n.Spec.Taints { + if t.MatchTaint(&taint) { + return true + } + } + return false + }, timeout, interval).Should(BeTrue()) + + By("deleting the NodeGroup") + ngFetched := &v1alpha2.NodeGroup{} + Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, ngFetched)).To(Succeed()) + Expect(k8sClient.Delete(context.Background(), ngFetched)).To(Succeed()) + + By("verifying both label and taint are removed") + Eventually(func() bool { + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: node.Name}, n); err != nil { + return false + } + if _, hasLabel := n.Labels["finalize-both"]; hasLabel { + return false + } + for _, t := range n.Spec.Taints { + if t.MatchTaint(&taint) { + return false + } + } + return true + }, timeout, interval).Should(BeTrue()) + }) + }) +}) diff --git a/controllers/reconciler_errors_test.go b/controllers/reconciler_errors_test.go new file mode 100644 index 0000000..84beec5 --- /dev/null +++ b/controllers/reconciler_errors_test.go @@ -0,0 +1,301 @@ +/* +Copyright 2022. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controllers
+
+import (
+	"context"
+	"errors"
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
+
+	k8sv1alpha2 "github.com/elastx/elx-nodegroup-controller/api/v1alpha2"
+)
+
+// reconcilerScheme returns a scheme with all required types registered.
+func reconcilerScheme(t *testing.T) *runtime.Scheme {
+	t.Helper()
+	s := runtime.NewScheme()
+	if err := clientgoscheme.AddToScheme(s); err != nil {
+		t.Fatal(err)
+	}
+	if err := k8sv1alpha2.AddToScheme(s); err != nil {
+		t.Fatal(err)
+	}
+	return s
+}
+
+// TestReconcile_NodeGroupGetError covers the non-NotFound error path when
+// fetching the NodeGroup at the start of Reconcile.
+func TestReconcile_NodeGroupGetError(t *testing.T) {
+	s := reconcilerScheme(t)
+
+	injected := errors.New("injected get error")
+	fc := fake.NewClientBuilder().WithScheme(s).
+		WithInterceptorFuncs(interceptor.Funcs{
+			Get: func(_ context.Context, _ client.WithWatch, _ types.NamespacedName, _ client.Object, _ ...client.GetOption) error {
+				return injected
+			},
+		}).Build()
+
+	r := &NodeGroupReconciler{Client: fc, Scheme: s}
+	_, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "does-not-matter"},
+	})
+	if !errors.Is(err, injected) {
+		t.Errorf("expected injected error, got %v", err)
+	}
+}
+
+// TestReconcile_NodeListError covers the error path when listing all nodes fails.
+func TestReconcile_NodeListError(t *testing.T) {
+	s := reconcilerScheme(t)
+
+	ng := &k8sv1alpha2.NodeGroup{
+		ObjectMeta: metav1.ObjectMeta{Name: "ng-list-err"},
+		Spec: k8sv1alpha2.NodeGroupSpec{
+			Members: []string{"node1"},
+			Labels:  map[string]string{"k": "v"},
+		},
+	}
+
+	injected := errors.New("injected list error")
+	fc := fake.NewClientBuilder().WithScheme(s).WithObjects(ng).
+		WithInterceptorFuncs(interceptor.Funcs{
+			List: func(_ context.Context, _ client.WithWatch, list client.ObjectList, _ ...client.ListOption) error {
+				// Fail Node lists; any other list type reports success
+				// (and stays empty, which this test never relies on).
+				if _, ok := list.(*corev1.NodeList); ok {
+					return injected
+				}
+				return nil
+			},
+		}).Build()
+
+	r := &NodeGroupReconciler{Client: fc, Scheme: s}
+	_, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: ng.Name},
+	})
+	if !errors.Is(err, injected) {
+		t.Errorf("expected injected error, got %v", err)
+	}
+}
+
+// TestReconcile_FinalizerUpdateError covers the error path when updating the
+// NodeGroup to add the finalizer fails.
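+// The interceptor rejects every Update, so the reconciler cannot persist the
+// finalizer and is expected to surface the injected error to its caller.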
+func TestReconcile_FinalizerUpdateError(t *testing.T) { + s := reconcilerScheme(t) + + ng := &k8sv1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ng-update-err", + Finalizers: []string{}, + }, + Spec: k8sv1alpha2.NodeGroupSpec{}, + } + + injected := errors.New("injected update error") + fc := fake.NewClientBuilder().WithScheme(s).WithObjects(ng). + WithInterceptorFuncs(interceptor.Funcs{ + Update: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.UpdateOption) error { + return injected + }, + }).Build() + + r := &NodeGroupReconciler{Client: fc, Scheme: s} + _, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: ng.Name}, + }) + if !errors.Is(err, injected) { + t.Errorf("expected injected error, got %v", err) + } +} + +// TestFindNodeGroupsForMember_ListError covers the error path in +// findNodeGroupsForMember when listing NodeGroups fails. +func TestFindNodeGroupsForMember_ListError(t *testing.T) { + s := reconcilerScheme(t) + + injected := errors.New("injected list error") + fc := fake.NewClientBuilder().WithScheme(s). + WithInterceptorFuncs(interceptor.Funcs{ + List: func(_ context.Context, _ client.WithWatch, _ client.ObjectList, _ ...client.ListOption) error { + return injected + }, + }).Build() + + r := &NodeGroupReconciler{Client: fc, Scheme: s} + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "any-node"}} + + requests := r.findNodeGroupsForMember(context.Background(), node) + if len(requests) != 0 { + t.Errorf("expected empty requests on error, got %d", len(requests)) + } +} + +// TestReconcile_FinalizationNodeLabelUpdateError covers the error path when +// updating a node to remove a managed label during finalization fails. +// The controller logs the error but still returns Requeue so it retries. +func TestReconcile_FinalizationNodeLabelUpdateError(t *testing.T) { + s := reconcilerScheme(t) + + now := metav1.Now() + ng := &k8sv1alpha2.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ng-fin-label-update-err", + DeletionTimestamp: &now, + Finalizers: []string{finalizer}, + ResourceVersion: "1", + }, + Spec: k8sv1alpha2.NodeGroupSpec{ + Members: []string{"fin-node"}, + Labels: map[string]string{"managed": "yes"}, + }, + } + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "fin-node", + Labels: map[string]string{"managed": "yes"}, + ResourceVersion: "1", + }, + } + + injected := errors.New("injected node update error") + fc := fake.NewClientBuilder().WithScheme(s).WithObjects(ng, node). + WithInterceptorFuncs(interceptor.Funcs{ + Update: func(_ context.Context, _ client.WithWatch, obj client.Object, _ ...client.UpdateOption) error { + if _, isNode := obj.(*corev1.Node); isNode { + return injected + } + return nil + }, + }).Build() + + r := &NodeGroupReconciler{Client: fc, Scheme: s} + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: ng.Name}, + }) + // Even when the node update fails, the controller requeues to retry. + if err != nil || !result.Requeue { + t.Errorf("expected Requeue=true, err=nil; got Requeue=%v, err=%v", result.Requeue, err) + } +} + +// TestReconcile_FinalizationNodeTaintUpdateError covers the error path when +// updating a node to remove a managed taint during finalization fails. 
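+// As with the label variant above, the failed node update is logged and the
+// reconcile requeues rather than returning an error.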
+func TestReconcile_FinalizationNodeTaintUpdateError(t *testing.T) {
+	s := reconcilerScheme(t)
+
+	taint := corev1.Taint{Key: "test/taint", Value: "yes", Effect: corev1.TaintEffectNoSchedule}
+
+	now := metav1.Now()
+	ng := &k8sv1alpha2.NodeGroup{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "ng-fin-taint-update-err",
+			DeletionTimestamp: &now,
+			Finalizers:        []string{finalizer},
+			ResourceVersion:   "1",
+		},
+		Spec: k8sv1alpha2.NodeGroupSpec{
+			Members: []string{"fin-taint-node"},
+			// No labels — skip to taint finalization on first reconcile
+			Taints: []corev1.Taint{taint},
+		},
+	}
+	node := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:            "fin-taint-node",
+			ResourceVersion: "1",
+		},
+		Spec: corev1.NodeSpec{
+			Taints: []corev1.Taint{taint},
+		},
+	}
+
+	injected := errors.New("injected taint node update error")
+	fc := fake.NewClientBuilder().WithScheme(s).WithObjects(ng, node).
+		WithInterceptorFuncs(interceptor.Funcs{
+			Update: func(_ context.Context, _ client.WithWatch, obj client.Object, _ ...client.UpdateOption) error {
+				if _, isNode := obj.(*corev1.Node); isNode {
+					return injected
+				}
+				return nil
+			},
+		}).Build()
+
+	r := &NodeGroupReconciler{Client: fc, Scheme: s}
+	result, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: ng.Name},
+	})
+	if err != nil || !result.Requeue {
+		t.Errorf("expected Requeue=true, err=nil; got Requeue=%v, err=%v", result.Requeue, err)
+	}
+}
+
+// TestReconcile_FinalizationFinalizerRemovalError covers the error path when
+// updating the NodeGroup to remove the finalizer fails after node cleanup.
+func TestReconcile_FinalizationFinalizerRemovalError(t *testing.T) {
+	s := reconcilerScheme(t)
+
+	now := metav1.Now()
+	ng := &k8sv1alpha2.NodeGroup{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "ng-fin-finalizer-err",
+			DeletionTimestamp: &now,
+			Finalizers:        []string{finalizer},
+			ResourceVersion:   "1",
+		},
+		// No labels or taints — node cleanup is a no-op, goes straight to finalizer removal.
+		Spec: k8sv1alpha2.NodeGroupSpec{
+			Members: []string{"fin-clean-node"},
+		},
+	}
+	node := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:            "fin-clean-node",
+			ResourceVersion: "1",
+		},
+	}
+
+	injected := errors.New("injected nodegroup update error")
+	fc := fake.NewClientBuilder().WithScheme(s).WithObjects(ng, node).
+		WithInterceptorFuncs(interceptor.Funcs{
+			Update: func(_ context.Context, _ client.WithWatch, obj client.Object, _ ...client.UpdateOption) error {
+				if _, isNG := obj.(*k8sv1alpha2.NodeGroup); isNG {
+					return injected
+				}
+				return nil
+			},
+		}).Build()
+
+	r := &NodeGroupReconciler{Client: fc, Scheme: s}
+	_, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: ng.Name},
+	})
+	if !errors.Is(err, injected) {
+		t.Errorf("expected injected error, got %v", err)
+	}
+}
diff --git a/controllers/suite_test.go b/controllers/suite_test.go
index 5084229..a420401 100644
--- a/controllers/suite_test.go
+++ b/controllers/suite_test.go
@@ -30,6 +30,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/envtest"
 	logf "sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
 
 	. "github.com/onsi/ginkgo/v2"
 	. 
"github.com/onsi/gomega" @@ -75,7 +76,9 @@ var _ = BeforeSuite(func() { Expect(k8sClient).NotTo(BeNil()) k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{ - Scheme: scheme.Scheme, + Scheme: scheme.Scheme, + Metrics: metricsserver.Options{BindAddress: "0"}, + HealthProbeBindAddress: "0", }) Expect(err).ToNot(HaveOccurred()) diff --git a/docs/api-reference.md b/docs/api-reference.md new file mode 100644 index 0000000..cb967e1 --- /dev/null +++ b/docs/api-reference.md @@ -0,0 +1,146 @@ +# NodeGroup API Reference + +## Overview + +The `NodeGroup` resource is a cluster-scoped custom resource provided by the `k8s.elx.cloud` API group. It defines a group of nodes and the labels and taints that should be applied to them. + +``` +Group: k8s.elx.cloud +Version: v1alpha2 +Kind: NodeGroup +Scope: Cluster +``` + +> **Note:** `v1alpha1` is also supported for backwards compatibility but is deprecated. Use `v1alpha2` for all new resources. + +--- + +## Spec + +### `spec.members` + +**Type:** `[]string` +**Required:** No + +A list of Kubernetes node names to explicitly include in this group. + +```yaml +spec: + members: + - worker-node-1 + - worker-node-2 +``` + +Each entry must be an exact match of the node's `.metadata.name`. + +--- + +### `spec.nodeGroupNames` + +**Type:** `[]string` +**Required:** No + +A list of name segments to use for dynamic node discovery. The controller will include any node in the cluster whose name contains one of these segments when the node name is split on the `-` character. + +```yaml +spec: + nodeGroupNames: + - gpu + - spot +``` + +**How matching works:** + +Given a node named `gpu-a100-abc123`, the name is split into parts: `["gpu", "a100", "abc123"]`. If any part appears in `nodeGroupNames`, the node is added to the group. + +This is useful in clusters where node pools use a consistent prefix in their generated names (e.g., `gpu-*`, `spot-*`). + +You may use `members` and `nodeGroupNames` simultaneously — the resulting member list is the union of both. + +--- + +### `spec.labels` + +**Type:** `map[string]string` +**Required:** No + +Labels to apply to all member nodes. The controller adds these labels and will reapply them if they are manually removed. + +```yaml +spec: + labels: + workload-type: compute + environment: production + team: platform +``` + +Labels that are not in the spec are not touched by the controller. The controller only removes labels it previously applied when the `NodeGroup` is deleted. + +--- + +### `spec.taints` + +**Type:** `[]corev1.Taint` +**Required:** No + +Taints to apply to all member nodes. The controller adds these taints if they are not already present. + +```yaml +spec: + taints: + - key: workload-type + value: compute + effect: NoSchedule + - key: dedicated + value: gpu + effect: NoExecute +``` + +Each taint has the following fields: + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `key` | string | Yes | The taint key. | +| `value` | string | No | The taint value. | +| `effect` | string | Yes | `NoSchedule`, `PreferNoSchedule`, or `NoExecute`. | +| `timeAdded` | timestamp | No | Set automatically by Kubernetes for `NoExecute` taints. | + +--- + +## Status + +The `status` subresource is currently reserved and contains no fields. 
+
+---
+
+## Full Example
+
+```yaml
+apiVersion: k8s.elx.cloud/v1alpha2
+kind: NodeGroup
+metadata:
+  name: gpu-nodes
+spec:
+  # Include specific nodes by exact name
+  members:
+  - gpu-node-static-1
+
+  # Include nodes dynamically by name segment
+  nodeGroupNames:
+  - gpu
+
+  # Labels to apply to all members
+  labels:
+    hardware: gpu
+    capacity-type: on-demand
+    team: ml-platform
+
+  # Taints to apply to all members
+  taints:
+  - key: nvidia.com/gpu
+    value: "true"
+    effect: NoSchedule
+  - key: dedicated
+    value: gpu
+    effect: NoExecute
+```
diff --git a/docs/development.md b/docs/development.md
new file mode 100644
index 0000000..aae60a3
--- /dev/null
+++ b/docs/development.md
@@ -0,0 +1,178 @@
+# Development Guide
+
+## Prerequisites
+
+| Tool | Minimum Version | Purpose |
+|------|----------------|---------|
+| Go | 1.22 | Build the controller |
+| Docker | any recent | Build container images |
+| `kubectl` | any recent | Deploy to a cluster |
+| `kustomize` | v4+ | Manage manifests |
+| `controller-gen` | v0.14.0 | Generate CRD manifests and DeepCopy |
+| `envtest` | bundled via `setup-envtest` | Run integration tests |
+
+Install Go toolchain dependencies:
+
+```bash
+make controller-gen  # Install controller-gen locally
+make envtest         # Install envtest binaries locally
+```
+
+---
+
+## Repository Layout
+
+```
+.
+├── api/
+│   ├── v1alpha1/                    # Deprecated CRD version (supported for migration)
+│   │   └── nodegroup_types.go
+│   └── v1alpha2/                    # Current storage version
+│       ├── groupversion_info.go
+│       ├── nodegroup_types.go
+│       └── zz_generated.deepcopy.go
+├── config/
+│   ├── crd/                         # Generated CRD manifest
+│   ├── default/                     # Kustomization for full deployment
+│   ├── manager/                     # Deployment manifest
+│   ├── rbac/                        # RBAC manifests
+│   └── samples/                     # Example NodeGroup resources
+├── controllers/
+│   ├── nodegroup_controller.go      # Reconciliation logic
+│   └── nodegroup_controller_test.go # Integration tests
+├── hack/                            # Helper scripts (boilerplate header)
+├── main.go                          # Controller manager entry point
+├── Dockerfile
+├── Makefile
+└── go.mod
+```
+
+---
+
+## Common Tasks
+
+### Build the binary
+
+```bash
+make build
+```
+
+Output: `./bin/manager`
+
+### Run tests
+
+```bash
+make test
+```
+
+Tests use `envtest` to spin up a local Kubernetes API server. Test coverage output is written to `cover.out`.
+
+### Generate code
+
+After modifying the API types in `api/v1alpha2/nodegroup_types.go`, regenerate:
+
+```bash
+# Regenerate CRD manifests (config/crd/bases/)
+make manifests
+
+# Regenerate DeepCopy methods (zz_generated.deepcopy.go)
+make generate
+```
+
+Always commit the generated files alongside your type changes.
+
+### Format and lint
+
+```bash
+make fmt  # gofmt
+make vet  # go vet
+```
+
+### Build the container image
+
+```bash
+make docker-build IMG=<registry>/<image>:<tag>
+```
+
+For multi-architecture builds (amd64, arm64, s390x, ppc64le):
+
+```bash
+make docker-buildx IMG=<registry>/<image>:<tag>
+```
+
+### Push the image
+
+```bash
+make docker-push IMG=<registry>/<image>:<tag>
+```
+
+### Deploy from a custom image
+
+```bash
+make deploy IMG=<registry>/<image>:<tag>
+```
+
+### Install only the CRD (no controller)
+
+```bash
+make install
+```
+
+### Uninstall the CRD
+
+```bash
+make uninstall
+```
+
+---
+
+## Adding a New API Version
+
+The project follows the Kubebuilder multi-version API pattern. To add a new version (e.g., `v1beta1`):
+
+1. Create the new version under `api/v1beta1/`.
+2. Add conversion functions (`hub.go` and `conversion.go`) if needed.
+3. Register the new version in `main.go`.
+4. Annotate the new type with `// +kubebuilder:storageversion` and remove it from the old version.
+5. Run `make manifests generate` to regenerate code.
+
+---
+
+## Controller Runtime Flags
+
+The `manager` binary accepts the following flags:
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--metrics-bind-address` | `:8080` | Address for Prometheus metrics endpoint |
+| `--health-probe-bind-address` | `:8081` | Address for liveness/readiness probes |
+| `--leader-elect` | `false` | Enable leader election for HA deployments |
+
+The default Kustomize deployment enables `--leader-elect=true`.
+
+---
+
+## Testing
+
+Tests are written using [Ginkgo v2](https://onsi.github.io/ginkgo/) and [Gomega](https://onsi.github.io/gomega/) and run against a local Kubernetes API server managed by `envtest`.
+
+### Running a specific test
+
+```bash
+go test ./controllers/... -ginkgo.focus "should apply labels"
+```
+
+### Test timeout
+
+Each test assertion polls with a 250ms interval and a 10-second timeout. Adjust these constants in `controllers/suite_test.go` if needed.
+
+### What is tested
+
+| Scenario | Description |
+|----------|-------------|
+| Finalizer management | Finalizer is added to new NodeGroups |
+| Label application | Labels from spec are applied to member nodes |
+| Taint application | Taints from spec are applied to member nodes |
+| Cleanup on deletion | Labels and taints are removed when NodeGroup is deleted |
+| Label/taint persistence | Re-applied if manually removed from a node |
+| Node lifecycle | Labels/taints restored when a node is recreated |
diff --git a/docs/operations.md b/docs/operations.md
new file mode 100644
index 0000000..25e7ef0
--- /dev/null
+++ b/docs/operations.md
@@ -0,0 +1,182 @@
+# Operations Guide
+
+## Deployment
+
+### Install
+
+Deploy the controller and CRD with a single command:
+
+```bash
+kustomize build config/default | kubectl apply -f -
+```
+
+Verify the controller is running:
+
+```bash
+kubectl -n elx-nodegroup-controller-system get pods
+```
+
+Expected output:
+
+```
+NAME                          READY   STATUS    RESTARTS   AGE
+controller-manager-<pod-id>   2/2     Running   0          30s
+```
+
+### Upgrade
+
+Apply the updated manifests:
+
+```bash
+kustomize build config/default | kubectl apply -f -
+```
+
+The controller deployment uses `RollingUpdate` by default, so upgrades are non-disruptive.
+
+### Uninstall
+
+```bash
+kustomize build config/default | kubectl delete -f -
+```
+
+> **Note:** Delete all `NodeGroup` resources before uninstalling to ensure the controller can clean up labels and taints from your nodes. If you uninstall the controller while `NodeGroup` resources exist, the nodes will keep the labels and taints but you will need to remove them manually.
+
+---
+
+## Managing NodeGroups
+
+### Create
+
+```bash
+kubectl apply -f my-nodegroup.yaml
+```
+
+### List
+
+```bash
+kubectl get nodegroups
+```
+
+### Inspect
+
+```bash
+kubectl describe nodegroup <name>
+```
+
+### Update
+
+Edit the spec and reapply:
+
+```bash
+kubectl apply -f my-nodegroup.yaml
+```
+
+Or edit in place:
+
+```bash
+kubectl edit nodegroup <name>
+```
+
+### Delete
+
+```bash
+kubectl delete nodegroup <name>
+```
+
+When a `NodeGroup` is deleted, the controller automatically removes the labels and taints it applied from all member nodes before the resource is removed from the cluster.
+
+---
+
+## Monitoring
+
+### Metrics
+
+The controller exposes Prometheus metrics on port `8080` (behind `kube-rbac-proxy` for authentication).
+Metrics follow the standard `controller-runtime` naming conventions:
+
+- `controller_runtime_reconcile_total` — total number of reconciliations
+- `controller_runtime_reconcile_errors_total` — total number of reconciliation errors
+- `controller_runtime_reconcile_time_seconds` — reconciliation duration histogram
+
+### Health Probes
+
+The controller exposes health probes on port `8081`:
+
+| Endpoint | Purpose |
+|----------|---------|
+| `/healthz` | Liveness probe |
+| `/readyz` | Readiness probe |
+
+### Logs
+
+Fetch controller logs:
+
+```bash
+kubectl -n elx-nodegroup-controller-system logs -l control-plane=controller-manager -f
+```
+
+The controller uses structured (JSON) logging via `logr`/`zap`.
+
+---
+
+## Troubleshooting
+
+### Labels or taints not applied
+
+1. Check controller logs for errors:
+   ```bash
+   kubectl -n elx-nodegroup-controller-system logs -l control-plane=controller-manager
+   ```
+
+2. Verify the `NodeGroup` resource is valid:
+   ```bash
+   kubectl describe nodegroup <name>
+   ```
+
+3. Confirm node names in `spec.members` match actual node names exactly:
+   ```bash
+   kubectl get nodes
+   ```
+
+4. For `nodeGroupNames`, verify the segment matches by inspecting node names:
+   ```bash
+   kubectl get nodes -o name | sed 's|node/||' | tr '-' '\n' | sort -u
+   ```
+
+### NodeGroup stuck in deletion
+
+If a `NodeGroup` is stuck with a deletion timestamp and is not being removed, the controller may not be running or may be failing to process the finalization. Check:
+
+```bash
+# Is the controller running?
+kubectl -n elx-nodegroup-controller-system get pods
+
+# Are there errors?
+kubectl -n elx-nodegroup-controller-system logs -l control-plane=controller-manager
+
+# What finalizers are present?
+kubectl get nodegroup <name> -o jsonpath='{.metadata.finalizers}'
+```
+
+If the controller is permanently unavailable and you need to force-delete the resource (accepting that labels/taints on nodes will **not** be cleaned up):
+
+```bash
+kubectl patch nodegroup <name> -p '{"metadata":{"finalizers":[]}}' --type=merge
+```
+
+---
+
+## Security Considerations
+
+The controller runs with the minimum required RBAC permissions:
+
+- **Nodes**: `get`, `list`, `watch`, `update`, `patch`
+- **NodeGroups**: full CRUD + status + finalizers
+
+The controller pod is configured with:
+
+- Non-root user (`uid: 65532`)
+- Read-only root filesystem
+- No privilege escalation
+- Distroless base image (minimal attack surface)
+- Tolerations for all `NoSchedule` taints (so the controller can run even on tainted nodes)
+- Node affinity that avoids control-plane nodes
diff --git a/go.mod b/go.mod
index 6af498d..f0b27c7 100644
--- a/go.mod
+++ b/go.mod
@@ -12,10 +12,12 @@ require (
 )
 
 require (
+	dario.cat/mergo v1.0.2 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/emicklei/go-restful/v3 v3.12.0 // indirect
+	github.com/evanphx/json-patch v4.12.0+incompatible // indirect
 	github.com/evanphx/json-patch/v5 v5.9.0 // indirect
 	github.com/fsnotify/fsnotify v1.7.0 // indirect
 	github.com/go-logr/logr v1.4.1 // indirect
diff --git a/go.sum b/go.sum
index 0e27329..b45324b 100644
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,7 @@
 dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk=
 dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
+dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
+dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
@@ -45,8 +47,6 @@ github.com/google/pprof v0.0.0-20240422182052-72c8669ad3e7 h1:3q13T5NW3mlTJZM6B5
 github.com/google/pprof v0.0.0-20240422182052-72c8669ad3e7/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28=
-github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
 github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
 github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
@@ -159,18 +159,12 @@ k8s.io/api v0.30.0 h1:siWhRq7cNjy2iHssOB9SCGNCl2spiF1dO3dABqZ8niA=
 k8s.io/api v0.30.0/go.mod h1:OPlaYhoHs8EQ1ql0R/TsUgaRPhpKNxIMrKQfWUp8QSE=
 k8s.io/apiextensions-apiserver v0.29.3 h1:9HF+EtZaVpFjStakF4yVufnXGPRppWFEQ87qnO91YeI=
 k8s.io/apiextensions-apiserver v0.29.3/go.mod h1:po0XiY5scnpJfFizNGo6puNU6Fq6D70UJY2Cb2KwAVc=
-k8s.io/apiextensions-apiserver v0.30.0 h1:jcZFKMqnICJfRxTgnC4E+Hpcq8UEhT8B2lhBcQ+6uAs=
-k8s.io/apiextensions-apiserver v0.30.0/go.mod h1:N9ogQFGcrbWqAY9p2mUAL5mGxsLqwgtUce127VtRX5Y=
 k8s.io/apimachinery v0.30.0 h1:qxVPsyDM5XS96NIh9Oj6LavoVFYff/Pon9cZeDIkHHA=
 k8s.io/apimachinery v0.30.0/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc=
 k8s.io/client-go v0.29.3 h1:R/zaZbEAxqComZ9FHeQwOh3Y1ZUs7FaHKZdQtIc2WZg=
 k8s.io/client-go v0.29.3/go.mod h1:tkDisCvgPfiRpxGnOORfkljmS+UrW+WtXAy2fTvXJB0=
-k8s.io/client-go v0.30.0 h1:sB1AGGlhY/o7KCyCEQ0bPWzYDL0pwOZO4vAtTSh/gJQ=
-k8s.io/client-go v0.30.0/go.mod h1:g7li5O5256qe6TYdAMyX/otJqMhIiGgTapdLchhmOaY=
 k8s.io/component-base v0.29.3 h1:Oq9/nddUxlnrCuuR2K/jp6aflVvc0uDvxMzAWxnGzAo=
 k8s.io/component-base v0.29.3/go.mod h1:Yuj33XXjuOk2BAaHsIGHhCKZQAgYKhqIxIjIr2UXYio=
-k8s.io/component-base v0.30.0 h1:cj6bp38g0ainlfYtaOQuRELh5KSYjhKxM+io7AUIk4o=
-k8s.io/component-base v0.30.0/go.mod h1:V9x/0ePFNaKeKYA3bOvIbrNoluTSG+fSJKjLdjOoeXQ=
 k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw=
 k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
 k8s.io/kube-openapi v0.0.0-20240423202451-8948a665c108 h1:Q8Z7VlGhcJgBHJHYugJ/K/7iB8a2eSxCyxdVjJp+lLY=
@@ -179,8 +173,6 @@ k8s.io/utils v0.0.0-20240423183400-0849a56e8f22 h1:ao5hUqGhsqdm+bYbjH/pRkCs0unBG
 k8s.io/utils v0.0.0-20240423183400-0849a56e8f22/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
 sigs.k8s.io/controller-runtime v0.17.2 h1:FwHwD1CTUemg0pW2otk7/U5/i5m2ymzvOXdbeGOUvw0=
 sigs.k8s.io/controller-runtime v0.17.2/go.mod h1:+MngTvIQQQhfXtwfdGw/UOQ/aIaqsYywfCINOtwMO/s=
-sigs.k8s.io/controller-runtime v0.17.3 h1:65QmN7r3FWgTxDMz9fvGnO1kbf2nu+acg9p2R9oYYYk=
-sigs.k8s.io/controller-runtime v0.17.3/go.mod h1:N0jpP5Lo7lMTF9aL56Z/B2oWBJjey6StQM0jRbKQXtY=
 sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
 sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
 sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4=
diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go
new file mode 100644
index 0000000..a7ea0cf
--- /dev/null
+++ b/test/e2e/e2e_suite_test.go
@@ -0,0 +1,117 @@
+/*
+Copyright 2022.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package e2e contains end-to-end tests that run against a real Kubernetes cluster.
+// The tests expect:
+//   - A kubeconfig pointing at a live cluster (KUBECONFIG env var or ~/.kube/config).
+//   - The NodeGroup CRD installed (make install).
+//   - The controller running either in-cluster or locally (make run / make deploy).
+//
+// A kind-based workflow is available via the Makefile:
+//
+//	make kind-create          # create a kind cluster
+//	make kind-load-and-deploy # build + load image + deploy controller
+//	make test-e2e             # run these tests
+//	make kind-delete          # tear down
+package e2e_test
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/config"
+
+	k8sv1alpha2 "github.com/elastx/elx-nodegroup-controller/api/v1alpha2"
+)
+
+const (
+	e2eTimeout  = 60 * time.Second
+	e2eInterval = 500 * time.Millisecond
+)
+
+var (
+	k8sClient    client.Client
+	clusterNodes []corev1.Node
+	// workerNodes are nodes without the control-plane role label.
+	// Safe to taint during testing.
+	workerNodes []corev1.Node
+	// testNodeName is the primary node used in single-node tests.
+	// Prefers a worker node; falls back to any available node.
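+	// Tests add and remove labels and taints on this node, so run the
+	// suite against a disposable cluster rather than a production one.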
+	testNodeName string
+)
+
+func TestE2E(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "E2E Suite")
+}
+
+var _ = BeforeSuite(func() {
+	By("connecting to cluster")
+	cfg, err := config.GetConfig()
+	if err != nil {
+		Skip(fmt.Sprintf("cannot get cluster config (is KUBECONFIG set?): %v", err))
+	}
+
+	scheme := runtime.NewScheme()
+	Expect(clientgoscheme.AddToScheme(scheme)).To(Succeed())
+	Expect(k8sv1alpha2.AddToScheme(scheme)).To(Succeed())
+
+	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme})
+	Expect(err).NotTo(HaveOccurred())
+
+	By("verifying NodeGroup CRD is installed")
+	ngList := &k8sv1alpha2.NodeGroupList{}
+	if err := k8sClient.List(context.Background(), ngList); err != nil {
+		Skip(fmt.Sprintf("NodeGroup CRD not accessible (install with `make install`): %v", err))
+	}
+
+	By("listing cluster nodes")
+	nodeList := &corev1.NodeList{}
+	Expect(k8sClient.List(context.Background(), nodeList)).To(Succeed())
+	clusterNodes = nodeList.Items
+	if len(clusterNodes) == 0 {
+		Skip("cluster has no nodes")
+	}
+
+	for _, n := range clusterNodes {
+		if _, isCP := n.Labels["node-role.kubernetes.io/control-plane"]; !isCP {
+			workerNodes = append(workerNodes, n)
+		}
+	}
+
+	if len(workerNodes) > 0 {
+		testNodeName = workerNodes[0].Name
+	} else {
+		// Fall back to the first available node (control-plane-only cluster).
+		testNodeName = clusterNodes[0].Name
+	}
+	fmt.Fprintf(os.Stdout, "\nUsing node %q for e2e tests\n", testNodeName)
+
+	// Note: controller liveness is not verified here. The List above only
+	// confirms the API server accepts NodeGroup resources; the tests themselves
+	// will time out if the controller is not reconciling.
+})
diff --git a/test/e2e/kind-config.yaml b/test/e2e/kind-config.yaml
new file mode 100644
index 0000000..752e993
--- /dev/null
+++ b/test/e2e/kind-config.yaml
@@ -0,0 +1,6 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+- role: worker
+- role: worker
diff --git a/test/e2e/nodegroup_e2e_test.go b/test/e2e/nodegroup_e2e_test.go
new file mode 100644
index 0000000..2c9a59c
--- /dev/null
+++ b/test/e2e/nodegroup_e2e_test.go
@@ -0,0 +1,372 @@
+/*
+Copyright 2022.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e_test
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+
+	k8sv1alpha2 "github.com/elastx/elx-nodegroup-controller/api/v1alpha2"
+)
+
+// uniqueName returns a name that is unique per test run to prevent conflicts
+// when tests are run multiple times without cluster cleanup.
+func uniqueName(base string) string {
+	return fmt.Sprintf("%s-%d", base, time.Now().UnixNano()%1_000_000)
+}
+
+// createNodeGroup creates a NodeGroup and registers a DeferCleanup to delete it.
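+// The cleanup strips finalizers before deleting, so teardown cannot hang if
+// the controller has stopped reconciling.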
+func createNodeGroup(ng *k8sv1alpha2.NodeGroup) {
+	Expect(k8sClient.Create(context.Background(), ng)).To(Succeed())
+	DeferCleanup(func() {
+		fetched := &k8sv1alpha2.NodeGroup{}
+		if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ng.Name}, fetched); err == nil {
+			// Remove finalizers so deletion is immediate even if the controller is not running.
+			fetched.Finalizers = nil
+			_ = k8sClient.Update(context.Background(), fetched)
+			_ = k8sClient.Delete(context.Background(), fetched)
+		}
+	})
+}
+
+var _ = Describe("NodeGroup e2e", func() {
+
+	Describe("applying labels via spec.members", func() {
+		It("applies labels to an explicit member node and removes them on deletion", func() {
+			ngName := uniqueName("e2e-labels")
+			labelKey := "e2e-test-label"
+			labelVal := "from-nodegroup"
+
+			ng := &k8sv1alpha2.NodeGroup{
+				ObjectMeta: metav1.ObjectMeta{Name: ngName},
+				Spec: k8sv1alpha2.NodeGroupSpec{
+					Members: []string{testNodeName},
+					Labels:  map[string]string{labelKey: labelVal},
+				},
+			}
+			createNodeGroup(ng)
+
+			By("waiting for the label to appear on the target node")
+			node := &corev1.Node{}
+			Eventually(func() string {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: testNodeName}, node); err != nil {
+					return ""
+				}
+				return node.Labels[labelKey]
+			}, e2eTimeout, e2eInterval).Should(Equal(labelVal))
+
+			By("deleting the NodeGroup")
+			fetched := &k8sv1alpha2.NodeGroup{}
+			Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: ngName}, fetched)).To(Succeed())
+			Expect(k8sClient.Delete(context.Background(), fetched)).To(Succeed())
+
+			By("waiting for the label to be removed from the node")
+			Eventually(func() bool {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: testNodeName}, node); err != nil {
+					return false
+				}
+				_, hasLabel := node.Labels[labelKey]
+				return !hasLabel
+			}, e2eTimeout, e2eInterval).Should(BeTrue())
+		})
+
+		It("applies multiple labels simultaneously", func() {
+			ngName := uniqueName("e2e-multilabel")
+			ng := &k8sv1alpha2.NodeGroup{
+				ObjectMeta: metav1.ObjectMeta{Name: ngName},
+				Spec: k8sv1alpha2.NodeGroupSpec{
+					Members: []string{testNodeName},
+					Labels: map[string]string{
+						"e2e-label-a": "value-a",
+						"e2e-label-b": "value-b",
+					},
+				},
+			}
+			createNodeGroup(ng)
+
+			node := &corev1.Node{}
+			Eventually(func() bool {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: testNodeName}, node); err != nil {
+					return false
+				}
+				return node.Labels["e2e-label-a"] == "value-a" && node.Labels["e2e-label-b"] == "value-b"
+			}, e2eTimeout, e2eInterval).Should(BeTrue())
+		})
+
+		It("re-applies labels that are manually removed from a node", func() {
+			ngName := uniqueName("e2e-persistent")
+			labelKey := "e2e-persistent-label"
+
+			ng := &k8sv1alpha2.NodeGroup{
+				ObjectMeta: metav1.ObjectMeta{Name: ngName},
+				Spec: k8sv1alpha2.NodeGroupSpec{
+					Members: []string{testNodeName},
+					Labels:  map[string]string{labelKey: "persistent"},
+				},
+			}
+			createNodeGroup(ng)
+
+			node := &corev1.Node{}
+			By("waiting for label to be applied")
+			Eventually(func() string {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: testNodeName}, node); err != nil {
+					return ""
+				}
+				return node.Labels[labelKey]
+			}, e2eTimeout, e2eInterval).Should(Equal("persistent"))
+
+			By("manually removing the label")
+			Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: testNodeName}, node)).To(Succeed())
+			delete(node.Labels, labelKey)
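+			// Write the mutated Node back directly, simulating an operator
+			// manually removing the label out from under the controller.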
+			Expect(k8sClient.Update(context.Background(), node)).To(Succeed())
+
+			By("waiting for the controller to re-apply the label")
+			Eventually(func() string {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: testNodeName}, node); err != nil {
+					return ""
+				}
+				return node.Labels[labelKey]
+			}, e2eTimeout, e2eInterval).Should(Equal("persistent"))
+		})
+	})
+
+	Describe("applying taints via spec.members", func() {
+		var taintNode string
+
+		BeforeEach(func() {
+			// Prefer a worker node for taint tests so as not to disrupt scheduling
+			// on a control-plane-only cluster.
+			if len(workerNodes) > 0 {
+				taintNode = workerNodes[0].Name
+			} else {
+				// Only a control-plane node is available; fall back to it and accept
+				// that the NoSchedule taint test may briefly affect scheduling.
+				taintNode = testNodeName
+			}
+		})
+
+		It("applies a NoSchedule taint to a member node and removes it on deletion", func() {
+			ngName := uniqueName("e2e-taint")
+			taint := corev1.Taint{
+				Key:    "e2e-test-taint",
+				Value:  "true",
+				Effect: corev1.TaintEffectNoSchedule,
+			}
+
+			ng := &k8sv1alpha2.NodeGroup{
+				ObjectMeta: metav1.ObjectMeta{Name: ngName},
+				Spec: k8sv1alpha2.NodeGroupSpec{
+					Members: []string{taintNode},
+					Taints:  []corev1.Taint{taint},
+				},
+			}
+			createNodeGroup(ng)
+
+			node := &corev1.Node{}
+			By("waiting for the taint to appear on the node")
+			Eventually(func() bool {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: taintNode}, node); err != nil {
+					return false
+				}
+				for _, t := range node.Spec.Taints {
+					if t.MatchTaint(&taint) {
+						return true
+					}
+				}
+				return false
+			}, e2eTimeout, e2eInterval).Should(BeTrue())
+
+			By("deleting the NodeGroup")
+			fetched := &k8sv1alpha2.NodeGroup{}
+			Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: ngName}, fetched)).To(Succeed())
+			Expect(k8sClient.Delete(context.Background(), fetched)).To(Succeed())
+
+			By("waiting for the taint to be removed from the node")
+			Eventually(func() bool {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: taintNode}, node); err != nil {
+					return false
+				}
+				for _, t := range node.Spec.Taints {
+					if t.MatchTaint(&taint) {
+						return false
+					}
+				}
+				return true
+			}, e2eTimeout, e2eInterval).Should(BeTrue())
+		})
+
+		It("re-applies a taint that is manually removed from a node", func() {
+			ngName := uniqueName("e2e-taint-persistent")
+			taint := corev1.Taint{
+				Key:    "e2e-persistent-taint",
+				Value:  "yes",
+				Effect: corev1.TaintEffectPreferNoSchedule,
+			}
+
+			ng := &k8sv1alpha2.NodeGroup{
+				ObjectMeta: metav1.ObjectMeta{Name: ngName},
+				Spec: k8sv1alpha2.NodeGroupSpec{
+					Members: []string{taintNode},
+					Taints:  []corev1.Taint{taint},
+				},
+			}
+			createNodeGroup(ng)
+
+			node := &corev1.Node{}
+			By("waiting for taint to be applied")
+			Eventually(func() bool {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: taintNode}, node); err != nil {
+					return false
+				}
+				for _, t := range node.Spec.Taints {
+					if t.MatchTaint(&taint) {
+						return true
+					}
+				}
+				return false
+			}, e2eTimeout, e2eInterval).Should(BeTrue())
+
+			By("manually removing the taint")
+			Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: taintNode}, node)).To(Succeed())
+			filtered := []corev1.Taint{}
+			for _, t := range node.Spec.Taints {
+				if !t.MatchTaint(&taint) {
+					filtered = append(filtered, t)
+				}
+			}
+			node.Spec.Taints = filtered
+			Expect(k8sClient.Update(context.Background(), node)).To(Succeed())
+
+			By("waiting for the controller to re-apply the taint")
+			Eventually(func() bool {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: taintNode}, node); err != nil {
+					return false
+				}
+				for _, t := range node.Spec.Taints {
+					if t.MatchTaint(&taint) {
+						return true
+					}
+				}
+				return false
+			}, e2eTimeout, e2eInterval).Should(BeTrue())
+		})
+	})
+
+	Describe("dynamic membership via spec.nodeGroupNames", func() {
+		It("applies labels to nodes whose names contain a matching segment", func() {
+			// Find a node whose name contains a segment we can target. For a kind
+			// cluster, node names look like "kind-control-plane" or "kind-worker".
+			if len(clusterNodes) == 0 {
+				Skip("no nodes available")
+			}
+			targetNode := clusterNodes[0]
+			// Use the last "-"-separated segment as the group name (e.g. "plane"
+			// from "kind-control-plane"). strings.Split always returns at least
+			// one element, so guard on the segment being non-empty instead.
+			nameParts := strings.Split(targetNode.Name, "-")
+			segment := nameParts[len(nameParts)-1]
+			if segment == "" {
+				Skip("cannot extract a name segment from node name: " + targetNode.Name)
+			}
+
+			ngName := uniqueName("e2e-ngnames")
+			labelKey := "e2e-ngnames-label"
+
+			ng := &k8sv1alpha2.NodeGroup{
+				ObjectMeta: metav1.ObjectMeta{Name: ngName},
+				Spec: k8sv1alpha2.NodeGroupSpec{
+					NodeGroupNames: []string{segment},
+					Labels:         map[string]string{labelKey: "from-ngnames"},
+				},
+			}
+			createNodeGroup(ng)
+
+			node := &corev1.Node{}
+			Eventually(func() string {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: targetNode.Name}, node); err != nil {
+					return ""
+				}
+				return node.Labels[labelKey]
+			}, e2eTimeout, e2eInterval).Should(Equal("from-ngnames"))
+		})
+	})
+
+	Describe("NodeGroup finalizer lifecycle", func() {
+		It("adds a finalizer to a newly created NodeGroup", func() {
+			ngName := uniqueName("e2e-finalizer")
+			ng := &k8sv1alpha2.NodeGroup{
+				ObjectMeta: metav1.ObjectMeta{Name: ngName},
+				Spec: k8sv1alpha2.NodeGroupSpec{
+					Labels: map[string]string{"e2e-finalizer": "true"},
+				},
+			}
+			createNodeGroup(ng)
+
+			fetched := &k8sv1alpha2.NodeGroup{}
+			Eventually(func() bool {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ngName}, fetched); err != nil {
+					return false
+				}
+				return len(fetched.Finalizers) > 0
+			}, e2eTimeout, e2eInterval).Should(BeTrue())
+
+			Expect(fetched.Finalizers).To(ContainElement("k8s.elx.cloud/finalizer"))
+		})
+
+		It("removes the finalizer and allows deletion once nodes are cleaned up", func() {
+			ngName := uniqueName("e2e-del")
+			ng := &k8sv1alpha2.NodeGroup{
+				ObjectMeta: metav1.ObjectMeta{Name: ngName},
+				Spec: k8sv1alpha2.NodeGroupSpec{
+					Members: []string{testNodeName},
+					Labels:  map[string]string{"e2e-del-label": "yes"},
+				},
+			}
+			Expect(k8sClient.Create(context.Background(), ng)).To(Succeed())
+
+			By("waiting for the label to appear (confirms controller is active)")
+			node := &corev1.Node{}
+			Eventually(func() string {
+				if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: testNodeName}, node); err != nil {
+					return ""
+				}
+				return node.Labels["e2e-del-label"]
+			}, e2eTimeout, e2eInterval).Should(Equal("yes"))
+
+			By("deleting the NodeGroup")
+			Expect(k8sClient.Delete(context.Background(), ng)).To(Succeed())
+
+			By("waiting for the NodeGroup to be fully deleted (finalizer removed)")
+			Eventually(func() bool {
+				fetched := &k8sv1alpha2.NodeGroup{}
+				err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ngName}, fetched)
+				return apierrors.IsNotFound(err)
+			}, e2eTimeout, e2eInterval).Should(BeTrue())
+
+			By("verifying the label was removed from the node")
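+			// The finalizer is gone, which means cleanup already completed, so a
+			// synchronous check (no Eventually) is sufficient here.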
+			Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: testNodeName}, node)).To(Succeed())
+			Expect(node.Labels).NotTo(HaveKey("e2e-del-label"))
+		})
+	})
+})