Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions charts/gpu-base-operator/templates/namespaced_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ rules:
- get
- list
- update
- watch
- apiGroups:
- apps
resources:
Expand Down
23 changes: 23 additions & 0 deletions charts/gpu-base-operator/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,23 @@ rules:
- rbac.authorization.k8s.io
resources:
- clusterrolebindings
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- rbac.authorization.k8s.io
resources:
- clusterroles
verbs:
- create
- delete
- get
- list
- update
- watch
- apiGroups:
- resource.k8s.io
resources:
Expand Down Expand Up @@ -146,3 +157,15 @@ rules:
- patch
- update
- watch
- apiGroups:
- security.openshift.io
resources:
- securitycontextconstraints
verbs:
- create
- delete
- get
- list
- update
- use
- watch
8 changes: 4 additions & 4 deletions charts/gpu-base-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ operator:
verbosity: 2
resources:
limits:
cpu: 500m
memory: 128Mi
cpu: 200m
memory: 386Mi
requests:
cpu: 10m
memory: 64Mi
cpu: 100m
memory: 256Mi

privateRegistry:
url: ""
Expand Down
8 changes: 4 additions & 4 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,11 @@ spec:
# More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
resources:
limits:
cpu: 500m
memory: 128Mi
cpu: 200m
memory: 386Mi
requests:
cpu: 10m
memory: 64Mi
cpu: 100m
memory: 256Mi
volumeMounts: []
volumes: []
serviceAccountName: controller-manager
Expand Down
1 change: 1 addition & 0 deletions config/rbac/namespaced_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ rules:
- get
- list
- update
- watch
- apiGroups:
- apps
resources:
Expand Down
23 changes: 23 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,23 @@ rules:
- rbac.authorization.k8s.io
resources:
- clusterrolebindings
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- rbac.authorization.k8s.io
resources:
- clusterroles
verbs:
- create
- delete
- get
- list
- update
- watch
- apiGroups:
- resource.k8s.io
resources:
Expand Down Expand Up @@ -146,3 +157,15 @@ rules:
- patch
- update
- watch
- apiGroups:
- security.openshift.io
resources:
- securitycontextconstraints
verbs:
- create
- delete
- get
- list
- update
- use
- watch
21 changes: 14 additions & 7 deletions internal/controller/clusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

apps "k8s.io/api/apps/v1"
core "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -105,8 +106,8 @@ func addIfMissing(slice *[]string, s string) {

// +kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch

// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles,verbs=get;list;create;delete
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;create;delete
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles,verbs=get;list;create;delete;watch;update
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;create;delete;watch
// +kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=validatingadmissionpolicies,verbs=get;list;create;delete
// +kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=validatingadmissionpolicybindings,verbs=get;list;create;delete
// +kubebuilder:rbac:groups=resource.k8s.io,resources=deviceclasses,verbs=get;list;create;delete;watch;update
Expand All @@ -119,6 +120,8 @@ func addIfMissing(slice *[]string, s string) {

// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;patch;update

// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=create;delete;get;list;watch;use;update

// Main Reconcile function for ClusterPolicy. Individual sub-controllers will be called from here to handle their
// respective resources, and any errors they return will be aggregated into the ClusterPolicy status.
func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
Expand Down Expand Up @@ -200,7 +203,9 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques

controllerutil.RemoveFinalizer(cp, clusterPolicyFinalizer)

if err := r.Update(ctx, cp); err != nil {
// The object may have been garbage-collected between the Get and this Update.
// NotFound here means the goal is already achieved.
if err := r.Update(ctx, cp); !apierrors.IsNotFound(err) {
return ctrl.Result{}, err
Comment thread
tkatila marked this conversation as resolved.
}

Expand All @@ -221,11 +226,13 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques

retErr = err

for len(cp.Status.Errors) > maxKeptErrors {
cp.Status.Errors = cp.Status.Errors[1:]
}
if cp != nil {
for len(cp.Status.Errors) > maxKeptErrors {
cp.Status.Errors = cp.Status.Errors[1:]
}

cp.Status.Errors = append(cp.Status.Errors, err.Error())
cp.Status.Errors = append(cp.Status.Errors, err.Error())
}
}
}

Expand Down
47 changes: 47 additions & 0 deletions internal/controller/deviceplugin_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ type DevicePluginReconciler struct {
const (
dpValue = "intel-gpu-plugin"
xpumdVolumeName = "runxpumd"

dpResourcePart = "gpu-dp"
)

func logLevelForDp(spec *v1alpha.ClusterPolicy) int32 {
Expand Down Expand Up @@ -196,6 +198,39 @@ func (r *DevicePluginReconciler) updateDaemonSetObject(ds *apps.DaemonSet, spec
} else {
removeXpumdMounts(cspec)
}

if r.Opts.OpenShift {
_, _, _, saName := buildOpenShiftNames(spec.Name, dpResourcePart)
cspec.ServiceAccountName = saName
}
}

// createOpenShiftResourcesIfNotExists sets up the OpenShift-specific objects
// used by the device plugin for the ClusterPolicy named cpName.
//
// All object names are derived from cpName and dpResourcePart through
// buildOpenShiftNames. The helpers are invoked in a fixed order:
// createServiceAccount, ensureSCC, createSCCRole and createSCCRoleBinding.
// The first helper that fails aborts the sequence and its error is returned
// wrapped with a message naming the step; nil is returned when every helper
// succeeds.
func (r *DevicePluginReconciler) createOpenShiftResourcesIfNotExists(ctx context.Context, cpName string) error {
	scc, role, binding, sa := buildOpenShiftNames(cpName, dpResourcePart)
	ns := r.Opts.Namespace

	err := createServiceAccount(ctx, r.Client, sa, ns)
	if err != nil {
		return fmt.Errorf("failed to ensure DP ServiceAccount: %w", err)
	}

	// The SCC object is only built once the ServiceAccount step has succeeded.
	err = ensureSCC(ctx, r.Client, buildDevicePluginSCC(scc))
	if err != nil {
		return fmt.Errorf("failed to ensure DP SCC: %w", err)
	}

	err = createSCCRole(ctx, r.Client, role, scc)
	if err != nil {
		return fmt.Errorf("failed to ensure DP SCC ClusterRole: %w", err)
	}

	err = createSCCRoleBinding(ctx, r.Client, binding, role, sa, ns)
	if err != nil {
		return fmt.Errorf("failed to ensure DP SCC ClusterRoleBinding: %w", err)
	}

	return nil
}

// cleanupOpenShiftResources hands the device plugin's OpenShift object names
// for the ClusterPolicy named cpName to deleteOpenShiftSCCResources.
//
// The names come from buildOpenShiftNames(cpName, dpResourcePart), the same
// derivation used when the objects are created. This method returns nothing,
// so no deletion outcome is reported to the caller.
func (r *DevicePluginReconciler) cleanupOpenShiftResources(ctx context.Context, cpName string) {
	scc, role, binding, sa := buildOpenShiftNames(cpName, dpResourcePart)
	deleteOpenShiftSCCResources(ctx, r.Client, scc, role, binding, sa, r.Opts.Namespace)
}

func (r *DevicePluginReconciler) createDaemonSet(ctx context.Context, obj client.Object) (ctrl.Result, error) {
Expand Down Expand Up @@ -225,6 +260,10 @@ func (r *DevicePluginReconciler) removeDeploymentIfExists(ctx context.Context) (

crName := r.Opts.ReqName

if r.Opts.OpenShift {
r.cleanupOpenShiftResources(ctx, crName)
}

dss := &apps.DaemonSetList{}
labels := client.MatchingLabels{
appLabel: dpValue,
Expand Down Expand Up @@ -272,6 +311,14 @@ func (r *DevicePluginReconciler) Reconcile(ctx context.Context, cp *v1alpha.Clus
return ctrl.Result{}, err
}

if r.Opts.OpenShift {
if err := r.createOpenShiftResourcesIfNotExists(ctx, cp.Name); err != nil {
klog.Error(err, "unable to ensure OpenShift resources for DP")

return ctrl.Result{}, err
}
}

if len(olderDs.Items) == 0 {
return r.createDaemonSet(ctx, cp)
}
Expand Down
115 changes: 113 additions & 2 deletions internal/controller/deviceplugin_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@ import (
. "github.com/onsi/gomega"
apps "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
rbac "k8s.io/api/rbac/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

v1alpha "github.com/intel/gpu-base-operator/api/v1alpha1"
"github.com/intel/gpu-base-operator/config/deployments"
)
Expand Down Expand Up @@ -356,6 +357,116 @@ var _ = Describe("ClusterPolicy Controller for Device Plugin", func() {
})
})

// Exercises the OpenShift code path of the ClusterPolicy reconciler: with
// Opts.OpenShift set, a reconcile is expected to create a ServiceAccount, SCC,
// ClusterRole and ClusterRoleBinding for the device plugin, a ServiceAccount
// and SCC for XPU manager, and to reference those ServiceAccounts from the
// generated DaemonSets.
Context("When reconciling Device Plugin and XPUM on OpenShift", func() {
	defaultNamespace := "foobar-openshift"
	const resourceName = "test-resource-ocp"

	ctx := context.Background()

	// ClusterPolicy is looked up by name only (no namespace is set on the key).
	typeNamespacedName := types.NamespacedName{Name: resourceName}

	BeforeEach(func() {
		// The reconciler places namespaced objects into Opts.Namespace, so the
		// namespace has to exist before the first reconcile.
		Expect(k8sClient.Create(ctx, &v1.Namespace{
			ObjectMeta: metav1.ObjectMeta{Name: defaultNamespace},
		})).To(Succeed())
	})

	AfterEach(func() {
		resource := &v1alpha.ClusterPolicy{}
		Expect(k8sClient.Get(ctx, typeNamespacedName, resource)).To(Succeed())
		Expect(k8sClient.Delete(ctx, resource)).To(Succeed())

		// Clean up cluster-scoped OpenShift resources created by the reconciler.
		// The ServiceAccount argument must carry the "-sa" suffix: that is the
		// name the test below observes for the created ServiceAccounts, and
		// passing the bare "<name>-gpu-dp" / "<name>-xpu-manager" would target
		// objects that were never created, leaving the real ones behind.
		deleteOpenShiftSCCResources(ctx, k8sClient,
			resourceName+"-gpu-dp-scc",
			resourceName+"-gpu-dp-scc-role",
			resourceName+"-gpu-dp-scc-binding",
			resourceName+"-gpu-dp-sa",
			defaultNamespace)
		deleteOpenShiftSCCResources(ctx, k8sClient,
			resourceName+"-xpu-manager-scc",
			resourceName+"-xpu-manager-scc-role",
			resourceName+"-xpu-manager-scc-binding",
			resourceName+"-xpu-manager-sa",
			defaultNamespace)
	})

	It("creates SCC resources for DP and XPUM and sets ServiceAccountName on DaemonSets", func() {
		By("creating the ClusterPolicy")
		Expect(k8sClient.Create(ctx, &v1alpha.ClusterPolicy{
			ObjectMeta: metav1.ObjectMeta{Name: resourceName},
			Spec: v1alpha.ClusterPolicySpec{
				ResourceRegistration: "dp",
				ResourceMonitoring:   true,
				DevicePluginSpec: v1alpha.DevicePluginSpec{
					PluginImage: "intel/intel-gpu-plugin:test",
				},
				XpuManagerSpec: v1alpha.XpuManagerSpec{
					Image: "intel/xpumanager:test",
				},
			},
		})).To(Succeed())

		reconciler := &ClusterPolicyReconciler{
			Client: k8sClient,
			Scheme: k8sClient.Scheme(),
			Opts: ControllerOpts{
				Namespace:    defaultNamespace,
				OpenShift:    true,
				RequeueDelay: time.Millisecond * 50,
			},
		}

		By("first reconcile creates SCC resources")
		_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: typeNamespacedName})
		Expect(err).NotTo(HaveOccurred())

		By("DP ServiceAccount is created")
		dpSA := &v1.ServiceAccount{}
		Expect(k8sClient.Get(ctx, client.ObjectKey{Name: resourceName + "-gpu-dp-sa", Namespace: defaultNamespace}, dpSA)).To(Succeed())

		By("DP SCC is created")
		// The SCC is fetched as an unstructured object identified by
		// sccAPIVersion and sccKind rather than through a typed struct.
		dpSCC := &unstructured.Unstructured{}
		dpSCC.SetAPIVersion(sccAPIVersion)
		dpSCC.SetKind(sccKind)
		Expect(k8sClient.Get(ctx, client.ObjectKey{Name: resourceName + "-gpu-dp-scc"}, dpSCC)).To(Succeed())

		By("DP ClusterRole is created")
		Expect(k8sClient.Get(ctx, client.ObjectKey{Name: resourceName + "-gpu-dp-scc-role"}, &rbac.ClusterRole{})).To(Succeed())

		By("DP ClusterRoleBinding is created")
		Expect(k8sClient.Get(ctx, client.ObjectKey{Name: resourceName + "-gpu-dp-scc-binding"}, &rbac.ClusterRoleBinding{})).To(Succeed())

		By("XPUM ServiceAccount is created")
		Expect(k8sClient.Get(ctx, client.ObjectKey{Name: resourceName + "-xpu-manager-sa", Namespace: defaultNamespace}, &v1.ServiceAccount{})).To(Succeed())

		By("XPUM SCC is created")
		xpumSCC := &unstructured.Unstructured{}
		xpumSCC.SetAPIVersion(sccAPIVersion)
		xpumSCC.SetKind(sccKind)
		Expect(k8sClient.Get(ctx, client.ObjectKey{Name: resourceName + "-xpu-manager-scc"}, xpumSCC)).To(Succeed())

		By("DaemonSets have correct ServiceAccountNames")
		dsList := &apps.DaemonSetList{}
		Expect(k8sClient.List(ctx, dsList, client.InNamespace(defaultNamespace))).To(Succeed())
		Expect(dsList.Items).To(HaveLen(2))
		for _, ds := range dsList.Items {
			switch ds.Name {
			case resourceName + "-device-plugin":
				Expect(ds.Spec.Template.Spec.ServiceAccountName).To(Equal(resourceName + "-gpu-dp-sa"))
			case resourceName + "-xpu-manager":
				Expect(ds.Spec.Template.Spec.ServiceAccountName).To(Equal(resourceName + "-xpu-manager-sa"))
			default:
				Fail("Unexpected DaemonSet: " + ds.Name)
			}
		}

		By("second reconcile is idempotent")
		// Running Reconcile again with everything already in place must not
		// return an error.
		_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: typeNamespacedName})
		Expect(err).NotTo(HaveOccurred())
	})
})

})

var _ = Describe("Device Plugin", func() {
Expand Down
Loading