Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 95 additions & 27 deletions internal/catalogd/serverutil/serverutil.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package serverutil

import (
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"net"
Expand All @@ -13,7 +15,7 @@ import (
"github.com/klauspost/compress/gzhttp"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/certwatcher"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/healthz"

catalogdmetrics "github.com/operator-framework/operator-controller/internal/catalogd/metrics"
"github.com/operator-framework/operator-controller/internal/catalogd/storage"
Expand All @@ -27,49 +29,115 @@ type CatalogServerConfig struct {
LocalStorage storage.Instance
}

func AddCatalogServerToManager(mgr ctrl.Manager, cfg CatalogServerConfig, tlsFileWatcher *certwatcher.CertWatcher) error {
listener, err := net.Listen("tcp", cfg.CatalogAddr)
// AddCatalogServerToManager adds the catalog HTTP server to the manager and registers
// a readiness check that passes once the server has started serving. Because
// NeedLeaderElection returns false, Start() is called on every pod immediately, so all
// replicas bind the catalog port and become ready. Non-leader pods serve requests but
// return 404 (empty local cache); callers are expected to retry.
func AddCatalogServerToManager(mgr ctrl.Manager, cfg CatalogServerConfig, cw *certwatcher.CertWatcher) error {
shutdownTimeout := 30 * time.Second
r := &catalogServerRunnable{
cfg: cfg,
cw: cw,
server: &http.Server{
Addr: cfg.CatalogAddr,
Handler: storageServerHandlerWrapped(mgr.GetLogger().WithName("catalogd-http-server"), cfg),
ReadTimeout: 5 * time.Second,
WriteTimeout: 5 * time.Minute,
},
shutdownTimeout: shutdownTimeout,
ready: make(chan struct{}),
}

if err := mgr.Add(r); err != nil {
return fmt.Errorf("error adding catalog server to manager: %w", err)
}

// Register a readiness check that passes once Start() has been called and the
// server is actively serving. All pods reach Start() (NeedLeaderElection=false),
// so all replicas become ready and receive traffic; non-leaders return 404 until
// they win the leader lease and populate their local cache.
if err := mgr.AddReadyzCheck("catalog-server", r.readyzCheck()); err != nil {
return fmt.Errorf("error adding catalog server readiness check: %w", err)
}

return nil
}

// catalogServerRunnable is a Runnable that binds the catalog HTTP port inside Start()
// on every pod: NeedLeaderElection returns false, so all replicas serve immediately.
Comment on lines +67 to +68
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Stale comment contradicts implementation.

The comment says "leader-only Runnable" but NeedLeaderElection() returns false at line 87, meaning this runs on all pods. The comment should be updated to match the implementation.

📝 Proposed fix
-// catalogServerRunnable is a leader-only Runnable that binds the catalog HTTP port
-// lazily inside Start(), so non-leader pods never hold the listen socket.
+// catalogServerRunnable is a Runnable that binds the catalog HTTP port on all pods.
+// NeedLeaderElection returns false so all replicas serve catalog requests immediately.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// catalogServerRunnable is a leader-only Runnable that binds the catalog HTTP port
// lazily inside Start(), so non-leader pods never hold the listen socket.
// catalogServerRunnable is a Runnable that binds the catalog HTTP port on all pods.
// NeedLeaderElection returns false so all replicas serve catalog requests immediately.
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/catalogd/serverutil/serverutil.go` around lines 67 - 68, The comment
for catalogServerRunnable is stale: it states "leader-only Runnable" while
NeedLeaderElection() returns false (so it runs on all pods); update the comment
above catalogServerRunnable to remove or correct the "leader-only" wording and
accurately describe that the runnable runs on all pods and lazily binds the
catalog HTTP port in Start(), ensuring the comment matches the behavior of
NeedLeaderElection() and Start().

type catalogServerRunnable struct {
	// cfg holds the catalog server configuration (listen address, TLS file paths, storage).
	cfg CatalogServerConfig
	// cw supplies TLS certificates; consulted only when CertFile/KeyFile are both set.
	cw *certwatcher.CertWatcher
	// server is the HTTP server whose lifecycle Start() manages.
	server *http.Server
	// shutdownTimeout bounds graceful shutdown after the manager context is cancelled.
	shutdownTimeout time.Duration
	// ready is closed by Start() once the server is about to begin serving.
	ready chan struct{}
}

// NeedLeaderElection reports false: the catalog server must start on every pod
// right away, regardless of which replica holds the leader lease.
//
// Gating Start() on leadership would deadlock rolling updates — the incoming pod
// could never acquire the lease (still held by the outgoing pod) and therefore
// could never pass the catalog-server readiness check.
//
// A pod that is not the leader still listens on the catalog HTTP port, but its
// local cache is empty (only the leader's reconciler downloads catalog content),
// so it answers requests with 404. Clients are expected to retry.
func (r *catalogServerRunnable) NeedLeaderElection() bool {
	return false
}
Comment on lines +78 to +87
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for catalog client code and retry/backoff patterns
rg -n -A5 'catalogd-service|/api/v1|ClusterCatalog' --type go | head -100

# Look for retry logic in catalog client
ast-grep --pattern $'func $FUNC($$$) {
  $$$
  retry($$$)
  $$$
}'

Repository: openshift/operator-framework-operator-controller

Length of output: 6971


🏁 Script executed:

#!/bin/bash
# Find files related to catalog client implementation
fd -e go | xargs rg -l 'catalogd' | grep -E '(client|catalog)' | head -20

# Search for HTTP client code making requests to catalogd
rg -n 'http.*Get|http\.Client' --type go | grep -i catalog | head -20

# Look for where catalog data is fetched
rg -n 'catalogd-service|/api/v1' --type go --no-heading | head -30

Repository: openshift/operator-framework-operator-controller

Length of output: 7441


🏁 Script executed:

#!/bin/bash
# Look for the actual catalog client implementation
find . -path ./test -prune -o -name '*catalog*.go' -type f -print | head -20

# Search for retry logic and backoff patterns
rg -n 'retry|backoff|exponential|MaxRetries' --type go | head -30

Repository: openshift/operator-framework-operator-controller

Length of output: 5753


🏁 Script executed:

#!/bin/bash
# Search in internal/operator-controller for catalog fetching logic
rg -n 'catalogd\|/api/v1/catalogs\|ClusterCatalog' internal/operator-controller --type go -B2 -A5 | head -100

Repository: openshift/operator-framework-operator-controller

Length of output: 74


🏁 Script executed:

#!/bin/bash
# Read the catalog client implementation
cat -n internal/operator-controller/catalogmetadata/client/client.go

# Also check the test file to understand expected behavior
cat -n internal/operator-controller/catalogmetadata/client/client_test.go | head -300

Repository: openshift/operator-framework-operator-controller

Length of output: 17978


Client lacks retry logic for 404 responses from non-leader pods.

The catalog client in internal/operator-controller/catalogmetadata/client/client.go does not implement retry logic. When PopulateCache() receives a non-200 status code (including 404 from non-leader pods), it caches the error without retrying. This means a request that hits a non-leader pod will cache the 404 error permanently rather than retrying against the leader pod, breaking the rolling update mechanism that depends on client retries.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/catalogd/serverutil/serverutil.go` around lines 78 - 87,
PopulateCache in internal/operator-controller/catalogmetadata/client/client.go
is caching non-200 responses (notably 404s from non-leader pods) which causes
permanent failures during rollouts; update PopulateCache to not cache error
responses and to implement retries for transient/non-leader 404s (use a small
retry loop with exponential backoff and/or attempt alternate endpoints), only
writing to the local cache when a 2xx response is received; keep existing
behavior that Start()/NeedLeaderElection() returns false but ensure
PopulateCache retries across requests before giving up and then surface a final
error without persisting the 404 into the cache.


func (r *catalogServerRunnable) Start(ctx context.Context) error {
listener, err := net.Listen("tcp", r.cfg.CatalogAddr)
if err != nil {
return fmt.Errorf("error creating catalog server listener: %w", err)
}

if cfg.CertFile != "" && cfg.KeyFile != "" {
// Use the passed certificate watcher instead of creating a new one
if r.cfg.CertFile != "" && r.cfg.KeyFile != "" {
config := &tls.Config{
GetCertificate: tlsFileWatcher.GetCertificate,
GetCertificate: r.cw.GetCertificate,
MinVersion: tls.VersionTLS12,
}
listener = tls.NewListener(listener, config)
}

shutdownTimeout := 30 * time.Second
catalogServer := manager.Server{
Name: "catalogs",
OnlyServeWhenLeader: true,
Server: &http.Server{
Addr: cfg.CatalogAddr,
Handler: storageServerHandlerWrapped(mgr.GetLogger().WithName("catalogd-http-server"), cfg),
ReadTimeout: 5 * time.Second,
// TODO: Revert this to 10 seconds if/when the API
// evolves to have significantly smaller responses
WriteTimeout: 5 * time.Minute,
},
ShutdownTimeout: &shutdownTimeout,
Listener: listener,
}
// Signal readiness before blocking on Serve so the readiness probe passes promptly.
close(r.ready)

err = mgr.Add(&catalogServer)
if err != nil {
return fmt.Errorf("error adding catalog server to manager: %w", err)
}
go func() {
<-ctx.Done()
shutdownCtx := context.Background()
if r.shutdownTimeout > 0 {
var cancel context.CancelFunc
shutdownCtx, cancel = context.WithTimeout(shutdownCtx, r.shutdownTimeout)
defer cancel()
}
if err := r.server.Shutdown(shutdownCtx); err != nil {
// Shutdown errors (e.g. context deadline exceeded) are not actionable;
// the process is terminating regardless.
_ = err
}
}()

if err := r.server.Serve(listener); err != nil && !errors.Is(err, http.ErrServerClosed) {
return err
}
return nil
}

// readyzCheck returns a healthz.Checker that reports success once Start() has
// closed r.ready, i.e. once the catalog server has begun serving.
func (r *catalogServerRunnable) readyzCheck() healthz.Checker {
	return func(_ *http.Request) error {
		select {
		case <-r.ready:
		default:
			return fmt.Errorf("catalog server not yet started")
		}
		return nil
	}
}

func logrLoggingHandler(l logr.Logger, handler http.Handler) http.Handler {
return handlers.CustomLoggingHandler(nil, handler, func(_ io.Writer, params handlers.LogFormatterParams) {
// extract parameters used in apache common log format, but then log using `logr` to remain consistent
// with other loggers used in this codebase.
username := "-"
if params.URL.User != nil {
if name := params.URL.User.Username(); name != "" {
Expand Down
19 changes: 18 additions & 1 deletion openshift/catalogd/manifests-experimental.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,23 @@ spec:
- Ingress
- Egress
---
# Source: olmv1/templates/poddisruptionbudget-olmv1-system-catalogd.yml
# PodDisruptionBudget for the catalogd controller manager (experimental feature set).
# The deployment in this manifest runs 2 replicas; minAvailable: 1 therefore allows
# at most one pod to be voluntarily evicted at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: catalogd-controller-manager
  namespace: openshift-catalogd
  labels:
    app.kubernetes.io/name: catalogd
    app.kubernetes.io/part-of: olm
  annotations:
    olm.operatorframework.io/feature-set: experimental
spec:
  # Keep at least one catalogd pod running during voluntary disruptions.
  minAvailable: 1
  selector:
    matchLabels:
      control-plane: catalogd-controller-manager
---
# Source: olmv1/templates/serviceaccount-olmv1-system-common-controller-manager.yml
apiVersion: v1
kind: ServiceAccount
Expand Down Expand Up @@ -825,7 +842,7 @@ metadata:
namespace: openshift-catalogd
spec:
minReadySeconds: 5
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down
19 changes: 18 additions & 1 deletion openshift/catalogd/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,23 @@ spec:
- Ingress
- Egress
---
# Source: olmv1/templates/poddisruptionbudget-olmv1-system-catalogd.yml
# PodDisruptionBudget for the catalogd controller manager (standard feature set).
# The deployment in this manifest runs 2 replicas; minAvailable: 1 therefore allows
# at most one pod to be voluntarily evicted at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: catalogd-controller-manager
  namespace: openshift-catalogd
  labels:
    app.kubernetes.io/name: catalogd
    app.kubernetes.io/part-of: olm
  annotations:
    olm.operatorframework.io/feature-set: standard
spec:
  # Keep at least one catalogd pod running during voluntary disruptions.
  minAvailable: 1
  selector:
    matchLabels:
      control-plane: catalogd-controller-manager
---
# Source: olmv1/templates/serviceaccount-olmv1-system-common-controller-manager.yml
apiVersion: v1
kind: ServiceAccount
Expand Down Expand Up @@ -825,7 +842,7 @@ metadata:
namespace: openshift-catalogd
spec:
minReadySeconds: 5
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down
3 changes: 1 addition & 2 deletions openshift/helm/catalogd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ options:
enabled: true
deployment:
image: ${CATALOGD_IMAGE}
podDisruptionBudget:
enabled: false
replicas: 2
operatorController:
enabled: false
openshift:
Expand Down
3 changes: 1 addition & 2 deletions openshift/helm/operator-controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ options:
enabled: true
deployment:
image: ${OPERATOR_CONTROLLER_IMAGE}
podDisruptionBudget:
enabled: false
replicas: 2
catalogd:
enabled: false
openshift:
Expand Down
19 changes: 18 additions & 1 deletion openshift/operator-controller/manifests-experimental.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,23 @@ spec:
- Ingress
- Egress
---
# Source: olmv1/templates/poddisruptionbudget-olmv1-system-operator-controller.yml
# PodDisruptionBudget for the operator-controller manager (experimental feature set).
# The deployment in this manifest runs 2 replicas; minAvailable: 1 therefore allows
# at most one pod to be voluntarily evicted at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: operator-controller-controller-manager
  namespace: openshift-operator-controller
  labels:
    app.kubernetes.io/name: operator-controller
    app.kubernetes.io/part-of: olm
  annotations:
    olm.operatorframework.io/feature-set: experimental
spec:
  # Keep at least one operator-controller pod running during voluntary disruptions.
  minAvailable: 1
  selector:
    matchLabels:
      control-plane: operator-controller-controller-manager
---
# Source: olmv1/templates/serviceaccount-olmv1-system-common-controller-manager.yml
apiVersion: v1
kind: ServiceAccount
Expand Down Expand Up @@ -1184,7 +1201,7 @@ metadata:
name: operator-controller-controller-manager
namespace: openshift-operator-controller
spec:
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down
19 changes: 18 additions & 1 deletion openshift/operator-controller/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,23 @@ spec:
- Ingress
- Egress
---
# Source: olmv1/templates/poddisruptionbudget-olmv1-system-operator-controller.yml
# PodDisruptionBudget for the operator-controller manager (standard feature set).
# The deployment in this manifest runs 2 replicas; minAvailable: 1 therefore allows
# at most one pod to be voluntarily evicted at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: operator-controller-controller-manager
  namespace: openshift-operator-controller
  labels:
    app.kubernetes.io/name: operator-controller
    app.kubernetes.io/part-of: olm
  annotations:
    olm.operatorframework.io/feature-set: standard
spec:
  # Keep at least one operator-controller pod running during voluntary disruptions.
  minAvailable: 1
  selector:
    matchLabels:
      control-plane: operator-controller-controller-manager
---
# Source: olmv1/templates/serviceaccount-olmv1-system-common-controller-manager.yml
apiVersion: v1
kind: ServiceAccount
Expand Down Expand Up @@ -1056,7 +1073,7 @@ metadata:
name: operator-controller-controller-manager
namespace: openshift-operator-controller
spec:
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down