Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 95 additions & 27 deletions internal/catalogd/serverutil/serverutil.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package serverutil

import (
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"net"
Expand All @@ -13,7 +15,7 @@ import (
"github.com/klauspost/compress/gzhttp"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/certwatcher"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/healthz"

catalogdmetrics "github.com/operator-framework/operator-controller/internal/catalogd/metrics"
"github.com/operator-framework/operator-controller/internal/catalogd/storage"
Expand All @@ -27,49 +29,115 @@ type CatalogServerConfig struct {
LocalStorage storage.Instance
}

func AddCatalogServerToManager(mgr ctrl.Manager, cfg CatalogServerConfig, tlsFileWatcher *certwatcher.CertWatcher) error {
listener, err := net.Listen("tcp", cfg.CatalogAddr)
// AddCatalogServerToManager adds the catalog HTTP server to the manager and registers
// a readiness check that passes once the server has started serving. Because
// NeedLeaderElection returns false, Start() is called on every pod immediately, so all
// replicas bind the catalog port and become ready. Non-leader pods serve requests but
// return 404 (empty local cache); callers are expected to retry.
func AddCatalogServerToManager(mgr ctrl.Manager, cfg CatalogServerConfig, cw *certwatcher.CertWatcher) error {
shutdownTimeout := 30 * time.Second
r := &catalogServerRunnable{
cfg: cfg,
cw: cw,
server: &http.Server{
Addr: cfg.CatalogAddr,
Handler: storageServerHandlerWrapped(mgr.GetLogger().WithName("catalogd-http-server"), cfg),
ReadTimeout: 5 * time.Second,
WriteTimeout: 5 * time.Minute,
},
shutdownTimeout: shutdownTimeout,
ready: make(chan struct{}),
}

if err := mgr.Add(r); err != nil {
return fmt.Errorf("error adding catalog server to manager: %w", err)
}

// Register a readiness check that passes once Start() has been called and the
// server is actively serving. All pods reach Start() (NeedLeaderElection=false),
// so all replicas become ready and receive traffic; non-leaders return 404 until
// they win the leader lease and populate their local cache.
if err := mgr.AddReadyzCheck("catalog-server", r.readyzCheck()); err != nil {
return fmt.Errorf("error adding catalog server readiness check: %w", err)
}

return nil
}

// catalogServerRunnable is a Runnable that binds the catalog HTTP port inside Start()
// on every pod: NeedLeaderElection returns false, so all replicas serve immediately.
Comment on lines +67 to +68
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Stale comment contradicts implementation.

The comment says "leader-only Runnable" but NeedLeaderElection() returns false at line 87, meaning this runs on all pods. The comment should be updated to match the implementation.

📝 Proposed fix
-// catalogServerRunnable is a leader-only Runnable that binds the catalog HTTP port
-// lazily inside Start(), so non-leader pods never hold the listen socket.
+// catalogServerRunnable is a Runnable that binds the catalog HTTP port on all pods.
+// NeedLeaderElection returns false so all replicas serve catalog requests immediately.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// catalogServerRunnable is a leader-only Runnable that binds the catalog HTTP port
// lazily inside Start(), so non-leader pods never hold the listen socket.
// catalogServerRunnable is a Runnable that binds the catalog HTTP port on all pods.
// NeedLeaderElection returns false so all replicas serve catalog requests immediately.
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/catalogd/serverutil/serverutil.go` around lines 67 - 68, The comment
for catalogServerRunnable is stale: it states "leader-only Runnable" while
NeedLeaderElection() returns false (so it runs on all pods); update the comment
above catalogServerRunnable to remove or correct the "leader-only" wording and
accurately describe that the runnable runs on all pods and lazily binds the
catalog HTTP port in Start(), ensuring the comment matches the behavior of
NeedLeaderElection() and Start().

type catalogServerRunnable struct {
	// cfg holds the catalog server configuration (listen address, TLS file paths, storage).
	cfg CatalogServerConfig
	// cw supplies TLS certificates; consulted only when CertFile/KeyFile are both set.
	cw *certwatcher.CertWatcher
	// server is the HTTP server whose lifecycle Start() manages.
	server *http.Server
	// shutdownTimeout bounds graceful shutdown after the manager context is cancelled.
	shutdownTimeout time.Duration
	// ready is closed by Start() once the server is about to begin serving.
	ready chan struct{}
}

// NeedLeaderElection reports false: the catalog server must start on every pod
// right away, regardless of which replica holds the leader lease.
//
// Gating Start() on leadership would deadlock rolling updates — the incoming pod
// could never acquire the lease (still held by the outgoing pod) and therefore
// could never pass the catalog-server readiness check.
//
// A pod that is not the leader still listens on the catalog HTTP port, but its
// local cache is empty (only the leader's reconciler downloads catalog content),
// so it answers requests with 404. Clients are expected to retry.
func (r *catalogServerRunnable) NeedLeaderElection() bool {
	return false
}
Comment on lines +78 to +87
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for catalog client code and retry/backoff patterns
rg -n -A5 'catalogd-service|/api/v1|ClusterCatalog' --type go | head -100

# Look for retry logic in catalog client
ast-grep --pattern $'func $FUNC($$$) {
  $$$
  retry($$$)
  $$$
}'

Repository: openshift/operator-framework-operator-controller

Length of output: 6971


🏁 Script executed:

#!/bin/bash
# Find files related to catalog client implementation
fd -e go | xargs rg -l 'catalogd' | grep -E '(client|catalog)' | head -20

# Search for HTTP client code making requests to catalogd
rg -n 'http.*Get|http\.Client' --type go | grep -i catalog | head -20

# Look for where catalog data is fetched
rg -n 'catalogd-service|/api/v1' --type go --no-heading | head -30

Repository: openshift/operator-framework-operator-controller

Length of output: 7441


🏁 Script executed:

#!/bin/bash
# Look for the actual catalog client implementation
find . -path ./test -prune -o -name '*catalog*.go' -type f -print | head -20

# Search for retry logic and backoff patterns
rg -n 'retry|backoff|exponential|MaxRetries' --type go | head -30

Repository: openshift/operator-framework-operator-controller

Length of output: 5753


🏁 Script executed:

#!/bin/bash
# Search in internal/operator-controller for catalog fetching logic
rg -n 'catalogd\|/api/v1/catalogs\|ClusterCatalog' internal/operator-controller --type go -B2 -A5 | head -100

Repository: openshift/operator-framework-operator-controller

Length of output: 74


🏁 Script executed:

#!/bin/bash
# Read the catalog client implementation
cat -n internal/operator-controller/catalogmetadata/client/client.go

# Also check the test file to understand expected behavior
cat -n internal/operator-controller/catalogmetadata/client/client_test.go | head -300

Repository: openshift/operator-framework-operator-controller

Length of output: 17978


Client lacks retry logic for 404 responses from non-leader pods.

The catalog client in internal/operator-controller/catalogmetadata/client/client.go does not implement retry logic. When PopulateCache() receives a non-200 status code (including 404 from non-leader pods), it caches the error without retrying. This means a request that hits a non-leader pod will cache the 404 error permanently rather than retrying against the leader pod, breaking the rolling update mechanism that depends on client retries.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/catalogd/serverutil/serverutil.go` around lines 78 - 87,
PopulateCache in internal/operator-controller/catalogmetadata/client/client.go
is caching non-200 responses (notably 404s from non-leader pods) which causes
permanent failures during rollouts; update PopulateCache to not cache error
responses and to implement retries for transient/non-leader 404s (use a small
retry loop with exponential backoff and/or attempt alternate endpoints), only
writing to the local cache when a 2xx response is received; keep existing
behavior that Start()/NeedLeaderElection() returns false but ensure
PopulateCache retries across requests before giving up and then surface a final
error without persisting the 404 into the cache.


func (r *catalogServerRunnable) Start(ctx context.Context) error {
listener, err := net.Listen("tcp", r.cfg.CatalogAddr)
if err != nil {
return fmt.Errorf("error creating catalog server listener: %w", err)
}

if cfg.CertFile != "" && cfg.KeyFile != "" {
// Use the passed certificate watcher instead of creating a new one
if r.cfg.CertFile != "" && r.cfg.KeyFile != "" {
config := &tls.Config{
GetCertificate: tlsFileWatcher.GetCertificate,
GetCertificate: r.cw.GetCertificate,
MinVersion: tls.VersionTLS12,
}
listener = tls.NewListener(listener, config)
}

shutdownTimeout := 30 * time.Second
catalogServer := manager.Server{
Name: "catalogs",
OnlyServeWhenLeader: true,
Server: &http.Server{
Addr: cfg.CatalogAddr,
Handler: storageServerHandlerWrapped(mgr.GetLogger().WithName("catalogd-http-server"), cfg),
ReadTimeout: 5 * time.Second,
// TODO: Revert this to 10 seconds if/when the API
// evolves to have significantly smaller responses
WriteTimeout: 5 * time.Minute,
},
ShutdownTimeout: &shutdownTimeout,
Listener: listener,
}
// Signal readiness before blocking on Serve so the readiness probe passes promptly.
close(r.ready)

err = mgr.Add(&catalogServer)
if err != nil {
return fmt.Errorf("error adding catalog server to manager: %w", err)
}
go func() {
<-ctx.Done()
shutdownCtx := context.Background()
if r.shutdownTimeout > 0 {
var cancel context.CancelFunc
shutdownCtx, cancel = context.WithTimeout(shutdownCtx, r.shutdownTimeout)
defer cancel()
}
if err := r.server.Shutdown(shutdownCtx); err != nil {
// Shutdown errors (e.g. context deadline exceeded) are not actionable;
// the process is terminating regardless.
_ = err
}
}()

if err := r.server.Serve(listener); err != nil && !errors.Is(err, http.ErrServerClosed) {
return err
}
return nil
}

// readyzCheck returns a healthz.Checker that reports success once Start() has
// closed r.ready, i.e. once the catalog server has begun serving.
func (r *catalogServerRunnable) readyzCheck() healthz.Checker {
	return func(_ *http.Request) error {
		select {
		case <-r.ready:
		default:
			return fmt.Errorf("catalog server not yet started")
		}
		return nil
	}
}

func logrLoggingHandler(l logr.Logger, handler http.Handler) http.Handler {
return handlers.CustomLoggingHandler(nil, handler, func(_ io.Writer, params handlers.LogFormatterParams) {
// extract parameters used in apache common log format, but then log using `logr` to remain consistent
// with other loggers used in this codebase.
username := "-"
if params.URL.User != nil {
if name := params.URL.User.Username(); name != "" {
Expand Down
19 changes: 18 additions & 1 deletion openshift/catalogd/manifests-experimental.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,23 @@ spec:
- Ingress
- Egress
---
# Source: olmv1/templates/poddisruptionbudget-olmv1-system-catalogd.yml
# PodDisruptionBudget for the catalogd controller manager (experimental feature set).
# The deployment in this manifest runs 2 replicas; minAvailable: 1 therefore allows
# at most one pod to be voluntarily evicted at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: catalogd-controller-manager
  namespace: openshift-catalogd
  labels:
    app.kubernetes.io/name: catalogd
    app.kubernetes.io/part-of: olm
  annotations:
    olm.operatorframework.io/feature-set: experimental
spec:
  # Keep at least one catalogd pod running during voluntary disruptions.
  minAvailable: 1
  selector:
    matchLabels:
      control-plane: catalogd-controller-manager
---
# Source: olmv1/templates/serviceaccount-olmv1-system-common-controller-manager.yml
apiVersion: v1
kind: ServiceAccount
Expand Down Expand Up @@ -825,7 +842,7 @@ metadata:
namespace: openshift-catalogd
spec:
minReadySeconds: 5
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down
19 changes: 18 additions & 1 deletion openshift/catalogd/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,23 @@ spec:
- Ingress
- Egress
---
# Source: olmv1/templates/poddisruptionbudget-olmv1-system-catalogd.yml
# PodDisruptionBudget for the catalogd controller manager (standard feature set).
# The deployment in this manifest runs 2 replicas; minAvailable: 1 therefore allows
# at most one pod to be voluntarily evicted at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: catalogd-controller-manager
  namespace: openshift-catalogd
  labels:
    app.kubernetes.io/name: catalogd
    app.kubernetes.io/part-of: olm
  annotations:
    olm.operatorframework.io/feature-set: standard
spec:
  # Keep at least one catalogd pod running during voluntary disruptions.
  minAvailable: 1
  selector:
    matchLabels:
      control-plane: catalogd-controller-manager
---
# Source: olmv1/templates/serviceaccount-olmv1-system-common-controller-manager.yml
apiVersion: v1
kind: ServiceAccount
Expand Down Expand Up @@ -825,7 +842,7 @@ metadata:
namespace: openshift-catalogd
spec:
minReadySeconds: 5
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down
3 changes: 1 addition & 2 deletions openshift/helm/catalogd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ options:
enabled: true
deployment:
image: ${CATALOGD_IMAGE}
podDisruptionBudget:
enabled: false
replicas: 2
operatorController:
enabled: false
openshift:
Expand Down
3 changes: 1 addition & 2 deletions openshift/helm/operator-controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ options:
enabled: true
deployment:
image: ${OPERATOR_CONTROLLER_IMAGE}
podDisruptionBudget:
enabled: false
replicas: 2
catalogd:
enabled: false
openshift:
Expand Down
19 changes: 18 additions & 1 deletion openshift/operator-controller/manifests-experimental.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,23 @@ spec:
- Ingress
- Egress
---
# Source: olmv1/templates/poddisruptionbudget-olmv1-system-operator-controller.yml
# PodDisruptionBudget for the operator-controller manager (experimental feature set).
# The deployment in this manifest runs 2 replicas; minAvailable: 1 therefore allows
# at most one pod to be voluntarily evicted at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: operator-controller-controller-manager
  namespace: openshift-operator-controller
  labels:
    app.kubernetes.io/name: operator-controller
    app.kubernetes.io/part-of: olm
  annotations:
    olm.operatorframework.io/feature-set: experimental
spec:
  # Keep at least one operator-controller pod running during voluntary disruptions.
  minAvailable: 1
  selector:
    matchLabels:
      control-plane: operator-controller-controller-manager
---
# Source: olmv1/templates/serviceaccount-olmv1-system-common-controller-manager.yml
apiVersion: v1
kind: ServiceAccount
Expand Down Expand Up @@ -1184,7 +1201,7 @@ metadata:
name: operator-controller-controller-manager
namespace: openshift-operator-controller
spec:
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down
19 changes: 18 additions & 1 deletion openshift/operator-controller/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,23 @@ spec:
- Ingress
- Egress
---
# Source: olmv1/templates/poddisruptionbudget-olmv1-system-operator-controller.yml
# PodDisruptionBudget for the operator-controller manager (standard feature set).
# The deployment in this manifest runs 2 replicas; minAvailable: 1 therefore allows
# at most one pod to be voluntarily evicted at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: operator-controller-controller-manager
  namespace: openshift-operator-controller
  labels:
    app.kubernetes.io/name: operator-controller
    app.kubernetes.io/part-of: olm
  annotations:
    olm.operatorframework.io/feature-set: standard
spec:
  # Keep at least one operator-controller pod running during voluntary disruptions.
  minAvailable: 1
  selector:
    matchLabels:
      control-plane: operator-controller-controller-manager
---
# Source: olmv1/templates/serviceaccount-olmv1-system-common-controller-manager.yml
apiVersion: v1
kind: ServiceAccount
Expand Down Expand Up @@ -1056,7 +1073,7 @@ metadata:
name: operator-controller-controller-manager
namespace: openshift-operator-controller
spec:
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down