From b4dd9f996239f7cf8becd6d6459137a048dcb088 Mon Sep 17 00:00:00 2001 From: Matt Clark Date: Mon, 15 Jun 2026 14:51:27 -0700 Subject: [PATCH] ROSAENG-2057: add pull-secret update, audit, and validate subcommands --- cmd/cluster/cmd.go | 4 +- cmd/cluster/pullsecret.go | 37 + cmd/cluster/pullsecretstatus.go | 420 +++++++ cmd/cluster/pullsecretstatus_test.go | 86 ++ cmd/cluster/replacepullsecret.go | 932 +++++++++++++++ cmd/cluster/validatepullsecret.go | 2 + cmd/cluster/validatepullsecretext.go | 6 + cmd/servicelog/post.go | 4 + docs/README.md | 185 ++- docs/osdctl_cluster.md | 2 +- docs/osdctl_cluster_pull-secret.md | 36 + docs/osdctl_cluster_pull-secret_audit.md | 64 + docs/osdctl_cluster_pull-secret_update.md | 66 ++ docs/osdctl_cluster_pull-secret_validate.md | 65 ++ go.mod | 2 +- pkg/controller/pullsecret.go | 1157 +++++++++++++++++++ pkg/controller/pullsecret_test.go | 820 +++++++++++++ pkg/controller/pullsecretop.go | 245 ++++ 18 files changed, 4090 insertions(+), 43 deletions(-) create mode 100644 cmd/cluster/pullsecret.go create mode 100644 cmd/cluster/pullsecretstatus.go create mode 100644 cmd/cluster/pullsecretstatus_test.go create mode 100644 cmd/cluster/replacepullsecret.go create mode 100644 docs/osdctl_cluster_pull-secret.md create mode 100644 docs/osdctl_cluster_pull-secret_audit.md create mode 100644 docs/osdctl_cluster_pull-secret_update.md create mode 100644 docs/osdctl_cluster_pull-secret_validate.md create mode 100644 pkg/controller/pullsecret.go create mode 100644 pkg/controller/pullsecret_test.go create mode 100644 pkg/controller/pullsecretop.go diff --git a/cmd/cluster/cmd.go b/cmd/cluster/cmd.go index 8aca31a86..ac29c4d6b 100644 --- a/cmd/cluster/cmd.go +++ b/cmd/cluster/cmd.go @@ -33,11 +33,13 @@ func NewCmdCluster(streams genericclioptions.IOStreams, client *k8s.LazyClient, clusterCmd.AddCommand(newCmdResync()) clusterCmd.AddCommand(newCmdContext()) clusterCmd.AddCommand(newCmdTransferOwner(streams, globalOpts)) + 
clusterCmd.AddCommand(newCmdPullSecret(streams, globalOpts)) + clusterCmd.AddCommand(newCmdReplacePullSecretDeprecated(streams, globalOpts)) clusterCmd.AddCommand(access.NewCmdAccess(streams, client)) clusterCmd.AddCommand(newCmdCpd()) clusterCmd.AddCommand(newCmdCheckBannedUser()) clusterCmd.AddCommand(newCmdValidatePullSecret()) - clusterCmd.AddCommand(newCmdValidatePullSecretExt()) + clusterCmd.AddCommand(newCmdValidatePullSecretExtDeprecated()) clusterCmd.AddCommand(newCmdEtcdHealthCheck()) clusterCmd.AddCommand(newCmdEtcdMemberReplacement()) clusterCmd.AddCommand(newCmdFromInfraId(globalOpts)) diff --git a/cmd/cluster/pullsecret.go b/cmd/cluster/pullsecret.go new file mode 100644 index 000000000..8ba14c9cb --- /dev/null +++ b/cmd/cluster/pullsecret.go @@ -0,0 +1,37 @@ +package cluster + +import ( + "github.com/spf13/cobra" + "k8s.io/cli-runtime/pkg/genericclioptions" + + "github.com/openshift/osdctl/internal/utils/globalflags" +) + +func newCmdPullSecret(streams genericclioptions.IOStreams, globalOpts *globalflags.GlobalOptions) *cobra.Command { + cmd := &cobra.Command{ + Use: "pull-secret", + Short: "Diagnose and manage cluster pull secrets", + Long: "Diagnose and manage cluster pull secrets.", + DisableAutoGenTag: true, + } + + cmd.AddCommand(newCmdPullSecretAudit(streams, globalOpts)) + cmd.AddCommand(newCmdPullSecretUpdate(streams, globalOpts)) + cmd.AddCommand(newCmdPullSecretValidate()) + + return cmd +} + +func newCmdPullSecretValidate() *cobra.Command { + cmd := newCmdValidatePullSecretExt() + cmd.Use = "validate" + cmd.Example = ` # Compare OCM Access-Token, OCM Registry-Credentials, and OCM Account Email against cluster's pull secret + osdctl cluster pull-secret validate --cluster-id ${CLUSTER_ID} --reason "OSD-XYZ" + + # Exclude Access-Token, and Registry-Credential checks... 
+ osdctl cluster pull-secret validate --cluster-id ${CLUSTER_ID} --reason "OSD-XYZ" --skip-access-token --skip-registry-creds + + # Skip sending service logs (useful for testing) + osdctl cluster pull-secret validate --cluster-id ${CLUSTER_ID} --reason "OSD-XYZ" --skip-service-logs` + return cmd +} diff --git a/cmd/cluster/pullsecretstatus.go b/cmd/cluster/pullsecretstatus.go new file mode 100644 index 000000000..49979b230 --- /dev/null +++ b/cmd/cluster/pullsecretstatus.go @@ -0,0 +1,420 @@ +package cluster + +import ( + "context" + "fmt" + "io" + "os" + "regexp" + "time" + + "github.com/fatih/color" + "github.com/olekukonko/tablewriter" + amv1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1" + "github.com/sirupsen/logrus" + "github.com/spf13/cobra" + "go.uber.org/zap/zapcore" + "k8s.io/cli-runtime/pkg/genericclioptions" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + "github.com/openshift/osdctl/cmd/common" + "github.com/openshift/osdctl/internal/utils/globalflags" + "github.com/openshift/osdctl/pkg/controller" + "github.com/openshift/osdctl/pkg/utils" +) + +// nolint:gosec +const pullSecStatusUsageTemplate = `Usage:{{if .Runnable}} + {{.UseLine}}{{end}}{{if gt (len .Aliases) 0}} + +Aliases: + {{.NameAndAliases}}{{end}}{{if .HasExample}} + +Examples: +{{.Example}}{{end}} + +Required Flags (one of --cluster-id or --account-id): + -C, --cluster-id string Any cluster owned by the account (used to resolve the owner) + -A, --account-id string OCM account ID directly + --reason string Elevation reason for cluster connections + +Optional Flags: + --validate Validate all clusters' pull secrets against OCM +` + +type pullSecretAuditOptions struct { + clusterID string + accountID string + reason string + validate bool + logger *logrus.Logger + + genericclioptions.IOStreams + GlobalOptions *globalflags.GlobalOptions +} + +func newCmdPullSecretAudit(streams genericclioptions.IOStreams, globalOpts 
*globalflags.GlobalOptions) *cobra.Command { + ops := &pullSecretAuditOptions{ + IOStreams: streams, + GlobalOptions: globalOpts, + logger: newAuditLogger(), + } + cmd := &cobra.Command{ + Use: "audit", + Short: "Audit pull secret status for all clusters owned by an account", + Long: `Audit pull secret status for all clusters sharing the same OCM account. + +Given a cluster ID or account ID, resolves the owner account and lists all +clusters owned by that account. Compares cluster creation dates against the +account's registry credential update timestamps to flag clusters that may +have stale pull secrets. + +Use --validate to connect to each cluster and compare its pull secret +against the OCM access token and registry credential auths. + +For validating a single cluster, use 'osdctl cluster pull-secret validate'.`, + Example: ` # Overview of all clusters for the account + osdctl cluster pull-secret audit -C 1kfmyclusterid --reason "OHSS-1234" + + # Using account ID directly + osdctl cluster pull-secret audit -A 2g9OLHPkwDDcXvq2mt7kjfIQ0gf --reason "OHSS-1234" + + # Validate all clusters' pull secrets against OCM + osdctl cluster pull-secret audit -C 1kfmyclusterid --reason "OHSS-1234" --validate`, + Args: cobra.NoArgs, + DisableAutoGenTag: true, + SilenceUsage: true, + PreRunE: func(cmd *cobra.Command, args []string) error { + if ops.clusterID == "" && ops.accountID == "" { + return fmt.Errorf("one of --cluster-id or --account-id is required") + } + if ops.clusterID != "" && ops.accountID != "" { + return fmt.Errorf("--cluster-id and --account-id are mutually exclusive") + } + idPattern := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`) + if ops.accountID != "" && !idPattern.MatchString(ops.accountID) { + return fmt.Errorf("--account-id contains invalid characters") + } + if ops.clusterID != "" && !idPattern.MatchString(ops.clusterID) { + return fmt.Errorf("--cluster-id contains invalid characters") + } + return nil + }, + RunE: func(cmd *cobra.Command, args []string) 
error { + return ops.run(cmd.Context()) + }, + } + + cmd.Flags().StringVarP(&ops.clusterID, "cluster-id", "C", "", "Any cluster owned by the account (used to resolve the owner)") + cmd.Flags().StringVarP(&ops.accountID, "account-id", "A", "", "OCM account ID directly") + cmd.Flags().StringVar(&ops.reason, "reason", "", "Elevation reason for cluster connections") + cmd.Flags().BoolVar(&ops.validate, "validate", false, "Validate all clusters' pull secrets against OCM") + + if err := cmd.MarkFlagRequired("reason"); err != nil { + panic(fmt.Sprintf("failed to mark 'reason' as required: %v", err)) + } + + cmd.SetUsageTemplate(pullSecStatusUsageTemplate) + + return cmd +} + +func newAuditLogger() *logrus.Logger { + l := logrus.New() + l.SetOutput(os.Stderr) + l.SetFormatter(&logrus.TextFormatter{ + FullTimestamp: true, + TimestampFormat: "15:04:05", + ForceColors: true, + }) + l.SetLevel(logrus.InfoLevel) + return l +} + +func (o *pullSecretAuditOptions) run(ctx context.Context) error { + out := o.Out + logger := o.logger + + log.SetLogger(zap.New(zap.WriteTo(o.ErrOut), zap.Level(zapcore.WarnLevel))) + + logger.Info("Creating OCM connection") + ocm, err := utils.CreateConnection() + if err != nil { + return fmt.Errorf("failed to create OCM client: %w", err) + } + defer func() { + if closeErr := ocm.Close(); closeErr != nil { + logger.Warnf("Cannot close the OCM connection: %v", closeErr) + } + }() + + // Resolve account — from cluster ID or directly + var ownerAccountID, ownerUsername, ownerEmail string + + if o.accountID != "" { + ownerAccount, err := utils.GetAccount(ocm, o.accountID) + if err != nil { + return fmt.Errorf("failed to get account %s: %w", o.accountID, err) + } + ownerAccountID = ownerAccount.ID() + ownerUsername = ownerAccount.Username() + ownerEmail = ownerAccount.Email() + logger.Infof("Account resolved: %s (%s)", ownerUsername, ownerAccountID) + } else { + cluster, err := utils.GetClusterAnyStatus(ocm, o.clusterID) + if err != nil { + return 
fmt.Errorf("failed to get cluster: %w", err) + } + logger.Infof("Cluster resolved: %s (%s)", cluster.Name(), cluster.ID()) + + subscription, err := utils.GetSubscription(ocm, cluster.ID()) + if err != nil { + return fmt.Errorf("failed to get subscription: %w", err) + } + + ownerAccount, err := utils.GetAccount(ocm, subscription.Creator().ID()) + if err != nil { + return fmt.Errorf("failed to get owner account: %w", err) + } + ownerAccountID = ownerAccount.ID() + ownerUsername = ownerAccount.Username() + ownerEmail = ownerAccount.Email() + logger.Infof("Owner resolved: %s (account: %s)", ownerUsername, ownerAccountID) + } + + logger.Info("Fetching registry credentials from OCM") + latestCredUpdate, err := controller.GetLatestCredentialUpdate(ocm, ownerAccountID) + if err != nil { + logger.Warnf("Could not fetch registry credentials: %v", err) + } + + logger.Info("Querying clusters for this account") + clusters, err := controller.ListOwnerSubscriptions(ocm, ownerAccountID) + if err != nil { + return fmt.Errorf("failed to list subscriptions: %w", err) + } + + // Fetch OCM data and collect validation results if --validate + type checkResult struct { + accessTokenResult *controller.PullSecretVerifyResult + regCredResult *controller.PullSecretVerifyResult + err error + } + checkResults := make(map[string]*checkResult) + + var auths map[string]*amv1.AccessTokenAuth + hasAccessToken := false + hasRegCreds := false + + if o.validate { + logger.Infof("Fetching access token from OCM for owner '%s'", ownerUsername) + _, auths, err = controller.FetchOwnerAccessToken(ocm, ownerUsername, logger) + if err != nil { + logger.Warnf("Could not fetch access token: %v", err) + fmt.Fprintf(out, "\n%s Could not fetch OCM access token (may require region-lead permissions).\n", colorWarn("[WARN]")) + fmt.Fprint(out, "Continue with registry credentials only? 
") + if !utils.ConfirmPrompt() { + o.validate = false + } + } else { + hasAccessToken = true + logger.Infof("Retrieved %d auth entries from OCM access token", len(auths)) + } + + if o.validate { + logger.Info("Fetching registry credentials from OCM") + testCreds, regErr := utils.GetRegistryCredentials(ocm, ownerAccountID) + if regErr != nil || len(testCreds) == 0 { + logger.Warnf("Could not fetch registry credentials: %v", regErr) + } else { + hasRegCreds = true + logger.Infof("Retrieved %d registry credentials from OCM", len(testCreds)) + } + } + + if !hasAccessToken && !hasRegCreds { + logger.Warn("Neither access token nor registry credentials available — skipping validation") + fmt.Fprintf(out, "%s Cannot compare cluster pull secrets without OCM data. Skipping --validate.\n", colorWarn("[WARN]")) + o.validate = false + } + } + + // Connect to clusters and collect results + if o.validate { + elevationReasons := []string{ + o.reason, + "Checking pull secret status using osdctl pull-secret audit", + } + + for _, c := range clusters { + cr := &checkResult{} + logger.Infof("Connecting to cluster %s (%s)", c.Name, c.ID) + _, _, clientset, connErr := common.GetKubeConfigAndClient(c.ID, elevationReasons...) 
+ if connErr != nil { + cr.err = fmt.Errorf("failed to connect: %v", connErr) + checkResults[c.ID] = cr + continue + } + + if hasAccessToken { + result, verifyErr := controller.CompareAccessTokenAuthsToCluster(ctx, clientset, auths, nil) + if verifyErr != nil { + logger.Warnf("Access token verification failed for %s: %v", c.ID, verifyErr) + } else { + cr.accessTokenResult = result + } + } + + if hasRegCreds { + result, verifyErr := controller.CompareRegistryCredentialAuthsToCluster(ctx, ocm, clientset, ownerAccountID, ownerEmail, nil) + if verifyErr != nil { + logger.Warnf("Registry credential verification failed for %s: %v", c.ID, verifyErr) + } else { + cr.regCredResult = result + } + } + + checkResults[c.ID] = cr + } + } + + // --- Render --- + + fmt.Fprintf(out, "\n============================================================\n") + fmt.Fprintf(out, " Owner: %s (account: %s)\n", ownerUsername, ownerAccountID) + fmt.Fprintf(out, " Email: %s\n", ownerEmail) + if !latestCredUpdate.IsZero() { + fmt.Fprintf(out, " Registry credentials last updated: %s\n", latestCredUpdate.Format("2006-01-02 15:04:05 UTC")) + } + fmt.Fprintf(out, " Clusters: %d\n", len(clusters)) + fmt.Fprintf(out, "============================================================\n\n") + + staleCount := 0 + for i, c := range clusters { + if i > 0 { + fmt.Fprintln(out) + fmt.Fprintln(out, "============================================================") + fmt.Fprintln(out) + } + renderClusterBanner(out, c) + + cr, hasCheck := checkResults[c.ID] + renderPSStatus(out, c, latestCredUpdate, &staleCount) + + if hasCheck && cr.err != nil { + fmt.Fprintf(out, " %s PS CHECK: %v\n", colorFail("[FAIL]"), cr.err) + } else if hasCheck { + if cr.accessTokenResult != nil { + renderCheckTable(out, cr.accessTokenResult, c.ID, "ACCESS TOKEN AUTHS") + } else if hasAccessToken { + fmt.Fprintf(out, " %s access token verification failed for this cluster\n", colorWarn("[WARN]")) + } + if cr.regCredResult != nil { + 
renderCheckTable(out, cr.regCredResult, c.ID, "REGISTRY CREDENTIAL AUTHS") + } else if hasRegCreds { + fmt.Fprintf(out, " %s registry credential verification failed for this cluster\n", colorWarn("[WARN]")) + } + } + } + + validatedCount := 0 + for _, cr := range checkResults { + if cr.err == nil { + validatedCount++ + } + } + + fmt.Fprintf(out, "\n%d cluster(s) found", len(clusters)) + if staleCount > 0 { + fmt.Fprintf(out, ", %s %d potentially stale", colorWarn("[WARN]"), staleCount) + } + if o.validate { + fmt.Fprintf(out, ", %d validated", validatedCount) + if validatedCount < len(checkResults) { + fmt.Fprintf(out, ", %d failed to connect", len(checkResults)-validatedCount) + } + } + fmt.Fprintln(out, ".") + + if !o.validate { + fmt.Fprintf(out, "Use --validate to check all clusters' pull secrets against OCM.\n") + } + + return nil +} + +func renderClusterBanner(out io.Writer, c controller.ClusterSummary) { + label := color.New(color.FgBlue, color.Bold).SprintFunc() + fmt.Fprintf(out, "%s %s (%s)\n", label("Cluster:"), c.Name, c.ID) + fmt.Fprintf(out, "%s %s %s %s\n", + label("Created:"), c.CreatedAt.Format("2006-01-02 15:04"), + label("Status:"), c.Status) +} + +func renderPSStatus(out io.Writer, c controller.ClusterSummary, latestCredUpdate time.Time, staleCount *int) { + psLabel := color.New(color.FgCyan, color.Bold).SprintFunc() + psDetail := color.New(color.FgCyan).SprintFunc() + + if latestCredUpdate.IsZero() { + fmt.Fprintf(out, " %s %s\n", psLabel("PS STATUS:"), psDetail("unknown — no credential timestamps available")) + } else if c.CreatedAt.Before(latestCredUpdate) { + fmt.Fprintf(out, " %s %s %s\n", colorWarn("[WARN]"), psLabel("PS STATUS:"), psDetail("may be stale — created before last credential update")) + *staleCount++ + } else { + fmt.Fprintf(out, " %s %s\n", psLabel("PS STATUS:"), psDetail("likely current — created after last credential update")) + } +} + +func renderCheckTable(out io.Writer, result *controller.PullSecretVerifyResult, clusterID 
string, sourceLabel string) { + table := tablewriter.NewWriter(out) + table.SetHeader([]string{sourceLabel, "TOKEN", "EMAIL", "STATUS"}) + table.SetHeaderAlignment(tablewriter.ALIGN_LEFT) + table.SetAlignment(tablewriter.ALIGN_LEFT) + table.SetBorder(false) + table.SetColumnSeparator(" ") + table.SetAutoWrapText(false) + table.SetAutoFormatHeaders(false) + table.SetHeaderColor( + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + ) + + mismatchStatus := color.New(color.FgYellow, color.Bold).SprintFunc() + for _, ar := range result.AuthResults { + status := colorOK("[OK]") + tokenStr := "match" + emailStr := "match" + if !ar.OK { + status = mismatchStatus("[!]") + if ar.Detail == "not found in cluster secret" { + tokenStr = "missing" + emailStr = "missing" + } else { + if !ar.TokenMatch { + tokenStr = "MISMATCH" + } + if !ar.EmailMatch { + emailStr = "MISMATCH" + } + } + } + table.Append([]string{ar.Registry, tokenStr, emailStr, status}) + if !ar.OK && ar.Detail != "" && ar.Detail != "not found in cluster secret" { + mismatchDetail := color.New(color.FgYellow).SprintFunc() + table.Append([]string{"", mismatchDetail(ar.Detail), "", ""}) + } + } + table.Render() + + if result.Matched < result.Total { + fmt.Fprintf(out, " %s Verified %d/%d — consider 'osdctl cluster pull-secret update -C %s'\n", + colorWarn("[WARN]"), result.Matched, result.Total, clusterID) + } + if len(result.MissingRequired) > 0 { + fmt.Fprintf(out, " %s missing required registries: %v\n", + colorWarn("[WARN]"), result.MissingRequired) + } +} diff --git a/cmd/cluster/pullsecretstatus_test.go b/cmd/cluster/pullsecretstatus_test.go new file mode 100644 index 000000000..c23b8751c --- /dev/null +++ b/cmd/cluster/pullsecretstatus_test.go @@ -0,0 +1,86 @@ +package cluster + +import ( + "bytes" + 
"testing" + + "k8s.io/cli-runtime/pkg/genericclioptions" +) + +func TestAuditPreRunE_MutualExclusivity(t *testing.T) { + streams := genericclioptions.IOStreams{Out: &bytes.Buffer{}, ErrOut: &bytes.Buffer{}, In: nil} + cmd := newCmdPullSecretAudit(streams, nil) + + tests := []struct { + name string + clusterID string + accountID string + wantErr string + }{ + { + name: "neither provided", + wantErr: "one of --cluster-id or --account-id is required", + }, + { + name: "both provided", + clusterID: "abc123", + accountID: "def456", + wantErr: "mutually exclusive", + }, + { + name: "cluster-id only", + clusterID: "abc123", + }, + { + name: "account-id only", + accountID: "def456", + }, + { + name: "account-id with special chars", + accountID: "abc' OR 1=1 --", + wantErr: "invalid characters", + }, + { + name: "cluster-id with special chars", + clusterID: "abc'; DROP TABLE", + wantErr: "invalid characters", + }, + { + name: "account-id with hyphens (valid)", + accountID: "abc-123-def", + }, + { + name: "cluster-id with underscores (valid)", + clusterID: "my_cluster_123", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := cmd.Flags().Set("cluster-id", tt.clusterID); err != nil { + t.Fatalf("failed to set cluster-id: %v", err) + } + if err := cmd.Flags().Set("account-id", tt.accountID); err != nil { + t.Fatalf("failed to set account-id: %v", err) + } + if err := cmd.Flags().Set("reason", "test"); err != nil { + t.Fatalf("failed to set reason: %v", err) + } + + err := cmd.PreRunE(cmd, nil) + + if tt.wantErr != "" { + if err == nil { + t.Fatalf("expected error containing %q, got nil", tt.wantErr) + } + if !bytes.Contains([]byte(err.Error()), []byte(tt.wantErr)) { + t.Fatalf("expected error containing %q, got %q", tt.wantErr, err.Error()) + } + } else { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + }) + } +} diff --git a/cmd/cluster/replacepullsecret.go b/cmd/cluster/replacepullsecret.go new file mode 100644 index 
000000000..ade4f79cc --- /dev/null +++ b/cmd/cluster/replacepullsecret.go @@ -0,0 +1,932 @@ +package cluster + +import ( + "context" + b64 "encoding/base64" + "fmt" + "os" + "regexp" + "time" + + "github.com/fatih/color" + sdk "github.com/openshift-online/ocm-sdk-go" + amv1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1" + cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" + "github.com/sirupsen/logrus" + "github.com/spf13/cobra" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/cli-runtime/pkg/genericclioptions" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/client" + + "go.uber.org/zap/zapcore" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + "github.com/openshift/osdctl/cmd/common" + "github.com/openshift/osdctl/cmd/servicelog" + "github.com/openshift/osdctl/internal/utils/globalflags" + "github.com/openshift/osdctl/pkg/controller" + "github.com/openshift/osdctl/pkg/utils" +) + +var ( + reasonPattern = regexp.MustCompile(`(?i)(OHSS|PD|SREP|OSD|SDE|ROSAENG)-\d+`) + + colorOK = color.New(color.FgGreen).SprintFunc() + colorFail = color.New(color.FgRed).SprintFunc() + colorWarn = color.New(color.FgYellow).SprintFunc() + colorDryRun = color.New(color.FgCyan).SprintFunc() +) + +// nolint:gosec +const replacePullSecUsageTemplate = `Usage:{{if .Runnable}} + {{.UseLine}}{{end}}{{if gt (len .Aliases) 0}} + +Aliases: + {{.NameAndAliases}}{{end}}{{if .HasExample}} + +Examples: +{{.Example}}{{end}} + +Required Flags: + -C, --cluster-id string The Internal/External Cluster ID or Cluster Name + --reason string The reason for this command (usually an OHSS or PD ticket) + +Optional Flags: + -d, --dry-run Dry-run - show what would change but do not apply + --force Proceed despite pre-flight failures or no-op detection (YES confirmation) + --hive-ocm-url string OCM environment for Hive operations (aliases: production, staging, integration) +` + +type replacePullSecretOptions struct { + 
clusterID string + reason string + dryrun bool + force bool + hiveOcmUrl string + logger *logrus.Logger + + genericclioptions.IOStreams + GlobalOptions *globalflags.GlobalOptions +} + +func newReplacePullSecretLogger() *logrus.Logger { + l := logrus.New() + l.SetOutput(os.Stderr) + l.SetFormatter(&logrus.TextFormatter{ + FullTimestamp: true, + TimestampFormat: "15:04:05", + ForceColors: true, + }) + l.SetLevel(logrus.InfoLevel) + return l +} + +func newCmdReplacePullSecretDeprecated(streams genericclioptions.IOStreams, globalOpts *globalflags.GlobalOptions) *cobra.Command { + cmd := newCmdPullSecretUpdate(streams, globalOpts) + cmd.Use = "replace-pull-secret" + cmd.Deprecated = "use 'osdctl cluster pull-secret update' instead" + return cmd +} + +func newCmdPullSecretUpdate(streams genericclioptions.IOStreams, globalOpts *globalflags.GlobalOptions) *cobra.Command { + ops := &replacePullSecretOptions{ + IOStreams: streams, + GlobalOptions: globalOpts, + logger: newReplacePullSecretLogger(), + } + cmd := &cobra.Command{ + Use: "update", + Short: "Refresh a cluster's pull secret from the cluster owner's OCM account", + Long: `Refresh a cluster's pull secret from the cluster owner's OCM account. + +This updates the pull secret on a ROSA HCP or Classic cluster without performing +an ownership transfer. The pull secret is rebuilt from the latest credentials +in the cluster owner's OCM account. + +A pre-flight check always runs first. If any checks fail, the command exits +unless --force is specified (requires typing YES to confirm). 
+ +See documentation prior to executing: +https://github.com/openshift/ops-sop/blob/master/hypershift/knowledge_base/howto/replace-pull-secret.md +https://github.com/openshift/ops-sop/blob/master/v4/howto/transfer_cluster_ownership.md`, + Example: ` # Update pull secret on a cluster + osdctl cluster pull-secret update --cluster-id 1kfmyclusterid --reason "OHSS-1234" + + # Dry-run to preview without making changes + osdctl cluster pull-secret update --cluster-id 1kfmyclusterid --reason "OHSS-1234" --dry-run + + # Force proceed despite pre-flight failures (e.g. missing pull secret) + osdctl cluster pull-secret update --cluster-id 1kfmyclusterid --reason "OHSS-1234" --force`, + Args: cobra.NoArgs, + DisableAutoGenTag: true, + SilenceUsage: true, + PreRunE: func(cmd *cobra.Command, args []string) error { + return ops.validate() + }, + RunE: func(cmd *cobra.Command, args []string) error { + return ops.run(cmd.Context()) + }, + } + + cmd.Flags().StringVarP(&ops.clusterID, "cluster-id", "C", "", "The Internal/External Cluster ID or Cluster Name") + cmd.Flags().StringVar(&ops.reason, "reason", "", "The reason for this command (usually an OHSS or PD ticket)") + cmd.Flags().BoolVarP(&ops.dryrun, "dry-run", "d", false, "Dry-run - show what would change but do not apply") + cmd.Flags().BoolVar(&ops.force, "force", false, "Proceed despite pre-flight failures or no-op detection (requires YES confirmation)") + cmd.Flags().StringVar(&ops.hiveOcmUrl, "hive-ocm-url", "", "OCM environment for Hive operations (aliases: production, staging, integration)") + + for _, flag := range []string{"cluster-id", "reason"} { + if err := cmd.MarkFlagRequired(flag); err != nil { + panic(fmt.Sprintf("failed to mark '%s' as required: %v", flag, err)) + } + } + + cmd.SetUsageTemplate(replacePullSecUsageTemplate) + + return cmd +} + +func (o *replacePullSecretOptions) validate() error { + if o.clusterID == "" || o.reason == "" { + return nil // let cobra handle required flag errors + } + if 
o.hiveOcmUrl != "" { + resolved, err := utils.ValidateAndResolveOcmUrl(o.hiveOcmUrl) + if err != nil { + return fmt.Errorf("invalid --hive-ocm-url: %w", err) + } + o.hiveOcmUrl = resolved + } + if !reasonPattern.MatchString(o.reason) { + o.logger.Warnf("--reason %q does not appear to contain a ticket ID (e.g. OHSS-1234)", o.reason) + fmt.Fprint(o.Out, "Continue without a valid ticket reference? ") + if !utils.ConfirmPrompt() { + return fmt.Errorf("operation aborted — provide a valid --reason") + } + } + return nil +} + +func (o *replacePullSecretOptions) run(ctx context.Context) error { + out := o.Out + logger := o.logger + op := controller.NewPullSecretOp(o.dryrun, logger, out) + + log.SetLogger(zap.New(zap.WriteTo(o.ErrOut), zap.Level(zapcore.WarnLevel))) + + // ================================================================ + // Step 1: OCM data and cluster connectivity + // ================================================================ + + op.Section(1, "OCM data and cluster connectivity", + "Resolve the cluster, owner account, and OCM access token.", + "Establish connections to the infrastructure and target clusters.") + + logger.Info("Creating OCM connection") + ocm, err := utils.CreateConnection() + if err != nil { + return fmt.Errorf("failed to create OCM client: %w", err) + } + defer func() { + if closeErr := ocm.Close(); closeErr != nil { + logger.Warnf("Cannot close the OCM connection: %v", closeErr) + } + }() + + cluster, err := utils.GetClusterAnyStatus(ocm, o.clusterID) + if err != nil { + return fmt.Errorf("failed to get cluster: %w", err) + } + o.clusterID = cluster.ID() + + isHCP, err := utils.IsHostedCluster(o.clusterID) + if err != nil { + return fmt.Errorf("failed to check if cluster is HCP: %w", err) + } + + clusterType := "OSD/ROSA Classic" + if isHCP { + clusterType = "HCP" + } + + subscription, err := utils.GetSubscription(ocm, o.clusterID) + if err != nil { + return fmt.Errorf("failed to get subscription: %w", err) + } + + ownerAccount, 
err := utils.GetAccount(ocm, subscription.Creator().ID()) + if err != nil { + return fmt.Errorf("failed to get owner account from subscription: %w", err) + } + ownerUsername := ownerAccount.Username() + ownerAccountID := ownerAccount.ID() + + siblingCount := controller.CountOwnerClusters(ocm, ownerAccountID, logger) + + fmt.Fprintf(out, "\n Cluster: %s (%s)\n", cluster.Name(), o.clusterID) + fmt.Fprintf(out, " Type: %s\n", clusterType) + fmt.Fprintf(out, " Owner: %s (account: %s)\n", ownerUsername, ownerAccountID) + fmt.Fprintf(out, " Reason: %s\n", o.reason) + if o.dryrun { + fmt.Fprintf(out, " Mode: %s\n", colorDryRun("DRY-RUN (no changes will be made)")) + } + if siblingCount > 1 { + fmt.Fprintf(out, "\n %s This account owns %d clusters sharing the same access token.\n", colorWarn("[NOTE]"), siblingCount) + fmt.Fprintf(out, " This command only updates the pull secret on the cluster above.\n") + fmt.Fprintf(out, " Use 'osdctl cluster pull-secret audit -C %s' to review all clusters.\n", o.clusterID) + } + + fmt.Fprint(out, "\nIs this the correct cluster? 
") + if !utils.ConfirmPrompt() { + return fmt.Errorf("operation aborted by user") + } + + // Resolve infrastructure clusters + var hiveOCM *sdk.Connection + if o.hiveOcmUrl != "" && !isHCP { + logger.Infof("Creating separate OCM connection for Hive operations: %s", o.hiveOcmUrl) + hiveOCM, err = utils.CreateConnectionWithUrl(o.hiveOcmUrl) + if err != nil { + op.Fail("could not create hive OCM connection: %v", err) + } else { + defer hiveOCM.Close() + op.OK("hive OCM connection established (%s)", o.hiveOcmUrl) + } + } else if o.hiveOcmUrl != "" && isHCP { + op.Info("--hive-ocm-url ignored for HCP clusters (ManifestWork path does not use Hive)") + } + + var mgmtCluster *cmv1.Cluster + var masterCluster *cmv1.Cluster + + if isHCP { + mgmtCluster, err = utils.GetManagementCluster(o.clusterID) + if err != nil { + op.Fail("could not resolve management cluster: %v", err) + } else { + op.OK("management cluster: %s", mgmtCluster.Name()) + } + svcCluster, svcErr := utils.GetServiceCluster(o.clusterID) + if svcErr != nil { + op.Fail("could not resolve service cluster: %v", svcErr) + } else { + masterCluster = svcCluster + op.OK("service cluster: %s", svcCluster.Name()) + } + } else { + var hiveCluster *cmv1.Cluster + if hiveOCM != nil { + hiveCluster, err = utils.GetHiveClusterWithConn(o.clusterID, ocm, hiveOCM) + } else { + hiveCluster, err = utils.GetHiveCluster(o.clusterID) + } + if err != nil { + op.Fail("could not resolve hive cluster: %v", err) + if o.hiveOcmUrl == "" { + op.Info("Hint: if the hive cluster is in a different OCM environment, try --hive-ocm-url (e.g. 
--hive-ocm-url prod)") + } + } else { + masterCluster = hiveCluster + op.OK("hive cluster: %s", hiveCluster.Name()) + } + } + + // Fetch OCM access token + var pullSecret []byte + var auths map[string]*amv1.AccessTokenAuth + var fetchOK bool + pullSecret, auths, fetchOK = op.FetchAccessTokenOp(ocm, ownerUsername) + if !fetchOK { + return fmt.Errorf("failed to fetch OCM access token for owner '%s'", ownerUsername) + } + + // Connect to clusters + elevationReasons := []string{ + o.reason, + "Replacing pull secret using osdctl pull-secret update", + } + + var masterKubeCli client.Client + var masterKubeClientSet *kubernetes.Clientset + + if masterCluster != nil { + logger.Infof("Connecting to infrastructure cluster %s", masterCluster.Name()) + if hiveOCM != nil { + masterKubeCli, _, masterKubeClientSet, err = common.GetKubeConfigAndClientWithConn(masterCluster.ID(), hiveOCM, elevationReasons...) + } else { + masterKubeCli, _, masterKubeClientSet, err = common.GetKubeConfigAndClient(masterCluster.ID(), elevationReasons...) + } + if err != nil { + op.Fail("could not connect to infrastructure cluster %s: %v", masterCluster.Name(), err) + } else { + op.OK("connected to infrastructure cluster %s with elevation", masterCluster.Name()) + } + } + + var targetClientSet *kubernetes.Clientset + + logger.Infof("Connecting to target cluster %s", cluster.Name()) + _, _, targetClientSet, err = common.GetKubeConfigAndClient(o.clusterID, elevationReasons...) 
+ if err != nil { + op.Fail("could not connect to target cluster %s: %v", cluster.Name(), err) + } else { + op.OK("connected to target cluster %s with elevation", cluster.Name()) + } + + // ================================================================ + // Pre-flight checks (live mode only — dry-run shows checks inline) + // ================================================================ + + if !o.dryrun { + fmt.Fprintf(out, "\nRunning pre-flight checks (dry-run RBAC verification)...\n") + preflightOp := controller.NewPullSecretOp(true, logger, out) + + // Check infra connectivity + if masterCluster == nil || masterKubeCli == nil || masterKubeClientSet == nil { + preflightOp.Fail("infrastructure cluster not connected") + } + // Check target connectivity + if targetClientSet == nil { + preflightOp.Fail("target cluster not connected") + } + // Check auths available + if auths == nil { + preflightOp.Fail("OCM access token not available") + } + // Check target cluster RBAC + if targetClientSet != nil { + if !preflightOp.CheckCanI(ctx, targetClientSet, cluster.Name(), "get", "secrets", "", "openshift-config") { + preflightOp.Fail("cannot read secrets in openshift-config on %s", cluster.Name()) + } + } + // Check infra cluster RBAC + if masterKubeClientSet != nil && masterKubeCli != nil && !isHCP { + infraLabel := "(infra)" + if masterCluster != nil { + infraLabel = masterCluster.Name() + } + hiveInfo, hiveErr := controller.FindHiveNamespace(ctx, masterKubeCli, o.clusterID) + if hiveErr != nil || hiveInfo == nil { + preflightOp.Fail("could not resolve hive namespace") + } else { + if !preflightOp.CheckCanI(ctx, masterKubeClientSet, infraLabel, "update", "secrets", "", hiveInfo.Namespace) { + preflightOp.Fail("cannot update secrets in %s", hiveInfo.Namespace) + } + if !preflightOp.CheckCanI(ctx, masterKubeClientSet, infraLabel, "create", "syncsets", "hive.openshift.io", hiveInfo.Namespace) { + preflightOp.Fail("cannot create syncsets in %s", hiveInfo.Namespace) + } + 
} + } else if masterKubeClientSet != nil && isHCP { + infraLabel := "(infra)" + if masterCluster != nil { + infraLabel = masterCluster.Name() + } + mgmtNS := "" + if mgmtCluster != nil { + mgmtNS = mgmtCluster.Name() + } + if !preflightOp.CheckCanI(ctx, masterKubeClientSet, infraLabel, "get", "manifestworks", "work.open-cluster-management.io", mgmtNS) { + preflightOp.Fail("cannot get manifestworks on %s", infraLabel) + } + if !preflightOp.CheckCanI(ctx, masterKubeClientSet, infraLabel, "update", "manifestworks", "work.open-cluster-management.io", mgmtNS) { + preflightOp.Fail("cannot update manifestworks on %s", infraLabel) + } + } + + if !preflightOp.AllOK { + fmt.Fprintf(out, "%s Pre-flight checks failed.\n", colorFail("[FAIL]")) + for _, f := range preflightOp.Failures { + fmt.Fprintf(out, " %s %s\n", colorFail("[FAIL]"), f) + } + if !o.force { + return fmt.Errorf("pre-flight checks failed") + } + fmt.Fprintf(out, "\n%s --force specified. Proceeding despite failures.\n", colorWarn("[WARN]")) + fmt.Fprintf(out, "Type YES to confirm: ") + var response string + if _, scanErr := fmt.Scanln(&response); scanErr != nil || response != "YES" { + return fmt.Errorf("operation aborted by user") + } + } else { + fmt.Fprintf(out, "%s Pre-flight checks passed.\n", colorOK("[OK]")) + } + } + + // ================================================================ + // Step 2: Compare current pull secret against OCM (live mode only) + // ================================================================ + + if !o.dryrun { + if isHCP { + op.Section(2, "Compare pull secret across OCM and target cluster", + "Before making changes, compare the pull secret between OCM and the cluster:", + " OCM — access token auths (source of truth)", + " Target — openshift-config/pull-secret (what the cluster uses)", + "", + "HCP clusters use ManifestWork, not Hive SyncSet.", + "The ManifestWork will be updated with the correct OCM auths.") + } else { + op.Section(2, "Compare pull secret across OCM, Hive, 
and target cluster", + "Before making changes, compare the pull secret across all three sources:", + " OCM — access token auths (source of truth)", + " Hive — secret in hive namespace (used by SyncSet)", + " Target — openshift-config/pull-secret (what the cluster uses)", + "", + "Hive will always be brought in sync with OCM.", + "Target will be synced via SyncSet only if it differs from the updated hive secret.") + } + + var hiveData, targetData []byte + + // Read hive secret (Classic only) + if masterKubeClientSet != nil && masterKubeCli != nil && !isHCP { + hiveInfo, hiveErr := controller.FindHiveNamespace(ctx, masterKubeCli, o.clusterID) + if hiveErr == nil { + hiveSecret, getErr := masterKubeClientSet.CoreV1().Secrets(hiveInfo.Namespace).Get(ctx, "pull", metav1.GetOptions{}) + if getErr == nil { + hiveData = hiveSecret.Data[".dockerconfigjson"] + } + } + } + + // Read target secret + if targetClientSet != nil { + targetSecret, getErr := targetClientSet.CoreV1().Secrets("openshift-config").Get(ctx, "pull-secret", metav1.GetOptions{}) + if getErr == nil { + targetData = targetSecret.Data[".dockerconfigjson"] + } + } + + allInSync := true + + // Access token three-way comparison + if auths != nil { + atSimple := controller.AccessTokenToSimple(auths) + atComparison, cmpErr := controller.CompareThreeWay(atSimple, hiveData, targetData) + if cmpErr != nil { + op.Warn("access token comparison failed: %v", cmpErr) + allInSync = false + } else { + controller.RenderThreeWayComparison(atComparison, "ACCESS TOKEN AUTHS", !isHCP, out) + if !atComparison.AllInSync { + allInSync = false + } + } + } else { + op.Warn("OCM access token not available — cannot compare access token auths") + allInSync = false + } + + // Registry credential three-way comparison + regCreds, regErr := utils.GetRegistryCredentials(ocm, ownerAccountID) + if regErr == nil && len(regCreds) > 0 { + rcSimple := make(map[string]controller.SimpleAuth) + for _, cred := range regCreds { + token, _ := 
cred.GetToken() + username, _ := cred.GetUsername() + if token == "" || username == "" { + continue + } + registryID := cred.Registry().ID() + regResp, err := ocm.AccountsMgmt().V1().Registries().Registry(registryID).Get().Send() + if err != nil { + continue + } + regName, _ := regResp.Body().GetName() + if regName == "" { + continue + } + rcSimple[regName] = controller.SimpleAuth{ + Auth: b64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", username, token))), + Email: ownerAccount.Email(), + } + } + if len(rcSimple) > 0 { + rcComparison, cmpErr := controller.CompareThreeWay(rcSimple, hiveData, targetData) + if cmpErr != nil { + op.Warn("registry credential comparison failed: %v", cmpErr) + allInSync = false + } else { + controller.RenderThreeWayComparison(rcComparison, "REGISTRY CREDENTIAL AUTHS", !isHCP, out) + if !rcComparison.AllInSync { + allInSync = false + } + } + } + } else { + op.Warn("Could not fetch registry credentials — skipping registry credential comparison") + allInSync = false + } + + if allInSync { + op.OK("All sources in sync — nothing to update") + if !o.force { + return nil + } + op.Warn("--force specified — proceeding despite no changes needed") + fmt.Fprintf(out, "Type YES to confirm: ") + var response string + if _, scanErr := fmt.Scanln(&response); scanErr != nil || response != "YES" { + return fmt.Errorf("operation aborted by user") + } + } + + fmt.Fprint(out, "\nProceed with pull secret update? 
") + if !utils.ConfirmPrompt() { + return fmt.Errorf("operation aborted by user") + } + } + + // ================================================================ + // Step N: Update pull secret on infrastructure cluster + // ================================================================ + + step := 2 + if !o.dryrun { + step = 3 + } + + infraName := "(unresolved)" + if masterCluster != nil { + infraName = masterCluster.Name() + } + + if isHCP { + mgmtName := "" + if mgmtCluster != nil { + mgmtName = mgmtCluster.Name() + } + // HCP pull secret architecture (see KCS 7118834, hypershift.pages.dev/how-to/powervs/global-pull-secret/): + // 1. HostedCluster.spec.pullSecret (management cluster) — source of truth, updated via ManifestWork + // 2. original-pull-secret (kube-system on hosted cluster) — HCCO syncs from #1 + // 3. additional-pull-secret (kube-system) — optional customer-added registries, not affected + // This tool operates at level 1. Customer-added registries (level 3) are preserved. + // Verification reads openshift-config/pull-secret which reflects level 1. + op.Section(step, "Update pull secret via ManifestWork (HCP)", + "HCP clusters have a multi-layer pull secret architecture:", + " 1. HostedCluster.spec.pullSecret (management cluster) — source of truth", + " 2. original-pull-secret (kube-system on hosted cluster) — HCCO syncs from #1", + " 3. additional-pull-secret (kube-system) — optional customer-added registries", + "", + "This tool operates at level 1 by updating the ManifestWork on the service cluster.", + "HCCO then reconciles the change to the hosted cluster. 
Customer-added registries", + "in additional-pull-secret (level 3) are not affected by this operation.", + fmt.Sprintf("ManifestWork: %s/%s on service cluster %s", mgmtName, o.clusterID, infraName)) + + if masterKubeClientSet != nil { + op.Would("get and update ManifestWork %s/%s on service cluster %s", mgmtName, o.clusterID, infraName) + op.CheckCanI(ctx, masterKubeClientSet, infraName, "get", "manifestworks", "work.open-cluster-management.io", mgmtName) + op.CheckCanI(ctx, masterKubeClientSet, infraName, "update", "manifestworks", "work.open-cluster-management.io", mgmtName) + } else { + op.Would("get and update ManifestWork on service cluster %s", infraName) + op.Fail("cannot verify — infrastructure cluster not connected") + } + + if !o.dryrun && op.AllOK { + err = controller.UpdateHCPPullSecretViaManifestWork(ctx, ocm, masterKubeCli, o.clusterID, mgmtCluster.Name(), pullSecret, out) + if err != nil { + return fmt.Errorf("failed to update pull secret via ManifestWork: %w", err) + } + op.OK("ManifestWork updated successfully") + op.PullSecretUpdated = true + } + } else { + op.Section(step, "Update pull secret via Hive SyncSet (Classic)", + "Classic clusters store the pull secret in a Hive namespace on the hive cluster.", + "The secret is updated (or created if missing), then a SyncSet syncs it to the target cluster.", + "After sync completes, the SyncSet is cleaned up.", + "", + "Note: The hive secret is never deleted. 
If missing, it can be restored from the", + "target cluster's pull secret or rebuilt from OCM auths.") + + var resolvedHiveNS string + var resolvedCDName string + + if masterKubeCli != nil && masterKubeClientSet != nil { + hiveInfo, found := op.FindHiveNamespaceOp(ctx, masterKubeCli, o.clusterID, infraName) + if found { + resolvedHiveNS = hiveInfo.Namespace + resolvedCDName = hiveInfo.ClusterDeploymentName + hiveSecretExists := op.CheckSecretExists(ctx, masterKubeClientSet, resolvedHiveNS, "pull", infraName) + + if !hiveSecretExists { + existingData, source := op.ResolveExistingPullSecret(ctx, masterKubeClientSet, targetClientSet, resolvedHiveNS, infraName, cluster.Name()) + if existingData != nil && source != "" { + fmt.Fprintf(out, "\n The target cluster's pull secret may contain additional auths not available in OCM.\n") + fmt.Fprintf(out, " Use the target cluster's pull secret as the base for restoring the hive secret?\n") + fmt.Fprintf(out, " - YES: merge OCM auths into the target cluster's existing pull secret (recommended)\n") + fmt.Fprintf(out, " - NO: build from OCM auths only (may be missing customer or operator-added auths)\n") + fmt.Fprint(out, " Use target cluster pull secret as base? 
") + if utils.ConfirmPrompt() { + op.Info("Will use %s as base for hive secret restoration", source) + merged, mergeErr := controller.MergePullSecretAuths(existingData, pullSecret) + if mergeErr != nil { + op.Warn("failed to merge existing pull secret: %v — using OCM auths only", mergeErr) + } else { + pullSecret = merged + } + } else { + op.Info("Will build hive secret from OCM auths only") + } + } else { + op.Warn("no existing pull secret available — will build from OCM auths only (requires --force)") + } + } + + op.Would("update or create secret %s/pull on %s with merged pull secret data", resolvedHiveNS, infraName) + op.CheckCanI(ctx, masterKubeClientSet, infraName, "get", "secrets", "", resolvedHiveNS) + op.CheckCanI(ctx, masterKubeClientSet, infraName, "update", "secrets", "", resolvedHiveNS) + op.CheckCanI(ctx, masterKubeClientSet, infraName, "create", "secrets", "", resolvedHiveNS) + + op.Would("create SyncSet %s/%s to sync to %s", resolvedHiveNS, controller.SyncSetName, cluster.Name()) + op.CheckCanI(ctx, masterKubeClientSet, infraName, "create", "syncsets", "hive.openshift.io", resolvedHiveNS) + + op.Would("poll ClusterSync %s/%s then delete SyncSet", resolvedHiveNS, resolvedCDName) + op.CheckCanI(ctx, masterKubeClientSet, infraName, "get", "clustersync", "hiveinternal.openshift.io", resolvedHiveNS) + op.CheckCanI(ctx, masterKubeClientSet, infraName, "delete", "syncsets", "hive.openshift.io", resolvedHiveNS) + } + } else { + op.Would("resolve Hive namespace, update secret, create SyncSet on %s", infraName) + op.Fail("cannot verify — infrastructure cluster not connected") + } + + if !o.dryrun && op.AllOK && resolvedHiveNS != "" { + // Use the resolved namespace instead of letting updatePullSecret re-discover it + op.Info("Updating pull secret in %s/pull on %s", resolvedHiveNS, infraName) + err = controller.UpdateHivePullSecretSSS(ctx, masterKubeCli, masterKubeClientSet, resolvedHiveNS, resolvedCDName, pullSecret, out) + if err != nil { + op.Fail("Hive 
SyncSet sync issue: %v", err) + fmt.Fprintf(out, "\n%s Current state:\n", colorWarn("[NOTE]")) + fmt.Fprintf(out, " - Hive secret %s/pull: UPDATED with merged OCM data\n", resolvedHiveNS) + fmt.Fprintf(out, " - Target cluster pull secret: may not reflect the update yet\n") + fmt.Fprintf(out, " - SyncSet: cleaned up\n") + fmt.Fprintf(out, " - Service log: NOT sent (will be sent on successful re-run)\n") + fmt.Fprintf(out, "\n Re-run this command to retry syncing to the target cluster.\n") + fmt.Fprintf(out, " The hive secret is correct — only the sync to the target needs to complete.\n") + } else { + op.OK("pull secret updated via Hive SyncSet") + op.PullSecretUpdated = true + } + } + } + + // ================================================================ + // Pod rollouts (Classic only) + // ================================================================ + + step++ + if !isHCP { + op.Section(step, "Pod rollouts (Classic only)", + "After the pull secret is synced, telemeter-client and ocm-agent pods are restarted", + "so they pick up the new credentials. 
HCP clusters do not require pod rollouts.") + + if targetClientSet != nil { + op.Would("roll out pods openshift-monitoring/telemeter-client on %s", cluster.Name()) + op.CheckCanI(ctx, targetClientSet, cluster.Name(), "delete", "pods", "", "openshift-monitoring") + + op.Would("roll out pods openshift-ocm-agent-operator/ocm-agent on %s", cluster.Name()) + op.CheckCanI(ctx, targetClientSet, cluster.Name(), "delete", "pods", "", "openshift-ocm-agent-operator") + } else { + op.Fail("cannot verify — target cluster not connected") + } + + if !o.dryrun && op.AllOK { + logger.Info("Rolling out pods openshift-monitoring/telemeter-client") + if err := controller.RestartPodsBySelector(ctx, targetClientSet, "openshift-monitoring", "app.kubernetes.io/name=telemeter-client", out); err != nil { + op.Warn("failed to roll out telemeter-client pods: %v", err) + } + } + step++ + } + + // ================================================================ + // Step N: Verify pull secret on target cluster + // ================================================================ + + op.Section(step, "Verify pull secret on target cluster", + "The pull secret on the target cluster is compared against both the OCM", + "access token auths and registry credential auths to verify all entries match.", + "Required registries are also checked to ensure the cluster can pull images.") + + if targetClientSet != nil { + op.CheckCanI(ctx, targetClientSet, cluster.Name(), "get", "secrets", "", "openshift-config") + + var atAllMatch, rcAllMatch bool + + // Access token verification + atLabel := color.New(color.FgBlue, color.Bold).SprintFunc() + if auths != nil { + op.Info("Comparing %s against openshift-config/pull-secret on %s...", atLabel("ACCESS TOKEN"), cluster.Name()) + atResult, verifyErr := controller.CompareAccessTokenAuthsToCluster(ctx, targetClientSet, auths, out) + if verifyErr != nil { + op.Warn("%s verification: %v", atLabel("ACCESS TOKEN"), verifyErr) + } else if atResult.Matched == 
atResult.Total { + op.OK("all %d %s auth entries match", atResult.Total, atLabel("ACCESS TOKEN")) + atAllMatch = true + } else { + diffCount := len(atResult.Mismatches) + op.AuthDiffCount += diffCount + op.Warn("%d/%d %s auth entries differ", diffCount, atResult.Total, atLabel("ACCESS TOKEN")) + } + } else { + op.Fail("cannot compare %s auths — OCM access token not available", atLabel("ACCESS TOKEN")) + } + + // Registry credential verification + rcLabel := color.New(color.FgBlue, color.Bold).SprintFunc() + op.Info("Comparing %s against openshift-config/pull-secret on %s...", rcLabel("REGISTRY CREDENTIAL"), cluster.Name()) + rcResult, rcErr := controller.CompareRegistryCredentialAuthsToCluster(ctx, ocm, targetClientSet, ownerAccountID, ownerAccount.Email(), out) + if rcErr != nil { + op.Warn("%s verification: %v", rcLabel("REGISTRY CREDENTIAL"), rcErr) + } else if rcResult.Matched == rcResult.Total { + op.OK("all %d %s auth entries match", rcResult.Total, rcLabel("REGISTRY CREDENTIAL")) + rcAllMatch = true + } else { + diffCount := len(rcResult.Mismatches) + op.AuthDiffCount += diffCount + op.Warn("%d/%d %s auth entries differ", diffCount, rcResult.Total, rcLabel("REGISTRY CREDENTIAL")) + } + + if atAllMatch && rcAllMatch { + op.PullSecretUpToDate = true + op.OK("All %s and %s auths match — pull secret is up to date", atLabel("ACCESS TOKEN"), rcLabel("REGISTRY CREDENTIAL")) + } else if o.dryrun { + if atAllMatch && !rcAllMatch { + op.Warn("%s auths match but %s auths differ", atLabel("ACCESS TOKEN"), rcLabel("REGISTRY CREDENTIAL")) + } else if !atAllMatch && rcAllMatch { + op.Warn("%s auths differ but %s auths match", atLabel("ACCESS TOKEN"), rcLabel("REGISTRY CREDENTIAL")) + } + } else if op.PullSecretUpdated && (!atAllMatch || !rcAllMatch) { + // Update was performed but verification found diffs — likely propagation delay + fmt.Fprintf(out, "\n%s Verification found diffs after update. 
This may be due to propagation delay.\n", colorWarn("[WARN]")) + if isHCP { + fmt.Fprintf(out, " HCCO reconciliation to the target cluster can take up to 60 seconds.\n") + } + fmt.Fprintf(out, "\n Options:\n") + fmt.Fprintf(out, " 1. Retry verification (recommended — wait for propagation)\n") + fmt.Fprintf(out, " 2. Continue without verification\n") + fmt.Fprintf(out, "\n To verify manually later:\n") + fmt.Fprintf(out, " osdctl cluster pull-secret validate -C %s --reason %q\n", o.clusterID, o.reason) + fmt.Fprint(out, "\n Retry verification? ") + if utils.ConfirmPrompt() { + for attempt := 1; attempt <= 6; attempt++ { + fmt.Fprintf(out, " Waiting 10s before retry %d/6...\n", attempt) + time.Sleep(10 * time.Second) + + atAllMatch = false + rcAllMatch = false + op.AuthDiffCount = 0 + + if auths != nil { + atRetry, retryErr := controller.CompareAccessTokenAuthsToCluster(ctx, targetClientSet, auths, out) + if retryErr == nil && atRetry.Matched == atRetry.Total { + atAllMatch = true + } + } + rcRetry, rcRetryErr := controller.CompareRegistryCredentialAuthsToCluster(ctx, ocm, targetClientSet, ownerAccountID, ownerAccount.Email(), out) + if rcRetryErr == nil && rcRetry.Matched == rcRetry.Total { + rcAllMatch = true + } + + if atAllMatch && rcAllMatch { + op.PullSecretUpToDate = true + op.AuthDiffCount = 0 + op.OK("All %s and %s auths match — pull secret is up to date", atLabel("ACCESS TOKEN"), rcLabel("REGISTRY CREDENTIAL")) + break + } + fmt.Fprintf(out, " Still differs (%d/6)...\n", attempt) + } + if !atAllMatch || !rcAllMatch { + op.Warn("Verification still shows diffs after 60s of retries") + } + } else { + op.Info("Skipping verification retry") + } + } + } else { + op.Fail("cannot verify — target cluster not connected") + } + + if !isHCP && !o.dryrun && op.AllOK { + logger.Info("Rolling out pods openshift-ocm-agent-operator/ocm-agent") + if err := controller.RestartPodsBySelector(ctx, targetClientSet, "openshift-ocm-agent-operator", "app=ocm-agent", out); err != nil 
{ + op.Warn("failed to roll out ocm-agent pods: %v", err) + } + } + + // No-op check — exit before service log if nothing needs updating + // Skip if we already performed an update (verification will show all-match post-update) + noopLabel := color.New(color.FgBlue, color.Bold).SprintFunc() + if op.PullSecretUpToDate && !o.dryrun && !op.PullSecretUpdated { + if !o.force { + op.OK("All %s and %s auths match — nothing to update", noopLabel("ACCESS TOKEN"), noopLabel("REGISTRY CREDENTIAL")) + return nil + } + op.Warn("--force specified — proceeding with update despite no changes needed") + fmt.Fprintf(out, "Type YES to confirm: ") + var response string + if _, scanErr := fmt.Scanln(&response); scanErr != nil || response != "YES" { + return fmt.Errorf("operation aborted by user") + } + } + step++ + + // ================================================================ + // Step N+1: Service log + // ================================================================ + + op.Section(step, "Send internal service log (optional)", + "An internal (non-customer-visible) service log is sent to record that the", + "pull secret was updated, including the owner username and reason.", + "Declining or skipping this step does not affect the pull secret update.") + + op.Would("send internal service log for %s", cluster.Name()) + + if !o.dryrun && op.AllOK { + postCmd := servicelog.PostCmdOptions{ + ClusterId: o.clusterID, + TemplateParams: []string{ + fmt.Sprintf("MESSAGE=Pull secret replaced for cluster owner '%s'. 
Reason: %s", ownerUsername, o.reason), + }, + InternalOnly: true, + } + if err := postCmd.Run(); err != nil { + op.Warn("failed to send internal service log: %v", err) + fmt.Fprintf(out, "To send manually: osdctl servicelog post -i %s -p MESSAGE=\"Pull secret replaced for cluster owner %q.\"\n", o.clusterID, ownerUsername) + } else { + op.OK("internal service log step completed") + } + } + + // ================================================================ + // Summary + // ================================================================ + + // ================================================================ + // Result + // ================================================================ + + prefix := "" + if o.dryrun { + prefix = colorDryRun("[Dry Run] ") + } + + hdrColor := color.New(color.FgBlue, color.Bold).SprintFunc() + fmt.Fprintf(out, "\n%s\n", hdrColor("============================================================")) + fmt.Fprintf(out, "%s%s\n", prefix, hdrColor("Result")) + fmt.Fprintf(out, "%s\n", hdrColor("============================================================")) + + if op.AllOK { + if op.PullSecretUpdated && op.AuthDiffCount == 0 { + noopLabel := color.New(color.FgBlue, color.Bold).SprintFunc() + fmt.Fprintf(out, "%s%s Pull secret updated successfully. 
All %s and %s auths now match.\n", + prefix, colorOK("[OK]"), noopLabel("ACCESS TOKEN"), noopLabel("REGISTRY CREDENTIAL")) + } else if op.PullSecretUpdated && op.AuthDiffCount > 0 { + noopLabel := color.New(color.FgBlue, color.Bold).SprintFunc() + fmt.Fprintf(out, "%s%s Pull secret updated, but %d %s and/or %s auth entries still differ — verify manually.\n", + prefix, colorWarn("[WARN]"), op.AuthDiffCount, noopLabel("ACCESS TOKEN"), noopLabel("REGISTRY CREDENTIAL")) + } else if op.PullSecretUpToDate && o.dryrun { + noopLabel := color.New(color.FgBlue, color.Bold).SprintFunc() + fmt.Fprintf(out, "%s%s All %s and %s auths match — a live run would skip the update step.\n", + prefix, colorOK("[OK]"), noopLabel("ACCESS TOKEN"), noopLabel("REGISTRY CREDENTIAL")) + } else if op.PullSecretUpToDate && !o.dryrun { + noopLabel := color.New(color.FgBlue, color.Bold).SprintFunc() + fmt.Fprintf(out, "%s%s All %s and %s auths match — nothing to update.\n", + prefix, colorOK("[OK]"), noopLabel("ACCESS TOKEN"), noopLabel("REGISTRY CREDENTIAL")) + } else if op.AuthDiffCount > 0 && o.dryrun { + entry := "entry differs" + if op.AuthDiffCount > 1 { + entry = "entries differ" + } + fmt.Fprintf(out, "%s%s All pre-flight checks passed. No changes were made.\n", prefix, colorOK("[OK]")) + fmt.Fprintf(out, "%s%s %d auth %s and will be updated on a live run.\n", + prefix, colorWarn("[NOTE]"), op.AuthDiffCount, entry) + } else if o.dryrun { + fmt.Fprintf(out, "%s%s All pre-flight checks passed. 
No changes were made.\n", prefix, colorOK("[OK]")) + } else { + fmt.Fprintf(out, "%s%s Pull secret update completed successfully.\n", prefix, colorOK("[OK]")) + } + } else { + fmt.Fprintf(out, "%s%s Some checks failed.\n", prefix, colorFail("[FAIL]")) + if len(op.Failures) > 0 { + fmt.Fprintf(out, "\nFailures:\n") + for _, f := range op.Failures { + fmt.Fprintf(out, " %s %s\n", colorFail("[FAIL]"), f) + } + } + if !o.dryrun { + if !o.force { + return fmt.Errorf("pre-flight checks failed") + } + fmt.Fprintf(out, "\n%s --force specified. Proceeding despite failures.\n", colorWarn("[WARN]")) + fmt.Fprintf(out, "Type YES to confirm: ") + var response string + if _, scanErr := fmt.Scanln(&response); scanErr != nil || response != "YES" { + return fmt.Errorf("operation aborted by user") + } + } + } + + return nil +} diff --git a/cmd/cluster/validatepullsecret.go b/cmd/cluster/validatepullsecret.go index 5ae3c6ed1..1899ae39f 100644 --- a/cmd/cluster/validatepullsecret.go +++ b/cmd/cluster/validatepullsecret.go @@ -44,6 +44,8 @@ elevation by setting --managed-script=false. 
Args: cobra.NoArgs, DisableAutoGenTag: true, Run: func(cmd *cobra.Command, args []string) { + fmt.Fprintln(os.Stderr, "Tip: For extended validation, use 'osdctl cluster pull-secret validate'") + fmt.Fprintln(os.Stderr, " For account-wide pull secret audit, use 'osdctl cluster pull-secret audit'") cmdutil.CheckErr(ops.run()) }, } diff --git a/cmd/cluster/validatepullsecretext.go b/cmd/cluster/validatepullsecretext.go index fb62bf38c..afc5a482e 100644 --- a/cmd/cluster/validatepullsecretext.go +++ b/cmd/cluster/validatepullsecretext.go @@ -72,6 +72,12 @@ const VPSExample string = ` osdctl cluster validate-pull-secret-ext --cluster-id ${CLUSTER_ID} --reason "OSD-XYZ" --skip-service-logs ` +func newCmdValidatePullSecretExtDeprecated() *cobra.Command { + cmd := newCmdValidatePullSecretExt() + cmd.Deprecated = "use 'osdctl cluster pull-secret validate' instead" + return cmd +} + func newCmdValidatePullSecretExt() *cobra.Command { ops := newValidatePullSecretExtOptions() validatePullSecretCmd := &cobra.Command{ diff --git a/cmd/servicelog/post.go b/cmd/servicelog/post.go index c99b6d76c..8f57a9079 100644 --- a/cmd/servicelog/post.go +++ b/cmd/servicelog/post.go @@ -52,6 +52,10 @@ type PostCmdOptions struct { failedClusters map[string]string } +func (o *PostCmdOptions) SetDryRun(dryRun bool) { + o.isDryRun = dryRun +} + const documentationBaseURL = "https://docs.openshift.com" func newPostCmd() *cobra.Command { diff --git a/docs/README.md b/docs/README.md index c09198300..c114730b3 100644 --- a/docs/README.md +++ b/docs/README.md @@ -60,6 +60,10 @@ - `logging-check --cluster-id ` - Shows the logging support status of a specified cluster - `orgId --cluster-id ` - Shows the support status of a specified cluster - `transfer-owner` - Transfer cluster ownership to a new user (to be done by Region Lead) - `validate-pull-secret --cluster-id ` - Checks if the pull secret email matches the owner email - - `validate-pull-secret-ext --cluster-id $CLUSTER_ID` - Extended checks to 
confirm pull-secret data is synced with current OCM data - `verify-dns --cluster-id ` - Verify DNS resolution for HCP cluster public endpoints - `cost` - Cost Management related utilities - `carbon-report` - Generate carbon emissions report csv to stdout for a given AWS Account and Usage Period @@ -1842,6 +1845,147 @@ osdctl cluster owner [flags] -u, --user-id string user to check the cluster owner on ``` +### osdctl cluster pull-secret + +Diagnose and manage cluster pull secrets. + +``` +osdctl cluster pull-secret [flags] +``` + +#### Flags + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + -h, --help help for pull-secret + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### osdctl cluster pull-secret audit + +Audit pull secret status for all clusters sharing the same OCM account. + +Given a cluster ID or account ID, resolves the owner account and lists all +clusters owned by that account. Compares cluster creation dates against the +account's registry credential update timestamps to flag clusters that may +have stale pull secrets. 
+ +Use --validate to connect to each cluster and compare its pull secret +against the OCM access token and registry credential auths. + +For validating a single cluster, use 'osdctl cluster pull-secret validate'. + +``` +osdctl cluster pull-secret audit [flags] +``` + +#### Flags + +``` + -A, --account-id string OCM account ID directly + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + -C, --cluster-id string Any cluster owned by the account (used to resolve the owner) + --context string The name of the kubeconfig context to use + -h, --help help for audit + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --reason string Elevation reason for cluster connections + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release + --validate Validate all clusters' pull secrets against OCM +``` + +### osdctl cluster pull-secret update + +Refresh a cluster's pull secret from the cluster owner's OCM account. + +This updates the pull secret on a ROSA HCP or Classic cluster without performing +an ownership transfer. The pull secret is rebuilt from the latest credentials +in the cluster owner's OCM account. + +A pre-flight check always runs first. 
If any checks fail, the command exits +unless --force is specified (requires typing YES to confirm). + +See documentation prior to executing: +https://github.com/openshift/ops-sop/blob/master/hypershift/knowledge_base/howto/replace-pull-secret.md +https://github.com/openshift/ops-sop/blob/master/v4/howto/transfer_cluster_ownership.md + +``` +osdctl cluster pull-secret update [flags] +``` + +#### Flags + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + -C, --cluster-id string The Internal/External Cluster ID or Cluster Name + --context string The name of the kubeconfig context to use + -d, --dry-run Dry-run - show what would change but do not apply + --force Proceed despite pre-flight failures or no-op detection (requires YES confirmation) + -h, --help help for update + --hive-ocm-url string OCM environment for Hive operations (aliases: production, staging, integration) + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --reason string The reason for this command (usually an OHSS or PD ticket) + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. 
(default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### osdctl cluster pull-secret validate + + + Attempts to validate if a cluster's pull-secret auth values are in sync with the account's email, + registry_credential, and access token data stored in OCM. + + Service logs are automatically sent for detected issues. Multiple failures are aggregated into + a single service log. Use --skip-service-logs to prevent sending service logs. + + If this is being executed against a cluster which is not owned by the current OCM account, + Region Lead permissions are required to view and validate the OCM AccessToken. + + +``` +osdctl cluster pull-secret validate [flags] +``` + +#### Flags + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + -C, --cluster-id string Provide internal ID of the cluster + --context string The name of the kubeconfig context to use + -h, --help help for validate + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -l, --log-level string debug, info, warn, error. (default=info) (default "info") + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --reason string Mandatory reason for this command to be run (usually includes an OHSS or PD ticket) + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. 
(default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-access-token Exclude OCM AccessToken checks against cluster secret + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + --skip-registry-creds Exclude OCM Registry Credentials checks against cluster secret + --skip-service-logs Skip sending service logs (useful for testing/automation) + -S, --skip-version-check skip checking to see if this is the most recent release +``` + ### osdctl cluster reports Manage cluster reports stored in backplane-api. @@ -2508,45 +2652,6 @@ osdctl cluster validate-pull-secret --cluster-id [flags] -S, --skip-version-check skip checking to see if this is the most recent release ``` -### osdctl cluster validate-pull-secret-ext - - - Attempts to validate if a cluster's pull-secret auth values are in sync with the account's email, - registry_credential, and access token data stored in OCM. - - Service logs are automatically sent for detected issues. Multiple failures are aggregated into - a single service log. Use --skip-service-logs to prevent sending service logs. - - If this is being executed against a cluster which is not owned by the current OCM account, - Region Lead permissions are required to view and validate the OCM AccessToken. - - -``` -osdctl cluster validate-pull-secret-ext --cluster-id $CLUSTER_ID [flags] -``` - -#### Flags - -``` - --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. - --cluster string The name of the kubeconfig cluster to use - -C, --cluster-id string Provide internal ID of the cluster - --context string The name of the kubeconfig context to use - -h, --help help for validate-pull-secret-ext - --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure - --kubeconfig string Path to the kubeconfig file to use for CLI requests. 
- -l, --log-level string debug, info, warn, error. (default=info) (default "info") - -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] - --reason string Mandatory reason for this command to be run (usually includes an OHSS or PD ticket) - --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") - -s, --server string The address and port of the Kubernetes API server - --skip-access-token Exclude OCM AccessToken checks against cluster secret - --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value - --skip-registry-creds Exclude OCM Registry Credentials checks against cluster secret - --skip-service-logs Skip sending service logs (useful for testing/automation) - -S, --skip-version-check skip checking to see if this is the most recent release -``` - ### osdctl cluster verify-dns Performs DNS resolution tests for HCP clusters. 
diff --git a/docs/osdctl_cluster.md b/docs/osdctl_cluster.md index 671a69a4e..3667e94b9 100644 --- a/docs/osdctl_cluster.md +++ b/docs/osdctl_cluster.md @@ -43,6 +43,7 @@ Provides information for a specified cluster * [osdctl cluster logging-check](osdctl_cluster_logging-check.md) - Shows the logging support status of a specified cluster * [osdctl cluster orgId](osdctl_cluster_orgId.md) - Get the OCM org ID for a given cluster * [osdctl cluster owner](osdctl_cluster_owner.md) - List the clusters owned by the user (can be specified to any user, not only yourself) +* [osdctl cluster pull-secret](osdctl_cluster_pull-secret.md) - Diagnose and manage cluster pull secrets * [osdctl cluster reports](osdctl_cluster_reports.md) - Manage cluster reports in backplane-api * [osdctl cluster resize](osdctl_cluster_resize.md) - resize control-plane/infra nodes * [osdctl cluster resync](osdctl_cluster_resync.md) - Force a resync of a cluster from Hive @@ -52,6 +53,5 @@ Provides information for a specified cluster * [osdctl cluster support](osdctl_cluster_support.md) - Cluster Support * [osdctl cluster transfer-owner](osdctl_cluster_transfer-owner.md) - Transfer cluster ownership to a new user (to be done by Region Lead) * [osdctl cluster validate-pull-secret](osdctl_cluster_validate-pull-secret.md) - Checks if the pull secret email matches the owner email -* [osdctl cluster validate-pull-secret-ext](osdctl_cluster_validate-pull-secret-ext.md) - Extended checks to confirm pull-secret data is synced with current OCM data * [osdctl cluster verify-dns](osdctl_cluster_verify-dns.md) - Verify DNS resolution for HCP cluster public endpoints diff --git a/docs/osdctl_cluster_pull-secret.md b/docs/osdctl_cluster_pull-secret.md new file mode 100644 index 000000000..ce5a78ee8 --- /dev/null +++ b/docs/osdctl_cluster_pull-secret.md @@ -0,0 +1,36 @@ +## osdctl cluster pull-secret + +Diagnose and manage cluster pull secrets + +### Synopsis + +Diagnose and manage cluster pull secrets. 
+ +### Options + +``` + -h, --help help for pull-secret +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl cluster](osdctl_cluster.md) - Provides information for a specified cluster +* [osdctl cluster pull-secret audit](osdctl_cluster_pull-secret_audit.md) - Audit pull secret status for all clusters owned by an account +* [osdctl cluster pull-secret update](osdctl_cluster_pull-secret_update.md) - Refresh a cluster's pull secret from the cluster owner's OCM account +* [osdctl cluster pull-secret validate](osdctl_cluster_pull-secret_validate.md) - Extended checks to confirm pull-secret data is synced with current OCM data + diff --git a/docs/osdctl_cluster_pull-secret_audit.md b/docs/osdctl_cluster_pull-secret_audit.md new file mode 100644 index 000000000..b4ed61bed --- /dev/null +++ b/docs/osdctl_cluster_pull-secret_audit.md @@ -0,0 +1,64 @@ +## osdctl cluster pull-secret audit + +Audit pull secret status for all clusters owned by an account + 
+### Synopsis + +Audit pull secret status for all clusters sharing the same OCM account. + +Given a cluster ID or account ID, resolves the owner account and lists all +clusters owned by that account. Compares cluster creation dates against the +account's registry credential update timestamps to flag clusters that may +have stale pull secrets. + +Use --validate to connect to each cluster and compare its pull secret +against the OCM access token and registry credential auths. + +For validating a single cluster, use 'osdctl cluster pull-secret validate'. + +``` +osdctl cluster pull-secret audit [flags] +``` + +### Examples + +``` + # Overview of all clusters for the account + osdctl cluster pull-secret audit -C 1kfmyclusterid --reason "OHSS-1234" + + # Using account ID directly + osdctl cluster pull-secret audit -A 2g9OLHPkwDDcXvq2mt7kjfIQ0gf --reason "OHSS-1234" + + # Validate all clusters' pull secrets against OCM + osdctl cluster pull-secret audit -C 1kfmyclusterid --reason "OHSS-1234" --validate +``` + +### Options + +``` + -A, --account-id string OCM account ID directly + -C, --cluster-id string Any cluster owned by the account (used to resolve the owner) + -h, --help help for audit + --reason string Elevation reason for cluster connections + --validate Validate all clusters' pull secrets against OCM +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. 
+ -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl cluster pull-secret](osdctl_cluster_pull-secret.md) - Diagnose and manage cluster pull secrets + diff --git a/docs/osdctl_cluster_pull-secret_update.md b/docs/osdctl_cluster_pull-secret_update.md new file mode 100644 index 000000000..dd6da110e --- /dev/null +++ b/docs/osdctl_cluster_pull-secret_update.md @@ -0,0 +1,66 @@ +## osdctl cluster pull-secret update + +Refresh a cluster's pull secret from the cluster owner's OCM account + +### Synopsis + +Refresh a cluster's pull secret from the cluster owner's OCM account. + +This updates the pull secret on a ROSA HCP or Classic cluster without performing +an ownership transfer. The pull secret is rebuilt from the latest credentials +in the cluster owner's OCM account. + +A pre-flight check always runs first. If any checks fail, the command exits +unless --force is specified (requires typing YES to confirm). 
+ +See documentation prior to executing: +https://github.com/openshift/ops-sop/blob/master/hypershift/knowledge_base/howto/replace-pull-secret.md +https://github.com/openshift/ops-sop/blob/master/v4/howto/transfer_cluster_ownership.md + +``` +osdctl cluster pull-secret update [flags] +``` + +### Examples + +``` + # Update pull secret on a cluster + osdctl cluster pull-secret update --cluster-id 1kfmyclusterid --reason "OHSS-1234" + + # Dry-run to preview without making changes + osdctl cluster pull-secret update --cluster-id 1kfmyclusterid --reason "OHSS-1234" --dry-run + + # Force proceed despite pre-flight failures (e.g. missing pull secret) + osdctl cluster pull-secret update --cluster-id 1kfmyclusterid --reason "OHSS-1234" --force +``` + +### Options + +``` + -C, --cluster-id string The Internal/External Cluster ID or Cluster Name + -d, --dry-run Dry-run - show what would change but do not apply + --force Proceed despite pre-flight failures or no-op detection (requires YES confirmation) + -h, --help help for update + --hive-ocm-url string OCM environment for Hive operations (aliases: production, staging, integration) + --reason string The reason for this command (usually an OHSS or PD ticket) +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). 
A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl cluster pull-secret](osdctl_cluster_pull-secret.md) - Diagnose and manage cluster pull secrets + diff --git a/docs/osdctl_cluster_pull-secret_validate.md b/docs/osdctl_cluster_pull-secret_validate.md new file mode 100644 index 000000000..47cccc3de --- /dev/null +++ b/docs/osdctl_cluster_pull-secret_validate.md @@ -0,0 +1,65 @@ +## osdctl cluster pull-secret validate + +Extended checks to confirm pull-secret data is synced with current OCM data + +### Synopsis + + + Attempts to validate if a cluster's pull-secret auth values are in sync with the account's email, + registry_credential, and access token data stored in OCM. + + Service logs are automatically sent for detected issues. Multiple failures are aggregated into + a single service log. Use --skip-service-logs to prevent sending service logs. + + If this is being executed against a cluster which is not owned by the current OCM account, + Region Lead permissions are required to view and validate the OCM AccessToken. + + +``` +osdctl cluster pull-secret validate [flags] +``` + +### Examples + +``` + # Compare OCM Access-Token, OCM Registry-Credentials, and OCM Account Email against cluster's pull secret + osdctl cluster pull-secret validate --cluster-id ${CLUSTER_ID} --reason "OSD-XYZ" + + # Exclude Access-Token, and Registry-Credential checks... 
+ osdctl cluster pull-secret validate --cluster-id ${CLUSTER_ID} --reason "OSD-XYZ" --skip-access-token --skip-registry-creds + + # Skip sending service logs (useful for testing) + osdctl cluster pull-secret validate --cluster-id ${CLUSTER_ID} --reason "OSD-XYZ" --skip-service-logs +``` + +### Options + +``` + -C, --cluster-id string Provide internal ID of the cluster + -h, --help help for validate + -l, --log-level string debug, info, warn, error. (default=info) (default "info") + --reason string Mandatory reason for this command to be run (usually includes an OHSS or PD ticket) + --skip-access-token Exclude OCM AccessToken checks against cluster secret + --skip-registry-creds Exclude OCM Registry Credentials checks against cluster secret + --skip-service-logs Skip sending service logs (useful for testing/automation) +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -o, --output string Valid formats are ['', 'json', 'yaml', 'env'] + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. 
(default "0") + -s, --server string The address and port of the Kubernetes API server + --skip-aws-proxy-check aws_proxy Don't use the configured aws_proxy value + -S, --skip-version-check skip checking to see if this is the most recent release +``` + +### SEE ALSO + +* [osdctl cluster pull-secret](osdctl_cluster_pull-secret.md) - Diagnose and manage cluster pull secrets + diff --git a/go.mod b/go.mod index 574bedbb4..e90908cf1 100644 --- a/go.mod +++ b/go.mod @@ -65,6 +65,7 @@ require ( github.com/zclconf/go-cty v1.13.0 gitlab.com/gitlab-org/api/client-go v0.128.0 go.uber.org/mock v0.6.0 + go.uber.org/zap v1.27.0 golang.org/x/oauth2 v0.35.0 golang.org/x/sync v0.20.0 golang.org/x/term v0.43.0 @@ -239,7 +240,6 @@ require ( go.opentelemetry.io/otel/metric v1.39.0 // indirect go.opentelemetry.io/otel/trace v1.39.0 // indirect go.uber.org/multierr v1.11.0 // indirect - go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.50.0 // indirect diff --git a/pkg/controller/pullsecret.go b/pkg/controller/pullsecret.go new file mode 100644 index 000000000..c4717de30 --- /dev/null +++ b/pkg/controller/pullsecret.go @@ -0,0 +1,1157 @@ +package controller + +import ( + "bufio" + "context" + b64 "encoding/base64" + "encoding/json" + "fmt" + "io" + "os" + "sort" + "time" + + "github.com/fatih/color" + "github.com/olekukonko/tablewriter" + sdk "github.com/openshift-online/ocm-sdk-go" + amv1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1" + hiveapiv1 "github.com/openshift/hive/apis/hive/v1" + hiveinternalv1alpha1 "github.com/openshift/hive/apis/hiveinternal/v1alpha1" + hypershiftv1beta1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/sirupsen/logrus" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/retry" + 
"math/rand"
+	workv1 "open-cluster-management.io/api/work/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"strings"
+
+	"github.com/openshift/osdctl/pkg/utils"
+)
+
+var (
+	psColorOK   = color.New(color.FgGreen).SprintFunc()  // green formatter used for "[OK]" markers
+	psColorWarn = color.New(color.FgYellow).SprintFunc() // yellow formatter used for "[WARN]" / "[!]" markers
+)
+
+// RequiredPullSecretAuths lists the registry auth entries that must be present
+// in a cluster's pull secret for the cluster to function.
+var RequiredPullSecretAuths = []string{
+	"cloud.openshift.com",
+	"quay.io",
+	"registry.redhat.io",
+	"registry.connect.redhat.com",
+}
+
+// ClusterSummary holds subscription-level data for a cluster owned by an account.
+type ClusterSummary struct {
+	Name      string
+	ID        string
+	Status    string
+	CreatedAt time.Time
+}
+
+// AuthCheckResult holds the outcome of a single registry auth comparison.
+type AuthCheckResult struct {
+	Registry   string // registry name, or the OCM registry ID when the name could not be resolved
+	Source     string // "access_token" or "registry_credential"
+	OK         bool   // true only when both TokenMatch and EmailMatch are true
+	TokenMatch bool   // cluster token equals the OCM token
+	EmailMatch bool   // cluster email equals the OCM email
+	Email      string // the OCM-side email the cluster entry was compared against
+	Detail     string // human-readable reason when OK is false
+}
+
+// PullSecretVerifyResult holds the outcome of a per-registry auth comparison.
+type PullSecretVerifyResult struct {
+	Matched         int               // number of entries whose token and email both matched
+	Total           int               // number of OCM-side entries that were checked
+	Mismatches      []string          // registries that were missing, unreadable, or different
+	AuthResults     []AuthCheckResult // one result per checked entry
+	MissingRequired []string          // RequiredPullSecretAuths entries absent from the cluster secret
+}
+
+// FetchOwnerAccessToken retrieves the cluster owner's pull secret from OCM, impersonating the
+// owner unless the current OCM user is confirmed to be that owner (a failed lookup is only logged).
+// Returns the marshaled pull secret bytes and the raw auth map for verification.
+func FetchOwnerAccessToken(ocm *sdk.Connection, ownerUsername string, logger *logrus.Logger) ([]byte, map[string]*amv1.AccessTokenAuth, error) {
+	// The identity lookup is best-effort: a failure is only logged, and the
+	// request below then falls through to the impersonation branch.
+	caller, err := ocm.AccountsMgmt().V1().CurrentAccount().Get().Send()
+	if err != nil {
+		logger.Warnf("Could not fetch current account info, will use impersonation: %v", err)
+	}
+
+	// Build the token request once; only the non-owner path decorates it.
+	tokenReq := ocm.AccountsMgmt().V1().AccessToken().Post()
+	if caller != nil && caller.Body().Username() == ownerUsername {
+		logger.Info("Current OCM user matches cluster owner, fetching access token directly")
+	} else {
+		logger.Infof("Impersonating cluster owner '%s' to fetch access token", ownerUsername)
+		tokenReq = tokenReq.Impersonate(ownerUsername).Parameter("body", nil)
+	}
+
+	tokenResp, err := tokenReq.Send()
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to fetch OCM access token: %w", err)
+	}
+
+	auths, found := tokenResp.Body().GetAuths()
+	if !found {
+		return nil, nil, fmt.Errorf("failed to get auths from access token response — contact SDB if this persists")
+	}
+
+	// Flatten each SDK auth object into the {"auth": ..., "email": ...} shape
+	// used under the top-level "auths" key of a docker config.
+	entries := make(map[string]map[string]string, len(auths))
+	for registry, entry := range auths {
+		entries[registry] = map[string]string{
+			"auth":  entry.Auth(),
+			"email": entry.Email(),
+		}
+	}
+
+	dockerConfig, err := json.Marshal(map[string]map[string]map[string]string{"auths": entries})
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to marshal pull secret: %w", err)
+	}
+
+	return dockerConfig, auths, nil
+}
+
+// ValidateRequiredAuths checks that the OCM access token includes all required
+// registry auth entries. Returns the list of missing registries.
+func ValidateRequiredAuths(auths map[string]*amv1.AccessTokenAuth) []string {
+	var missing []string
+	for _, required := range RequiredPullSecretAuths {
+		if _, ok := auths[required]; !ok {
+			missing = append(missing, required)
+		}
+	}
+	return missing
+}
+
+// CompareAccessTokenAuthsToCluster compares OCM access token auths against the
+// openshift-config/pull-secret secret on the target cluster.
+//
+// Registries are checked in sorted order; each one yields an AuthCheckResult
+// with Source "access_token". An entry counts as matched only when both the
+// token and the email are equal. RequiredPullSecretAuths entries that cannot be
+// read from the cluster secret are recorded in MissingRequired.
+//
+// When out is non-nil the result is also rendered to it via RenderVerifyResult.
+// An error is returned only when the secret cannot be fetched or lacks the
+// .dockerconfigjson key; per-registry differences are reported in the result.
+func CompareAccessTokenAuthsToCluster(ctx context.Context, clientset *kubernetes.Clientset, expectedAuths map[string]*amv1.AccessTokenAuth, out io.Writer) (*PullSecretVerifyResult, error) {
+	pullSecret, err := clientset.CoreV1().Secrets("openshift-config").Get(ctx, "pull-secret", metav1.GetOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to get secret openshift-config/pull-secret from target cluster: %w", err)
+	}
+
+	if _, ok := pullSecret.Data[".dockerconfigjson"]; !ok {
+		return nil, fmt.Errorf("secret openshift-config/pull-secret is missing .dockerconfigjson key")
+	}
+
+	result := &PullSecretVerifyResult{Total: len(expectedAuths)}
+
+	// Map iteration order is random; sort so output and results are stable.
+	sortedKeys := make([]string, 0, len(expectedAuths))
+	for k := range expectedAuths {
+		sortedKeys = append(sortedKeys, k)
+	}
+	sort.Strings(sortedKeys)
+
+	for _, authKey := range sortedKeys {
+		expectedAuth := expectedAuths[authKey]
+		ar := AuthCheckResult{Registry: authKey, Source: "access_token"}
+
+		clusterAuth, err := extractPullSecretAuth(authKey, pullSecret)
+		if err != nil {
+			result.Mismatches = append(result.Mismatches, authKey)
+			ar.Detail = "not found in cluster secret"
+			result.AuthResults = append(result.AuthResults, ar)
+			continue
+		}
+
+		ar.TokenMatch = clusterAuth.auth == expectedAuth.Auth()
+		ar.EmailMatch = clusterAuth.email == expectedAuth.Email()
+		ar.Email = expectedAuth.Email()
+
+		if !ar.TokenMatch || !ar.EmailMatch {
+			result.Mismatches = append(result.Mismatches, authKey)
+			details := ""
+			if !ar.TokenMatch {
+				details += "token mismatch"
+			}
+			if !ar.EmailMatch {
+				if details != "" {
+					details += ", "
+				}
+				details += fmt.Sprintf("email mismatch (cluster=%q, OCM=%q)", clusterAuth.email, expectedAuth.Email())
+			}
+			ar.Detail = details
+			result.AuthResults = append(result.AuthResults, ar)
+			continue
+		}
+
+		ar.OK = true
+		result.Matched++
+		result.AuthResults = append(result.AuthResults, ar)
+	}
+
+	// Independently of what OCM returned, record which required registries
+	// cannot be read from the cluster secret.
+	for _, required := range RequiredPullSecretAuths {
+		if _, err := extractPullSecretAuth(required, pullSecret); err != nil {
+			result.MissingRequired = append(result.MissingRequired, required)
+		}
+	}
+
+	// Write human-readable output if a writer is provided
+	if out != nil {
+		RenderVerifyResult(result, out)
+	}
+
+	return result, nil
+}
+
+// RenderVerifyResult writes the verification result in human-readable format:
+// one line per entry in result.AuthResults, the matched/total count, and then
+// either a warning listing result.MissingRequired or a confirmation that all
+// required registries are present.
+func RenderVerifyResult(result *PullSecretVerifyResult, out io.Writer) {
+	mismatchLine := color.New(color.FgYellow, color.Bold).SprintFunc()
+	for _, ar := range result.AuthResults {
+		if ar.OK {
+			fmt.Fprintf(out, " %s %-40s token=match, email=match (%s)\n", psColorOK("[OK]"), ar.Registry, ar.Email)
+		} else {
+			fmt.Fprintf(out, " %s\n", mismatchLine(fmt.Sprintf("[!] %-40s %s", ar.Registry, ar.Detail)))
+		}
+	}
+
+	fmt.Fprintf(out, "\n Verified %d/%d auth entries match\n", result.Matched, result.Total)
+
+	if len(result.MissingRequired) > 0 {
+		fmt.Fprintf(out, "\n%s cluster pull secret is missing required registries:\n", psColorWarn("[WARN]"))
+		for _, m := range result.MissingRequired {
+			fmt.Fprintf(out, " - %s\n", m)
+		}
+		fmt.Fprintf(out, "The cluster may have issues pulling images or reporting telemetry.\n")
+	} else {
+		fmt.Fprintf(out, " %s All required registries present in cluster pull secret\n", psColorOK("[OK]"))
+	}
+}
+
+// CompareRegistryCredentialAuthsToCluster compares OCM registry credentials against the
+// openshift-config/pull-secret secret on the target cluster. Registry credentials use a
+// different token format (base64-encoded "username:token") than access token auths, so the
+// cluster value is base64-decoded before it is compared.
+//
+// Each credential's registry ID is resolved to a name through OCM; when the lookup fails
+// the credential is reported as a mismatch under its ID. accountEmail is the email every
+// cluster entry is expected to carry. As in CompareAccessTokenAuthsToCluster,
+// RequiredPullSecretAuths entries that cannot be read from the cluster secret are recorded
+// in MissingRequired, so the rendered "required registries" line reflects the cluster.
+//
+// When out is non-nil the result is also rendered to it via RenderVerifyResult. An error
+// is returned when the credentials cannot be fetched, the account has none, or the cluster
+// secret cannot be read.
+func CompareRegistryCredentialAuthsToCluster(ctx context.Context, ocm *sdk.Connection, clientset *kubernetes.Clientset, accountID string, accountEmail string, out io.Writer) (*PullSecretVerifyResult, error) {
+	creds, err := utils.GetRegistryCredentials(ocm, accountID)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch registry credentials: %w", err)
+	}
+	if len(creds) == 0 {
+		return nil, fmt.Errorf("no registry credentials found for account %s", accountID)
+	}
+
+	pullSecret, err := clientset.CoreV1().Secrets("openshift-config").Get(ctx, "pull-secret", metav1.GetOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to get secret openshift-config/pull-secret: %w", err)
+	}
+
+	result := &PullSecretVerifyResult{Total: len(creds)}
+
+	for _, cred := range creds {
+		registryID := cred.Registry().ID()
+
+		// Resolve registry name from OCM
+		regResp, err := ocm.AccountsMgmt().V1().Registries().Registry(registryID).Get().Send()
+		if err != nil {
+			ar := AuthCheckResult{Registry: registryID, Source: "registry_credential", Detail: fmt.Sprintf("cannot resolve registry: %v", err)}
+			result.Mismatches = append(result.Mismatches, registryID)
+			result.AuthResults = append(result.AuthResults, ar)
+			continue
+		}
+		regName, _ := regResp.Body().GetName()
+		if regName == "" {
+			// No name on the registry object: fall back to the ID.
+			regName = registryID
+		}
+
+		ar := AuthCheckResult{Registry: regName, Source: "registry_credential"}
+
+		// The "ok" results are ignored on purpose: an unset field yields "",
+		// which the emptiness check below already rejects.
+		token, _ := cred.GetToken()
+		username, _ := cred.GetUsername()
+		if token == "" || username == "" {
+			ar.Detail = "missing token or username in OCM registry credential"
+			result.Mismatches = append(result.Mismatches, regName)
+			result.AuthResults = append(result.AuthResults, ar)
+			continue
+		}
+
+		clusterAuth, err := extractPullSecretAuth(regName, pullSecret)
+		if err != nil {
+			ar.Detail = "not found in cluster secret"
+			result.Mismatches = append(result.Mismatches, regName)
+			result.AuthResults = append(result.AuthResults, ar)
+			continue
+		}
+
+		// Registry credential tokens are stored as base64("username:token") in the cluster secret
+		expectedToken := fmt.Sprintf("%s:%s", username, token)
+		clusterTokenDecoded, err := b64.StdEncoding.DecodeString(clusterAuth.auth)
+		if err != nil {
+			ar.Detail = "failed to decode cluster token"
+			result.Mismatches = append(result.Mismatches, regName)
+			result.AuthResults = append(result.AuthResults, ar)
+			continue
+		}
+
+		ar.TokenMatch = expectedToken == string(clusterTokenDecoded)
+		ar.EmailMatch = accountEmail == clusterAuth.email
+		ar.Email = accountEmail
+
+		if !ar.TokenMatch || !ar.EmailMatch {
+			result.Mismatches = append(result.Mismatches, regName)
+			details := ""
+			if !ar.TokenMatch {
+				details += "token mismatch"
+			}
+			if !ar.EmailMatch {
+				if details != "" {
+					details += ", "
+				}
+				details += fmt.Sprintf("email mismatch (cluster=%q, OCM=%q)", clusterAuth.email, accountEmail)
+			}
+			ar.Detail = details
+			result.AuthResults = append(result.AuthResults, ar)
+			continue
+		}
+
+		ar.OK = true
+		result.Matched++
+		result.AuthResults = append(result.AuthResults, ar)
+	}
+
+	// Audit the required registries here as well. RenderVerifyResult prints
+	// "[OK] All required registries present" whenever MissingRequired is empty,
+	// so leaving it unpopulated would claim a check that was never performed.
+	for _, required := range RequiredPullSecretAuths {
+		if _, err := extractPullSecretAuth(required, pullSecret); err != nil {
+			result.MissingRequired = append(result.MissingRequired, required)
+		}
+	}
+
+	if out != nil {
+		RenderVerifyResult(result, out)
+	}
+
+	return result, nil
+}
+
+// ThreeWayAuthState describes the sync state of a single auth entry across OCM, hive, and target.
+type ThreeWayAuthState struct {
+	Registry          string
+	InOCM             bool // entry exists in the OCM source
+	InHive            bool // entry exists in the hive secret
+	InTarget          bool // entry exists in the target cluster secret
+	OCMMatchesHive    bool // auth and email equal; false when either side is absent
+	OCMMatchesTarget  bool // auth and email equal; false when either side is absent
+	HiveMatchesTarget bool // auth and email equal; false when either side is absent
+}
+
+// ThreeWayComparison holds the full comparison result across OCM, hive, and target.
+type ThreeWayComparison struct {
+	Auths           []ThreeWayAuthState
+	HiveNeedsUpdate bool // some OCM entry is missing from, or differs in, the hive secret
+	TargetNeedsSync bool // some OCM entry is missing from, or differs in, the target secret
+	AllInSync       bool // no entry differs between any pair of sources
+}
+
+// SimpleAuth holds a registry auth's token and email for generic comparison.
+type SimpleAuth struct {
+	Auth  string
+	Email string
+}
+
+// AccessTokenToSimple converts access token auths to SimpleAuth map.
+func AccessTokenToSimple(auths map[string]*amv1.AccessTokenAuth) map[string]SimpleAuth {
+	simple := make(map[string]SimpleAuth, len(auths))
+	for registry, entry := range auths {
+		simple[registry] = SimpleAuth{Auth: entry.Auth(), Email: entry.Email()}
+	}
+	return simple
+}
+
+// CompareThreeWay checks every registry in ocmAuths against the hive secret and the
+// target cluster secret, and the two secrets against each other.
+//
+// ocmAuths holds the expected auth/email per registry name. hiveData and targetData
+// are raw .dockerconfigjson payloads; an empty payload is treated as a secret with no
+// entries. Registries that appear only in hive or target are ignored. The returned
+// Auths slice is ordered by registry name. An error is returned only when a non-empty
+// payload is not valid JSON.
+func CompareThreeWay(ocmAuths map[string]SimpleAuth, hiveData []byte, targetData []byte) (*ThreeWayComparison, error) {
+	type authEntry struct {
+		Auth  string `json:"auth"`
+		Email string `json:"email"`
+	}
+	type dockerConfig struct {
+		Auths map[string]authEntry `json:"auths"`
+	}
+
+	hiveAuths := map[string]authEntry{}
+	if len(hiveData) > 0 {
+		var cfg dockerConfig
+		if err := json.Unmarshal(hiveData, &cfg); err != nil {
+			return nil, fmt.Errorf("failed to parse hive pull secret: %w", err)
+		}
+		hiveAuths = cfg.Auths
+	}
+
+	targetAuths := map[string]authEntry{}
+	if len(targetData) > 0 {
+		var cfg dockerConfig
+		if err := json.Unmarshal(targetData, &cfg); err != nil {
+			return nil, fmt.Errorf("failed to parse target pull secret: %w", err)
+		}
+		targetAuths = cfg.Auths
+	}
+
+	// Walk the OCM registries in name order so the result comes out sorted.
+	registries := make([]string, 0, len(ocmAuths))
+	for registry := range ocmAuths {
+		registries = append(registries, registry)
+	}
+	sort.Strings(registries)
+
+	comparison := &ThreeWayComparison{AllInSync: true}
+	for _, registry := range registries {
+		want := ocmAuths[registry]
+		inHive, hasHive := hiveAuths[registry]
+		inTarget, hasTarget := targetAuths[registry]
+
+		// A pair only "matches" when both sides exist and agree on auth and email.
+		state := ThreeWayAuthState{
+			Registry:          registry,
+			InOCM:             true,
+			InHive:            hasHive,
+			InTarget:          hasTarget,
+			OCMMatchesHive:    hasHive && want.Auth == inHive.Auth && want.Email == inHive.Email,
+			OCMMatchesTarget:  hasTarget && want.Auth == inTarget.Auth && want.Email == inTarget.Email,
+			HiveMatchesTarget: hasHive && hasTarget && inHive.Auth == inTarget.Auth && inHive.Email == inTarget.Email,
+		}
+
+		if !state.OCMMatchesHive {
+			comparison.HiveNeedsUpdate = true
+			comparison.AllInSync = false
+		}
+		if !state.OCMMatchesTarget {
+			comparison.TargetNeedsSync = true
+			comparison.AllInSync = false
+		}
+		if hasHive && hasTarget && !state.HiveMatchesTarget {
+			comparison.AllInSync = false
+		}
+
+		comparison.Auths = append(comparison.Auths, state)
+	}
+
+	return comparison, nil
+}
+
+// RenderThreeWayComparison prints the three-way comparison in a readable format.
+// sourceLabel identifies the OCM source (e.g. "ACCESS TOKEN AUTHS", "REGISTRY CREDENTIAL AUTHS").
+// When hasHive is false (HCP clusters), the hive columns are omitted.
+func RenderThreeWayComparison(result *ThreeWayComparison, sourceLabel string, hasHive bool, out io.Writer) { + table := tablewriter.NewWriter(out) + table.SetHeaderAlignment(tablewriter.ALIGN_LEFT) + table.SetAlignment(tablewriter.ALIGN_LEFT) + table.SetBorder(false) + table.SetColumnSeparator(" ") + table.SetAutoWrapText(false) + table.SetAutoFormatHeaders(false) + + if hasHive { + table.SetHeader([]string{sourceLabel, "OCM↔HIVE", "OCM↔TARGET", "HIVE↔TARGET"}) + table.SetHeaderColor( + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + ) + for _, a := range result.Auths { + table.Append([]string{ + a.Registry, + syncStatus(a.InHive, a.OCMMatchesHive), + syncStatus(a.InTarget, a.OCMMatchesTarget), + syncStatus(a.InHive && a.InTarget, a.HiveMatchesTarget), + }) + } + } else { + table.SetHeader([]string{sourceLabel, "OCM↔TARGET"}) + table.SetHeaderColor( + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + tablewriter.Colors{tablewriter.Bold, tablewriter.FgBlueColor}, + ) + for _, a := range result.Auths { + table.Append([]string{ + a.Registry, + syncStatus(a.InTarget, a.OCMMatchesTarget), + }) + } + } + table.Render() + + fmt.Fprintln(out) + if result.AllInSync { + fmt.Fprintf(out, " %s All sources in sync\n", psColorOK("[OK]")) + } else { + if hasHive && result.HiveNeedsUpdate { + fmt.Fprintf(out, " %s Hive secret needs update from OCM\n", psColorWarn("[!]")) + } + if result.TargetNeedsSync { + fmt.Fprintf(out, " %s Target cluster needs update from OCM\n", psColorWarn("[!]")) + } + } +} + +func syncStatus(present bool, matches bool) string { + if !present { + return color.New(color.FgYellow).Sprint("missing") + } + if matches { + return color.New(color.FgGreen).Sprint("match") + } + return color.New(color.FgYellow, color.Bold).Sprint("DIFFERS") 
+} + +// HiveNamespaceInfo holds the resolved Hive namespace and ClusterDeployment name +// for a given cluster. +type HiveNamespaceInfo struct { + Namespace string + ClusterDeploymentName string +} + +// FindHiveNamespace discovers the Hive namespace for a cluster by listing +// ClusterDeployments filtered by the api.openshift.com/id label. This avoids +// the fragile uhc-{env}-{clusterID} namespace construction. +func FindHiveNamespace(ctx context.Context, kubeCli client.Client, clusterID string) (*HiveNamespaceInfo, error) { + if err := hiveapiv1.AddToScheme(kubeCli.Scheme()); err != nil { + return nil, fmt.Errorf("failed to add hive scheme: %w", err) + } + + // Try label-based lookup first (fast, targeted) + cdList := &hiveapiv1.ClusterDeploymentList{} + labelSelector := client.MatchingLabels{"api.openshift.com/id": clusterID} + if err := kubeCli.List(ctx, cdList, labelSelector); err == nil && len(cdList.Items) > 0 { + cd := cdList.Items[0] + return &HiveNamespaceInfo{ + Namespace: cd.Namespace, + ClusterDeploymentName: cd.Name, + }, nil + } + + // Fallback: list all ClusterDeployments and match by ClusterMetadata + allCDs := &hiveapiv1.ClusterDeploymentList{} + if err := kubeCli.List(ctx, allCDs); err != nil { + return nil, fmt.Errorf("failed to list ClusterDeployments: %w", err) + } + + for _, cd := range allCDs.Items { + if cd.Spec.ClusterMetadata != nil && cd.Spec.ClusterMetadata.ClusterID == clusterID { + return &HiveNamespaceInfo{ + Namespace: cd.Namespace, + ClusterDeploymentName: cd.Name, + }, nil + } + } + + return nil, fmt.Errorf("no ClusterDeployment found for cluster ID %s", clusterID) +} + +// PreflightResult holds the outcome of pre-flight checks. +type PreflightResult struct { + SecretExists bool + SecretData []byte // existing .dockerconfigjson content, nil if missing +} + +// PreflightCheck validates that the target cluster's pull-secret exists and is +// readable before attempting any mutations. All operations are read-only. 
+// Returns a PreflightResult so callers can decide whether to create or update. +func PreflightCheck(ctx context.Context, clientset *kubernetes.Clientset, isHCP bool, clusterName string, out io.Writer) (*PreflightResult, error) { + fmt.Fprintf(out, "\nPre-flight checks on %s...\n", clusterName) + result := &PreflightResult{} + + secret, err := clientset.CoreV1().Secrets("openshift-config").Get(ctx, "pull-secret", metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + fmt.Fprintf(out, " %s secret openshift-config/pull-secret not found on %s\n", psColorWarn("[WARN]"), clusterName) + fmt.Fprintf(out, " The pull secret can be rebuilt from the owner's OCM access token and registry credentials.\n") + } else { + fmt.Fprintf(out, " %s failed to read secret openshift-config/pull-secret on %s: %v\n", psColorWarn("[WARN]"), clusterName, err) + } + return result, nil + } + result.SecretExists = true + + data, ok := secret.Data[".dockerconfigjson"] + if !ok { + fmt.Fprintf(out, " %s secret openshift-config/pull-secret exists on %s but is missing .dockerconfigjson key\n", psColorWarn("[WARN]"), clusterName) + return result, nil + } + result.SecretData = data + + fmt.Fprintf(out, " %s secret openshift-config/pull-secret exists on %s\n", psColorOK("[OK]"), clusterName) + fmt.Fprintf(out, " %s secret openshift-config/pull-secret has .dockerconfigjson key\n", psColorOK("[OK]")) + + if !isHCP { + pods, err := clientset.CoreV1().Pods("openshift-monitoring").List(ctx, metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/name=telemeter-client", + }) + if err == nil && len(pods.Items) > 0 { + fmt.Fprintf(out, " %s pods openshift-monitoring/telemeter-client found on %s (%d)\n", psColorOK("[OK]"), clusterName, len(pods.Items)) + } else if err != nil { + fmt.Fprintf(out, " %s could not list pods openshift-monitoring/telemeter-client on %s: %v\n", psColorWarn("[WARN]"), clusterName, err) + } + + pods, err = 
clientset.CoreV1().Pods("openshift-ocm-agent-operator").List(ctx, metav1.ListOptions{ + LabelSelector: "app=ocm-agent", + }) + if err == nil && len(pods.Items) > 0 { + fmt.Fprintf(out, " %s pods openshift-ocm-agent-operator/ocm-agent found on %s (%d)\n", psColorOK("[OK]"), clusterName, len(pods.Items)) + } else if err != nil { + fmt.Fprintf(out, " %s could not list pods openshift-ocm-agent-operator/ocm-agent on %s: %v\n", psColorWarn("[WARN]"), clusterName, err) + } + } + + fmt.Fprintf(out, " %s Pre-flight checks passed\n", psColorOK("[OK]")) + return result, nil +} + +// CountOwnerClusters returns the number of active clusters owned by the given +// account ID. +func CountOwnerClusters(ocm *sdk.Connection, accountID string, logger *logrus.Logger) int { + search := fmt.Sprintf("creator.id = '%s' and status != 'Deprovisioned' and status != 'Archived'", accountID) + resp, err := ocm.AccountsMgmt().V1().Subscriptions().List(). + Search(search). + Size(1). + Send() + if err != nil { + logger.Debugf("Could not query sibling clusters: %v", err) + return 0 + } + return resp.Total() +} + +// ListOwnerSubscriptions returns all active subscriptions for the given account ID. +func ListOwnerSubscriptions(ocm *sdk.Connection, accountID string) ([]ClusterSummary, error) { + search := fmt.Sprintf("creator.id = '%s' and status != 'Deprovisioned' and status != 'Archived'", accountID) + pageSize := 100 + request := ocm.AccountsMgmt().V1().Subscriptions().List(). + Search(search). 
+ Size(pageSize) + + var clusters []ClusterSummary + for { + resp, err := request.Send() + if err != nil { + return nil, err + } + + for _, sub := range resp.Items().Slice() { + name, _ := sub.GetDisplayName() + clusterID, _ := sub.GetClusterID() + status, _ := sub.GetStatus() + createdAt, _ := sub.GetCreatedAt() + + if clusterID == "" { + continue + } + + clusters = append(clusters, ClusterSummary{ + Name: name, + ID: clusterID, + Status: status, + CreatedAt: createdAt, + }) + } + + if resp.Size() < pageSize { + break + } + request.Page(resp.Page() + 1) + } + + return clusters, nil +} + +// GetLatestCredentialUpdate returns the most recent UpdatedAt time across +// all registry credentials for the given account. +func GetLatestCredentialUpdate(ocm *sdk.Connection, accountID string) (time.Time, error) { + creds, err := utils.GetRegistryCredentials(ocm, accountID) + if err != nil { + return time.Time{}, err + } + + var latest time.Time + for _, cred := range creds { + if updated, ok := cred.GetUpdatedAt(); ok { + if updated.After(latest) { + latest = updated + } + } + } + return latest, nil +} + +// pullSecretAuthEntry holds extracted auth data from a cluster pull secret. +type pullSecretAuthEntry struct { + auth string + email string +} + +// extractPullSecretAuth extracts an auth entry from a cluster pull secret by registry name. 
+func extractPullSecretAuth(authID string, secret *corev1.Secret) (*pullSecretAuthEntry, error) { + dockerConfigJSON, ok := secret.Data[".dockerconfigjson"] + if !ok { + return nil, fmt.Errorf("secret is missing .dockerconfigjson key") + } + + var parsed struct { + Auths map[string]struct { + Auth string `json:"auth"` + Email string `json:"email"` + } `json:"auths"` + } + if err := json.Unmarshal(dockerConfigJSON, &parsed); err != nil { + return nil, fmt.Errorf("failed to parse pull secret JSON: %w", err) + } + + entry, found := parsed.Auths[authID] + if !found { + return nil, fmt.Errorf("auth '%s' not found in pull secret", authID) + } + + return &pullSecretAuthEntry{ + auth: entry.Auth, + email: entry.Email, + }, nil +} + +const ( + checkSyncMaxAttempts = 24 // 24 × 5s = 2 minute total timeout for SyncSet sync + syncPollInterval = 5 * time.Second +) + +// MergePullSecretAuths merges new auths into existing pull secret data. Existing auths +// not present in newData are preserved. This never removes auths. +func MergePullSecretAuths(existingData, newData []byte) ([]byte, error) { + type auth struct { + Auth string `json:"auth"` + Email string `json:"email"` + } + type auths struct { + Auths map[string]auth `json:"auths"` + } + + var existing, incoming auths + + if err := json.Unmarshal(existingData, &existing); err != nil { + return nil, fmt.Errorf("failed to parse existing pull secret: %w", err) + } + if err := json.Unmarshal(newData, &incoming); err != nil { + return nil, fmt.Errorf("failed to parse new pull secret: %w", err) + } + + if existing.Auths == nil { + existing.Auths = make(map[string]auth) + } + + for k, v := range incoming.Auths { + if v.Auth == "" { + continue + } + existing.Auths[k] = v + } + + return json.Marshal(existing) +} + +// UpdateHivePullSecretSSS updates the pull secret in the given hive namespace +// using update-in-place (never deletes). If the secret doesn't exist, it creates it. 
+// When the secret exists, new auths are merged into the existing secret via MergePullSecretAuths, +// preserving any auths not present in the new data. +func UpdateHivePullSecretSSS(ctx context.Context, kubeCli client.Client, clientset *kubernetes.Clientset, hiveNamespace string, cdName string, pullsecret []byte, out io.Writer) error { + // Check for conflicting SyncSets before any mutations. + // If the user aborts here, no secrets have been modified. + if err := CheckExistingSyncSets(ctx, hiveNamespace, kubeCli, out); err != nil { + return err + } + + secretName := "pull" + + existing, err := clientset.CoreV1().Secrets(hiveNamespace).Get(ctx, secretName, metav1.GetOptions{}) + if err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed to read secret %s/%s: %w", hiveNamespace, secretName, err) + } + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: secretName, + Namespace: hiveNamespace, + }, + Type: corev1.SecretTypeDockerConfigJson, + Data: map[string][]byte{ + ".dockerconfigjson": pullsecret, + }, + } + _, createErr := clientset.CoreV1().Secrets(hiveNamespace).Create(ctx, secret, metav1.CreateOptions{}) + if createErr != nil { + return fmt.Errorf("failed to create secret %s/%s: %w", hiveNamespace, secretName, createErr) + } + } else { + existingData, ok := existing.Data[".dockerconfigjson"] + if !ok || len(existingData) == 0 { + fmt.Fprintf(out, "%s secret %s/%s exists but missing .dockerconfigjson key — overwriting\n", psColorWarn("[WARN]"), hiveNamespace, secretName) + existing.Data[".dockerconfigjson"] = pullsecret + } else { + mergedData, mergeErr := MergePullSecretAuths(existingData, pullsecret) + if mergeErr != nil { + return fmt.Errorf("failed to merge pull secret auths for %s/%s: %w", hiveNamespace, secretName, mergeErr) + } + existing.Data[".dockerconfigjson"] = mergedData + } + _, updateErr := clientset.CoreV1().Secrets(hiveNamespace).Update(ctx, existing, metav1.UpdateOptions{}) + if updateErr != nil { + return 
fmt.Errorf("failed to update secret %s/%s: %w", hiveNamespace, secretName, updateErr) + } + } + + fmt.Fprintf(out, "[OK] hive secret %s/pull updated\n", hiveNamespace) + + if err := syncPullSecretViaHive(ctx, hiveNamespace, cdName, kubeCli, out); err != nil { + return fmt.Errorf("hive secret updated but sync to target failed: %w. Re-run this command to retry the sync", err) + } + + return nil +} + +// SyncSetName is the name used by this tool for pull secret SyncSets. +// Distinct from "pull-secret-replacement" used by transfer-owner to avoid collisions. +const SyncSetName = "pull-secret-update" + +// CheckExistingSyncSets checks for existing SyncSets that could interfere with +// a new pull secret sync. Must be called BEFORE updating the hive secret so that +// aborting leaves no mutations. +func CheckExistingSyncSets(ctx context.Context, hiveNamespace string, kubeCli client.Client, out io.Writer) error { + for _, ssName := range []string{SyncSetName, "pull-secret-replacement"} { + existing := &hiveapiv1.SyncSet{} + existing.Name = ssName + existing.Namespace = hiveNamespace + if err := kubeCli.Get(ctx, client.ObjectKeyFromObject(existing), existing); err == nil { + fmt.Fprintf(out, "\n%s Existing SyncSet %s/%s found.\n", psColorWarn("[WARN]"), hiveNamespace, ssName) + if ssName == "pull-secret-replacement" { + fmt.Fprintf(out, " This SyncSet was created by transfer-owner or a previous tool.\n") + } else { + fmt.Fprintf(out, " This SyncSet was created by a previous pull-secret update run.\n") + } + age := time.Since(existing.CreationTimestamp.Time) + fmt.Fprintf(out, " Created: %s (%s ago)\n", existing.CreationTimestamp.Format("2006-01-02 15:04:05 UTC"), age.Truncate(time.Second)) + if age < 5*time.Minute { + fmt.Fprintf(out, " %s Created recently — another SRE may be running this tool on the same cluster.\n", psColorWarn("[WARN]")) + } + fmt.Fprintf(out, "\n No changes have been made yet — it is safe to abort.\n") + fmt.Fprintf(out, "\n Options:\n") + 
fmt.Fprintf(out, " 1. Delete it and continue (recommended if orphaned)\n") + fmt.Fprintf(out, " 2. Abort — investigate manually (recommended if concurrent)\n") + fmt.Fprintf(out, " Delete existing SyncSet and continue? ") + + reader := bufio.NewReader(os.Stdin) + response, readErr := reader.ReadString('\n') + if readErr != nil && readErr != io.EOF { + return fmt.Errorf("failed to read user input: %w", readErr) + } + response = strings.TrimSpace(strings.ToLower(response)) + if response != "y" && response != "yes" { + return fmt.Errorf("aborted — existing SyncSet %s/%s needs manual investigation", hiveNamespace, ssName) + } + + fmt.Fprintf(out, " Deleting SyncSet %s/%s...\n", hiveNamespace, ssName) + if delErr := kubeCli.Delete(ctx, existing); delErr != nil { + return fmt.Errorf("failed to delete existing SyncSet %s/%s: %w", hiveNamespace, ssName, delErr) + } + time.Sleep(5 * time.Second) + } + } + return nil +} + +// syncPullSecretViaHive creates a SyncSet to sync the pull secret from hive to +// the target cluster, polls ClusterSync for completion, then cleans up the SyncSet. +// CheckExistingSyncSets must be called before this function. 
+func syncPullSecretViaHive(ctx context.Context, hiveNamespace string, cdName string, kubeCli client.Client, out io.Writer) error { + syncSet := &hiveapiv1.SyncSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: SyncSetName, + Namespace: hiveNamespace, + }, + Spec: hiveapiv1.SyncSetSpec{ + ClusterDeploymentRefs: []corev1.LocalObjectReference{ + {Name: cdName}, + }, + SyncSetCommonSpec: hiveapiv1.SyncSetCommonSpec{ + ResourceApplyMode: "Upsert", + Secrets: []hiveapiv1.SecretMapping{ + { + SourceRef: hiveapiv1.SecretReference{ + Name: "pull", + Namespace: hiveNamespace, + }, + TargetRef: hiveapiv1.SecretReference{ + Name: "pull-secret", + Namespace: "openshift-config", + }, + }, + }, + }, + }, + } + + if err := kubeCli.Create(ctx, syncSet); err != nil { + return fmt.Errorf("failed to create SyncSet: %w", err) + } + // Use the server-side creation timestamp for comparison, not local time. + // Kubernetes timestamps have second-level precision; comparing against + // a nanosecond-precision local time.Now() causes false negatives when + // the sync completes within the same second as the creation. 
+ syncSetCreatedAt := syncSet.CreationTimestamp.Time + fmt.Fprintf(out, "SyncSet %s in namespace %s has been created.\n", SyncSetName, hiveNamespace) + + if err := hiveinternalv1alpha1.AddToScheme(kubeCli.Scheme()); err != nil { + return fmt.Errorf("failed to add hiveinternal scheme: %w", err) + } + + searchStatus := &hiveinternalv1alpha1.ClusterSync{ + ObjectMeta: metav1.ObjectMeta{ + Name: cdName, + Namespace: hiveNamespace, + }, + } + foundStatus := &hiveinternalv1alpha1.ClusterSync{} + isSynced := false + var lastGetErr error + reader := bufio.NewReader(os.Stdin) + + // Poll in 60s rounds (12 × 5s), prompt user to continue or abort each round + for round := 0; ; round++ { + for i := 0; i < 12; i++ { + if err := kubeCli.Get(ctx, client.ObjectKeyFromObject(searchStatus), foundStatus); err != nil { + lastGetErr = err + fmt.Fprintf(out, "!") + time.Sleep(syncPollInterval) + continue + } + lastGetErr = nil + + for _, status := range foundStatus.Status.SyncSets { + if status.Name == SyncSetName && status.FirstSuccessTime != nil { + if !status.FirstSuccessTime.Time.Before(syncSetCreatedAt) { + isSynced = true + break + } + } + } + + if isSynced { + fmt.Fprintf(out, "\nSync completed...\n") + break + } + + fmt.Fprintf(out, ".") + time.Sleep(syncPollInterval) + } + + if isSynced { + break + } + + fmt.Fprintf(out, "\n%s SyncSet sync not confirmed after %d seconds.\n", psColorWarn("[WARN]"), (round+1)*60) + if lastGetErr != nil { + fmt.Fprintf(out, " Last error: %v\n", lastGetErr) + } + fmt.Fprintf(out, "Continue waiting? 
(y/N): ") + response, readErr := reader.ReadString('\n') + if readErr != nil && readErr != io.EOF { + break + } + response = strings.TrimSpace(strings.ToLower(response)) + if response != "y" && response != "yes" { + break + } + } + + // Always clean up the SyncSet, even on timeout + if delErr := kubeCli.Delete(ctx, syncSet); delErr != nil { + fmt.Fprintf(out, "\n%s failed to delete SyncSet %s/%s: %v\n", psColorWarn("[WARN]"), hiveNamespace, SyncSetName, delErr) + } + + if !isSynced { + if lastGetErr != nil { + return fmt.Errorf("SyncSet %s/%s sync not confirmed (last error: %v) (SyncSet cleaned up)", hiveNamespace, SyncSetName, lastGetErr) + } + return fmt.Errorf("SyncSet %s/%s sync not confirmed (SyncSet cleaned up). Re-run this command to retry", hiveNamespace, SyncSetName) + } + + return nil +} + +// UpdateHCPPullSecretViaManifestWork updates the pull secret within a ManifestWork +// on the service cluster for HCP clusters. +// +// HCP pull secret architecture: +// - This operates at level 1 (HostedCluster.spec.pullSecret) +// - HCCO reconciles changes to kube-system/original-pull-secret on the hosted cluster +// - Customer-added registries in kube-system/additional-pull-secret are not affected +// - Ref: https://access.redhat.com/solutions/7118834 +// - Ref: https://hypershift.pages.dev/how-to/powervs/global-pull-secret/ +func UpdateHCPPullSecretViaManifestWork(ctx context.Context, ocm *sdk.Connection, kubeCli client.Client, clusterID, mgmtClusterName string, pullsecret []byte, out io.Writer) error { + if err := workv1.AddToScheme(kubeCli.Scheme()); err != nil { + return fmt.Errorf("failed to add work scheme: %w", err) + } + + hostedCluster, err := utils.GetClusterAnyStatus(ocm, clusterID) + if err != nil { + return fmt.Errorf("failed to get cluster: %w", err) + } + + secretNamePrefix := hostedCluster.DomainPrefix() + "-pull" + newSecretName := secretNamePrefix + "-" + randomHexSuffix(6) + + timeoutCtx, cancel := context.WithTimeout(ctx, 2*time.Minute) + defer 
cancel() + + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + manifestWork := &workv1.ManifestWork{} + if err := kubeCli.Get(timeoutCtx, types.NamespacedName{Name: clusterID, Namespace: mgmtClusterName}, manifestWork); err != nil { + return fmt.Errorf("failed to get ManifestWork %s/%s: %w", mgmtClusterName, clusterID, err) + } + + if err := updateManifestWorkPayloads(manifestWork, secretNamePrefix, newSecretName, pullsecret); err != nil { + return err + } + + return kubeCli.Update(timeoutCtx, manifestWork, &client.UpdateOptions{}) + }) + if err != nil { + return fmt.Errorf("cannot update pull-secret within ManifestWork: %w", err) + } + + fmt.Fprintf(out, "ManifestWork updated. Waiting for secret to sync on hosted cluster...\n") + + // Poll the ManifestWork status for applied condition rather than sleeping a fixed duration. + // Uses the parent ctx (not timeoutCtx) since the user controls loop duration via prompts. + // Each round polls for 60s (12 × 5s), then prompts to continue or abort. + reader := bufio.NewReader(os.Stdin) + for round := 0; ; round++ { + for i := 0; i < 12; i++ { + mw := &workv1.ManifestWork{} + if getErr := kubeCli.Get(ctx, types.NamespacedName{Name: clusterID, Namespace: mgmtClusterName}, mw); getErr == nil { + for _, cond := range mw.Status.Conditions { + if cond.Type == "Applied" && cond.Status == "True" { + fmt.Fprintf(out, "\nManifestWork applied.\n") + return nil + } + } + } + fmt.Fprintf(out, ".") + time.Sleep(5 * time.Second) + } + fmt.Fprintf(out, "\n%s ManifestWork sync not confirmed after %d seconds.\n", psColorWarn("[WARN]"), (round+1)*60) + fmt.Fprintf(out, "Continue waiting? 
(y/N): ") + response, readErr := reader.ReadString('\n') + if readErr != nil && readErr != io.EOF { + return fmt.Errorf("failed to read user input: %w", readErr) + } + response = strings.TrimSpace(strings.ToLower(response)) + if response != "y" && response != "yes" { + fmt.Fprintf(out, "%s Aborting wait — verify ManifestWork sync manually.\n", psColorWarn("[WARN]")) + return nil + } + } +} + +func updateManifestWorkPayloads(mw *workv1.ManifestWork, secretNamePrefix, newSecretName string, pullsecret []byte) error { + secretUpdated := false + hcIndex := -1 + + for i, manifest := range mw.Spec.Workload.Manifests { + if manifest.Raw == nil { + continue + } + + var meta struct { + Kind string `json:"kind"` + } + if err := json.Unmarshal(manifest.Raw, &meta); err != nil { + return err + } + + switch meta.Kind { + case "Secret": + secret := &corev1.Secret{} + if err := json.Unmarshal(manifest.Raw, secret); err != nil { + return err + } + if strings.HasPrefix(secret.Name, secretNamePrefix) { + if secret.Data == nil { + secret.Data = map[string][]byte{} + } + oldPullSecret, hasKey := secret.Data[".dockerconfigjson"] + var newPullSecret []byte + if !hasKey || len(oldPullSecret) == 0 { + newPullSecret = pullsecret + } else { + var mergeErr error + newPullSecret, mergeErr = MergePullSecretAuths(oldPullSecret, pullsecret) + if mergeErr != nil { + return fmt.Errorf("cannot merge pull secret auths: %w", mergeErr) + } + } + secret.Name = newSecretName + secret.Data[".dockerconfigjson"] = newPullSecret + secretJSON, err := json.Marshal(secret) + if err != nil { + return err + } + mw.Spec.Workload.Manifests[i].Raw = secretJSON + secretUpdated = true + } + case "HostedCluster": + hcIndex = i + } + } + + if !secretUpdated { + return fmt.Errorf("no Secret matching prefix %q found in ManifestWork", secretNamePrefix) + } + if hcIndex >= 0 { + hc := &hypershiftv1beta1.HostedCluster{} + if err := json.Unmarshal(mw.Spec.Workload.Manifests[hcIndex].Raw, hc); err != nil { + return err + } + 
hc.Spec.PullSecret.Name = newSecretName + hcJSON, err := json.Marshal(hc) + if err != nil { + return err + } + mw.Spec.Workload.Manifests[hcIndex].Raw = hcJSON + } + + return nil +} + +// RestartPodsBySelector deletes pods matching the selector in the namespace to trigger a rollout. +func RestartPodsBySelector(ctx context.Context, clientset *kubernetes.Clientset, namespace, selector string, out io.Writer) error { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: selector, + }) + if err != nil { + return fmt.Errorf("failed to list pods in namespace '%s' with selector '%s': %w", namespace, selector, err) + } + + for _, pod := range pods.Items { + if err := clientset.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil { + return fmt.Errorf("failed to delete pod '%s' in namespace '%s': %w", pod.Name, namespace, err) + } + fmt.Fprintf(out, "Pod %s in namespace %s has been deleted.\n", pod.Name, namespace) + } + + fmt.Fprintf(out, "Pods in namespace %s with selector '%s' have been deleted.\n", namespace, selector) + return nil +} + +func randomHexSuffix(length int) string { + const chars = "0123456789abcdef" + result := make([]byte, length) + for i := range result { + result[i] = chars[rand.Intn(len(chars))] //nolint:gosec // resource name suffix, not security-sensitive + } + return string(result) +} diff --git a/pkg/controller/pullsecret_test.go b/pkg/controller/pullsecret_test.go new file mode 100644 index 000000000..153383eb3 --- /dev/null +++ b/pkg/controller/pullsecret_test.go @@ -0,0 +1,820 @@ +package controller + +import ( + "bytes" + "encoding/json" + "fmt" + "testing" + + "context" + + amv1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1" + hiveapiv1 "github.com/openshift/hive/apis/hive/v1" + hiveinternalv1alpha1 "github.com/openshift/hive/apis/hiveinternal/v1alpha1" + hypershiftv1beta1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/sirupsen/logrus" + 
corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + workv1 "open-cluster-management.io/api/work/v1" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// --- helpers --- + +func makePullSecretJSON(auths map[string]map[string]string) []byte { + ps := map[string]map[string]map[string]string{"auths": auths} + b, err := json.Marshal(ps) + if err != nil { + panic(fmt.Sprintf("makePullSecretJSON: %v", err)) + } + return b +} + +// --- ValidateRequiredAuths --- + +func buildAuthMap(registries ...string) map[string]*amv1.AccessTokenAuth { + m := make(map[string]*amv1.AccessTokenAuth, len(registries)) + for _, r := range registries { + auth, err := amv1.NewAccessTokenAuth().Auth("tok").Email("e@e").Build() + if err != nil { + panic(fmt.Sprintf("buildAuthMap: %v", err)) + } + m[r] = auth + } + return m +} + +func TestValidateRequiredAuths_AllPresent(t *testing.T) { + auths := buildAuthMap(RequiredPullSecretAuths...) + missing := ValidateRequiredAuths(auths) + if len(missing) != 0 { + t.Fatalf("expected 0 missing, got %v", missing) + } +} + +func TestValidateRequiredAuths_SomeMissing(t *testing.T) { + auths := buildAuthMap("quay.io") + missing := ValidateRequiredAuths(auths) + if len(missing) != 3 { + t.Fatalf("expected 3 missing, got %d: %v", len(missing), missing) + } +} + +func TestValidateRequiredAuths_Empty(t *testing.T) { + missing := ValidateRequiredAuths(map[string]*amv1.AccessTokenAuth{}) + if len(missing) != 4 { + t.Fatalf("expected 4 missing, got %d", len(missing)) + } +} + +// --- CompareThreeWay --- + +func TestCompareThreeWay_AllInSync(t *testing.T) { + ocm := map[string]SimpleAuth{ + "quay.io": {Auth: "tok1", Email: "a@b"}, + "registry.redhat.io": {Auth: "tok2", Email: "a@b"}, + } + hive := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "a@b"}, + "registry.redhat.io": {"auth": "tok2", "email": "a@b"}, + }) + target := 
makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "a@b"}, + "registry.redhat.io": {"auth": "tok2", "email": "a@b"}, + }) + + result, err := CompareThreeWay(ocm, hive, target) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !result.AllInSync { + t.Error("expected AllInSync=true") + } + if result.HiveNeedsUpdate { + t.Error("expected HiveNeedsUpdate=false") + } + if result.TargetNeedsSync { + t.Error("expected TargetNeedsSync=false") + } + if len(result.Auths) != 2 { + t.Errorf("expected 2 auth states, got %d", len(result.Auths)) + } +} + +func TestCompareThreeWay_TargetDiffers(t *testing.T) { + ocm := map[string]SimpleAuth{ + "quay.io": {Auth: "tok1", Email: "a@b"}, + } + hive := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "a@b"}, + }) + target := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "tampered@bad.com"}, + }) + + result, err := CompareThreeWay(ocm, hive, target) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.AllInSync { + t.Error("expected AllInSync=false") + } + if !result.TargetNeedsSync { + t.Error("expected TargetNeedsSync=true") + } + if result.HiveNeedsUpdate { + t.Error("expected HiveNeedsUpdate=false (hive matches OCM)") + } +} + +func TestCompareThreeWay_HiveDiffers(t *testing.T) { + ocm := map[string]SimpleAuth{ + "quay.io": {Auth: "newtok", Email: "a@b"}, + } + hive := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "oldtok", "email": "a@b"}, + }) + target := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "oldtok", "email": "a@b"}, + }) + + result, err := CompareThreeWay(ocm, hive, target) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.AllInSync { + t.Error("expected AllInSync=false") + } + if !result.HiveNeedsUpdate { + t.Error("expected HiveNeedsUpdate=true") + } + if !result.TargetNeedsSync { + 
t.Error("expected TargetNeedsSync=true") + } +} + +func TestCompareThreeWay_NilHive(t *testing.T) { + ocm := map[string]SimpleAuth{ + "quay.io": {Auth: "tok", Email: "a@b"}, + } + target := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok", "email": "a@b"}, + }) + + result, err := CompareThreeWay(ocm, nil, target) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.AllInSync { + t.Error("expected AllInSync=false when hive is nil") + } + if !result.HiveNeedsUpdate { + t.Error("expected HiveNeedsUpdate=true when hive is nil") + } + if result.TargetNeedsSync { + t.Error("expected TargetNeedsSync=false since target matches OCM") + } +} + +func TestCompareThreeWay_ExtraRegistryInTarget(t *testing.T) { + ocm := map[string]SimpleAuth{ + "quay.io": {Auth: "tok", Email: "a@b"}, + } + hive := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok", "email": "a@b"}, + }) + target := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok", "email": "a@b"}, + "custom.registry.io": {"auth": "custom", "email": "c@d"}, + }) + + result, err := CompareThreeWay(ocm, hive, target) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !result.AllInSync { + t.Error("expected AllInSync=true — extra registries in target are not in OCM scope") + } + if len(result.Auths) != 1 { + t.Errorf("expected 1 auth state (only quay.io), got %d", len(result.Auths)) + } +} + +func TestCompareThreeWay_InvalidHiveJSON(t *testing.T) { + ocm := map[string]SimpleAuth{ + "quay.io": {Auth: "tok", Email: "a@b"}, + } + _, err := CompareThreeWay(ocm, []byte("not json"), nil) + if err == nil { + t.Error("expected error for invalid hive JSON") + } +} + +func TestCompareThreeWay_InvalidTargetJSON(t *testing.T) { + ocm := map[string]SimpleAuth{ + "quay.io": {Auth: "tok", Email: "a@b"}, + } + _, err := CompareThreeWay(ocm, nil, []byte("{corrupt")) + if err == nil { + t.Error("expected error for invalid target 
JSON") + } +} + +// --- MergePullSecretAuths --- + +func TestMergePullSecretAuths_AddNew(t *testing.T) { + existing := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "a@b"}, + }) + incoming := makePullSecretJSON(map[string]map[string]string{ + "registry.redhat.io": {"auth": "tok2", "email": "a@b"}, + }) + + merged, err := MergePullSecretAuths(existing, incoming) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var parsed struct { + Auths map[string]struct { + Auth string `json:"auth"` + Email string `json:"email"` + } `json:"auths"` + } + if err := json.Unmarshal(merged, &parsed); err != nil { + t.Fatalf("failed to parse merged: %v", err) + } + if len(parsed.Auths) != 2 { + t.Fatalf("expected 2 auths, got %d", len(parsed.Auths)) + } + if parsed.Auths["quay.io"].Auth != "tok1" { + t.Error("existing auth was overwritten") + } + if parsed.Auths["registry.redhat.io"].Auth != "tok2" { + t.Error("new auth not added") + } +} + +func TestMergePullSecretAuths_OverwriteExisting(t *testing.T) { + existing := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "oldtok", "email": "old@e"}, + }) + incoming := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "newtok", "email": "new@e"}, + }) + + merged, err := MergePullSecretAuths(existing, incoming) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var parsed struct { + Auths map[string]struct { + Auth string `json:"auth"` + Email string `json:"email"` + } `json:"auths"` + } + if err := json.Unmarshal(merged, &parsed); err != nil { + t.Fatalf("failed to parse merged: %v", err) + } + if parsed.Auths["quay.io"].Auth != "newtok" { + t.Error("incoming auth did not overwrite existing") + } + if parsed.Auths["quay.io"].Email != "new@e" { + t.Error("incoming email did not overwrite existing") + } +} + +func TestMergePullSecretAuths_NeverDeletes(t *testing.T) { + existing := makePullSecretJSON(map[string]map[string]string{ 
+ "quay.io": {"auth": "tok1", "email": "a@b"}, + "registry.redhat.io": {"auth": "tok2", "email": "a@b"}, + "registry.connect.redhat.com": {"auth": "tok3", "email": "a@b"}, + }) + incoming := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "newtok", "email": "a@b"}, + }) + + merged, err := MergePullSecretAuths(existing, incoming) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var parsed struct { + Auths map[string]struct{} `json:"auths"` + } + if err := json.Unmarshal(merged, &parsed); err != nil { + t.Fatalf("failed to parse merged: %v", err) + } + if len(parsed.Auths) != 3 { + t.Fatalf("merge deleted auths: expected 3, got %d", len(parsed.Auths)) + } +} + +func TestMergePullSecretAuths_InvalidJSON(t *testing.T) { + _, err := MergePullSecretAuths([]byte("not json"), makePullSecretJSON(map[string]map[string]string{})) + if err == nil { + t.Error("expected error for invalid existing JSON") + } + + _, err = MergePullSecretAuths(makePullSecretJSON(map[string]map[string]string{}), []byte("not json")) + if err == nil { + t.Error("expected error for invalid incoming JSON") + } +} + +func TestMergePullSecretAuths_NilAuthsMap(t *testing.T) { + existing := []byte(`{"auths":null}`) + incoming := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "a@b"}, + }) + + merged, err := MergePullSecretAuths(existing, incoming) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var parsed struct { + Auths map[string]struct{} `json:"auths"` + } + if err := json.Unmarshal(merged, &parsed); err != nil { + t.Fatalf("failed to parse merged: %v", err) + } + if len(parsed.Auths) != 1 { + t.Fatalf("expected 1 auth, got %d", len(parsed.Auths)) + } +} + +func TestMergePullSecretAuths_EmptyAuthSkipped(t *testing.T) { + existing := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "validtok", "email": "a@b"}, + }) + incoming := makePullSecretJSON(map[string]map[string]string{ + 
"quay.io": {"auth": "", "email": "a@b"}, + }) + + merged, err := MergePullSecretAuths(existing, incoming) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var parsed struct { + Auths map[string]struct { + Auth string `json:"auth"` + } `json:"auths"` + } + if err := json.Unmarshal(merged, &parsed); err != nil { + t.Fatalf("failed to parse merged: %v", err) + } + if parsed.Auths["quay.io"].Auth != "validtok" { + t.Errorf("empty auth should not overwrite valid auth, got %q", parsed.Auths["quay.io"].Auth) + } +} + +func TestMergePullSecretAuths_EmptyExisting(t *testing.T) { + existing := []byte(`{}`) + incoming := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "a@b"}, + }) + + merged, err := MergePullSecretAuths(existing, incoming) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var parsed struct { + Auths map[string]struct{} `json:"auths"` + } + if err := json.Unmarshal(merged, &parsed); err != nil { + t.Fatalf("failed to parse merged: %v", err) + } + if len(parsed.Auths) != 1 { + t.Fatalf("expected 1 auth, got %d", len(parsed.Auths)) + } +} + +// --- extractPullSecretAuth --- + +func TestExtractPullSecretAuth_Valid(t *testing.T) { + secret := &corev1.Secret{ + Data: map[string][]byte{ + ".dockerconfigjson": makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "a@b"}, + }), + }, + } + + entry, err := extractPullSecretAuth("quay.io", secret) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if entry.auth != "tok1" { + t.Errorf("expected auth=tok1, got %s", entry.auth) + } + if entry.email != "a@b" { + t.Errorf("expected email=a@b, got %s", entry.email) + } +} + +func TestExtractPullSecretAuth_MissingRegistry(t *testing.T) { + secret := &corev1.Secret{ + Data: map[string][]byte{ + ".dockerconfigjson": makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "tok1", "email": "a@b"}, + }), + }, + } + + _, err := 
extractPullSecretAuth("registry.redhat.io", secret) + if err == nil { + t.Error("expected error for missing registry") + } +} + +func TestExtractPullSecretAuth_MissingDockerConfig(t *testing.T) { + secret := &corev1.Secret{Data: map[string][]byte{}} + _, err := extractPullSecretAuth("quay.io", secret) + if err == nil { + t.Error("expected error for missing .dockerconfigjson") + } +} + +func TestExtractPullSecretAuth_InvalidJSON(t *testing.T) { + secret := &corev1.Secret{ + Data: map[string][]byte{ + ".dockerconfigjson": []byte("not json"), + }, + } + _, err := extractPullSecretAuth("quay.io", secret) + if err == nil { + t.Error("expected error for invalid JSON") + } +} + +// --- updateManifestWorkPayloads --- + +func TestUpdateManifestWorkPayloads_UpdatesSecretAndHC(t *testing.T) { + secret := &corev1.Secret{} + secret.APIVersion = "v1" + secret.Kind = "Secret" + secret.Name = "test-ps-secret-abc" + secret.Namespace = "clusters" + secret.Data = map[string][]byte{ + ".dockerconfigjson": makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "oldtok", "email": "old@e"}, + }), + } + secretJSON, _ := json.Marshal(secret) + + hc := &hypershiftv1beta1.HostedCluster{} + hc.APIVersion = "hypershift.openshift.io/v1beta1" + hc.Kind = "HostedCluster" + hc.Name = "test-cluster" + hc.Spec.PullSecret.Name = "test-ps-secret-abc" + hcJSON, _ := json.Marshal(hc) + + mw := &workv1.ManifestWork{ + Spec: workv1.ManifestWorkSpec{ + Workload: workv1.ManifestsTemplate{ + Manifests: []workv1.Manifest{ + {RawExtension: runtime.RawExtension{Raw: secretJSON}}, + {RawExtension: runtime.RawExtension{Raw: hcJSON}}, + }, + }, + }, + } + + newPS := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "newtok", "email": "new@e"}, + "registry.redhat.io": {"auth": "tok2", "email": "new@e"}, + }) + + err := updateManifestWorkPayloads(mw, "test-ps-secret", "test-ps-secret-xyz", newPS) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify the 
secret was updated + var updatedSecret corev1.Secret + if err := json.Unmarshal(mw.Spec.Workload.Manifests[0].Raw, &updatedSecret); err != nil { + t.Fatalf("failed to parse updated secret: %v", err) + } + if updatedSecret.Name != "test-ps-secret-xyz" { + t.Errorf("secret name not updated: got %s", updatedSecret.Name) + } + var parsed struct { + Auths map[string]struct { + Auth string `json:"auth"` + } `json:"auths"` + } + if err := json.Unmarshal(updatedSecret.Data[".dockerconfigjson"], &parsed); err != nil { + t.Fatalf("failed to parse updated secret data: %v", err) + } + if parsed.Auths["quay.io"].Auth != "newtok" { + t.Error("secret auth not updated") + } + if _, ok := parsed.Auths["registry.redhat.io"]; !ok { + t.Error("new registry not added to secret") + } + + // Verify the HostedCluster pullSecret ref was updated + var updatedHC hypershiftv1beta1.HostedCluster + if err := json.Unmarshal(mw.Spec.Workload.Manifests[1].Raw, &updatedHC); err != nil { + t.Fatalf("failed to parse updated HC: %v", err) + } + if updatedHC.Spec.PullSecret.Name != "test-ps-secret-xyz" { + t.Errorf("HC pullSecret name not updated: got %s", updatedHC.Spec.PullSecret.Name) + } +} + +func TestUpdateManifestWorkPayloads_SkipsNonMatchingSecret(t *testing.T) { + secret := &corev1.Secret{} + secret.APIVersion = "v1" + secret.Kind = "Secret" + secret.Name = "other-secret" + secret.Namespace = "clusters" + secret.Data = map[string][]byte{"key": []byte("value")} + secretJSON, _ := json.Marshal(secret) + + mw := &workv1.ManifestWork{ + Spec: workv1.ManifestWorkSpec{ + Workload: workv1.ManifestsTemplate{ + Manifests: []workv1.Manifest{ + {RawExtension: runtime.RawExtension{Raw: secretJSON}}, + }, + }, + }, + } + + err := updateManifestWorkPayloads(mw, "test-ps-secret", "test-ps-secret-new", []byte(`{"auths":{}}`)) + if err == nil { + t.Fatal("expected error when no Secret matches prefix") + } + + var unchanged corev1.Secret + if err := json.Unmarshal(mw.Spec.Workload.Manifests[0].Raw, &unchanged); 
err != nil { + t.Fatalf("failed to parse unchanged secret: %v", err) + } + if unchanged.Name != "other-secret" { + t.Error("non-matching secret was modified") + } +} + +func TestUpdateManifestWorkPayloads_SecretWithoutHC(t *testing.T) { + secret := &corev1.Secret{} + secret.APIVersion = "v1" + secret.Kind = "Secret" + secret.Name = "test-ps-secret-abc" + secret.Data = map[string][]byte{ + ".dockerconfigjson": makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "oldtok", "email": "old@e"}, + }), + } + secretJSON, _ := json.Marshal(secret) + + mw := &workv1.ManifestWork{ + Spec: workv1.ManifestWorkSpec{ + Workload: workv1.ManifestsTemplate{ + Manifests: []workv1.Manifest{ + {RawExtension: runtime.RawExtension{Raw: secretJSON}}, + }, + }, + }, + } + + newPS := makePullSecretJSON(map[string]map[string]string{ + "quay.io": {"auth": "newtok", "email": "new@e"}, + }) + + err := updateManifestWorkPayloads(mw, "test-ps-secret", "test-ps-secret-xyz", newPS) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var updated corev1.Secret + if err := json.Unmarshal(mw.Spec.Workload.Manifests[0].Raw, &updated); err != nil { + t.Fatalf("failed to parse: %v", err) + } + if updated.Name != "test-ps-secret-xyz" { + t.Errorf("secret name not updated: got %s", updated.Name) + } +} + +// --- PullSecretOp --- + +func TestPullSecretOp_Section(t *testing.T) { + var buf bytes.Buffer + op := NewPullSecretOp(false, logrus.New(), &buf) + op.Section(1, "Test Step", "Line one", "Line two") + + out := buf.String() + if !bytes.Contains([]byte(out), []byte("Step 1:")) { + t.Error("section missing step number") + } + if !bytes.Contains([]byte(out), []byte("Test Step")) { + t.Error("section missing title") + } + if !bytes.Contains([]byte(out), []byte("Line one")) { + t.Error("section missing description line") + } +} + +func TestPullSecretOp_DryRunPrefixes(t *testing.T) { + var buf bytes.Buffer + op := NewPullSecretOp(true, logrus.New(), &buf) + + op.OK("test ok") + if 
!bytes.Contains(buf.Bytes(), []byte("[Dry Run]")) { + t.Error("dry-run OK missing prefix") + } + if !bytes.Contains(buf.Bytes(), []byte("[OK]")) { + t.Error("OK missing [OK] marker") + } + + buf.Reset() + op.Fail("test fail %s", "reason") + if !bytes.Contains(buf.Bytes(), []byte("[Dry Run]")) { + t.Error("dry-run Fail missing prefix") + } + if !bytes.Contains(buf.Bytes(), []byte("[FAIL]")) { + t.Error("Fail missing [FAIL] marker") + } + buf.Reset() + op.Would("do something") + if !bytes.Contains(buf.Bytes(), []byte("Would:")) { + t.Error("Would missing Would: prefix") + } +} + +func TestPullSecretOp_LiveModePrefixes(t *testing.T) { + var buf bytes.Buffer + op := NewPullSecretOp(false, logrus.New(), &buf) + + op.OK("live ok") + if bytes.Contains(buf.Bytes(), []byte("[Dry Run]")) { + t.Error("live mode should not have [Dry Run] prefix") + } + if !bytes.Contains(buf.Bytes(), []byte("[OK]")) { + t.Error("live OK missing [OK] marker") + } +} + +func TestPullSecretOp_FailTracking(t *testing.T) { + var buf bytes.Buffer + op := NewPullSecretOp(false, logrus.New(), &buf) + + op.OK("good thing") + if len(op.Failures) != 0 { + t.Error("OK should not add failures") + } + + op.Fail("bad thing: %s", "reason") + if len(op.Failures) != 1 { + t.Fatalf("expected 1 failure, got %d", len(op.Failures)) + } + if op.Failures[0] != "bad thing: reason" { + t.Errorf("failure message wrong: %s", op.Failures[0]) + } + + op.Fail("another fail") + if len(op.Failures) != 2 { + t.Fatalf("expected 2 failures, got %d", len(op.Failures)) + } +} + +// --- syncStatus --- + +func TestSyncStatus(t *testing.T) { + tests := []struct { + name string + present bool + matches bool + expected string + }{ + {"present and matches", true, true, "match"}, + {"present but differs", true, false, "DIFFERS"}, + {"not present", false, false, "missing"}, + {"not present but matches (edge)", false, true, "missing"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := syncStatus(tt.present, 
tt.matches) + if got != tt.expected { + t.Errorf("syncStatus(%v, %v) = %q, want %q", tt.present, tt.matches, got, tt.expected) + } + }) + } +} + +// --- pullSecretScheme for fake client tests --- + +func pullSecretScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := corev1.AddToScheme(s); err != nil { + t.Fatalf("failed to add corev1: %v", err) + } + if err := hiveapiv1.AddToScheme(s); err != nil { + t.Fatalf("failed to add hiveapiv1: %v", err) + } + if err := hiveinternalv1alpha1.AddToScheme(s); err != nil { + t.Fatalf("failed to add hiveinternalv1alpha1: %v", err) + } + return s +} + +// --- CheckExistingSyncSets --- + +func TestCheckExistingSyncSets_NoExisting(t *testing.T) { + kubeCli := fake.NewClientBuilder().WithScheme(pullSecretScheme(t)).Build() + var buf bytes.Buffer + err := CheckExistingSyncSets(context.Background(), "uhc-test-ns", kubeCli, &buf) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestCheckExistingSyncSets_DetectsOurSyncSet(t *testing.T) { + ss := &hiveapiv1.SyncSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: SyncSetName, + Namespace: "uhc-test-ns", + }, + } + kubeCli := fake.NewClientBuilder().WithScheme(pullSecretScheme(t)).WithRuntimeObjects(ss).Build() + var buf bytes.Buffer + + // Would prompt for input — since stdin is closed, ReadString returns EOF, + // response is empty, doesn't match "y"/"yes", returns abort error + err := CheckExistingSyncSets(context.Background(), "uhc-test-ns", kubeCli, &buf) + if err == nil { + t.Fatal("expected abort error when SyncSet exists and user doesn't confirm") + } + output := buf.String() + if !bytes.Contains([]byte(output), []byte("Existing SyncSet")) { + t.Error("expected warning about existing SyncSet") + } + if !bytes.Contains([]byte(output), []byte("No changes have been made")) { + t.Error("expected 'no changes' safety message") + } +} + +func TestCheckExistingSyncSets_DetectsTransferOwnerSyncSet(t *testing.T) { + ss := 
&hiveapiv1.SyncSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pull-secret-replacement", + Namespace: "uhc-test-ns", + }, + } + kubeCli := fake.NewClientBuilder().WithScheme(pullSecretScheme(t)).WithRuntimeObjects(ss).Build() + var buf bytes.Buffer + + err := CheckExistingSyncSets(context.Background(), "uhc-test-ns", kubeCli, &buf) + if err == nil { + t.Fatal("expected abort error when transfer-owner SyncSet exists") + } + output := buf.String() + if !bytes.Contains([]byte(output), []byte("transfer-owner")) { + t.Error("expected message about transfer-owner SyncSet") + } +} + +// --- PreflightCheck --- +// PreflightCheck takes *kubernetes.Clientset (concrete type) which cannot be faked +// with k8s.io/client-go/kubernetes/fake.Clientset. These are covered by expect tests. +// TODO: refactor PreflightCheck to accept kubernetes.Interface for testability. + +// --- FindHiveNamespace --- + +func TestFindHiveNamespace_FoundByLabel(t *testing.T) { + cd := &hiveapiv1.ClusterDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cd", + Namespace: "uhc-production-abc123", + Labels: map[string]string{"api.openshift.com/id": "abc123"}, + }, + } + kubeCli := fake.NewClientBuilder().WithScheme(pullSecretScheme(t)).WithRuntimeObjects(cd).Build() + + info, err := FindHiveNamespace(context.Background(), kubeCli, "abc123") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if info.Namespace != "uhc-production-abc123" { + t.Errorf("expected namespace uhc-production-abc123, got %s", info.Namespace) + } + if info.ClusterDeploymentName != "test-cd" { + t.Errorf("expected CD name test-cd, got %s", info.ClusterDeploymentName) + } +} + +func TestFindHiveNamespace_NotFound(t *testing.T) { + kubeCli := fake.NewClientBuilder().WithScheme(pullSecretScheme(t)).Build() + + _, err := FindHiveNamespace(context.Background(), kubeCli, "nonexistent") + if err == nil { + t.Fatal("expected error when no ClusterDeployment found") + } +} diff --git a/pkg/controller/pullsecretop.go 
b/pkg/controller/pullsecretop.go new file mode 100644 index 000000000..49d40c6fa --- /dev/null +++ b/pkg/controller/pullsecretop.go @@ -0,0 +1,245 @@ +package controller + +import ( + "context" + "fmt" + "io" + + "github.com/fatih/color" + sdk "github.com/openshift-online/ocm-sdk-go" + amv1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1" + "github.com/sirupsen/logrus" + authorizationv1 "k8s.io/api/authorization/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var ( + opColorOK = color.New(color.FgGreen).SprintFunc() + opColorFail = color.New(color.FgRed).SprintFunc() + opColorWarn = color.New(color.FgYellow).SprintFunc() + opColorDryRun = color.New(color.FgCyan).SprintFunc() + opColorHdr = color.New(color.FgBlue, color.Bold).SprintFunc() + opColorDetail = color.New(color.FgWhite).SprintFunc() +) + +// PullSecretOp carries context for pull secret operations. Each method +// checks DryRun and either performs the operation or reports what it would do. +type PullSecretOp struct { + DryRun bool + Logger *logrus.Logger + Out io.Writer + AllOK bool + PullSecretUpToDate bool + PullSecretUpdated bool + AuthDiffCount int + Failures []string +} + +// NewPullSecretOp creates a new operation context. +func NewPullSecretOp(dryRun bool, logger *logrus.Logger, out io.Writer) *PullSecretOp { + return &PullSecretOp{ + DryRun: dryRun, + Logger: logger, + Out: out, + AllOK: true, + } +} + +// Section prints a step header with educational description. 
+func (op *PullSecretOp) Section(step int, title string, lines ...string) { + fmt.Fprintf(op.Out, "\n%s\n", opColorHdr("============================================================")) + prefix := "" + if op.DryRun { + prefix = opColorDryRun("[Dry Run] ") + } + fmt.Fprintf(op.Out, "%s%s\n", prefix, opColorHdr(fmt.Sprintf("Step %d: %s", step, title))) + for _, line := range lines { + fmt.Fprintf(op.Out, " %s\n", opColorDetail(line)) + } + fmt.Fprintf(op.Out, "%s\n", opColorHdr("============================================================")) +} + +// OK prints a success result. +func (op *PullSecretOp) OK(format string, args ...any) { + prefix := "" + if op.DryRun { + prefix = opColorDryRun("[Dry Run] ") + } + fmt.Fprintf(op.Out, "%s%s %s\n", prefix, opColorOK("[OK]"), fmt.Sprintf(format, args...)) +} + +// Fail prints a failure result and marks the operation as not-all-OK. +func (op *PullSecretOp) Fail(format string, args ...any) { + msg := fmt.Sprintf(format, args...) + prefix := "" + if op.DryRun { + prefix = opColorDryRun("[Dry Run] ") + } + fmt.Fprintf(op.Out, "%s%s %s\n", prefix, opColorFail("[FAIL]"), msg) + op.AllOK = false + op.Failures = append(op.Failures, msg) +} + +// Warn prints a warning. +func (op *PullSecretOp) Warn(format string, args ...any) { + prefix := "" + if op.DryRun { + prefix = opColorDryRun("[Dry Run] ") + } + fmt.Fprintf(op.Out, "%s%s %s\n", prefix, opColorWarn("[WARN]"), fmt.Sprintf(format, args...)) +} + +// Would prints what the operation would do (dry-run only). +func (op *PullSecretOp) Would(format string, args ...any) { + if op.DryRun { + fmt.Fprintf(op.Out, "%s %s %s\n", opColorDryRun("[Dry Run]"), opColorDryRun("Would:"), opColorDryRun(fmt.Sprintf(format, args...))) + } +} + +// Info prints an informational message. 
+func (op *PullSecretOp) Info(format string, args ...any) { + prefix := "" + if op.DryRun { + prefix = opColorDryRun("[Dry Run] ") + } + fmt.Fprintf(op.Out, "%s%s\n", prefix, fmt.Sprintf(format, args...)) +} + +// CheckCanI verifies RBAC permission. In dry-run mode it reports the result. +// In live mode it just logs the check. Returns whether the permission is allowed. +func (op *PullSecretOp) CheckCanI(ctx context.Context, clientset *kubernetes.Clientset, systemLabel, verb, resource, group, namespace string) bool { + if clientset == nil { + if op.DryRun { + fmt.Fprintf(op.Out, "%s %s %s: auth can-i %s %s in %s (client not available)\n", + opColorDryRun("[Dry Run]"), opColorWarn("[SKIP]"), systemLabel, verb, resource, nsLabel(namespace)) + } + return false + } + + review := &authorizationv1.SelfSubjectAccessReview{ + Spec: authorizationv1.SelfSubjectAccessReviewSpec{ + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Verb: verb, + Resource: resource, + Group: group, + Namespace: namespace, + }, + }, + } + result, err := clientset.AuthorizationV1().SelfSubjectAccessReviews().Create(ctx, review, metav1.CreateOptions{}) + if err != nil { + if op.DryRun { + fmt.Fprintf(op.Out, "%s %s %s: auth can-i %s %s in %s (%v)\n", + opColorDryRun("[Dry Run]"), opColorWarn("[SKIP]"), systemLabel, verb, resource, nsLabel(namespace), err) + } + return false + } + + allowed := result.Status.Allowed + if !allowed { + op.AllOK = false + } + if op.DryRun { + status := opColorOK("[OK]") + if !allowed { + status = opColorFail("[FAIL]") + } + fmt.Fprintf(op.Out, "%s %s %s: auth can-i %s %s in %s\n", + opColorDryRun("[Dry Run]"), status, systemLabel, verb, resource, nsLabel(namespace)) + } + return allowed +} + +// CheckSecretExists checks if a secret exists. Returns true if found. 
+func (op *PullSecretOp) CheckSecretExists(ctx context.Context, clientset *kubernetes.Clientset, namespace, name, systemLabel string) bool { + if clientset == nil { + op.Fail("cannot check secret %s/%s — client not available", namespace, name) + return false + } + + _, err := clientset.CoreV1().Secrets(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + op.Warn("secret %s/%s not found on %s", namespace, name, systemLabel) + } else { + op.Warn("failed to read secret %s/%s on %s: %v", namespace, name, systemLabel, err) + } + return false + } + op.OK("secret %s/%s exists on %s", namespace, name, systemLabel) + return true +} + +// FindHiveNamespaceOp wraps FindHiveNamespace with operational output. +func (op *PullSecretOp) FindHiveNamespaceOp(ctx context.Context, kubeCli client.Client, clusterID, infraName string) (*HiveNamespaceInfo, bool) { + op.Info("Resolving Hive namespace for cluster %s on %s...", clusterID, infraName) + hiveInfo, err := FindHiveNamespace(ctx, kubeCli, clusterID) + if err != nil { + op.Fail("could not find Hive namespace: %v", err) + return nil, false + } + op.OK("found ClusterDeployment %s/%s on %s", hiveInfo.Namespace, hiveInfo.ClusterDeploymentName, infraName) + return hiveInfo, true +} + +// FetchAccessTokenOp wraps FetchOwnerAccessToken with operational output. 
+func (op *PullSecretOp) FetchAccessTokenOp(ocm *sdk.Connection, ownerUsername string) ([]byte, map[string]*amv1.AccessTokenAuth, bool) { + op.Logger.Infof("Fetching pull secret from OCM for owner '%s'", ownerUsername) + pullSecret, auths, err := FetchOwnerAccessToken(ocm, ownerUsername, op.Logger) + if err != nil { + op.Fail("could not fetch OCM access token: %v", err) + return nil, nil, false + } + op.OK("retrieved %d auth entries from OCM access token", len(auths)) + + missing := ValidateRequiredAuths(auths) + if len(missing) > 0 { + for _, m := range missing { + op.Warn("OCM access token missing required auth: %s", m) + } + } + return pullSecret, auths, true +} + +// ResolveExistingPullSecret finds the best available base pull secret data. +// Tries the hive secret first, then falls back to the target cluster's secret. +// Returns the secret data bytes and the source description. +func (op *PullSecretOp) ResolveExistingPullSecret(ctx context.Context, infraClientSet *kubernetes.Clientset, targetClientSet *kubernetes.Clientset, hiveNS string, infraName string, targetName string) ([]byte, string) { + // Try hive secret first + if infraClientSet != nil { + hiveSecret, err := infraClientSet.CoreV1().Secrets(hiveNS).Get(ctx, "pull", metav1.GetOptions{}) + if err == nil { + if data, ok := hiveSecret.Data[".dockerconfigjson"]; ok { + op.OK("secret %s/pull found on %s — using as base", hiveNS, infraName) + return data, fmt.Sprintf("%s/pull on %s", hiveNS, infraName) + } + } + op.Warn("secret %s/pull not found on %s", hiveNS, infraName) + } + + // Hive secret missing — check target cluster + if targetClientSet != nil { + targetSecret, err := targetClientSet.CoreV1().Secrets("openshift-config").Get(ctx, "pull-secret", metav1.GetOptions{}) + if err == nil { + if data, ok := targetSecret.Data[".dockerconfigjson"]; ok { + op.OK("secret openshift-config/pull-secret found on %s (can be used as base)", targetName) + return data, fmt.Sprintf("openshift-config/pull-secret on %s", 
targetName) + } + } + op.Warn("secret openshift-config/pull-secret not found on %s", targetName) + } + + // Neither found + op.Warn("no existing pull secret found — will need to build from OCM auths only") + return nil, "" +} + +func nsLabel(namespace string) string { + if namespace == "" { + return "(cluster-scoped)" + } + return namespace +}