diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index 94f7f1950..0126e9f27 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -28,6 +28,7 @@ import ( operatorv1 "github.com/openshift/api/operator/v1" "github.com/openshift/library-go/pkg/cloudprovider" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" @@ -128,7 +129,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) } if progressing { - return ctrl.Result{}, nil + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { @@ -164,6 +165,17 @@ func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.Operat return true, nil } + converged, err := r.operandsConverged(ctx, resources) + if err != nil { + return false, err + } + if !converged { + if err := r.ensureStatusProgressing(ctx, conditionOverrides); err != nil { + return false, err + } + return true, nil + } + return false, nil } @@ -192,6 +204,76 @@ func (r *CloudOperatorReconciler) applyResources(ctx context.Context, resources return anyUpdated, nil } +func isDeploymentRolloutComplete(deploy *appsv1.Deployment) bool { + if deploy.Status.ObservedGeneration < deploy.Generation { + return false + } + for _, condition := range deploy.Status.Conditions { + if condition.Type == appsv1.DeploymentProgressing && + condition.Status == corev1.ConditionTrue && + condition.Reason == "NewReplicaSetAvailable" { + return true + } + } + return false +} + +func isDeploymentRolloutStalled(deploy *appsv1.Deployment) bool { + if deploy.Status.ObservedGeneration < deploy.Generation { + return false + } + for _, condition := range deploy.Status.Conditions { + if condition.Type == appsv1.DeploymentProgressing && + condition.Status == corev1.ConditionFalse && + condition.Reason == "ProgressDeadlineExceeded" { + return true + } + } + return false +} + +func isDaemonSetRolloutComplete(ds *appsv1.DaemonSet) bool { + return ds.Status.ObservedGeneration >= ds.Generation && + ds.Status.UpdatedNumberScheduled >= ds.Status.DesiredNumberScheduled && + ds.Status.NumberUnavailable == 0 && + ds.Status.NumberMisscheduled == 0 +} + +func (r *CloudOperatorReconciler) operandsConverged(ctx context.Context, resources []client.Object) (bool, error) { + for _, resource := range resources { + switch resource.(type) { + case *appsv1.Deployment: + live := &appsv1.Deployment{} + if err := r.Get(ctx, client.ObjectKeyFromObject(resource), live); apierrors.IsNotFound(err) { + klog.V(2).Infof("Deployment %s not found, treating as not converged", resource.GetName()) + return false, nil + } else if err != nil { + return false, fmt.Errorf("failed to get Deployment %s: %w", resource.GetName(), err) + } + if isDeploymentRolloutStalled(live) { + return false, fmt.Errorf("deployment %s rollout stalled: ProgressDeadlineExceeded", resource.GetName()) + } + if !isDeploymentRolloutComplete(live) { + klog.V(2).Infof("Deployment %s rollout not complete", resource.GetName()) + return false, nil + } + case *appsv1.DaemonSet: + live := &appsv1.DaemonSet{} + if err := r.Get(ctx, client.ObjectKeyFromObject(resource), live); apierrors.IsNotFound(err) { + klog.V(2).Infof("DaemonSet %s not found, treating as not converged", resource.GetName()) + return false, nil + } else if err != nil { + return false, fmt.Errorf("failed to get DaemonSet %s: %w", resource.GetName(), err) + } + if !isDaemonSetRolloutComplete(live) { + klog.V(2).Infof("DaemonSet %s rollout not complete", resource.GetName()) + return false, nil + } + } + } + return true, nil +} + // SetupWithManager sets up the controller with the Manager. func (r *CloudOperatorReconciler) SetupWithManager(mgr ctrl.Manager) error { watcher, err := NewObjectWatcher(WatcherOptions{ diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index a062971bb..9903ca418 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -620,17 +620,56 @@ var _ = Describe("Apply resources should", func() { } } - updated, err = reconciler.applyResources(context.TODO(), resources) + updated, err = reconciler.applyResources(context.Background(), resources) Expect(err).ShouldNot(HaveOccurred()) Expect(updated).To(BeTrue(), "should report updated even when only a non-final resource changed") }) - It("should set Progressing=True on first sync and not signal progressing when resources are stable", func() { + Context("operandsConverged", func() { + It("returns false when deployment rollout is not complete, true after patching status", func() { + operatorConfig := getConfigForPlatform(&configv1.PlatformStatus{Type: configv1.AWSPlatformType}) + awsResources, err := cloud.GetResources(operatorConfig) + Expect(err).To(Succeed()) + resources = append(resources, awsResources...) + + updated, err := reconciler.applyResources(context.Background(), resources) + Expect(err).ShouldNot(HaveOccurred()) + Expect(updated).To(BeTrue()) + + // envtest has no deployment controller, so no Progressing condition exists. + converged, err := reconciler.operandsConverged(context.Background(), resources) + Expect(err).To(Succeed()) + Expect(converged).To(BeFalse(), "should not be converged when deployment has no Progressing condition") + + // Patch the deployment status to mark rollout complete. + for _, res := range resources { + if dep, ok := res.(*appsv1.Deployment); ok { + live := &appsv1.Deployment{} + Expect(cl.Get(context.Background(), client.ObjectKeyFromObject(dep), live)).To(Succeed()) + live.Status.ObservedGeneration = live.Generation + live.Status.Conditions = []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + }, + } + Expect(cl.Status().Update(context.Background(), live)).To(Succeed()) + } + } + + converged, err = reconciler.operandsConverged(context.Background(), resources) + Expect(err).To(Succeed()) + Expect(converged).To(BeTrue(), "should be converged after deployment rollout completes") + }) + }) + + It("should set Progressing=True on first sync, remain progressing until rollout completes, then stop", func() { reconciler.ManagedNamespace = DefaultManagedNamespace co := &configv1.ClusterOperator{} co.SetName(clusterOperatorName) - Expect(cl.Create(context.TODO(), co)).To(Succeed()) + Expect(cl.Create(context.Background(), co)).To(Succeed()) operatorConfig := getConfigForPlatform(&configv1.PlatformStatus{Type: configv1.AWSPlatformType}) awsResources, err := cloud.GetResources(operatorConfig) @@ -638,29 +677,45 @@ var _ = Describe("Apply resources should", func() { resources = append(resources, awsResources...) // First sync: resources do not yet exist, so applyResources reports updated=true. - progressing, err := reconciler.sync(context.TODO(), operatorConfig, nil) + progressing, err := reconciler.sync(context.Background(), operatorConfig, nil) Expect(err).To(Succeed()) Expect(progressing).To(BeTrue(), "sync should report progressing when resources are newly applied") - Expect(cl.Get(context.TODO(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) progressingCond := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) Expect(progressingCond).NotTo(BeNil(), "Progressing condition should exist after first sync") Expect(progressingCond.Status).To( Equal(configv1.ConditionTrue), "Progressing should be True after resources are first applied", ) - // Second sync: resources exist and are unchanged, so applyResources reports updated=false. - progressing, err = reconciler.sync(context.TODO(), operatorConfig, nil) + // Second sync: resources exist and are unchanged, but deployment rollout is + // not complete (envtest has no deployment controller). sync should still + // report progressing. + progressing, err = reconciler.sync(context.Background(), operatorConfig, nil) Expect(err).To(Succeed()) - Expect(progressing).To(BeFalse(), "sync should not report progressing when resources are already up to date") + Expect(progressing).To(BeTrue(), "sync should report progressing while deployment rollout is incomplete") - // Progressing remains True because sync() does not clear it; only setStatusAvailable() does. - Expect(cl.Get(context.TODO(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) - progressingCond = v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) - Expect(progressingCond).NotTo(BeNil(), "Progressing condition should still exist after second sync") - Expect(progressingCond.Status).To( - Equal(configv1.ConditionTrue), "Progressing should remain True until setStatusAvailable is called", - ) + // Patch deployment status to mark rollout complete. + for _, res := range resources { + if dep, ok := res.(*appsv1.Deployment); ok { + live := &appsv1.Deployment{} + Expect(cl.Get(context.Background(), client.ObjectKeyFromObject(dep), live)).To(Succeed()) + live.Status.ObservedGeneration = live.Generation + live.Status.Conditions = []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + }, + } + Expect(cl.Status().Update(context.Background(), live)).To(Succeed()) + } + } + + // Third sync: deployment rollout is complete, sync should report not progressing. + progressing, err = reconciler.sync(context.Background(), operatorConfig, nil) + Expect(err).To(Succeed()) + Expect(progressing).To(BeFalse(), "sync should not report progressing once deployment rollout completes") }) AfterEach(func() { @@ -1012,7 +1067,7 @@ var _ = Describe("Reconcile progressing flow", func() { } }) - It("sets Progressing=True on first reconcile, then Available on second", func() { + It("sets Progressing=True on first reconcile, stays progressing until rollout completes, then Available", func() { // First Reconcile: resources are created, Progressing=True, Available not set. _, err := reconciler.Reconcile(ctx, reconcile.Request{}) Expect(err).NotTo(HaveOccurred()) @@ -1042,7 +1097,8 @@ var _ = Describe("Reconcile progressing flow", func() { "Available should not be True while progressing") } - // Second Reconcile: nothing changed, Progressing=False, Available=True. + // Second Reconcile: nothing changed, but deployment rollout is not complete + // (envtest has no deployment controller). Progressing should remain True. _, err = reconciler.Reconcile(ctx, reconcile.Request{}) Expect(err).NotTo(HaveOccurred()) @@ -1050,12 +1106,264 @@ var _ = Describe("Reconcile progressing flow", func() { progressingCond = v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) Expect(progressingCond).NotTo(BeNil(), "Progressing condition should exist after second reconcile") Expect(progressingCond.Status).To( - Equal(configv1.ConditionFalse), "Progressing should be False after second reconcile with no changes", + Equal(configv1.ConditionTrue), "Progressing should remain True while deployment rollout is incomplete", + ) + + // Patch deployment status to mark rollout complete. + for _, res := range operandResources { + if dep, ok := res.(*appsv1.Deployment); ok { + live := &appsv1.Deployment{} + Expect(cl.Get(ctx, client.ObjectKeyFromObject(dep), live)).To(Succeed()) + live.Status.ObservedGeneration = live.Generation + live.Status.Conditions = []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + }, + } + Expect(cl.Status().Update(ctx, live)).To(Succeed()) + } + } + + // Third Reconcile: deployment rollout is complete, Progressing=False, Available=True. + _, err = reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).NotTo(HaveOccurred()) + + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + progressingCond = v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) + Expect(progressingCond).NotTo(BeNil(), "Progressing condition should exist after third reconcile") + Expect(progressingCond.Status).To( + Equal(configv1.ConditionFalse), "Progressing should be False after deployment rollout completes", ) availCond = v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorAvailable) - Expect(availCond).NotTo(BeNil(), "Available condition should exist after second reconcile") + Expect(availCond).NotTo(BeNil(), "Available condition should exist after third reconcile") Expect(availCond.Status).To( - Equal(configv1.ConditionTrue), "Available should be True after second reconcile", + Equal(configv1.ConditionTrue), "Available should be True after deployment rollout completes", ) }) }) + +var _ = Describe("Rollout completion checks", func() { + Context("isDeploymentRolloutComplete", func() { + It("is not complete when no conditions exist", func() { + deploy := &appsv1.Deployment{} + Expect(isDeploymentRolloutComplete(deploy)).To(BeFalse()) + }) + + It("is complete when ObservedGeneration matches and Reason=NewReplicaSetAvailable", func() { + deploy := &appsv1.Deployment{} + deploy.Generation = 2 + deploy.Status = appsv1.DeploymentStatus{ + ObservedGeneration: 2, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + }, + }, + } + Expect(isDeploymentRolloutComplete(deploy)).To(BeTrue()) + }) + + It("is not complete when ObservedGeneration lags behind Generation", func() { + deploy := &appsv1.Deployment{} + deploy.Generation = 3 + deploy.Status = appsv1.DeploymentStatus{ + ObservedGeneration: 2, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + }, + }, + } + Expect(isDeploymentRolloutComplete(deploy)).To(BeFalse()) + }) + + It("is not complete when Progressing=True with Reason=ReplicaSetUpdated", func() { + deploy := &appsv1.Deployment{} + deploy.Generation = 1 + deploy.Status = appsv1.DeploymentStatus{ + ObservedGeneration: 1, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "ReplicaSetUpdated", + }, + }, + } + Expect(isDeploymentRolloutComplete(deploy)).To(BeFalse()) + }) + + It("is not complete when Progressing=False due to deadline exceeded", func() { + deploy := &appsv1.Deployment{} + deploy.Generation = 1 + deploy.Status = appsv1.DeploymentStatus{ + ObservedGeneration: 1, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressDeadlineExceeded", + }, + }, + } + Expect(isDeploymentRolloutComplete(deploy)).To(BeFalse()) + }) + + It("is not complete when Progressing=True with Reason=FoundNewReplicaSet", func() { + deploy := &appsv1.Deployment{} + deploy.Generation = 1 + deploy.Status = appsv1.DeploymentStatus{ + ObservedGeneration: 1, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "FoundNewReplicaSet", + }, + }, + } + Expect(isDeploymentRolloutComplete(deploy)).To(BeFalse()) + }) + + It("keys off Progressing condition even when other conditions are present", func() { + deploy := &appsv1.Deployment{} + deploy.Generation = 1 + deploy.Status = appsv1.DeploymentStatus{ + ObservedGeneration: 1, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentAvailable, + Status: corev1.ConditionTrue, + Reason: "MinimumReplicasAvailable", + }, + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + }, + }, + } + Expect(isDeploymentRolloutComplete(deploy)).To(BeTrue()) + }) + }) + + Context("isDeploymentRolloutStalled", func() { + It("is stalled when ProgressDeadlineExceeded", func() { + deploy := &appsv1.Deployment{ + Status: appsv1.DeploymentStatus{ + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressDeadlineExceeded", + }, + }, + }, + } + Expect(isDeploymentRolloutStalled(deploy)).To(BeTrue()) + }) + + It("is not stalled during normal rollout", func() { + deploy := &appsv1.Deployment{ + Status: appsv1.DeploymentStatus{ + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "ReplicaSetUpdated", + }, + }, + }, + } + Expect(isDeploymentRolloutStalled(deploy)).To(BeFalse()) + }) + + It("is not stalled when no conditions exist", func() { + deploy := &appsv1.Deployment{} + Expect(isDeploymentRolloutStalled(deploy)).To(BeFalse()) + }) + + It("ignores stale ProgressDeadlineExceeded from a previous generation", func() { + deploy := &appsv1.Deployment{} + deploy.Generation = 2 + deploy.Status = appsv1.DeploymentStatus{ + ObservedGeneration: 1, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressDeadlineExceeded", + }, + }, + } + Expect(isDeploymentRolloutStalled(deploy)).To(BeFalse()) + }) + }) + + Context("isDaemonSetRolloutComplete", func() { + It("is not complete when UpdatedNumberScheduled < DesiredNumberScheduled", func() { + ds := &appsv1.DaemonSet{} + ds.Generation = 1 + ds.Status = appsv1.DaemonSetStatus{ + ObservedGeneration: 1, + DesiredNumberScheduled: 3, + UpdatedNumberScheduled: 1, + } + Expect(isDaemonSetRolloutComplete(ds)).To(BeFalse()) + }) + + It("is not complete when NumberMisscheduled > 0 even if updated count matches", func() { + ds := &appsv1.DaemonSet{} + ds.Generation = 1 + ds.Status = appsv1.DaemonSetStatus{ + ObservedGeneration: 1, + DesiredNumberScheduled: 3, + UpdatedNumberScheduled: 3, + NumberMisscheduled: 1, + } + Expect(isDaemonSetRolloutComplete(ds)).To(BeFalse()) + }) + + It("is not complete when NumberUnavailable > 0 even if scheduled counts match", func() { + ds := &appsv1.DaemonSet{} + ds.Generation = 1 + ds.Status = appsv1.DaemonSetStatus{ + ObservedGeneration: 1, + DesiredNumberScheduled: 3, + UpdatedNumberScheduled: 3, + NumberUnavailable: 1, + } + Expect(isDaemonSetRolloutComplete(ds)).To(BeFalse()) + }) + + It("is not complete when ObservedGeneration lags behind Generation", func() { + ds := &appsv1.DaemonSet{} + ds.Generation = 2 + ds.Status = appsv1.DaemonSetStatus{ + ObservedGeneration: 1, + DesiredNumberScheduled: 3, + UpdatedNumberScheduled: 3, + } + Expect(isDaemonSetRolloutComplete(ds)).To(BeFalse()) + }) + + It("is complete when all conditions are met", func() { + ds := &appsv1.DaemonSet{} + ds.Generation = 2 + ds.Status = appsv1.DaemonSetStatus{ + ObservedGeneration: 2, + DesiredNumberScheduled: 3, + UpdatedNumberScheduled: 3, + NumberMisscheduled: 0, + NumberUnavailable: 0, + } + Expect(isDaemonSetRolloutComplete(ds)).To(BeTrue()) + }) + }) +}) diff --git a/pkg/controllers/status.go b/pkg/controllers/status.go index 46dfbc1d5..d2e78c8c1 100644 --- a/pkg/controllers/status.go +++ b/pkg/controllers/status.go @@ -110,6 +110,20 @@ func (r *ClusterOperatorStatusClient) setStatusProgressing(ctx context.Context, return r.syncStatus(ctx, co, conds, overrides) } +func (r *ClusterOperatorStatusClient) ensureStatusProgressing(ctx context.Context, overrides []configv1.ClusterOperatorStatusCondition) error { + co, err := r.getOrCreateClusterOperator(ctx) + if err != nil { + return err + } + progressing := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) + upgradeable := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorUpgradeable) + if progressing != nil && progressing.Status == configv1.ConditionTrue && + upgradeable != nil && upgradeable.Status == configv1.ConditionTrue { + return nil + } + return r.setStatusProgressing(ctx, overrides) +} + // setStatusAvailable sets the Available condition to True, with the given reason // and message, and sets both the Progressing and Degraded conditions to False. func (r *ClusterOperatorStatusClient) setStatusAvailable(ctx context.Context, overrides []configv1.ClusterOperatorStatusCondition) error {