From 61ce443112d4780c6b91b0a6fbcd0c3f46b0ee31 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Sun, 29 Oct 2023 15:03:19 +0200 Subject: [PATCH 1/3] remove VA deletion Signed-off-by: Michael Shitrit --- controllers/selfnoderemediation_controller.go | 20 +---------- controllers/tests/config/suite_test.go | 5 ++- .../selfnoderemediation_controller_test.go | 33 +++++-------------- controllers/tests/controller/suite_test.go | 6 ++-- controllers/tests/shared/shared.go | 16 ++++----- 5 files changed, 22 insertions(+), 58 deletions(-) diff --git a/controllers/selfnoderemediation_controller.go b/controllers/selfnoderemediation_controller.go index 1cf24b24..71d24a1d 100644 --- a/controllers/selfnoderemediation_controller.go +++ b/controllers/selfnoderemediation_controller.go @@ -339,7 +339,7 @@ func (r *SelfNodeRemediationReconciler) deleteResources(node *v1.Node) error { namespaces := v1.NamespaceList{} if err := r.Client.List(context.Background(), &namespaces); err != nil { - r.logger.Error(err, "failed to list namespaces", err) + r.logger.Error(err, "failed to list namespaces") return err } @@ -355,24 +355,6 @@ func (r *SelfNodeRemediationReconciler) deleteResources(node *v1.Node) error { } } - volumeAttachments := &storagev1.VolumeAttachmentList{} - if err := r.Client.List(context.Background(), volumeAttachments); err != nil { - r.logger.Error(err, "failed to get volumeAttachments list") - return err - } - forceDeleteOption := &client.DeleteOptions{ - GracePeriodSeconds: &zero, - } - for _, va := range volumeAttachments.Items { - if va.Spec.NodeName == node.Name { - err := r.Client.Delete(context.Background(), &va, forceDeleteOption) - if err != nil { - r.logger.Error(err, "failed to delete volumeAttachment", "name", va.Name) - return err - } - } - } - r.logger.Info("done deleting node resources", "node name", node.Name) return nil diff --git a/controllers/tests/config/suite_test.go b/controllers/tests/config/suite_test.go index 62de7233..898f9a13 100644 --- a/controllers/tests/config/suite_test.go +++ b/controllers/tests/config/suite_test.go @@ -82,9 +82,8 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) k8sClient = &shared.K8sClientWrapper{ - Client: k8sManager.GetClient(), - Reader: k8sManager.GetAPIReader(), - VaFailureMessage: "simulation of client error for VA", + Client: k8sManager.GetClient(), + Reader: k8sManager.GetAPIReader(), } Expect(k8sClient).ToNot(BeNil()) diff --git a/controllers/tests/controller/selfnoderemediation_controller_test.go b/controllers/tests/controller/selfnoderemediation_controller_test.go index 623875c4..f6f2422e 100644 --- a/controllers/tests/controller/selfnoderemediation_controller_test.go +++ b/controllers/tests/controller/selfnoderemediation_controller_test.go @@ -60,12 +60,11 @@ var _ = Describe("SNR Controller", func() { AfterEach(func() { k8sClient.ShouldSimulateFailure = false - k8sClient.ShouldSimulateVaFailure = false + k8sClient.ShouldSimulatePodDeleteFailure = false isAdditionalSetupNeeded = false deleteRemediations() deleteSelfNodeRemediationPod() deleteVolumeAttachment(vaName, false) - verifyVaDeleted(vaName) //clear node's state, this is important to remove taints, label etc. Expect(k8sClient.Update(context.Background(), getNode(shared.UnhealthyNodeName))) Expect(k8sClient.Update(context.Background(), getNode(shared.PeerNodeName))) @@ -184,8 +183,6 @@ var _ = Describe("SNR Controller", func() { verifySelfNodeRemediationPodDoesntExist() - verifyVaDeleted(vaName) - verifyEvent("Normal", "DeleteResources", "Remediation process - finished deleting unhealthy node resources") verifyFinalizerExists() @@ -219,7 +216,7 @@ var _ = Describe("SNR Controller", func() { It("The snr agent attempts to keep deleting node resources during temporary api-server failure", func() { node := verifyNodeIsUnschedulable() - k8sClient.ShouldSimulateVaFailure = true + k8sClient.ShouldSimulatePodDeleteFailure = true addUnschedulableTaint(node) @@ -229,16 +226,16 @@ var _ = Describe("SNR Controller", func() { verifyNoWatchdogFood() - verifySelfNodeRemediationPodDoesntExist() - verifyVaNotDeleted(vaName) // The kube-api calls for VA fail intentionally. In this case, we expect the snr agent to try // to delete node resources again. So LastError is set to the error every time Reconcile() // is triggered. If it becomes another error, it means something unintended happens. - verifyLastErrorKeepsApiErrorForVa() + verifyLastErrorKeepsApiError() - k8sClient.ShouldSimulateVaFailure = false + k8sClient.ShouldSimulatePodDeleteFailure = false + + verifySelfNodeRemediationPodDoesntExist() deleteVolumeAttachment(vaName, true) @@ -391,20 +388,6 @@ func deleteVolumeAttachment(vaName string, verifyExist bool) { ExpectWithOffset(1, err).To(Succeed()) } -func verifyVaDeleted(vaName string) { - vaKey := client.ObjectKey{ - Namespace: shared.Namespace, - Name: vaName, - } - - EventuallyWithOffset(1, func() bool { - va := &storagev1.VolumeAttachment{} - err := k8sClient.Get(context.Background(), vaKey, va) - return apierrors.IsNotFound(err) - - }, 5*time.Second, 100*time.Millisecond).Should(BeTrue()) -} - func verifyVaNotDeleted(vaName string) { vaKey := client.ObjectKey{ Namespace: shared.Namespace, @@ -419,7 +402,7 @@ func verifyVaNotDeleted(vaName string) { }, 5*time.Second, 250*time.Millisecond).Should(BeFalse()) } -func verifyLastErrorKeepsApiErrorForVa() { +func verifyLastErrorKeepsApiError() { By("Verify that LastError in SNR status has been kept kube-api error for VA") snr := &v1alpha1.SelfNodeRemediation{} ConsistentlyWithOffset(1, func() bool { @@ -427,7 +410,7 @@ func verifyLastErrorKeepsApiErrorForVa() { if err := k8sClient.Client.Get(context.Background(), snrNamespacedName, snr); err != nil { return false } - return snr.Status.LastError == k8sClient.VaFailureMessage + return snr.Status.LastError == k8sClient.SimulatedFailureMessage }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) } diff --git a/controllers/tests/controller/suite_test.go b/controllers/tests/controller/suite_test.go index 77516aed..982a694d 100644 --- a/controllers/tests/controller/suite_test.go +++ b/controllers/tests/controller/suite_test.go @@ -96,9 +96,9 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) k8sClient = &shared.K8sClientWrapper{ - Client: k8sManager.GetClient(), - Reader: k8sManager.GetAPIReader(), - VaFailureMessage: "simulation of client error for VA", + Client: k8sManager.GetClient(), + Reader: k8sManager.GetAPIReader(), + SimulatedFailureMessage: "simulation of client error for delete when listing namespace", } Expect(k8sClient).ToNot(BeNil()) diff --git a/controllers/tests/shared/shared.go b/controllers/tests/shared/shared.go index 5cf578ec..28270387 100644 --- a/controllers/tests/shared/shared.go +++ b/controllers/tests/shared/shared.go @@ -5,7 +5,7 @@ import ( "errors" "time" - storagev1 "k8s.io/api/storage/v1" + corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -20,10 +20,10 @@ const ( type K8sClientWrapper struct { client.Client - Reader client.Reader - ShouldSimulateFailure bool - ShouldSimulateVaFailure bool - VaFailureMessage string + Reader client.Reader + ShouldSimulateFailure bool + ShouldSimulatePodDeleteFailure bool + SimulatedFailureMessage string } type MockCalculator struct { @@ -34,9 +34,9 @@ type MockCalculator struct { func (kcw *K8sClientWrapper) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error { if kcw.ShouldSimulateFailure { return errors.New("simulation of client error") - } else if kcw.ShouldSimulateVaFailure { - if _, ok := list.(*storagev1.VolumeAttachmentList); ok { - return errors.New(kcw.VaFailureMessage) + } else if kcw.ShouldSimulatePodDeleteFailure { + if _, ok := list.(*corev1.NamespaceList); ok { + return errors.New(kcw.SimulatedFailureMessage) } } return kcw.Client.List(ctx, list, opts...) From b203e8b526117c58e0eccc3a0478dadf54d25895 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 30 Oct 2023 12:46:53 +0200 Subject: [PATCH 2/3] remove VA deletion, update e2e tests Signed-off-by: Michael Shitrit --- e2e/self_node_remediation_test.go | 40 +------------------------------ 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/e2e/self_node_remediation_test.go b/e2e/self_node_remediation_test.go index 76baa281..878f3ae4 100644 --- a/e2e/self_node_remediation_test.go +++ b/e2e/self_node_remediation_test.go @@ -15,7 +15,6 @@ import ( gomegatypes "github.com/onsi/gomega/types" v1 "k8s.io/api/core/v1" - storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -108,17 +107,14 @@ var _ = Describe("Self Node Remediation E2E", func() { Context("Resource Deletion Strategy", func() { var oldPodCreationTime time.Time - var va *storagev1.VolumeAttachment BeforeEach(func() { remediationStrategy = v1alpha1.ResourceDeletionRemediationStrategy oldPodCreationTime = findSnrPod(node).CreationTimestamp.Time - va = createVolumeAttachment(node) }) It("should delete pods and volume attachments", func() { checkPodRecreated(node, oldPodCreationTime) - checkVaDeleted(va) //Simulate NHC trying to delete SNR deleteAndWait(snr) snr = nil @@ -167,17 +163,15 @@ var _ = Describe("Self Node Remediation E2E", func() { // b) unhealthy // - kill connectivity on one node // - create SNR - // - verify node does reboot and and is deleted / re-created + // - verify node does reboot and is deleted / re-created var snr *v1alpha1.SelfNodeRemediation - var va *storagev1.VolumeAttachment var oldPodCreationTime time.Time BeforeEach(func() { killApiConnection(node, apiIPs, false) snr = createSNR(node, v1alpha1.ResourceDeletionRemediationStrategy) oldPodCreationTime = findSnrPod(node).CreationTimestamp.Time - va = createVolumeAttachment(node) }) AfterEach(func() { @@ -192,7 +186,6 @@ var _ = Describe("Self Node Remediation E2E", func() { // - because the 2nd check has a small timeout only checkReboot(node, oldBootTime) checkPodRecreated(node, oldPodCreationTime) - checkVaDeleted(va) if _, isExist := os.LookupEnv(skipLogsEnvVarName); !isExist { // we can't check logs of unhealthy node anymore, check peer logs peer := &workers.Items[1] @@ -333,17 +326,14 @@ var _ = Describe("Self Node Remediation E2E", func() { Context("Resource Deletion Strategy", func() { var oldPodCreationTime time.Time - var va *storagev1.VolumeAttachment BeforeEach(func() { remediationStrategy = v1alpha1.ResourceDeletionRemediationStrategy oldPodCreationTime = findSnrPod(controlPlaneNode).CreationTimestamp.Time - va = createVolumeAttachment(controlPlaneNode) }) It("should delete pods and volume attachments", func() { checkPodRecreated(controlPlaneNode, oldPodCreationTime) - checkVaDeleted(va) //Simulate NHC trying to delete SNR deleteAndWait(snr) snr = nil @@ -358,34 +348,6 @@ var _ = Describe("Self Node Remediation E2E", func() { }) }) -func checkVaDeleted(va *storagev1.VolumeAttachment) { - EventuallyWithOffset(1, func() bool { - newVa := &storagev1.VolumeAttachment{} - err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(va), newVa) - return errors.IsNotFound(err) - - }, 10*time.Second, 250*time.Millisecond).Should(BeTrue()) -} - -func createVolumeAttachment(node *v1.Node) *storagev1.VolumeAttachment { - foo := "foo" - va := &storagev1.VolumeAttachment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "some-va", - Namespace: testNamespace, - }, - Spec: storagev1.VolumeAttachmentSpec{ - Attacher: foo, - Source: storagev1.VolumeAttachmentSource{}, - NodeName: node.Name, - }, - } - - va.Spec.Source.PersistentVolumeName = &foo - ExpectWithOffset(1, k8sClient.Create(context.Background(), va)).To(Succeed()) - return va -} - func checkPodRecreated(node *v1.Node, oldPodCreationTime time.Time) bool { return EventuallyWithOffset(1, func() time.Time { pod := findSnrPod(node) From b2790deb40ebc8150cccd1fb92317ef28b5d1ea3 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Tue, 31 Oct 2023 16:05:09 +0200 Subject: [PATCH 3/3] use common method Signed-off-by: Michael Shitrit --- controllers/selfnoderemediation_controller.go | 44 +--------------- go.mod | 2 +- go.sum | 4 +- .../medik8s/common/pkg/resources/resources.go | 52 +++++++++++++++++++ vendor/modules.txt | 3 +- 5 files changed, 59 insertions(+), 46 deletions(-) create mode 100644 vendor/github.com/medik8s/common/pkg/resources/resources.go diff --git a/controllers/selfnoderemediation_controller.go b/controllers/selfnoderemediation_controller.go index 71d24a1d..da46f49e 100644 --- a/controllers/selfnoderemediation_controller.go +++ b/controllers/selfnoderemediation_controller.go @@ -23,6 +23,7 @@ import ( "time" "github.com/go-logr/logr" + "github.com/medik8s/common/pkg/resources" "github.com/pkg/errors" v1 "k8s.io/api/core/v1" @@ -30,7 +31,6 @@ import ( apiErrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/tools/record" @@ -317,47 +317,7 @@ func (r *SelfNodeRemediationReconciler) remediateWithResourceDeletion(snr *v1alp // if not, it will return a 'zero' time and non-nil error, which means exponential backoff is triggered // SelfNodeRemediation is only used in order to match method signature required by remediateWithResourceRemoval func (r *SelfNodeRemediationReconciler) deleteResourcesWrapper(node *v1.Node, _ *v1alpha1.SelfNodeRemediation) (time.Duration, error) { - return 0, r.deleteResources(node) -} - -func (r *SelfNodeRemediationReconciler) deleteResources(node *v1.Node) error { - //fence - zero := int64(0) - backgroundDeletePolicy := metav1.DeletePropagationBackground - - deleteOptions := &client.DeleteAllOfOptions{ - ListOptions: client.ListOptions{ - FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": node.Name}), - Namespace: "", - Limit: 0, - }, - DeleteOptions: client.DeleteOptions{ - GracePeriodSeconds: &zero, - PropagationPolicy: &backgroundDeletePolicy, - }, - } - - namespaces := v1.NamespaceList{} - if err := r.Client.List(context.Background(), &namespaces); err != nil { - r.logger.Error(err, "failed to list namespaces") - return err - } - - r.logger.Info("starting to delete node resources", "node name", node.Name) - - pod := &v1.Pod{} - for _, ns := range namespaces.Items { - deleteOptions.Namespace = ns.Name - err := r.Client.DeleteAllOf(context.Background(), pod, deleteOptions) - if err != nil { - r.logger.Error(err, "failed to delete pods of unhealthy node", "namespace", ns.Name) - return err - } - } - - r.logger.Info("done deleting node resources", "node name", node.Name) - - return nil + return 0, resources.DeletePods(context.Background(), r.Client, node.Name) } func (r *SelfNodeRemediationReconciler) remediateWithOutOfServiceTaint(snr *v1alpha1.SelfNodeRemediation) (ctrl.Result, error) { diff --git a/go.mod b/go.mod index 2b87f5eb..0106769d 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/Masterminds/sprig v2.22.0+incompatible github.com/go-logr/logr v1.2.3 github.com/go-ping/ping v1.1.0 - github.com/medik8s/common v1.2.0 + github.com/medik8s/common v1.9.0 github.com/onsi/ginkgo/v2 v2.9.1 github.com/onsi/gomega v1.27.4 github.com/openshift/api v0.0.0-20230414143018-3367bc7e6ac7 // release-4.13 diff --git a/go.sum b/go.sum index 15679a0d..bb1e57da 100644 --- a/go.sum +++ b/go.sum @@ -215,8 +215,8 @@ github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJ github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions v1.0.2 h1:hAHbPm5IJGijwng3PWk09JkG9WeqChjprR5s9bBZ+OM= github.com/matttproud/golang_protobuf_extensions v1.0.2/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= -github.com/medik8s/common v1.2.0 h1:xgQOijD3rEn+PfCd4lJuV3WFdt5QA6SIaqF01rRp2as= -github.com/medik8s/common v1.2.0/go.mod h1:ZT/3hfMXJLmZEcqmxRWB5LGC8Wl+qKGGQ4zM8hOE7PY= +github.com/medik8s/common v1.9.0 h1:nMMffmj+e0l0xRB0RfpsbdDpW9upXU2MFeGGADJlwyE= +github.com/medik8s/common v1.9.0/go.mod h1:ZT/3hfMXJLmZEcqmxRWB5LGC8Wl+qKGGQ4zM8hOE7PY= github.com/mitchellh/copystructure v1.1.2 h1:Th2TIvG1+6ma3e/0/bopBKohOTY7s4dA8V2q4EUcBJ0= github.com/mitchellh/copystructure v1.1.2/go.mod h1:EBArHfARyrSWO/+Wyr9zwEkc6XMFB9XyNgFNmRkZZU4= github.com/mitchellh/reflectwalk v1.0.1 h1:FVzMWA5RllMAKIdUSC8mdWo3XtwoecrH79BY70sEEpE= diff --git a/vendor/github.com/medik8s/common/pkg/resources/resources.go b/vendor/github.com/medik8s/common/pkg/resources/resources.go new file mode 100644 index 00000000..33b9d70b --- /dev/null +++ b/vendor/github.com/medik8s/common/pkg/resources/resources.go @@ -0,0 +1,52 @@ +package resources + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// DeletePods deletes all the pods from the node +func DeletePods(ctx context.Context, r client.Client, nodeName string) error { + log := ctrl.Log.WithName("commons-resource") + zero := int64(0) + backgroundDeletePolicy := metav1.DeletePropagationBackground + + deleteOptions := &client.DeleteAllOfOptions{ + ListOptions: client.ListOptions{ + FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName}), + Namespace: "", + Limit: 0, + }, + DeleteOptions: client.DeleteOptions{ + GracePeriodSeconds: &zero, + PropagationPolicy: &backgroundDeletePolicy, + }, + } + + namespaces := corev1.NamespaceList{} + if err := r.List(ctx, &namespaces); err != nil { + log.Error(err, "failed to list namespaces") + return err + } + + log.Info("starting to delete pods", "node name", nodeName) + + pod := &corev1.Pod{} + for _, ns := range namespaces.Items { + deleteOptions.Namespace = ns.Name + err := r.DeleteAllOf(ctx, pod, deleteOptions) + if err != nil { + log.Error(err, "failed to delete pods of node", "namespace", ns.Name) + return err + } + } + + log.Info("done deleting pods", "node name", nodeName) + + return nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 19c6ddff..35a4b75a 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -107,10 +107,11 @@ github.com/mailru/easyjson/jwriter # github.com/matttproud/golang_protobuf_extensions v1.0.2 ## explicit; go 1.9 github.com/matttproud/golang_protobuf_extensions/pbutil -# github.com/medik8s/common v1.2.0 +# github.com/medik8s/common v1.9.0 ## explicit; go 1.20 github.com/medik8s/common/pkg/labels github.com/medik8s/common/pkg/nodes +github.com/medik8s/common/pkg/resources # github.com/mitchellh/copystructure v1.1.2 ## explicit; go 1.15 github.com/mitchellh/copystructure