From 627c3bb50a1d6bc5dcfa0d85a6bb8f57c46cd695 Mon Sep 17 00:00:00 2001
From: "Mark E. Scott, Jr."
Date: Thu, 6 Jun 2024 14:35:21 -0500
Subject: [PATCH] Add minPeersForRemediation configuration value.

With this value one can specify the number of worker peers that need to be
found before determining that a node is unhealthy. It covers the case in
which there are 3 control plane nodes and a single worker node, and you
still want to be able to perform remediation on that worker node.

It defaults to 1, which maintains the existing behavior when the value is
not explicitly set.
---
 .gitignore                                    |   1 +
 .../selfnoderemediationconfig_types.go        |  10 ++
 ...medik8s.io_selfnoderemediationconfigs.yaml |  11 ++
 ...medik8s.io_selfnoderemediationconfigs.yaml |  11 ++
 .../selfnoderemediationconfig_controller.go   |   1 +
 ...lfnoderemediationconfig_controller_test.go |   1 +
 controllers/tests/config/suite_test.go        |   9 +
 .../selfnoderemediation_controller_test.go    |  96 +++++++++--
 controllers/tests/controller/suite_test.go    | 160 ++++++++++++++++--
 install/self-node-remediation-deamonset.yaml  |   2 +
 pkg/apicheck/check.go                         |  21 ++-
 11 files changed, 291 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index b456ebb9f..403022101 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ bin
 
 # editor and IDE paraphernalia
 .idea
+*.iml
 *.swp
 *.swo
 *~
diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go
index 381428f71..ae7e318ef 100644
--- a/api/v1alpha1/selfnoderemediationconfig_types.go
+++ b/api/v1alpha1/selfnoderemediationconfig_types.go
@@ -28,6 +28,7 @@ const (
 	ConfigCRName                   = "self-node-remediation-config"
 	defaultWatchdogPath            = "/dev/watchdog"
 	defaultIsSoftwareRebootEnabled = true
+	defaultMinPeersForRemediation  = 1
 )
 
 // SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig
@@ -127,6 +128,15 @@ type SelfNodeRemediationConfigSpec struct {
 	// CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes.
 	// +optional
 	CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"`
+
+	// MinPeersForRemediation is the minimum number of peer workers/control nodes that must be found before
+	// this node decides whether it is unhealthy. If set to zero, no other peers need to be present for
+	// remediation to occur when this node has lost API server access. If fewer peers than this are found,
+	// we will not ask any peer nodes (if present) whether they see that the current node has been marked
+	// unhealthy with a SelfNodeRemediation CR.
+	// +kubebuilder:default:=1
+	// +kubebuilder:validation:Minimum=0
+	MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"`
 }
 
 // SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig
diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml
index bea94e5ea..9cb12ca80 100644
--- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml
+++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml
@@ -127,6 +127,17 @@ spec:
                   its peers.
                minimum: 1
                type: integer
+              minPeersForRemediation:
+                default: 1
+                description: |-
+                  MinPeersForRemediation is the minimum number of peer workers/control nodes that must be
+                  found before this node decides whether it is unhealthy. If set to zero, no other peers
+                  need to be present for remediation to occur when this node has lost API server access.
+                  If fewer peers than this are found, we will not ask any peer nodes (if present) whether
+                  they see that the current node has been marked unhealthy with a SelfNodeRemediation CR.
+                minimum: 0
+                type: integer
               peerApiServerTimeout:
                 default: 5s
                 description: |-
diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml
index f4d47917d..a152b1799 100644
--- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml
+++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml
@@ -125,6 +125,17 @@ spec:
                  its peers.
                minimum: 1
                type: integer
+              minPeersForRemediation:
+                default: 1
+                description: |-
+                  MinPeersForRemediation is the minimum number of peer workers/control nodes that must be
+                  found before this node decides whether it is unhealthy. If set to zero, no other peers
+                  need to be present for remediation to occur when this node has lost API server access.
+                  If fewer peers than this are found, we will not ask any peer nodes (if present) whether
+                  they see that the current node has been marked unhealthy with a SelfNodeRemediation CR.
+                minimum: 0
+                type: integer
               peerApiServerTimeout:
                 default: 5s
                 description: |-
diff --git a/controllers/selfnoderemediationconfig_controller.go b/controllers/selfnoderemediationconfig_controller.go
index b4f64842b..1d5c60785 100644
--- a/controllers/selfnoderemediationconfig_controller.go
+++ b/controllers/selfnoderemediationconfig_controller.go
@@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co
 	data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds()
 	data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold
 	data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl
+	data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation
 	data.Data["HostPort"] = snrConfig.Spec.HostPort
 	data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled)
 
diff --git a/controllers/tests/config/selfnoderemediationconfig_controller_test.go b/controllers/tests/config/selfnoderemediationconfig_controller_test.go
index 8469e9313..b272422a3 100644
--- a/controllers/tests/config/selfnoderemediationconfig_controller_test.go
+++ b/controllers/tests/config/selfnoderemediationconfig_controller_test.go
@@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() {
 			Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5))
 			Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15))
 			Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60))
+			Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1))
 		})
 	})
 
diff --git a/controllers/tests/config/suite_test.go b/controllers/tests/config/suite_test.go
index f25fe1574..7374be0f2 100644
--- a/controllers/tests/config/suite_test.go
+++ b/controllers/tests/config/suite_test.go
@@ -40,7 +40,10 @@ import (
 	"github.com/medik8s/self-node-remediation/pkg/apicheck"
"github.com/medik8s/self-node-remediation/pkg/certificates" "github.com/medik8s/self-node-remediation/pkg/peers" + //+kubebuilder:scaffold:imports + "github.com/medik8s/self-node-remediation/pkg/reboot" + "github.com/medik8s/self-node-remediation/pkg/watchdog" ) // These tests use Ginkgo (BDD-style Go testing framework). Refer to @@ -52,6 +55,7 @@ var cancelFunc context.CancelFunc var k8sClient *shared.K8sClientWrapper var certReader certificates.CertStorageReader var managerReconciler *controllers.SelfNodeRemediationReconciler +var dummyDog watchdog.Watchdog func TestAPIs(t *testing.T) { RegisterFailHandler(Fail) @@ -120,12 +124,17 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) certReader = certificates.NewSecretCertStorage(k8sClient, ctrl.Log.WithName("SecretCertStorage"), shared.Namespace) + + dummyDog = watchdog.NewFake(true) + rebooter := reboot.NewWatchdogRebooter(dummyDog, ctrl.Log.WithName("rebooter")) + apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{ Log: ctrl.Log.WithName("api-check"), MyNodeName: shared.UnhealthyNodeName, CheckInterval: shared.ApiCheckInterval, MaxErrorsThreshold: shared.MaxErrorThreshold, Peers: peers, + Rebooter: rebooter, Cfg: cfg, CertReader: certReader, } diff --git a/controllers/tests/controller/selfnoderemediation_controller_test.go b/controllers/tests/controller/selfnoderemediation_controller_test.go index fe9a15ca8..a57e7b312 100644 --- a/controllers/tests/controller/selfnoderemediation_controller_test.go +++ b/controllers/tests/controller/selfnoderemediation_controller_test.go @@ -445,22 +445,88 @@ var _ = Describe("SNR Controller", func() { remediationStrategy = v1alpha1.ResourceDeletionRemediationStrategy }) - It("Verify that watchdog is not receiving food after some time", func() { - lastFoodTime := dummyDog.LastFoodTime() - timeout := dummyDog.GetTimeout() - Eventually(func() bool { - newTime := dummyDog.LastFoodTime() - // ensure the timeout passed - timeoutPassed := time.Now().After(lastFoodTime.Add(3 * timeout)) - // ensure wd wasn't feeded - missedFeed := newTime.Before(lastFoodTime.Add(timeout)) - if timeoutPassed && missedFeed { - return true - } - lastFoodTime = newTime - return false - }, 10*shared.PeerUpdateInterval, timeout).Should(BeTrue()) + AfterEach(func() { + By("Restore default settings") + apiConnectivityCheckConfig.MinPeersForRemediation = 1 + + // sleep so config can update + time.Sleep(time.Second * 2) + }) + + Context("no peer found, and using default setting for MinPeersForRemediation", func() { + BeforeEach(func() { + apiConnectivityCheckConfig.MinPeersForRemediation = 1 + snrConfig.Spec.MinPeersForRemediation = 1 + }) + + It("Does not receive peer communication and has MinPeersForRemediation set to 1 so no reboot "+ + "will occur", func() { + lastFoodTime := dummyDog.LastFoodTime() + timeout := dummyDog.GetTimeout() + fmt.Printf("timeout: %s", timeout.String()) + Eventually(func() bool { + fmt.Println("YYYY") + newTime := dummyDog.LastFoodTime() + fmt.Printf("newTime: %s\n", newTime.Format(time.StampMilli)) + // ensure the timeout passed + timeoutPassed := time.Now().After(lastFoodTime.Add(3 * timeout)) + // ensure wd wasn't feeded + fmt.Printf("missedFeed time: %s\n", lastFoodTime.Add(timeout).Format(time.StampMilli)) + missedFeed := newTime.Before(lastFoodTime.Add(timeout)) + if timeoutPassed && missedFeed { + fmt.Println("timeout passed and missed feed") + return true + } + lastFoodTime = newTime + fmt.Println("timeout didn't pass and/or didn't miss feed") + return false + }, 
 10*shared.PeerUpdateInterval, timeout).Should(BeTrue())
+			})
+		})
+
+		Context("no peer found and MinPeersForRemediation is configured to 0", func() {
+			JustBeforeEach(func() {
+				apiConnectivityCheckConfig.MinPeersForRemediation = 0
+
+				// allow the configuration change to propagate
+				time.Sleep(time.Second * 2)
+			})
+
+			It("Does not receive peer communication and since configured to need zero peers, initiates a reboot",
+				func() {
+					lastFoodTime := dummyDog.LastFoodTime()
+					timeout := dummyDog.GetTimeout()
+
+					Eventually(func() bool {
+						newTime := dummyDog.LastFoodTime()
+						// ensure the timeout passed
+						timeoutPassed := time.Now().After(lastFoodTime.Add(3 * timeout))
+						// ensure the watchdog wasn't fed
+						missedFeed := newTime.Before(lastFoodTime.Add(timeout))
+						if timeoutPassed && missedFeed {
+							return true
+						}
+						lastFoodTime = newTime
+						return false
+					}, "15s", timeout).Should(BeTrue())
+				})
 		})
+
+	})
 
 	Context("Configuration is missing", func() {
diff --git a/controllers/tests/controller/suite_test.go b/controllers/tests/controller/suite_test.go
index d07df421e..655611239 100644
--- a/controllers/tests/controller/suite_test.go
+++ b/controllers/tests/controller/suite_test.go
@@ -53,13 +53,14 @@ import (
 // http://onsi.github.io/ginkgo/ to learn more about Ginkgo.
 var (
-	testEnv                 *envtest.Environment
-	dummyDog                watchdog.Watchdog
-	unhealthyNode, peerNode = &v1.Node{}, &v1.Node{}
-	cancelFunc              context.CancelFunc
-	k8sClient               *shared.K8sClientWrapper
-	fakeRecorder            *record.FakeRecorder
-	snrConfig               *selfnoderemediationv1alpha1.SelfNodeRemediationConfig
+	testEnv                    *envtest.Environment
+	dummyDog                   watchdog.Watchdog
+	unhealthyNode, peerNode    = &v1.Node{}, &v1.Node{}
+	cancelFunc                 context.CancelFunc
+	k8sClient                  *shared.K8sClientWrapper
+	fakeRecorder               *record.FakeRecorder
+	snrConfig                  *selfnoderemediationv1alpha1.SelfNodeRemediationConfig
+	apiConnectivityCheckConfig *apicheck.ApiConnectivityCheckConfig
 )
 
 var unhealthyNodeNamespacedName = client.ObjectKey{
@@ -152,14 +153,15 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())
 
 	rebooter := reboot.NewWatchdogRebooter(dummyDog, ctrl.Log.WithName("rebooter"))
-	apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{
-		Log:                ctrl.Log.WithName("api-check"),
-		MyNodeName:         shared.UnhealthyNodeName,
-		CheckInterval:      shared.ApiCheckInterval,
-		MaxErrorsThreshold: shared.MaxErrorThreshold,
-		Peers:              peers,
-		Rebooter:           rebooter,
-		Cfg:                cfg,
+	apiConnectivityCheckConfig = &apicheck.ApiConnectivityCheckConfig{
+		Log:                    ctrl.Log.WithName("api-check"),
+		MyNodeName:             shared.UnhealthyNodeName,
+		CheckInterval:          shared.ApiCheckInterval,
+		MaxErrorsThreshold:     shared.MaxErrorThreshold,
+		Peers:                  peers,
+		Rebooter:               rebooter,
+		Cfg:                    cfg,
+		MinPeersForRemediation: 1,
 	}
 	apiCheck := apicheck.New(apiConnectivityCheckConfig, nil)
 	err = k8sManager.Add(apiCheck)
diff --git a/install/self-node-remediation-deamonset.yaml b/install/self-node-remediation-deamonset.yaml
index 36869fd84..0561b6911 100644
--- a/install/self-node-remediation-deamonset.yaml
+++ b/install/self-node-remediation-deamonset.yaml
@@ -72,6 +72,8 @@ spec:
             value: {{.EndpointHealthCheckUrl}}
           - name: HOST_PORT
             value: "{{.HostPort}}"
+          - name: MIN_PEERS_FOR_REMEDIATION
+            value: "{{.MinPeersForRemediation}}"
           image: {{.Image}}
           imagePullPolicy: Always
           volumeMounts:
diff --git a/pkg/apicheck/check.go b/pkg/apicheck/check.go
index fda7e9019..4233072b6 100644
--- a/pkg/apicheck/check.go
+++ b/pkg/apicheck/check.go
@@ -50,6 +50,7 @@ type ApiConnectivityCheckConfig struct {
 	PeerRequestTimeout        time.Duration
 	PeerHealthPort            int
 	MaxTimeForNoPeersResponse time.Duration
+	MinPeersForRemediation    int
 }
 
 func New(config *ApiConnectivityCheckConfig, controlPlaneManager *controlplane.Manager) *ApiConnectivityCheck {
@@ -129,12 +130,28 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response {
 	c.config.Log.Info("Error count exceeds threshold, trying to ask other nodes if I'm healthy")
 	peersToAsk := c.config.Peers.GetPeersAddresses(peers.Worker)
-	if peersToAsk == nil || len(peersToAsk) == 0 {
-		c.config.Log.Info("Peers list is empty and / or couldn't be retrieved from server, nothing we can do, so consider the node being healthy")
+
+	// Check that we found at least as many peers as the user has configured as required. If we did not
+	// (for instance there are zero peers and the default of 1 is in effect), we do not remediate: with
+	// too few peers we cannot reliably tell an isolated node from a healthy one.
+	// Note: It would be unusual for MinPeersForRemediation to be greater than 1 unless the environment
+	// has specific requirements.
+	if len(peersToAsk) < c.config.MinPeersForRemediation {
+		c.config.Log.Info("Ignoring api-server error as we have an insufficient number of peers found, "+
+			"so we aren't going to attempt to contact any to check for a SelfNodeRemediation CR"+
+			" - we will consider it as if there was no CR present & as healthy.", "minPeersRequired",
+			c.config.MinPeersForRemediation, "actualNumPeersFound", len(peersToAsk))
+
 		// TODO: maybe we need to check if this happens too much and reboot
 		return peers.Response{IsHealthy: true, Reason: peers.HealthyBecauseNoPeersWereFound}
 	}
 
+	// If we reach this point with no peers at all, MinPeersForRemediation must be zero and there is nobody
+	// we can ask about a SelfNodeRemediation CR, so mark the node as unhealthy (isolated) and return early.
+	if len(peersToAsk) == 0 {
+		return peers.Response{IsHealthy: false, Reason: peers.UnHealthyBecauseNodeIsIsolated}
+	}
+
 	apiErrorsResponsesSum := 0
 	nrAllPeers := len(peersToAsk)
 	// peersToAsk is being reduced at every iteration, iterate until no peers left to ask
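
Usage sketch for the single-worker case described above: a SelfNodeRemediationConfig that allows remediation with zero worker peers. Only the minPeersForRemediation field comes from this patch; the namespace and comments are illustrative assumptions.

apiVersion: self-node-remediation.medik8s.io/v1alpha1
kind: SelfNodeRemediationConfig
metadata:
  name: self-node-remediation-config
  # example namespace, adjust to wherever the operator is deployed
  namespace: openshift-workload-availability
spec:
  # With a single worker node there are no worker peers to ask, so allow
  # remediation to proceed without contacting any peers.
  minPeersForRemediation: 0

With the default of 1, the agent keeps the existing behavior: when no worker peers are found it treats the node as healthy and does not reboot.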