diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index 381428f7..ae7e318e 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -28,6 +28,7 @@ const ( ConfigCRName = "self-node-remediation-config" defaultWatchdogPath = "/dev/watchdog" defaultIsSoftwareRebootEnabled = true + defaultMinPeersForRemediation = 1 ) // SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig @@ -127,6 +128,15 @@ type SelfNodeRemediationConfigSpec struct { // CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes. // +optional CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"` + + // Minimum number of peer workers/control nodes to attempt to contact before deciding if node is unhealthy or not + // if set to zero, no other peers will be required to be present for remediation action to occur when this + // node has lost API server access. If an insufficient number of peers are found, we will not attempt to ask + // any peer nodes (if present) whether they see that the current node has been marked unhealthy with a + // SelfNodeRemediation CR + // +kubebuilder:default:=1 + // +kubebuilder:validation:Minimum=0 + MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"` } // SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index bea94e5e..9cb12ca8 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -127,6 +127,17 @@ spec: its peers. minimum: 1 type: integer + minPeersForRemediation: + default: 1 + description: "Minimum number of peer workers/control nodes to attempt + to contact before deciding if node is unhealthy or not\n\tif set + to zero, no other peers will be required to be present for remediation + action to occur when this\n\tnode has lost API server access. If + an insufficient number of peers are found, we will not attempt to + ask\n\tany peer nodes (if present) whether they see that the current + node has been marked unhealthy with a\n\tSelfNodeRemediation CR" + minimum: 0 + type: integer peerApiServerTimeout: default: 5s description: |- diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index f4d47917..a152b179 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -125,6 +125,17 @@ spec: its peers. minimum: 1 type: integer + minPeersForRemediation: + default: 1 + description: "Minimum number of peer workers/control nodes to attempt + to contact before deciding if node is unhealthy or not\n\tif set + to zero, no other peers will be required to be present for remediation + action to occur when this\n\tnode has lost API server access. If + an insufficient number of peers are found, we will not attempt to + ask\n\tany peer nodes (if present) whether they see that the current + node has been marked unhealthy with a\n\tSelfNodeRemediation CR" + minimum: 0 + type: integer peerApiServerTimeout: default: 5s description: |- diff --git a/controllers/selfnoderemediationconfig_controller.go b/controllers/selfnoderemediationconfig_controller.go index b4f64842..1d5c6078 100644 --- a/controllers/selfnoderemediationconfig_controller.go +++ b/controllers/selfnoderemediationconfig_controller.go @@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds() data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl + data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation data.Data["HostPort"] = snrConfig.Spec.HostPort data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled) diff --git a/controllers/tests/config/selfnoderemediationconfig_controller_test.go b/controllers/tests/config/selfnoderemediationconfig_controller_test.go index 8469e931..b272422a 100644 --- a/controllers/tests/config/selfnoderemediationconfig_controller_test.go +++ b/controllers/tests/config/selfnoderemediationconfig_controller_test.go @@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() { Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5)) Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15)) Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60)) + Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1)) }) }) diff --git a/controllers/tests/config/suite_test.go b/controllers/tests/config/suite_test.go index f25fe157..7374be0f 100644 --- a/controllers/tests/config/suite_test.go +++ b/controllers/tests/config/suite_test.go @@ -40,7 +40,10 @@ import ( "github.com/medik8s/self-node-remediation/pkg/apicheck" "github.com/medik8s/self-node-remediation/pkg/certificates" "github.com/medik8s/self-node-remediation/pkg/peers" + //+kubebuilder:scaffold:imports + "github.com/medik8s/self-node-remediation/pkg/reboot" + "github.com/medik8s/self-node-remediation/pkg/watchdog" ) // These tests use Ginkgo (BDD-style Go testing framework). Refer to @@ -52,6 +55,7 @@ var cancelFunc context.CancelFunc var k8sClient *shared.K8sClientWrapper var certReader certificates.CertStorageReader var managerReconciler *controllers.SelfNodeRemediationReconciler +var dummyDog watchdog.Watchdog func TestAPIs(t *testing.T) { RegisterFailHandler(Fail) @@ -120,12 +124,17 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) certReader = certificates.NewSecretCertStorage(k8sClient, ctrl.Log.WithName("SecretCertStorage"), shared.Namespace) + + dummyDog = watchdog.NewFake(true) + rebooter := reboot.NewWatchdogRebooter(dummyDog, ctrl.Log.WithName("rebooter")) + apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{ Log: ctrl.Log.WithName("api-check"), MyNodeName: shared.UnhealthyNodeName, CheckInterval: shared.ApiCheckInterval, MaxErrorsThreshold: shared.MaxErrorThreshold, Peers: peers, + Rebooter: rebooter, Cfg: cfg, CertReader: certReader, } diff --git a/install/self-node-remediation-deamonset.yaml b/install/self-node-remediation-deamonset.yaml index 36869fd8..0561b691 100644 --- a/install/self-node-remediation-deamonset.yaml +++ b/install/self-node-remediation-deamonset.yaml @@ -72,6 +72,8 @@ spec: value: {{.EndpointHealthCheckUrl}} - name: HOST_PORT value: "{{.HostPort}}" + - name: MIN_PEERS_FOR_REMEDIATION + value: "{{.MinPeersForRemediation}}" image: {{.Image}} imagePullPolicy: Always volumeMounts: diff --git a/pkg/apicheck/check.go b/pkg/apicheck/check.go index fda7e901..4233072b 100644 --- a/pkg/apicheck/check.go +++ b/pkg/apicheck/check.go @@ -50,6 +50,7 @@ type ApiConnectivityCheckConfig struct { PeerRequestTimeout time.Duration PeerHealthPort int MaxTimeForNoPeersResponse time.Duration + MinPeersForRemediation int } func New(config *ApiConnectivityCheckConfig, controlPlaneManager *controlplane.Manager) *ApiConnectivityCheck { @@ -129,12 +130,28 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response { c.config.Log.Info("Error count exceeds threshold, trying to ask other nodes if I'm healthy") peersToAsk := c.config.Peers.GetPeersAddresses(peers.Worker) - if peersToAsk == nil || len(peersToAsk) == 0 { - c.config.Log.Info("Peers list is empty and / or couldn't be retrieved from server, nothing we can do, so consider the node being healthy") + + // We check to see if we have at least the number of peers that the user has configured as required. + // If we don't have this many peers (for instance there are zero peers, and the default value is set + // which requires at least one peer), we don't want to remediate. In this case we have some confusion + // and don't want to remediate a node when we shouldn't. Note: It would be unusual for MinPeersForRemediation + // to be greater than 1 unless the environment has specific requirements. + if len(peersToAsk) < c.config.MinPeersForRemediation { + c.config.Log.Info("Ignoring api-server error as we have an insufficient number of peers found, "+ + "so we aren't going to attempt to contact any to check for a SelfNodeRemediation CR"+ + " - we will consider it as if there was no CR present & as healthy.", "minPeersRequired", + c.config.MinPeersForRemediation, "actualNumPeersFound", len(peersToAsk)) + // TODO: maybe we need to check if this happens too much and reboot return peers.Response{IsHealthy: true, Reason: peers.HealthyBecauseNoPeersWereFound} } + // If we make it here and there are no peers, we can't proceed because we need at least one peer + // to check. So it doesn't make sense to continue on - we'll mark as unhealthy and exit fast + if len(peersToAsk) == 0 { + return peers.Response{IsHealthy: false, Reason: peers.UnHealthyBecauseNodeIsIsolated} + } + apiErrorsResponsesSum := 0 nrAllPeers := len(peersToAsk) // peersToAsk is being reduced at every iteration, iterate until no peers left to ask