diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index 381428f7..41ba8a12 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -28,6 +28,7 @@ const ( ConfigCRName = "self-node-remediation-config" defaultWatchdogPath = "/dev/watchdog" defaultIsSoftwareRebootEnabled = true + defaultMinPeersForRemediation = 1 ) // SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig @@ -127,6 +128,11 @@ type SelfNodeRemediationConfigSpec struct { // CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes. // +optional CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"` + + // +kubebuilder:default:=1 + // +kubebuilder:validation:Minimum=0 + // Minimum number of peer workers/control nodes to attempt to contact before deciding if node is unhealthy or not + MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"` } // SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig @@ -170,6 +176,7 @@ func NewDefaultSelfNodeRemediationConfig() SelfNodeRemediationConfig { Spec: SelfNodeRemediationConfigSpec{ WatchdogFilePath: defaultWatchdogPath, IsSoftwareRebootEnabled: defaultIsSoftwareRebootEnabled, + MinPeersForRemediation: defaultMinPeersForRemediation, }, } } diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index f4d47917..84f7deec 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -125,6 +125,12 @@ spec: its peers. 
minimum: 1 type: integer + minPeersForRemediation: + default: 1 + description: Minimum number of peer workers/control nodes to attempt + to contact before deciding if node is unhealthy or not + minimum: 0 + type: integer peerApiServerTimeout: default: 5s description: |- diff --git a/controllers/selfnoderemediationconfig_controller.go b/controllers/selfnoderemediationconfig_controller.go index b4f64842..1d5c6078 100644 --- a/controllers/selfnoderemediationconfig_controller.go +++ b/controllers/selfnoderemediationconfig_controller.go @@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds() data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl + data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation data.Data["HostPort"] = snrConfig.Spec.HostPort data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled) diff --git a/controllers/tests/config/selfnoderemediationconfig_controller_test.go b/controllers/tests/config/selfnoderemediationconfig_controller_test.go index 8469e931..b272422a 100644 --- a/controllers/tests/config/selfnoderemediationconfig_controller_test.go +++ b/controllers/tests/config/selfnoderemediationconfig_controller_test.go @@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() { Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5)) Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15)) Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60)) + Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1)) }) }) diff --git a/install/self-node-remediation-deamonset.yaml b/install/self-node-remediation-deamonset.yaml index 36869fd8..0561b691 100644 --- 
a/install/self-node-remediation-deamonset.yaml +++ b/install/self-node-remediation-deamonset.yaml @@ -72,6 +72,8 @@ spec: value: {{.EndpointHealthCheckUrl}} - name: HOST_PORT value: "{{.HostPort}}" + - name: MIN_PEERS_FOR_REMEDIATION + value: "{{.MinPeersForRemediation}}" image: {{.Image}} imagePullPolicy: Always volumeMounts: diff --git a/pkg/apicheck/check.go b/pkg/apicheck/check.go index fda7e901..e3d2466d 100644 --- a/pkg/apicheck/check.go +++ b/pkg/apicheck/check.go @@ -50,6 +50,7 @@ type ApiConnectivityCheckConfig struct { PeerRequestTimeout time.Duration PeerHealthPort int MaxTimeForNoPeersResponse time.Duration + MinPeersForRemediation int } func New(config *ApiConnectivityCheckConfig, controlPlaneManager *controlplane.Manager) *ApiConnectivityCheck { @@ -129,12 +130,17 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response { c.config.Log.Info("Error count exceeds threshold, trying to ask other nodes if I'm healthy") peersToAsk := c.config.Peers.GetPeersAddresses(peers.Worker) - if peersToAsk == nil || len(peersToAsk) == 0 { - c.config.Log.Info("Peers list is empty and / or couldn't be retrieved from server, nothing we can do, so consider the node being healthy") - // TODO: maybe we need to check if this happens too much and reboot + if peersToAsk == nil && c.config.MinPeersForRemediation != 0 || len(peersToAsk) < c.config.MinPeersForRemediation { + c.config.Log.Info("Peers list is empty and / or less than the minimum required peers for remediation, so consider the node being healthy") + // TODO: maybe we need to check if this happens too much and reboot return peers.Response{IsHealthy: true, Reason: peers.HealthyBecauseNoPeersWereFound} } + // If MinPeersForRemediation == 0 and there are no peers to contact, assume the node is isolated and therefore unhealthy + if peersToAsk == nil || len(peersToAsk) == 0 { + return peers.Response{IsHealthy: false, Reason: peers.UnHealthyBecauseNodeIsIsolated} + } + apiErrorsResponsesSum := 0 nrAllPeers := len(peersToAsk) // 
peersToAsk is being reduced at every iteration, iterate until no peers left to ask