Skip to content

Commit

Permalink
Add minPeersForRemediation configuration value.
Browse files Browse the repository at this point in the history
With this one can specify the number of worker peers needed to
be able to contact before determining a node is unhealthy.

It covers the case in which there are 3 control plane nodes and a single
worker node, and yet you still want to be able to perform remediations
on that worker node

It has a default of 1, which maintains existing behaviors without
explicitly altering the value.
  • Loading branch information
novasbc committed Nov 18, 2024
1 parent 0bc6866 commit 117656c
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 2 deletions.
10 changes: 10 additions & 0 deletions api/v1alpha1/selfnoderemediationconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const (
ConfigCRName = "self-node-remediation-config"
defaultWatchdogPath = "/dev/watchdog"
defaultIsSoftwareRebootEnabled = true
defaultMinPeersForRemediation = 1
)

// SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig
Expand Down Expand Up @@ -127,6 +128,15 @@ type SelfNodeRemediationConfigSpec struct {
// CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes.
// +optional
CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"`

// Minimum number of peer workers/control nodes to attempt to contact before deciding if node is unhealthy or not
// if set to zero, no other peers will be required to be present for remediation action to occur when this
// node has lost API server access. If an insufficient number of peers are found, we will not attempt to ask
// any peer nodes (if present) whether they see that the current node has been marked unhealthy with a
// SelfNodeRemediation CR
// +kubebuilder:default:=1
// +kubebuilder:validation:Minimum=0
MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"`
}

// SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,17 @@ spec:
its peers.
minimum: 1
type: integer
minPeersForRemediation:
default: 1
description: "Minimum number of peer workers/control nodes to attempt
to contact before deciding if node is unhealthy or not\n\tif set
to zero, no other peers will be required to be present for remediation
action to occur when this\n\tnode has lost API server access. If
an insufficient number of peers are found, we will not attempt to
ask\n\tany peer nodes (if present) whether they see that the current
node has been marked unhealthy with a\n\tSelfNodeRemediation CR"
minimum: 0
type: integer
peerApiServerTimeout:
default: 5s
description: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,17 @@ spec:
its peers.
minimum: 1
type: integer
minPeersForRemediation:
default: 1
description: "Minimum number of peer workers/control nodes to attempt
to contact before deciding if node is unhealthy or not\n\tif set
to zero, no other peers will be required to be present for remediation
action to occur when this\n\tnode has lost API server access. If
an insufficient number of peers are found, we will not attempt to
ask\n\tany peer nodes (if present) whether they see that the current
node has been marked unhealthy with a\n\tSelfNodeRemediation CR"
minimum: 0
type: integer
peerApiServerTimeout:
default: 5s
description: |-
Expand Down
1 change: 1 addition & 0 deletions controllers/selfnoderemediationconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co
data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds()
data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold
data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl
data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation
data.Data["HostPort"] = snrConfig.Spec.HostPort
data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() {
Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5))
Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15))
Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60))
Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1))
})
})

Expand Down
9 changes: 9 additions & 0 deletions controllers/tests/config/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ import (
"github.com/medik8s/self-node-remediation/pkg/apicheck"
"github.com/medik8s/self-node-remediation/pkg/certificates"
"github.com/medik8s/self-node-remediation/pkg/peers"

//+kubebuilder:scaffold:imports
"github.com/medik8s/self-node-remediation/pkg/reboot"
"github.com/medik8s/self-node-remediation/pkg/watchdog"
)

// These tests use Ginkgo (BDD-style Go testing framework). Refer to
Expand All @@ -52,6 +55,7 @@ var cancelFunc context.CancelFunc
var k8sClient *shared.K8sClientWrapper
var certReader certificates.CertStorageReader
var managerReconciler *controllers.SelfNodeRemediationReconciler
var dummyDog watchdog.Watchdog

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)
Expand Down Expand Up @@ -120,12 +124,17 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())

certReader = certificates.NewSecretCertStorage(k8sClient, ctrl.Log.WithName("SecretCertStorage"), shared.Namespace)

dummyDog = watchdog.NewFake(true)
rebooter := reboot.NewWatchdogRebooter(dummyDog, ctrl.Log.WithName("rebooter"))

apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{
Log: ctrl.Log.WithName("api-check"),
MyNodeName: shared.UnhealthyNodeName,
CheckInterval: shared.ApiCheckInterval,
MaxErrorsThreshold: shared.MaxErrorThreshold,
Peers: peers,
Rebooter: rebooter,
Cfg: cfg,
CertReader: certReader,
}
Expand Down
2 changes: 2 additions & 0 deletions install/self-node-remediation-deamonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ spec:
value: {{.EndpointHealthCheckUrl}}
- name: HOST_PORT
value: "{{.HostPort}}"
- name: MIN_PEERS_FOR_REMEDIATION
value: "{{.MinPeersForRemediation}}"
image: {{.Image}}
imagePullPolicy: Always
volumeMounts:
Expand Down
21 changes: 19 additions & 2 deletions pkg/apicheck/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type ApiConnectivityCheckConfig struct {
PeerRequestTimeout time.Duration
PeerHealthPort int
MaxTimeForNoPeersResponse time.Duration
MinPeersForRemediation int
}

func New(config *ApiConnectivityCheckConfig, controlPlaneManager *controlplane.Manager) *ApiConnectivityCheck {
Expand Down Expand Up @@ -129,12 +130,28 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response {

c.config.Log.Info("Error count exceeds threshold, trying to ask other nodes if I'm healthy")
peersToAsk := c.config.Peers.GetPeersAddresses(peers.Worker)
if peersToAsk == nil || len(peersToAsk) == 0 {
c.config.Log.Info("Peers list is empty and / or couldn't be retrieved from server, nothing we can do, so consider the node being healthy")

// We check to see if we have at least the number of peers that the user has configured as required.
// If we don't have this many peers (for instance there are zero peers, and the default value is set
// which requires at least one peer), we don't want to remediate. In this case we have some confusion
// and don't want to remediate a node when we shouldn't. Note: It would be unusual for MinPeersForRemediation
// to be greater than 1 unless the environment has specific requirements.
if len(peersToAsk) < c.config.MinPeersForRemediation {
c.config.Log.Info("Ignoring api-server error as we have an insufficient number of peers found, "+
"so we aren't going to attempt to contact any to check for a SelfNodeRemediation CR"+
" - we will consider it as if there was no CR present & as healthy.", "minPeersRequired",
c.config.MinPeersForRemediation, "actualNumPeersFound", len(peersToAsk))

// TODO: maybe we need to check if this happens too much and reboot
return peers.Response{IsHealthy: true, Reason: peers.HealthyBecauseNoPeersWereFound}
}

// If we make it here and there are no peers, we can't proceed because we need at least one peer
// to check. So it doesn't make sense to continue on - we'll mark as unhealthy and exit fast
if len(peersToAsk) == 0 {
return peers.Response{IsHealthy: false, Reason: peers.UnHealthyBecauseNodeIsIsolated}
}

apiErrorsResponsesSum := 0
nrAllPeers := len(peersToAsk)
// peersToAsk is being reduced at every iteration, iterate until no peers left to ask
Expand Down

0 comments on commit 117656c

Please sign in to comment.