Skip to content

Commit

Permalink
Add minPeersForRemediation configuration value.
Browse files Browse the repository at this point in the history
With this one can specify the number of worker peers needed to
be able to contact before determining a node is unhealthy.

It covers the case in which there are 3 control plane nodes and a single
worker node, and yet you still want to be able to perform remediations
on that worker node

It has a default of 1, which maintains existing behaviors without
explicitly altering the value.
  • Loading branch information
novasbc committed Jan 8, 2025
1 parent 0bc6866 commit 627c3bb
Show file tree
Hide file tree
Showing 11 changed files with 291 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ bin

# editor and IDE paraphernalia
.idea
*.iml
*.swp
*.swo
*~
Expand Down
10 changes: 10 additions & 0 deletions api/v1alpha1/selfnoderemediationconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const (
ConfigCRName = "self-node-remediation-config"
defaultWatchdogPath = "/dev/watchdog"
defaultIsSoftwareRebootEnabled = true
defaultMinPeersForRemediation = 1
)

// SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig
Expand Down Expand Up @@ -127,6 +128,15 @@ type SelfNodeRemediationConfigSpec struct {
// CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes.
// +optional
CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"`

// Minimum number of peer workers/control nodes to attempt to contact before deciding if node is unhealthy or not
// if set to zero, no other peers will be required to be present for remediation action to occur when this
// node has lost API server access. If an insufficient number of peers are found, we will not attempt to ask
// any peer nodes (if present) whether they see that the current node has been marked unhealthy with a
// SelfNodeRemediation CR
// +kubebuilder:default:=1
// +kubebuilder:validation:Minimum=0
MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"`
}

// SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,17 @@ spec:
its peers.
minimum: 1
type: integer
minPeersForRemediation:
default: 1
description: "Minimum number of peer workers/control nodes to attempt
to contact before deciding if node is unhealthy or not\n\tif set
to zero, no other peers will be required to be present for remediation
action to occur when this\n\tnode has lost API server access. If
an insufficient number of peers are found, we will not attempt to
ask\n\tany peer nodes (if present) whether they see that the current
node has been marked unhealthy with a\n\tSelfNodeRemediation CR"
minimum: 0
type: integer
peerApiServerTimeout:
default: 5s
description: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,17 @@ spec:
its peers.
minimum: 1
type: integer
minPeersForRemediation:
default: 1
description: "Minimum number of peer workers/control nodes to attempt
to contact before deciding if node is unhealthy or not\n\tif set
to zero, no other peers will be required to be present for remediation
action to occur when this\n\tnode has lost API server access. If
an insufficient number of peers are found, we will not attempt to
ask\n\tany peer nodes (if present) whether they see that the current
node has been marked unhealthy with a\n\tSelfNodeRemediation CR"
minimum: 0
type: integer
peerApiServerTimeout:
default: 5s
description: |-
Expand Down
1 change: 1 addition & 0 deletions controllers/selfnoderemediationconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co
data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds()
data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold
data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl
data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation
data.Data["HostPort"] = snrConfig.Spec.HostPort
data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() {
Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5))
Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15))
Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60))
Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1))
})
})

Expand Down
9 changes: 9 additions & 0 deletions controllers/tests/config/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ import (
"github.com/medik8s/self-node-remediation/pkg/apicheck"
"github.com/medik8s/self-node-remediation/pkg/certificates"
"github.com/medik8s/self-node-remediation/pkg/peers"

//+kubebuilder:scaffold:imports
"github.com/medik8s/self-node-remediation/pkg/reboot"
"github.com/medik8s/self-node-remediation/pkg/watchdog"
)

// These tests use Ginkgo (BDD-style Go testing framework). Refer to
Expand All @@ -52,6 +55,7 @@ var cancelFunc context.CancelFunc
var k8sClient *shared.K8sClientWrapper
var certReader certificates.CertStorageReader
var managerReconciler *controllers.SelfNodeRemediationReconciler
var dummyDog watchdog.Watchdog

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)
Expand Down Expand Up @@ -120,12 +124,17 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())

certReader = certificates.NewSecretCertStorage(k8sClient, ctrl.Log.WithName("SecretCertStorage"), shared.Namespace)

dummyDog = watchdog.NewFake(true)
rebooter := reboot.NewWatchdogRebooter(dummyDog, ctrl.Log.WithName("rebooter"))

apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{
Log: ctrl.Log.WithName("api-check"),
MyNodeName: shared.UnhealthyNodeName,
CheckInterval: shared.ApiCheckInterval,
MaxErrorsThreshold: shared.MaxErrorThreshold,
Peers: peers,
Rebooter: rebooter,
Cfg: cfg,
CertReader: certReader,
}
Expand Down
96 changes: 81 additions & 15 deletions controllers/tests/controller/selfnoderemediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,22 +445,88 @@ var _ = Describe("SNR Controller", func() {
remediationStrategy = v1alpha1.ResourceDeletionRemediationStrategy
})

It("Verify that watchdog is not receiving food after some time", func() {
lastFoodTime := dummyDog.LastFoodTime()
timeout := dummyDog.GetTimeout()
Eventually(func() bool {
newTime := dummyDog.LastFoodTime()
// ensure the timeout passed
timeoutPassed := time.Now().After(lastFoodTime.Add(3 * timeout))
// ensure wd wasn't feeded
missedFeed := newTime.Before(lastFoodTime.Add(timeout))
if timeoutPassed && missedFeed {
return true
}
lastFoodTime = newTime
return false
}, 10*shared.PeerUpdateInterval, timeout).Should(BeTrue())
AfterEach(func() {
By("Restore default settings")
apiConnectivityCheckConfig.MinPeersForRemediation = 1

// sleep so config can update
time.Sleep(time.Second * 2)
})

Context("no peer found, and using default setting for MinPeersForRemediation", func() {
BeforeEach(func() {
apiConnectivityCheckConfig.MinPeersForRemediation = 1
snrConfig.Spec.MinPeersForRemediation = 1
})

It("Does not receive peer communication and has MinPeersForRemediation set to 1 so no reboot "+
"will occur", func() {
lastFoodTime := dummyDog.LastFoodTime()
timeout := dummyDog.GetTimeout()
fmt.Printf("timeout: %s", timeout.String())
Eventually(func() bool {
fmt.Println("YYYY")
newTime := dummyDog.LastFoodTime()
fmt.Printf("newTime: %s\n", newTime.Format(time.StampMilli))
// ensure the timeout passed
timeoutPassed := time.Now().After(lastFoodTime.Add(3 * timeout))
// ensure wd wasn't feeded
fmt.Printf("missedFeed time: %s\n", lastFoodTime.Add(timeout).Format(time.StampMilli))
missedFeed := newTime.Before(lastFoodTime.Add(timeout))
if timeoutPassed && missedFeed {
fmt.Println("timeout passed and missed feed")
return true
}
lastFoodTime = newTime
fmt.Println("timeout didn't pass and/or didn't miss feed")
return false
}, 10*shared.PeerUpdateInterval, timeout).Should(BeTrue())
})
})

Context("no peer found and MinPeersForRemediation is configured to 0", func() {
JustBeforeEach(func() {
apiConnectivityCheckConfig.MinPeersForRemediation = 0

// sleep so config can update
time.Sleep(time.Second * 2)
})

It("Does not receive peer communication and since configured to need zero peers, initiates a reboot",
func() {
fmt.Println("test 0")
lastFoodTime := dummyDog.LastFoodTime()
fmt.Printf("original lastFoodTime: %s\n", lastFoodTime.Format(time.StampMilli))
timeout := dummyDog.GetTimeout()
fmt.Printf("timeout: %s\n", timeout.String())

Eventually(func() bool {
fmt.Println("test 1")
newTime := dummyDog.LastFoodTime()
fmt.Printf("newTime: %s\n", newTime.Format(time.StampMilli))
// ensure the timeout passed
n := time.Now()
fmt.Printf("now (%s) should be past 3x past lastFoodTime(%s) which is %s\n",
n.Format(time.StampMilli), lastFoodTime.Format(time.StampMilli), lastFoodTime.Add(3*timeout).Format(time.StampMilli))
//fmt.Printf("timeout: %s\n", lastFoodTime.Add(3*timeout).Format(time.StampMilli))
timeoutPassed := n.After(lastFoodTime.Add(3 * timeout))
// ensure wd wasn't feeded
fmt.Printf("missedFeed time: %s\n", lastFoodTime.Add(timeout).Format(time.StampMilli))
fmt.Printf("newTime (%s) should be before lastFoodTime+%s (%s)\n", newTime.Format(time.StampMilli), timeout.String(),
lastFoodTime.Add(timeout).Format(time.StampMilli))
missedFeed := newTime.Before(lastFoodTime.Add(timeout))
if timeoutPassed && missedFeed {
fmt.Println("returning true")
return true
}
lastFoodTime = newTime

fmt.Println("returning false")
return false
}, "15s", timeout).Should(BeTrue())
})
})

})

Context("Configuration is missing", func() {
Expand Down
Loading

0 comments on commit 627c3bb

Please sign in to comment.