Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configurable minimum worker nodecount #238

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ bin

# editor and IDE paraphernalia
.idea
*.iml
*.swp
*.swo
*~
Expand Down
10 changes: 10 additions & 0 deletions api/v1alpha1/selfnoderemediationconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const (
ConfigCRName = "self-node-remediation-config"
defaultWatchdogPath = "/dev/watchdog"
defaultIsSoftwareRebootEnabled = true
defaultMinPeersForRemediation = 1
)

// SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig
Expand Down Expand Up @@ -127,6 +128,15 @@ type SelfNodeRemediationConfigSpec struct {
// CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes.
// +optional
CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"`

// Minimum number of peer workers/control nodes to attempt to contact before deciding if node is unhealthy or not
// if set to zero, no other peers will be required to be present for remediation action to occur when this
// node has lost API server access. If an insufficient number of peers are found, we will not attempt to ask
// any peer nodes (if present) whether they see that the current node has been marked unhealthy with a
// SelfNodeRemediation CR
// +kubebuilder:default:=1
// +kubebuilder:validation:Minimum=0
MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"`
}

// SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,17 @@ spec:
its peers.
minimum: 1
type: integer
minPeersForRemediation:
default: 1
description: "Minimum number of peer workers/control nodes to attempt
to contact before deciding if node is unhealthy or not\n\tif set
to zero, no other peers will be required to be present for remediation
action to occur when this\n\tnode has lost API server access. If
an insufficient number of peers are found, we will not attempt to
ask\n\tany peer nodes (if present) whether they see that the current
node has been marked unhealthy with a\n\tSelfNodeRemediation CR"
minimum: 0
type: integer
peerApiServerTimeout:
default: 5s
description: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,17 @@ spec:
its peers.
minimum: 1
type: integer
minPeersForRemediation:
default: 1
description: "Minimum number of peer workers/control nodes to attempt
to contact before deciding if node is unhealthy or not\n\tif set
to zero, no other peers will be required to be present for remediation
action to occur when this\n\tnode has lost API server access. If
an insufficient number of peers are found, we will not attempt to
ask\n\tany peer nodes (if present) whether they see that the current
node has been marked unhealthy with a\n\tSelfNodeRemediation CR"
minimum: 0
type: integer
peerApiServerTimeout:
default: 5s
description: |-
Expand Down
1 change: 1 addition & 0 deletions controllers/selfnoderemediationconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co
data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds()
data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold
data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl
data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation
data.Data["HostPort"] = snrConfig.Spec.HostPort
data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() {
Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5))
Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15))
Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60))
Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1))
})
})

Expand Down
9 changes: 9 additions & 0 deletions controllers/tests/config/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ import (
"github.com/medik8s/self-node-remediation/pkg/apicheck"
"github.com/medik8s/self-node-remediation/pkg/certificates"
"github.com/medik8s/self-node-remediation/pkg/peers"

//+kubebuilder:scaffold:imports
"github.com/medik8s/self-node-remediation/pkg/reboot"
"github.com/medik8s/self-node-remediation/pkg/watchdog"
)

// These tests use Ginkgo (BDD-style Go testing framework). Refer to
Expand All @@ -52,6 +55,7 @@ var cancelFunc context.CancelFunc
var k8sClient *shared.K8sClientWrapper
var certReader certificates.CertStorageReader
var managerReconciler *controllers.SelfNodeRemediationReconciler
var dummyDog watchdog.Watchdog

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)
Expand Down Expand Up @@ -120,12 +124,17 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())

certReader = certificates.NewSecretCertStorage(k8sClient, ctrl.Log.WithName("SecretCertStorage"), shared.Namespace)

dummyDog = watchdog.NewFake(true)
rebooter := reboot.NewWatchdogRebooter(dummyDog, ctrl.Log.WithName("rebooter"))

apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{
Log: ctrl.Log.WithName("api-check"),
MyNodeName: shared.UnhealthyNodeName,
CheckInterval: shared.ApiCheckInterval,
MaxErrorsThreshold: shared.MaxErrorThreshold,
Peers: peers,
Rebooter: rebooter,
Cfg: cfg,
CertReader: certReader,
}
Expand Down
96 changes: 81 additions & 15 deletions controllers/tests/controller/selfnoderemediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,22 +445,88 @@ var _ = Describe("SNR Controller", func() {
remediationStrategy = v1alpha1.ResourceDeletionRemediationStrategy
})

It("Verify that watchdog is not receiving food after some time", func() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, any particular reason this test was removed ?

lastFoodTime := dummyDog.LastFoodTime()
timeout := dummyDog.GetTimeout()
Eventually(func() bool {
newTime := dummyDog.LastFoodTime()
// ensure the timeout passed
timeoutPassed := time.Now().After(lastFoodTime.Add(3 * timeout))
// ensure wd wasn't feeded
missedFeed := newTime.Before(lastFoodTime.Add(timeout))
if timeoutPassed && missedFeed {
return true
}
lastFoodTime = newTime
return false
}, 10*shared.PeerUpdateInterval, timeout).Should(BeTrue())
AfterEach(func() {
By("Restore default settings")
apiConnectivityCheckConfig.MinPeersForRemediation = 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I think this makes more sense in the general After Each block


// sleep so config can update
time.Sleep(time.Second * 2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is needed, IIUC config isn't being updated async here

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I verified that if I didn't put the sleep often times the config was not updated when the other routines ran. It took me quite some time to track this particular problem down 😭

It's even worse if I disable some tests so things run in different orders

})

Context("no peer found, and using default setting for MinPeersForRemediation", func() {
BeforeEach(func() {
apiConnectivityCheckConfig.MinPeersForRemediation = 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is redundant because was already updated in the AfterEach block

snrConfig.Spec.MinPeersForRemediation = 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this is the default value, so I think it would make more sense to add it in GenerateTestConfig

})

It("Does not receive peer communication and has MinPeersForRemediation set to 1 so no reboot "+
"will occur", func() {
lastFoodTime := dummyDog.LastFoodTime()
timeout := dummyDog.GetTimeout()
fmt.Printf("timeout: %s", timeout.String())
Eventually(func() bool {
fmt.Println("YYYY")
newTime := dummyDog.LastFoodTime()
fmt.Printf("newTime: %s\n", newTime.Format(time.StampMilli))
// ensure the timeout passed
timeoutPassed := time.Now().After(lastFoodTime.Add(3 * timeout))
// ensure wd wasn't feeded
fmt.Printf("missedFeed time: %s\n", lastFoodTime.Add(timeout).Format(time.StampMilli))
missedFeed := newTime.Before(lastFoodTime.Add(timeout))
if timeoutPassed && missedFeed {
fmt.Println("timeout passed and missed feed")
return true
}
lastFoodTime = newTime
fmt.Println("timeout didn't pass and/or didn't miss feed")
return false
}, 10*shared.PeerUpdateInterval, timeout).Should(BeTrue())
})
})

Context("no peer found and MinPeersForRemediation is configured to 0", func() {
JustBeforeEach(func() {
apiConnectivityCheckConfig.MinPeersForRemediation = 0

// sleep so config can update
time.Sleep(time.Second * 2)
})

It("Does not receive peer communication and since configured to need zero peers, initiates a reboot",
func() {
fmt.Println("test 0")
lastFoodTime := dummyDog.LastFoodTime()
fmt.Printf("original lastFoodTime: %s\n", lastFoodTime.Format(time.StampMilli))
timeout := dummyDog.GetTimeout()
fmt.Printf("timeout: %s\n", timeout.String())

Eventually(func() bool {
fmt.Println("test 1")
newTime := dummyDog.LastFoodTime()
fmt.Printf("newTime: %s\n", newTime.Format(time.StampMilli))
// ensure the timeout passed
n := time.Now()
fmt.Printf("now (%s) should be past 3x past lastFoodTime(%s) which is %s\n",
n.Format(time.StampMilli), lastFoodTime.Format(time.StampMilli), lastFoodTime.Add(3*timeout).Format(time.StampMilli))
//fmt.Printf("timeout: %s\n", lastFoodTime.Add(3*timeout).Format(time.StampMilli))
timeoutPassed := n.After(lastFoodTime.Add(3 * timeout))
// ensure wd wasn't feeded
fmt.Printf("missedFeed time: %s\n", lastFoodTime.Add(timeout).Format(time.StampMilli))
fmt.Printf("newTime (%s) should be before lastFoodTime+%s (%s)\n", newTime.Format(time.StampMilli), timeout.String(),
lastFoodTime.Add(timeout).Format(time.StampMilli))
missedFeed := newTime.Before(lastFoodTime.Add(timeout))
if timeoutPassed && missedFeed {
fmt.Println("returning true")
return true
}
lastFoodTime = newTime

fmt.Println("returning false")
return false
}, "15s", timeout).Should(BeTrue())
})
})

})

Context("Configuration is missing", func() {
Expand Down
Loading