From 0858540f7487b926caf32928500e047e137dbf76 Mon Sep 17 00:00:00 2001
From: Gonzalo Reyero Ferreras
Date: Wed, 22 Nov 2023 03:44:19 -0600
Subject: [PATCH] Lifecycle ts: fix to avoid some checks being skipped.

WithSkipCheckFn applies either an OR (default behaviour) or an AND
operation to **all** the skip functions that have been passed. It's not
possible to mix both operations, so I had to create a helper function to
check whether both the env.Deployments and env.StatefulSets slices are
empty.

Examples:

  check1.WithSkipCheckFn(skipFn1, skipFn2)

    check1 will be skipped if **any** of those skip functions returns true
    (OR op, because check.SkipMode=SkipModeAny by default).

  check2.WithSkipCheckFn(skipFn1, skipFn2).WithSkipModeAll()

    check2 will be skipped only if **both** skipFn1 and skipFn2 return true
    (AND op, check.SkipMode=SkipModeAll).

Also:
- Some refactoring was done in the lifecycle-pod-recreation check to create
  the compliant/non-compliant report object lists for deployments and
  statefulsets.
- Added skip functions for the lifecycle-storage-provisioner check and set
  the pod as compliant if it's not using any PVC/StorageClass.
- Added passing checks from the lifecycle and networking test suites to the
  gradetool's policy json.

ToDos:
- Refactor the testPodsRecreation() check function to avoid the cyclomatic
  complexity linting issue.
- Refactor the testStorageProvisioner() check function to add some
  short-circuits to avoid the for-if-for-if-for-if... pattern.
---
 .../lifecycle/podsets/podsets.go              |  12 +-
 cnf-certification-test/lifecycle/suite.go     | 110 +++++++++++-----
 generated_policy.json                         | 124 +++++++++++++++---
 pkg/testhelper/testhelper.go                  |  18 +++
 4 files changed, 207 insertions(+), 57 deletions(-)

diff --git a/cnf-certification-test/lifecycle/podsets/podsets.go b/cnf-certification-test/lifecycle/podsets/podsets.go
index d1e7a0237..7169bd35a 100644
--- a/cnf-certification-test/lifecycle/podsets/podsets.go
+++ b/cnf-certification-test/lifecycle/podsets/podsets.go
@@ -181,7 +181,10 @@ func getNotReadyStatefulSets(statefulSets []*provider.StatefulSet) []*provider.S
 	return notReadyStatefulSets
 }
 
-func WaitForAllPodSetsReady(env *provider.TestEnvironment, timeout time.Duration) (claimsLog loghelper.CuratedLogLines, atLeastOnePodsetNotReady bool) {
+func WaitForAllPodSetsReady(env *provider.TestEnvironment, timeout time.Duration) (
+	claimsLog loghelper.CuratedLogLines,
+	notReadyDeployments []*provider.Deployment,
+	notReadyStatefulSets []*provider.StatefulSet) {
 	const queryInterval = 15 * time.Second
 
 	deploymentsToCheck := env.Deployments
@@ -190,10 +193,10 @@ func WaitForAllPodSetsReady(env *provider.TestEnvironment, timeout time.Duration
 	logrus.Infof("Waiting %s for %d podsets to be ready.", timeout, len(deploymentsToCheck)+len(statefulSetsToCheck))
 	for startTime := time.Now(); time.Since(startTime) < timeout; {
 		logrus.Infof("Checking Deployments readiness of Deployments %v", getDeploymentsInfo(deploymentsToCheck))
-		notReadyDeployments := getNotReadyDeployments(deploymentsToCheck)
+		notReadyDeployments = getNotReadyDeployments(deploymentsToCheck)
 
 		logrus.Infof("Checking StatefulSets readiness of StatefulSets %v", getStatefulSetsInfo(statefulSetsToCheck))
-		notReadyStatefulSets := getNotReadyStatefulSets(statefulSetsToCheck)
+		notReadyStatefulSets = getNotReadyStatefulSets(statefulSetsToCheck)
 
 		logrus.Infof("Not ready Deployments: %v", getDeploymentsInfo(notReadyDeployments))
 		logrus.Infof("Not ready StatefulSets: %v", getStatefulSetsInfo(notReadyStatefulSets))
@@ -213,8 +216,7 @@ func WaitForAllPodSetsReady(env *provider.TestEnvironment, timeout time.Duration
 	claimsLog.AddLogLine("Not ready Deployments: %v", getDeploymentsInfo(deploymentsToCheck))
 	claimsLog.AddLogLine("Not ready StatefulSets: %v", getStatefulSetsInfo(statefulSetsToCheck))
 
-	atLeastOnePodsetNotReady = len(deploymentsToCheck) > 0 || len(statefulSetsToCheck) > 0
-	return claimsLog, atLeastOnePodsetNotReady
+	return claimsLog, deploymentsToCheck, statefulSetsToCheck
 }
 
 func GetAllNodesForAllPodSets(pods []*provider.Pod) (nodes map[string]bool) {
diff --git a/cnf-certification-test/lifecycle/suite.go b/cnf-certification-test/lifecycle/suite.go
index 92d5bb1a0..9a1daa674 100644
--- a/cnf-certification-test/lifecycle/suite.go
+++ b/cnf-certification-test/lifecycle/suite.go
@@ -56,6 +56,14 @@ var (
 		env = provider.GetTestEnvironment()
 		return nil
 	}
+
+	// podset = deployment or statefulset
+	skipIfNoPodSetsUnderTest = func() (bool, string) {
+		if len(env.Deployments) == 0 && len(env.StatefulSets) == 0 {
+			return true, "no deployments or statefulsets found to check"
+		}
+		return false, ""
+	}
 )
 
 //nolint:funlen
@@ -144,10 +152,8 @@ func init() {
 	// High availability test
 	testID, tags = identifiers.GetGinkgoTestIDAndLabels(identifiers.TestPodHighAvailabilityBestPractices)
 	checksGroup.Add(checksdb.NewCheck(testID, tags).
-		WithSkipCheckFn(
-			testhelper.GetNotEnoughWorkersSkipFn(&env, minWorkerNodesForLifecycle),
-			testhelper.GetNoDeploymentsUnderTestSkipFn(&env),
-			testhelper.GetNoStatefulSetsUnderTestSkipFn(&env)).
+		WithSkipCheckFn(testhelper.GetNotEnoughWorkersSkipFn(&env, minWorkerNodesForLifecycle)).
+		WithSkipCheckFn(skipIfNoPodSetsUnderTest).
 		WithCheckFn(func(c *checksdb.Check) error {
 			testHighAvailability(c, &env)
 			return nil
@@ -169,9 +175,8 @@ func init() {
 	checksGroup.Add(checksdb.NewCheck(testID, tags).
 		WithSkipCheckFn(
 			testhelper.GetNotEnoughWorkersSkipFn(&env, minWorkerNodesForLifecycle),
-			testhelper.GetNoDeploymentsUnderTestSkipFn(&env),
-			testhelper.GetNoStatefulSetsUnderTestSkipFn(&env),
 			testhelper.GetNotIntrusiveSkipFn(&env)).
+		WithSkipCheckFn(skipIfNoPodSetsUnderTest).
 		WithCheckFn(func(c *checksdb.Check) error {
 			testPodsRecreation(c, &env)
 			return nil
@@ -182,9 +187,8 @@ func init() {
 	checksGroup.Add(checksdb.NewCheck(testID, tags).
 		WithSkipCheckFn(
 			testhelper.GetNotIntrusiveSkipFn(&env),
-			testhelper.GetNotEnoughWorkersSkipFn(&env, minWorkerNodesForLifecycle),
-			testhelper.GetNoDeploymentsUnderTestSkipFn(&env),
-			testhelper.GetNoStatefulSetsUnderTestSkipFn(&env)).
+			testhelper.GetNotEnoughWorkersSkipFn(&env, minWorkerNodesForLifecycle)).
+		WithSkipCheckFn(skipIfNoPodSetsUnderTest).
 		WithCheckFn(func(c *checksdb.Check) error {
 			testDeploymentScaling(&env, timeout, c)
 			return nil
@@ -195,9 +199,8 @@ func init() {
 	checksGroup.Add(checksdb.NewCheck(testID, tags).
 		WithSkipCheckFn(
 			testhelper.GetNotIntrusiveSkipFn(&env),
-			testhelper.GetNotEnoughWorkersSkipFn(&env, minWorkerNodesForLifecycle),
-			testhelper.GetNoDeploymentsUnderTestSkipFn(&env),
-			testhelper.GetNoStatefulSetsUnderTestSkipFn(&env)).
+			testhelper.GetNotEnoughWorkersSkipFn(&env, minWorkerNodesForLifecycle)).
+		WithSkipCheckFn(skipIfNoPodSetsUnderTest).
 		WithCheckFn(func(c *checksdb.Check) error {
 			testStatefulSetScaling(&env, timeout, c)
 			return nil
@@ -244,7 +247,10 @@ func init() {
 	// Storage provisioner test
 	testID, tags = identifiers.GetGinkgoTestIDAndLabels(identifiers.TestStorageProvisioner)
 	checksGroup.Add(checksdb.NewCheck(testID, tags).
-		WithSkipCheckFn(testhelper.GetNoPodsUnderTestSkipFn(&env)).
+		WithSkipCheckFn(
+			testhelper.GetNoPodsUnderTestSkipFn(&env),
+			testhelper.GetNoStorageClassesSkipFn(&env),
+			testhelper.GetNoPersistentVolumeClaimsSkipFn(&env)).
 		WithCheckFn(func(c *checksdb.Check) error {
 			testStorageProvisioner(c, &env)
 			return nil
@@ -575,7 +581,7 @@ func testHighAvailability(check *checksdb.Check, env *provider.TestEnvironment)
 }
 
 // testPodsRecreation tests that pods belonging to deployments and statefulsets are re-created and ready in case a node is lost
-func testPodsRecreation(check *checksdb.Check, env *provider.TestEnvironment) { //nolint:funlen
+func testPodsRecreation(check *checksdb.Check, env *provider.TestEnvironment) { //nolint:funlen,gocyclo
 	var compliantObjects []*testhelper.ReportObject
 	var nonCompliantObjects []*testhelper.ReportObject
 
@@ -594,10 +600,15 @@ func testPodsRecreation(check *checksdb.Check, env *provider.TestEnvironment) {
 	// Before draining any node, wait until all podsets are ready. The timeout depends on the number of podsets to check.
 	// timeout = k-mins + (1min * (num-deployments + num-statefulsets))
 	allPodsetsReadyTimeout := timeoutPodSetReady + time.Minute*time.Duration(len(env.Deployments)+len(env.StatefulSets))
-	claimsLog, atLeastOnePodsetNotReady := podsets.WaitForAllPodSetsReady(env, allPodsetsReadyTimeout)
-	if atLeastOnePodsetNotReady {
+	claimsLog, notReadyDeployments, notReadyStatefulSets := podsets.WaitForAllPodSetsReady(env, allPodsetsReadyTimeout)
+	if len(notReadyDeployments) > 0 || len(notReadyStatefulSets) > 0 {
 		tnf.ClaimFilePrintf("%s", claimsLog.GetLogLines())
-		nonCompliantObjects = append(nonCompliantObjects, testhelper.NewPodReportObject("", "", "Some deployments or stateful sets are not in a good initial state. Cannot perform test.", false))
+		for _, dep := range notReadyDeployments {
+			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewDeploymentReportObject(dep.Namespace, dep.Name, "Deployment was not ready before draining any node.", false))
+		}
+		for _, sts := range notReadyStatefulSets {
+			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewStatefulSetReportObject(sts.Namespace, sts.Name, "Statefulset was not ready before draining any node.", false))
+		}
 		return
 	}
 
@@ -615,43 +626,64 @@ func testPodsRecreation(check *checksdb.Check, env *provider.TestEnvironment) {
 	}
 	if len(podsWithNodeAssignment) > 0 {
 		logrus.Errorf("Pod(s) have been found to contain a node assignment and cannot perform the pod-recreation test: %v", podsWithNodeAssignment)
-		nonCompliantObjects = append(nonCompliantObjects, testhelper.NewPodReportObject("", "", "Some pods have node assignments and cannot perform the pod-recreation test", false))
+		for _, pod := range podsWithNodeAssignment {
+			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewPodReportObject(pod.Namespace, pod.Name, "Pod has node assignment.", false))
+		}
+
+		return
 	}
 
-	for n := range podsets.GetAllNodesForAllPodSets(env.Pods) {
-		defer podrecreation.CordonCleanup(n) //nolint:gocritic // The defer in loop is intentional, calling the cleanup function once per node
-		err := podrecreation.CordonHelper(n, podrecreation.Cordon)
+	for nodeName := range podsets.GetAllNodesForAllPodSets(env.Pods) {
+		defer podrecreation.CordonCleanup(nodeName) //nolint:gocritic // The defer in loop is intentional, calling the cleanup function once per node
+		err := podrecreation.CordonHelper(nodeName, podrecreation.Cordon)
 		if err != nil {
-			logrus.Errorf("error cordoning the node: %s", n)
-			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewNodeReportObject(n, "Node cordoning failed", false))
+			logrus.Errorf("error cordoning the node: %s", nodeName)
+			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewNodeReportObject(nodeName, "Node cordoning failed", false))
 			return
 		}
-		tnf.Logf(logrus.InfoLevel, fmt.Sprintf("Draining and Cordoning node %s: ", n))
-		logrus.Debugf("node: %s cordoned", n)
-		count, err := podrecreation.CountPodsWithDelete(env.Pods, n, podrecreation.NoDelete)
+		tnf.Logf(logrus.InfoLevel, fmt.Sprintf("Draining and Cordoning node %s: ", nodeName))
+		logrus.Debugf("node: %s cordoned", nodeName)
+		count, err := podrecreation.CountPodsWithDelete(env.Pods, nodeName, podrecreation.NoDelete)
 		if err != nil {
-			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewNodeReportObject(n, "Getting pods list to drain failed", false))
+			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewNodeReportObject(nodeName, "Getting pods list to drain failed", false))
 			return
 		}
 		nodeTimeout := timeoutPodSetReady + timeoutPodRecreationPerPod*time.Duration(count)
-		logrus.Debugf("draining node: %s with timeout: %s", n, nodeTimeout)
-		_, err = podrecreation.CountPodsWithDelete(env.Pods, n, podrecreation.DeleteForeground)
+		logrus.Debugf("draining node: %s with timeout: %s", nodeName, nodeTimeout)
+		_, err = podrecreation.CountPodsWithDelete(env.Pods, nodeName, podrecreation.DeleteForeground)
 		if err != nil {
-			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewNodeReportObject(n, "Draining node failed", false))
+			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewNodeReportObject(nodeName, "Draining node failed", false))
 			return
 		}
-		claimsLog, podsNotReady := podsets.WaitForAllPodSetsReady(env, nodeTimeout)
-		if podsNotReady {
+		claimsLog, notReadyDeployments, notReadyStatefulSets := podsets.WaitForAllPodSetsReady(env, nodeTimeout)
+		if len(notReadyDeployments) > 0 || len(notReadyStatefulSets) > 0 {
 			tnf.ClaimFilePrintf("%s", claimsLog.GetLogLines())
-			nonCompliantObjects = append(nonCompliantObjects, testhelper.NewNodeReportObject(n, "Some pods are not ready after draining the node", false))
+			for _, dep := range notReadyDeployments {
+				nonCompliantObjects = append(nonCompliantObjects, testhelper.NewDeploymentReportObject(dep.Namespace, dep.Name, "Deployment not ready after draining node "+nodeName, false))
+			}
+			for _, sts := range notReadyStatefulSets {
+				nonCompliantObjects = append(nonCompliantObjects, testhelper.NewStatefulSetReportObject(sts.Namespace, sts.Name, "Statefulset not ready after draining node "+nodeName, false))
+			}
 			return
 		}
 
-		err = podrecreation.CordonHelper(n, podrecreation.Uncordon)
+		err = podrecreation.CordonHelper(nodeName, podrecreation.Uncordon)
 		if err != nil {
-			logrus.Fatalf("error uncordoning the node: %s", n)
+			logrus.Fatalf("error uncordoning the node: %s", nodeName)
+		}
+	}
+
+	// If everything went well for all nodes, the nonCompliantObjects should be empty. We need to
+	// manually add all the deps/sts into the compliant object lists so the check is not marked as skipped.
+	// ToDo: Improve this.
+	if len(nonCompliantObjects) == 0 {
+		for _, dep := range env.Deployments {
+			compliantObjects = append(compliantObjects, testhelper.NewDeploymentReportObject(dep.Namespace, dep.Name, "Deployment's pods successfully re-scheduled after node draining.", true))
+		}
+
+		for _, sts := range env.StatefulSets {
+			compliantObjects = append(compliantObjects, testhelper.NewStatefulSetReportObject(sts.Namespace, sts.Name, "Statefulset's pods successfully re-scheduled after node draining.", true))
+		}
 	}
 
@@ -773,6 +805,7 @@ func testStorageProvisioner(check *checksdb.Check, env *provider.TestEnvironment
 	var Pvc = env.PersistentVolumeClaims
 	snoSingleLocalStorageProvisionner := ""
 	for _, put := range env.Pods {
+		usesPvcAndStorageClass := false
 		for pvIndex := range put.Spec.Volumes {
 			// Skip any nil persistentClaims.
 			volume := put.Spec.Volumes[pvIndex]
@@ -785,6 +818,7 @@ func testStorageProvisioner(check *checksdb.Check, env *provider.TestEnvironment
 			if Pvc[i].Name == put.Spec.Volumes[pvIndex].PersistentVolumeClaim.ClaimName && Pvc[i].Namespace == put.Namespace {
 				for j := range StorageClasses {
 					if Pvc[i].Spec.StorageClassName != nil && StorageClasses[j].Name == *Pvc[i].Spec.StorageClassName {
+						usesPvcAndStorageClass = true
 						tnf.ClaimFilePrintf("%s pvc_name: %s, storageclass_name: %s, provisioner_name: %s", put.String(), put.Spec.Volumes[pvIndex].PersistentVolumeClaim.ClaimName,
 							StorageClasses[j].Name, StorageClasses[j].Provisioner)
@@ -831,6 +865,12 @@ func testStorageProvisioner(check *checksdb.Check, env *provider.TestEnvironment
 				}
 			}
 		}
+		// Save the pod as compliant in case it's not using any of the existing PVCs/StorageClasses of the cluster.
+		// Otherwise, in these cases the check would be marked as skipped.
+		// ToDo: improve this function.
+		if !usesPvcAndStorageClass {
+			compliantObjects = append(compliantObjects, testhelper.NewPodReportObject(put.Namespace, put.Name, "Pod not configured to use local storage.", true))
+		}
 		}
 	}
 	check.SetResult(compliantObjects, nonCompliantObjects)
diff --git a/generated_policy.json b/generated_policy.json
index edfc128e6..3c99927f5 100644
--- a/generated_policy.json
+++ b/generated_policy.json
@@ -1,11 +1,6 @@
 {
   "grades": {
     "requiredPassingTests": [
-      {
-        "id": "affiliated-certification-operator-is-certified",
-        "suite": "affiliated-certification",
-        "tags": "common"
-      },
       {
         "id": "access-control-bpf-capability-check",
         "suite": "access-control",
@@ -101,21 +96,11 @@
         "suite": "access-control",
         "tags": "telco"
       },
-      {
-        "id": "access-control-security-context",
-        "suite": "access-control",
-        "tags": "extended"
-      },
       {
         "id": "access-control-security-context-non-root-user-check",
         "suite": "access-control",
         "tags": "common"
       },
-      {
-        "id": "access-control-security-context-privilege-escalation",
-        "suite": "access-control",
-        "tags": "common"
-      },
       {
         "id": "access-control-service-type",
         "suite": "access-control",
@@ -137,8 +122,68 @@
         "tags": "telco"
       },
       {
-        "id": "access-control-sys-ptrace-capability",
-        "suite": "access-control",
+        "id": "affiliated-certification-operator-is-certified",
+        "suite": "affiliated-certification",
+        "tags": "common"
+      },
+      {
+        "id": "lifecycle-container-shutdown",
+        "suite": "lifecycle",
+        "tags": "telco"
+      },
+      {
+        "id": "lifecycle-container-startup",
+        "suite": "lifecycle",
+        "tags": "telco"
+      },
+      {
+        "id": "lifecycle-deployment-scaling",
+        "suite": "lifecycle",
+        "tags": "common"
+      },
+      {
+        "id": "lifecycle-image-pull-policy",
+        "suite": "lifecycle",
+        "tags": "telco"
+      },
+      {
+        "id": "lifecycle-liveness-probe",
+        "suite": "lifecycle",
+        "tags": "telco"
"telco" + }, + { + "id": "lifecycle-pod-high-availability", + "suite": "lifecycle", + "tags": "common" + }, + { + "id": "lifecycle-pod-owner-type", + "suite": "lifecycle", + "tags": "telco" + }, + { + "id": "lifecycle-pod-recreation", + "suite": "lifecycle", + "tags": "common" + }, + { + "id": "lifecycle-pod-scheduling", + "suite": "lifecycle", + "tags": "telco" + }, + { + "id": "lifecycle-pod-toleration-bypass", + "suite": "lifecycle", + "tags": "telco" + }, + { + "id": "lifecycle-readiness-probe", + "suite": "lifecycle", + "tags": "telco" + }, + { + "id": "lifecycle-startup-probe", + "suite": "lifecycle", "tags": "telco" }, { @@ -151,6 +196,51 @@ "suite": "manageability", "tags": "extended" }, + { + "id": "networking-dual-stack-service", + "suite": "networking", + "tags": "extended" + }, + { + "id": "networking-icmpv4-connectivity", + "suite": "networking", + "tags": "common" + }, + { + "id": "networking-icmpv4-connectivity-multus", + "suite": "networking", + "tags": "telco" + }, + { + "id": "networking-icmpv6-connectivity", + "suite": "networking", + "tags": "common" + }, + { + "id": "networking-icmpv6-connectivity-multus", + "suite": "networking", + "tags": "telco" + }, + { + "id": "networking-network-policy-deny-all", + "suite": "networking", + "tags": "common" + }, + { + "id": "networking-ocp-reserved-ports-usage", + "suite": "networking", + "tags": "common" + }, + { + "id": "networking-reserved-partner-ports", + "suite": "networking", + "tags": "extended" + }, + { + "id": "networking-undeclared-container-ports-usage", + "suite": "networking", + "tags": "extended" + }, { "id": "observability-container-logging", "suite": "observability", diff --git a/pkg/testhelper/testhelper.go b/pkg/testhelper/testhelper.go index fd3c0c224..f28dbc1cd 100644 --- a/pkg/testhelper/testhelper.go +++ b/pkg/testhelper/testhelper.go @@ -511,6 +511,24 @@ func GetNoAffinityRequiredPodsSkipFn(env *provider.TestEnvironment) func() (bool } } +func GetNoStorageClassesSkipFn(env *provider.TestEnvironment) func() (bool, string) { + return func() (bool, string) { + if len(env.StorageClassList) == 0 { + return true, "no storage classes found" + } + return false, "" + } +} + +func GetNoPersistentVolumeClaimsSkipFn(env *provider.TestEnvironment) func() (bool, string) { + return func() (bool, string) { + if len(env.PersistentVolumeClaims) == 0 { + return true, "no persistent volume claims found" + } + return false, "" + } +} + func SkipIfEmptyAny(skip func(string, ...int), object ...[2]interface{}) { for _, o := range object { s := reflect.ValueOf(o[0])