From 748fa028fbaed2b81d774b3551ee7f2781bb83a2 Mon Sep 17 00:00:00 2001
From: Shereen Haj
Date: Mon, 23 Sep 2024 01:54:15 +0300
Subject: [PATCH] api: introduce NodeGroupStatus

So far, tracking the node groups' statuses has been done via the
collective operator status, which holds the list of all affected MCPs
and the list of their matching RTE daemonsets. We keep populating these
fields for API backward compatibility, and in addition we start
reflecting the status per node group.

The relation between the current per-node-group MCPs and daemonsets and
the new representation is 1:1, and there is no change in functionality;
we merely provide a new way to gather node group updates under a single
entity.

This is, however, a required preamble for supporting NodeSelector under
NodeGroup, and NodeGroupStatus will be the only place to record and
track node group state in HCP (HyperShift).

Signed-off-by: Shereen Haj
---
 .../numaresourcesoperator_normalize_test.go   |   1 -
 .../v1/numaresourcesoperator_types.go         |  27 ++++
 .../v1/zz_generated.deepcopy.go               |  42 +++++++
 ...y.openshift.io_numaresourcesoperators.yaml | 109 ++++++++++++++++
 ...ources-operator.clusterserviceversion.yaml |  13 +-
 ...y.openshift.io_numaresourcesoperators.yaml | 109 ++++++++++++++++
 ...ources-operator.clusterserviceversion.yaml |  11 ++
 .../numaresourcesoperator_controller.go       |  42 +++++++
 .../numaresourcesoperator_controller_test.go  |   6 +-
 internal/objects/objects.go                   |  11 ++
 internal/objects/objects_test.go              | 118 ++++++++++++++++++
 internal/wait/numaresources.go                |   9 ++
 pkg/objectnames/objectnames.go                |   9 +-
 pkg/objectnames/objectnames_test.go           |  12 ++
 pkg/validation/validation.go                  |  15 +++
 pkg/validation/validation_test.go             | 104 +++++++++++++++
 test/e2e/rte/rte_test.go                      |   6 +
 17 files changed, 638 insertions(+), 6 deletions(-)

diff --git a/api/numaresourcesoperator/v1/numaresourcesoperator_normalize_test.go b/api/numaresourcesoperator/v1/numaresourcesoperator_normalize_test.go
index dc2f0212e..446e67e9c 100644
--- a/api/numaresourcesoperator/v1/numaresourcesoperator_normalize_test.go
+++ b/api/numaresourcesoperator/v1/numaresourcesoperator_normalize_test.go
@@ -310,5 +310,4 @@ func TestNodeGroupNormalizeName(t *testing.T) {
 		if got != expected {
 			t.Errorf("unexpected normalized name:\ngot=%+v\nexpected=%+v", got, expected)
 		}
-
 }
diff --git a/api/numaresourcesoperator/v1/numaresourcesoperator_types.go b/api/numaresourcesoperator/v1/numaresourcesoperator_types.go
index 2838201eb..4ae5d9862 100644
--- a/api/numaresourcesoperator/v1/numaresourcesoperator_types.go
+++ b/api/numaresourcesoperator/v1/numaresourcesoperator_types.go
@@ -126,6 +126,29 @@ type NodeGroup struct {
 	Config *NodeGroupConfig `json:"config,omitempty"`
 }
 
+// NodeGroupStatus reports the status of a NodeGroup once it matches an actual set of nodes and is correctly processed
+// by the system. In other words, it is not possible to have a NodeGroupStatus which does not represent a valid NodeGroup,
+// which in turn unambiguously references a set of nodes in the cluster.
+// Hence, if a NodeGroupStatus is published, its `Name` must be present, because it refers back to a NodeGroup whose
+// config was correctly processed in the Spec, and its DaemonSets will be nonempty, because the group correctly matches
+// a set of nodes in the cluster. The Config is always reported on a best-effort basis, possibly reflecting the system defaults.
+// If the system cannot process a NodeGroup correctly from the Spec, it will report Degraded state in the top-level +// condition, and will provide details using the aforementioned conditions. +type NodeGroupStatus struct { + // Name matches the name of a configured NodeGroup + Name string `json:"name"` + // DaemonSet of the configured RTEs, for this node group + //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="RTE DaemonSets" + DaemonSets []NamespacedName `json:"daemonsets,omitempty"` + // NodeGroupConfig represents the latest available configuration applied to this NodeGroup + // +optional + //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="Optional configuration enforced on this NodeGroup" + Config *NodeGroupConfig `json:"config,omitempty"` + // Selector represents label selector for this node group that was set by either MachineConfigPoolSelector or NodeSelector + //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="Label selector of node group status" + Selector *metav1.LabelSelector `json:"selector,omitempty"` +} + // NUMAResourcesOperatorStatus defines the observed state of NUMAResourcesOperator type NUMAResourcesOperatorStatus struct { // DaemonSets of the configured RTEs, one per node group @@ -134,6 +157,10 @@ type NUMAResourcesOperatorStatus struct { // MachineConfigPools resolved from configured node groups //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="RTE MCPs from node groups" MachineConfigPools []MachineConfigPool `json:"machineconfigpools,omitempty"` + // NodeGroups report the observed status of the configured NodeGroups, matching by their name + // +optional + //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="Node groups observed status" + NodeGroups []NodeGroupStatus `json:"nodeGroups,omitempty"` // Conditions show the current state of the NUMAResourcesOperator Operator //+operator-sdk:csv:customresourcedefinitions:type=status,displayName="Condition reported" Conditions []metav1.Condition `json:"conditions,omitempty"` diff --git a/api/numaresourcesoperator/v1/zz_generated.deepcopy.go b/api/numaresourcesoperator/v1/zz_generated.deepcopy.go index 0e404ffc7..e0b7525fd 100644 --- a/api/numaresourcesoperator/v1/zz_generated.deepcopy.go +++ b/api/numaresourcesoperator/v1/zz_generated.deepcopy.go @@ -156,6 +156,13 @@ func (in *NUMAResourcesOperatorStatus) DeepCopyInto(out *NUMAResourcesOperatorSt (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.NodeGroups != nil { + in, out := &in.NodeGroups, &out.NodeGroups + *out = make([]NodeGroupStatus, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) @@ -335,6 +342,11 @@ func (in *NamespacedName) DeepCopy() *NamespacedName { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NodeGroup) DeepCopyInto(out *NodeGroup) { *out = *in + if in.Name != nil { + in, out := &in.Name, &out.Name + *out = new(string) + **out = **in + } if in.MachineConfigPoolSelector != nil { in, out := &in.MachineConfigPoolSelector, &out.MachineConfigPoolSelector *out = new(metav1.LabelSelector) @@ -399,6 +411,36 @@ func (in *NodeGroupConfig) DeepCopy() *NodeGroupConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *NodeGroupStatus) DeepCopyInto(out *NodeGroupStatus) { + *out = *in + if in.DaemonSets != nil { + in, out := &in.DaemonSets, &out.DaemonSets + *out = make([]NamespacedName, len(*in)) + copy(*out, *in) + } + if in.Config != nil { + in, out := &in.Config, &out.Config + *out = new(NodeGroupConfig) + (*in).DeepCopyInto(*out) + } + if in.Selector != nil { + in, out := &in.Selector, &out.Selector + *out = new(metav1.LabelSelector) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeGroupStatus. +func (in *NodeGroupStatus) DeepCopy() *NodeGroupStatus { + if in == nil { + return nil + } + out := new(NodeGroupStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ResourceSpecParams) DeepCopyInto(out *ResourceSpecParams) { *out = *in diff --git a/bundle/manifests/nodetopology.openshift.io_numaresourcesoperators.yaml b/bundle/manifests/nodetopology.openshift.io_numaresourcesoperators.yaml index e8ce3657e..cd0888c29 100644 --- a/bundle/manifests/nodetopology.openshift.io_numaresourcesoperators.yaml +++ b/bundle/manifests/nodetopology.openshift.io_numaresourcesoperators.yaml @@ -410,6 +410,115 @@ spec: - name type: object type: array + nodeGroups: + description: NodeGroups report the observed status of the configured + NodeGroups, matching by their name + items: + description: |- + NodeGroupStatus reports the status of a NodeGroup once matches an actual set of nodes and it is correctly processed + by the system. In other words, is not possible to have a NodeGroupStatus which does not represent a valid NodeGroup + which in turn correctly references unambiguously a set of nodes in the cluster. + Hence, if a NodeGroupStatus is published, its `Name` must be present, because it refers back to a NodeGroup whose + config was correctly processed in the Spec. And its DaemonSet will be nonempty, because matches correctly a set + of nodes in the cluster. The Config is best-effort always represented, possibly reflecting the system defaults. + If the system cannot process a NodeGroup correctly from the Spec, it will report Degraded state in the top-level + condition, and will provide details using the aforementioned conditions. + properties: + config: + description: NodeGroupConfig represents the latest available + configuration applied to this NodeGroup + properties: + infoRefreshMode: + description: InfoRefreshMode sets the mechanism which will + be used to refresh the topology info. + enum: + - Periodic + - Events + - PeriodicAndEvents + type: string + infoRefreshPause: + description: InfoRefreshPause defines if updates to NRTs + are paused for the machines belonging to this group + enum: + - Disabled + - Enabled + type: string + infoRefreshPeriod: + description: InfoRefreshPeriod sets the topology info refresh + period. Use explicit 0 to disable. + type: string + podsFingerprinting: + description: PodsFingerprinting defines if pod fingerprint + should be reported for the machines belonging to this + group + enum: + - Disabled + - Enabled + - EnabledExclusiveResources + type: string + tolerations: + description: |- + Tolerations overrides tolerations to be set into RTE daemonsets for this NodeGroup. If not empty, the tolerations will be the one set here. + Leave empty to make the system use the default tolerations. 
+ items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + daemonsets: + description: DaemonSet of the configured RTEs, for this node + group + items: + description: |- + NamespacedName comprises a resource name, with a mandatory namespace, + rendered as "/". + properties: + name: + type: string + namespace: + type: string + type: object + type: array + name: + description: Name matches the name of a configured NodeGroup + type: string + required: + - name + type: object + type: array relatedObjects: description: RelatedObjects list of objects of interest for this operator items: diff --git a/bundle/manifests/numaresources-operator.clusterserviceversion.yaml b/bundle/manifests/numaresources-operator.clusterserviceversion.yaml index fd9a065a9..1b9759109 100644 --- a/bundle/manifests/numaresources-operator.clusterserviceversion.yaml +++ b/bundle/manifests/numaresources-operator.clusterserviceversion.yaml @@ -62,7 +62,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2024-09-24T07:18:49Z" + createdAt: "2024-09-24T09:03:48Z" olm.skipRange: '>=4.17.0 <4.18.0' operators.operatorframework.io/builder: operator-sdk-v1.36.1 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -149,6 +149,17 @@ spec: applied to this MachineConfigPool displayName: Optional configuration enforced on this NodeGroup path: machineconfigpools[0].config + - description: NodeGroups report the observed status of the configured NodeGroups, + matching by their name + displayName: Node groups observed status + path: nodeGroups + - description: NodeGroupConfig represents the latest available configuration + applied to this NodeGroup + displayName: Optional configuration enforced on this NodeGroup + path: nodeGroups[0].config + - description: DaemonSet of the configured RTEs, for this node group + displayName: RTE DaemonSets + path: nodeGroups[0].daemonsets - description: RelatedObjects list of objects of interest for this operator displayName: Related Objects path: relatedObjects diff --git a/config/crd/bases/nodetopology.openshift.io_numaresourcesoperators.yaml 
b/config/crd/bases/nodetopology.openshift.io_numaresourcesoperators.yaml index c2a9a1104..3da82384a 100644 --- a/config/crd/bases/nodetopology.openshift.io_numaresourcesoperators.yaml +++ b/config/crd/bases/nodetopology.openshift.io_numaresourcesoperators.yaml @@ -410,6 +410,115 @@ spec: - name type: object type: array + nodeGroups: + description: NodeGroups report the observed status of the configured + NodeGroups, matching by their name + items: + description: |- + NodeGroupStatus reports the status of a NodeGroup once matches an actual set of nodes and it is correctly processed + by the system. In other words, is not possible to have a NodeGroupStatus which does not represent a valid NodeGroup + which in turn correctly references unambiguously a set of nodes in the cluster. + Hence, if a NodeGroupStatus is published, its `Name` must be present, because it refers back to a NodeGroup whose + config was correctly processed in the Spec. And its DaemonSet will be nonempty, because matches correctly a set + of nodes in the cluster. The Config is best-effort always represented, possibly reflecting the system defaults. + If the system cannot process a NodeGroup correctly from the Spec, it will report Degraded state in the top-level + condition, and will provide details using the aforementioned conditions. + properties: + config: + description: NodeGroupConfig represents the latest available + configuration applied to this NodeGroup + properties: + infoRefreshMode: + description: InfoRefreshMode sets the mechanism which will + be used to refresh the topology info. + enum: + - Periodic + - Events + - PeriodicAndEvents + type: string + infoRefreshPause: + description: InfoRefreshPause defines if updates to NRTs + are paused for the machines belonging to this group + enum: + - Disabled + - Enabled + type: string + infoRefreshPeriod: + description: InfoRefreshPeriod sets the topology info refresh + period. Use explicit 0 to disable. + type: string + podsFingerprinting: + description: PodsFingerprinting defines if pod fingerprint + should be reported for the machines belonging to this + group + enum: + - Disabled + - Enabled + - EnabledExclusiveResources + type: string + tolerations: + description: |- + Tolerations overrides tolerations to be set into RTE daemonsets for this NodeGroup. If not empty, the tolerations will be the one set here. + Leave empty to make the system use the default tolerations. + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. 
By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + daemonsets: + description: DaemonSet of the configured RTEs, for this node + group + items: + description: |- + NamespacedName comprises a resource name, with a mandatory namespace, + rendered as "/". + properties: + name: + type: string + namespace: + type: string + type: object + type: array + name: + description: Name matches the name of a configured NodeGroup + type: string + required: + - name + type: object + type: array relatedObjects: description: RelatedObjects list of objects of interest for this operator items: diff --git a/config/manifests/bases/numaresources-operator.clusterserviceversion.yaml b/config/manifests/bases/numaresources-operator.clusterserviceversion.yaml index 409120760..576890e11 100644 --- a/config/manifests/bases/numaresources-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/numaresources-operator.clusterserviceversion.yaml @@ -88,6 +88,17 @@ spec: applied to this MachineConfigPool displayName: Optional configuration enforced on this NodeGroup path: machineconfigpools[0].config + - description: NodeGroups report the observed status of the configured NodeGroups, + matching by their name + displayName: Node groups observed status + path: nodeGroups + - description: NodeGroupConfig represents the latest available configuration + applied to this NodeGroup + displayName: Optional configuration enforced on this NodeGroup + path: nodeGroups[0].config + - description: DaemonSet of the configured RTEs, for this node group + displayName: RTE DaemonSets + path: nodeGroups[0].daemonsets - description: RelatedObjects list of objects of interest for this operator displayName: Related Objects path: relatedObjects diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index 24c5c07b9..7a4d6bb9d 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -268,13 +268,55 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, } } + instance.Status.NodeGroups = syncNodeGroupsStatusesNamesAndConfig(instance) + if done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees); done { return res, cond, err } + instance.Status.NodeGroups = syncNodeGroupsStatusesDaemonSets(instance) + return ctrl.Result{}, status.ConditionAvailable, nil } +func syncNodeGroupsStatusesNamesAndConfig(instance *nropv1.NUMAResourcesOperator) []nropv1.NodeGroupStatus { + ngStatuses := []nropv1.NodeGroupStatus{} + ngNameToConfigMap := map[string]nropv1.NodeGroupConfig{} + for _, mcp := range instance.Status.MachineConfigPools { + ngNameToConfigMap[mcp.NodeGroupName] = *mcp.Config //configs for mcps of the same node group ARE similar + } + for name, config := range ngNameToConfigMap { + cfg := config.DeepCopy() + ngStatuses = append(ngStatuses, nropv1.NodeGroupStatus{Name: name, Config: cfg}) + } + return ngStatuses +} + +func syncNodeGroupsStatusesDaemonSets(instance *nropv1.NUMAResourcesOperator) []nropv1.NodeGroupStatus { + ngNameToDaemonSetsMap := map[string][]nropv1.NamespacedName{} + for _, ds := range 
instance.Status.DaemonSets {
+		// extract the MCP name associated with this RTE daemonset
+		name := objectnames.ExtractAssociatedNameFromRTEDaemonset(ds.Name, instance.Name)
+		for _, mcp := range instance.Status.MachineConfigPools {
+			if mcp.Name != name {
+				continue
+			}
+			ngName := mcp.NodeGroupName
+			if ngNameToDaemonSetsMap[ngName] == nil {
+				ngNameToDaemonSetsMap[ngName] = []nropv1.NamespacedName{ds}
+				continue
+			}
+			ngNameToDaemonSetsMap[ngName] = append(ngNameToDaemonSetsMap[ngName], ds)
+		}
+	}
+
+	for i, ng := range instance.Status.NodeGroups {
+		instance.Status.NodeGroups[i].DaemonSets = ngNameToDaemonSetsMap[ng.Name]
+	}
+
+	return instance.Status.NodeGroups
+}
+
 func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []nropv1.NamespacedName) ([]nropv1.NamespacedName, bool, error) {
 	dsStatuses := []nropv1.NamespacedName{}
 	for _, nname := range daemonSetsInfo {
diff --git a/controllers/numaresourcesoperator_controller_test.go b/controllers/numaresourcesoperator_controller_test.go
index 7ae691a6e..c1e98f161 100644
--- a/controllers/numaresourcesoperator_controller_test.go
+++ b/controllers/numaresourcesoperator_controller_test.go
@@ -46,13 +46,12 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
 	nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1"
+	testobjs "github.com/openshift-kni/numaresources-operator/internal/objects"
 	"github.com/openshift-kni/numaresources-operator/pkg/images"
 	"github.com/openshift-kni/numaresources-operator/pkg/objectnames"
 	"github.com/openshift-kni/numaresources-operator/pkg/objectstate/rte"
 	"github.com/openshift-kni/numaresources-operator/pkg/status"
 	"github.com/openshift-kni/numaresources-operator/pkg/validation"
-
-	testobjs "github.com/openshift-kni/numaresources-operator/internal/objects"
 )
 
 const (
@@ -536,7 +535,6 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
 			Expect(err).ToNot(HaveOccurred())
 		})
 		It(" operator status should report RelatedObjects as expected", func() {
-
 			By("Getting updated NROP Status")
 			key := client.ObjectKeyFromObject(nro)
 			nroUpdated := &nropv1.NUMAResourcesOperator{}
@@ -573,6 +571,8 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
 
 			Expect(len(nroUpdated.Status.RelatedObjects)).To(Equal(len(expected)))
 			Expect(nroUpdated.Status.RelatedObjects).To(ContainElements(expected))
+
+			Expect(validation.EqualNamespacedDSSlicesByName(nroUpdated.Status.DaemonSets, testobjs.GetDaemonSetListFromNodeGroupStatuses(nroUpdated.Status.NodeGroups))).To(Succeed())
 		})
 	})
 })
diff --git a/internal/objects/objects.go b/internal/objects/objects.go
index 55e61ff50..88d662fd1 100644
--- a/internal/objects/objects.go
+++ b/internal/objects/objects.go
@@ -170,3 +170,14 @@ func NamespaceLabels() map[string]string {
 		"security.openshift.io/scc.podSecurityLabelSync": "false",
 	}
 }
+
+func GetDaemonSetListFromNodeGroupStatuses(groups []nropv1.NodeGroupStatus) []nropv1.NamespacedName {
+	dss := []nropv1.NamespacedName{}
+	for _, group := range groups {
+		if len(group.DaemonSets) == 0 {
+			continue
+		}
+		dss = append(dss, group.DaemonSets...)
+ } + return dss +} diff --git a/internal/objects/objects_test.go b/internal/objects/objects_test.go index 7f19adb79..b1045b366 100644 --- a/internal/objects/objects_test.go +++ b/internal/objects/objects_test.go @@ -17,10 +17,13 @@ package objects import ( + "reflect" "testing" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1" ) func TestNewNUMAResourcesOperator(t *testing.T) { @@ -93,3 +96,118 @@ func TestNewNamespace(t *testing.T) { } } } + +func TestGetDaemonSetListFromNodeGroupStatuses(t *testing.T) { + testcases := []struct { + name string + input []nropv1.NodeGroupStatus + output []nropv1.NamespacedName + }{ + { + name: "single nodegroup", + input: []nropv1.NodeGroupStatus{ + { + Name: "nodegroup-1", + DaemonSets: []nropv1.NamespacedName{ + { + Name: "daemonset-1", + }, + }, + }, + }, + output: []nropv1.NamespacedName{ + { + Name: "daemonset-1", + }, + }, + }, + { + name: "multiple nodegroups - each with non empty ds", + input: []nropv1.NodeGroupStatus{ + { + Name: "nodegroup-1", + DaemonSets: []nropv1.NamespacedName{ + { + Name: "daemonset-1", + }, + { + Name: "daemonset-2", + }, + }, + }, + { + Name: "nodegroup-2", + DaemonSets: []nropv1.NamespacedName{ + { + Name: "daemonset-3", + }, + }, + }, + { + Name: "nodegroup-3", + DaemonSets: []nropv1.NamespacedName{ + { + Name: "daemonset-1", //duplicates should not exist, if they do it's a bug and we don't want to ignore it + }, + }, + }, + }, + output: []nropv1.NamespacedName{ + { + Name: "daemonset-1", + }, + { + Name: "daemonset-2", + }, + { + Name: "daemonset-3", + }, + { + Name: "daemonset-1", + }, + }, + }, + { + name: "multiple nodegroups - some with empty ds", + input: []nropv1.NodeGroupStatus{ + { + Name: "nodegroup-1", + DaemonSets: []nropv1.NamespacedName{}, + }, + { + Name: "nodegroup-2", + DaemonSets: []nropv1.NamespacedName{ + { + Name: "daemonset-3", + }, + }, + }, + { + Name: "nodegroup-3", + DaemonSets: []nropv1.NamespacedName{ + { + Name: "daemonset-1", + }, + }, + }, + }, + output: []nropv1.NamespacedName{ + { + Name: "daemonset-3", + }, + { + Name: "daemonset-1", + }, + }, + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + got := GetDaemonSetListFromNodeGroupStatuses(tc.input) + if !reflect.DeepEqual(got, tc.output) { + t.Errorf("unexpected daemonsets list:\n\t%v\n\tgot:\n\t%v", tc.output, got) + } + }) + } +} diff --git a/internal/wait/numaresources.go b/internal/wait/numaresources.go index 06c0bf927..f0f6b4a11 100644 --- a/internal/wait/numaresources.go +++ b/internal/wait/numaresources.go @@ -23,6 +23,8 @@ import ( "k8s.io/klog/v2" nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1" + testobjs "github.com/openshift-kni/numaresources-operator/internal/objects" + "github.com/openshift-kni/numaresources-operator/pkg/validation" ) func (wt Waiter) ForNUMAResourcesOperatorDeleted(ctx context.Context, nrop *nropv1.NUMAResourcesOperator) error { @@ -62,6 +64,13 @@ func (wt Waiter) ForDaemonsetInNUMAResourcesOperatorStatus(ctx context.Context, klog.Warningf("failed to get the DaemonSet from NUMAResourcesOperator %s", key.String()) return false, nil } + + dssFromNodeGroupStatus := testobjs.GetDaemonSetListFromNodeGroupStatuses(updatedNRO.Status.NodeGroups) + if validation.EqualNamespacedDSSlicesByName(updatedNRO.Status.DaemonSets, dssFromNodeGroupStatus) != nil { + klog.Warningf("daemonset list mismatch: from NodeGroupStatus:\n%+v\n from operator status:\n%+v\n", 
dssFromNodeGroupStatus, updatedNRO.Status.NodeGroups) + return false, nil + } + klog.Infof("Daemonset info %s/%s ready in NUMAResourcesOperator %s", updatedNRO.Status.DaemonSets[0].Namespace, updatedNRO.Status.DaemonSets[0].Name, key.String()) return true, nil }) diff --git a/pkg/objectnames/objectnames.go b/pkg/objectnames/objectnames.go index 1f697f8d8..a8e01fbe2 100644 --- a/pkg/objectnames/objectnames.go +++ b/pkg/objectnames/objectnames.go @@ -16,7 +16,10 @@ package objectnames -import "fmt" +import ( + "fmt" + "strings" +) const ( DefaultNUMAResourcesOperatorCrName = "numaresourcesoperator" @@ -30,3 +33,7 @@ func GetMachineConfigName(instanceName, mcpName string) string { func GetComponentName(instanceName, mcpName string) string { return fmt.Sprintf("%s-%s", instanceName, mcpName) } + +func ExtractAssociatedNameFromRTEDaemonset(dsName, instanceName string) string { + return strings.Replace(dsName, fmt.Sprintf("%s-", instanceName), "", 1) +} diff --git a/pkg/objectnames/objectnames_test.go b/pkg/objectnames/objectnames_test.go index 1b3da13e8..204c6871e 100644 --- a/pkg/objectnames/objectnames_test.go +++ b/pkg/objectnames/objectnames_test.go @@ -37,3 +37,15 @@ func TestGetComponentName(t *testing.T) { t.Errorf("generated empty ComponentName") } } + +func TestExtractAssociatedNameFromRTEDaemonset(t *testing.T) { + got := ExtractAssociatedNameFromRTEDaemonset("bar-foo", "bar") + if got != "foo" { + t.Errorf("expected \"foo\" got %s", got) + } + + got = ExtractAssociatedNameFromRTEDaemonset("bar-foo-foobar", "bar") + if got != "foo-foobar" { + t.Errorf("expected \"foo-foobar\" got %s", got) + } +} diff --git a/pkg/validation/validation.go b/pkg/validation/validation.go index ad6fe3d82..af28b0983 100644 --- a/pkg/validation/validation.go +++ b/pkg/validation/validation.go @@ -18,6 +18,8 @@ package validation import ( "fmt" + "slices" + "sort" "strings" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -186,3 +188,16 @@ func nodeGroupMachineConfigPoolSelector(nodeGroups []nropv1.NodeGroup) error { return nil } + +// EqualNamespacedDSSlicesByName validates two slices of type NamespacedName are equal in Names +func EqualNamespacedDSSlicesByName(s1, s2 []nropv1.NamespacedName) error { + sort.SliceStable(s1, func(i, j int) bool { return s1[i].Name > s1[j].Name }) + sort.SliceStable(s2, func(i, j int) bool { return s2[i].Name > s2[j].Name }) + equal := slices.EqualFunc(s1, s2, func(a nropv1.NamespacedName, b nropv1.NamespacedName) bool { + return a.Name == b.Name + }) + if !equal { + return fmt.Errorf("expected RTE daemonsets are different from actual daemonsets") + } + return nil +} diff --git a/pkg/validation/validation_test.go b/pkg/validation/validation_test.go index 0a0126b88..71eccd7a1 100644 --- a/pkg/validation/validation_test.go +++ b/pkg/validation/validation_test.go @@ -251,3 +251,107 @@ func TestNodeGroupsSanity(t *testing.T) { }) } } + +func TestEqualNamespacedDSSlicesByName(t *testing.T) { + type testCase struct { + name string + s1 []nropv1.NamespacedName + s2 []nropv1.NamespacedName + expectedError bool + } + + testCases := []testCase{ + { + name: "nil slices", + s1: []nropv1.NamespacedName{}, + s2: []nropv1.NamespacedName{}, + expectedError: false, + }, + { + name: "equal slices by name", + s1: []nropv1.NamespacedName{ + { + Name: "foo", + }, + { + Namespace: "ns1", + Name: "bar", + }, + }, + s2: []nropv1.NamespacedName{ + { + Name: "bar", + }, + { + Namespace: "ns2", + Name: "foo", + }, + }, + expectedError: false, + }, + { + name: "different slices by length", + s1: 
[]nropv1.NamespacedName{ + { + Namespace: "ns1", + Name: "foo", + }, + { + Namespace: "ns1", + Name: "bar", + }, + }, + s2: []nropv1.NamespacedName{ + { + Namespace: "ns2", + Name: "bar", + }, + { + Namespace: "ns2", + Name: "foo", + }, + { + Namespace: "ns2", + Name: "foo", + }, + }, + expectedError: true, + }, + { + name: "different slices by name", + s1: []nropv1.NamespacedName{ + { + Namespace: "ns1", + Name: "foo", + }, + { + Namespace: "ns1", + Name: "bar", + }, + }, + s2: []nropv1.NamespacedName{ + { + Namespace: "ns1", + Name: "foo", + }, + { + Namespace: "ns1", + Name: "foo", + }, + }, + expectedError: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + err := EqualNamespacedDSSlicesByName(tc.s1, tc.s2) + if err == nil && tc.expectedError { + t.Errorf("expected error, succeeded") + } + if err != nil && !tc.expectedError { + t.Errorf("expected success, failed") + } + }) + } +} diff --git a/test/e2e/rte/rte_test.go b/test/e2e/rte/rte_test.go index 8b614de58..2723b024e 100644 --- a/test/e2e/rte/rte_test.go +++ b/test/e2e/rte/rte_test.go @@ -46,11 +46,13 @@ import ( nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1" nodegroupv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1/helper/nodegroup" "github.com/openshift-kni/numaresources-operator/internal/machineconfigpools" + testobjs "github.com/openshift-kni/numaresources-operator/internal/objects" "github.com/openshift-kni/numaresources-operator/internal/podlist" "github.com/openshift-kni/numaresources-operator/internal/remoteexec" "github.com/openshift-kni/numaresources-operator/pkg/loglevel" "github.com/openshift-kni/numaresources-operator/pkg/objectnames" rteupdate "github.com/openshift-kni/numaresources-operator/pkg/objectupdate/rte" + "github.com/openshift-kni/numaresources-operator/pkg/validation" rteconfig "github.com/openshift-kni/numaresources-operator/rte/pkg/config" "github.com/openshift-kni/numaresources-operator/test/utils/clients" "github.com/openshift-kni/numaresources-operator/test/utils/objects" @@ -144,6 +146,8 @@ var _ = ginkgo.Describe("with a running cluster with all the components", func() err := clients.Client.Get(context.TODO(), client.ObjectKey{Name: objectnames.DefaultNUMAResourcesOperatorCrName}, nropObj) gomega.Expect(err).ToNot(gomega.HaveOccurred()) gomega.Expect(nropObj.Status.DaemonSets).ToNot(gomega.BeEmpty()) + dssFromNodeGroupStatus := testobjs.GetDaemonSetListFromNodeGroupStatuses(nropObj.Status.NodeGroups) + gomega.Expect(validation.EqualNamespacedDSSlicesByName(nropObj.Status.DaemonSets, dssFromNodeGroupStatus)) klog.Infof("NRO %q", nropObj.Name) // NROP guarantees all the daemonsets are in the same namespace, @@ -191,6 +195,8 @@ var _ = ginkgo.Describe("with a running cluster with all the components", func() err := clients.Client.Get(context.TODO(), client.ObjectKey{Name: objectnames.DefaultNUMAResourcesOperatorCrName}, nropObj) gomega.Expect(err).ToNot(gomega.HaveOccurred()) gomega.Expect(nropObj.Status.DaemonSets).ToNot(gomega.BeEmpty()) + dssFromNodeGroupStatus := testobjs.GetDaemonSetListFromNodeGroupStatuses(nropObj.Status.NodeGroups) + gomega.Expect(validation.EqualNamespacedDSSlicesByName(nropObj.Status.DaemonSets, dssFromNodeGroupStatus)) klog.Infof("NRO %q", nropObj.Name) // NROP guarantees all the daemonsets are in the same namespace,
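
For illustration, below is a minimal, self-contained sketch of how the per-node-group status introduced by this patch can be assembled from the existing flat status fields. The helper name and the two input maps are hypothetical (the real reconciler derives them from Status.MachineConfigPools and Status.DaemonSets as shown in the controller hunks above); the point is the shape of the result: one NodeGroupStatus per node group name carrying the best-effort config, with the RTE daemonset references attached afterwards, written back through the slice index so the update lands in the returned slice rather than in a loop-local copy.

package example

import (
	nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1"
)

// assembleNodeGroupStatuses is a hypothetical helper mirroring the two sync steps
// performed by the controller in this patch: first one NodeGroupStatus per node
// group name with its best-effort config, then the RTE daemonset references
// grouped under the owning node group.
func assembleNodeGroupStatuses(configByGroup map[string]*nropv1.NodeGroupConfig, dsByGroup map[string][]nropv1.NamespacedName) []nropv1.NodeGroupStatus {
	statuses := make([]nropv1.NodeGroupStatus, 0, len(configByGroup))
	for name, conf := range configByGroup {
		statuses = append(statuses, nropv1.NodeGroupStatus{
			Name:   name,
			Config: conf.DeepCopy(), // nil-safe; may reflect system defaults
		})
	}
	// Attach the daemonsets through the slice index so the assignment is stored
	// in the slice element itself and not in a per-iteration copy.
	for i := range statuses {
		statuses[i].DaemonSets = dsByGroup[statuses[i].Name]
	}
	return statuses
}

The e2e and controller tests above compare exactly this flattened view (GetDaemonSetListFromNodeGroupStatuses) against Status.DaemonSets via EqualNamespacedDSSlicesByName, which is why the new per-node-group representation must stay 1:1 with the existing MCP and daemonset lists.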