From 49cf96950a66a42e129e7be6621edd6486355a1b Mon Sep 17 00:00:00 2001 From: Tim Serong Date: Thu, 15 Aug 2024 20:09:15 +1000 Subject: [PATCH] feat: Add ability to enable Longhorn V2 Data Engine Related issue: https://github.com/harvester/harvester/issues/5274 Signed-off-by: Tim Serong --- .../node.harvesterhci.io_nodeconfigs.yaml | 5 + manifests/daemonset.yaml | 16 +- package/Dockerfile | 2 + .../v1beta1/nodeconfig.go | 8 +- .../v1beta1/zz_generated_deepcopy.go | 21 +++ pkg/controller/nodeconfig/config/longhorn.go | 163 ++++++++++++++++++ pkg/controller/nodeconfig/controller.go | 32 ++++ 7 files changed, 241 insertions(+), 6 deletions(-) create mode 100644 pkg/controller/nodeconfig/config/longhorn.go diff --git a/manifests/crds/node.harvesterhci.io_nodeconfigs.yaml b/manifests/crds/node.harvesterhci.io_nodeconfigs.yaml index ec421130..e0777fd0 100644 --- a/manifests/crds/node.harvesterhci.io_nodeconfigs.yaml +++ b/manifests/crds/node.harvesterhci.io_nodeconfigs.yaml @@ -39,6 +39,11 @@ spec: type: object spec: properties: + longhornConfig: + properties: + enableV2DataEngine: + type: boolean + type: object ntpConfigs: properties: ntpServers: diff --git a/manifests/daemonset.yaml b/manifests/daemonset.yaml index 1238468e..ee8d5489 100644 --- a/manifests/daemonset.yaml +++ b/manifests/daemonset.yaml @@ -50,9 +50,12 @@ spec: cpu: 10m memory: 64Mi volumeMounts: - - mountPath: /sys/kernel/mm/ksm - name: ksm + - mountPath: /sys/kernel/mm + name: mm readOnly: false + - mountPath: /lib/modules + name: modules + readOnly: true - mountPath: /host/proc name: proc readOnly: true @@ -64,9 +67,12 @@ spec: - mountPath: /host/oem name: host-oem volumes: - - name: ksm + - name: mm + hostPath: + path: /sys/kernel/mm + - name: modules hostPath: - path: /sys/kernel/mm/ksm + path: /lib/modules - name: proc hostPath: path: /proc @@ -81,4 +87,4 @@ spec: - name: host-oem hostPath: path: /oem - type: "" \ No newline at end of file + type: "" diff --git a/package/Dockerfile b/package/Dockerfile index 6e3ba93d..1ca9af9e 100644 --- a/package/Dockerfile +++ b/package/Dockerfile @@ -2,7 +2,9 @@ FROM registry.suse.com/bci/bci-base:15.5 +# kmod -> for `modprobe` command RUN zypper -n rm container-suseconnect && \ + zypper -n install kmod && \ zypper -n clean -a && rm -rf /tmp/* /var/tmp/* /usr/share/doc/packages/* ARG TARGETPLATFORM diff --git a/pkg/apis/node.harvesterhci.io/v1beta1/nodeconfig.go b/pkg/apis/node.harvesterhci.io/v1beta1/nodeconfig.go index ab6bc3bc..67d9f92c 100644 --- a/pkg/apis/node.harvesterhci.io/v1beta1/nodeconfig.go +++ b/pkg/apis/node.harvesterhci.io/v1beta1/nodeconfig.go @@ -22,12 +22,18 @@ type NodeConfig struct { } type NodeConfigSpec struct { - NTPConfig *NTPConfig `json:"ntpConfigs,omitempty"` + NTPConfig *NTPConfig `json:"ntpConfigs,omitempty"` + LonghornConfig *LonghornConfig `json:"longhornConfig,omitempty"` } + type NTPConfig struct { NTPServers string `json:"ntpServers"` } +type LonghornConfig struct { + EnableV2DataEngine bool `json:"enableV2DataEngine,omitempty"` +} + type NodeConfigStatus struct { NTPConditions []ConfigStatus `json:"ntpConditions,omitempty"` } diff --git a/pkg/apis/node.harvesterhci.io/v1beta1/zz_generated_deepcopy.go b/pkg/apis/node.harvesterhci.io/v1beta1/zz_generated_deepcopy.go index 6ce53950..5175a6ce 100644 --- a/pkg/apis/node.harvesterhci.io/v1beta1/zz_generated_deepcopy.go +++ b/pkg/apis/node.harvesterhci.io/v1beta1/zz_generated_deepcopy.go @@ -277,6 +277,22 @@ func (in *KsmtunedStatus) DeepCopy() *KsmtunedStatus { return out } +// DeepCopyInto is an 
autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *LonghornConfig) DeepCopyInto(out *LonghornConfig) {
+	*out = *in
+	return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LonghornConfig.
+func (in *LonghornConfig) DeepCopy() *LonghornConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(LonghornConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *NTPConfig) DeepCopyInto(out *NTPConfig) {
 	*out = *in
@@ -362,6 +378,11 @@ func (in *NodeConfigSpec) DeepCopyInto(out *NodeConfigSpec) {
 		*out = new(NTPConfig)
 		**out = **in
 	}
+	if in.LonghornConfig != nil {
+		in, out := &in.LonghornConfig, &out.LonghornConfig
+		*out = new(LonghornConfig)
+		**out = **in
+	}
 	return
 }
diff --git a/pkg/controller/nodeconfig/config/longhorn.go b/pkg/controller/nodeconfig/config/longhorn.go
new file mode 100644
index 00000000..8a983a73
--- /dev/null
+++ b/pkg/controller/nodeconfig/config/longhorn.go
@@ -0,0 +1,163 @@
+package config
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+
+	"github.com/harvester/go-common/sys"
+	"github.com/mudler/yip/pkg/schema"
+	"github.com/sirupsen/logrus"
+)
+
+const (
+	spdkStageName       = "Runtime SPDK Prerequisites"
+	hugepagesPath       = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"
+	hugepagesToAllocate = 1024
+)
+
+var (
+	modulesToLoad = []string{"vfio_pci", "uio_pci_generic", "nvme_tcp"}
+)
+
+func modprobe(modules []string, load bool) error {
+	args := []string{"-a"}
+	if !load {
+		args = append(args, "-r")
+	}
+	args = append(args, modules...)
+	out, err := exec.Command("/usr/sbin/modprobe", args...).CombinedOutput()
+	if err != nil {
+		// This ensures we capture some helpful information if modules can't
+		// be loaded. For example, if /lib/modules isn't actually mounted in
+		// the container, we'll see something like this:
+		// modprobe failed: exit status 1 (output: 'modprobe: WARNING: Module
+		// vfio_pci not found in directory /lib/modules/5.14.21-150500.55.68-default[...]')
+		return fmt.Errorf("modprobe failed: %v (output: '%s')", err, out)
+	}
+	return nil
+}
+
+func setNrHugepages(n uint64) error {
+	if err := os.WriteFile(hugepagesPath, []byte(strconv.FormatUint(n, 10)), 0644); err != nil {
+		return fmt.Errorf("unable to write %d to %s: %v", n, hugepagesPath, err)
+	}
+	return nil
+}
+
+func getNrHugepages() (uint64, error) {
+	data, err := os.ReadFile(hugepagesPath)
+	if err != nil {
+		return 0, fmt.Errorf("unable to read %s: %v", hugepagesPath, err)
+	}
+	n, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	return n, nil
+}
+
+func restartKubelet() error {
+	// This is safe because TryRestartService will only restart
+	// services that are already running, i.e. it will restart
+	// whichever of rke2-server or rke2-agent happens to be active
+	// on this host.
+	for _, service := range []string{"rke2-server.service", "rke2-agent.service"} {
+		if err := sys.TryRestartService(service); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func EnableV2DataEngine() error {
+	origHugepages, err := getNrHugepages()
+	if err != nil {
+		return err
+	}
+
+	// Write the persistent config first, so we know it's saved...
+	if err := UpdatePersistentOEMSettings(schema.Stage{
+		Name: spdkStageName,
+		Sysctl: map[string]string{
+			"vm.nr_hugepages": fmt.Sprintf("%d", hugepagesToAllocate),
+		},
+		Commands: []string{
+			"modprobe vfio_pci",
+			"modprobe uio_pci_generic",
+			"modprobe nvme_tcp",
+		},
+	}); err != nil {
+		return err
+	}
+
+	// ...then try to do the runtime activation (which may not succeed)
+	if err := modprobe(modulesToLoad, true); err != nil {
+		return fmt.Errorf("unable to load kernel modules %v: %v", modulesToLoad, err)
+	}
+
+	if origHugepages >= hugepagesToAllocate {
+		// We've already got enough hugepages, and don't want to unnecessarily
+		// restart the kubelet, so no further action is required.
+		return nil
+	}
+
+	if err := setNrHugepages(hugepagesToAllocate); err != nil {
+		return err
+	}
+
+	nrHugepages, err := getNrHugepages()
+	if err != nil {
+		return err
+	}
+	if nrHugepages == hugepagesToAllocate {
+		// We've successfully allocated the hugepages, but still need to restart
+		// the kubelet in order for Longhorn to see the allocation.
+		logrus.Infof("Restarting kubelet to set nr_hugepages=%d", hugepagesToAllocate)
+		return restartKubelet()
+	}
+
+	// We didn't get enough hugepages (not enough available unfragmented memory),
+	// but the system is now configured correctly, so if it's rebooted we should
+	// get the required allocation.
+	// TODO: record this somewhere (an event?) so that it can be picked up in the GUI.
+	// Note that if there aren't enough hugepages, when Harvester tries to enable the
+	// v2-data-engine setting in Longhorn, the validator.longhorn.io admission webhook
+	// will pick up the failure and an error will be displayed on the Harvester settings
+	// page, so we might not need to separately record this.
+	logrus.Errorf("Unable to allocate %d hugepages (only got %d)", hugepagesToAllocate, nrHugepages)
+
+	return nil
+}
+
+func DisableV2DataEngine() error {
+	origHugepages, err := getNrHugepages()
+	if err != nil {
+		return err
+	}
+
+	// Write the persistent config first, so we know it's saved...
+	if err := RemovePersistentOEMSettings(spdkStageName); err != nil {
+		return err
+	}
+
+	// ...then try to do the runtime deactivation
+	if err := modprobe(modulesToLoad, false); err != nil {
+		return fmt.Errorf("unable to unload kernel modules %v: %v", modulesToLoad, err)
+	}
+
+	if origHugepages == 0 {
+		// We already don't have any hugepages, and don't want to unnecessarily
+		// restart the kubelet, so no further action is required.
+		return nil
+	}
+
+	if err := setNrHugepages(0); err != nil {
+		return err
+	}
+
+	logrus.Info("Restarting kubelet to set nr_hugepages=0")
+	return restartKubelet()
+}
diff --git a/pkg/controller/nodeconfig/controller.go b/pkg/controller/nodeconfig/controller.go
index ff0b9cb7..fdabd2f8 100644
--- a/pkg/controller/nodeconfig/controller.go
+++ b/pkg/controller/nodeconfig/controller.go
@@ -57,6 +57,32 @@ func (c *Controller) OnNodeConfigChange(key string, nodecfg *nodeconfigv1.NodeCo
 		return nil, nil
 	}
 
+	// V2 Data Engine related handling. This intentionally does not check
+	// whether the engine is already enabled; it runs on any change to the
+	// node config, even if that change wasn't related to the Longhorn
+	// settings. This is mostly harmless, because if the engine is already
+	// in the state about to be applied, re-applying that state is
+	// effectively a no-op, and I'd rather keep the code simple than add
+	// annotations for whether or not we already enabled the engine.
+ // The one wrinkle is that when allocating (or deallocating) hugepages, + // the kubelet needs to be restarted to pick up the change and reflect + // that in node.status.capacity.hugepages-2Mi, so that Longhorn can + // query that value when lhs/v2-data-engine is set to true. This restart + // logic is handled inside EnableV2DataEngine() and DisableV2DataEngine(). + if nodecfg.Spec.LonghornConfig != nil && nodecfg.Spec.LonghornConfig.EnableV2DataEngine { + if err := config.EnableV2DataEngine(); err != nil { + logrus.WithFields(logrus.Fields{ + "err": err.Error(), + }).Error("Failed to enable V2 Data Engine") + } + } else { + if err := config.DisableV2DataEngine(); err != nil { + logrus.WithFields(logrus.Fields{ + "err": err.Error(), + }).Error("Failed to disable V2 Data Engine") + } + } + // NTP related handling appliedConfig := nodecfg.ObjectMeta.Annotations[ConfigAppliedAnnotation] ntpConfigHandler := config.NewNTPConfigHandler(c.mtx, c.NodeClient, confName, nodecfg.Spec.NTPConfig, appliedConfig) @@ -129,6 +155,12 @@ func (c *Controller) OnNodeConfigRemove(key string, nodecfg *nodeconfigv1.NodeCo c.NodeConfigs.EnqueueAfter(nodecfg.Namespace, nodecfg.Name, enqueueJitter()) return nil, err } + if err := config.DisableV2DataEngine(); err != nil { + logrus.WithFields(logrus.Fields{ + "err": err.Error(), + }).Error("Failed to disable V2 Data Engine") + c.NodeConfigs.EnqueueAfter(nodecfg.Namespace, nodecfg.Name, enqueueJitter()) + } return nil, nil }
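
A quick usage sketch for reviewers: the feature is driven entirely by the NodeConfig CR. Something like the following should exercise the enable path (the object name and namespace are illustrative only; the per-node naming convention comes from the existing node-manager code, not from this patch):

    apiVersion: node.harvesterhci.io/v1beta1
    kind: NodeConfig
    metadata:
      name: harvester-node-0         # hypothetical node name
      namespace: harvester-system    # hypothetical namespace
    spec:
      longhornConfig:
        enableV2DataEngine: true

Setting `enableV2DataEngine` to false (or deleting the NodeConfig entirely) takes the DisableV2DataEngine() path instead, which unloads the modules, zeroes nr_hugepages, and restarts the kubelet if hugepages had previously been allocated.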
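For reference, the persistent OEM settings written by EnableV2DataEngine() would render to yip YAML roughly as below. This is a sketch under assumptions: the target file under /oem and the stage key (shown as "initramfs" here) are chosen by the existing UpdatePersistentOEMSettings() helper, which is not part of this patch; the stage body mirrors the schema.Stage literal above:

    stages:
      initramfs:
        - name: "Runtime SPDK Prerequisites"
          sysctl:
            vm.nr_hugepages: "1024"
          commands:
            - modprobe vfio_pci
            - modprobe uio_pci_generic
            - modprobe nvme_tcp

Writing this stage before attempting runtime activation means that even if the hugepage allocation fails now (e.g. due to memory fragmentation), the node will come up correctly configured after a reboot.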