Intel GPU device plugin: move to 0.29.0 and select GPU nodes through NFD labels.

Files touched by this patch:

* intel-gpu-nfd-platform.yaml (new)
    NodeFeatureRule "intel-gpu-platform-labeling". Publishes the
    gpu.intel.com/{millicores,memory.max,tiles} extended resources from the
    matching local labels, and adds per-device-id, product and family labels
    for Intel (PCI vendor 8086) devices of class 0300 / 0380.

* intel-gpu-nfd.yaml (new)
    NodeFeatureRule "intel-gpu-device-labeling". Sets
    intel.feature.node.kubernetes.io/gpu=true on nodes that expose an Intel
    PCI device of class 0300 or 0380. The DaemonSet below selects nodes with
    exactly this label, and no rule in the platform file emits it, so this
    file must not be a copy of the platform rules (same metadata.name would
    also make the two objects overwrite each other).
    NOTE(review): the rule matches on PCI presence only; confirm whether it
    should additionally require the GPU kernel driver to be loaded.

* intel-gpu-plugin.yaml (modified)
    Image 0.18.1 -> 0.29.0, adds "-enable-monitoring" / "-v=2" args, a HOST_IP
    env var, an SELinux type, allowPrivilegeEscalation: false, renames the
    "sysfs" volume to "sysfsdrm", and switches the nodeSelector from
    gpu.infra/intel to intel.feature.node.kubernetes.io/gpu.
    spec.selector is deliberately left on app.kubernetes.io/name (and the pod
    template keeps that label): the selector of an existing apps/v1 DaemonSet
    is immutable, so changing it would make an in-place apply fail.
    NOTE(review): metadata.namespace is removed here; presumably the namespace
    is injected by the surrounding tooling - confirm before merging.

diff --git a/base/kube-system/manifests/device-plugins/intel-gpu-nfd-platform.yaml b/base/kube-system/manifests/device-plugins/intel-gpu-nfd-platform.yaml
new file mode 100644
index 000000000..69884553a
--- /dev/null
+++ b/base/kube-system/manifests/device-plugins/intel-gpu-nfd-platform.yaml
@@ -0,0 +1,197 @@
+apiVersion: nfd.k8s-sigs.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: intel-gpu-platform-labeling
+spec:
+  rules:
+    - extendedResources:
+        gpu.intel.com/millicores: "@local.label.gpu.intel.com/millicores"
+        gpu.intel.com/memory.max: "@local.label.gpu.intel.com/memory.max"
+        gpu.intel.com/tiles: "@local.label.gpu.intel.com/tiles"
+      matchFeatures:
+        - feature: local.label
+          matchExpressions:
+            gpu.intel.com/millicores: {op: Exists}
+            gpu.intel.com/memory.max: {op: Exists}
+            gpu.intel.com/tiles: {op: Exists}
+      name: intel.gpu.fractionalresources
+    # generic rule for older and upcoming devices
+    - labelsTemplate: |
+        {{ range .pci.device }}gpu.intel.com/device-id.{{ .class }}-{{ .device }}.present=true
+        {{ end }}
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0300"
+                - "0380"
+            vendor:
+              op: In
+              value:
+                - "8086"
+      name: intel.gpu.generic.deviceid
+    - labelsTemplate: gpu.intel.com/device-id.0300-{{ (index .pci.device 0).device }}.count={{ len .pci.device }}
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0300"
+            vendor:
+              op: In
+              value:
+                - "8086"
+      name: intel.gpu.generic.count.300
+    - labelsTemplate: gpu.intel.com/device-id.0380-{{ (index .pci.device 0).device }}.count={{ len .pci.device }}
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0380"
+            vendor:
+              op: In
+              value:
+                - "8086"
+      name: intel.gpu.generic.count.380
+    - labels:
+        gpu.intel.com/product: "Max_1100"
+      labelsTemplate: "gpu.intel.com/device.count={{ len .pci.device }}"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0380"
+            vendor:
+              op: In
+              value:
+                - "8086"
+            device:
+              op: In
+              value:
+                - "0bda"
+      name: intel.gpu.max.1100
+    - labels:
+        gpu.intel.com/product: "Max_1550"
+      labelsTemplate: "gpu.intel.com/device.count={{ len .pci.device }}"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0380"
+            vendor:
+              op: In
+              value:
+                - "8086"
+            device:
+              op: In
+              value:
+                - "0bd5"
+      name: intel.gpu.max.1550
+    - labels:
+        gpu.intel.com/family: "Max_Series"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0380"
+            vendor:
+              op: In
+              value:
+                - "8086"
+            device:
+              op: In
+              value:
+                - "0bda"
+                - "0bd5"
+                - "0bd9"
+                - "0bdb"
+                - "0bd7"
+                - "0bd6"
+                - "0bd0"
+      name: intel.gpu.max.series
+    - labels:
+        gpu.intel.com/family: "Flex_Series"
+        gpu.intel.com/product: "Flex_170"
+      labelsTemplate: "gpu.intel.com/device.count={{ len .pci.device }}"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0380"
+            vendor:
+              op: In
+              value:
+                - "8086"
+            device:
+              op: In
+              value:
+                - "56c0"
+      name: intel.gpu.flex.170
+    - labels:
+        gpu.intel.com/family: "Flex_Series"
+        gpu.intel.com/product: "Flex_140"
+      labelsTemplate: "gpu.intel.com/device.count={{ len .pci.device }}"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0380"
+            vendor:
+              op: In
+              value:
+                - "8086"
+            device:
+              op: In
+              value:
+                - "56c1"
+      name: intel.gpu.flex.140
+    - labels:
+        gpu.intel.com/family: "A_Series"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0300"
+            vendor:
+              op: In
+              value:
+                - "8086"
+            device:
+              op: In
+              value:
+                - "56a6"
+                - "56a5"
+                - "56a1"
+                - "56a0"
+                - "5694"
+                - "5693"
+                - "5692"
+                - "5691"
+                - "5690"
+                - "56b3"
+                - "56b2"
+                - "56a4"
+                - "56a3"
+                - "5697"
+                - "5696"
+                - "5695"
+                - "56b1"
+                - "56b0"
+      name: intel.gpu.a.series
diff --git a/base/kube-system/manifests/device-plugins/intel-gpu-nfd.yaml b/base/kube-system/manifests/device-plugins/intel-gpu-nfd.yaml
new file mode 100644
--- /dev/null
+++ b/base/kube-system/manifests/device-plugins/intel-gpu-nfd.yaml
@@ -0,0 +1,23 @@
+# Labels every node that exposes an Intel GPU on the PCI bus. The
+# intel-gpu-plugin DaemonSet selects nodes through this label.
+apiVersion: nfd.k8s-sigs.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: intel-gpu-device-labeling
+spec:
+  rules:
+    - labels:
+        intel.feature.node.kubernetes.io/gpu: "true"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class:
+              op: In
+              value:
+                - "0300"
+                - "0380"
+            vendor:
+              op: In
+              value:
+                - "8086"
+      name: intel.gpu
diff --git a/base/kube-system/manifests/device-plugins/intel-gpu-plugin.yaml b/base/kube-system/manifests/device-plugins/intel-gpu-plugin.yaml
--- a/base/kube-system/manifests/device-plugins/intel-gpu-plugin.yaml
+++ b/base/kube-system/manifests/device-plugins/intel-gpu-plugin.yaml
@@ -1,40 +1,45 @@
----
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
   name: intel-gpu-plugin
-  namespace: kube-system
   labels:
     app: intel-gpu-plugin
-    app.kubernetes.io/name: intel-gpu-plugin
-    app.kubernetes.io/version: 0.18.1
 spec:
   selector:
     matchLabels:
       app.kubernetes.io/name: intel-gpu-plugin
   template:
     metadata:
       labels:
         app: intel-gpu-plugin
         app.kubernetes.io/name: intel-gpu-plugin
-        app.kubernetes.io/version: 0.18.1
     spec:
       containers:
       - name: intel-gpu-plugin
+        args:
+        - "-enable-monitoring"
+        - "-v=2"
         env:
         - name: NODE_NAME
           valueFrom:
             fieldRef:
               fieldPath: spec.nodeName
-        image: intel/intel-gpu-plugin:0.18.1
+        - name: HOST_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: status.hostIP
+        image: intel/intel-gpu-plugin:0.29.0
         imagePullPolicy: IfNotPresent
         securityContext:
+          seLinuxOptions:
+            type: "container_device_plugin_t"
           readOnlyRootFilesystem: true
+          allowPrivilegeEscalation: false
         volumeMounts:
         - name: devfs
           mountPath: /dev/dri
           readOnly: true
-        - name: sysfs
+        - name: sysfsdrm
           mountPath: /sys/class/drm
           readOnly: true
         - name: kubeletsockets
@@ -43,7 +48,7 @@ spec:
       - name: devfs
         hostPath:
           path: /dev/dri
-      - name: sysfs
+      - name: sysfsdrm
         hostPath:
           path: /sys/class/drm
       - name: kubeletsockets
@@ -51,4 +56,4 @@ spec:
           path: /var/lib/kubelet/device-plugins
       nodeSelector:
         kubernetes.io/arch: amd64
-        gpu.infra/intel: "true"
+        intel.feature.node.kubernetes.io/gpu: "true"