Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DRA driver For IMEX #1195

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ type ClusterPolicySpec struct {
Toolkit ToolkitSpec `json:"toolkit"`
// DevicePlugin component spec
DevicePlugin DevicePluginSpec `json:"devicePlugin"`
// DRADriver (Dynamic Resource Allocation driver) component spec
DRADriver DRADriverSpec `json:"draDriver"`
// DCGMExporter spec
DCGMExporter DCGMExporterSpec `json:"dcgmExporter"`
// DCGM component spec
Expand Down Expand Up @@ -841,6 +843,45 @@ type SandboxDevicePluginSpec struct {
Env []EnvVar `json:"env,omitempty"`
}

// DRADriverSpec defines the properties for the NVIDIA Dynamic Resource Allocation (DRA) Driver deployment
type DRADriverSpec struct {
// Enabled indicates if the deployment of NVIDIA DRA Driver through the operator is enabled
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use the full-form of DRA when mentioning it first, and then use the acronyms everywhere else

// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA DRA Driver deployment through GPU Operator"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Enabled *bool `json:"enabled,omitempty"`

// NVIDIA DRA Driver image repository
// +kubebuilder:validation:Optional
Repository string `json:"repository,omitempty"`

// NVIDIA DRA Driver image name
// +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+
Image string `json:"image,omitempty"`

// NVIDIA DRA Driver image tag
// +kubebuilder:validation:Optional
Version string `json:"version,omitempty"`

// Image pull policy
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy"
ImagePullPolicy string `json:"imagePullPolicy,omitempty"`

// Image pull secrets
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull secrets"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret"
ImagePullSecrets []string `json:"imagePullSecrets,omitempty"`

// DeviceClasses indicates which device classes are enabled in the DRA driver
// +kubebuilder:validation:Optional
DeviceClasses []string `json:"deviceClasses,omitempty"`
}

// DCGMExporterSpec defines the properties for NVIDIA DCGM Exporter deployment
type DCGMExporterSpec struct {
// Enabled indicates if deployment of NVIDIA DCGM Exporter through operator is enabled
Expand Down Expand Up @@ -1764,6 +1805,9 @@ func ImagePath(spec interface{}) (string, error) {
case *SandboxDevicePluginSpec:
config := spec.(*SandboxDevicePluginSpec)
return imagePath(config.Repository, config.Image, config.Version, "SANDBOX_DEVICE_PLUGIN_IMAGE")
case *DRADriverSpec:
config := spec.(*DRADriverSpec)
return imagePath(config.Repository, config.Image, config.Version, "DRA_DRIVER_IMAGE")
case *DCGMExporterSpec:
config := spec.(*DCGMExporterSpec)
return imagePath(config.Repository, config.Image, config.Version, "DCGM_EXPORTER_IMAGE")
Expand Down Expand Up @@ -1872,6 +1916,15 @@ func (p *DevicePluginSpec) IsEnabled() bool {
return *p.Enabled
}

// IsEnabled reports whether the DRA driver is to be deployed through gpu-operator.
// When the Enabled field is unset, the driver is considered enabled.
func (d *DRADriverSpec) IsEnabled() bool {
	if flag := d.Enabled; flag != nil {
		// the user made an explicit choice; honor it
		return *flag
	}
	// nothing specified by the user, so fall back to the default of true
	return true
}

// IsEnabled returns true if dcgm-exporter is enabled(default) through gpu-operator
func (e *DCGMExporterSpec) IsEnabled() bool {
if e.Enabled == nil {
Expand Down
31 changes: 31 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions assets/state-dra-driver/0100_service_account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: nvidia-dra-driver
namespace: "FILLED BY THE OPERATOR"
15 changes: 15 additions & 0 deletions assets/state-dra-driver/0200_clusterrole.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: nvidia-dra-driver
rules:
# TODO: restrict RBAC for DRA driver
- apiGroups:
- ""
- apps
- resource.k8s.io
- gpu.nvidia.com
resources:
- '*'
verbs:
- '*'
12 changes: 12 additions & 0 deletions assets/state-dra-driver/0300_clusterrolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: nvidia-dra-driver
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: nvidia-dra-driver
subjects:
- kind: ServiceAccount
name: nvidia-dra-driver
namespace: "FILLED BY THE OPERATOR"
8 changes: 8 additions & 0 deletions assets/state-dra-driver/0400_deviceclass-imex.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
apiVersion: resource.k8s.io/v1beta1
kind: DeviceClass
metadata:
name: imex.nvidia.com
spec:
selectors:
- cel:
expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'imex-channel'"
44 changes: 44 additions & 0 deletions assets/state-dra-driver/0500_deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: nvidia-imex-dra-driver-controller
name: nvidia-imex-dra-driver-controller
namespace: "FILLED BY THE OPERATOR"
spec:
replicas: 1
selector:
matchLabels:
app: nvidia-imex-dra-driver-controller
template:
metadata:
labels:
app: nvidia-imex-dra-driver-controller
spec:
priorityClassName: system-node-critical
serviceAccountName: nvidia-dra-driver
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Exists
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Exists
containers:
- name: controller
image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
command: ["nvidia-dra-controller", "-v", "6"]
env:
- name: DEVICE_CLASSES
value: imex
- name: POD_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.name
- name: NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
34 changes: 34 additions & 0 deletions assets/state-dra-driver/0600_configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: nvidia-dra-driver-kubelet-plugin-entrypoint
namespace: "FILLED BY THE OPERATOR"
labels:
app: nvidia-dra-driver-kubelet-plugin
data:
entrypoint.sh: |-
#!/bin/bash

until [[ -f /run/nvidia/validations/driver-ready ]]
do
echo "waiting for the driver validations to be ready..."
sleep 5
done

set -o allexport
cat /run/nvidia/validations/driver-ready
. /run/nvidia/validations/driver-ready
# TODO: add an alias for DRIVER_ROOT_CTR_PATH in the k8s-dra-driver and remove the below export
export CONTAINER_DRIVER_ROOT=$DRIVER_ROOT_CTR_PATH

# Conditionally mask the params file to prevent this container from
# recreating any missing GPU device nodes. This is necessary, for
# example, when running under nvkind to limit the set of GPUs governed
# by the plugin even though it has cgroup access to all of them.
if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
cp /proc/driver/nvidia/params root/gpu-params
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
mount --bind root/gpu-params /proc/driver/nvidia/params
fi
echo "Starting nvidia-dra-plugin"
exec nvidia-dra-plugin
118 changes: 118 additions & 0 deletions assets/state-dra-driver/0700_daemonset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app: nvidia-imex-dra-driver-kubelet-plugin
name: nvidia-imex-dra-driver-kubelet-plugin
namespace: "FILLED BY THE OPERATOR"
spec:
selector:
matchLabels:
app: nvidia-imex-dra-driver-kubelet-plugin
updateStrategy:
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
type: RollingUpdate
template:
metadata:
labels:
app: nvidia-imex-dra-driver-kubelet-plugin
spec:
priorityClassName: system-node-critical
serviceAccountName: nvidia-dra-driver
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu.imex-domain
operator: Exists
initContainers:
- image: "FILLED BY THE OPERATOR"
name: driver-validation
command: [ 'sh', '-c' ]
args: [ "until [ -f /run/nvidia/validations/driver-ready ]; do echo waiting for driver to be setup; sleep 5; done" ]
securityContext:
privileged: true
volumeMounts:
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: HostToContainer
containers:
- name: plugin
image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
command: ["/bin/bash", "-c"]
args:
- /bin/entrypoint.sh
env:
- name: MASK_NVIDIA_DRIVER_PARAMS
value: "false"
- name: NVIDIA_VISIBLE_DEVICES
value: void
- name: CDI_ROOT
value: /var/run/cdi
- name: NVIDIA_MIG_CONFIG_DEVICES
value: all
- name: DEVICE_CLASSES
value: imex
- name: NODE_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: spec.nodeName
- name: NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
securityContext:
privileged: true
volumeMounts:
- name: nvidia-dra-driver-kubelet-plugin-entrypoint
readOnly: true
mountPath: /bin/entrypoint.sh
subPath: entrypoint.sh
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
- name: driver-install-dir
mountPath: /driver-root
mountPropagation: HostToContainer
readOnly: true
- name: host-root
mountPath: /host
readOnly: true
- mountPath: /var/lib/kubelet/plugins_registry
name: plugins-registry
- mountPath: /var/lib/kubelet/plugins
mountPropagation: Bidirectional
name: plugins
- mountPath: /var/run/cdi
name: cdi
volumes:
- name: nvidia-dra-driver-kubelet-plugin-entrypoint
configMap:
name: nvidia-dra-driver-kubelet-plugin-entrypoint
defaultMode: 448
- name: run-nvidia-validations
hostPath:
path: "/run/nvidia/validations"
type: DirectoryOrCreate
- name: driver-install-dir
hostPath:
path: "/run/nvidia/driver"
type: DirectoryOrCreate
- name: host-root
hostPath:
path: /
- name: plugins-registry
hostPath:
path: /var/lib/kubelet/plugins_registry
- name: plugins
hostPath:
path: /var/lib/kubelet/plugins
- name: cdi
hostPath:
path: /var/run/cdi
type: DirectoryOrCreate
Loading
Loading