Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for setting libcuda verbosity #563

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions charts/hami/templates/scheduler/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ spec:
- --resource-cores={{ .Values.resourceCores }}
- --resource-mem-percentage={{ .Values.resourceMemPercentage }}
- --resource-priority={{ .Values.resourcePriority }}
- --libcuda-log-verbosity-level={{ .Values.libcudaLogVerbosity }}
- --http_bind=0.0.0.0:443
- --cert_file=/tls/tls.crt
- --key_file=/tls/tls.key
Expand Down
2 changes: 2 additions & 0 deletions charts/hami/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ resourceMem: "nvidia.com/gpumem"
resourceMemPercentage: "nvidia.com/gpumem-percentage"
resourceCores: "nvidia.com/gpucores"
resourcePriority: "nvidia.com/priority"
## Log verbosity level for libcuda inside GPU containers; it is injected into them as the LIBCUDA_LOG_LEVEL environment variable
libcudaLogVerbosity: "2"

#MLU Parameters
mluResourceName: "cambricon.com/vmlu"
Expand Down
44 changes: 29 additions & 15 deletions pkg/device/nvidia/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,14 @@
)

var (
ResourceName string
ResourceMem string
ResourceCores string
ResourceMemPercentage string
ResourcePriority string
DebugMode bool
OverwriteEnv bool
ResourceName string
ResourceMem string
ResourceCores string
ResourceMemPercentage string
ResourcePriority string
LIBCUDALogVerbosityLevel string
DebugMode bool
OverwriteEnv bool
)

type NvidiaGPUDevices struct {
Expand All @@ -78,6 +79,7 @@
fs.StringVar(&ResourceMemPercentage, "resource-mem-percentage", "nvidia.com/gpumem-percentage", "gpu memory fraction to allocate")
fs.StringVar(&ResourceCores, "resource-cores", "nvidia.com/gpucores", "cores percentage to use")
fs.StringVar(&ResourcePriority, "resource-priority", "vgputaskpriority", "vgpu task priority 0 for high and 1 for low")
fs.StringVar(&LIBCUDALogVerbosityLevel, "libcuda-log-verbosity-level", "2", "verbosity level of LIBCUDA")

Check warning on line 82 in pkg/device/nvidia/device.go

View check run for this annotation

Codecov / codecov/patch

pkg/device/nvidia/device.go#L82

Added line #L82 was not covered by tests
fs.BoolVar(&OverwriteEnv, "overwrite-env", false, "If set NVIDIA_VISIBLE_DEVICES=none to pods with no-gpu allocation")
}

Expand Down Expand Up @@ -136,18 +138,32 @@
return nodedevices, nil
}

func setOrUpdateEnvVar(ctr *corev1.Container, name string, value string) {
// Check if the env var already exists
for i, envVar := range ctr.Env {
if envVar.Name == name {
// If found, update the value
ctr.Env[i].Value = value
return
}

Check warning on line 148 in pkg/device/nvidia/device.go

View check run for this annotation

Codecov / codecov/patch

pkg/device/nvidia/device.go#L144-L148

Added lines #L144 - L148 were not covered by tests
}
// If not found, append it as a new env var
ctr.Env = append(ctr.Env, corev1.EnvVar{
Name: name,
Value: value,
})
}

func (dev *NvidiaGPUDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
/*gpu related */
priority, ok := ctr.Resources.Limits[corev1.ResourceName(ResourcePriority)]
if ok {
ctr.Env = append(ctr.Env, corev1.EnvVar{
Name: api.TaskPriority,
Value: fmt.Sprint(priority.Value()),
})
setOrUpdateEnvVar(ctr, api.TaskPriority, fmt.Sprint(priority.Value()))

Check warning on line 161 in pkg/device/nvidia/device.go

View check run for this annotation

Codecov / codecov/patch

pkg/device/nvidia/device.go#L161

Added line #L161 was not covered by tests
}

_, resourceNameOK := ctr.Resources.Limits[corev1.ResourceName(ResourceName)]
if resourceNameOK {
setOrUpdateEnvVar(ctr, "LIBCUDA_LOG_LEVEL", LIBCUDALogVerbosityLevel)
return resourceNameOK, nil
}

Expand All @@ -159,14 +175,12 @@
if config.DefaultResourceNum > 0 {
ctr.Resources.Limits[corev1.ResourceName(ResourceName)] = *resource.NewQuantity(int64(config.DefaultResourceNum), resource.BinarySI)
resourceNameOK = true
setOrUpdateEnvVar(ctr, "LIBCUDA_LOG_LEVEL", LIBCUDALogVerbosityLevel)
}
}

if !resourceNameOK && OverwriteEnv {
ctr.Env = append(ctr.Env, corev1.EnvVar{
Name: "NVIDIA_VISIBLE_DEVICES",
Value: "none",
})
setOrUpdateEnvVar(ctr, "NVIDIA_VISIBLE_DEVICES", "none")

Check warning on line 183 in pkg/device/nvidia/device.go

View check run for this annotation

Codecov / codecov/patch

pkg/device/nvidia/device.go#L183

Added line #L183 was not covered by tests
}
return resourceNameOK, nil
}
Expand Down
Loading