# gpu_perf_job.yml
# This runs a couple of tests on a gpu node.
# It helps diagnosis of health or perf issues.

type: command

# Directory holding the job's code; the command below is given relative to it.
code: src

command: /bin/bash ./gpu_perf.sh

# to iterate quickly on your environment design, use inline
environment: file:./environments/azureml/env.yml

# to reuse with different parameters, use registered environment
# environment: azureml:nccltests_azureml:openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04
# environment: azureml:nccltests_nvidia_pytorch:22.02-py3

# NOTE: set env var if needed by your configuration
# (values are quoted so they stay strings rather than being typed by YAML)
environment_variables:
  NCCL_DEBUG: "INFO"  # adjusts the level of info from NCCL tests

  # Relaxed Ordering can greatly help the performance of Infiniband networks
  # in virtualized environments.
  # NCCL_IB_PCI_RELAXED_ORDERING: "1"
  # NCCL_IB_DISABLE: "1"  # force disable infiniband (if set to "1")
  # NCCL_NET_PLUGIN: "none"  # to force NET/Plugin off (no rdma/sharp plugin at all)
  # NCCL_NET: "Socket"  # to force node-to-node comm to use Socket
  # NCCL_SOCKET_IFNAME: "eth0"  # to force Socket comm to use eth0 (use NCCL_NET=Socket)
  # NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml"  # Use special topology file

  # UCX_IB_PCI_RELAXED_ORDERING: "on"
  # UCX_TLS: "tcp"
  # UCX_NET_DEVICES: "eth0"  # if you have Error: Failed to resolve UCX endpoint...

  # CUDA_DEVICE_ORDER: "PCI_BUS_ID"  # ordering of gpus

compute: azureml:gpu-cluster

# `type` here is the distribution type and must stay nested under
# `distribution`; at top level it would collide with the job `type` above.
distribution:
  type: mpi
  process_count_per_instance: 1  # NOTE: set equal to number of gpus on the node

resources:
  instance_count: 1  # NOTE: increase to use multiple nodes

experiment_name: nccl-test
display_name: Gpu Diag (NCCL tests)
description: Runs NCCL-tests on gpu nodes.