-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
Copy pathjob.yml
37 lines (31 loc) · 1.65 KB
/
job.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# schema that this command job specification conforms to
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
# name of the experiment; use this to structure your training results
experiment_name: nebula-mnist-example
# display name of the job
display_name: nebula-mnist-example
# source code directory for the job; `.` is a relative path
# NOTE(review): presumably resolved relative to this file's location - confirm
code: .
# compute cluster on which the job's command (defined below) should be run
# note: the compute must be created in your AzureML workspace before it can be used
compute: gpu-v100-1GPU-cluster
# compute resources
resources:
  # number of nodes to use on the cluster above
  # (must be nested under `resources`; at top level it is not part of `resources`)
  instance_count: 1
# environment in which the job's command (defined below) should be run
# note: It is recommended to use an environment which either is the ACPT image or at least inherits from it. The ACPT
#       image has many frameworks related to PyTorch pre-installed, which reduces the effort of creating your own
#       environment. To learn more about environments in AML, see
#       https://learn.microsoft.com/en-us/azure/machine-learning/concept-environments
#
# for the latest ACPT image using PyTorch 2.2, CUDA 12.1 on Python 3.10
environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest
# for a specific version
# environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1:3
# for the latest version of a custom environment (install Nebula in that environment's Dockerfile)
# environment: azureml:ACPT-Extended@latest
# for a specific Docker image (has to be compatible with Azure ML):
# environment:
#   image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:2
# command that should be run by the job
# note: Nebula has to be available at run time - either install it as part of this command or use an environment
#       whose Dockerfile installs it
# the folded scalar body must be indented deeper than `command`, otherwise the value is empty
command: >-
  python train.py --save-model