# Source: htc-slurm.yaml (165 lines, 147 loc, 5.29 KB), taken from a fork of
# GoogleCloudPlatform/cluster-toolkit.
# Copyright 2024 Google LLC
# Copyright (C) SchedMD LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# This blueprint provisions a cluster using the Slurm scheduler configured to
# efficiently run many short duration, loosely-coupled (non-MPI) jobs. See also:
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md
# https://slurm.schedmd.com/high_throughput.html
blueprint_name: htc-slurm-v6

# Deployment-wide variables; referenced below as $(vars.<name>).
vars:
  # Left empty on purpose (parses as null): must be filled in before deploying.
  project_id:  ## Set GCP Project ID Here ##
  deployment_name: htc-slurm-v6
  region: us-west4
  zone: us-west4-c
  # By default, public IPs are set in the login and controller to allow easier
  # SSH access. To turn this behavior off, set this to false.
  enable_public_ips: true
  # Stage `community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/*` into the deployment folder.
  # If you move the blueprint, make sure the relative path is correct.
  staged_configs: $(ghpc_stage("../modules/scheduler/schedmd-slurm-gcp-v6-controller/etc"))

# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md

deployment_groups:
- group: primary
  modules:
  # Source is an embedded module, denoted by "modules/*" without ./, ../, /
  # as a prefix. To refer to a local or community module, prefix with ./, ../ or /
  # Example - ./modules/network/pre-existing-vpc
  - id: network
    source: modules/network/vpc

  # Filestore instance mounted at /home.
  - id: homefs
    source: modules/file-system/filestore
    use: [network]
    settings:
      local_mount: /home

  # Filestore instance (HIGH_SCALE_SSD tier, 10240 GB) mounted at /projects.
  - id: projectsfs
    source: modules/file-system/filestore
    use: [network]
    settings:
      filestore_tier: HIGH_SCALE_SSD
      size_gb: 10240
      local_mount: /projects

  # This file system has an associated license cost.
  # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud
  - id: scratchfs
    source: community/modules/file-system/DDN-EXAScaler
    use: [network]
    settings:
      local_mount: /scratch

  # The compute partition is designed for performance.
  # Use:
  # `srun -N 4 -p compute <<Command>>` for any node in the partition.
  # `srun -N 4 -p compute --mincpus 30 <<Command>>` for node group c2s60.
  - id: compute_nodeset_c2s60
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      name: c2s60
      # NOTE(review): no machine_type is set here, so the nodeset module's
      # default applies - presumably c2-standard-60 given the name; confirm
      # against the module's documentation.
      node_count_dynamic_max: 200
      bandwidth_tier: gvnic_enabled
      enable_placement: false
      allow_automatic_updates: false

  - id: compute_nodeset_c2s30
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      # Explicit name, consistent with the other nodesets in this blueprint.
      name: c2s30
      node_count_dynamic_max: 200
      machine_type: c2-standard-30
      bandwidth_tier: gvnic_enabled
      enable_placement: false
      allow_automatic_updates: false

  # Groups both c2 nodesets into the non-exclusive "compute" partition.
  - id: compute_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use:
    - compute_nodeset_c2s60
    - compute_nodeset_c2s30
    settings:
      partition_name: compute
      exclusive: false

  # The lowcost partition is designed to run at a lower cost and without additional quota
  # Use:
  # `srun -N 4 <<Command>>` for any node in the partition.
  # `srun -N 4 --mincpus 2 <<Command>>` for node group n2s4.
  - id: low_cost_nodeset_n2s2
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      name: n2s2
      machine_type: n2-standard-2
      node_count_dynamic_max: 10
      bandwidth_tier: gvnic_enabled
      enable_placement: false
      allow_automatic_updates: false

  - id: low_cost_nodeset_n2s4
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      name: n2s4
      machine_type: n2-standard-4
      node_count_dynamic_max: 10
      bandwidth_tier: gvnic_enabled
      enable_placement: false
      allow_automatic_updates: false

  # Groups both n2 nodesets into the non-exclusive "lowcost" partition, which
  # is the default partition (hence no `-p` in the srun examples above).
  - id: low_cost_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use:
    - low_cost_nodeset_n2s2
    - low_cost_nodeset_n2s4
    settings:
      is_default: true
      partition_name: lowcost
      exclusive: false

  # Login node; its public IP follows vars.enable_public_ips.
  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
    use: [network]
    settings:
      machine_type: n2-standard-4
      enable_login_public_ips: $(vars.enable_public_ips)

  # Controller; wired to the network, all three file systems, both partitions
  # and the login module. Its slurm.conf / slurmdbd.conf templates are the HTC
  # variants taken from the staged config directory (vars.staged_configs).
  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
    use:
    - network
    - homefs
    - scratchfs
    - projectsfs
    - low_cost_partition
    - compute_partition
    - slurm_login
    settings:
      machine_type: c2-standard-8
      enable_controller_public_ips: $(vars.enable_public_ips)
      slurm_conf_tpl: $(vars.staged_configs)/htc-slurm.conf.tpl
      slurmdbd_conf_tpl: $(vars.staged_configs)/htc-slurmdbd.conf.tpl

  # Monitoring dashboard; exposes its `instructions` output at deployment level.
  - id: hpc_dashboard
    source: modules/monitoring/dashboard
    outputs: [instructions]