Skip to content

Commit

Permalink
DRAFT
Browse files Browse the repository at this point in the history
  • Loading branch information
mr0re1 committed Jan 9, 2025
1 parent 43585e3 commit 60187c0
Show file tree
Hide file tree
Showing 12 changed files with 56 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ limitations under the License.
| <a name="input_instance_template"></a> [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no |
| <a name="input_labels"></a> [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no |
| <a name="input_login_network_storage"></a> [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. | <pre>list(object({<br/> server_ip = string,<br/> remote_mount = string,<br/> local_mount = string,<br/> fs_type = string,<br/> mount_options = string,<br/> }))</pre> | `[]` | no |
| <a name="input_login_nodes"></a> [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. | <pre>list(object({<br/> name_prefix = string<br/> access_config = optional(list(object({<br/> nat_ip = string<br/> network_tier = string<br/> })))<br/> additional_disks = optional(list(object({<br/> disk_name = optional(string)<br/> device_name = optional(string)<br/> disk_size_gb = optional(number)<br/> disk_type = optional(string)<br/> disk_labels = optional(map(string), {})<br/> auto_delete = optional(bool, true)<br/> boot = optional(bool, false)<br/> })), [])<br/> additional_networks = optional(list(object({<br/> access_config = optional(list(object({<br/> nat_ip = string<br/> network_tier = string<br/> })), [])<br/> alias_ip_range = optional(list(object({<br/> ip_cidr_range = string<br/> subnetwork_range_name = string<br/> })), [])<br/> ipv6_access_config = optional(list(object({<br/> network_tier = string<br/> })), [])<br/> network = optional(string)<br/> network_ip = optional(string, "")<br/> nic_type = optional(string)<br/> queue_count = optional(number)<br/> stack_type = optional(string)<br/> subnetwork = optional(string)<br/> subnetwork_project = optional(string)<br/> })), [])<br/> bandwidth_tier = optional(string, "platform_default")<br/> can_ip_forward = optional(bool, false)<br/> disable_smt = optional(bool, false)<br/> disk_auto_delete = optional(bool, true)<br/> disk_labels = optional(map(string), {})<br/> disk_size_gb = optional(number)<br/> disk_type = optional(string, "n1-standard-1")<br/> enable_confidential_vm = optional(bool, false)<br/> enable_oslogin = optional(bool, true)<br/> enable_shielded_vm = optional(bool, false)<br/> gpu = optional(object({<br/> count = number<br/> type = string<br/> }))<br/> labels = optional(map(string), {})<br/> machine_type = optional(string)<br/> metadata = optional(map(string), {})<br/> min_cpu_platform = optional(string)<br/> num_instances = optional(number, 1)<br/> on_host_maintenance = 
optional(string)<br/> preemptible = optional(bool, false)<br/> region = optional(string)<br/> service_account = optional(object({<br/> email = optional(string)<br/> scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])<br/> }))<br/> shielded_instance_config = optional(object({<br/> enable_integrity_monitoring = optional(bool, true)<br/> enable_secure_boot = optional(bool, true)<br/> enable_vtpm = optional(bool, true)<br/> }))<br/> source_image_family = optional(string)<br/> source_image_project = optional(string)<br/> source_image = optional(string)<br/> static_ips = optional(list(string), [])<br/> subnetwork = string<br/> spot = optional(bool, false)<br/> tags = optional(list(string), [])<br/> zone = optional(string)<br/> termination_action = optional(string)<br/> }))</pre> | `[]` | no |
| <a name="input_login_nodes"></a> [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. | <pre>list(object({<br/> group_name = string<br/> access_config = optional(list(object({<br/> nat_ip = string<br/> network_tier = string<br/> })))<br/> additional_disks = optional(list(object({<br/> disk_name = optional(string)<br/> device_name = optional(string)<br/> disk_size_gb = optional(number)<br/> disk_type = optional(string)<br/> disk_labels = optional(map(string), {})<br/> auto_delete = optional(bool, true)<br/> boot = optional(bool, false)<br/> })), [])<br/> additional_networks = optional(list(object({<br/> access_config = optional(list(object({<br/> nat_ip = string<br/> network_tier = string<br/> })), [])<br/> alias_ip_range = optional(list(object({<br/> ip_cidr_range = string<br/> subnetwork_range_name = string<br/> })), [])<br/> ipv6_access_config = optional(list(object({<br/> network_tier = string<br/> })), [])<br/> network = optional(string)<br/> network_ip = optional(string, "")<br/> nic_type = optional(string)<br/> queue_count = optional(number)<br/> stack_type = optional(string)<br/> subnetwork = optional(string)<br/> subnetwork_project = optional(string)<br/> })), [])<br/> bandwidth_tier = optional(string, "platform_default")<br/> can_ip_forward = optional(bool, false)<br/> disable_smt = optional(bool, false)<br/> disk_auto_delete = optional(bool, true)<br/> disk_labels = optional(map(string), {})<br/> disk_size_gb = optional(number)<br/> disk_type = optional(string, "n1-standard-1")<br/> enable_confidential_vm = optional(bool, false)<br/> enable_oslogin = optional(bool, true)<br/> enable_shielded_vm = optional(bool, false)<br/> gpu = optional(object({<br/> count = number<br/> type = string<br/> }))<br/> labels = optional(map(string), {})<br/> machine_type = optional(string)<br/> metadata = optional(map(string), {})<br/> min_cpu_platform = optional(string)<br/> num_instances = optional(number, 1)<br/> on_host_maintenance = 
optional(string)<br/> preemptible = optional(bool, false)<br/> region = optional(string)<br/> service_account = optional(object({<br/> email = optional(string)<br/> scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])<br/> }))<br/> shielded_instance_config = optional(object({<br/> enable_integrity_monitoring = optional(bool, true)<br/> enable_secure_boot = optional(bool, true)<br/> enable_vtpm = optional(bool, true)<br/> }))<br/> source_image_family = optional(string)<br/> source_image_project = optional(string)<br/> source_image = optional(string)<br/> static_ips = optional(list(string), [])<br/> subnetwork = string<br/> spot = optional(bool, false)<br/> tags = optional(list(string), [])<br/> zone = optional(string)<br/> termination_action = optional(string)<br/> }))</pre> | `[]` | no |
| <a name="input_login_startup_script"></a> [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no |
| <a name="input_login_startup_scripts_timeout"></a> [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If<br/>any script exceeds this timeout, then the instance setup process is considered<br/>failed and handled accordingly.<br/><br/>NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no |
| <a name="input_machine_type"></a> [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@
module "slurm_login_template" {
source = "../../internal/slurm-gcp/instance_template"

for_each = { for x in var.login_nodes : x.name_prefix => x }
for_each = { for x in var.login_nodes : x.group_name => x }

project_id = var.project_id
slurm_cluster_name = local.slurm_cluster_name
slurm_instance_role = "login"
slurm_bucket_path = module.slurm_files.slurm_bucket_path
name_prefix = each.value.name_prefix
name_prefix = each.value.group_name

additional_disks = each.value.additional_disks
bandwidth_tier = each.value.bandwidth_tier
Expand Down Expand Up @@ -57,7 +57,7 @@ module "slurm_login_template" {
# INSTANCE
module "slurm_login_instance" {
source = "../../internal/slurm-gcp/instance"
for_each = { for x in var.login_nodes : x.name_prefix => x }
for_each = { for x in var.login_nodes : x.group_name => x }

access_config = each.value.access_config
hostname = "${local.slurm_cluster_name}-${each.key}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ No modules.
| <a name="input_google_app_cred_path"></a> [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no |
| <a name="input_install_dir"></a> [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the<br/>on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path<br/>for the resume and suspend scripts in the generated `cloud.conf` file.<br/><br/>This variable should be used when the TerraformHost and the SlurmctldHost<br/>are different.<br/><br/>This will default to var.output\_dir if null. | `string` | `null` | no |
| <a name="input_login_network_storage"></a> [login\_network\_storage](#input\_login\_network\_storage) | Storage to be mounted on login and controller instances<br/>- server\_ip : Address of the storage server.<br/>- remote\_mount : The location in the remote instance filesystem to mount from.<br/>- local\_mount : The location on the instance filesystem to mount to.<br/>- fs\_type : Filesystem type (e.g. "nfs").<br/>- mount\_options : Options to mount with. | <pre>list(object({<br/> server_ip = string<br/> remote_mount = string<br/> local_mount = string<br/> fs_type = string<br/> mount_options = string<br/> }))</pre> | `[]` | no |
| <a name="input_login_startup_scripts"></a> [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be ran on login VM startup. | <pre>list(object({<br/> filename = string<br/> content = string<br/> }))</pre> | `[]` | no |
| <a name="input_login_startup_scripts"></a> [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be ran on login VM startup in the specific group. | <pre>map(list(object({<br/> filename = string<br/> content = string<br/> })))</pre> | `{}` | no |
| <a name="input_login_startup_scripts_timeout"></a> [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If<br/>any script exceeds this timeout, then the instance setup process is considered<br/>failed and handled accordingly.<br/><br/>NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no |
| <a name="input_munge_mount"></a> [munge\_mount](#input\_munge\_mount) | Remote munge mount for compute and login nodes to acquire the munge.key.<br/><br/>By default, the munge mount server will be assumed to be the<br/>`var.slurm_control_host` (or `var.slurm_control_addr` if non-null) when<br/>`server_ip=null`. | <pre>object({<br/> server_ip = string<br/> remote_mount = string<br/> fs_type = string<br/> mount_options = string<br/> })</pre> | <pre>{<br/> "fs_type": "nfs",<br/> "mount_options": "",<br/> "remote_mount": "/etc/munge/",<br/> "server_ip": null<br/>}</pre> | no |
| <a name="input_network_storage"></a> [network\_storage](#input\_network\_storage) | Storage to be mounted on all instances.<br/>- server\_ip : Address of the storage server.<br/>- remote\_mount : The location in the remote instance filesystem to mount from.<br/>- local\_mount : The location on the instance filesystem to mount to.<br/>- fs\_type : Filesystem type (e.g. "nfs").<br/>- mount\_options : Options to mount with. | <pre>list(object({<br/> server_ip = string<br/> remote_mount = string<br/> local_mount = string<br/> fs_type = string<br/> mount_options = string<br/> }))</pre> | `[]` | no |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,14 +250,17 @@ resource "google_storage_bucket_object" "nodeset_startup_scripts" {
}

resource "google_storage_bucket_object" "login_startup_scripts" {
for_each = {
for x in var.login_startup_scripts
: replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x
}
for_each = { for x in flatten([
for group, scripts in var.login_startup_scripts
: [for s in scripts
: {
content = s.content,
name = format("slurm-login-%s-script-%s", group, replace(basename(s.filename), "/[^a-zA-Z0-9-_]/", "_")) }
]]) : x.name => x.content }

bucket = var.bucket_name
name = format("%s/slurm-login-script-%s", local.bucket_dir, each.key)
content = each.value.content
name = format("%s/%s", local.bucket_dir, each.key)
content = each.value
}

resource "google_storage_bucket_object" "prolog_scripts" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def run_custom_scripts():
elif lookup().instance_role == "compute":
# compute setup with compute.d and nodeset.d
custom_dirs = [custom_dir / "compute.d", custom_dir / "nodeset.d"]
elif lookup().instance_role == "login":
elif lookup().is_login_node:
# login setup with only login.d
custom_dirs = [custom_dir / "login.d"]
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def resolve_network_storage(nodeset=None):
# On non-controller instances, entries in network_storage could overwrite
# default exports from the controller. Be careful, of course
mounts.update(mounts_by_local(lookup().cfg.network_storage))
if lookup().instance_role in ("login", "controller"):
if lookup().is_login_node or lookup().is_controller:
mounts.update(mounts_by_local(lookup().cfg.login_network_storage))

if nodeset is not None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def reconfigure_slurm():
run("systemctl restart slurmd")
util.run(f"wall '{update_msg}'", timeout=30)
log.debug("Done.")
elif lookup().instance_role_safe == "login":
elif lookup().is_login_node:
log.info("Restarting sackd to make changes take effect.")
run("systemctl restart sackd")
util.run(f"wall '{update_msg}'", timeout=30)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -351,24 +351,23 @@ def hash_file(fullpath: Path) -> str:

def install_custom_scripts(check_hash=False):
"""download custom scripts from gcs bucket"""
role, tokens = lookup().instance_role, []

if role == "controller":
tokens = ["controller", "prolog", "epilog"]
elif role == "compute":
tokens = [
"compute",
"prolog",
"epilog",
f"nodeset-{lookup().node_nodeset_name()}"
]
elif role == "login":
tokens = [f"login-{lookup().login_group_name()}"]

compute_tokens = ["compute", "prolog", "epilog"]
if lookup().instance_role == "compute":
try:
compute_tokens.append(f"nodeset-{lookup().node_nodeset_name()}")
except Exception as e:
log.error(f"Failed to lookup nodeset: {e}")
prefixes = [f"slurm-{tok}-script" for tok in tokens]

prefix_tokens = dict.get(
{
"login": ["login"],
"compute": compute_tokens,
"controller": ["controller", "prolog", "epilog"],
},
lookup().instance_role,
[],
)
prefixes = [f"slurm-{tok}-script" for tok in prefix_tokens]
# TODO: use single `blob_list`, to reduce ~4x number of GCS requests
blobs = list(chain.from_iterable(blob_list(prefix=p) for p in prefixes))

script_pattern = re.compile(r"slurm-(?P<path>\S+)-script-(?P<name>\S+)")
Expand Down Expand Up @@ -1295,6 +1294,10 @@ def instance_role_safe(self):
@property
def is_controller(self):
return self.instance_role_safe == "controller"

# True when this instance's role, as reported by `instance_role_safe`, is "login".
# NOTE(review): `instance_role_safe` is defined outside this view; presumably it is the
# non-raising variant of `instance_role` -- confirm before relying on that.
@property
def is_login_node(self):
return self.instance_role_safe == "login"

@cached_property
def compute(self):
Expand All @@ -1315,6 +1318,10 @@ def hostname_fqdn(self):
def zone(self):
return instance_metadata("zone")

# Return the login group name for this host.
# Asserts that the host is a login node (AssertionError mentioning `self.hostname` otherwise),
# then returns the "nodeset" entry of `self._node_desc(self.hostname)`.
# NOTE(review): `_node_desc` is not shown here; presumably it parses the hostname with
# `node_desc_regex`, so the login group name is whatever that regex captures as `nodeset` -- verify.
def login_group_name(self):
assert self.is_login_node, f"{self.hostname} is not a login node"
return self._node_desc(self.hostname)["nodeset"]

node_desc_regex = re.compile(
r"^(?P<prefix>(?P<cluster>[^\s\-]+)-(?P<nodeset>\S+))-(?P<node>(?P<suffix>\w+)|(?P<range>\[[\d,-]+\]))$"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,6 @@ variable "cloudsql_secret" {
default = null
}

variable "login_startup_scripts" {
description = "List of scripts to be ran on login VM startup."
type = list(object({
filename = string
content = string
}))
default = []
}

variable "login_startup_scripts_timeout" {
description = <<EOD
The timeout (seconds) applied to each script in login_startup_scripts. If
Expand Down Expand Up @@ -155,6 +146,15 @@ variable "compute_startup_scripts" {
default = []
}

# Map keyed by login group; each value is the list of startup scripts (filename + content)
# for that group. The bucket-object resource iterates it as `for group, scripts in var.login_startup_scripts`.
# Defaults to an empty map, i.e. no login startup scripts.
variable "login_startup_scripts" {
description = "List of scripts to be ran on login VM startup in the specific group."
type = map(list(object({
filename = string
content = string
})))
default = {}
}

variable "nodeset_startup_scripts" {
description = "List of scripts to be ran on compute VM startup in the specific nodeset."
type = map(list(object({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ locals {
}
ghpc_startup_script_compute = length(local.daos_ns) > 0 ? [local.daos_install_mount_script, local.ghpc_startup_compute] : [local.ghpc_startup_compute]

login_startup_scripts = { for g in var.login_nodes : g.group_name => local.ghpc_startup_script_login }
nodeset_startup_scripts = { for k, v in local.nodeset_map : k => v.startup_script }
}

Expand Down Expand Up @@ -151,7 +152,7 @@ module "slurm_files" {
nodeset_startup_scripts = local.nodeset_startup_scripts
compute_startup_scripts = local.ghpc_startup_script_compute
compute_startup_scripts_timeout = var.compute_startup_scripts_timeout
login_startup_scripts = local.ghpc_startup_script_login
login_startup_scripts = local.login_startup_scripts
login_startup_scripts_timeout = var.login_startup_scripts_timeout

enable_debug_logging = var.enable_debug_logging
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ variable "bucket_dir" {
variable "login_nodes" {
description = "List of slurm login instance definitions."
type = list(object({
name_prefix = string
group_name = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
Expand Down Expand Up @@ -171,8 +171,8 @@ variable "login_nodes" {
}))
default = []
validation {
condition = length(distinct([for x in var.login_nodes : x.name_prefix])) == length(var.login_nodes)
error_message = "All login_nodes must have a unique name_prefix."
condition = length(distinct([for x in var.login_nodes : x.group_name])) == length(var.login_nodes)
error_message = "All login_nodes must have a unique group name."
}
}

Expand Down
Loading

0 comments on commit 60187c0

Please sign in to comment.