From 10ff44a29157629e36ef15c000e6f4b30798ca73 Mon Sep 17 00:00:00 2001 From: Ed Robinson Date: Fri, 22 Nov 2024 12:24:52 +0000 Subject: [PATCH] Port asg_node_group to v1.25 * We have some legacy clusters on 1.24 that need to be upgraded before extended support ends. * They are still using cluster-autoscaler (not Karpenter) * The simplest upgrade procedure is to forward-port the asg_node_group module. Note: this module adds a node_instance_profile variable, as cluster config no longer exposes an instance profile for the node group. To avoid any issues this needs to be set to the same value as you were using in 1.24 iam_config Also note that the IAM role of nodes in the node group will need to be manually added to the clusters `aws_auth_role_map` as this is no longer defaulted. --- modules/asg_node_group/README.md | 229 ++++++++++++++++++ .../bottlerocket_config.toml.tpl | 14 ++ modules/asg_node_group/cloud_config.tpl | 7 + modules/asg_node_group/main.tf | 205 ++++++++++++++++ modules/asg_node_group/variables.tf | 200 +++++++++++++++ modules/asg_node_group/versions.tf | 15 ++ 6 files changed, 670 insertions(+) create mode 100644 modules/asg_node_group/README.md create mode 100644 modules/asg_node_group/bottlerocket_config.toml.tpl create mode 100644 modules/asg_node_group/cloud_config.tpl create mode 100644 modules/asg_node_group/main.tf create mode 100644 modules/asg_node_group/variables.tf create mode 100644 modules/asg_node_group/versions.tf diff --git a/modules/asg_node_group/README.md b/modules/asg_node_group/README.md new file mode 100644 index 00000000..2d43e544 --- /dev/null +++ b/modules/asg_node_group/README.md @@ -0,0 +1,229 @@ +# asg_node_group + +This module provisions nodes for your cluster by managing AWS auto scaling groups. + +## Features + +* Will manage spot or on demand instances. +* Provisions an auto scaling group per availability zone, to support applications + utilizing EBS volumes via PVC. 
+* Prepares the auto scaling group(s) to be scaled by the cluster autoscaler.
+* Uses the official AWS EKS optimised Amazon Linux AMI
+
+## Usage
+
+```hcl
+module "nodes" {
+  source = "cookpad/eks/aws//modules/asg_node_group"
+
+  cluster_config  = module.cluster.config
+  max_size        = 60
+  instance_family = "memory_optimized"
+  instance_size   = "4xlarge"
+}
+```
+
+### Instance type selection
+
+There are two ways to choose the instance types launched by the autoscaling
+groups:
+
+#### `instance_family` & `instance_size`
+
+The module has 4 preset instance families to choose from (the default is `general_purpose`):
+
+| family | instance types (x86_64) | (arm64) |
+|--------|-------------------------|---------|
+| `memory_optimized` | `r5`, `r5d`, `r5n`, `r5dn`, `r5a`, `r5ad` | `r6g`, `r6gd` |
+| `general_purpose` | `m5`, `m5d`, `m5n`, `m5dn`, `m5a`, `m5ad` | `m6g`, `m6gd` |
+| `compute_optimized` | `c5`, `c5n`, `c5d` | `c6g`, `c6gn`, `c6gd`, `c7g` |
+| `burstable` | `t3`, `t3a` | `t4g` |
+
+This is combined with `instance_size` to choose the instance types that the
+group will launch.
+
+These groups are useful when utilising spot instances to provide diversity to
+avoid the effects of price spikes.
+
+When using on-demand instances, as diversity is not required, only the
+first instance type in a family is used.
+
+e.g.
+```hcl
+module "nodes" {
+  source = "cookpad/eks/aws//modules/asg_node_group"
+
+  cluster_config     = module.cluster.config
+  instance_family    = "compute_optimized"
+  instance_lifecycle = "on_demand"
+}
+```
+
+#### `instance_family` & `instance_types`
+
+Alternatively `instance_types` can be used to provide a list of the exact
+instance types that will be launched, `instance_family` and `instance_size` is
+used in this case to provide part of the ASG name.
+
+e.g.
+```hcl
+module "nodes" {
+  source = "cookpad/eks/aws//modules/asg_node_group"

+  cluster_config  = module.cluster.config
+  max_size        = 16
+  instance_family = "io_optimised"
+  instance_size   = "xlarge"
+  instance_types  = ["i3.xlarge", "i3en.xlarge"]
+}
+```
+
+### GPU Nodes
+
+In order to use a GPU optimised AMI set the `gpu` variable.
+
+It is recommended to set the `k8s.amazonaws.com/accelerator` label to prevent
+the cluster autoscaler from adding too many nodes whilst the GPU driver is
+initialising. See https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler/cloudprovider/aws#gpu-node-groups for more info.
+
+If you are running mixed workloads on your cluster, you could
+add a taint to your GPU nodes to avoid running non GPU workloads on expensive
+GPU instances.
+
+Note: Currently you would need to manually add the appropriate toleration
+to your workloads, as EKS currently doesn't enable the `ExtendedResourceToleration`
+admission controller, see: https://github.com/aws/containers-roadmap/issues/739
+
+```hcl
+module "gpu_nodes" {
+  source = "cookpad/eks/aws//modules/asg_node_group"
+
+  cluster_config = module.cluster.config
+
+  gpu             = true
+  instance_family = "gpu"
+  instance_size   = "2xlarge"
+  instance_types  = ["p3.2xlarge"]
+
+  labels = {
+    "k8s.amazonaws.com/accelerator" = "nvidia-tesla-v100"
+  }
+
+  taints = {
+    "nvidia.com/gpu" = "gpu:NoSchedule"
+  }
+}
+```
+
+### Labels & taints
+
+You can provide kubernetes labels and/or taints for the nodes, to provide some
+control of where your workloads are scheduled.
+
+* https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
+* https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
+
+e.g.
+```hcl
+module "nodes" {
+  source = "cookpad/eks/aws//modules/asg_node_group"
+
+  cluster_config = module.cluster.config
+
+  labels = {
+    "cookpad.com/environment_name" = "production"
+    "cookpad.com/department"       = "machine-learning"
+  }
+
+  taints = {
+    "dedicated" = "gpu:PreferNoSchedule"
+  }
+}
+```
+
+### Volume size
+
+You can configure the root volume size (it defaults to 40 GiB).
+
+e.g.
+
+```hcl
+module "nodes" {
+  source = "cookpad/eks/aws//modules/asg_node_group"
+
+  cluster_config   = module.cluster.config
+  root_volume_size = 10
+}
+
+```
+
+### Zone awareness
+
+The module by default provisions 1 ASG per availability zone so the cluster
+autoscaler can create instances in a particular zone.
+
+If this is not required you can disable this behaviour, and the module will
+create a single ASG that will create instances in any of your cluster's availability
+zones.
+
+e.g.
+
+```hcl
+module "nodes" {
+  source = "cookpad/eks/aws//modules/asg_node_group"
+
+  cluster_config = module.cluster.config
+  zone_awareness = false
+}
+
+```
+
+### Security groups
+
+The module automatically applies the node security group provided by the cluster
+module to each node. This allows access of the nodes to the control plane, and
+intra-cluster communication between pods running on the cluster.
+
+If you need to add any additional security groups, e.g. for ssh access, configure
+`security_groups` with the security group ids.
+
+### SSH key
+
+Set `key_name` to configure a ssh key pair.
+
+### Cloud config
+
+The module will configure the instance user data to use cloud config to add
+each node to the cluster, via the eks bootstrap script, as well as setting the
+instance's Name tag.
+
+If you need to provide any additional cloud config it will be merged,
+see https://cloudinit.readthedocs.io/en/latest/topics/merging.html for more info.
+### Bottlerocket
+
+[Bottlerocket](https://github.com/bottlerocket-os/bottlerocket) is a free and open-source Linux-based operating system meant for hosting containers.
+
+To use bottlerocket set the bottlerocket variable.
+
+```hcl
+module "bottlerocket_nodes" {
+  source = "cookpad/eks/aws//modules/asg_node_group"
+
+  cluster_config = module.cluster.config
+  bottlerocket   = true
+}
+```
+⚠️ Bottlerocket now [supports GPU nodes](https://github.com/bottlerocket-os/bottlerocket/blob/develop/QUICKSTART-EKS.md#aws-k8s--nvidia-variants), set `gpu = true` to enable them. Ensure that you set `instance_types` to a GPU instance type.
+
+📝 If you want to get a shell session on your instances via Bottlerocket's SSM agent
+you will need to attach the `arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore` policy
+to your node instance profile. If you use the `cookpad/eks/aws//modules/iam` module to
+provision your node role, then this is done by default!
+
+### IMDSv2 instead of IMDSv1, to prevent attackers obtaining AWS credentials from nodes
+
+By default, IMDSv2 tokens are required; this is controlled by the `imdsv2_required` variable.
+
+⚠️ If you are using kube2iam, set `imdsv2_required = false` so the instance metadata token stays "optional". See the [terraform IMDSv2 documentation](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/launch_template#metadata-options).
+Once no cluster is using kube2iam, this variable can be removed and the token made unconditionally required.
diff --git a/modules/asg_node_group/bottlerocket_config.toml.tpl b/modules/asg_node_group/bottlerocket_config.toml.tpl new file mode 100644 index 00000000..945b2cab --- /dev/null +++ b/modules/asg_node_group/bottlerocket_config.toml.tpl @@ -0,0 +1,14 @@ +[settings.kubernetes] +cluster-name = "${cluster_name}" +api-server = "${cluster_endpoint}" +cluster-certificate = "${cluster_ca_data}" +[settings.kubernetes.node-labels] +${node_labels} +[settings.kubernetes.node-taints] +${node_taints} +[settings.host-containers.admin] +enabled = ${admin_container_enabled} +superpowered = ${admin_container_superpowered} +%{ if admin_container_source != "" } +source = "${admin_container_source}" +%{ endif } diff --git a/modules/asg_node_group/cloud_config.tpl b/modules/asg_node_group/cloud_config.tpl new file mode 100644 index 00000000..ad0c23ba --- /dev/null +++ b/modules/asg_node_group/cloud_config.tpl @@ -0,0 +1,7 @@ +## template: jinja +#cloud-config +fqdn: eks-node-${cluster_name}-{{ v1.instance_id }} +runcmd: +- [aws, --region={{ v1.region }}, ec2, create-tags, --resources={{ v1.instance_id }}, "--tags=Key=Name,Value=eks-node-${cluster_name}-{{ v1.instance_id }}"] +- [systemctl, restart, docker] +- [/etc/eks/bootstrap.sh, ${cluster_name}, --kubelet-extra-args, '--node-labels=${labels} --register-with-taints="${taints}"'] diff --git a/modules/asg_node_group/main.tf b/modules/asg_node_group/main.tf new file mode 100644 index 00000000..cd7b04ad --- /dev/null +++ b/modules/asg_node_group/main.tf @@ -0,0 +1,205 @@ +locals { + k8s_version = "1.25" + preset_instance_families = { + x86_64__memory_optimized = ["r5", "r5d", "r5n", "r5dn", "r5a", "r5ad"] + x86_64__general_purpose = ["m5", "m5d", "m5n", "m5dn", "m5a", "m5ad"] + x86_64__compute_optimized = ["c5", "c5n", "c5d"] + x86_64__burstable = ["t3", "t3a"] + + arm64__memory_optimized = ["r6g", "r6gd"] + arm64__general_purpose = ["m6g", "m6gd"] + arm64__compute_optimized = ["c6g", "c6gn", "c6gd", "c7g"] + arm64__burstable = ["t4g"] 
+ } + + instance_types = length(var.instance_types) > 0 ? var.instance_types : [for instance_family in local.preset_instance_families["${var.architecture}__${var.instance_family}"] : "${instance_family}.${var.instance_size}"] + instance_overrides = var.instance_lifecycle == "spot" ? local.instance_types : [local.instance_types[0]] + name_prefix = replace(join("-", compact(["eks-node", var.cluster_config.name, var.name, var.instance_family, var.instance_size, var.instance_lifecycle])), "_", "-") + asg_subnets = var.zone_awareness ? { for az, subnet in var.cluster_config.private_subnet_ids : az => [subnet] } : { "multi-zone" = values(var.cluster_config.private_subnet_ids) } + max_size = floor(var.max_size / length(local.asg_subnets)) + min_size = ceil(var.min_size / length(local.asg_subnets)) + root_device_mappings = reverse(tolist(data.aws_ami.image.block_device_mappings))[0] + autoscaler_tags = var.cluster_autoscaler ? { "k8s.io/cluster-autoscaler/enabled" = "true", "k8s.io/cluster-autoscaler/${var.cluster_config.name}" = "owned" } : {} + bottlerocket_tags = var.bottlerocket ? { "Name" = "eks-node-${var.cluster_config.name}" } : {} + tags = merge(var.cluster_config.tags, var.tags, { "kubernetes.io/cluster/${var.cluster_config.name}" = "owned" }, local.autoscaler_tags, local.bottlerocket_tags) + node_group_label = var.name != "" ? 
var.name : local.name_prefix + cloud_config = templatefile( + "${path.module}/cloud_config.tpl", + { + cluster_name = var.cluster_config.name + labels = join(",", [for label, value in local.labels : "${label}=${value}"]) + taints = join(",", [for taint, value_effect in var.taints : "${taint}=${value_effect}"]) + } + ) + bottlerocket_config = templatefile( + "${path.module}/bottlerocket_config.toml.tpl", + { + cluster_name = var.cluster_config.name + cluster_endpoint = var.cluster_config.endpoint + cluster_ca_data = var.cluster_config.ca_data + node_labels = join("\n", [for label, value in local.labels : "\"${label}\" = \"${value}\""]) + node_taints = join("\n", [for taint, value in var.taints : "\"${taint}\" = \"${value}\""]) + admin_container_enabled = var.bottlerocket_admin_container_enabled + admin_container_superpowered = var.bottlerocket_admin_container_superpowered + admin_container_source = var.bottlerocket_admin_container_source + } + ) + + labels = merge( + { "node-group.k8s.cookpad.com/name" = local.node_group_label }, + var.gpu ? { "nvidia.com/gpu" = "true" } : {}, + var.bottlerocket ? { "bottlerocket" = "true" } : {}, + var.labels, + ) +} + +data "assert_test" "node_group_label" { + test = length(local.node_group_label) < 64 + throw = "node-group.k8s.cookpad.com/name label must be 63 characters or less. Set `name` or shorten `cluster-config.name`." +} + +data "aws_ssm_parameter" "image_id" { + name = var.bottlerocket ? "/aws/service/bottlerocket/aws-k8s-${local.k8s_version}${var.gpu ? "-nvidia" : ""}/${var.architecture}/latest/image_id" : "/aws/service/eks/optimized-ami/${local.k8s_version}/amazon-linux-2${var.gpu ? "-gpu" : ""}${var.architecture == "arm64" ? 
"-arm64" : ""}/recommended/image_id" +} + +data "aws_ami" "image" { + owners = ["amazon"] + filter { + name = "image-id" + values = [data.aws_ssm_parameter.image_id.value] + } +} + +data "aws_region" "current" {} + +resource "aws_launch_template" "config" { + image_id = data.aws_ami.image.id + name = local.name_prefix + vpc_security_group_ids = concat([var.cluster_config.node_security_group], var.security_groups) + user_data = var.bottlerocket ? base64gzip(local.bottlerocket_config) : base64gzip(local.cloud_config) + + instance_type = local.instance_types.0 + + iam_instance_profile { + name = var.node_instance_profile + } + + block_device_mappings { + device_name = local.root_device_mappings.device_name + + ebs { + volume_size = var.root_volume_size + volume_type = local.root_device_mappings.ebs.volume_type + snapshot_id = local.root_device_mappings.ebs.snapshot_id + } + } + + metadata_options { + http_endpoint = "enabled" + http_tokens = var.imdsv2_required ? "required" : "optional" + http_put_response_hop_limit = 2 + } + + tag_specifications { + resource_type = "instance" + tags = local.tags + } + + tag_specifications { + resource_type = "volume" + tags = local.tags + } + + tags = local.tags + + key_name = var.key_name +} + +resource "aws_autoscaling_group" "nodes" { + for_each = local.asg_subnets + + name = "${local.name_prefix}-${each.key}" + min_size = local.min_size + max_size = local.max_size + vpc_zone_identifier = each.value + termination_policies = var.termination_policies + enabled_metrics = var.enabled_metrics + wait_for_capacity_timeout = "10m" + + mixed_instances_policy { + launch_template { + launch_template_specification { + launch_template_id = aws_launch_template.config.id + version = "$Latest" + } + + dynamic "override" { + for_each = local.instance_types + content { + instance_type = override.value + } + } + } + + instances_distribution { + on_demand_base_capacity = 0 + on_demand_percentage_above_base_capacity = (var.instance_lifecycle == 
"on_demand" ? 100 : 0)
+      spot_allocation_strategy                 = var.spot_allocation_strategy
+      # NOTE(review): spot_instance_pools is only honoured with the
+      # "lowest-price" allocation strategy — confirm if capacity-optimized is used.
+      spot_instance_pools                      = max(floor(length(local.instance_types) / 2), 2)
+    }
+  }
+
+  tag {
+    key                 = "Role"
+    value               = "eks-node"
+    propagate_at_launch = true
+  }
+
+  # Zone/region hints let the cluster autoscaler scale groups from zero.
+  tag {
+    key                 = "k8s.io/cluster-autoscaler/node-template/label/topology.kubernetes.io/zone"
+    value               = each.key
+    propagate_at_launch = false
+  }
+
+  # Fix: this region tag block was declared twice in the original patch;
+  # duplicate tag keys on an ASG are rejected by the AWS provider.
+  tag {
+    key                 = "k8s.io/cluster-autoscaler/node-template/label/topology.kubernetes.io/region"
+    value               = data.aws_region.current.name
+    propagate_at_launch = false
+  }
+
+  dynamic "tag" {
+    for_each = local.labels
+    content {
+      key                 = "k8s.io/cluster-autoscaler/node-template/label/${tag.key}"
+      value               = tag.value
+      propagate_at_launch = true
+    }
+  }
+
+  dynamic "tag" {
+    for_each = var.taints
+    content {
+      key                 = "k8s.io/cluster-autoscaler/node-template/taint/${tag.key}"
+      value               = tag.value
+      propagate_at_launch = true
+    }
+  }
+
+  dynamic "tag" {
+    for_each = local.tags
+
+    content {
+      key                 = tag.key
+      value               = tag.value
+      propagate_at_launch = false
+    }
+  }
+
+  depends_on = [aws_launch_template.config]
+}
diff --git a/modules/asg_node_group/variables.tf b/modules/asg_node_group/variables.tf
new file mode 100644
index 00000000..3add15b3
--- /dev/null
+++ b/modules/asg_node_group/variables.tf
@@ -0,0 +1,200 @@
+variable "cluster_config" {
+  type = object({
+    name                = string
+    endpoint            = string
+    ca_data             = string
+    vpc_id              = string
+    private_subnet_ids  = map(string)
+    node_security_group = string
+    tags                = map(string)
+  })
+}
+
+variable "name" {
+  type        = string
+  default     = ""
+  description = "An optional identifier for this node group"
+
+  validation {
+    condition     = length(var.name) < 64
+    error_message = "Name must be 63 characters or less."
+  }
+}
+
+variable "zone_awareness" {
+  type        = bool
+  default     = true
+  # Fix: description previously said "a single AZ" — the module creates a single ASG.
+  description = "Should the cluster autoscaler be aware of the AZ it is launching nodes into, if true then one ASG is created per AZ. If false a single ASG spanning all the zones will be created, applications making use of EBS volumes may not work as expected"
+}
+
+variable "root_volume_size" {
+  type        = number
+  default     = 40
+  description = "Volume size for the root partition. Value in GiB."
+}
+
+variable "max_size" {
+  type        = number
+  default     = 12
+  description = "The maximum number of instances that will be launched by this group, if not a multiple of the number of AZs in the group, may be rounded down"
+}
+
+variable "min_size" {
+  type        = number
+  default     = 0
+  description = "The minimum number of instances that will be launched by this group, if not a multiple of the number of AZs in the group, may be rounded up"
+}
+
+variable "instance_size" {
+  type        = string
+  default     = "large"
+  description = "The size of instances in this node group"
+}
+
+variable "instance_family" {
+  type        = string
+  default     = "general_purpose"
+  description = "The family of instances that this group will launch, should be one of: memory_optimized, general_purpose, compute_optimized or burstable. Defaults to general_purpose"
+}
+
+variable "instance_lifecycle" {
+  type        = string
+  default     = "spot"
+  description = "The lifecycle of instances managed by this group, should be 'spot' or 'on_demand'."
+}
+
+variable "spot_allocation_strategy" {
+  type        = string
+  default     = "lowest-price"
+  description = "How to allocate capacity across the Spot pools. Valid values: 'lowest-price' or 'capacity-optimized'."
+}
+
+variable "architecture" {
+  type        = string
+  default     = "x86_64"
+  description = "CPU Architecture to launch. This parameter is used for predefined instance type set selection and AMI selection. Valid values: 'x86_64' or 'arm64'."
+ + validation { + condition = var.architecture == "x86_64" || var.architecture == "arm64" + error_message = "Architecture must be 'x86_64' or 'arm64'." + } +} + +variable "gpu" { + type = bool + default = false + description = "Set if using GPU instance types" +} + +variable "labels" { + type = map(string) + default = {} + description = "Labels that will be added to the kubernetes node. A qualified name must consist of alphanumeric characters, '-', '_' or '.', and must start and end with an alphanumeric character (e.g. 'MyName', or 'my.name', or '123-abc', regex used for validation is '([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]') with an optional DNS subdomain prefix and '/' (e.g. 'example.com/MyName')" + # TODO: add custom validation rule once the feature is stable https://www.terraform.io/docs/configuration/variables.html#custom-validation-rules + + validation { + condition = alltrue([for s in values(var.labels) : length(s) < 64]) + error_message = "Labels must be 63 characters or less." + } +} + +variable "taints" { + type = map(string) + default = {} + description = "taints that will be added to the kubernetes node" +} + +variable "tags" { + type = map(string) + default = {} + description = "A map of additional tags to apply to this groups AWS resources" +} + +variable "instance_types" { + type = list(string) + description = <