Skip to content

Commit

Permalink
update modules to accept kubectl provider from root
Browse files Browse the repository at this point in the history
  • Loading branch information
annuay-google committed Oct 3, 2024
1 parent 7451cdc commit e4390e3
Show file tree
Hide file tree
Showing 15 changed files with 92 additions and 72 deletions.
1 change: 1 addition & 0 deletions modules/compute/gke-node-pool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ limitations under the License.
| <a name="input_disk_type"></a> [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no |
| <a name="input_enable_gcfs"></a> [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no |
| <a name="input_enable_secure_boot"></a> [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no |
| <a name="input_gke_version"></a> [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes |
| <a name="input_guest_accelerator"></a> [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. | <pre>list(object({<br/> type = optional(string)<br/> count = optional(number, 0)<br/> gpu_driver_installation_config = optional(list(object({<br/> gpu_driver_version = string<br/> })))<br/> gpu_partition_size = optional(string)<br/> gpu_sharing_config = optional(list(object({<br/> gpu_sharing_strategy = optional(string)<br/> max_shared_clients_per_gpu = optional(number)<br/> })))<br/> }))</pre> | `null` | no |
| <a name="input_host_maintenance_interval"></a> [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no |
| <a name="input_image_type"></a> [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no |
Expand Down
28 changes: 28 additions & 0 deletions modules/compute/gke-node-pool/gpu_direct.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ locals {
updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml")
rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9
min_additional_networks = 4
major_minor_version_acceptable_map = {
"1.27" = "1.27.7-gke.1121000"
"1.28" = "1.28.8-gke.1095000"
"1.29" = "1.29.3-gke.1093000"
"1.30" = "1.30.2-gke.1023000"
}
}
"a3-megagpu-8g" = {
# Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines
Expand All @@ -43,10 +49,25 @@ locals {
updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml")
rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4
min_additional_networks = 8
major_minor_version_acceptable_map = {
"1.28" = "1.28.9-gke.1250000"
"1.29" = "1.29.4-gke.1542000"
"1.30" = "1.30.4-gke.1129000"
}
}
}

# Minimum number of additional VPC networks recommended for the selected
# machine type; falls back to 0 when the machine type has no entry in
# gpu_direct_settings.
min_additional_networks = try(local.gpu_direct_settings[var.machine_type].min_additional_networks, 0)

# GKE version format: 1.X.Y-gke.Z , regex output: ["1.X" , "Y", "Z"]
gke_version_regex = "(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)"

gke_version_parts = regex(local.gke_version_regex, var.gke_version)
# NOTE: despite the name, this holds the "major.minor" prefix (e.g. "1.29"),
# which is the key format used by major_minor_version_acceptable_map.
gke_version_major = local.gke_version_parts[0]

# Look the map up in gpu_direct_settings (keyed by machine type), exactly like
# min_additional_networks above. Indexing the singular gpu_direct_setting
# object by machine type always failed, and try() silently turned that into
# null, which disabled the version check for every machine type.
major_minor_version_acceptable_map = try(local.gpu_direct_settings[var.machine_type].major_minor_version_acceptable_map, null)

# Minimum acceptable full version for the cluster's major.minor release. When
# the machine type has no map, or the release is not listed in it, fall back
# to a floor version that every cluster satisfies.
minor_version_acceptable = try(contains(keys(local.major_minor_version_acceptable_map), local.gke_version_major), false) ? local.major_minor_version_acceptable_map[local.gke_version_major] : "1.0.0-gke.0"
minor_version_acceptable_parts = regex(local.gke_version_regex, local.minor_version_acceptable)

# Compatible when (patch, gke build) of the cluster version is >= that of the
# minimum acceptable version. regex() yields strings, so convert explicitly to
# numbers to guarantee a numeric (not textual) comparison.
gke_gpudirect_compatible = (
  tonumber(local.gke_version_parts[1]) > tonumber(local.minor_version_acceptable_parts[1]) ||
  (
    tonumber(local.gke_version_parts[1]) == tonumber(local.minor_version_acceptable_parts[1]) &&
    tonumber(local.gke_version_parts[2]) >= tonumber(local.minor_version_acceptable_parts[2])
  )
)
}

check "gpu_direct_check_multi_vpc" {
Expand All @@ -55,3 +76,10 @@ check "gpu_direct_check_multi_vpc" {
error_message = "To achieve optimal performance for ${var.machine_type} machine, at least ${local.min_additional_networks} additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as ${local.min_additional_networks}"
}
}

# Non-blocking assertion: reports when the cluster's GKE version is below the
# minimum computed for GPUDirect on the selected machine type
# (see local.gke_gpudirect_compatible).
check "gke_version_requirements" {
  assert {
    condition     = local.gke_gpudirect_compatible
    error_message = "GPUDirect is not supported on GKE version ${var.gke_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements"
  }
}
3 changes: 0 additions & 3 deletions modules/compute/gke-node-pool/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,6 @@ resource "null_resource" "enable_tcpxo_in_workload" {
module "kubectl_apply" {
source = "../../management/kubectl-apply"

cluster_id = var.cluster_id
project_id = var.project_id

apply_manifests = flatten([
for manifest in local.gpu_direct_setting.gpu_direct_manifests : [
{
Expand Down
5 changes: 5 additions & 0 deletions modules/compute/gke-node-pool/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -360,3 +360,8 @@ variable "initial_node_count" {
type = number
default = null
}

variable "gke_version" {
  description = "GKE version"
  type        = string

  # This module parses the value with regex() using the pattern
  # 1.X.Y-gke.Z. Validate the shape here so a malformed value is reported
  # against this input with an actionable message instead of as a regex
  # failure deep inside the locals.
  validation {
    condition     = can(regex("(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)", var.gke_version))
    error_message = "The gke_version value must be a full GKE version of the form 1.X.Y-gke.Z, for example 1.30.4-gke.1129000."
  }
}
13 changes: 2 additions & 11 deletions modules/management/kubectl-apply/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,11 @@ limitations under the License.
| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3 |
| <a name="requirement_google"></a> [google](#requirement\_google) | > 5.0 |
| <a name="requirement_http"></a> [http](#requirement\_http) | ~> 3.0 |
| <a name="requirement_kubectl"></a> [kubectl](#requirement\_kubectl) | >= 1.7.0 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | > 5.0 |
No providers.

## Modules

Expand All @@ -107,20 +103,15 @@ limitations under the License.

## Resources

| Name | Type |
|------|------|
| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source |
| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source |
No resources.

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md). | <pre>list(object({<br/> content = optional(string, null)<br/> source = optional(string, null)<br/> template_vars = optional(map(any), null)<br/> server_side_apply = optional(bool, false)<br/> wait_for_rollout = optional(bool, true)<br/> }))</pre> | `[]` | no |
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects/<project\_id>/locations/<region>/clusters/<name>. | `string` | n/a | yes |
| <a name="input_jobset"></a> [jobset](#input\_jobset) | Install [Jobset](https://github.com/kubernetes-sigs/jobset) which manages a group of K8s [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) as a unit. | <pre>object({<br/> install = optional(bool, false)<br/> version = optional(string, "v0.5.2")<br/> })</pre> | `{}` | no |
| <a name="input_kueue"></a> [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. | <pre>object({<br/> install = optional(bool, false)<br/> version = optional(string, "v0.8.1")<br/> config_path = optional(string, null)<br/> })</pre> | `{}` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID that hosts the gke cluster. | `string` | n/a | yes |

## Outputs

Expand Down
25 changes: 4 additions & 21 deletions modules/management/kubectl-apply/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@
*/

locals {
cluster_id_parts = split("/", var.cluster_id)
cluster_name = local.cluster_id_parts[5]
cluster_location = local.cluster_id_parts[3]
project_id = var.project_id != null ? var.project_id : local.cluster_id_parts[1]

apply_manifests_map = tomap({
for index, manifest in var.apply_manifests : index => manifest
})
Expand All @@ -30,14 +25,6 @@ locals {
jobset_install_source = format("${path.module}/manifests/jobset-%s.yaml", try(var.jobset.version, ""))
}

data "google_container_cluster" "gke_cluster" {
project = local.project_id
name = local.cluster_name
location = local.cluster_location
}

data "google_client_config" "default" {}

module "kubectl_apply_manifests" {
for_each = local.apply_manifests_map
source = "./kubectl"
Expand All @@ -49,8 +36,7 @@ module "kubectl_apply_manifests" {
wait_for_rollout = each.value.wait_for_rollout

providers = {
kubectl = kubectl
http = http.h
http = http.h
}
}

Expand All @@ -60,8 +46,7 @@ module "install_kueue" {
server_side_apply = true

providers = {
kubectl = kubectl
http = http.h
http = http.h
}
}

Expand All @@ -71,8 +56,7 @@ module "install_jobset" {
server_side_apply = true

providers = {
kubectl = kubectl
http = http.h
http = http.h
}
}

Expand All @@ -85,7 +69,6 @@ module "configure_kueue" {
wait_for_rollout = true

providers = {
kubectl = kubectl
http = http.h
http = http.h
}
}
8 changes: 0 additions & 8 deletions modules/management/kubectl-apply/providers.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,6 @@
* limitations under the License.
*/

provider "kubectl" {
host = "https://${data.google_container_cluster.gke_cluster.endpoint}"
token = data.google_client_config.default.access_token
cluster_ca_certificate = base64decode(data.google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate)
load_config_file = false
apply_retry_count = 15 # Terraform may apply resources in parallel, leading to potential dependency issues. This retry mechanism ensures that if a resource's dependencies aren't ready, Terraform will attempt to apply it again.
}

# Aliased configuration of the http provider. The module blocks in this module
# hand it to their child modules through `providers = { http = http.h }`.
provider "http" {
alias = "h"
}
11 changes: 0 additions & 11 deletions modules/management/kubectl-apply/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,6 @@
* limitations under the License.
*/

variable "project_id" {
description = "The project ID that hosts the gke cluster."
type = string
}

variable "cluster_id" {
description = "An identifier for the gke cluster resource with format projects/<project_id>/locations/<region>/clusters/<name>."
type = string
nullable = false
}

variable "apply_manifests" {
description = "A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md)."
type = list(object({
Expand Down
12 changes: 0 additions & 12 deletions modules/management/kubectl-apply/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,11 @@

terraform {
required_providers {
google = {
source = "hashicorp/google"
version = "> 5.0"
}
kubectl = {
source = "gavinbunney/kubectl"
version = ">= 1.7.0"
}
http = {
source = "hashicorp/http"
version = "~> 3.0"
}
}

provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:kubectl-apply/v1.37.2"
}

required_version = ">= 1.3"
}
4 changes: 4 additions & 0 deletions modules/scheduler/gke-cluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,12 @@ limitations under the License.

| Name | Description |
|------|-------------|
| <a name="output_access_token"></a> [access\_token](#output\_access\_token) | Google client config access token. |
| <a name="output_cluster_ca_certificate"></a> [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | GKE cluster CA certificate. |
| <a name="output_cluster_id"></a> [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. |
| <a name="output_gke_cluster_endpoint"></a> [gke\_cluster\_endpoint](#output\_gke\_cluster\_endpoint) | GKE cluster endpoint. |
| <a name="output_gke_cluster_exists"></a> [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. |
| <a name="output_gke_version"></a> [gke\_version](#output\_gke\_version) | GKE cluster's version. |
| <a name="output_instructions"></a> [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. |
| <a name="output_k8s_service_account_name"></a> [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. |
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
3 changes: 0 additions & 3 deletions modules/scheduler/gke-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -335,9 +335,6 @@ module "workload_identity" {
module "kubectl_apply" {
source = "../../management/kubectl-apply"

cluster_id = google_container_cluster.gke_cluster.id
project_id = var.project_id

apply_manifests = flatten([
for idx, network_info in var.additional_networks : [
{
Expand Down
20 changes: 20 additions & 0 deletions modules/scheduler/gke-cluster/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,23 @@ output "k8s_service_account_name" {
description = "Name of k8s service account."
value = one(module.workload_identity[*].k8s_service_account_name)
}

# Surfaces the master_version attribute of the cluster managed by this module.
output "gke_version" {
  description = "GKE cluster's version."
  value       = google_container_cluster.gke_cluster.master_version
}

# Surfaces the endpoint attribute of the cluster managed by this module.
output "gke_cluster_endpoint" {
  description = "GKE cluster endpoint."
  value       = google_container_cluster.gke_cluster.endpoint
}

# Surfaces the CA certificate from the first master_auth entry of the cluster.
# The value is passed through unchanged (no decoding is done here).
output "cluster_ca_certificate" {
  description = "GKE cluster CA certificate."
  value       = google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate
}

# Access token of the credentials used by the google provider, exported so a
# caller can authenticate other providers against the cluster.
output "access_token" {
  description = "Google client config access token."
  value       = data.google_client_config.default.access_token
  # The token is a credential: mark it sensitive so Terraform redacts it from
  # plan/apply output and CLI output listings.
  sensitive = true
}
5 changes: 5 additions & 0 deletions modules/scheduler/pre-existing-gke-cluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ limitations under the License.

| Name | Type |
|------|------|
| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source |
| [google_container_cluster.existing_gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source |

## Inputs
Expand All @@ -109,6 +110,10 @@ limitations under the License.

| Name | Description |
|------|-------------|
| <a name="output_access_token"></a> [access\_token](#output\_access\_token) | Google client config access token. |
| <a name="output_cluster_ca_certificate"></a> [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | GKE cluster CA certificate. |
| <a name="output_cluster_id"></a> [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. |
| <a name="output_gke_cluster_endpoint"></a> [gke\_cluster\_endpoint](#output\_gke\_cluster\_endpoint) | GKE cluster endpoint. |
| <a name="output_gke_cluster_exists"></a> [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. |
| <a name="output_gke_version"></a> [gke\_version](#output\_gke\_version) | GKE cluster's version. |
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
5 changes: 2 additions & 3 deletions modules/scheduler/pre-existing-gke-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@ data "google_container_cluster" "existing_gke_cluster" {
location = var.region
}

# Client configuration of the google provider; its access_token attribute is
# what this module exports through the access_token output.
data "google_client_config" "default" {}

module "kubectl_apply" {
source = "../../management/kubectl-apply" # can point to github

cluster_id = data.google_container_cluster.existing_gke_cluster.id
project_id = var.project_id

apply_manifests = flatten([
for idx, network_info in var.additional_networks : [
{
Expand Down
21 changes: 21 additions & 0 deletions modules/scheduler/pre-existing-gke-cluster/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,24 @@ output "gke_cluster_exists" {
data.google_container_cluster.existing_gke_cluster
]
}

# Surfaces the master_version attribute read from the pre-existing cluster.
output "gke_version" {
  description = "GKE cluster's version."
  value       = data.google_container_cluster.existing_gke_cluster.master_version
}


# Surfaces the endpoint attribute read from the pre-existing cluster.
output "gke_cluster_endpoint" {
  description = "GKE cluster endpoint."
  value       = data.google_container_cluster.existing_gke_cluster.endpoint
}

# Surfaces the CA certificate from the first master_auth entry of the
# pre-existing cluster. The value is passed through unchanged.
output "cluster_ca_certificate" {
  description = "GKE cluster CA certificate."
  value       = data.google_container_cluster.existing_gke_cluster.master_auth[0].cluster_ca_certificate
}

# Access token of the credentials used by the google provider, exported so a
# caller can authenticate other providers against the cluster.
output "access_token" {
  description = "Google client config access token."
  value       = data.google_client_config.default.access_token
  # The token is a credential: mark it sensitive so Terraform redacts it from
  # plan/apply output and CLI output listings.
  sensitive = true
}

0 comments on commit e4390e3

Please sign in to comment.