From 1c9e9f0cfdbd12f732dff091e1cb9df92f7b29a5 Mon Sep 17 00:00:00 2001 From: Srdjan Petrovic Date: Mon, 7 Aug 2023 15:42:25 -0700 Subject: [PATCH] A few permissions-related changes. Namely: * Use a Kubernetes Role instead of a ClusterRole. The former is namespace-scoped, so it fits our use-case better. * Don't create a Role per component. Instead, create a single "application" role and use it across the components. * Explicitly specify deletion permissions for rolebindings, as using the "*" doesn't seem to work for some reason. The above changes, in addition to being more sensible, also fix the cleanup issue where the deletion of older version was failing. --- .github/workflows/integration.yml | 2 +- internal/gke/cloud_patcher.go | 7 +- internal/gke/deploy.go | 176 ++++++++++++++++++------------ internal/gke/gke.go | 58 ++++++---- internal/gke/kube_patcher.go | 8 +- 5 files changed, 155 insertions(+), 96 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 50177fd..7be44b8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -16,7 +16,7 @@ name: Integration Test env: CONFIG_FILE: deploy.toml - WAIT_TIMEOUT: "15m" + WAIT_TIMEOUT: "180m" on: schedule: diff --git a/internal/gke/cloud_patcher.go b/internal/gke/cloud_patcher.go index 62bbb69..ac3b283 100644 --- a/internal/gke/cloud_patcher.go +++ b/internal/gke/cloud_patcher.go @@ -224,11 +224,14 @@ func patchSSLCertificate(ctx context.Context, config CloudConfig, opts patchOpti return old, nil }, create: func() error { - _, err := client.Insert(ctx, &computepb.InsertSslCertificateRequest{ + op, err := client.Insert(ctx, &computepb.InsertSslCertificateRequest{ SslCertificateResource: cert, Project: config.Project, }) - return err + if err != nil { + return err + } + return op.Wait(ctx) }, update: func(updateVal interface{}) error { // Neither Update or a Patch API exists. We also cannot delete the diff --git a/internal/gke/deploy.go b/internal/gke/deploy.go index 6620142..521916f 100644 --- a/internal/gke/deploy.go +++ b/internal/gke/deploy.go @@ -83,18 +83,24 @@ const ( // Managed DNS zone for the Service Weaver's internal domain name. managedDNSZoneName = "serviceweaver-internal" - // Names of the GCP IAM service accounts used by various Service Weaver actors. + // Names of the GCP IAM service accounts used by various Service Weaver entities. controllerIAMServiceAccount = "serviceweaver-controller" distributorIAMServiceAccount = "serviceweaver-distributor" managerIAMServiceAccount = "serviceweaver-manager" applicationIAMServiceAccount = "serviceweaver-application" - // Names of the GKE service accounts used by various Service Weaver actors. + // Names of the GKE service accounts used by various Service Weaver entities. controllerKubeServiceAccount = "controller" distributorKubeServiceAccount = "distributor" managerKubeServiceAccount = "manager" spareKubeServiceAccount = "spare" + // Names of the Roles in use by various Service Weaver entities. + controllerKubeRole = "controller" + distributorKubeRole = "distributor" + managerKubeRole = "manager" + applicationKubeRole = "application" + // Various settings for the Service Weaver's Certificate Authority. caName = "serviceweaver-ca" caPoolName = "serviceweaver-ca" @@ -898,25 +904,34 @@ func ensureConfigCluster(ctx context.Context, config CloudConfig, cfg *config.GK if err != nil { return nil, "", err } - if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, controllerKubeServiceAccount, controllerIAMServiceAccount, nil /*labels*/, []rbacv1.PolicyRule{ - { - APIGroups: []string{""}, // Core APIs. - Resources: []string{"configmaps"}, - Verbs: []string{"*"}, - }, - { - APIGroups: []string{"net.gke.io"}, // Networking APIs. - Resources: []string{"serviceimports"}, - Verbs: []string{"*"}, + if err := patchRole(ctx, cluster, patchOptions{}, &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: controllerKubeRole, + Namespace: namespaceName, }, - { - APIGroups: []string{"gateway.networking.k8s.io"}, // Gateway APIs. - Resources: []string{"gateways", "httproutes"}, - Verbs: []string{"*"}, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, // Core APIs. + Resources: []string{"configmaps"}, + Verbs: []string{"*"}, + }, + { + APIGroups: []string{"net.gke.io"}, // Networking APIs. + Resources: []string{"serviceimports"}, + Verbs: []string{"*"}, + }, + { + APIGroups: []string{"gateway.networking.k8s.io"}, // Gateway APIs. + Resources: []string{"gateways", "httproutes"}, + Verbs: []string{"*"}, + }, }, }); err != nil { return nil, "", err } + if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, controllerKubeServiceAccount, controllerIAMServiceAccount, nil /*labels*/, controllerKubeRole); err != nil { + return nil, "", err + } // Ensure Service Weaver priority classes have been created in the cluster. if err := ensureControlPriorityClass(ctx, cluster); err != nil { @@ -954,75 +969,100 @@ func ensureApplicationCluster(ctx context.Context, config CloudConfig, cfg *conf } // Setup the distributor/manager/application service accounts. - if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, distributorKubeServiceAccount, distributorIAMServiceAccount, nil /*labels*/, []rbacv1.PolicyRule{ - { - APIGroups: []string{""}, // Core APIs. - Resources: []string{"services", "configmaps"}, - Verbs: []string{"*"}, + if err := patchRole(ctx, cluster, patchOptions{}, &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: distributorKubeRole, + Namespace: namespaceName, }, - { - APIGroups: []string{"gateway.networking.k8s.io"}, // Gateway APIs. - Resources: []string{"gateways", "httproutes"}, - Verbs: []string{"*"}, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, // Core APIs. + Resources: []string{"services", "configmaps"}, + Verbs: []string{"*"}, + }, + { + APIGroups: []string{"gateway.networking.k8s.io"}, // Gateway APIs. + Resources: []string{"gateways", "httproutes"}, + Verbs: []string{"*"}, + }, }, }); err != nil { return nil, "", err } - if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, managerKubeServiceAccount, managerIAMServiceAccount, nil /*labels*/, []rbacv1.PolicyRule{ - { - APIGroups: []string{""}, // Core APIs. - Resources: []string{ - "pods", "services", "configmaps", "serviceaccounts"}, - Verbs: []string{"*"}, - }, - { - APIGroups: []string{"apps"}, // Application APIs. - Resources: []string{"deployments"}, - Verbs: []string{"*"}, - }, - { - APIGroups: []string{"rbac.authorization.k8s.io"}, // Auth - Resources: []string{"clusterrole", "rolebinding"}, - Verbs: []string{"*"}, - }, - { - APIGroups: []string{"batch"}, // Batch APIs. - Resources: []string{"jobs"}, - Verbs: []string{"*"}, - }, - { - APIGroups: []string{"autoscaling.gke.io"}, // Autoscaling APIs. - Resources: []string{"multidimpodautoscalers"}, - Verbs: []string{"*"}, + if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, distributorKubeServiceAccount, distributorIAMServiceAccount, nil /*labels*/, distributorKubeRole); err != nil { + return nil, "", err + } + if err := patchRole(ctx, cluster, patchOptions{}, &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: managerKubeRole, + Namespace: namespaceName, }, - { - APIGroups: []string{"net.gke.io"}, // Networking APIs. - Resources: []string{"serviceexports"}, - Verbs: []string{"*"}, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, // Core APIs. + Resources: []string{ + "pods", "services", "configmaps", "serviceaccounts"}, + Verbs: []string{"*"}, + }, + { + APIGroups: []string{"apps"}, // Application APIs. + Resources: []string{"deployments"}, + Verbs: []string{"*"}, + }, + { + APIGroups: []string{"rbac.authorization.k8s.io"}, // Auth + Resources: []string{"rolebindings"}, + Verbs: []string{"delete", "deletecollection"}, + }, + { + APIGroups: []string{"batch"}, // Batch APIs. + Resources: []string{"jobs"}, + Verbs: []string{"*"}, + }, + { + APIGroups: []string{"autoscaling.gke.io"}, // Autoscaling APIs. + Resources: []string{"multidimpodautoscalers"}, + Verbs: []string{"*"}, + }, + { + APIGroups: []string{"net.gke.io"}, // Networking APIs. + Resources: []string{"serviceexports"}, + Verbs: []string{"*"}, + }, }, }); err != nil { return nil, "", err } + if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, managerKubeServiceAccount, managerIAMServiceAccount, nil /*labels*/, managerKubeRole); err != nil { + return nil, "", err + } // Setup the service account used for spare capacity. - if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, spareKubeServiceAccount, "" /*iamAccount*/, nil /*labels*/, nil /*policyRules*/); err != nil { + if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, spareKubeServiceAccount, "" /*iamAccount*/, nil /*labels*/, "" /*kubeRole*/); err != nil { return nil, "", err } // Setup service accounts used for application replica sets. - for _, sa := range cfg.ComponentIdentity { - if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, sa, applicationIAMServiceAccount, - map[string]string{ - appKey: name{cfg.Deployment.App.Name}.DNSLabel(), - versionKey: name{cfg.Deployment.Id}.DNSLabel(), + if err := patchRole(ctx, cluster, patchOptions{}, &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: applicationKubeRole, + Namespace: namespaceName, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, // Core APIs. + Resources: []string{"pods"}, + Verbs: []string{"get", "list", "watch"}, }, - []rbacv1.PolicyRule{ - { - APIGroups: []string{""}, // Core APIs. - Resources: []string{"pods"}, - Verbs: []string{"get", "list", "watch"}, - }, - }); err != nil { + }, + }); err != nil { + return nil, "", err + } + for _, sa := range cfg.ComponentIdentity { + if err := ensureKubeServiceAccount(ctx, cluster, nil /*logger*/, sa, applicationIAMServiceAccount, map[string]string{ + appKey: name{cfg.Deployment.App.Name}.DNSLabel(), + versionKey: name{cfg.Deployment.Id}.DNSLabel(), + }, applicationKubeRole); err != nil { return nil, "", err } } diff --git a/internal/gke/gke.go b/internal/gke/gke.go index 2b1ebb7..e3b36ae 100644 --- a/internal/gke/gke.go +++ b/internal/gke/gke.go @@ -28,6 +28,7 @@ import ( "text/template" "time" + compute "cloud.google.com/go/compute/apiv1" "github.com/ServiceWeaver/weaver-gke/internal/config" "github.com/ServiceWeaver/weaver-gke/internal/nanny" "github.com/ServiceWeaver/weaver-gke/internal/proto" @@ -334,8 +335,7 @@ func kill(ctx context.Context, cluster *ClusterInfo, logger *slog.Logger, app, v Resource: "multidimpodautoscalers", }).Namespace(namespaceName), cluster.Clientset.AppsV1().Deployments(namespaceName), - cluster.Clientset.RbacV1().ClusterRoleBindings(), - cluster.Clientset.RbacV1().ClusterRoles(), + cluster.Clientset.RbacV1().RoleBindings(namespaceName), cluster.Clientset.CoreV1().ServiceAccounts(namespaceName), } { opts := metav1.DeleteOptions{} @@ -897,6 +897,7 @@ func updateTrafficRoutes(ctx context.Context, cluster *ClusterInfo, logger *slog } // Delete the old routes that aren't in the new set. + var errs []error for _, route := range oldRoutes.Items { routeName := route.ObjectMeta.Name if _, ok := newRoutes[routeName]; ok { @@ -904,11 +905,19 @@ func updateTrafficRoutes(ctx context.Context, cluster *ClusterInfo, logger *slog } // Delete the old route. if err := cli.Delete(ctx, routeName, metav1.DeleteOptions{}); err != nil { - return fmt.Errorf("error deleting obsolete route %s: %w", routeName, err) + errs = append(errs, fmt.Errorf("error deleting obsolete route %s: %w", routeName, err)) + } + + // Delete the SSL certificate that belongs to the old route hosts. + // If the route re-appears, the certificate will be re-created. + for _, host := range route.Spec.Hostnames { + if err := deleteSSLCertificate(ctx, cluster, logger, string(host)); err != nil { + errs = append(errs, fmt.Errorf("error deleting unused SSL certificate for host %s: %w", host, err)) + } } } - return nil + return errors.Join(errs...) } // updateGlobalExternalGateway updates the global external gateway in the @@ -1015,7 +1024,7 @@ func computeTrafficBackends(isGlobal bool, cluster *ClusterInfo, allocations []* // ensureKubeServiceAccount ensures that a service account with the given // name has been created with the given policy rules, and has been associated // with a given IAM service account, if one was specified. -func ensureKubeServiceAccount(ctx context.Context, cluster *ClusterInfo, logger *slog.Logger, account, iamAccount string, labels map[string]string, policyRules []rbacv1.PolicyRule) error { +func ensureKubeServiceAccount(ctx context.Context, cluster *ClusterInfo, logger *slog.Logger, account, iamAccount string, labels map[string]string, kubeRole string) error { if iamAccount != "" { // Allow the kubernetes service account to access the IAM service // account. @@ -1045,23 +1054,12 @@ func ensureKubeServiceAccount(ctx context.Context, cluster *ClusterInfo, logger return err } - if policyRules == nil { - // No Kubernetes permissions: we're done. + if kubeRole == "" { + // No Kubernetes role to bind: we're done. return nil } - // Create a Kubernetes cluster role with the given permissions. - if err := patchClusterRole(ctx, cluster, patchOptions{}, &rbacv1.ClusterRole{ - ObjectMeta: metav1.ObjectMeta{ - Name: account, - Labels: labels, - }, - Rules: policyRules, - }); err != nil { - return err - } - - // Bind the Kubernetes cluster role to the Kubernetes service account. + // Bind the Kubernetes cluster role to the service account. return patchRoleBinding(ctx, cluster, patchOptions{}, &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ Name: account, @@ -1076,8 +1074,8 @@ func ensureKubeServiceAccount(ctx context.Context, cluster *ClusterInfo, logger }, }, RoleRef: rbacv1.RoleRef{ - Kind: "ClusterRole", - Name: account, + Kind: "Role", + Name: kubeRole, }, }) } @@ -1279,6 +1277,24 @@ func ensureSSLCertificate(ctx context.Context, cluster *ClusterInfo, logger *slo return certName, nil } +// deleteSSLCertificate deletes a Google-managed SSL certificate for the +// given hostname. +func deleteSSLCertificate(ctx context.Context, cluster *ClusterInfo, logger *slog.Logger, host string) error { + certName := name{"serviceweaver", host}.DNSLabel() + client, err := compute.NewSslCertificatesRESTClient(ctx, cluster.CloudConfig.ClientOptions()...) + if err != nil { + return err + } + op, err := client.Delete(ctx, &computepb.DeleteSslCertificateRequest{ + Project: cluster.CloudConfig.Project, + SslCertificate: certName, + }) + if err != nil { + return err + } + return op.Wait(ctx) +} + // ensureTemporarySpareCapacity ensures that a given temporary spare CPU // capacity is available to the given application version in the given cluster // for the given duration [1]. diff --git a/internal/gke/kube_patcher.go b/internal/gke/kube_patcher.go index aef56b7..3b743bb 100644 --- a/internal/gke/kube_patcher.go +++ b/internal/gke/kube_patcher.go @@ -293,12 +293,12 @@ func patchKubeServiceAccount(ctx context.Context, cluster *ClusterInfo, opts pat }.Run(ctx, account) } -// patchClusterRole updates the kubernetes cluser role with the new configuration. -func patchClusterRole(ctx context.Context, cluster *ClusterInfo, opts patchOptions, role *rbacv1.ClusterRole) error { - cli := cluster.Clientset.RbacV1().ClusterRoles() +// patchRole updates the kubernetes role with the new configuration. +func patchRole(ctx context.Context, cluster *ClusterInfo, opts patchOptions, role *rbacv1.Role) error { + cli := cluster.Clientset.RbacV1().Roles(getNamespace(role.ObjectMeta)) return kubePatcher{ cluster: cluster, - desc: "cluster role", + desc: "role", opts: opts, get: func(ctx context.Context) (metav1.Object, error) { return cli.Get(ctx, role.Name, metav1.GetOptions{})