Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: effective tls policies reconciler #927

Merged
merged 15 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
393 changes: 393 additions & 0 deletions controllers/effective_tls_policies_reconciler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,393 @@
package controllers

import (
"context"
"errors"
"reflect"
"sync"

certmanv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1"
"github.com/kuadrant/policy-machinery/controller"
"github.com/kuadrant/policy-machinery/machinery"
"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/validation/field"
"k8s.io/client-go/dynamic"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
crlog "sigs.k8s.io/controller-runtime/pkg/log"
gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1"

kuadrantv1alpha1 "github.com/kuadrant/kuadrant-operator/api/v1alpha1"
)

type EffectiveTLSPoliciesReconciler struct {
client *dynamic.DynamicClient
scheme *runtime.Scheme
}

func NewEffectiveTLSPoliciesReconciler(client *dynamic.DynamicClient, scheme *runtime.Scheme) *EffectiveTLSPoliciesReconciler {
return &EffectiveTLSPoliciesReconciler{client: client, scheme: scheme}
}

func (t *EffectiveTLSPoliciesReconciler) Subscription() *controller.Subscription {
return &controller.Subscription{
Events: []controller.ResourceEventMatcher{
{Kind: &machinery.GatewayGroupKind},
{Kind: &kuadrantv1alpha1.TLSPolicyGroupKind},
{Kind: &CertManagerCertificateKind},
},
ReconcileFunc: t.Reconcile,
}
}

//+kubebuilder:rbac:groups=kuadrant.io,resources=tlspolicies,verbs=get;list;watch;update;patch;delete
//+kubebuilder:rbac:groups=kuadrant.io,resources=tlspolicies/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=kuadrant.io,resources=tlspolicies/finalizers,verbs=update
//+kubebuilder:rbac:groups="cert-manager.io",resources=issuers,verbs=get;list;watch;
//+kubebuilder:rbac:groups="cert-manager.io",resources=clusterissuers,verbs=get;list;watch;
//+kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch
//+kubebuilder:rbac:groups="cert-manager.io",resources=certificates,verbs=get;list;watch;create;update;patch;delete

func (t *EffectiveTLSPoliciesReconciler) Reconcile(ctx context.Context, _ []controller.ResourceEvent, topology *machinery.Topology, _ error, s *sync.Map) error {
logger := controller.LoggerFromContext(ctx).WithName("EffectiveTLSPoliciesReconciler").WithName("Reconcile")

listeners := topology.Targetables().Items(func(object machinery.Object) bool {
_, ok := object.(*machinery.Listener)
return ok
})

// Get all certs in topology for comparison with expected certs to determine orphaned certs later
// Only certs owned by TLSPolicies should be in the topology - no need to check again
certs := lo.FilterMap(topology.Objects().Items(), func(item machinery.Object, index int) (*certmanv1.Certificate, bool) {
r, ok := item.(*controller.RuntimeObject)
if !ok {
return nil, false
}
c, ok := r.Object.(*certmanv1.Certificate)

return c, ok
})

var expectedCerts []*certmanv1.Certificate
filterForTLSPolicies := func(p machinery.Policy, _ int) bool {
_, ok := p.(*kuadrantv1alpha1.TLSPolicy)
return ok
}

for _, listener := range listeners {
l := listener.(*machinery.Listener)

policies := lo.Filter(l.Policies(), filterForTLSPolicies)

if len(policies) == 0 {
policies = lo.Filter(l.Gateway.Policies(), filterForTLSPolicies)
}

for _, p := range policies {
policy := p.(*kuadrantv1alpha1.TLSPolicy)
KevFan marked this conversation as resolved.
Show resolved Hide resolved

// Policy is deleted
if policy.DeletionTimestamp != nil {
logger.V(1).Info("policy is marked for deletion, nothing to do", "name", policy.Name, "namespace", policy.Namespace, "uid", policy.GetUID())
continue
}

// Policy is not valid
isValid, _ := IsTLSPolicyValid(ctx, s, policy)
if !isValid {
logger.V(1).Info("deleting certs for invalid policy", "name", policy.Name, "namespace", policy.Namespace, "uid", policy.GetUID())
if err := t.deleteCertificatesForTargetable(ctx, topology, l); err != nil {
logger.Error(err, "unable to delete certs for invalid policy", "name", policy.Name, "namespace", policy.Namespace, "uid", policy.GetUID())
}
continue
}

// Policy is valid
// Need to use Gateway as listener hosts can be merged into a singular cert if using the same cert reference
expectedCertificates := expectedCertificatesForGateway(ctx, l.Gateway.Gateway, policy)

for _, cert := range expectedCertificates {
resource := t.client.Resource(CertManagerCertificatesResource).Namespace(cert.GetNamespace())

// Check is cert already in topology
objs := topology.Objects().Children(l)
obj, ok := lo.Find(objs, func(o machinery.Object) bool {
return o.GroupVersionKind().GroupKind() == CertManagerCertificateKind && o.GetNamespace() == cert.GetNamespace() && o.GetName() == cert.GetName()
})

// Create
if !ok {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we be more positive in our checks, doing if ok instead of `if !ok. I also wonder if it is better to have the more common action listed first. So if we do more creates that should be listed first but if we do more creates that should be.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure but whether to do !ok or ok first depends also on whether we create or update first, so if we want to be more positive in our check here if ok, this would mean update is first 🤔

Not sure which action do we expect more of. Maybe @mikenairn can answer on this

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't expect too many updates of a Certificate resource after creation. I don't have strong opinions on this, but for what its worth i did if ok in the dnspolicy equivalent task, and deal with updates first.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if ok { update } / if !ok { create }. Does it matter? The number of jump instructions in the final compiled code is literally the same I think.

If !ok feels like being negative (it doesn't to me BTW), then perhaps we could rename the variable to missing. Is it better? I.e. if missing { create }.

IMO, if it doesn't have any obvious downside in performance, the important thing is that the code is readable. In my head (and it could be only me), I read this in terms of chronology and progressive use cases.

The first thing that happens is having nothing, then:

  1. user creates the network resources
  2. user creates the policies
  3. the operator creates the internal resources (for the first time after seeing the policies)
  4. user adjusts the policies and/or network resources
  5. operator updates the internal resources accordingly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks guys 👍 I don't think it matters in fairness unless people really feel it hinders readability. My preference is to keep it as is tbh as I also prefer dealing with the create first followed by update action. If we feel strongly about this, maybe this can be done in a separate where this can be dealt with in the entire codebase

expectedCerts = append(expectedCerts, cert)
if err := controllerutil.SetControllerReference(policy, cert, t.scheme); err != nil {
logger.Error(err, "failed to set owner reference on certificate", "name", policy.Name, "namespace", policy.Namespace, "uid", policy.GetUID())
continue
}

un, err := controller.Destruct(cert)
if err != nil {
logger.Error(err, "unable to destruct cert")
continue
}
_, err = resource.Create(ctx, un, metav1.CreateOptions{})
if err != nil && !apierrors.IsAlreadyExists(err) {
logger.Error(err, "unable to create certificate", "name", policy.Name, "namespace", policy.Namespace, "uid", policy.GetUID())
}

continue
}

// Update
tCert := obj.(*controller.RuntimeObject).Object.(*certmanv1.Certificate)
expectedCerts = append(expectedCerts, tCert)
if reflect.DeepEqual(tCert.Spec, cert.Spec) {
logger.V(1).Info("skipping update, cert specs are the same, nothing to do")
continue
}

tCert.Spec = cert.Spec
un, err := controller.Destruct(tCert)
if err != nil {
logger.Error(err, "unable to destruct cert")
continue
}
_, err = resource.Update(ctx, un, metav1.UpdateOptions{})
if err != nil {
logger.Error(err, "unable to update certificate", "policy", policy.Name)
}
}
}
}

// Clean up orphaned certs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cleaning up of orphaned certs gives me an odd feeling, which I have yet to put my finger on. Had the same feeling with the DNS records. The question I ask find asking my self is what does the cleaning up of orphaned certs have to do with TLS polices and their reconcile. I have this feeling that this clean up should be structured as a Task in the Workflow over being part of the TLS policy reconcile. There seems to be a mixing of concerns. I understand how this mixing of concerns is ingrained in us from using the controller runtime. This currently a feeling that this is a code smell, but yet I don't have a good though on what to do about it.

One thing that I don't know at time of writing is how do certs become orphaned.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is because there is a indirect link between policies and the subresouce (certificates in this case) and so gives the expected state of the cluster. Though true, this probably can be done as a separate task, that I can look into if you feel strongly about this.

A cert can be orphaned when a gateway listener is removed, or is the target ref of a tls policy is changed to another gateway. In both of these cases the cert(s) is created and will be orphaned since the policy still exists and still valid

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed we need to cleanup certs that remained orphan due to, e.g., a listener being removed from a gateway ceteris paribus.

Let's keep this as-is now since it does the job. Maybe in the future we can look for this pattern across the reconcilers (not only this one) and try to improve them all by having "janitors" as separate tasks that can run in parallel to other tasks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am happy with leaving this for now. It is some thing that can be done in the future.

uniqueExpectedCerts := lo.UniqBy(expectedCerts, func(item *certmanv1.Certificate) types.UID {
return item.GetUID()
})
orphanedCerts, _ := lo.Difference(certs, uniqueExpectedCerts)
for _, orphanedCert := range orphanedCerts {
resource := t.client.Resource(CertManagerCertificatesResource).Namespace(orphanedCert.GetNamespace())
if err := resource.Delete(ctx, orphanedCert.Name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "unable to delete orphaned certificate", "name", orphanedCert.GetName(), "namespace", orphanedCert.GetNamespace(), "uid", orphanedCert.GetUID())
continue
}
}

return nil
}

func (t *EffectiveTLSPoliciesReconciler) deleteCertificatesForTargetable(ctx context.Context, topology *machinery.Topology, target machinery.Targetable) error {
children := topology.Objects().Children(target)

certs := lo.FilterMap(children, func(item machinery.Object, index int) (*certmanv1.Certificate, bool) {
r, ok := item.(*controller.RuntimeObject)
if !ok {
return nil, false
}
c, ok := r.Object.(*certmanv1.Certificate)

return c, ok
})

var deletionErr error
for _, cert := range certs {
resource := t.client.Resource(CertManagerCertificatesResource).Namespace(cert.GetNamespace())

if err := resource.Delete(ctx, cert.Name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
deletionErr = errors.Join(err)
}
}

return deletionErr
}

func expectedCertificatesForGateway(ctx context.Context, gateway *gatewayapiv1.Gateway, tlsPolicy *kuadrantv1alpha1.TLSPolicy) []*certmanv1.Certificate {
log := crlog.FromContext(ctx)

tlsHosts := make(map[corev1.ObjectReference][]string)
for i, l := range gateway.Spec.Listeners {
hostname := "*"
if l.Hostname != nil {
hostname = string(*l.Hostname)
}

err := validateGatewayListenerBlock(field.NewPath("spec", "listeners").Index(i), l, gateway).ToAggregate()
if err != nil {
log.Info("Skipped a listener block: " + err.Error())
continue
}

for _, certRef := range l.TLS.CertificateRefs {
secretRef := corev1.ObjectReference{
Name: string(certRef.Name),
}
if certRef.Namespace != nil {
secretRef.Namespace = string(*certRef.Namespace)
} else {
secretRef.Namespace = gateway.GetNamespace()
}
// Gateway API hostname explicitly disallows IP addresses, so this
// should be OK.
tlsHosts[secretRef] = append(tlsHosts[secretRef], hostname)
}
}

certs := make([]*certmanv1.Certificate, 0, len(tlsHosts))
for secretRef, hosts := range tlsHosts {
certs = append(certs, buildCertManagerCertificate(tlsPolicy, secretRef, hosts))
}
return certs
}

func expectedCertificatesForListener(l *machinery.Listener, tlsPolicy *kuadrantv1alpha1.TLSPolicy) []*certmanv1.Certificate {
tlsHosts := make(map[corev1.ObjectReference][]string)

hostname := "*"
if l.Hostname != nil {
hostname = string(*l.Hostname)
}

for _, certRef := range l.TLS.CertificateRefs {
secretRef := corev1.ObjectReference{
Name: string(certRef.Name),
}
if certRef.Namespace != nil {
secretRef.Namespace = string(*certRef.Namespace)
} else {
secretRef.Namespace = l.GetNamespace()
}
// Gateway API hostname explicitly disallows IP addresses, so this
// should be OK.
tlsHosts[secretRef] = append(tlsHosts[secretRef], hostname)
}

certs := make([]*certmanv1.Certificate, 0, len(tlsHosts))
for secretRef, hosts := range tlsHosts {
certs = append(certs, buildCertManagerCertificate(tlsPolicy, secretRef, hosts))
}
return certs
}

func buildCertManagerCertificate(tlsPolicy *kuadrantv1alpha1.TLSPolicy, secretRef corev1.ObjectReference, hosts []string) *certmanv1.Certificate {
crt := &certmanv1.Certificate{
ObjectMeta: metav1.ObjectMeta{
Name: secretRef.Name,
Namespace: secretRef.Namespace,
},
TypeMeta: metav1.TypeMeta{
Kind: certmanv1.CertificateKind,
APIVersion: certmanv1.SchemeGroupVersion.String(),
},
Spec: certmanv1.CertificateSpec{
DNSNames: hosts,
SecretName: secretRef.Name,
IssuerRef: tlsPolicy.Spec.IssuerRef,
Usages: certmanv1.DefaultKeyUsages(),
},
}
translatePolicy(crt, tlsPolicy.Spec)
return crt
}

// https://cert-manager.io/docs/usage/gateway/#supported-annotations
// Helper functions largely based on cert manager https://github.com/cert-manager/cert-manager/blob/master/pkg/controller/certificate-shim/sync.go

func validateGatewayListenerBlock(path *field.Path, l gatewayapiv1.Listener, ingLike metav1.Object) field.ErrorList {
var errs field.ErrorList

if l.Hostname == nil || *l.Hostname == "" {
errs = append(errs, field.Required(path.Child("hostname"), "the hostname cannot be empty"))
}

if l.TLS == nil {
errs = append(errs, field.Required(path.Child("tls"), "the TLS block cannot be empty"))
return errs
}

if len(l.TLS.CertificateRefs) == 0 {
errs = append(errs, field.Required(path.Child("tls").Child("certificateRef"),
"listener has no certificateRefs"))
} else {
// check that each CertificateRef is valid
for i, secretRef := range l.TLS.CertificateRefs {
if *secretRef.Group != "core" && *secretRef.Group != "" {
errs = append(errs, field.NotSupported(path.Child("tls").Child("certificateRef").Index(i).Child("group"),
*secretRef.Group, []string{"core", ""}))
}

if *secretRef.Kind != "Secret" && *secretRef.Kind != "" {
errs = append(errs, field.NotSupported(path.Child("tls").Child("certificateRef").Index(i).Child("kind"),
*secretRef.Kind, []string{"Secret", ""}))
}

if secretRef.Namespace != nil && string(*secretRef.Namespace) != ingLike.GetNamespace() {
errs = append(errs, field.Invalid(path.Child("tls").Child("certificateRef").Index(i).Child("namespace"),
*secretRef.Namespace, "cross-namespace secret references are not allowed in listeners"))
}
}
}

if l.TLS.Mode == nil {
errs = append(errs, field.Required(path.Child("tls").Child("mode"),
"the mode field is required"))
} else {
if *l.TLS.Mode != gatewayapiv1.TLSModeTerminate {
errs = append(errs, field.NotSupported(path.Child("tls").Child("mode"),
*l.TLS.Mode, []string{string(gatewayapiv1.TLSModeTerminate)}))
}
}

return errs
}

// translatePolicy updates the Certificate spec using the TLSPolicy spec
// converted from https://github.com/cert-manager/cert-manager/blob/master/pkg/controller/certificate-shim/helper.go#L63
func translatePolicy(crt *certmanv1.Certificate, tlsPolicy kuadrantv1alpha1.TLSPolicySpec) {
if tlsPolicy.CommonName != "" {
crt.Spec.CommonName = tlsPolicy.CommonName
}

if tlsPolicy.Duration != nil {
crt.Spec.Duration = tlsPolicy.Duration
}

if tlsPolicy.RenewBefore != nil {
crt.Spec.RenewBefore = tlsPolicy.RenewBefore
}

if tlsPolicy.RenewBefore != nil {
crt.Spec.RenewBefore = tlsPolicy.RenewBefore
}

if tlsPolicy.Usages != nil {
crt.Spec.Usages = tlsPolicy.Usages
}

if tlsPolicy.RevisionHistoryLimit != nil {
crt.Spec.RevisionHistoryLimit = tlsPolicy.RevisionHistoryLimit
}

if tlsPolicy.PrivateKey != nil {
if crt.Spec.PrivateKey == nil {
crt.Spec.PrivateKey = &certmanv1.CertificatePrivateKey{}
}

if tlsPolicy.PrivateKey.Algorithm != "" {
crt.Spec.PrivateKey.Algorithm = tlsPolicy.PrivateKey.Algorithm
}

if tlsPolicy.PrivateKey.Encoding != "" {
crt.Spec.PrivateKey.Encoding = tlsPolicy.PrivateKey.Encoding
}

if tlsPolicy.PrivateKey.Size != 0 {
crt.Spec.PrivateKey.Size = tlsPolicy.PrivateKey.Size
}

if tlsPolicy.PrivateKey.RotationPolicy != "" {
crt.Spec.PrivateKey.RotationPolicy = tlsPolicy.PrivateKey.RotationPolicy
}
}
}
Loading
Loading