-
Notifications
You must be signed in to change notification settings - Fork 68
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: split nic and vm into their own gc controllers, added shared state between them to prevent conflicts in nic deletion calls
- Loading branch information
1 parent
29543f3
commit 4104201
Showing
4 changed files
with
260 additions
and
199 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
198 changes: 0 additions & 198 deletions
198
pkg/controllers/nodeclaim/garbagecollection/controller.go
This file was deleted.
Oops, something went wrong.
137 changes: 137 additions & 0 deletions
137
pkg/controllers/nodeclaim/garbagecollection/instance_garbagecollection.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
/* | ||
Portions Copyright (c) Microsoft Corporation. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package garbagecollection | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/Azure/karpenter-provider-azure/pkg/providers/instance" | ||
"github.com/awslabs/operatorpkg/singleton" | ||
"github.com/patrickmn/go-cache" | ||
|
||
// "github.com/Azure/karpenter-provider-azure/pkg/cloudprovider" | ||
"github.com/samber/lo" | ||
"go.uber.org/multierr" | ||
v1 "k8s.io/api/core/v1" | ||
"k8s.io/apimachinery/pkg/util/sets" | ||
"k8s.io/client-go/util/workqueue" | ||
"knative.dev/pkg/logging" | ||
controllerruntime "sigs.k8s.io/controller-runtime" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
"sigs.k8s.io/controller-runtime/pkg/manager" | ||
"sigs.k8s.io/controller-runtime/pkg/reconcile" | ||
"sigs.k8s.io/karpenter/pkg/operator/injection" | ||
|
||
karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" | ||
|
||
corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" | ||
) | ||
|
||
|
||
const (
	// NicBelongsToVM is the reason recorded for a NIC cache entry when the NIC
	// cannot be deleted because it is still attached to a virtual machine.
	NicBelongsToVM = "NicBelongsToVM"
)
|
||
// VirtualMachineController garbage-collects cloud-provider VM instances whose
// NodeClaim no longer exists on the cluster. It shares unremovableNics with the
// NIC garbage-collection controller so NICs attached to live VMs are skipped.
type VirtualMachineController struct {
	kubeClient    client.Client
	cloudProvider corecloudprovider.CloudProvider
	// unremovableNics records NIC resource names that must not be deleted
	// (e.g. still attached to a VM); shared with the NIC GC controller.
	unremovableNics *cache.Cache
	successfulCount uint64 // keeps track of successful reconciles for more aggressive requeueing near the start of the controller
}
|
||
func NewVirtualMachineController(kubeClient client.Client, cloudProvider corecloudprovider.CloudProvider, unremovableNics *cache.Cache) *VirtualMachineController { | ||
return &VirtualMachineController{ | ||
kubeClient: kubeClient, | ||
cloudProvider: cloudProvider, | ||
unremovableNics: unremovableNics, | ||
successfulCount: 0, | ||
} | ||
} | ||
|
||
func (c *VirtualMachineController) Reconcile(ctx context.Context) (reconcile.Result, error) { | ||
ctx = injection.WithControllerName(ctx, "instance.garbagecollection") | ||
|
||
// We LIST VMs on the CloudProvider BEFORE we grab NodeClaims/Nodes on the cluster so that we make sure that, if | ||
// LISTing instances takes a long time, our information is more updated by the time we get to nodeclaim and Node LIST | ||
// This works since our CloudProvider instances are deleted based on whether the NodeClaim exists or not, not vice-versa | ||
retrieved, err := c.cloudProvider.List(ctx) | ||
if err != nil { | ||
return reconcile.Result{}, fmt.Errorf("listing cloudprovider VMs, %w", err) | ||
} | ||
|
||
// Mark all vms on the cloudprovider as unremovableNics for the nicGc controller | ||
for _, nodeClaim := range retrieved { | ||
// Nics Belonging to a vm cannot be removed while attached to the vm. | ||
// lets set a nic as unremovable for 15 minutes if it belongs to a vm | ||
c.unremovableNics.Set(instance.GenerateResourceName(nodeClaim.Name), "", time.Minute * 15) | ||
} | ||
|
||
managedRetrieved := lo.Filter(retrieved, func(nc *karpv1.NodeClaim, _ int) bool { | ||
return nc.DeletionTimestamp.IsZero() | ||
}) | ||
nodeClaimList := &karpv1.NodeClaimList{} | ||
if err = c.kubeClient.List(ctx, nodeClaimList); err != nil { | ||
return reconcile.Result{}, err | ||
} | ||
nodeList := &v1.NodeList{} | ||
if err := c.kubeClient.List(ctx, nodeList); err != nil { | ||
return reconcile.Result{}, err | ||
} | ||
resolvedProviderIDs := sets.New[string](lo.FilterMap(nodeClaimList.Items, func(n karpv1.NodeClaim, _ int) (string, bool) { | ||
return n.Status.ProviderID, n.Status.ProviderID != "" | ||
})...) | ||
errs := make([]error, len(retrieved)) | ||
workqueue.ParallelizeUntil(ctx, 100, len(managedRetrieved), func(i int) { | ||
if !resolvedProviderIDs.Has(managedRetrieved[i].Status.ProviderID) && | ||
time.Since(managedRetrieved[i].CreationTimestamp.Time) > time.Minute*5 { | ||
errs[i] = c.garbageCollect(ctx, managedRetrieved[i], nodeList) | ||
} | ||
}) | ||
if err = multierr.Combine(errs...); err != nil { | ||
return reconcile.Result{}, err | ||
} | ||
c.successfulCount++ | ||
return reconcile.Result{RequeueAfter: lo.Ternary(c.successfulCount <= 20, time.Second*10, time.Minute*2)}, nil | ||
} | ||
|
||
func (c *VirtualMachineController) garbageCollect(ctx context.Context, nodeClaim *karpv1.NodeClaim, nodeList *v1.NodeList) error { | ||
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("provider-id", nodeClaim.Status.ProviderID)) | ||
if err := c.cloudProvider.Delete(ctx, nodeClaim); err != nil { | ||
return corecloudprovider.IgnoreNodeClaimNotFoundError(err) | ||
} | ||
logging.FromContext(ctx).Debugf("garbage collected cloudprovider instance") | ||
|
||
// Go ahead and cleanup the node if we know that it exists to make scheduling go quicker | ||
if node, ok := lo.Find(nodeList.Items, func(n v1.Node) bool { | ||
return n.Spec.ProviderID == nodeClaim.Status.ProviderID | ||
}); ok { | ||
if err := c.kubeClient.Delete(ctx, &node); err != nil { | ||
return client.IgnoreNotFound(err) | ||
} | ||
logging.FromContext(ctx).With("node", node.Name).Debugf("garbage collected node") | ||
} | ||
return nil | ||
} | ||
|
||
func (c *VirtualMachineController) Register(_ context.Context, m manager.Manager) error { | ||
return controllerruntime.NewControllerManagedBy(m). | ||
Named("instance.garbagecollection"). | ||
WatchesRawSource(singleton.Source()). | ||
Complete(singleton.AsReconciler(c)) | ||
} |
Oops, something went wrong.