From 84f36976b3437a38e5dc558c583493178c066000 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Sun, 18 Feb 2024 21:40:05 -0700
Subject: [PATCH] feat: podgroup deletion when finished/failed

Problem: Since the PodGroup controller creates the PodGroup, it should
delete it as well.

Solution: Ideally I wanted to attach an owner reference, meaning that the
top-level job (that also owns the pod) would be the owner of the PodGroup.
But that does not seem to take - either because the controller is the
owner, or because the field is read-only in k8s. For the time being, I
decided to delete the PodGroup when the group is determined to be
Finished/Failed, which happens when enough pods have succeeded or failed
to reach MinMember. Granted that MinMember == size, I think this should
be OK with fluence. We might need to consider other approaches if/when
the minimum size is smaller than the total size, because fluence might
still see a pod in the queue and try to schedule it again. I think what
we might do in that case is just update MinMember for the group, so if
fluence schedules again it will be for the smaller size. But I am not
sure about that either! TBA. The important thing now is that the pod
group cleans itself up!

Signed-off-by: vsoch
---
 examples/test_example/fluence-sized-job.yaml   | 16 ++++
 .../pkg/controllers/podgroup_controller.go     | 89 +++++++++++++------
 2 files changed, 80 insertions(+), 25 deletions(-)
 create mode 100644 examples/test_example/fluence-sized-job.yaml

diff --git a/examples/test_example/fluence-sized-job.yaml b/examples/test_example/fluence-sized-job.yaml
new file mode 100644
index 0000000..a195d87
--- /dev/null
+++ b/examples/test_example/fluence-sized-job.yaml
@@ -0,0 +1,16 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: fluence-sized-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    spec:
+      schedulerName: fluence
+      containers:
+      - name: fluence-job
+        image: busybox
+        command: [echo, potato]
+      restartPolicy: Never
+  backoffLimit: 4
diff --git a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go
index 72bda77..fa4593c 100644
--- a/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go
+++ b/sig-scheduler-plugins/pkg/controllers/podgroup_controller.go
@@ -125,60 +125,81 @@ func (r *PodGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
 		// to account for if the labels are different, do we take the smallest?
 		log.Info("PodGroup", "Status", fmt.Sprintf("WARNING: Pod group current MinMember %d does not match %d", pg.Spec.MinMember, size))
 	}
+	return r.updateStatus(ctx, pg, podList.Items)
 
-	// If we get here, we have a PodGroup with a set size and can inspect / update phase
-	pods := podList.Items
-	pgCopy := pg.DeepCopy()
+}
 
+func (r *PodGroupReconciler) updateStatus(
+	ctx context.Context,
+	pg *schedv1alpha1.PodGroup,
+	pods []v1.Pod,
+) (ctrl.Result, error) {
 
-	switch pgCopy.Status.Phase {
+	patch := client.MergeFrom(pg.DeepCopy())
+
+	switch pg.Status.Phase {
 	case "":
-		pgCopy.Status.Phase = schedv1alpha1.PodGroupPending
+		pg.Status.Phase = schedv1alpha1.PodGroupPending
+		result, err := r.updateOwnerReferences(ctx, pg, &pods[0])
+		if result.Requeue || err != nil {
+			return result, err
+		}
+
 	case schedv1alpha1.PodGroupPending:
 		if len(pods) >= int(pg.Spec.MinMember) {
-			pgCopy.Status.Phase = schedv1alpha1.PodGroupScheduling
-
-			// Always update owner references to be the first pod
-			// E.g., if a job owns it, ensure the group is deleted with it
-			updateOwnerReferences(pgCopy, &pods[0])
+			pg.Status.Phase = schedv1alpha1.PodGroupScheduling
+			result, err := r.updateOwnerReferences(ctx, pg, &pods[0])
+			if result.Requeue || err != nil {
+				return result, err
+			}
 		}
 	default:
 
 		// Get updated counts of running, succeeded, and failed pods
-		pgCopy.Status.Running, pgCopy.Status.Succeeded, pgCopy.Status.Failed = getCurrentPodStats(pods)
+		running, succeeded, failed := getCurrentPodStats(pods)
 
 		// If for some reason we weren't pending and now have fewer than min required, flip back to pending
 		if len(pods) < int(pg.Spec.MinMember) {
-			pgCopy.Status.Phase = schedv1alpha1.PodGroupPending
+			pg.Status.Phase = schedv1alpha1.PodGroupPending
 			break
 		}
 
 		// A group with succeeded + running STILL less than the minimum required is scheduling
-		if pgCopy.Status.Succeeded+pgCopy.Status.Running < pg.Spec.MinMember {
-			pgCopy.Status.Phase = schedv1alpha1.PodGroupScheduling
+		if succeeded+running < pg.Spec.MinMember {
+			pg.Status.Phase = schedv1alpha1.PodGroupScheduling
 		}
 
 		// A group with succeeded + running >= the minimum required is running!
-		if pgCopy.Status.Succeeded+pgCopy.Status.Running >= pg.Spec.MinMember {
-			pgCopy.Status.Phase = schedv1alpha1.PodGroupRunning
+		if succeeded+running >= pg.Spec.MinMember {
+			pg.Status.Phase = schedv1alpha1.PodGroupRunning
 		}
 
 		// We have non-zero failed, and the total of failed, running, and succeeded >= min member
 		// Final state of pod group is FAILED womp womp
-		if pgCopy.Status.Failed != 0 &&
-			pgCopy.Status.Failed+pgCopy.Status.Running+pgCopy.Status.Succeeded >= pg.Spec.MinMember {
-			pgCopy.Status.Phase = schedv1alpha1.PodGroupFailed
+		if failed != 0 && failed+running+succeeded >= pg.Spec.MinMember {
+			pg.Status.Phase = schedv1alpha1.PodGroupFailed
 		}
 
 		// Finished! This is where we want to get :)
 		// TODO: ideally the owning higher-level object deletion will delete here,
 		// but that won't always work for one-off pods - need a new strategy
-		if pgCopy.Status.Succeeded >= pg.Spec.MinMember {
-			pgCopy.Status.Phase = schedv1alpha1.PodGroupFinished
+		if succeeded >= pg.Spec.MinMember {
+			pg.Status.Phase = schedv1alpha1.PodGroupFinished
 		}
+		pg.Status.Running = running
+		pg.Status.Failed = failed
+		pg.Status.Succeeded = succeeded
 	}
 
-	// TODO need better handling here of cleanup, etc. This mostly handles status changes
-	return r.patchPodGroup(ctx, pg, pgCopy)
+	// Apply the patch to update, or delete if finished
+	// TODO would be better if owner references took here, so delete on owner deletion
+	var err error
+	if pg.Status.Phase == schedv1alpha1.PodGroupFinished || pg.Status.Phase == schedv1alpha1.PodGroupFailed {
+		err = r.Delete(ctx, pg)
+	} else {
+		r.Status().Update(ctx, pg)
+		err = r.Patch(ctx, pg, patch)
+	}
+	return ctrl.Result{Requeue: true}, err
 }
 
 // newPodGroup creates a new podGroup object, capturing the creation time
@@ -273,21 +294,37 @@ func getCurrentPodStats(pods []v1.Pod) (int32, int32, int32) {
 
 // updateOwnerReferences ensures the group is always owned by the same entity that owns the pod
 // This ensures that, for example, a job that is wrapping pods is the owner.
-func updateOwnerReferences(pg *schedv1alpha1.PodGroup, pod *v1.Pod) {
+func (r *PodGroupReconciler) updateOwnerReferences(
+	ctx context.Context,
+	pg *schedv1alpha1.PodGroup,
+	pod *v1.Pod,
+) (ctrl.Result, error) {
 
 	// Case 1: The pod itself doesn't have owner references. YOLO
 	if len(pod.OwnerReferences) == 0 {
-		return
+		return ctrl.Result{}, nil
 	}
 
+	// Collect owner references for the pod group
+	owners := []metav1.OwnerReference{}
 	var refs []string
 	for _, ownerRef := range pod.OwnerReferences {
 		refs = append(refs, fmt.Sprintf("%s/%s", pod.Namespace, ownerRef.Name))
+		owners = append(owners, ownerRef)
 	}
 
+	patch := client.MergeFrom(pg.DeepCopy())
 	if len(refs) != 0 {
 		sort.Strings(refs)
 		pg.Status.OccupiedBy = strings.Join(refs, ",")
 	}
+	if len(owners) > 0 {
+		pg.ObjectMeta.OwnerReferences = owners
+	}
+	// Apply the patch to update the pod group owner references and status
+	r.Status().Update(ctx, pg)
+	err := r.Patch(ctx, pg, patch)
+	return ctrl.Result{Requeue: true}, err
+
 }
 
 // SetupWithManager sets up the controller with the Manager.
@@ -346,6 +383,8 @@ func (r *PodGroupReconciler) ensurePodGroup(ctx context.Context, obj client.Obje
 	if apierrs.IsNotFound(err) {
 		r.log.Info("Pod: ", "Status", pod.Status.Phase, "Name", pod.Name, "Group", groupName, "Namespace", pod.Namespace, "Action", "Creating PodGroup")
+		//owner := r.getOwnerMetadata(pod)
+		// TODO should an owner be set here? Setting it to a specific pod seems risky/wrong in case that pod is deleted.
 		err, _ := r.newPodGroup(ctx, groupName, pod.Namespace, int32(groupSize))
 		if err != nil {
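
--

Aside (not part of the diff): the owner-reference idea described in the
commit message would look roughly like the sketch below, using
controller-runtime's controllerutil helper. This is illustrative only:
setGroupOwner is a hypothetical helper, the batchv1.Job owner and the
schedv1alpha1 import path are assumptions, and as noted above the
reference did not seem to take in practice.

    package controllers

    import (
        "context"

        batchv1 "k8s.io/api/batch/v1"
        "k8s.io/apimachinery/pkg/runtime"
        "sigs.k8s.io/controller-runtime/pkg/client"
        "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

        schedv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
    )

    // setGroupOwner (hypothetical) appends the job that owns the pods to
    // the PodGroup's metadata.ownerReferences and persists the change, so
    // that deleting the job would garbage-collect the PodGroup.
    func setGroupOwner(ctx context.Context, c client.Client, scheme *runtime.Scheme, job *batchv1.Job, pg *schedv1alpha1.PodGroup) error {
        // SetOwnerReference requires the job's GroupVersionKind to be
        // registered in the scheme
        if err := controllerutil.SetOwnerReference(job, pg, scheme); err != nil {
            return err
        }
        return c.Update(ctx, pg)
    }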
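
To exercise the cleanup path by hand, something like the following should
work on a cluster where fluence and the PodGroup CRD are installed (the
resource name podgroups.scheduling.x-k8s.io is an assumption based on
upstream scheduler-plugins):

    kubectl apply -f examples/test_example/fluence-sized-job.yaml
    kubectl get podgroups.scheduling.x-k8s.io --watch
    # once all three pods succeed, the group reaches Finished and the
    # controller deletes it, so the watch should show the group disappear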