Skip to content

Commit c2a0444

Browse files
authored
Merge branch 'main' into rfcs/terminate-before-create-static-capacity
2 parents 0b37b41 + 0f6f76c commit c2a0444

File tree

3 files changed

+82
-7
lines changed

3 files changed

+82
-7
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module sigs.k8s.io/karpenter
22

3-
go 1.25.7
3+
go 1.26.1
44

55
require (
66
github.com/Pallinder/go-randomdata v1.2.0

pkg/controllers/node/termination/controller.go

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ import (
5858
const (
5959
minReconciles = 100
6060
maxReconciles = 5000
61+
MinDrainTime = 5 * time.Second
6162
)
6263

6364
// Controller for the resource
@@ -190,25 +191,36 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile
190191

191192
type terminationFunc func(context.Context, *v1.NodeClaim, *corev1.Node, *time.Time) (reconcile.Result, error)
192193

193-
// awaitDrain initiates the drain of the node and will continue to requeue until the node has been drained. If the
194-
// nodeClaim has a terminationGracePeriod set, pods will be deleted to ensure this function does not requeue past the
194+
// awaitDrain initiates the drain of the node and will continue to requeue until the node has been drained and the minimum drain time has passed.
195+
// If the nodeClaim has a terminationGracePeriod set, pods will be deleted to ensure this function does not requeue past the
195196
// nodeTerminationTime.
196197
func (c *Controller) awaitDrain(
197198
ctx context.Context,
198199
nodeClaim *v1.NodeClaim,
199200
node *corev1.Node,
200201
nodeTerminationTime *time.Time,
201202
) (reconcile.Result, error) {
203+
if nodeClaim != nil && nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained) == nil {
204+
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeDrained, "Draining", "Draining")
205+
}
202206
if err := c.terminator.Drain(ctx, node, nodeTerminationTime); err != nil {
203207
if !terminator.IsNodeDrainError(err) {
204208
return reconcile.Result{}, fmt.Errorf("draining node, %w", err)
205209
}
206210
c.recorder.Publish(terminatorevents.NodeFailedToDrain(node, err))
207-
if nodeClaim != nil {
208-
nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeDrained, "Draining", "Draining")
209-
}
210211
return reconcile.Result{RequeueAfter: 1 * time.Second}, nil
211212
}
213+
214+
// If the nodeclaim exists, check if minDrainTime has elapsed. If it hasn't we should requeue.
215+
// This check helps to ensure that we drain pods scheduled to the Node immediately after we taint it, which
216+
// can occur when the scheduler has not seen the taint yet.
217+
if nodeClaim != nil {
218+
cond := nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained)
219+
if cond == nil || (cond.IsUnknown() && c.clock.Since(cond.LastTransitionTime.Time) < MinDrainTime) {
220+
return reconcile.Result{RequeueAfter: 1 * time.Second}, nil
221+
}
222+
}
223+
212224
if nodeClaim != nil {
213225
nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeDrained)
214226
}

pkg/controllers/node/termination/suite_test.go

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,10 @@ var _ = Describe("Termination", func() {
116116
ExpectApplied(ctx, env.Client, node, nodeClaim)
117117
Expect(env.Client.Delete(ctx, node)).To(Succeed())
118118
node = ExpectNodeExists(ctx, env.Client, node.Name)
119-
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachment, InstanceTerminationInitiation
119+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and Start Drain
120+
fakeClock.Step(2 * termination.MinDrainTime)
121+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachment, InstanceTerminationInitiation
122+
120123
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationValidation
121124
ExpectNotFound(ctx, env.Client, node)
122125
})
@@ -154,6 +157,8 @@ var _ = Describe("Termination", func() {
154157
Expect(env.Client.Delete(ctx, node)).To(Succeed())
155158
node = ExpectNodeExists(ctx, env.Client, node.Name)
156159

160+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and Start Drain
161+
fakeClock.Step(2 * termination.MinDrainTime)
157162
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachment, InstanceTerminationInitiation
158163
nc := ExpectExists(ctx, env.Client, nodeClaim)
159164
Expect(nc.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).IsTrue()).To(BeTrue())
@@ -186,6 +191,9 @@ var _ = Describe("Termination", func() {
186191
defer GinkgoRecover()
187192
defer wg.Done()
188193
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
194+
if i == 0 {
195+
fakeClock.Step(2 * termination.MinDrainTime)
196+
}
189197
}(nodes[i])
190198
}
191199
wg.Wait()
@@ -231,6 +239,7 @@ var _ = Describe("Termination", func() {
231239
node = ExpectNodeExists(ctx, env.Client, node.Name)
232240

233241
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainInitiation
242+
fakeClock.Step(2 * termination.MinDrainTime)
234243
Expect(queue.Has(podSkip)).To(BeFalse())
235244
ExpectObjectReconciled(ctx, env.Client, queue, podEvict)
236245

@@ -262,6 +271,7 @@ var _ = Describe("Termination", func() {
262271

263272
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainInitiation
264273
Expect(queue.Has(podSkip)).To(BeFalse())
274+
fakeClock.Step(2 * termination.MinDrainTime)
265275
ExpectObjectReconciled(ctx, env.Client, queue, podEvict)
266276

267277
// Expect node to exist and be draining
@@ -291,6 +301,7 @@ var _ = Describe("Termination", func() {
291301
Expect(env.Client.Delete(ctx, node)).To(Succeed())
292302
node = ExpectNodeExists(ctx, env.Client, node.Name)
293303
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainInitiation
304+
fakeClock.Step(2 * termination.MinDrainTime)
294305
ExpectObjectReconciled(ctx, env.Client, queue, podEvict)
295306

296307
// Expect node to exist and be draining
@@ -319,6 +330,7 @@ var _ = Describe("Termination", func() {
319330
node = ExpectNodeExists(ctx, env.Client, node.Name)
320331

321332
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainInitiation
333+
fakeClock.Step(2 * termination.MinDrainTime)
322334
ExpectObjectReconciled(ctx, env.Client, queue, pod)
323335

324336
// Expect pod with no owner ref to be enqueued for eviction
@@ -350,6 +362,8 @@ var _ = Describe("Termination", func() {
350362
Expect(env.Client.Delete(ctx, node)).To(Succeed())
351363
node = ExpectNodeExists(ctx, env.Client, node.Name)
352364
// Trigger Termination Controller, which should ignore these pods and delete the node
365+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and DrainInitiation
366+
fakeClock.Step(2 * termination.MinDrainTime)
353367
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachment, InstanceTerminationInitiation
354368
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationValidation
355369
ExpectNotFound(ctx, env.Client, node)
@@ -377,6 +391,7 @@ var _ = Describe("Termination", func() {
377391
Expect(env.Client.Delete(ctx, node)).To(Succeed())
378392
node = ExpectNodeExists(ctx, env.Client, node.Name)
379393
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainInitiation
394+
fakeClock.Step(2 * termination.MinDrainTime)
380395

381396
// Expect node to exist and be draining
382397
ExpectNodeWithNodeClaimDraining(env.Client, node.Name)
@@ -464,6 +479,7 @@ var _ = Describe("Termination", func() {
464479
}
465480

466481
// Reconcile to delete node
482+
fakeClock.Step(2 * termination.MinDrainTime)
467483
node = ExpectNodeExists(ctx, env.Client, node.Name)
468484
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation, VolumeDetachment, InstanceTerminationInitiation
469485
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationValidation
@@ -501,6 +517,7 @@ var _ = Describe("Termination", func() {
501517
ExpectDeleted(ctx, env.Client, podClusterCritical)
502518

503519
// Reconcile to delete node
520+
fakeClock.Step(2 * termination.MinDrainTime)
504521
node = ExpectNodeExists(ctx, env.Client, node.Name)
505522
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation, VolumeDetachment, InstanceTerminationInitiation
506523
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationValidation
@@ -541,6 +558,7 @@ var _ = Describe("Termination", func() {
541558
ExpectDeleted(ctx, env.Client, podEvict)
542559

543560
// Reconcile to delete node
561+
fakeClock.Step(2 * termination.MinDrainTime)
544562
node = ExpectNodeExists(ctx, env.Client, node.Name)
545563
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation, VolumeDetachment, InstanceTerminationInitiation
546564
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationValidation
@@ -566,6 +584,7 @@ var _ = Describe("Termination", func() {
566584
EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1])
567585

568586
// Expect node to exist and be draining, but not deleted
587+
fakeClock.Step(2 * termination.MinDrainTime)
569588
node = ExpectNodeExists(ctx, env.Client, node.Name)
570589
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation
571590
ExpectNodeWithNodeClaimDraining(env.Client, node.Name)
@@ -608,6 +627,7 @@ var _ = Describe("Termination", func() {
608627
EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1])
609628

610629
// Expect node to exist and be draining, but not deleted
630+
fakeClock.Step(2 * termination.MinDrainTime)
611631
node = ExpectNodeExists(ctx, env.Client, node.Name)
612632
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation
613633
ExpectNodeWithNodeClaimDraining(env.Client, node.Name)
@@ -638,6 +658,7 @@ var _ = Describe("Termination", func() {
638658
EventuallyExpectTerminating(ctx, env.Client, pods[0], pods[1])
639659

640660
// Expect node to exist and be draining, but not deleted
661+
fakeClock.Step(2 * termination.MinDrainTime)
641662
node = ExpectNodeExists(ctx, env.Client, node.Name)
642663
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation
643664
ExpectNodeWithNodeClaimDraining(env.Client, node.Name)
@@ -699,6 +720,7 @@ var _ = Describe("Termination", func() {
699720
// Delete the pod directly to act like something else is doing the pod termination
700721
ExpectDeleted(ctx, env.Client, pod)
701722

723+
fakeClock.Step(2 * termination.MinDrainTime)
702724
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation, VolumeDetachment, InstanceTerminationInitiation
703725
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationValidation
704726
ExpectNotFound(ctx, env.Client, node)
@@ -747,6 +769,7 @@ var _ = Describe("Termination", func() {
747769
ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, pod)
748770

749771
Expect(env.Client.Delete(ctx, node)).To(Succeed())
772+
fakeClock.Step(2 * termination.MinDrainTime)
750773
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainInitiation
751774
ExpectObjectReconciled(ctx, env.Client, queue, pod)
752775

@@ -817,6 +840,34 @@ var _ = Describe("Termination", func() {
817840
pod = ExpectExists(ctx, env.Client, pod)
818841
Expect(pod.DeletionTimestamp.Time).To((BeTemporally("~", nodeTerminationTimestamp, 10*time.Second)))
819842
})
843+
It("should not finish draining until minDrainTime has passed", func() {
844+
ExpectApplied(ctx, env.Client, node, nodeClaim)
845+
Expect(env.Client.Delete(ctx, node)).To(Succeed())
846+
node = ExpectNodeExists(ctx, env.Client, node.Name)
847+
848+
// First reconcile: should taint node and start draining
849+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and DrainInitiation
850+
851+
// Verify node is tainted and NodeClaim condition is set to "Draining"
852+
node = ExpectNodeExists(ctx, env.Client, node.Name)
853+
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
854+
Expect(node.Spec.Taints).To(ContainElement(v1.DisruptedNoScheduleTaint))
855+
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue())
856+
857+
// Should still be requeueing as minDrainTime has not passed
858+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node))
859+
fakeClock.Step(termination.MinDrainTime - 2*time.Second)
860+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node))
861+
862+
// Should proceed with drain completion, after minDrainTime has passed
863+
fakeClock.Step(termination.MinDrainTime)
864+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node))
865+
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
866+
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsTrue()).To(BeTrue())
867+
868+
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node))
869+
ExpectNotFound(ctx, env.Client, node)
870+
})
820871
Context("VolumeAttachments", func() {
821872
It("should wait for volume attachments", func() {
822873
va := test.VolumeAttachment(test.VolumeAttachmentOptions{
@@ -828,6 +879,8 @@ var _ = Describe("Termination", func() {
828879
ExpectExists(ctx, env.Client, nodeClaim)
829880
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
830881

882+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and Start Drain
883+
fakeClock.Step(2 * termination.MinDrainTime)
831884
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachmentInitiation
832885
ExpectExists(ctx, env.Client, node)
833886
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
@@ -868,6 +921,8 @@ var _ = Describe("Termination", func() {
868921
ExpectManualBinding(ctx, env.Client, pod, node)
869922
Expect(env.Client.Delete(ctx, node)).To(Succeed())
870923

924+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and Start Drain
925+
fakeClock.Step(2 * termination.MinDrainTime)
871926
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain
872927
ExpectExists(ctx, env.Client, node)
873928
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
@@ -894,6 +949,8 @@ var _ = Describe("Termination", func() {
894949
ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool, va)
895950
Expect(env.Client.Delete(ctx, node)).To(Succeed())
896951

952+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and Start Drain
953+
fakeClock.Step(2 * termination.MinDrainTime)
897954
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachment
898955
ExpectExists(ctx, env.Client, node)
899956
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
@@ -917,6 +974,8 @@ var _ = Describe("Termination", func() {
917974
ExpectApplied(ctx, env.Client, node, nodeClaim)
918975
Expect(env.Client.Delete(ctx, node)).To(Succeed())
919976
node = ExpectNodeExists(ctx, env.Client, node.Name)
977+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and Start Drain
978+
fakeClock.Step(2 * termination.MinDrainTime)
920979
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
921980
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachment, InstanceTerminationInitation
922981
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationFinalization
@@ -929,6 +988,8 @@ var _ = Describe("Termination", func() {
929988
ExpectApplied(ctx, env.Client, node, nodeClaim)
930989
Expect(env.Client.Delete(ctx, node)).To(Succeed())
931990
node = ExpectNodeExists(ctx, env.Client, node.Name)
991+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and Start Drain
992+
fakeClock.Step(2 * termination.MinDrainTime)
932993
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachment, InstanceTerminationInitiation
933994
ExpectMetricCounterValue(termination.NodesDrainedTotal, 1, map[string]string{metrics.NodePoolLabel: node.Labels[v1.NodePoolLabelKey]})
934995
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationFinalization
@@ -941,6 +1002,8 @@ var _ = Describe("Termination", func() {
9411002
ExpectApplied(ctx, env.Client, node, nodeClaim)
9421003
Expect(env.Client.Delete(ctx, node)).To(Succeed())
9431004
node = ExpectNodeExists(ctx, env.Client, node.Name)
1005+
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Taint and Start Drain
1006+
fakeClock.Step(2 * termination.MinDrainTime)
9441007
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // Drain, VolumeDetachment, InstanceTerminationInitation
9451008
ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationFinalization
9461009

0 commit comments

Comments
 (0)