Skip to content

Commit 72ce981

Browse files
authored
fix: correct offering count tracking in cost controller re-add path (#2946)
1 parent d9a12ef commit 72ce981

File tree

2 files changed

+58
-4
lines changed

2 files changed

+58
-4
lines changed

pkg/state/cost/cost.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -305,12 +305,12 @@ func (cc *ClusterCost) internalAddOffering(ctx context.Context, npName string, o
305305
}
306306
oc, exists = cc.npCostMap[npName].offeringCounts[offeringKey]
307307
if !exists {
308-
oc = OfferingCount{Count: 1, Price: 0.0}
309-
log.FromContext(ctx).Error(fmt.Errorf("failed to find offering %q during retry while searching for instance %q in zone %q with capacity %q in nodepool %q", offeringKey, offeringKey.InstanceName, offeringKey.Zone, offeringKey.CapacityType, npName), "offering won't be counted towards total cluster cost")
308+
// Start at 0; the unconditional oc.Count += 1 below accounts for this add.
309+
oc = OfferingCount{Count: 0, Price: 0.0}
310+
log.FromContext(ctx).Error(fmt.Errorf("failed to find offering %q during retry while searching for instance %q in zone %q with capacity %q in nodepool %q", offeringKey, offeringKey.InstanceName, offeringKey.Zone, offeringKey.CapacityType, npName), "offering price unknown after retry — cost tracking will undercount for this nodeclaim until next UpdateOfferings")
310311
}
311-
} else {
312-
oc.Count += 1
313312
}
313+
oc.Count += 1
314314
cc.npCostMap[npName].offeringCounts[offeringKey] = oc
315315
cc.npCostMap[npName].cost += oc.Price
316316
return nil

pkg/state/cost/suite_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,60 @@ var _ = Describe("ClusterCost", func() {
394394
Expect(finalClusterCost).To(BeNumerically("~", 8.00, 0.001), // 5.50 + 2.50
395395
"Cluster cost should be sum of all updated nodepool costs")
396396
})
397+
It("should track count correctly when re-adding an offering after the key was deleted", func() {
398+
// The retry path in internalAddOffering (when an offering key was
399+
// deleted after count hit 0) didn't increment oc.Count. The offering
400+
// was re-added with Count: 0, so UpdateOfferings recomputed
401+
// 0 * price = $0 instead of 1 * price. This caused cost tracking
402+
// to undercount after any offering type fully scaled down and back up.
403+
//
404+
// To trigger: keep an anchor alive (so nodepool survives), cycle a
405+
// different offering through add→remove→re-add.
406+
407+
// Use two different instance types so the offering key deleted
408+
// for one doesn't get recreated when the other's nodepool update runs
409+
spotInstance := &cloudprovider.InstanceType{
410+
Name: "spot-only",
411+
Offerings: []*cloudprovider.Offering{spotOffering},
412+
}
413+
odInstance := &cloudprovider.InstanceType{
414+
Name: "od-only",
415+
Offerings: []*cloudprovider.Offering{onDemandOffering},
416+
}
417+
cloudProvider.InstanceTypes = []*cloudprovider.InstanceType{spotInstance, odInstance}
418+
419+
// Add anchor with on-demand instance
420+
anchor := createTestNodeClaim(testNodePool, odInstance.Name, onDemandOffering.CapacityType(), onDemandOffering.Zone())
421+
anchor.Name = "anchor"
422+
Expect(clusterCost.UpdateNodeClaim(ctx, anchor)).To(Succeed())
423+
Expect(clusterCost.GetClusterCost()).To(BeNumerically("~", 3.00, 0.001))
424+
425+
// Add and remove a spot nodeclaim
426+
nc1 := createTestNodeClaim(testNodePool, spotInstance.Name, spotOffering.CapacityType(), spotOffering.Zone())
427+
nc1.Name = "spot-first"
428+
Expect(clusterCost.UpdateNodeClaim(ctx, nc1)).To(Succeed())
429+
Expect(clusterCost.GetClusterCost()).To(BeNumerically("~", 4.50, 0.001))
430+
Expect(clusterCost.DeleteNodeClaim(ctx, client.ObjectKeyFromObject(nc1))).To(Succeed())
431+
Expect(clusterCost.GetClusterCost()).To(BeNumerically("~", 3.00, 0.001))
432+
433+
// Re-add spot — the spot offering key was deleted from the map
434+
// but nodepool survives. This triggers internalNodepoolUpdate.
435+
nc2 := createTestNodeClaim(testNodePool, spotInstance.Name, spotOffering.CapacityType(), spotOffering.Zone())
436+
nc2.Name = "spot-second"
437+
Expect(clusterCost.UpdateNodeClaim(ctx, nc2)).To(Succeed())
438+
439+
cost := clusterCost.GetClusterCost()
440+
Expect(cost).To(BeNumerically("~", 4.50, 0.001),
441+
fmt.Sprintf("cost should be 4.50 (3.00 anchor + 1.50 spot), got %v", cost))
442+
443+
// Now trigger UpdateOfferings which recomputes cost from Count * Price.
444+
// If the count was not incremented properly (stayed at 0 instead of 1),
445+
// updateCost() will compute 0*1.50 + 1*3.00 = 3.00 instead of 4.50.
446+
clusterCost.UpdateOfferings(ctx, testNodePool, cloudProvider.InstanceTypes)
447+
costAfterRefresh := clusterCost.GetClusterCost()
448+
Expect(costAfterRefresh).To(BeNumerically("~", 4.50, 0.001),
449+
fmt.Sprintf("cost should still be 4.50 after UpdateOfferings recomputes from counts, got %v — indicates count was not incremented on re-add", costAfterRefresh))
450+
})
397451
})
398452

399453
Context("Concurrency", func() {

0 commit comments

Comments
 (0)