Skip to content

Commit 3f5da40

Browse files
authored
refactor: instance type filtering (#8016)
1 parent d79c76b commit 3f5da40

File tree

6 files changed

+834
-267
lines changed

6 files changed

+834
-267
lines changed

pkg/apis/v1/labels.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,26 @@ var (
6565
karpv1.ArchitectureAmd64,
6666
karpv1.ArchitectureArm64,
6767
)
68+
WellKnownResources = sets.New[corev1.ResourceName](
69+
corev1.ResourceCPU,
70+
corev1.ResourceMemory,
71+
corev1.ResourceEphemeralStorage,
72+
corev1.ResourcePods,
73+
ResourceAWSPodENI,
74+
ResourceNVIDIAGPU,
75+
ResourceAMDGPU,
76+
ResourceAWSNeuron,
77+
ResourceAWSNeuronCore,
78+
ResourceHabanaGaudi,
79+
ResourceEFA,
80+
)
81+
WellKnownExoticResources = sets.New[corev1.ResourceName](
82+
ResourceNVIDIAGPU,
83+
ResourceAMDGPU,
84+
ResourceAWSNeuron,
85+
ResourceAWSNeuronCore,
86+
ResourceHabanaGaudi,
87+
)
6888
RestrictedLabelDomains = []string{
6989
apis.Group,
7090
}

pkg/cloudprovider/cloudprovider.go

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ import (
3030
coreapis "sigs.k8s.io/karpenter/pkg/apis"
3131
karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
3232
"sigs.k8s.io/karpenter/pkg/events"
33-
"sigs.k8s.io/karpenter/pkg/scheduling"
3433
"sigs.k8s.io/karpenter/pkg/utils/resources"
3534

3635
"github.com/aws/karpenter-provider-aws/pkg/apis"
@@ -107,17 +106,14 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
107106
if nodeClassReady.IsUnknown() {
108107
return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving NodeClass readiness, NodeClass is in Ready=Unknown, %s", nodeClassReady.Message), "NodeClassReadinessUnknown", "NodeClass is in Ready=Unknown")
109108
}
110-
instanceTypes, err := c.resolveInstanceTypes(ctx, nodeClaim, nodeClass)
111-
if err != nil {
112-
return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving instance types, %w", err), "InstanceTypeResolutionFailed", "Error resolving instance types")
113-
}
114-
if len(instanceTypes) == 0 {
115-
return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("all requested instance types were unavailable during launch"))
116-
}
117109
tags, err := utils.GetTags(nodeClass, nodeClaim, options.FromContext(ctx).ClusterName)
118110
if err != nil {
119111
return nil, cloudprovider.NewNodeClassNotReadyError(err)
120112
}
113+
instanceTypes, err := c.instanceTypeProvider.List(ctx, nodeClass)
114+
if err != nil {
115+
return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving instance types, %w", err), "InstanceTypeResolutionFailed", "Error resolving instance types")
116+
}
121117
instance, err := c.instanceProvider.Create(ctx, nodeClass, nodeClaim, tags, instanceTypes)
122118
if err != nil {
123119
return nil, fmt.Errorf("creating instance, %w", err)
@@ -335,29 +331,6 @@ func (c *CloudProvider) resolveNodeClassFromNodePool(ctx context.Context, nodePo
335331
return nodeClass, nil
336332
}
337333

338-
func (c *CloudProvider) resolveInstanceTypes(ctx context.Context, nodeClaim *karpv1.NodeClaim, nodeClass *v1.EC2NodeClass) ([]*cloudprovider.InstanceType, error) {
339-
instanceTypes, err := c.instanceTypeProvider.List(ctx, nodeClass)
340-
if err != nil {
341-
return nil, fmt.Errorf("getting instance types, %w", err)
342-
}
343-
reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(nodeClaim.Spec.Requirements...)
344-
instanceTypes = lo.Filter(instanceTypes, func(i *cloudprovider.InstanceType, _ int) bool {
345-
return reqs.Compatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabels) == nil &&
346-
len(i.Offerings.Compatible(reqs).Available()) > 0 &&
347-
resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable())
348-
})
349-
// Filter out exotic instance types, spot instance types more expensive than the cheapest on-demand instance type, etc.
350-
var rejectedInstanceTypes []*cloudprovider.InstanceType
351-
instanceTypes, rejectedInstanceTypes, err = instance.FilterRejectInstanceTypes(nodeClaim, instanceTypes)
352-
if err != nil {
353-
return nil, fmt.Errorf("filtering instance types, %w", err)
354-
}
355-
if len(rejectedInstanceTypes) > 0 {
356-
log.FromContext(ctx).WithValues("instance-types", utils.PrettySlice(lo.Map(rejectedInstanceTypes, func(i *cloudprovider.InstanceType, _ int) string { return i.Name }), 10)).V(1).Info("filtered out instance types from launch")
357-
}
358-
return instanceTypes, nil
359-
}
360-
361334
func (c *CloudProvider) resolveInstanceTypeFromInstance(ctx context.Context, instance *instance.Instance) (*cloudprovider.InstanceType, error) {
362335
nodePool, err := c.resolveNodePoolFromInstance(ctx, instance)
363336
if err != nil {
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
/*
2+
Licensed under the Apache License, Version 2.0 (the "License");
3+
you may not use this file except in compliance with the License.
4+
You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software
9+
distributed under the License is distributed on an "AS IS" BASIS,
10+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
See the License for the specific language governing permissions and
12+
limitations under the License.
13+
*/
14+
15+
package filter
16+
17+
import (
18+
"math"
19+
"strings"
20+
21+
"github.com/samber/lo"
22+
corev1 "k8s.io/api/core/v1"
23+
"sigs.k8s.io/karpenter/pkg/cloudprovider"
24+
"sigs.k8s.io/karpenter/pkg/scheduling"
25+
"sigs.k8s.io/karpenter/pkg/utils/resources"
26+
27+
v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1"
28+
29+
karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
30+
)
31+
32+
type Filter interface {
33+
FilterReject(instanceTypes []*cloudprovider.InstanceType) (kept []*cloudprovider.InstanceType, rejected []*cloudprovider.InstanceType)
34+
Name() string
35+
}
36+
37+
// CompatibleAvailableFilter removes instance types which do not have any compatible, available offerings. Other filters
38+
// should not be used without first using this filter.
39+
func CompatibleAvailableFilter(requirements scheduling.Requirements, requests corev1.ResourceList) Filter {
40+
return compatibleAvailableFilter{
41+
requirements: requirements,
42+
requests: requests,
43+
}
44+
}
45+
46+
type compatibleAvailableFilter struct {
47+
requirements scheduling.Requirements
48+
requests corev1.ResourceList
49+
}
50+
51+
func (f compatibleAvailableFilter) FilterReject(instanceTypes []*cloudprovider.InstanceType) ([]*cloudprovider.InstanceType, []*cloudprovider.InstanceType) {
52+
return lo.FilterReject(instanceTypes, func(i *cloudprovider.InstanceType, _ int) bool {
53+
if !f.requirements.IsCompatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabels) {
54+
return false
55+
}
56+
if !resources.Fits(f.requests, i.Allocatable()) {
57+
return false
58+
}
59+
if len(i.Offerings.Compatible(f.requirements).Available()) == 0 {
60+
return false
61+
}
62+
return true
63+
})
64+
}
65+
66+
func (compatibleAvailableFilter) Name() string {
67+
return "compatible-available-filter"
68+
}
69+
70+
// ReservedOfferingFilter creates a Filter which ensures there's only a single reserved offering per zone. This
71+
// addresses a limitation of the CreateFleet API, which limits calls to specifying a single offering per pool. If there
72+
// are multiple offerings in the same pool, the offering with the greatest capacity will be selected.
73+
func ReservedOfferingFilter(requirements scheduling.Requirements) Filter {
74+
return reservedOfferingFilter{
75+
requirements: requirements,
76+
}
77+
}
78+
79+
type reservedOfferingFilter struct {
80+
requirements scheduling.Requirements
81+
}
82+
83+
func (f reservedOfferingFilter) FilterReject(instanceTypes []*cloudprovider.InstanceType) ([]*cloudprovider.InstanceType, []*cloudprovider.InstanceType) {
84+
if !f.requirements.Get(karpv1.CapacityTypeLabelKey).Has(karpv1.CapacityTypeReserved) {
85+
return instanceTypes, nil
86+
}
87+
88+
var remaining, rejected []*cloudprovider.InstanceType
89+
for _, it := range instanceTypes {
90+
zonalOfferings := map[string]*cloudprovider.Offering{}
91+
for _, o := range it.Offerings.Available().Compatible(f.requirements) {
92+
if o.CapacityType() != karpv1.CapacityTypeReserved {
93+
continue
94+
}
95+
if current, ok := zonalOfferings[o.Zone()]; !ok || o.ReservationCapacity > current.ReservationCapacity {
96+
zonalOfferings[o.Zone()] = o
97+
}
98+
}
99+
if len(zonalOfferings) == 0 {
100+
rejected = append(rejected, it)
101+
continue
102+
}
103+
// WARNING: It is only safe to mutate the slice containing the offerings, not the offerings themselves. The individual
104+
// offerings are cached, but not the slice storing them. This helps keep the launch path simple, but changes to the
105+
// caching strategy employed by the InstanceType provider could result in unexpected behavior.
106+
it.Offerings = lo.Values(zonalOfferings)
107+
remaining = append(remaining, it)
108+
}
109+
if len(remaining) == 0 {
110+
return instanceTypes, nil
111+
}
112+
return remaining, rejected
113+
}
114+
115+
func (reservedOfferingFilter) Name() string {
116+
return "reserved-offering-filter"
117+
}
118+
119+
// ExoticInstanceTypeFilter will remove instances with GPUs and accelerators, along with metal instances, if doing so
120+
// doesn't filter out all instance types. This ensures Karpenter only launches these instances if the NodeClaim
121+
// explicitly requests them or all other compatible instance types are unavailable.
122+
func ExoticInstanceTypeFilter(requirements scheduling.Requirements) Filter {
123+
return exoticInstanceFilter{
124+
requirements: requirements,
125+
}
126+
}
127+
128+
type exoticInstanceFilter struct {
129+
requirements scheduling.Requirements
130+
}
131+
132+
func (f exoticInstanceFilter) FilterReject(instanceTypes []*cloudprovider.InstanceType) ([]*cloudprovider.InstanceType, []*cloudprovider.InstanceType) {
133+
if f.requirements.HasMinValues() {
134+
return instanceTypes, nil
135+
}
136+
137+
genericInstanceTypes, exoticInstanceTypes := lo.FilterReject(instanceTypes, func(it *cloudprovider.InstanceType, _ int) bool {
138+
if lo.ContainsBy(it.Requirements.Get(v1.LabelInstanceSize).Values(), func(size string) bool {
139+
return strings.Contains(size, "metal")
140+
}) {
141+
return false
142+
}
143+
for _, resource := range []corev1.ResourceName{
144+
v1.ResourceAWSNeuron,
145+
v1.ResourceAWSNeuronCore,
146+
v1.ResourceAMDGPU,
147+
v1.ResourceNVIDIAGPU,
148+
v1.ResourceHabanaGaudi,
149+
} {
150+
if !resources.IsZero(it.Capacity[resource]) {
151+
return false
152+
}
153+
}
154+
return true
155+
})
156+
// If there are no available, compatible reserved instance types Karpenter should fallback to exotic instance types
157+
if len(genericInstanceTypes) == 0 {
158+
return instanceTypes, nil
159+
}
160+
return genericInstanceTypes, exoticInstanceTypes
161+
}
162+
163+
func (exoticInstanceFilter) Name() string {
164+
return "exotic-instance-filter"
165+
}
166+
167+
// SpotInstanceFilter removes all instances with spot offerings which are more expensive than the cheapest compatible
168+
// and available on-demand offering. This ensures we don't launch with a more expensive spot instance for a mixed-launch
169+
// NodeClaim. Note that instance types with available, compatible reserved offerings will not be filtered out.
170+
// NOTE: This filter assumes all provided instance types have compatible and available offerings
171+
func SpotInstanceFilter(requirements scheduling.Requirements) Filter {
172+
return spotInstanceFilter{
173+
requirements: requirements,
174+
}
175+
}
176+
177+
type spotInstanceFilter struct {
178+
requirements scheduling.Requirements
179+
}
180+
181+
//nolint:gocyclo
182+
func (f spotInstanceFilter) FilterReject(instanceTypes []*cloudprovider.InstanceType) ([]*cloudprovider.InstanceType, []*cloudprovider.InstanceType) {
183+
if f.requirements.HasMinValues() {
184+
return instanceTypes, nil
185+
}
186+
if req := f.requirements.Get(karpv1.CapacityTypeLabelKey); !req.Has(karpv1.CapacityTypeOnDemand) || !req.Has(karpv1.CapacityTypeSpot) {
187+
return instanceTypes, nil
188+
}
189+
190+
cheapestOnDemand := math.MaxFloat64
191+
hasSpotOfferings := false
192+
hasODOfferings := false
193+
for _, it := range instanceTypes {
194+
for _, o := range it.Offerings.Compatible(f.requirements).Available() {
195+
if ct := o.Requirements.Get(karpv1.CapacityTypeLabelKey).Any(); ct == karpv1.CapacityTypeOnDemand {
196+
hasODOfferings = true
197+
if o.Price < cheapestOnDemand {
198+
cheapestOnDemand = o.Price
199+
}
200+
} else if ct == karpv1.CapacityTypeSpot {
201+
hasSpotOfferings = true
202+
}
203+
}
204+
}
205+
if !hasODOfferings || !hasSpotOfferings {
206+
return instanceTypes, nil
207+
}
208+
209+
// Filter out any types where the cheapest spot offering is more expensive than the cheapest on-demand instance type
210+
// that would have worked. This prevents us from getting a larger, more-expensive spot instance type compared to the
211+
// cheapest sufficiently large on-demand instance type.
212+
return lo.FilterReject(instanceTypes, func(it *cloudprovider.InstanceType, _ int) bool {
213+
var hasSpotOffering bool
214+
for _, o := range it.Offerings.Compatible(f.requirements).Available() {
215+
// Always include instance types which have compatible, available reserved offerings since they're modeled as free
216+
if o.CapacityType() == karpv1.CapacityTypeReserved {
217+
return true
218+
}
219+
// If the offering is spot and cheaper than the cheapest on-demand instance type, include the instance type
220+
if o.CapacityType() == karpv1.CapacityTypeSpot {
221+
hasSpotOffering = true
222+
if o.Price <= cheapestOnDemand {
223+
return true
224+
}
225+
}
226+
}
227+
return !hasSpotOffering
228+
})
229+
}
230+
231+
func (spotInstanceFilter) Name() string {
232+
return "spot-instance-filter"
233+
}

0 commit comments

Comments
 (0)