Skip to content

Commit 7c8d536

Browse files
committed
Merge branch 'main' into rfcs/terminate-before-create-static-capacity
2 parents 79982b9 + 72ce981 commit 7c8d536

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

45 files changed

+5951
-309
lines changed

Makefile

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# This is the format of an AWS ECR Public Repo as an example.
22
export KWOK_REPO ?= ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com
33
export KARPENTER_NAMESPACE=kube-system
4+
export KIND_CLUSTER_NAME ?= test-cluster
45

56
HELM_OPTS ?= --set logLevel=debug \
67
--set controller.resources.requests.cpu=1 \
@@ -42,6 +43,33 @@ apply-with-kind: verify build-with-kind ## Deploy the kwok controller from the c
4243
--set-string controller.env[0].name=ENABLE_PROFILING \
4344
--set-string controller.env[0].value=true
4445

46+
apply-with-kind-dra: verify build-with-kind ## Deploy the kwok controller for DRA testing
47+
kubectl apply -f kwok/charts/crds
48+
helm upgrade --install karpenter kwok/charts --namespace $(KARPENTER_NAMESPACE) --skip-crds \
49+
$(HELM_OPTS) \
50+
--set controller.image.repository=$(IMG_REPOSITORY) \
51+
--set controller.image.tag=$(IMG_TAG) \
52+
--set serviceMonitor.enabled=false \
53+
--set-string controller.env[0].name=ENABLE_PROFILING \
54+
--set-string controller.env[0].value=true
55+
56+
get-kind-image: ## Extract the actual KWOK image repository from Kind cluster
57+
$(eval IMG_REPOSITORY=$(shell docker exec $(KIND_CLUSTER_NAME)-control-plane crictl images | grep "kind.local/kwok" | awk '{print $$1}' | head -1))
58+
$(eval IMG_TAG=latest)
59+
@echo "Using Repository: $(IMG_REPOSITORY), Tag: $(IMG_TAG)"
60+
61+
setup-kind-dra: ## Setup Kind cluster for DRA testing
62+
-kind delete cluster --name $(KIND_CLUSTER_NAME)
63+
kind create cluster --image kindest/node:v1.34.0 --name $(KIND_CLUSTER_NAME)
64+
KWOK_REPO=kind.local KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) $(MAKE) install-kwok
65+
KWOK_REPO=kind.local KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) $(MAKE) build-with-kind
66+
KWOK_REPO=kind.local KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) $(MAKE) apply-with-kind-dra
67+
kubectl taint nodes $(KIND_CLUSTER_NAME)-control-plane CriticalAddonsOnly=true:NoSchedule --overwrite
68+
kubectl create namespace karpenter --dry-run=client -o yaml | kubectl apply -f -
69+
70+
delete-kind-dra: ## Delete DRA Kind cluster
71+
kind delete cluster --name $(KIND_CLUSTER_NAME)
72+
4573
JUNIT_REPORT := $(if $(ARTIFACT_DIR), --ginkgo.junit-report="$(ARTIFACT_DIR)/junit_report.xml")
4674
e2etests: ## Run the e2e suite against your local cluster
4775
cd test && go test \
@@ -84,6 +112,29 @@ test: ## Run tests
84112
test-memory: ## Run memory usage tests for node overlay store
85113
go test -v ./pkg/controllers/nodeoverlay/... -run TestMemoryUsage
86114

115+
test-dra: ## Run DRA KWOK driver unit tests
116+
go test ./dra-kwok-driver/pkg/... \
117+
-race \
118+
-timeout 20m \
119+
--ginkgo.focus="${FOCUS}" \
120+
--ginkgo.randomize-all \
121+
--ginkgo.v \
122+
-cover
123+
124+
e2etest-dra: ## Run DRA e2e integration tests
125+
kubectl apply -f dra-kwok-driver/pkg/apis/crds/test.karpenter.sh_draconfigs.yaml
126+
-kubectl delete draconfigs --all --ignore-not-found=true
127+
-kubectl delete resourceslices --all --ignore-not-found=true
128+
-pkill -f dra-kwok-driver || true
129+
cd dra-kwok-driver && go build -o dra-kwok-driver main.go
130+
cd dra-kwok-driver && ./dra-kwok-driver > /tmp/dra-driver.log 2>&1 & echo $$! > /tmp/dra-driver.pid
131+
sleep 2
132+
TEST_SUITE=dra $(MAKE) e2etests
133+
-kubectl delete draconfigs --all --ignore-not-found=true
134+
-kubectl delete resourceslices --all --ignore-not-found=true
135+
-kill $$(cat /tmp/dra-driver.pid) 2>/dev/null || true
136+
-rm -f /tmp/dra-driver.pid
137+
87138
benchmark: ## Run benchmark tests for node overlay store
88139
go test -bench=. -benchmem ./pkg/controllers/nodeoverlay/... -run=^$$
89140

@@ -133,4 +184,4 @@ download: ## Recursively "go mod download" on all directories where go.mod exist
133184
gen_instance_types:
134185
go run kwok/tools/gen_instance_types.go > kwok/cloudprovider/instance_types.json
135186

136-
.PHONY: help presubmit install-kwok uninstall-kwok build apply delete test test-memory benchmark deflake vulncheck licenses verify download gen_instance_types
187+
.PHONY: help presubmit install-kwok uninstall-kwok build apply delete test test-memory test-dra e2etest-dra benchmark deflake vulncheck licenses verify download gen_instance_types setup-kind-dra delete-kind-dra apply-with-kind-dra

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,13 @@ Karpenter is a multi-cloud project with implementations by the following cloud p
2222
- [AlibabaCloud](https://github.com/cloudpilot-ai/karpenter-provider-alibabacloud)
2323
- [Bizfly Cloud](https://github.com/bizflycloud/karpenter-provider-bizflycloud)
2424
- [Cluster API](https://github.com/kubernetes-sigs/karpenter-provider-cluster-api)
25+
- [Exoscale](https://github.com/exoscale/karpenter-provider-exoscale/)
2526
- [GCP](https://github.com/cloudpilot-ai/karpenter-provider-gcp)
2627
- [IBM Cloud](https://github.com/kubernetes-sigs/karpenter-provider-ibm-cloud)
2728
- [Proxmox](https://github.com/sergelogvinov/karpenter-provider-proxmox)
28-
- [Oracle Cloud Infrastructure (OCI) - Provided by Zoom](https://github.com/zoom/karpenter-oci)
29+
- Oracle Cloud Infrastructure (OCI)
30+
  - [Officially supported and maintained by Oracle](https://github.com/oracle/karpenter-provider-oci)
31+
  - [Maintained by Zoom](https://github.com/zoom/karpenter-oci)
2932

3033
## Community, discussion, contribution, and support
3134

designs/karpenter-dra-kwok-driver.md

Lines changed: 81 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
## Summary
44
The upstream kubernetes/perf-tests repository includes a [DRA KWOK Driver](https://github.com/kubernetes/perf-tests/pull/3491/files), but it's designed for **ClusterLoader2 scale testing** with pre-created static nodes that cannot be used for Karpenter testing.
55

6-
This design introduces a **Karpenter DRA KWOK Driver** - a mock DRA driver that acts on behalf of KWOK nodes created by Karpenter. When KWOK nodes register with the cluster, the driver creates ResourceSlices advertising fake GPU/device resources. This simulates what a real DRA driver (like NVIDIA GPU Operator) would do, but with fake devices for testing purposes. The driver watches for KWOK nodes and creates corresponding ResourceSlices based on either Node Overlay or ConfigMap configuration. The driver acts independently as a standard Kubernetes controller, ensuring ResourceSlices exist on the API server for both the scheduler and Karpenter's cluster state to discover.
6+
This design introduces a **Karpenter DRA KWOK Driver** - a mock DRA driver that acts on behalf of KWOK nodes created by Karpenter. When KWOK nodes register with the cluster, the driver creates ResourceSlices advertising fake GPU/device resources. This simulates what a real DRA driver (like NVIDIA GPU Operator) would do, but with fake devices for testing purposes. The driver uses a polling approach (30-second interval) to periodically reconcile all KWOK nodes and creates corresponding ResourceSlices based on either Node Overlay or DRAConfig CRD. The driver acts independently as a standard Kubernetes controller, ensuring ResourceSlices exist on the API server for both the scheduler and Karpenter's cluster state to discover.
77

88
### Workflow
9-
1. **Test creates ResourceClaim** with device attribute selectors
10-
2. **Test creates DRA pod** referencing the ResourceClaim
9+
1. **Test creates DRAConfig CRD** defining device pools and node selectors
10+
2. **Test creates DRA pod** with a ResourceClaim referencing device attributes
1111
3. **Karpenter provisions KWOK node** in response to unschedulable pod
12-
4. **Node registration triggers ResourceSlice creation** based on:
12+
4. **Driver polling loop detects new node** (within 30 seconds) and creates ResourceSlices based on:
1313
- **Case 1:** Check for matching NodeOverlay with embedded ResourceSlice objects (future enhancement)
14-
- **Case 2:** Use ConfigMap mappings if no NodeOverlay matches
14+
- **Case 2:** Use DRAConfig CRD pools if no NodeOverlay matches
1515
- **Case 3:** Eventually cloud providers will be able to provide potential ResourceSlice shapes through the InstanceType interface (Future TODO: implement a way for cloud providers to inform our DRAKWOKDriver of those shapes).
1616
5. **Kubernetes scheduler discovers ResourceSlices** and binds pod to node
1717
6. **Pod successfully schedules** to the node with available DRA resources
1818
7. **Test validates** node creation, ResourceSlice creation, pod scheduling, and Karpenter behavior
19-
8. **Cleanup automatically removes** ResourceSlices when nodes are deleted
19+
8. **Cleanup automatically removes** ResourceSlices in next polling cycle when nodes are deleted
2020

2121
## Implementation
2222

@@ -25,7 +25,7 @@ Tests **Karpenter's integrated DRA scheduling** where DRA device counts are know
2525

2626
**Example Node Overlay with DRA** (future API extension):
2727
```yaml
28-
apiVersion: karpenter.sh/v1alpha1
28+
apiVersion: test.karpenter.sh/v1alpha1
2929
kind: NodeOverlay
3030
metadata:
3131
name: gpu-dra-config
@@ -36,107 +36,111 @@ spec:
3636
operator: In
3737
values: ["g5.48xlarge"]
3838
capacity:
39-
karpenter.sh.dra-kwok-driver/device: "8" # Custom extended resource for DRA devices
39+
test.karpenter.sh/device: "8" # Custom extended resource for DRA devices
4040
# TODO: Extend NodeOverlay API to embed ResourceSlice templates
4141
resourceSlices: # FUTURE: Embedded ResourceSlice objects (not yet implemented)
4242
- apiVersion: resource.k8s.io/v1
4343
kind: ResourceSlice
4444
spec:
4545
# nodeName will be filled in by driver when node is created
46-
driver: "karpenter.sh.dra-kwok-driver"
46+
driver: "test.karpenter.sh"
4747
devices:
4848
- name: "nvidia-h100-0"
49-
driver: "karpenter.sh.dra-kwok-driver"
49+
driver: "test.karpenter.sh"
5050
attributes:
5151
memory: "80Gi"
5252
compute-capability: "9.0"
5353
vendor: "nvidia"
5454
- name: "nvidia-h100-1"
55-
driver: "karpenter.sh.dra-kwok-driver"
55+
driver: "test.karpenter.sh"
5656
attributes:
5757
memory: "80Gi"
5858
compute-capability: "9.0"
5959
vendor: "nvidia"
60-
# ... (6 more devices for total of 8)
6160
```
6261
6362
**How it works**:
6463
1. **Test author defines NodeOverlay configuration**: "g5.48xlarge KWOK nodes should have 8x fake H100 GPUs" via ResourceSlices
65-
2. **Driver watches for KWOK nodes**: When Karpenter creates a KWOK node with `instance-type: g5.48xlarge`
66-
3. **NodeOverlay match found**: Driver checks for NodeOverlay with embedded ResourceSlice objects, finds matching configuration
67-
4. **Driver creates ResourceSlice**: Acts as fake DRA driver using embedded ResourceSlice objects from NodeOverlay
68-
5. **Scheduler sees configured devices**: ResourceSlices with fake devices become available for DRA pod scheduling
69-
6. **Test validation**: Validates that the driver correctly provides DRA resources and enables successful pod scheduling
64+
2. **Karpenter creates KWOK node**: Node with `instance-type: g5.48xlarge` is created
65+
3. **Driver polling detects new node**: Within 30 seconds, driver reconciliation loop discovers the node
66+
4. **NodeOverlay match found**: Driver checks for NodeOverlay with embedded ResourceSlice objects, finds matching configuration
67+
5. **Driver creates ResourceSlice**: Acts as fake DRA driver using embedded ResourceSlice objects from NodeOverlay
68+
6. **Scheduler sees configured devices**: ResourceSlices with fake devices become available for DRA pod scheduling
69+
7. **Test validation**: Validates that the driver correctly provides DRA resources and enables successful pod scheduling
7070

71-
### Case 2: ConfigMap Fallback Configuration
72-
Tests **DRA resource provisioning when no NodeOverlay configuration is found** - simulating scenarios where ResourceSlices exist on nodes but weren't defined through NodeOverlay configuration. This addresses when other out of band components manage nodes, partial NodeOverlay coverage (only some instance types configured), and 3rd party DRA driver integration (GPU operators working independently). The driver falls back to ConfigMap-based device configuration when no matching NodeOverlay is found, creating ResourceSlices that Karpenter must then discover and incorporate into future scheduling decisions. This ensures we correctly test that Karpenter successfully discovers ResourceSlices and schedules against them, even if they weren't defined on any NodeOverlays.
71+
### Case 2: CRD-Based Fallback Configuration
72+
Tests **DRA resource provisioning via strongly-typed CRD when no NodeOverlay configuration is found** - simulating scenarios where ResourceSlices exist on nodes but weren't defined through NodeOverlay configuration. This addresses when other out of band components manage nodes, partial NodeOverlay coverage (only some instance types configured), and 3rd party DRA driver integration (GPU operators working independently). The driver falls back to DRAConfig CRD-based device configuration when no matching NodeOverlay is found, creating ResourceSlices that Karpenter must then discover and incorporate into future scheduling decisions. This ensures we correctly test that Karpenter successfully discovers ResourceSlices and schedules against them, even if they weren't defined on any NodeOverlays.
7373

7474
```yaml
75-
apiVersion: v1
76-
kind: ConfigMap
75+
apiVersion: test.karpenter.sh/v1alpha1
76+
kind: DRAConfig
7777
metadata:
78-
name: dra-kwok-configmap
79-
namespace: karpenter
80-
data:
81-
config.yaml: |
82-
driver: "karpenter.sh.dra-kwok-driver"
83-
mappings:
84-
- name: "h100-nodes"
85-
nodeSelector:
86-
matchLabels:
87-
node.kubernetes.io/instance-type: "g5.48xlarge"
88-
kwok.x-k8s.io/node: "fake"
89-
resourceSlice:
90-
devices:
91-
- name: "nvidia-h100"
92-
count: 8
93-
attributes:
94-
memory: "80Gi"
95-
compute-capability: "9.0"
96-
device_class: "gpu"
97-
vendor: "nvidia"
98-
- name: "fpga-nodes"
99-
nodeSelector:
100-
matchLabels:
101-
node.kubernetes.io/instance-type: "f1.2xlarge"
102-
kwok.x-k8s.io/node: "fake"
103-
resourceSlice:
104-
devices:
105-
- name: "xilinx-u250"
106-
count: 1
107-
attributes:
108-
memory: "16Gi"
109-
device_class: "fpga"
110-
vendor: "xilinx"
78+
name: gpu-config # User-chosen name
79+
spec:
80+
driver: "test.karpenter.sh" # Simulated driver name
81+
pools:
82+
- name: "h100-pool"
83+
nodeSelectorTerms:
84+
- matchExpressions:
85+
- key: node.kubernetes.io/instance-type
86+
operator: In
87+
values: ["g5.48xlarge"]
88+
resourceSlices:
89+
- devices:
90+
- name: "nvidia-h100-0"
91+
attributes:
92+
memory: {stringValue: "80Gi"}
93+
compute-capability: {stringValue: "9.0"}
94+
device_class: {stringValue: "gpu"}
95+
vendor: {stringValue: "nvidia"}
96+
- name: "fpga-pool"
97+
nodeSelectorTerms:
98+
- matchExpressions:
99+
- key: node.kubernetes.io/instance-type
100+
operator: In
101+
values: ["f1.2xlarge"]
102+
resourceSlices:
103+
- devices:
104+
- name: "xilinx-u250-0"
105+
attributes:
106+
memory: {stringValue: "16Gi"}
107+
device_class: {stringValue: "fpga"}
108+
vendor: {stringValue: "xilinx"}
109+
111110
```
112111

113112
**How it works**:
114-
1. **Test author defines ConfigMap configuration**: "g5.48xlarge KWOK nodes should have 8x fake H100 GPUs when no NodeOverlay is found"
115-
2. **Driver watches for KWOK nodes**: When Karpenter creates a KWOK node with `instance-type: g5.48xlarge`
116-
3. **No NodeOverlay match found**: Driver checks for NodeOverlay with embedded ResourceSlice objects, finds none, falls back to ConfigMap
117-
4. **Driver creates ResourceSlice**: Acts as fake DRA driver using ConfigMap configuration
118-
5. **Scheduler sees configured devices**: ResourceSlices with fake devices become available for DRA pod scheduling
119-
6. **Test validation**: Validates that the driver correctly provides DRA resources and enables successful pod scheduling
113+
1. **Test author defines DRAConfig CRD**: "g5.48xlarge KWOK nodes should have fake H100 GPUs when no NodeOverlay is found"
114+
2. **Karpenter creates KWOK node**: Node with `instance-type: g5.48xlarge` is created
115+
3. **Driver polling detects new node**: Within 30 seconds, driver reconciliation loop discovers the node
116+
4. **No NodeOverlay match found**: Driver checks for NodeOverlay with embedded ResourceSlice objects, finds none, falls back to DRAConfig CRD
117+
5. **Driver reads DRAConfig**: Gets `test.karpenter.sh` DRAConfig (checked during each 30s polling cycle)
118+
6. **Driver creates ResourceSlices**: For each KWOK node matching the pool's nodeSelectorTerms
119+
7. **Scheduler sees configured devices**: ResourceSlices with fake devices become available for DRA pod scheduling
120+
8. **Test validation**: Validates that the driver correctly provides DRA resources and enables successful pod scheduling
120121

121122
## Directory Structure
122123
```
123124
karpenter/
124-
├── dra-kwok-driver/
125-
│ ├── main.go # Driver entry point
125+
├── dra-kwok-driver/
126+
│ ├── main.go # Driver entry point
126127
│ └── pkg/
127-
│ ├── controller/
128-
│ │ ├── controller.go # Main controller logic
129-
│ │ ├── nodeoverlay.go # NodeOverlay parsing (Case 1)
130-
│ │ ├── configmap.go # ConfigMap parsing (Case 2)
131-
│ │ └── resourceslice.go # ResourceSlice operations
132-
│ └── config/
133-
│ └── types.go # Configuration types
134-
└── test/suites/integration/
135-
└── dra_kwok_test.go # Our DRA KWOK integration tests
128+
│ ├── apis/
129+
│ │ ├── v1alpha1/ # CRD Go types
130+
│ │ │ ├── draconfig_types.go # DRAConfig CRD types
131+
│ │ │ └── zz_generated.deepcopy.go
132+
│ │ └── crds/ # CRD definitions
133+
│ │ └── test.karpenter.sh_draconfigs.yaml
134+
│ └── controllers/
135+
│ ├── resourceslice.go # ResourceSlice lifecycle (manages all drivers)
136+
│ └── resourceslice_test.go # ResourceSlice controller tests
137+
└── test/suites/dra/
138+
├── suite_test.go # Test suite setup
139+
└── dra_kwok_test.go # DRA integration tests
136140
```
137-
1. main.go starts the controller
138-
2. controller.go receives KWOK node events
139-
3. nodeoverlay.go tries to find matching NodeOverlay (Case 1)
140-
4. If no match: configmap.go provides fallback config (Case 2)
141-
5. resourceslice.go creates/updates/deletes the ResourceSlices
142-
6. types.go provides the data structures throughout
141+
142+
**Architecture:**
143+
1. `main.go` starts ResourceSlice controller with namespace
144+
2. `resourceslice.go` polls nodes every 30 seconds, LISTs all DRAConfig CRDs, groups by driver, and creates ResourceSlices for each driver independently
145+
3. `draconfig_types.go` defines CRD types with Pool structs (pool names auto-generated as `<driver>/<node>`)
146+
4. **Multi-driver support:** Single controller manages multiple drivers dynamically (e.g., `gpu.nvidia.com`, `fpga.intel.com`)

0 commit comments

Comments
 (0)