Skip to content

Commit c3753fe

Browse files
authored
feat: allow for configuring auto_scaler_profile (#278)
* feat: allow for configuring auto_scaler_profile Signed-off-by: David van der Spek <vanderspek.david@gmail.com>
1 parent fc9337b commit c3753fe

6 files changed

Lines changed: 288 additions & 98 deletions

File tree

README.md

Lines changed: 116 additions & 98 deletions
Large diffs are not rendered by default.

locals.tf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
locals {
2+
# If auto_scaler_profile_scale_down_delay_after_delete is not set (null), fall back to the value of auto_scaler_profile_scan_interval.
3+
auto_scaler_profile_scale_down_delay_after_delete = var.auto_scaler_profile_scale_down_delay_after_delete == null ? var.auto_scaler_profile_scan_interval : var.auto_scaler_profile_scale_down_delay_after_delete
24
# automatic upgrades are either:
35
# - null
46
# - patch, but then the kubernetes_version must not specify a patch number and orchestrator_version must be null

main.tf

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,29 @@ resource "azurerm_kubernetes_cluster" "main" {
9797
subnet_name = var.aci_connector_linux_subnet_name
9898
}
9999
}
100+
dynamic "auto_scaler_profile" {
101+
for_each = var.auto_scaler_profile_enabled ? ["default_auto_scaler_profile"] : []
102+
103+
content {
104+
balance_similar_node_groups = var.auto_scaler_profile_balance_similar_node_groups
105+
empty_bulk_delete_max = var.auto_scaler_profile_empty_bulk_delete_max
106+
expander = var.auto_scaler_profile_expander
107+
max_graceful_termination_sec = var.auto_scaler_profile_max_graceful_termination_sec
108+
max_node_provisioning_time = var.auto_scaler_profile_max_node_provisioning_time
109+
max_unready_nodes = var.auto_scaler_profile_max_unready_nodes
110+
max_unready_percentage = var.auto_scaler_profile_max_unready_percentage
111+
new_pod_scale_up_delay = var.auto_scaler_profile_new_pod_scale_up_delay
112+
scale_down_delay_after_add = var.auto_scaler_profile_scale_down_delay_after_add
113+
scale_down_delay_after_delete = local.auto_scaler_profile_scale_down_delay_after_delete
114+
scale_down_delay_after_failure = var.auto_scaler_profile_scale_down_delay_after_failure
115+
scale_down_unneeded = var.auto_scaler_profile_scale_down_unneeded
116+
scale_down_unready = var.auto_scaler_profile_scale_down_unready
117+
scale_down_utilization_threshold = var.auto_scaler_profile_scale_down_utilization_threshold
118+
scan_interval = var.auto_scaler_profile_scan_interval
119+
skip_nodes_with_local_storage = var.auto_scaler_profile_skip_nodes_with_local_storage
120+
skip_nodes_with_system_pods = var.auto_scaler_profile_skip_nodes_with_system_pods
121+
}
122+
}
100123
dynamic "azure_active_directory_role_based_access_control" {
101124
for_each = var.role_based_access_control_enabled && var.rbac_aad && var.rbac_aad_managed ? ["rbac"] : []
102125

test/unit/unit_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,32 @@ func TestInvalidVarsForAutomaticUpgrades(t *testing.T) {
199199
}
200200
}
201201

202+
func TestScaleDownDelayAfterDeleteNotSetShouldUseScanInterval(t *testing.T) {
203+
test_helper.RunE2ETest(t, "../../", "unit-test-fixture", terraform.Options{
204+
Upgrade: false,
205+
Vars: dummyRequiredVariables(),
206+
}, func(t *testing.T, output test_helper.TerraformOutput) {
207+
scaleDownDelayAfterDelete, ok := output["auto_scaler_profile_scale_down_delay_after_delete"].(string)
208+
assert.True(t, ok)
209+
scanInterval, ok := output["auto_scaler_profile_scan_interval"].(string)
210+
assert.True(t, ok)
211+
assert.Equal(t, scanInterval, scaleDownDelayAfterDelete)
212+
})
213+
}
214+
215+
func TestScaleDownDelayAfterDeleteSetShouldUseVar(t *testing.T) {
216+
vars := dummyRequiredVariables()
217+
vars["auto_scaler_profile_scale_down_delay_after_delete"] = "15s"
218+
test_helper.RunE2ETest(t, "../../", "unit-test-fixture", terraform.Options{
219+
Upgrade: false,
220+
Vars: vars,
221+
}, func(t *testing.T, output test_helper.TerraformOutput) {
222+
scaleDownDelayAfterDelete, ok := output["auto_scaler_profile_scale_down_delay_after_delete"].(string)
223+
assert.True(t, ok)
224+
assert.Equal(t, "15s", scaleDownDelayAfterDelete)
225+
})
226+
}
227+
202228
func dummyRequiredVariables() map[string]interface{} {
203229
return map[string]interface{}{
204230
"prefix": "foo",

unit-test-fixture/outputs.tf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,11 @@ output "log_analytics_workspace" {
1313
output "automatic_channel_upgrade_check" {
1414
value = local.automatic_channel_upgrade_check
1515
}
16+
17+
output "auto_scaler_profile_scale_down_delay_after_delete" {
18+
value = local.auto_scaler_profile_scale_down_delay_after_delete
19+
}
20+
21+
output "auto_scaler_profile_scan_interval" {
22+
value = var.auto_scaler_profile_scan_interval
23+
}

variables.tf

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,119 @@ variable "api_server_authorized_ip_ranges" {
9393
default = null
9494
}
9595

96+
variable "auto_scaler_profile_balance_similar_node_groups" {
97+
description = "Detect similar node groups and balance the number of nodes between them. Defaults to `false`."
98+
type = bool
99+
default = false
100+
}
101+
102+
variable "auto_scaler_profile_empty_bulk_delete_max" {
103+
description = "Maximum number of empty nodes that can be deleted at the same time. Defaults to `10`."
104+
type = number
105+
default = 10
106+
}
107+
108+
variable "auto_scaler_profile_enabled" {
109+
type = bool
110+
description = "Enable configuring the auto scaler profile"
111+
default = false
112+
nullable = false
113+
}
114+
115+
variable "auto_scaler_profile_expander" {
116+
description = "Expander to use. Possible values are `least-waste`, `priority`, `most-pods` and `random`. Defaults to `random`."
117+
type = string
118+
default = "random"
119+
validation {
120+
condition = contains(["least-waste", "most-pods", "priority", "random"], var.auto_scaler_profile_expander)
121+
error_message = "Must be either `least-waste`, `most-pods`, `priority` or `random`."
122+
}
123+
}
124+
125+
variable "auto_scaler_profile_max_graceful_termination_sec" {
126+
description = "Maximum number of seconds the cluster autoscaler waits for pod termination when trying to scale down a node. Defaults to `600`."
127+
type = string
128+
default = "600"
129+
}
130+
131+
variable "auto_scaler_profile_max_node_provisioning_time" {
132+
description = "Maximum time the autoscaler waits for a node to be provisioned. Defaults to `15m`."
133+
type = string
134+
default = "15m"
135+
}
136+
137+
variable "auto_scaler_profile_max_unready_nodes" {
138+
description = "Maximum Number of allowed unready nodes. Defaults to `3`."
139+
type = number
140+
default = 3
141+
}
142+
143+
variable "auto_scaler_profile_max_unready_percentage" {
144+
description = "Maximum percentage of unready nodes the cluster autoscaler will stop if the percentage is exceeded. Defaults to `45`."
145+
type = number
146+
default = 45
147+
}
148+
149+
variable "auto_scaler_profile_new_pod_scale_up_delay" {
150+
description = "For scenarios like burst/batch scale where you don't want CA to act before the kubernetes scheduler could schedule all the pods, you can tell CA to ignore unscheduled pods before they're a certain age. Defaults to `10s`."
151+
type = string
152+
default = "10s"
153+
}
154+
155+
variable "auto_scaler_profile_scale_down_delay_after_add" {
156+
description = "How long after the scale up of AKS nodes the scale down evaluation resumes. Defaults to `10m`."
157+
type = string
158+
default = "10m"
159+
}
160+
161+
variable "auto_scaler_profile_scale_down_delay_after_delete" {
162+
description = "How long after node deletion that scale down evaluation resumes. Defaults to the value used for `scan_interval`."
163+
type = string
164+
default = null
165+
}
166+
167+
variable "auto_scaler_profile_scale_down_delay_after_failure" {
168+
description = "How long after scale down failure that scale down evaluation resumes. Defaults to `3m`."
169+
type = string
170+
default = "3m"
171+
}
172+
173+
variable "auto_scaler_profile_scale_down_unneeded" {
174+
description = "How long a node should be unneeded before it is eligible for scale down. Defaults to `10m`."
175+
type = string
176+
default = "10m"
177+
}
178+
179+
variable "auto_scaler_profile_scale_down_unready" {
180+
description = "How long an unready node should be unneeded before it is eligible for scale down. Defaults to `20m`."
181+
type = string
182+
default = "20m"
183+
}
184+
185+
variable "auto_scaler_profile_scale_down_utilization_threshold" {
186+
description = "Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down. Defaults to `0.5`."
187+
type = string
188+
default = "0.5"
189+
}
190+
191+
variable "auto_scaler_profile_scan_interval" {
192+
description = "How often the AKS Cluster should be re-evaluated for scale up/down. Defaults to `10s`."
193+
type = string
194+
default = "10s"
195+
}
196+
197+
variable "auto_scaler_profile_skip_nodes_with_local_storage" {
198+
description = "If `true` cluster autoscaler will never delete nodes with pods with local storage, for example, EmptyDir or HostPath. Defaults to `true`."
199+
type = bool
200+
default = true
201+
}
202+
203+
variable "auto_scaler_profile_skip_nodes_with_system_pods" {
204+
description = "If `true` cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods). Defaults to `true`."
205+
type = bool
206+
default = true
207+
}
208+
96209
variable "automatic_channel_upgrade" {
97210
type = string
98211
default = null

0 commit comments

Comments
 (0)