Use less aggressive node startup timeouts (#230)

* Use less aggressive node startup timeouts

* Tweaks to try and prevent runaway remediations
This commit is contained in:
Matt Pryor 2024-01-29 14:26:17 +00:00 committed by GitHub
parent 44d77503d7
commit b9eadc2650
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -290,10 +290,10 @@ controlPlane:
enabled: true
# The spec for the health check
spec:
# By default, unhealthy control plane nodes are always remediated
maxUnhealthy: 100%
# If a node takes longer than 10 mins to startup, remediate it
nodeStartupTimeout: 10m0s
# By default, don't remediate control plane nodes when more than one is unhealthy
maxUnhealthy: 1
# If a node takes longer than 30 mins to start up, remediate it
nodeStartupTimeout: 30m0s
# By default, consider a control plane node that has not been Ready
# for more than 5 mins unhealthy
unhealthyConditions:
@@ -387,10 +387,11 @@ nodeGroupDefaults:
enabled: true
# The spec for the health check
spec:
# By default, unhealthy worker nodes are always remediated
maxUnhealthy: 100%
# If a node takes longer than 10 mins to startup, remediate it
nodeStartupTimeout: 10m0s
# By default, remediate unhealthy workers as long as they make up no more than
# 40% of the total number of workers in the node group
maxUnhealthy: 40%
# If a node takes longer than 30 mins to start up, remediate it
nodeStartupTimeout: 30m0s
# By default, consider a worker node that has not been Ready for
# more than 5 mins unhealthy
unhealthyConditions: