Make K8S proxy health check more aggressive
In K8S version 1.10, the proxy can sometimes get stuck believing that some services do not have any endpoints. This seems to be triggered by network instability, though the proxy doesn't seem to recover on its own, while bouncing the pod fixes the issue. This change adds a naive means of detecting and recovering from this (`iptables-save | grep 'has no endpoints'` in the liveness probe) that may occasionally have false positives. As such, the liveness probe is configured very conservatively to avoid triggering CrashLoopBackOff in the event of a false positive. Finally, there is a whitelist feature to help avoid false positives for services that are known to legitimately have empty endpoints during the course of normal operation (e.g. Patroni might manage such an endpoint list). Change-Id: I29a770fab70b1fb79db59ef5408f40b2af1c01f9
This commit is contained in:
parent
0233c30ffb
commit
69cb269230
26
charts/proxy/templates/bin/_liveness-probe.sh.tpl
Normal file
26
charts/proxy/templates/bin/_liveness-probe.sh.tpl
Normal file
@ -0,0 +1,26 @@
|
||||
#!/bin/bash

# Liveness probe for kube-proxy.
#
# Two checks are run, and both are always evaluated and reported:
#   1. The proxy's built-in HTTP health endpoint (localhost:10256/healthz)
#      must answer with "200 OK".
#   2. The iptables-save output must not contain "has no endpoints" rules,
#      except for lines matched by the optional whitelist.
# Exits 1 if either check fails, 0 otherwise.

set -e

FAILURE=0

{{- if .Values.livenessProbe.whitelist }}
# Extended regular expression built from the whitelist entries; iptables-save
# lines matching it are ignored by the endpoint check.
WHITELIST='({{- join "|" .Values.livenessProbe.whitelist -}})'
{{- end }}

REQUEST='GET /healthz HTTP/1.0\r\nHost: localhost:10256\r\n'

# Print the non-whitelisted "has no endpoints" rules, if any.
# The trailing "|| true" keeps "set -e" from aborting the probe when grep
# finds nothing; callers decide based on whether any output was produced.
list_unserved_rules() {
    iptables-save {{- if .Values.livenessProbe.whitelist }} | grep -Ev "${WHITELIST}" {{- end }} | grep -s 'has no endpoints' || true
}

# Query the health endpoint once and keep the reply, so that the diagnostics
# show the very response that failed the check. The "|| true" ensures a
# refused connection cannot trip "set -e" before the iptables check has run.
RESPONSE=$(echo -e "${REQUEST}" | socat - TCP4:localhost:10256 || true)
if ! grep -sq '200 OK' <<< "${RESPONSE}"; then
    echo Failed proxy built-in HTTP health check.
    echo "${RESPONSE}"
    FAILURE=1
fi

# Capture the offending rules once; the same (whitelist-filtered) list drives
# both the decision and the diagnostic output.
UNSERVED=$(list_unserved_rules)
if [[ -n "${UNSERVED}" ]]; then
    echo Some non-whitelisted services have no endpoints:
    echo "${UNSERVED}"
    FAILURE=1
fi

exit "${FAILURE}"
|
5
charts/proxy/templates/bin/_readiness-probe.sh.tpl
Normal file
5
charts/proxy/templates/bin/_readiness-probe.sh.tpl
Normal file
@ -0,0 +1,5 @@
|
||||
#!/bin/bash

# Readiness probe: succeeds when the iptables-save output contains a line
# mentioning the default/kubernetes:https service. The exit status of grep
# is the result of the probe; matching lines are echoed to stdout.
set -e

iptables-save | grep -F 'default/kubernetes:https'
|
26
charts/proxy/templates/configmap-bin.yaml
Normal file
26
charts/proxy/templates/configmap-bin.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
{{/*
|
||||
# Copyright (c) 2018 AT&T Intellectual Property. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License. */}}
|
||||
|
||||
---
# ConfigMap carrying the probe scripts rendered from the templates under bin/.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubernetes-proxy-bin
data:
  # Each script is rendered by the helm-toolkit template helper and indented
  # by four spaces so that it sits inside the literal block scalar.
  liveness-probe.sh: |
{{ tuple "bin/_liveness-probe.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
  readiness-probe.sh: |
{{ tuple "bin/_readiness-probe.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
...
|
@ -63,24 +63,23 @@ spec:
|
||||
- name: KUBERNETES_SERVICE_PORT
|
||||
value: {{ .Values.kube_service.port | quote }}
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
host: 127.0.0.1
|
||||
path: /healthz
|
||||
port: 10256
|
||||
failureThreshold: 3
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 5
|
||||
{{ toYaml .Values.livenessProbe.config | indent 10 }}
|
||||
exec:
|
||||
command:
|
||||
- /tmp/bin/liveness-probe.sh
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |-
|
||||
set -ex
|
||||
iptables-save | grep 'default/kubernetes:https'
|
||||
- /tmp/bin/readiness-probe.sh
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 15
|
||||
volumeMounts:
|
||||
- name: bin
|
||||
mountPath: /tmp/bin/
|
||||
serviceAccountName: kube-proxy
|
||||
volumes:
|
||||
- name: bin
|
||||
configMap:
|
||||
name: kubernetes-proxy-bin
|
||||
defaultMode: 0555
|
||||
{{- end }}
|
||||
|
@ -55,3 +55,17 @@ network:
|
||||
kube_service:
|
||||
host: 127.0.0.1
|
||||
port: 6553
|
||||
|
||||
livenessProbe:
  # Probe timing settings; rendered verbatim (via toYaml) into the container's
  # livenessProbe definition.
  config:
    # NOTE(mark-burnett): To avoid cascading failure modes, it is
    # important that these values are configured to avoid the possibility
    # of CrashLoopBackoff for this pod.  Otherwise, a small non-impacting
    # issue could disable kube-proxy for the entire site.
    failureThreshold: 10
    initialDelaySeconds: 15
    periodSeconds: 35
    successThreshold: 1
    timeoutSeconds: 10
  # Patterns for services that may legitimately have no endpoints. Entries are
  # joined with "|" into an extended regular expression, and iptables-save
  # lines matching it are excluded from the "has no endpoints" check.
  # Example:
  #   whitelist:
  #     - postgres
  whitelist: []
|
||||
|
Loading…
x
Reference in New Issue
Block a user