Allow etcd anchor to recover from bad state
If an etcd member has corrupted data or has somehow been removed from the
cluster, the anchor does not currently recover. This change adds a
configurable threshold of monitoring loops after which the anchor will
remove the member from the cluster and recreate it.

Note: This is safe due to etcd's strict quorum checking on runtime
reconfiguration; see [0].

[0] https://github.com/etcd-io/etcd/blob/master/Documentation/op-guide/configuration.md#--strict-reconfig-check

Change-Id: Id2ceea7393c46bed9fa5e3ead37014e52c91eac3

parent 59c27d76ab
commit d2f020fbb7
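
For context on why this is safe to automate, the anchor's recovery path maps
onto etcd's standard runtime-reconfiguration commands. A minimal manual
equivalent is sketched below; the endpoints, member name, and URLs are
illustrative placeholders (not values from this change), and etcdctl v3 is
assumed:

    # Sketch of the manual recovery flow the anchor automates.
    # All endpoints and names here are illustrative placeholders.
    ENDPOINT=https://etcd-client.example.com:2379
    PEER_URL=https://etcd-0.example.com:2380

    # Find the broken member's ID: the first comma-separated field of
    # 'etcdctl member list' output.
    memberid=$(etcdctl --endpoints "$ENDPOINT" member list \
        | grep "$PEER_URL" | awk -F ',' '{print $1}')

    # Remove the member. With strict-reconfig-check (on by default),
    # etcd rejects any reconfiguration that would cost the cluster
    # quorum, which is what makes this step safe to automate.
    etcdctl --endpoints "$ENDPOINT" member remove "$memberid"

    # Wipe the member's data directory, then re-add it so it rejoins
    # as a fresh member and replicates from the cluster.
    etcdctl --endpoints "$ENDPOINT" member add etcd-0 --peer-urls "$PEER_URL"

Because etcd refuses a reconfiguration that would break quorum, a misbehaving
anchor should at worst fail to recover one member; it should not be able to
take down a healthy cluster.
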
@@ -46,7 +46,16 @@ function sync_configuration {
     sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
     chmod go-rwx "${MANIFEST_PATH}"
 }
+
+function cleanup_host {
+    rm -f $MANIFEST_PATH
+    rm -rf /etcd-etc/tls/
+    rm -rf /etcd-data/*
+
+    firstrun=true
+}
 
 firstrun=true
+saddness_duration=0
 while true; do
     # TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
     # up so I don't try to take two actions on the node at once.
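
The saddness_duration counter introduced here is a simple debounce: an
unhealthy member is tolerated for a configurable number of loops before the
anchor intervenes. The pattern in isolation looks roughly like this (a sketch
only; check_member_healthy, recover_member, and the sleep interval are
hypothetical stand-ins for the real checks in this script):

    saddness_duration=0
    threshold=3   # stands in for the rendered {{ .Values.anchor.saddness_threshold }}

    while true; do
        if check_member_healthy; then           # hypothetical stub for the real health check
            saddness_duration=0                 # any healthy pass resets the counter
        elif [ "$saddness_duration" -ge "$threshold" ]; then
            recover_member                      # hypothetical stub: member remove + cleanup_host
            saddness_duration=0
        else
            saddness_duration=$((saddness_duration + 1))
        fi
        sleep 10
    done
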
@@ -91,9 +100,6 @@ while true; do
             continue
         fi
         etcdctl member list > /tmp/members
-        # if never started or (ever started and not currently started); then
-        #     resync
-        # fi
     if ! grep $PEER_ENDPOINT /tmp/members; then
         # If this member is not in the cluster, try to add it.
         if grep -v '\bstarted\b' /tmp/members; then
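
Both conditions lean on grep's exit status: grep exits 0 only when at least
one line matches (or, with -v, when at least one line does not match). With a
hypothetical member list, the unstarted-member check behaves like this (the
hostnames and IDs below are invented for illustration):

    $ cat /tmp/members
    8e9e05c52164694d, started, etcd-0, https://etcd-0.example.com:2380, https://etcd-0.example.com:2379
    91bc3c398fb3c146, unstarted, , https://etcd-1.example.com:2380
    $ grep -v '\bstarted\b' /tmp/members; echo $?
    91bc3c398fb3c146, unstarted, , https://etcd-1.example.com:2380
    0

The \b anchors make 'started' match only as a whole word, so an 'unstarted'
line counts as a non-match and keeps the condition true.
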
@@ -108,6 +114,22 @@ while true; do
         echo Successfully added $HOSTNAME to cluster members.
         # Refresh member list so we start with the right configuration.
         etcdctl member list > /tmp/members
+    elif grep $PEER_ENDPOINT /tmp/members | grep '\bunstarted\b'; then
+        # This member is in the cluster but not started
+        if [ $saddness_duration -ge {{ .Values.anchor.saddness_threshold }} ]
+        then
+            # We have surpassed the sadness duration, remove the member and try re-adding
+            memberid=$(grep $PEER_ENDPOINT /tmp/members | awk -F ',' '{print $1}')
+            echo "Removing $memberid from etcd cluster to recreate."
+            if etcdctl member remove "$memberid"; then
+                cleanup_host
+            else
+                echo "ERROR: Attempted recreate member and failed!!!"
+            fi
+            continue
+        else
+            saddness_duration=$(($saddness_duration+1))
+        fi
     fi
     if $firstrun; then
         sync_configuration /tmp/members
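
The memberid extraction relies on the comma-separated format of etcdctl
member list output, where the member ID is the first field. For example
(values invented for illustration):

    $ grep https://etcd-1.example.com:2380 /tmp/members
    91bc3c398fb3c146, unstarted, , https://etcd-1.example.com:2380
    $ grep https://etcd-1.example.com:2380 /tmp/members | awk -F ',' '{print $1}'
    91bc3c398fb3c146

That ID is handed to etcdctl member remove, and cleanup_host then wipes the
static-pod manifest, TLS material, and data directory so the next loop
re-adds the member from a clean slate.
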
@@ -122,6 +144,10 @@ while true; do
             echo Member is not healthy, syncing configurations.
             sync_configuration /tmp/members
             continue
+        else
+            saddness_duration=0
         fi
+    else
+        saddness_duration=0
     fi
 done

@@ -28,7 +28,9 @@ anchor:
   enable_cleanup: true
   etcdctl_endpoint: example-etcd
   host_data_path: /var/lib/etcd/example
+  # How many monitoring loops the anchor goes through with an unhealthy member
+  # before removing the member from the cluster and recreating
+  saddness_threshold: 3
 kubelet:
   manifest_path: /etc/kubernetes/manifests
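
Because the threshold is ordinary chart data rendered via
{{ .Values.anchor.saddness_threshold }}, it can be tuned per deployment
without editing the script; for example with a Helm override (the release
name and chart path below are placeholders):

    helm upgrade --install example-etcd ./charts/etcd \
        --set anchor.saddness_threshold=5

A higher value makes the anchor more patient with slow-starting members; a
lower one recreates broken members sooner.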