
#!/bin/sh
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -xu

TEMP_MANIFEST=/tmp/etcd.yaml

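# Copy $1 over $2 only when the contents differ; cmp exits non-zero on a
# mismatch or a missing destination, so unchanged files are never rewritten.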
sync_file () {
    if ! cmp "$1" "$2"; then
        cp -f "$1" "$2"
    fi
}

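# Stage the CA bundles and this node's client/peer certificate and key pairs
# under /etcd-etc/tls, mapping the per-node file names to the generic names
# referenced by the etcd pod manifest.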
sync_certificates () {
    mkdir -p /etcd-etc/tls
    sync_file /etc/etcd/tls/certs/client-ca.pem /etcd-etc/tls/client-ca.pem
    sync_file /etc/etcd/tls/certs/peer-ca.pem /etcd-etc/tls/peer-ca.pem
    sync_file "/etc/etcd/tls/certs/${ETCD_NAME}-etcd-client.pem" /etcd-etc/tls/etcd-client.pem
    sync_file "/etc/etcd/tls/certs/${ETCD_NAME}-etcd-peer.pem" /etcd-etc/tls/etcd-peer.pem
    sync_file "/etc/etcd/tls/keys/${ETCD_NAME}-etcd-client-key.pem" /etcd-etc/tls/etcd-client-key.pem
    sync_file "/etc/etcd/tls/keys/${ETCD_NAME}-etcd-peer-key.pem" /etcd-etc/tls/etcd-peer-key.pem
}

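# Render the anchored etcd pod manifest from the chart-provided template:
# $1 is the initial cluster string, $2 the initial cluster state
# (new or existing), and $3 the destination path.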
create_manifest () {
    WIP=/tmp/wip-manifest.yaml
    cp -f /anchor-etcd/{{ .Values.service.name }}.yaml $WIP
    sed -i -e 's#_ETCD_INITIAL_CLUSTER_STATE_#'"$2"'#g' $WIP
    sed -i -e 's#_ETCD_INITIAL_CLUSTER_#'"$1"'#g' $WIP
    sync_file "$WIP" "$3"
}

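# Rebuild this member's configuration from the member list file in $1:
# list every other member as name=peer-url, append this node's own peer URL,
# and render the manifest with cluster state "existing". MANIFEST_PATH comes
# from the pod environment (presumably the kubelet's static manifest path).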
sync_configuration () {
    sync_certificates
    ETCD_INITIAL_CLUSTER=$(grep -v $PEER_ENDPOINT "$1" \
        | awk -F ', ' '{ print $3 "=" $4 }' \
        | tr '\n' ',' \
        | sed "s;\$;$ETCD_NAME=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }};")
    ETCD_INITIAL_CLUSTER_STATE=existing
    create_manifest "$ETCD_INITIAL_CLUSTER" "$ETCD_INITIAL_CLUSTER_STATE" "$TEMP_MANIFEST"
    sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
    chmod go-rwx "${MANIFEST_PATH}"
}

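# Wipe all local state for this member (static manifest, synced TLS material,
# and the etcd data directory) so it can rejoin the cluster from scratch.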
cleanup_host () {
    rm -f $MANIFEST_PATH
    rm -rf /etcd-etc/tls/
    rm -rf /etcd-data/*
    firstrun=true
}

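# Loop state: firstrun forces one configuration sync after the anchor starts;
# saddness_duration counts consecutive iterations this member has sat
# unstarted (spelling kept to match the chart's saddness_threshold value).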
firstrun=true
saddness_duration=0

while true; do
    # TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
    # up so I don't try to take two actions on the node at once.
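    # Bootstrapping: a marker file tells this anchor to render a single-member
    # (state=new) cluster; the marker is removed once that member reports
    # healthy, after which the remaining nodes join the cluster as learners.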
{{- if .Values.bootstrapping.enabled }}
    if [ -e /bootstrapping/{{ .Values.bootstrapping.filename }} ]; then
        # If the first node is starting, wait for it to become healthy
        end=$(($(date +%s) + {{ .Values.bootstrapping.timeout }}))
        while etcdctl member list | grep $POD_IP; do
            if ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
                echo Member appears healthy, removing bootstrap file.
                rm /bootstrapping/{{ .Values.bootstrapping.filename }}
                break
            else
                now=$(date +%s)
                if [ $now -gt $end ]; then
                    echo Member did not start successfully before bootstrap timeout. Deleting and trying again.
                    rm -f $MANIFEST_PATH
                    sleep {{ .Values.anchor.period }}
                    break
                fi
                sleep {{ .Values.anchor.period }}
            fi
        done
    fi

    if [ -e /bootstrapping/{{ .Values.bootstrapping.filename }} ]; then
        # Bootstrap the first node
        sync_certificates
        ETCD_INITIAL_CLUSTER=${ETCD_NAME}=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }}
        ETCD_INITIAL_CLUSTER_STATE=new
        create_manifest "$ETCD_INITIAL_CLUSTER" "$ETCD_INITIAL_CLUSTER_STATE" "$MANIFEST_PATH"
        sleep {{ .Values.anchor.period }}
        continue
    fi
{{- end }}

    sleep {{ .Values.anchor.period }}

    if [ -e /tmp/stopped ]; then
        echo Stopping
        break
    fi

    if [ -e /tmp/stopping ]; then
        echo Waiting to stop...
        continue
    fi

    if ! etcdctl member list > /tmp/members; then
        echo Could not get a member list, trying again.
        continue
    fi

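    # Decide what this member needs: not yet registered -> add it as a
    # learner; registered but unstarted -> wait, or recreate it after the
    # sadness threshold; still a learner -> promote it to a voting member.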
    if ! grep $PEER_ENDPOINT /tmp/members; then
        # If this member is not in the cluster, try to add it.
        if grep -v '\bstarted\b' /tmp/members; then
            echo Cluster does not appear fully online, waiting.
            continue
        fi

        # Add this member to the cluster as a learner.
        if ! etcdctl member add $HOSTNAME --peer-urls $PEER_ENDPOINT --learner; then
            echo Failed to add $HOSTNAME to member list. Waiting.
            continue
        fi
        echo Successfully added $HOSTNAME to cluster members.

        # Refresh member list so we start with the right configuration.
        if ! etcdctl member list > /tmp/members; then
            echo Could not get a member list, trying again.
            continue
        fi
    elif grep $PEER_ENDPOINT /tmp/members | grep '\bunstarted\b'; then
        # This member is in the cluster but not started
        if [ $saddness_duration -ge {{ .Values.anchor.saddness_threshold }} ]; then
            # We have surpassed the sadness threshold; remove the member and re-add it.
            memberid=$(grep $PEER_ENDPOINT /tmp/members | awk -F ',' '{print $1}')
            echo "Removing $memberid from etcd cluster to recreate."
            if etcdctl member remove "$memberid"; then
                cleanup_host
            else
                echo "ERROR: Attempted to recreate member and failed!"
            fi
        else
            saddness_duration=$((saddness_duration + 1))
            sync_configuration /tmp/members
            sleep {{ .Values.anchor.health_wait_period }}
            firstrun=false
        fi
        continue
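    # The member is registered and started but the IS LEARNER field (column 6
    # of the member list) is still true: promote it to a full voting member.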
elif [ "$(grep $PEER_ENDPOINT /tmp/members | awk -F, '/,/{gsub(/ /, "", $6); print $6}')" = "true" ]; then
|
|
if ! etcdctl member promote "$(grep $PEER_ENDPOINT /tmp/members | awk -F, '{print $1}')"; then
|
|
echo Failed to promote $HOSTNAME to member list. Waiting.
|
|
fi
|
|
continue
|
|
fi
|
|
    if $firstrun; then
        sync_configuration /tmp/members
        firstrun=false
        sleep {{ .Values.anchor.health_wait_period }}
        continue
    fi

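    # Steady state: re-sync the configuration only after two consecutive
    # failed health checks; reset the sadness counter whenever the member
    # reports healthy.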
    if ! ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
        # If not healthy, sleep before checking again and then updating the configuration.
        echo Member is not healthy, sleeping before checking again.
        sleep {{ .Values.anchor.health_wait_period }}
        if ! ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
            # If still not healthy, update the configuration.
            echo Member is not healthy, syncing configurations.
            sync_configuration /tmp/members
            continue
        else
            saddness_duration=0
        fi
    else
        saddness_duration=0
    fi
done