From 9b5b9011042711990abfbd07ec5f2c16e38f30bf Mon Sep 17 00:00:00 2001 From: Pete Birley Date: Thu, 16 May 2019 16:10:32 -0500 Subject: [PATCH] Rabbit: Ensure node has joined cluster on initial startup This PS extends the rabbit startup logic to ensure nodes have actually joined the cluster on startup. Change-Id: Ib876d9abd89209d0a7972983bdf4daacf5f8f582 Signed-off-by: Pete Birley --- .../templates/bin/_rabbitmq-readiness.sh.tpl | 6 ++- rabbitmq/templates/bin/_rabbitmq-start.sh.tpl | 43 +++++++++++++++-- rabbitmq/templates/bin/_rabbitmq-test.sh.tpl | 47 ++++++++----------- .../bin/_rabbitmq-wait-for-cluster.sh.tpl | 12 ++++- 4 files changed, 75 insertions(+), 33 deletions(-) diff --git a/rabbitmq/templates/bin/_rabbitmq-readiness.sh.tpl b/rabbitmq/templates/bin/_rabbitmq-readiness.sh.tpl index 2f30aa437..63e1cc3e7 100644 --- a/rabbitmq/templates/bin/_rabbitmq-readiness.sh.tpl +++ b/rabbitmq/templates/bin/_rabbitmq-readiness.sh.tpl @@ -18,4 +18,8 @@ limitations under the License. set -e -exec rabbitmqctl status +if [ -f /run/rabbit-disable-readiness ]; then + exit 1 +else + exec rabbitmqctl status +fi diff --git a/rabbitmq/templates/bin/_rabbitmq-start.sh.tpl b/rabbitmq/templates/bin/_rabbitmq-start.sh.tpl index df067a137..7993518a7 100644 --- a/rabbitmq/templates/bin/_rabbitmq-start.sh.tpl +++ b/rabbitmq/templates/bin/_rabbitmq-start.sh.tpl @@ -29,10 +29,15 @@ function check_rabbit_node_health () { rabbitmq-diagnostics node_health_check -n "${CLUSTER_SEED_NAME}" -t 10 &>/dev/null } -function check_rabbit_node_ready () { +get_node_name () { TARGET_POD=$1 POD_NAME_PREFIX="$(echo "${MY_POD_NAME}" | awk 'BEGIN{FS=OFS="-"}{NF--; print}')" - CLUSTER_SEED_NAME="$(echo "${RABBITMQ_NODENAME}" | awk -F "@${MY_POD_NAME}." "{ print \$1 \"@${POD_NAME_PREFIX}-${TARGET_POD}.\" \$2 }")" + echo "${RABBITMQ_NODENAME}" | awk -F "@${MY_POD_NAME}." 
"{ print \$1 \"@${POD_NAME_PREFIX}-${TARGET_POD}.\" \$2 }" +} + +function check_rabbit_node_ready () { + TARGET_POD=$1 + CLUSTER_SEED_NAME="$(get_node_name ${TARGET_POD})" CLUSTER_SEED_HOST="$(echo "${CLUSTER_SEED_NAME}" | awk -F '@' '{ print $NF }')" check_rabbit_node_health "${CLUSTER_SEED_NAME}" && \ check_if_open "${CLUSTER_SEED_HOST}" "${PORT_HTTP}" && \ @@ -56,7 +61,39 @@ if ! [ "${POD_INCREMENT}" -eq "0" ] && ! [ -d "/var/lib/rabbitmq/mnesia" ] ; the fi done done - rm -fv /run/rabbit-disable-liveness-probe + + function reset_rabbit () { + rabbitmqctl shutdown || true + rm -rf /var/lib/rabbitmq/* + exit 1 + } + + # Start RabbitMQ, but disable readiness from being reported so the pod is not + # marked as up prematurely. + touch /run/rabbit-disable-readiness + rabbitmq-server & + + # Wait for server to start, and reset if it does not + END=$(($(date +%s) + 180)) + while ! rabbitmqctl -q cluster_status; do + sleep 5 + NOW=$(date +%s) + [ $NOW -gt $END ] && reset_rabbit + done + + # Wait for server to join cluster, reset if it does not + POD_INCREMENT=$(echo "${MY_POD_NAME}" | awk -F '-' '{print $NF}') + END=$(($(date +%s) + 180)) + while ! rabbitmqctl -l --node $(get_node_name 0) -q cluster_status | grep -q "$(get_node_name ${POD_INCREMENT})"; do + sleep 5 + NOW=$(date +%s) + [ $NOW -gt $END ] && reset_rabbit + done + + # Shutdown the inital server + rabbitmqctl shutdown + + rm -fv /run/rabbit-disable-readiness /run/rabbit-disable-liveness-probe fi exec rabbitmq-server diff --git a/rabbitmq/templates/bin/_rabbitmq-test.sh.tpl b/rabbitmq/templates/bin/_rabbitmq-test.sh.tpl index ddbb15ec3..dc9563956 100644 --- a/rabbitmq/templates/bin/_rabbitmq-test.sh.tpl +++ b/rabbitmq/templates/bin/_rabbitmq-test.sh.tpl @@ -16,7 +16,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/}} -set -e +set -ex # Extract connection details RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \ @@ -24,22 +24,30 @@ RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \ RABBIT_PORT=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \ | awk -F'[:/]' '{print $2}'` +set +x # Extract Admin User creadential RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \ | awk -F'[//:]' '{print $4}'` RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \ | awk -F'[//:]' '{print $5}'` +set -x -function rabbit_check_node_count () { - echo "Checking node count " - NODES_IN_CLUSTER=$(rabbitmqadmin \ +function rabbitmqadmin_authed () { + set +x + rabbitmqadmin \ --host="${RABBIT_HOSTNAME}" \ --port="${RABBIT_PORT}" \ --username="${RABBITMQ_ADMIN_USERNAME}" \ --password="${RABBITMQ_ADMIN_PASSWORD}" \ - list nodes -f bash | wc -w) + $@ + set -x +} + +function rabbit_check_node_count () { + echo "Checking node count " + NODES_IN_CLUSTER=$(rabbitmqadmin_authed list nodes -f bash | wc -w) if [ "$NODES_IN_CLUSTER" -eq "$RABBIT_REPLICA_COUNT" ]; then - echo "Number of nodes in cluster match number of desired pods ($NODES_IN_CLUSTER)" + echo "Number of nodes in cluster ($NODES_IN_CLUSTER) match number of desired pods ($NODES_IN_CLUSTER)" else echo "Number of nodes in cluster ($NODES_IN_CLUSTER) does not match number of desired pods ($RABBIT_REPLICA_COUNT)" exit 1 @@ -49,13 +57,9 @@ function rabbit_check_node_count () { rabbit_check_node_count function rabbit_find_partitions () { - rabbitmqadmin \ - --host="${RABBIT_HOSTNAME}" \ - --port="${RABBIT_PORT}" \ - --username="${RABBITMQ_ADMIN_USERNAME}" \ - --password="${RABBITMQ_ADMIN_PASSWORD}" \ - list nodes -f raw_json | \ - python -c " + NODE_INFO=$(mktemp) + rabbitmqadmin_authed list nodes -f pretty_json | tee "${NODE_INFO}" + cat "${NODE_INFO}" | python -c " import json, sys, traceback print('Checking cluster 
partitions') obj=json.load(sys.stdin) @@ -66,31 +70,20 @@ for num, node in enumerate(obj): raise Exception('cluster partition found: %s' % partition) except KeyError: print('Error: partition key not found for node %s' % node) - sys.exit(1) print('No cluster partitions found') " + rm -vf "${NODE_INFO}" } - rabbit_find_partitions function rabbit_check_users_match () { echo "Checking users match on all nodes" - NODES=$(rabbitmqadmin \ - --host="${RABBIT_HOSTNAME}" \ - --port="${RABBIT_PORT}" \ - --username="${RABBITMQ_ADMIN_USERNAME}" \ - --password="${RABBITMQ_ADMIN_PASSWORD}" \ - list nodes -f bash) + NODES=$(rabbitmqadmin_authed list nodes -f bash) USER_LIST=$(mktemp --directory) echo "Found the following nodes: ${NODES}" for NODE in ${NODES}; do echo "Checking Node: ${NODE#*@}" - rabbitmqadmin \ - --host=${NODE#*@} \ - --port="${RABBIT_PORT}" \ - --username="${RABBITMQ_ADMIN_USERNAME}" \ - --password="${RABBITMQ_ADMIN_PASSWORD}" \ - list users -f bash > ${USER_LIST}/${NODE#*@} + rabbitmqadmin_authed list users -f bash > ${USER_LIST}/${NODE#*@} done cd ${USER_LIST}; diff -q --from-file $(ls ${USER_LIST}) echo "User lists match for all nodes" diff --git a/rabbitmq/templates/bin/_rabbitmq-wait-for-cluster.sh.tpl b/rabbitmq/templates/bin/_rabbitmq-wait-for-cluster.sh.tpl index e2687c91b..c9895762a 100644 --- a/rabbitmq/templates/bin/_rabbitmq-wait-for-cluster.sh.tpl +++ b/rabbitmq/templates/bin/_rabbitmq-wait-for-cluster.sh.tpl @@ -30,13 +30,21 @@ RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $ RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \ | awk -F'[//:]' '{print $5}'` -function active_rabbit_nodes () { +set -ex + +function rabbitmqadmin_authed () { + set +x rabbitmqadmin \ --host="${RABBIT_HOSTNAME}" \ --port="${RABBIT_PORT}" \ --username="${RABBITMQ_ADMIN_USERNAME}" \ --password="${RABBITMQ_ADMIN_PASSWORD}" \ - list nodes -f bash | wc -w + $@ + set -x +} + +function active_rabbit_nodes () { + 
rabbitmqadmin_authed list nodes -f bash | wc -w } until test "$(active_rabbit_nodes)" -ge "$RABBIT_REPLICA_COUNT"; do