Updated resiliency gate
Updated the resiliency gate scripts to consistently pass all gate stages, using the Ubuntu Bionic image for node deployment.

- Updated developer-onboarding.rst with information on how to configure and run the resiliency gate behind a corporate proxy.
- Updated the gate scripts to use the proxy configuration.
- Updated up.sh to pull the hyperkube image as a cache, to speed up and stabilize the kubelet's initial deployment of Kubernetes cluster services.
- Added and updated sleeps and retries in some gate stages and scripts to avoid gate failures due to transient environment issues.
- Updated the Ubuntu base image for node deployments from Xenial to Bionic.
- Added code in the teardown-nodes stage to manually remove the etcd members kubernetes and calico, since they otherwise remain listed as etcd members on the genesis node even after genesis is torn down.

Change-Id: Ia11d66ab30ac7a07626d4f1d02a6da48155f862d
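With the new teardown flags (wired into the gate manifest below), the Teardown Genesis stage's invocation boils down to roughly the following (a sketch; the argument values are taken verbatim from the stage definition in this change):

    # Tear down n0 via n1, recreate its VM (-r), and scrub its stale
    # membership from the kubernetes and calico etcd clusters:
    teardown-nodes.sh -e kubernetes -e calico -v n1 -n n0 -r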
@@ -43,6 +43,25 @@ debug it, e.g.:
 
     ./tools/g2/bin/ssh.sh n0
 
+Running Resiliency Tests Behind Corporate Proxy
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If your development environment is behind a corporate proxy, you will need to
+update the following files to add your environment's proxy information, DNS,
+or possibly your internal NTP servers, in order to deploy Airship:
+
+* `charts/coredns/values.yaml`: Update the upstream coredns nameserver IPs
+  to your internal DNS addresses.
+* `examples/basic/KubernetesNetwork.yaml`: Since the resiliency manifest uses
+  the examples/basic environment configuration, you will need to update the
+  Kubernetes network configuration in this folder. Update the upstream
+  nameserver IPs to your internal DNS addresses, and add the http(s) proxy
+  URL and the additional_no_proxy list (see the sketch after this hunk).
+  If your environment requires it, also update the NTP server list to your
+  internal NTP server addresses for more reliable time sync.
+* `tools/g2/templates/network-config.sub`: Update the upstream nameserver
+  IPs to your internal DNS addresses.
+
 Bootstrapping
 -------------
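For the `examples/basic/KubernetesNetwork.yaml` edit described above, the proxy portion might look roughly like this (a sketch only: proxy.url and additional_no_proxy are named by the docs and the join-script template, but the exact nesting and all addresses here are placeholder assumptions):

    proxy:
      url: http://proxy.example.corp:8080
      additional_no_proxy:
        - 10.96.0.1
        - .internal.corp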
@@ -74,6 +74,14 @@ export http_proxy={{ config['KubernetesNetwork:proxy.url'] | default('', true) }}
 export https_proxy={{ config['KubernetesNetwork:proxy.url'] | default('', true) }}
 export no_proxy={{ config.get(kind='KubernetesNetwork') | fill_no_proxy }}
 
+# Configure apt proxy
+if [[ -n "${http_proxy}" ]]; then
+    log "Configuring Apt Proxy"
+    cat << EOF | sudo tee /etc/apt/apt.conf.d/50proxyconf
+Acquire::https::proxy "${https_proxy}";
+Acquire::http::proxy "${http_proxy}";
+EOF
+fi
 
 # Install system packages
 #
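After a node boots with this drop-in, the effective settings can be confirmed with stock apt tooling, e.g.:

    # Prints the Acquire::http::proxy / Acquire::https::proxy values, if set:
    apt-config dump | grep -i '::proxy'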
@@ -139,5 +147,13 @@ fi
 if systemctl -q is-enabled containerd > /dev/null 2>&1; then
     systemctl restart containerd || true
 fi
+# Pull the hyperkube image prior to restarting kubelet. This is
+# needed for a more reliable image pull in an environment with slow
+# network connectivity, to avoid image pull timeouts and retries by
+# kubelet.
+# The || true is added to let the deployment continue, even if the
+# $IMAGE_HYPERKUBE is not defined in the environment, and the image
+# pull doesn't happen.
+docker image pull "${IMAGE_HYPERKUBE}" || true
 systemctl enable kubelet
 systemctl restart kubelet
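If the single pre-pull still times out on a very slow link, one option is a small retry loop in its place (a sketch; IMAGE_HYPERKUBE comes from config-env.sh as elsewhere in this change):

    # Retry the cache pull a few times before falling through to kubelet:
    for attempt in 1 2 3; do
        if docker image pull "${IMAGE_HYPERKUBE}"; then
            break
        fi
        sleep 15
    done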
@@ -222,7 +222,7 @@ function validate_kubectl_logs {
     NAMESPACE=default
     POD_NAME=log-test-${NODE}-$(date +%s)
 
-    cat <<EOPOD | kubectl --namespace $NAMESPACE apply -f -
+    cat <<EOPOD | kubectl --namespace $NAMESPACE --timeout 100s apply -f -
 ---
 apiVersion: v1
 kind: Pod
@@ -244,6 +244,7 @@ EOPOD
 
     wait_for_node_ready $NODE 300
     wait_for_pod_termination $NAMESPACE $POD_NAME
+    sleep 5
     ACTUAL_LOGS=$(kubectl --namespace $NAMESPACE logs $POD_NAME)
     if [ "x$ACTUAL_LOGS" != "xEXPECTED RESULT" ]; then
         log Got unexpected logs:
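The added sleep papers over a window where logs are not yet readable right after pod termination; an alternative sketch is to retry the read itself (same NAMESPACE and POD_NAME as in the surrounding function):

    # Poll the log endpoint briefly instead of sleeping a fixed interval:
    for i in 1 2 3 4 5; do
        ACTUAL_LOGS=$(kubectl --namespace $NAMESPACE logs $POD_NAME) && [ -n "$ACTUAL_LOGS" ] && break
        sleep 2
    done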
@@ -1,6 +1,6 @@
 export TEMP_DIR=${TEMP_DIR:-$(mktemp -d)}
-export BASE_IMAGE_SIZE=${BASE_IMAGE_SIZE:-68719476736}
-export BASE_IMAGE_URL=${BASE_IMAGE_URL:-https://cloud-images.ubuntu.com/releases/16.04/release/ubuntu-16.04-server-cloudimg-amd64-disk1.img}
+export BASE_IMAGE_SIZE=${BASE_IMAGE_SIZE:-344784896}
+export BASE_IMAGE_URL=${BASE_IMAGE_URL:-https://cloud-images.ubuntu.com/releases/bionic/release/ubuntu-18.04-server-cloudimg-amd64.img}
 export IMAGE_PROMENADE=${IMAGE_PROMENADE:-quay.io/airshipit/promenade:master}
 export IMAGE_PROMENADE_DISTRO=${IMAGE_PROMENADE_DISTRO:-ubuntu_bionic}
 export IMAGE_HYPERKUBE=${IMAGE_HYPERKUBE:-gcr.io/google_containers/hyperkube-amd64:v1.17.3}
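Because every variable here uses the ${VAR:-default} pattern, slow upstream sources can be swapped for internal mirrors without editing the file, e.g. (the mirror hostnames are placeholders):

    export BASE_IMAGE_URL=http://mirror.internal/ubuntu-18.04-server-cloudimg-amd64.img
    export IMAGE_HYPERKUBE=registry.internal/hyperkube-amd64:v1.17.3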
@@ -14,3 +14,17 @@ etcdctl_member_list() {
 
     etcdctl_cmd "${CLUSTER}" "${VM}" member list -w json | jq -r '.members[].name' | sort
 }
+
+etcdctl_member_remove() {
+    CLUSTER=${1}
+    VM=${2}
+    NODE=${3}
+    shift 3
+
+    MEMBER_ID=$(etcdctl_cmd $CLUSTER ${VM} member list | awk -F', ' "/${NODE}/ "'{ print $1}')
+    if [[ -n $MEMBER_ID ]] ; then
+        etcdctl_cmd "${CLUSTER}" "${VM}" member remove "$MEMBER_ID"
+    else
+        log No members found in cluster "$CLUSTER" for node "$NODE"
+    fi
+}
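The helper is invoked once per cluster during node teardown (see the teardown-nodes.sh change below), e.g.:

    # Remove n0's stale entry from the kubernetes etcd cluster, via n1:
    etcdctl_member_remove kubernetes n1 n0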
@@ -13,13 +13,24 @@ validate_etcd_membership() {
     EXPECTED_MEMBERS="${*}"
 
     # NOTE(mark-burnett): Wait a moment for disks in test environment to settle.
-    sleep 10
+    sleep 60
     log Validating "${CLUSTER}" etcd membership via "${VM}" for members: "${EXPECTED_MEMBERS[@]}"
-    FOUND_MEMBERS=$(etcdctl_member_list "${CLUSTER}" "${VM}" | tr '\n' ' ' | sed 's/ $//')
-
-    if [[ "x${EXPECTED_MEMBERS}" != "x${FOUND_MEMBERS}" ]]; then
-        log Etcd membership check failed for cluster "${CLUSTER}"
+    local retries=25
+    for ((n=0;n<=$retries;n++)); do
+        FOUND_MEMBERS=$(etcdctl_member_list "${CLUSTER}" "${VM}" | tr '\n' ' ' | sed 's/ $//')
+
         log "Found \"${FOUND_MEMBERS}\", expected \"${EXPECTED_MEMBERS}\""
-        exit 1
-    fi
+        if [[ "x${EXPECTED_MEMBERS}" != "x${FOUND_MEMBERS}" ]]; then
+            log Etcd membership check failed for cluster "${CLUSTER}" on attempt "$n".
+            if [[ "$n" == "$retries" ]]; then
+                log Etcd membership check failed for cluster "${CLUSTER}" after "$n" retries. Exiting.
+                exit 1
+            fi
+            sleep 30
+        else
+            log Etcd membership check succeeded for cluster "${CLUSTER}" on attempt "${n}"
+            break
+        fi
+    done
 }
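With retries=25 and a 30-second pause between attempts, the check now tolerates up to about 25 × 30 s ≈ 12.5 minutes of membership churn instead of failing on the first mismatch. A typical call, as used by the move-labels stage below:

    validate_etcd_membership kubernetes n1 n0 n1 n2 n3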
@@ -120,6 +120,8 @@
             "name": "Teardown Genesis",
             "script": "teardown-nodes.sh",
             "arguments": [
+                "-e", "kubernetes",
+                "-e", "calico",
                 "-v", "n1",
                 "-n", "n0",
                 "-r"
@@ -160,7 +162,7 @@
         }
     ],
     "vm": {
-        "memory": 3072,
+        "memory": 4096,
         "names": [
            "n0",
            "n1",
@@ -7,7 +7,9 @@ source "${GATE_UTILS}"
 rsync_cmd "${TEMP_DIR}/scripts"/*genesis* "${GENESIS_NAME}:/root/promenade/"
 
 set -o pipefail
-ssh_cmd "${GENESIS_NAME}" env "PROMENADE_ENCRYPTION_KEY=${PROMENADE_ENCRYPTION_KEY}" /root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
+ssh_cmd "${GENESIS_NAME}" env "IMAGE_HYPERKUBE=${IMAGE_HYPERKUBE}" \
+    env "PROMENADE_ENCRYPTION_KEY=${PROMENADE_ENCRYPTION_KEY}" \
+    /root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
 ssh_cmd "${GENESIS_NAME}" /root/promenade/validate-genesis.sh 2>&1 | tee -a "${LOG_FILE}"
 set +o pipefail
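The env prefix matters because ssh does not forward the caller's environment to the remote command; each variable has to be passed explicitly. The general pattern (a sketch with placeholder names):

    # script.sh sees FOO=bar even though ssh strips the local environment:
    ssh somehost env "FOO=${FOO}" /path/script.sh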
@@ -52,7 +52,7 @@ mkdir -p "${SCRIPT_DIR}"
 for NAME in "${NODES[@]}"; do
     log Building join script for node "${NAME}"
 
-    CURL_ARGS=("--fail" "--max-time" "300" "--retry" "16" "--retry-delay" "15")
+    CURL_ARGS=("-v" "--max-time" "600" "--retry" "20" "--retry-delay" "15" "--connect-timeout" "30" "--progress-bar")
     if [[ $GET_KEYSTONE_TOKEN == 1 ]]; then
         TOKEN="$(os_ks_get_token "${VIA}")"
         if [[ -z $TOKEN ]]; then
@@ -67,7 +67,7 @@ for NAME in "${NODES[@]}"; do
     promenade_health_check "${VIA}"
 
     log "Validating documents"
-    ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X POST -H "Content-Type: application/json" -d "$(promenade_render_validate_body "${USE_DECKHAND}" "${DECKHAND_REVISION}")" "$(promenade_render_validate_url)"
+    ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X POST -H "Content-Type: application/json" -d "$(promenade_render_validate_body "${USE_DECKHAND}" "${DECKHAND_REVISION}")" "$(promenade_render_validate_url)"
 
     JOIN_CURL_URL="$(promenade_render_curl_url "${NAME}" "${USE_DECKHAND}" "${DECKHAND_REVISION}" "${LABELS[@]}")"
     log "Fetching join script via: ${JOIN_CURL_URL}"
@@ -6,15 +6,15 @@ source "${GATE_UTILS}"
 
 VIA="n1"
 
-CURL_ARGS=("--fail" "--max-time" "300" "--retry" "16" "--retry-delay" "15")
+CURL_ARGS=("-v" "--max-time" "600" "--retry" "20" "--retry-delay" "15" "--connect-timeout" "30" "--progress-bar")
 
-log Adding labels to node n0
+log "Adding labels to node n0"
 JSON="{\"calico-etcd\": \"enabled\", \"coredns\": \"enabled\", \"kubernetes-apiserver\": \"enabled\", \"kubernetes-controller-manager\": \"enabled\", \"kubernetes-etcd\": \"enabled\", \"kubernetes-scheduler\": \"enabled\", \"ucp-control-plane\": \"enabled\"}"
 
-ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n0)"
+ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n0)"
 
 # Need to wait
-sleep 60
+sleep 120
 
 validate_etcd_membership kubernetes n1 n0 n1 n2 n3
 validate_etcd_membership calico n1 n0 n1 n2 n3
@@ -22,10 +22,10 @@ validate_etcd_membership calico n1 n0 n1 n2 n3
 log Removing labels from node n2
 JSON="{\"coredns\": \"enabled\", \"ucp-control-plane\": \"enabled\"}"
 
-ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n2)"
+ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n2)"
 
 # Need to wait
-sleep 60
+sleep 120
 
 validate_cluster n1
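Whether the longer waits are sufficient can be spot-checked by watching the labels land before the etcd validation runs (a sketch; assumes kubectl is usable on the node you ssh into):

    # n0 should show the added labels, e.g. kubernetes-etcd=enabled:
    kubectl get node n0 --show-labels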
@@ -8,8 +8,11 @@ declare -a NODES
 
 RECREATE=0
 
-while getopts "n:rv:" opt; do
+while getopts "e:n:rv:" opt; do
     case "${opt}" in
+        e)
+            ETCD_CLUSTERS+=("${OPTARG}")
+            ;;
         n)
             NODES+=("${OPTARG}")
             ;;
@@ -35,6 +38,9 @@ fi
 for NAME in "${NODES[@]}"; do
     log Tearing down node "${NAME}"
     promenade_teardown_node "${NAME}" "${VIA}"
+    for ETCD_CLUSTER in "${ETCD_CLUSTERS[@]}"; do
+        etcdctl_member_remove "${ETCD_CLUSTER}" "${VIA}" "${NAME}"
+    done
     vm_clean "${NAME}"
     if [[ ${RECREATE} == "1" ]]; then
        vm_create "${NAME}"