From 23840f8f6f1c46d236817e1ba8f4515db6b06953 Mon Sep 17 00:00:00 2001
From: Mark Burnett <mark.m.burnett@gmail.com>
Date: Tue, 13 Feb 2018 13:12:26 -0600
Subject: [PATCH] Make gate scripts more robust

* remove unused kube-proxy credential substitutions
* add liveness & readiness probes to promenade-api
* fix misleading log message about tar file caching
* don't accidentally overwrite TEMP_DIR variable in functions
* add on_error script for genesis

Change-Id: I5d5b46489fa8c0a10200cbac8cf59462030eb144
---
 .../promenade/templates/deployment-api.yaml   | 17 +++++++++++
 examples/basic/armada-resources.yaml          | 22 --------------
 examples/complete/armada-resources.yaml       | 22 --------------
 tools/g2/lib/all.sh                           |  1 +
 tools/g2/lib/docker.sh                        | 26 +++++++++++++++++
 tools/g2/lib/log.sh                           |  4 +--
 tools/g2/lib/nginx.sh                         |  3 +-
 tools/g2/manifests/genesis.json               |  3 +-
 tools/g2/manifests/integration.json           |  3 +-
 tools/g2/manifests/resiliency.json            |  3 +-
 tools/g2/on_error/collect_genesis_info.sh     | 29 +++++++++++++++++++
 tools/g2/stages/genesis.sh                    |  2 ++
 tools/gate.sh                                 |  5 ++--
 .../bootstrap-armada-config.yaml              | 27 -----------------
 14 files changed, 87 insertions(+), 80 deletions(-)
 create mode 100644 tools/g2/lib/docker.sh
 create mode 100755 tools/g2/on_error/collect_genesis_info.sh

diff --git a/charts/promenade/templates/deployment-api.yaml b/charts/promenade/templates/deployment-api.yaml
index df627a6a..e218dadf 100644
--- a/charts/promenade/templates/deployment-api.yaml
+++ b/charts/promenade/templates/deployment-api.yaml
@@ -56,6 +56,23 @@ spec:
           ports:
             - name: api-public
               containerPort: {{ .Values.network.api.port }}
+          livenessProbe:
+            failureThreshold: 5
+            httpGet:
+              path: /api/v1.0/health
+              port: {{ .Values.network.api.target_port }}
+            initialDelaySeconds: 15
+            periodSeconds: 10
+            successThreshold: 1
+            timeoutSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /api/v1.0/health
+              port: {{ .Values.network.api.target_port }}
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 5
           volumeMounts:
             - name: promenade-etc
               mountPath: /etc/promenade/api-paste.ini
diff --git a/examples/basic/armada-resources.yaml b/examples/basic/armada-resources.yaml
index 77dead13..ea93be30 100644
--- a/examples/basic/armada-resources.yaml
+++ b/examples/basic/armada-resources.yaml
@@ -121,28 +121,6 @@ metadata:
     abstract: false
     layer: site
   storagePolicy: cleartext
-  substitutions:
-    -
-      src:
-        schema: deckhand/CertificateAuthority/v1
-        name: kubernetes
-        path: .
-      dest:
-        path: '.values.secrets.tls.ca'
-    -
-      src:
-        schema: deckhand/Certificate/v1
-        name: proxy
-        path: .
-      dest:
-        path: '.values.secrets.tls.cert'
-    -
-      src:
-        schema: deckhand/CertificateKey/v1
-        name: proxy
-        path: .
-      dest:
-        path: '.values.secrets.tls.key'
 data:
   chart_name: proxy
   release: kubernetes-proxy
diff --git a/examples/complete/armada-resources.yaml b/examples/complete/armada-resources.yaml
index 9b586a34..deb0d392 100644
--- a/examples/complete/armada-resources.yaml
+++ b/examples/complete/armada-resources.yaml
@@ -164,28 +164,6 @@ metadata:
     abstract: false
     layer: site
   storagePolicy: cleartext
-  substitutions:
-    -
-      src:
-        schema: deckhand/CertificateAuthority/v1
-        name: kubernetes
-        path: .
-      dest:
-        path: '.values.secrets.tls.ca'
-    -
-      src:
-        schema: deckhand/Certificate/v1
-        name: proxy
-        path: .
-      dest:
-        path: '.values.secrets.tls.cert'
-    -
-      src:
-        schema: deckhand/CertificateKey/v1
-        name: proxy
-        path: .
-      dest:
-        path: '.values.secrets.tls.key'
 data:
   chart_name: proxy
   release: kubernetes-proxy
diff --git a/tools/g2/lib/all.sh b/tools/g2/lib/all.sh
index c4c77a69..fd2b87d7 100644
--- a/tools/g2/lib/all.sh
+++ b/tools/g2/lib/all.sh
@@ -5,6 +5,7 @@ LIB_DIR=$(realpath "$(dirname "${BASH_SOURCE}")")
 
 source "$LIB_DIR"/config.sh
 source "$LIB_DIR"/const.sh
+source "$LIB_DIR"/docker.sh
 source "$LIB_DIR"/etcd.sh
 source "$LIB_DIR"/kube.sh
 source "$LIB_DIR"/log.sh
diff --git a/tools/g2/lib/docker.sh b/tools/g2/lib/docker.sh
new file mode 100644
index 00000000..7591b67f
--- /dev/null
+++ b/tools/g2/lib/docker.sh
@@ -0,0 +1,26 @@
+docker_ps() {  # Run `docker ps -a` through ssh_cmd; $1 is the ssh_cmd target.
+    local VIA="${1}"  # local, so a caller's own VIA is not overwritten
+    ssh_cmd "${VIA}" docker ps -a
+}
+
+docker_info() {  # Run `docker info` through ssh_cmd ($1 = target); stderr is merged into stdout.
+    local VIA="${1}"  # local, so a caller's own VIA is not overwritten
+    ssh_cmd "${VIA}" docker info 2>&1
+}
+
+docker_exited_containers() {  # Print the IDs (-q) of containers whose status is "exited"; $1 = ssh_cmd target.
+    local VIA="${1}"  # local, so a caller's own VIA is not overwritten
+    ssh_cmd "${VIA}" docker ps -q --filter "status=exited"
+}
+
+docker_inspect() {  # Run `docker inspect` on container $2 through ssh_cmd target $1.
+    local VIA="${1}"  # both variables are local so the caller's globals are not overwritten
+    local CONTAINER_ID="${2}"
+    ssh_cmd "${VIA}" docker inspect "${CONTAINER_ID}"
+}
+
+docker_logs() {  # Run `docker logs` on container $2 through ssh_cmd target $1.
+    local VIA="${1}"  # both variables are local so the caller's globals are not overwritten
+    local CONTAINER_ID="${2}"
+    ssh_cmd "${VIA}" docker logs "${CONTAINER_ID}"
+}
diff --git a/tools/g2/lib/log.sh b/tools/g2/lib/log.sh
index ff699f83..439088cb 100644
--- a/tools/g2/lib/log.sh
+++ b/tools/g2/lib/log.sh
@@ -46,8 +46,7 @@ log_note() {
 
 log_stage_error() {
     NAME=${1}
-    TEMP_DIR=${2}
-    echo -e " ${C_ERROR}== Error in stage ${C_HILIGHT}${NAME}${C_ERROR} ( ${C_TEMP}${TEMP_DIR}${C_ERROR} ) ==${C_CLEAR}"
+    echo -e " ${C_ERROR}== Error in stage ${C_HILIGHT}${NAME}${C_ERROR} ( ${C_TEMP}${2:-${LOG_FILE}}${C_ERROR} ) ==${C_CLEAR}"  # $1 = stage name; $2 = path to display, falls back to LOG_FILE when omitted
 }
 
 log_stage_footer() {
@@ -65,7 +64,6 @@ log_stage_success() {
 }
 
 log_temp_dir() {
-    TEMP_DIR=${1}
     echo -e "Working in ${C_TEMP}${TEMP_DIR}${C_CLEAR}"
 }
 
diff --git a/tools/g2/lib/nginx.sh b/tools/g2/lib/nginx.sh
index a74efe56..0d4dc74a 100644
--- a/tools/g2/lib/nginx.sh
+++ b/tools/g2/lib/nginx.sh
@@ -20,11 +20,12 @@ nginx_up() {
 nginx_cache_and_replace_tar_urls() {
     log "Finding tar_url options to cache.."
     TAR_NUM=0
+    mkdir -p "${NGINX_DIR}"
     for file in "$@"; do
         grep -Po "^ +tar_url: \K.+$" "${file}" | while read tar_url ; do
             # NOTE(mark-burnet): Does not yet ignore repeated files.
-            log "Caching ${tar_url} in file: ${file}"
             DEST_PATH="${NGINX_DIR}/cached-tar-${TAR_NUM}.tgz"
+            log "Caching ${tar_url} in file: ${DEST_PATH}"
             REPLACEMENT_URL="${NGINX_URL}/cached-tar-${TAR_NUM}.tgz"
             curl -Lo "${DEST_PATH}" "${tar_url}"
             sed -i "s;${tar_url};${REPLACEMENT_URL};" "${file}"
diff --git a/tools/g2/manifests/genesis.json b/tools/g2/manifests/genesis.json
index 572f6506..108518c0 100644
--- a/tools/g2/manifests/genesis.json
+++ b/tools/g2/manifests/genesis.json
@@ -26,7 +26,8 @@
     },
     {
       "name": "Genesis",
-      "script": "genesis.sh"
+      "script": "genesis.sh",
+      "on_error": "collect_genesis_info.sh"
     }
   ],
   "vm": {
diff --git a/tools/g2/manifests/integration.json b/tools/g2/manifests/integration.json
index 9b1139fa..0edc1ef4 100644
--- a/tools/g2/manifests/integration.json
+++ b/tools/g2/manifests/integration.json
@@ -26,7 +26,8 @@
     },
     {
       "name": "Genesis",
-      "script": "genesis.sh"
+      "script": "genesis.sh",
+      "on_error": "collect_genesis_info.sh"
     },
     {
       "name": "Load Site Configuration",
diff --git a/tools/g2/manifests/resiliency.json b/tools/g2/manifests/resiliency.json
index e38a96e3..8a807378 100644
--- a/tools/g2/manifests/resiliency.json
+++ b/tools/g2/manifests/resiliency.json
@@ -30,7 +30,8 @@
     },
     {
       "name": "Genesis",
-      "script": "genesis.sh"
+      "script": "genesis.sh",
+      "on_error": "collect_genesis_info.sh"
     },
     {
       "name": "Join Masters",
diff --git a/tools/g2/on_error/collect_genesis_info.sh b/tools/g2/on_error/collect_genesis_info.sh
new file mode 100755
index 00000000..ed1e713f
--- /dev/null
+++ b/tools/g2/on_error/collect_genesis_info.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# on_error hook for the Genesis stage: saves docker and kubectl diagnostics under ${TEMP_DIR}/errors.
+# NOTE(mark-burnett): Keep trying to collect info even if there's an error
+set +e
+set -x
+
+source "${GATE_UTILS}"
+
+ERROR_DIR="${TEMP_DIR}/errors"
+VIA=n0
+mkdir -p "${ERROR_DIR}"
+
+log "Gathering info from failed genesis server (n0) in ${ERROR_DIR}"
+
+log "Gathering docker info for exitted containers"
+mkdir -p "${ERROR_DIR}/docker"
+docker_ps "${VIA}" | tee "${ERROR_DIR}/docker/ps"
+docker_info "${VIA}" | tee "${ERROR_DIR}/docker/info"
+# One file per exited container: inspect output, then its logs (2>&1 so the container's stderr stream lands in the file too).
+for container_id in $(docker_exited_containers "${VIA}"); do
+    docker_inspect "${VIA}" "${container_id}" | tee "${ERROR_DIR}/docker/${container_id}"
+    echo "=== Begin logs ===" | tee -a "${ERROR_DIR}/docker/${container_id}"
+    docker_logs "${VIA}" "${container_id}" 2>&1 | tee -a "${ERROR_DIR}/docker/${container_id}"
+done
+
+log "Gathering kubectl output"
+mkdir -p "${ERROR_DIR}/kube"
+kubectl_cmd "${VIA}" describe nodes n0 | tee "${ERROR_DIR}/kube/n0"
+kubectl_cmd "${VIA}" get --all-namespaces -o wide pod | tee "${ERROR_DIR}/kube/pods"
diff --git a/tools/g2/stages/genesis.sh b/tools/g2/stages/genesis.sh
index 7fb6f126..61461327 100755
--- a/tools/g2/stages/genesis.sh
+++ b/tools/g2/stages/genesis.sh
@@ -6,8 +6,10 @@ source "${GATE_UTILS}"
 
 rsync_cmd "${TEMP_DIR}/scripts"/*genesis* "${GENESIS_NAME}:/root/promenade/"
 
+set -o pipefail
 ssh_cmd "${GENESIS_NAME}" /root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
 ssh_cmd "${GENESIS_NAME}" /root/promenade/validate-genesis.sh 2>&1 | tee -a "${LOG_FILE}"
+set +o pipefail
 
 if ! ssh_cmd n0 docker images | tail -n +2 | grep -v registry:5000 ; then
     log_warn "Using some non-cached docker images.  This will slow testing."
diff --git a/tools/gate.sh b/tools/gate.sh
index ce6ab32d..b6487a87 100755
--- a/tools/gate.sh
+++ b/tools/gate.sh
@@ -22,7 +22,7 @@ chmod -R 755 "${TEMP_DIR}"
 
 STAGES_DIR=${WORKSPACE}/tools/g2/stages
 
-log_temp_dir "${TEMP_DIR}"
+log_temp_dir
 echo
 
 STAGES=$(mktemp)
@@ -44,10 +44,11 @@ while read -u 3 stage; do
         log_stage_error "${NAME}" "${LOG_FILE}"
         if echo "${stage}" | jq -e .on_error > /dev/null; then
             log_stage_diagnostic_header
-            ON_ERROR=${WORKSPACE}/$(echo "${stage}" | jq -r .on_error)
+            ON_ERROR=${WORKSPACE}/tools/g2/on_error/$(echo "${stage}" | jq -r .on_error)
             set +e
             $ON_ERROR
         fi
+        log_stage_error "${NAME}" "${TEMP_DIR}"
         exit 1
     fi
     log_stage_footer "${NAME}"
diff --git a/tools/gate/config-templates/bootstrap-armada-config.yaml b/tools/gate/config-templates/bootstrap-armada-config.yaml
index 22b02c0f..549d82d7 100644
--- a/tools/gate/config-templates/bootstrap-armada-config.yaml
+++ b/tools/gate/config-templates/bootstrap-armada-config.yaml
@@ -119,28 +119,6 @@ metadata:
     abstract: false
     layer: site
   storagePolicy: cleartext
-  substitutions:
-    -
-      src:
-        schema: deckhand/CertificateAuthority/v1
-        name: kubernetes
-        path: .
-      dest:
-        path: '.values.secrets.tls.ca'
-    -
-      src:
-        schema: deckhand/Certificate/v1
-        name: proxy
-        path: .
-      dest:
-        path: '.values.secrets.tls.cert'
-    -
-      src:
-        schema: deckhand/CertificateKey/v1
-        name: proxy
-        path: .
-      dest:
-        path: '.values.secrets.tls.key'
 data:
   chart_name: proxy
   release: kubernetes-proxy
@@ -149,11 +127,6 @@ data:
   upgrade:
     no_hooks: true
   values:
-    secrets:
-      tls:
-        ca: placeholder
-        cert: placeholder
-        key: placeholder
     images:
       tags:
         proxy: ${IMAGE_HYPERKUBE}