Updated resiliency gate
Updated the resiliency gate scripts to consistently pass all gate stages, using the Ubuntu Bionic image for node deployment.

- Updated developer-onboarding.rst with information on how to configure and run the resiliency gate behind a corporate proxy.
- Updated the gate scripts to use the proxy configuration.
- Updated up.sh to pull the hyperkube image as a cache, to speed up and stabilize the kubelet's initial deployment of Kubernetes cluster services.
- Added and updated sleeps and retries in some gate stages and scripts to avoid gate failures due to transient environment issues.
- Updated the Ubuntu base image for node deployments from Xenial to Bionic.
- Added code in the teardown-nodes stage to manually remove the etcd members kubernetes and calico, since they otherwise remain listed as etcd members on the genesis node even after genesis is torn down.

Change-Id: Ia11d66ab30ac7a07626d4f1d02a6da48155f862d
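With the new teardown flags (wired into the gate manifest below), the Teardown Genesis stage's invocation boils down to roughly the following (a sketch; the argument values are taken verbatim from the stage definition in this change):

    # Tear down n0 via n1, recreate its VM (-r), and scrub its stale
    # membership from the kubernetes and calico etcd clusters:
    teardown-nodes.sh -e kubernetes -e calico -v n1 -n n0 -r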
@@ -43,6 +43,25 @@ debug it, e.g.:
 
     ./tools/g2/bin/ssh.sh n0
 
+Running Resiliency Tests Behind Corporate Proxy
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If your development environment is behind a corporate proxy, you will need to
+update the following files to add your environment's proxy information, DNS,
+or possibly your internal NTP servers, in order to deploy Airship:
+
+* `charts/coredns/values.yaml`: Update the upstream coredns nameserver IPs
+  to your internal DNS addresses.
+* `examples/basic/KubernetesNetwork.yaml`: Since the resiliency manifest uses
+  the examples/basic environment configuration, you will need to update the
+  Kubernetes network configuration in this folder. Update the upstream
+  nameserver IPs to your internal DNS addresses, and add the http(s) proxy
+  URL and the additional_no_proxy list (see the sketch after this hunk).
+  If your environment requires it, also update the NTP server list to your
+  internal NTP server addresses for more reliable time sync.
+* `tools/g2/templates/network-config.sub`: Update the upstream nameserver
+  IPs to your internal DNS addresses.
+
 Bootstrapping
 -------------
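For the `examples/basic/KubernetesNetwork.yaml` edit described above, the proxy portion might look roughly like this (a sketch only: proxy.url and additional_no_proxy are named by the docs and the join-script template, but the exact nesting and all addresses here are placeholder assumptions):

    proxy:
      url: http://proxy.example.corp:8080
      additional_no_proxy:
        - 10.96.0.1
        - .internal.corp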
@@ -74,6 +74,14 @@ export http_proxy={{ config['KubernetesNetwork:proxy.url'] | default('', true) }}
 export https_proxy={{ config['KubernetesNetwork:proxy.url'] | default('', true) }}
 export no_proxy={{ config.get(kind='KubernetesNetwork') | fill_no_proxy }}
 
+# Configure apt proxy
+if [[ -n "${http_proxy}" ]]; then
+    log "Configuring Apt Proxy"
+    cat << EOF | sudo tee /etc/apt/apt.conf.d/50proxyconf
+Acquire::https::proxy "${https_proxy}";
+Acquire::http::proxy "${http_proxy}";
+EOF
+fi
 
 # Install system packages
 #
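After a node boots with this drop-in, the effective settings can be confirmed with stock apt tooling, e.g.:

    # Prints the Acquire::http::proxy / Acquire::https::proxy values, if set:
    apt-config dump | grep -i '::proxy'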
@@ -139,5 +147,13 @@ fi
 if systemctl -q is-enabled containerd > /dev/null 2>&1; then
     systemctl restart containerd || true
 fi
+# Pull the hyperkube image prior to restarting kubelet. This is
+# needed for a more reliable image pull in an environment with slow
+# network connectivity, to avoid image pull timeouts and retries by
+# kubelet.
+# The || true is added to let the deployment continue, even if the
+# $IMAGE_HYPERKUBE is not defined in the environment, and the image
+# pull doesn't happen.
+docker image pull "${IMAGE_HYPERKUBE}" || true
 systemctl enable kubelet
 systemctl restart kubelet
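If the single pre-pull still times out on a very slow link, one option is a small retry loop in its place (a sketch; IMAGE_HYPERKUBE comes from config-env.sh as elsewhere in this change):

    # Retry the cache pull a few times before falling through to kubelet:
    for attempt in 1 2 3; do
        if docker image pull "${IMAGE_HYPERKUBE}"; then
            break
        fi
        sleep 15
    done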
@@ -222,7 +222,7 @@ function validate_kubectl_logs {
     NAMESPACE=default
     POD_NAME=log-test-${NODE}-$(date +%s)
 
-    cat <<EOPOD | kubectl --namespace $NAMESPACE apply -f -
+    cat <<EOPOD | kubectl --namespace $NAMESPACE --timeout 100s apply -f -
 ---
 apiVersion: v1
 kind: Pod
@@ -244,6 +244,7 @@ EOPOD
 
     wait_for_node_ready $NODE 300
     wait_for_pod_termination $NAMESPACE $POD_NAME
+    sleep 5
     ACTUAL_LOGS=$(kubectl --namespace $NAMESPACE logs $POD_NAME)
     if [ "x$ACTUAL_LOGS" != "xEXPECTED RESULT" ]; then
         log Got unexpected logs:
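The added sleep papers over a window where logs are not yet readable right after pod termination; an alternative sketch is to retry the read itself (same NAMESPACE and POD_NAME as in the surrounding function):

    # Poll the log endpoint briefly instead of sleeping a fixed interval:
    for i in 1 2 3 4 5; do
        ACTUAL_LOGS=$(kubectl --namespace $NAMESPACE logs $POD_NAME) && [ -n "$ACTUAL_LOGS" ] && break
        sleep 2
    done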
@@ -1,6 +1,6 @@
 export TEMP_DIR=${TEMP_DIR:-$(mktemp -d)}
-export BASE_IMAGE_SIZE=${BASE_IMAGE_SIZE:-68719476736}
-export BASE_IMAGE_URL=${BASE_IMAGE_URL:-https://cloud-images.ubuntu.com/releases/16.04/release/ubuntu-16.04-server-cloudimg-amd64-disk1.img}
+export BASE_IMAGE_SIZE=${BASE_IMAGE_SIZE:-344784896}
+export BASE_IMAGE_URL=${BASE_IMAGE_URL:-https://cloud-images.ubuntu.com/releases/bionic/release/ubuntu-18.04-server-cloudimg-amd64.img}
 export IMAGE_PROMENADE=${IMAGE_PROMENADE:-quay.io/airshipit/promenade:master}
 export IMAGE_PROMENADE_DISTRO=${IMAGE_PROMENADE_DISTRO:-ubuntu_bionic}
 export IMAGE_HYPERKUBE=${IMAGE_HYPERKUBE:-gcr.io/google_containers/hyperkube-amd64:v1.17.3}
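Because every variable here uses the ${VAR:-default} pattern, slow upstream sources can be swapped for internal mirrors without editing the file, e.g. (the mirror hostnames are placeholders):

    export BASE_IMAGE_URL=http://mirror.internal/ubuntu-18.04-server-cloudimg-amd64.img
    export IMAGE_HYPERKUBE=registry.internal/hyperkube-amd64:v1.17.3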
@@ -14,3 +14,17 @@ etcdctl_member_list() {
 
     etcdctl_cmd "${CLUSTER}" "${VM}" member list -w json | jq -r '.members[].name' | sort
 }
+
+etcdctl_member_remove() {
+    CLUSTER=${1}
+    VM=${2}
+    NODE=${3}
+    shift 3
+
+    MEMBER_ID=$(etcdctl_cmd $CLUSTER ${VM} member list | awk -F', ' "/${NODE}/ "'{ print $1}')
+    if [[ -n $MEMBER_ID ]] ; then
+        etcdctl_cmd "${CLUSTER}" "${VM}" member remove "$MEMBER_ID"
+    else
+        log No members found in cluster "$CLUSTER" for node "$NODE"
+    fi
+}
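The helper is invoked once per cluster during node teardown (see the teardown-nodes.sh change below), e.g.:

    # Remove n0's stale entry from the kubernetes etcd cluster, via n1:
    etcdctl_member_remove kubernetes n1 n0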
@@ -13,13 +13,24 @@ validate_etcd_membership() {
     EXPECTED_MEMBERS="${*}"
 
     # NOTE(mark-burnett): Wait a moment for disks in test environment to settle.
-    sleep 10
+    sleep 60
     log Validating "${CLUSTER}" etcd membership via "${VM}" for members: "${EXPECTED_MEMBERS[@]}"
-    FOUND_MEMBERS=$(etcdctl_member_list "${CLUSTER}" "${VM}" | tr '\n' ' ' | sed 's/ $//')
-
-    if [[ "x${EXPECTED_MEMBERS}" != "x${FOUND_MEMBERS}" ]]; then
-        log Etcd membership check failed for cluster "${CLUSTER}"
+    local retries=25
+    for ((n=0;n<=$retries;n++)); do
+        FOUND_MEMBERS=$(etcdctl_member_list "${CLUSTER}" "${VM}" | tr '\n' ' ' | sed 's/ $//')
+
         log "Found \"${FOUND_MEMBERS}\", expected \"${EXPECTED_MEMBERS}\""
-        exit 1
-    fi
+        if [[ "x${EXPECTED_MEMBERS}" != "x${FOUND_MEMBERS}" ]]; then
+            log Etcd membership check failed for cluster "${CLUSTER}" on attempt "$n".
+            if [[ "$n" == "$retries" ]]; then
+                log Etcd membership check failed for cluster "${CLUSTER}" after "$n" retries. Exiting.
+                exit 1
+            fi
+            sleep 30
+        else
+            log Etcd membership check succeeded for cluster "${CLUSTER}" on attempt "${n}"
+            break
+        fi
+    done
 }
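With retries=25 and a 30-second pause between attempts, the check now tolerates up to about 25 × 30 s ≈ 12.5 minutes of membership churn instead of failing on the first mismatch. A typical call, as used by the move-labels stage below:

    validate_etcd_membership kubernetes n1 n0 n1 n2 n3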
@@ -120,6 +120,8 @@
             "name": "Teardown Genesis",
             "script": "teardown-nodes.sh",
             "arguments": [
+                "-e", "kubernetes",
+                "-e", "calico",
                 "-v", "n1",
                 "-n", "n0",
                 "-r"
@@ -160,7 +162,7 @@
         }
     ],
     "vm": {
-        "memory": 3072,
+        "memory": 4096,
         "names": [
            "n0",
            "n1",
@@ -7,7 +7,9 @@ source "${GATE_UTILS}"
 rsync_cmd "${TEMP_DIR}/scripts"/*genesis* "${GENESIS_NAME}:/root/promenade/"
 
 set -o pipefail
-ssh_cmd "${GENESIS_NAME}" env "PROMENADE_ENCRYPTION_KEY=${PROMENADE_ENCRYPTION_KEY}" /root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
+ssh_cmd "${GENESIS_NAME}" env "IMAGE_HYPERKUBE=${IMAGE_HYPERKUBE}" \
+    env "PROMENADE_ENCRYPTION_KEY=${PROMENADE_ENCRYPTION_KEY}" \
+    /root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
 ssh_cmd "${GENESIS_NAME}" /root/promenade/validate-genesis.sh 2>&1 | tee -a "${LOG_FILE}"
 set +o pipefail
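The env prefix matters because ssh does not forward the caller's environment to the remote command; each variable has to be passed explicitly. The general pattern (a sketch with placeholder names):

    # script.sh sees FOO=bar even though ssh strips the local environment:
    ssh somehost env "FOO=${FOO}" /path/script.sh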
@@ -52,7 +52,7 @@ mkdir -p "${SCRIPT_DIR}"
 for NAME in "${NODES[@]}"; do
     log Building join script for node "${NAME}"
 
-    CURL_ARGS=("--fail" "--max-time" "300" "--retry" "16" "--retry-delay" "15")
+    CURL_ARGS=("-v" "--max-time" "600" "--retry" "20" "--retry-delay" "15" "--connect-timeout" "30" "--progress-bar")
     if [[ $GET_KEYSTONE_TOKEN == 1 ]]; then
         TOKEN="$(os_ks_get_token "${VIA}")"
         if [[ -z $TOKEN ]]; then
@@ -67,7 +67,7 @@ for NAME in "${NODES[@]}"; do
     promenade_health_check "${VIA}"
 
     log "Validating documents"
-    ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X POST -H "Content-Type: application/json" -d "$(promenade_render_validate_body "${USE_DECKHAND}" "${DECKHAND_REVISION}")" "$(promenade_render_validate_url)"
+    ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X POST -H "Content-Type: application/json" -d "$(promenade_render_validate_body "${USE_DECKHAND}" "${DECKHAND_REVISION}")" "$(promenade_render_validate_url)"
 
     JOIN_CURL_URL="$(promenade_render_curl_url "${NAME}" "${USE_DECKHAND}" "${DECKHAND_REVISION}" "${LABELS[@]}")"
     log "Fetching join script via: ${JOIN_CURL_URL}"
@@ -6,15 +6,15 @@ source "${GATE_UTILS}"
 
 VIA="n1"
 
-CURL_ARGS=("--fail" "--max-time" "300" "--retry" "16" "--retry-delay" "15")
+CURL_ARGS=("-v" "--max-time" "600" "--retry" "20" "--retry-delay" "15" "--connect-timeout" "30" "--progress-bar")
 
-log Adding labels to node n0
+log "Adding labels to node n0"
 JSON="{\"calico-etcd\": \"enabled\", \"coredns\": \"enabled\", \"kubernetes-apiserver\": \"enabled\", \"kubernetes-controller-manager\": \"enabled\", \"kubernetes-etcd\": \"enabled\", \"kubernetes-scheduler\": \"enabled\", \"ucp-control-plane\": \"enabled\"}"
 
-ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n0)"
+ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n0)"
 
 # Need to wait
-sleep 60
+sleep 120
 
 validate_etcd_membership kubernetes n1 n0 n1 n2 n3
 validate_etcd_membership calico n1 n0 n1 n2 n3
@@ -22,10 +22,10 @@ validate_etcd_membership calico n1 n0 n1 n2 n3
 log Removing labels from node n2
 JSON="{\"coredns\": \"enabled\", \"ucp-control-plane\": \"enabled\"}"
 
-ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n2)"
+ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n2)"
 
 # Need to wait
-sleep 60
+sleep 120
 
 validate_cluster n1
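Whether the longer waits are sufficient can be spot-checked by watching the labels land before the etcd validation runs (a sketch; assumes kubectl is usable on the node you ssh into):

    # n0 should show the added labels, e.g. kubernetes-etcd=enabled:
    kubectl get node n0 --show-labels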
@@ -8,8 +8,11 @@ declare -a NODES
 
 RECREATE=0
 
-while getopts "n:rv:" opt; do
+while getopts "e:n:rv:" opt; do
     case "${opt}" in
+        e)
+            ETCD_CLUSTERS+=("${OPTARG}")
+            ;;
         n)
             NODES+=("${OPTARG}")
             ;;
@@ -35,6 +38,9 @@ fi
 for NAME in "${NODES[@]}"; do
     log Tearing down node "${NAME}"
     promenade_teardown_node "${NAME}" "${VIA}"
+    for ETCD_CLUSTER in "${ETCD_CLUSTERS[@]}"; do
+        etcdctl_member_remove "${ETCD_CLUSTER}" "${VIA}" "${NAME}"
+    done
     vm_clean "${NAME}"
     if [[ ${RECREATE} == "1" ]]; then
        vm_create "${NAME}"