
Changes in the upgrade procedure cause vault server pods to require a
restart in order to update to the new server version. The work to
restart the pods is performed in another commit.

Defer a request for vault rekey until the server pods match the
expected version. The rekey procedure will not proceed while vault
pods are being restarted, so a rekey should not be started when it is
anticipated that vault pods will be restarted.

Test Plan:
PASS bashate
PASS unit test
PASS vault sanity master branch, rekey
PASS simplex upgrade (manual server pod restart)
PASS duplex 2+1 (vault ha, 3 replicas) application-update

Story: 2011073
Task: 50814
Change-Id: I91334d0577148c1e3f7bc674ab2a3edfaced1d1c
Signed-off-by: Michel Thebeau <Michel.Thebeau@windriver.com>
apiVersion: v1
|
|
data:
|
|
init.sh: |
|
|
#!/bin/bash
|
|
|
|
# Get the CA path from environment vars
|
|
CERT=$CA_CERT
|
|
# Store cert as a one-liner for curl purposes
|
|
CA_ONELINE=$(awk '{printf "%s\\n", $0}' $CERT)
|
|
|
|
# Template values from helm
|
|
VAULT_NS={{ .Release.Namespace }}
|
|
VAULT_NAME={{ .Values.vault.name }}
|
|
VAULT_FN={{ .Values.vault.fullname }}
|
|
HA_REPLICAS={{ .Values.server.ha.replicas }}
|
|
VAULT_VERSION={{ .Values.server.version }}
|
|
|
|
# Set the domain for resolving pod names
|
|
DOMAIN="${VAULT_NS}.pod.cluster.local"
|
|
SVCDOMAIN="${VAULT_NS}.svc.cluster.local"
|
|
|
|
# define host targets and port
|
|
POD_TARGET_BASE="$DOMAIN" # requires 'DNS NAME' of pod
|
|
ACTIVE_TARGET="${VAULT_FN}-active.${SVCDOMAIN}" # only the active
|
|
TARGET_PORT=8200
|
|
|
|
# impermanent location to store files while running
|
|
WORKDIR=/workdir
|
|
|
|
# Health subdirectory. All vault manager health related files
|
|
# will be placed here.
|
|
HEALTH_SUBDIR=$WORKDIR/health
|
|
mkdir -p $HEALTH_SUBDIR
|
|
|
|
# Selection of kubectl version from helm override
|
|
KUBECTL=kubectl
|
|
KUBECTL_HELM_OVERRIDE={{ .Values.manager.k8s.client_version }}
|
|
|
|
# Trap and trap notification file. When SIGTERM is sent to this pod
|
|
# we want to exit promptly and gracefully.
|
|
TRAPFILE=$WORKDIR/exit_on_trap
|
|
trap "touch $TRAPFILE" SIGTERM
|
|
|
|
# when specifying a trap for debug, remember it with this variable
|
|
# reserve trap '0' for disabling a debugging trap request
|
|
DEBUGGING_TRAP=0
|
|
|
|
# Pause notification file. An option to permit vault-manager to be
|
|
# paused at any of the exit_on_trap code points. The use cases may
|
|
# include:
|
|
# - running an external procedure that should not be permitted to
|
|
# conflict with vault-manager's operation
|
|
# - permitting time for a developer to set up conditions for
|
|
#   debug and test
|
|
PAUSEFILE=$WORKDIR/pause_on_trap
|
|
PAUSE_RATE=1 # rate at which to test for unpause
|
|
EARLY_PAUSE={{ .Values.manager.pause }}
|
|
|
|
# Healthcheck Fail file. If this file exists then we have decided to
|
|
# force vault manager to fail the health check
|
|
HEALTH_CHECK_FAIL=$HEALTH_SUBDIR/health_check_fail
|
|
|
|
# Healthcheck excuses.
|
|
HEALTH_CHECK_DISABLED=$HEALTH_SUBDIR/health_check_disabled
|
|
HEALTH_EXCUSE_NETWORK=$HEALTH_SUBDIR/health_excuse_network
|
|
HEALTH_EXCUSE_INIT=$HEALTH_SUBDIR/health_excuse_init
|
|
HEALTH_EXCUSE_PAUSE=$HEALTH_SUBDIR/health_excuse_pause
|
|
|
|
# Healthcheck excuse messages.
|
|
HC_MSG_DISABLED="Healthcheck is disabled."
|
|
HC_MSG_NETWORK="Vault manager has initiated a network operation."
|
|
HC_MSG_INIT="Vault manager is currently initializing."
|
|
HC_MSG_PAUSE="Vault manager is paused for external operation."
|
|
|
|
# Enable healthcheck excuses.
|
|
HC_DISABLE={{ .Values.manager.healthcheck.disableHC }}
|
|
HC_ENABLE_NETWORK={{ .Values.manager.healthcheck.enableNetwork }}
|
|
HC_ENABLE_INIT={{ .Values.manager.healthcheck.enableInit }}
|
|
HC_ENABLE_PAUSE={{ .Values.manager.healthcheck.enablePause }}
|
|
|
|
# set the default manager mode; modes include
|
|
# VAULT_MANAGER (default)
|
|
# MOUNT_HELPER
|
|
# INTERACTIVE (i.e., when this script is sourced by an author)
|
|
if [ -z "$MANAGER_MODE" ]; then
|
|
MANAGER_MODE="VAULT_MANAGER"
|
|
fi
|
|
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
|
|
MANAGER_MODE="INTERACTIVE"
|
|
fi
|
|
|
|
# Maximum sleep seconds for mount-helper before exiting
|
|
MOUNT_HELPER_MAX_TIME=60
|
|
|
|
# Maximum seconds to wait for mount-helper pod to start
|
|
MAX_POD_RUN_TRIES=10
|
|
|
|
# Maximum seconds to wait for vault-manager pod to exit
|
|
# Vault-manager is not responding to SIGTERM, so it will take 30
|
|
# seconds
|
|
TERMINATE_TRIES_MAX={{ .Values.manager.waitTermination.maxTries }}
|
|
TERMINATE_TRIES_SLEEP={{ .Values.manager.waitTermination.sleepTime }}
|
|
|
|
# Vault key share configuration
|
|
KEY_SECRET_SHARES=5
|
|
KEY_REQUIRED_THRESHOLD=3
|
|
|
|
# Enable vault rekey upon conversion of storage from PVC to k8s
|
|
# secrets
|
|
AUTO_REKEY_CONVERT={{ .Values.manager.rekey.enableOnPVCConversion }}
|
|
|
|
# Keep track of vault-manager restarting the rekey procedure; if
|
|
# this variable is not true (0) and a rekey procedure is in
|
|
# progress, then vault-manager was restarted
|
|
REKEY_STARTED=1
|
|
|
|
# Vault manager will rekey the vault at a time when the vault
|
|
# servers are stable for a period of time specified by
|
|
# REKEY_STABLE_TIME seconds
|
|
REKEY_STABLE_TIME=300
|
|
|
|
# Global variable to share rekey status
|
|
REKEY_STATUS_JSON=''
|
|
|
|
# Keep track of shards that were last successful
|
|
SHARDS_LAST_SUCCESSFUL="cluster-key"
|
|
|
|
# Records for seal status state machine:
|
|
PODREC_F="$WORKDIR/previous_pods_status.txt"
|
|
PODREC_TMP_F="$WORKDIR/new_pods_status.txt"
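
# Each record written by runStateMachine has the form
# /<pod name>/<pod dns name>/<sealed>/<unseal countdown>,
# for example (illustrative values only):
#   /sva-vault-0/172-16-226-97/true/3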
|
|
|
|
# Vault server health query timeout during HA recovery scenario
|
|
QUERY_TMOUT={{ .Values.manager.api.healthQueryTimeout }}
|
|
|
|
# Default curl timeout for REST API commands to vault server.
|
|
# This value is what testing shows is the default timeout.
|
|
# Specifying it explicitly for clarity.
|
|
API_TMOUT=120
|
|
|
|
# API timeout for unseal operations
|
|
API_UNSEAL_OP_TMOUT={{ .Values.manager.api.unsealOpTimeout }}
|
|
|
|
# API timeout values for rekey operations
|
|
API_REKEY_QUERY_TMOUT={{ .Values.manager.api.rekeyStatusTimeout }}
|
|
API_REKEY_OP_TMOUT={{ .Values.manager.api.rekeyOpTimeout }}
|
|
|
|
STATEFULSET_RATE=5
|
|
INIT_CONVERGE_TIME=10
|
|
JOIN_RATE=5
|
|
JOIN_CONVERGE_TIME=1
|
|
UNSEAL_RATE=10
|
|
UNSEAL_CONVERGE_TIME=3
|
|
STATUS_RATE={{ .Values.manager.statusCheckRate }}
|
|
if [ -z "$STATUS_RATE" ] || [ -n "${STATUS_RATE//[0-9]}" ] || \
|
|
[ $STATUS_RATE -lt 1 ]; then
|
|
STATUS_RATE=5
|
|
fi
|
|
|
|
# with STATUS_RATE, the period to delay unseal
|
|
# STATUS_RATE * STATEMACH_START seconds
|
|
STATEMACH_START={{ .Values.manager.unsealWaitIntervals }}
|
|
if [ -z "$STATEMACH_START" ]; then
|
|
STATEMACH_START=3
|
|
fi
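
# For example, with the defaults above (STATUS_RATE=5 and
# STATEMACH_START=3) a sealed server is left alone for roughly
# 5 * 3 = 15 seconds before vault-manager attempts to unseal it.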
|
|
|
|
# Heartbeat file location
|
|
HB_FILE=$HEALTH_SUBDIR/heartbeat
|
|
|
|
# Maximum threshold time in seconds that is allowed between
|
|
# a heartbeat call and health_check call.
|
|
HB_THRESHOLD={{ .Values.manager.healthcheck.heartbeatThreshold }}
|
|
|
|
# Log levels
|
|
DEBUG=1
|
|
INFO=2
|
|
WARNING=3
|
|
ERROR=4
|
|
FATAL=5
|
|
|
|
# Default log level and the set log level (Initially set as default).
|
|
# If the log function detects an override file, then it will switch
|
|
# the set log level and then delete it.
|
|
DEFAULT_LOG_LEVEL=$INFO
|
|
LOG_LEVEL={{ .Values.manager.log.defaultLogLevel }}
|
|
LOG_OVERRIDE_FILE="$WORKDIR/log_level"
|
|
|
|
# FUNCTIONS
|
|
|
|
# takes major/minor version of k8s and compares
|
|
# for example: v1.28 > v1.27 > v1.26
|
|
#
|
|
# Returns:
|
|
# 0 left is larger
|
|
# 1 equal
|
|
# 2 right is larger
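#
# For example (illustrative version strings):
#   compareK8sVersion v1.28 v1.27   # returns 0, left is larger
#   compareK8sVersion v1.27 v1.27   # returns 1, equal
#   compareK8sVersion v1.26 v1.27   # returns 2, right is larger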
|
|
function compareK8sVersion {
|
|
local left="$1"
|
|
local right="$2"
|
|
|
|
# strip leading 'v'
|
|
left="${left#v}"
|
|
right="${right#v}"
|
|
|
|
# compare the strings
|
|
if [ "$left" == "$right" ]; then
|
|
return 1
|
|
fi
|
|
# compare major
|
|
if [ "${left%.*}" -gt "${right%.*}" ]; then
|
|
return 0
|
|
elif [ "${left%.*}" -lt "${right%.*}" ]; then
|
|
return 2
|
|
fi
|
|
|
|
# compare the minor
|
|
if [ "${left#*.}" -gt "${right#*.}" ]; then
|
|
return 0
|
|
fi
|
|
return 2
|
|
}
|
|
|
|
# Give kubectl an opportunity to express complaints in the log
|
|
function k8sComplain {
|
|
local result
|
|
|
|
result="$( $KUBECTL version -o json 2>&1 >/dev/null )"
|
|
if [ -n "$result" ]; then
|
|
log $WARNING "kubectl: $result"
|
|
fi
|
|
}
|
|
|
|
# Double-check that the binary exists before setting the specified
|
|
# value of KUBECTL
|
|
function switchK8sVersion {
|
|
local select="$1"
|
|
local fname="kubectl.$select"
|
|
local newbin="${KUBECTL_INSTALL_PATH}/$fname"
|
|
|
|
which "$fname" >/dev/null
|
|
if [ $? -ne 0 -o ! -f "$newbin" ]; then
|
|
log $ERROR "Missing kubectl version: $select"
|
|
k8sComplain
|
|
return 1
|
|
fi
|
|
|
|
if [ "$KUBECTL" != "$fname" ]; then
|
|
KUBECTL="$fname"
|
|
log $INFO "Switching to use kubectl version $select"
|
|
fi
|
|
|
|
k8sComplain
|
|
return 0
|
|
}
|
|
|
|
# Select the version of kubectl matching the running server
|
|
function pickK8sVersion {
|
|
local result
|
|
local serverver
|
|
local majorver
|
|
local minorver
|
|
local select=""
|
|
local majmin=""
|
|
local maxver
|
|
local minver
|
|
|
|
# omit this code if the image does not support kubectl versions
|
|
if [ -z "$KUBE_VERSIONS" ]; then
|
|
k8sComplain
|
|
return
|
|
fi
|
|
|
|
if [ -n "$KUBECTL_HELM_OVERRIDE" ]; then
|
|
# pick the binary requested, if it exists
|
|
switchK8sVersion "$KUBECTL_HELM_OVERRIDE"
|
|
if [ $? -eq 0 ]; then
|
|
return
|
|
fi
|
|
log $ERROR "kubectl version from helm-override not" \
|
|
"available: $KUBECTL_HELM_OVERRIDE"
|
|
fi
|
|
|
|
# use -o json for consistent usage, as opposed to --short
|
|
result="$( $KUBECTL version -o json 2>/dev/null )"
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Unable to get k8s server version"
|
|
# no change in value of KUBECTL
|
|
k8sComplain
|
|
return
|
|
fi
|
|
|
|
serverver="$( jq -r '.serverVersion.gitVersion' <<<"$result" \
|
|
| grep "[0-9]" )"
|
|
majorver="$( jq -r '.serverVersion.major' <<<"$result" \
|
|
| grep "[0-9]" )"
|
|
minorver="$( jq -r '.serverVersion.minor' <<<"$result" \
|
|
| grep "[0-9]" )"
|
|
if [ -z "$serverver" -o -z "$majorver" -o -z "$minorver" ]; then
|
|
log $ERROR "Unable to detect K8s server version:" \
|
|
"["$result"]"
|
|
# no change in value of KUBECTL
|
|
k8sComplain
|
|
return
|
|
fi
|
|
|
|
# pick matching client major/minor version
|
|
for select in $KUBE_VERSIONS noverhere; do
|
|
majmin="v${majorver}.${minorver}"
|
|
if [[ "$select" =~ ^$majmin ]]; then
|
|
break
|
|
fi
|
|
done
|
|
|
|
if [ "$select" == noverhere ]; then
|
|
# Try to pick a near version. We really shouldn't be in
|
|
# this situation, but here is a compromise. This algorithm
|
|
# assumes that there are no omitted versions in the series
|
|
# of KUBE_VERSIONS, and that they are sorted largest to
|
|
# smallest in that list
|
|
maxver="$( awk '{print $1}' <<<"$KUBE_VERSIONS" )"
|
|
minver="$( awk '{print $NF}' <<<"$KUBE_VERSIONS" )"
|
|
|
|
compareK8sVersion ${serverver%.*} ${maxver%.*}
|
|
if [ "$?" -le 1 ]; then
|
|
select="$maxver"
|
|
else
|
|
compareK8sVersion ${minver%.*} ${serverver%.*}
|
|
if [ "$?" -le 1 ]; then
|
|
select="$minver"
|
|
else
|
|
log $ERROR "Could not pick nearest version for kubectl"
|
|
k8sComplain
|
|
return
|
|
fi
|
|
fi
|
|
fi
|
|
|
|
switchK8sVersion "${select%.*}"
|
|
}
|
|
|
|
# Convert log level to text for log message
|
|
function log_to_str {
|
|
local level="$1"
|
|
local logStr
|
|
|
|
case "$level" in
|
|
$INFO)
|
|
logStr="INFO"
|
|
;;
|
|
$DEBUG)
|
|
logStr="DEBUG"
|
|
;;
|
|
$WARNING)
|
|
logStr="WARNING"
|
|
;;
|
|
$ERROR)
|
|
logStr="ERROR"
|
|
;;
|
|
$FATAL)
|
|
logStr="FATAL"
|
|
;;
|
|
esac
|
|
echo "$logStr"
|
|
}
|
|
|
|
# Print the specified message to stdout if the call's specified
|
|
# level is at least the configured log level
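#
# For example (messages used elsewhere in this script):
#   log $INFO "Vault manager is unpaused"
#   log $DEBUG "There is no change in pod seal status"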
|
|
function log {
|
|
local lvl="$1"
|
|
local logStr
|
|
local newLogLevel
|
|
|
|
# check if the log override file exists
|
|
if [ -f $LOG_OVERRIDE_FILE ] \
|
|
&& [ "$MANAGER_MODE" != "INTERACTIVE" ]; then
|
|
newLogLevel=$(cat $LOG_OVERRIDE_FILE)
|
|
# validation for newLogLevel
|
|
if [[ "$newLogLevel" =~ ^[1-5]$ ]]; then
|
|
LOG_LEVEL=$newLogLevel
|
|
logStr="$( log_to_str "$LOG_LEVEL" )"
|
|
echo "$(date +%Y-%m-%dT%H-%M-%S) DEBUG" \
|
|
"Log level set to $logStr"
|
|
else
|
|
echo "$(date +%Y-%m-%dT%H-%M-%S) DEBUG" \
|
|
"Invalid log level read from $LOG_OVERRIDE_FILE."
|
|
fi
|
|
rm $LOG_OVERRIDE_FILE
|
|
fi
|
|
|
|
# validate LOG_LEVEL. If it is not valid, then use
|
|
# DEFAULT_LOG_LEVEL instead.
|
|
if [[ ! "$LOG_LEVEL" =~ ^[1-5]$ ]]; then
|
|
echo "$(date +%Y-%m-%dT%H-%M-%S) DEBUG" \
|
|
"Invalid log level detected, will be set to" \
|
|
"$( log_to_str "$DEFAULT_LOG_LEVEL" )"
|
|
LOG_LEVEL=$DEFAULT_LOG_LEVEL
|
|
fi
|
|
|
|
# check if the log level for this call is equal to or higher
|
|
# than the set log level
|
|
if [ "$lvl" -ge "$LOG_LEVEL" ]; then
|
|
# print log
|
|
logStr="$( log_to_str "$lvl" )"
|
|
echo "$(date +%Y-%m-%dT%H-%M-%S) $logStr ${@:2}"
|
|
fi
|
|
}
|
|
|
|
if ! [[ "$QUERY_TMOUT" =~ ^[0-9]+$ ]]; then
|
|
log $WARNING ".Values.manager.healthQueryTimeout not an integer"
|
|
QUERY_TMOUT=""
|
|
fi
|
|
|
|
# Check the current health status for the vault manager.
|
|
# Return 0 if vault manager is healthy
|
|
# Return 1 if vault manager is unhealthy
|
|
function health_check {
|
|
local excuse_reason=()
|
|
local current_timestamp=0
|
|
local heartbeat_timestamp=0
|
|
local heartbeat_passed=false
|
|
local heartbeat_age=0
|
|
|
|
current_timestamp="$( date +%s )"
|
|
heartbeat_timestamp="$( stat -c %X $HB_FILE )"
|
|
heartbeat_age=$(( current_timestamp - heartbeat_timestamp ))
|
|
if [ $heartbeat_age -gt $HB_THRESHOLD ]; then
|
|
log $DEBUG "Heartbeat check failed"
|
|
heartbeat_passed=false
|
|
else
|
|
heartbeat_passed=true
|
|
fi
|
|
|
|
log $DEBUG "heartbeat time: $heartbeat_age"
|
|
|
|
if $heartbeat_passed && [ ! -f $HEALTH_CHECK_FAIL ]; then
|
|
return 0
|
|
else
|
|
if [ "$HC_DISABLE" = "true" ] || [ -f $HEALTH_CHECK_DISABLED ]; then
|
|
excuse_reason+=("$HC_MSG_DISABLED")
|
|
elif [ "$HC_ENABLE_PAUSE" = "true" ] && [ -f $HEALTH_EXCUSE_PAUSE ]; then
|
|
excuse_reason+=("$( cat $HEALTH_EXCUSE_PAUSE )")
|
|
elif [ "$HC_ENABLE_NETWORK" = "true" ] && [ -f $HEALTH_EXCUSE_NETWORK ]; then
|
|
excuse_reason+=("$( cat $HEALTH_EXCUSE_NETWORK )")
|
|
elif [ "$HC_ENABLE_INIT" = "true" ] && [ -f $HEALTH_EXCUSE_INIT ]; then
|
|
excuse_reason+=("$( cat $HEALTH_EXCUSE_INIT )")
|
|
fi
|
|
|
|
if [ ${#excuse_reason[@]} -gt 0 ]; then
|
|
log $INFO "Health_check fail has been excused. Reasons:"
|
|
for reason in "${excuse_reason[@]}"; do
|
|
log $INFO "$reason"
|
|
done
|
|
return 0
|
|
else
|
|
log $INFO "Health_check has failed."
|
|
return 1
|
|
fi
|
|
fi
|
|
}
|
|
|
|
# Heartbeat function touches the heartbeat file to update the
|
|
# heartbeat timestamp checked by health_check
|
|
function heartbeat {
|
|
|
|
# Do nothing if mode is not VAULT_MANAGER
|
|
if [ "$MANAGER_MODE" != "VAULT_MANAGER" ]; then
|
|
return
|
|
fi
|
|
|
|
touch $HB_FILE
|
|
}
|
|
|
|
# Create a health excuse file and log the reason.
|
|
function health_excuse_create {
|
|
local excuse_file_name="$1"
|
|
local excuse_reason="$2"
|
|
|
|
# Do nothing if mode is not VAULT_MANAGER
|
|
if [ "$MANAGER_MODE" != "VAULT_MANAGER" ]; then
|
|
return
|
|
fi
|
|
|
|
heartbeat
|
|
|
|
# check if the requested excuse file already exists.
|
|
# If not, create the excuse file and log the message
|
|
if [ -f $excuse_file_name ]; then
|
|
log $DEBUG "The excuse file $excuse_file_name already exists."
|
|
else
|
|
echo $excuse_reason > $excuse_file_name
|
|
log $DEBUG "The excuse file $excuse_file_name created." \
|
|
"Excuse reason: $excuse_reason"
|
|
fi
|
|
}
|
|
|
|
# Remove the named health excuse file.
|
|
function health_excuse_remove {
|
|
local excuse_file_name="$1"
|
|
local excuse_reason
|
|
|
|
# Do nothing if mode is not VAULT_MANAGER
|
|
if [ "$MANAGER_MODE" != "VAULT_MANAGER" ]; then
|
|
return
|
|
fi
|
|
|
|
heartbeat
|
|
|
|
# Check if the named excuse exists; if it does, delete the file
|
|
if [ -f $excuse_file_name ]; then
|
|
excuse_reason="$( cat $excuse_file_name )"
|
|
rm $excuse_file_name
|
|
log $DEBUG "The excuse file $excuse_file_name is deleted. " \
|
|
"The excuse reason was: $excuse_reason"
|
|
else
|
|
log $DEBUG "The excuse file $excuse_file_name is already deleted."
|
|
fi
|
|
}
|
|
|
|
function pause_on_trap {
|
|
local thistrap="$1"
|
|
local pausenum
|
|
|
|
if [ ! -e "$PAUSEFILE" ]; then
|
|
# no pause request
|
|
return
|
|
fi
|
|
|
|
pausenum="$( cat "$PAUSEFILE" )"
|
|
if [ -n "$pausenum" ] \
|
|
&& [ "$pausenum" != "$thistrap" ]; then
|
|
# not on this trap
|
|
return
|
|
fi
|
|
|
|
log $INFO "Vault manager is paused ($thistrap)"
|
|
health_excuse_create "$HEALTH_EXCUSE_PAUSE" "$HC_MSG_PAUSE"
|
|
# Loop until the pause file is removed by the author,
|
|
# or until the content of pause_on_trap file is
|
|
# not-empty and not matching the current trap.
|
|
#
|
|
# If the pause_on_trap file containing a specific trap number is
|
|
# replaced with an empty file, the pause state is maintained.
|
|
while [ -e "$PAUSEFILE" ]; do
|
|
pausenum="$( cat "$PAUSEFILE" )"
|
|
if [ -n "$pausenum" ] \
|
|
&& [ "$thistrap" != "$pausenum" ]; then
|
|
break;
|
|
fi
|
|
sleep "$PAUSE_RATE"
|
|
done
|
|
health_excuse_remove "$HEALTH_EXCUSE_PAUSE"
|
|
log $INFO "Vault manager is unpaused"
|
|
}
|
|
|
|
function exit_on_trap {
|
|
local trap="$1"
|
|
local tfnum=""
|
|
|
|
if [ "$MANAGER_MODE" == "INTERACTIVE" ]; then
|
|
# do not interfere with exit_on_trap intended for
|
|
# vault-manager pod
|
|
return
|
|
fi
|
|
|
|
heartbeat
|
|
|
|
# Debug option pause_on_trap
|
|
pause_on_trap "$trap"
|
|
|
|
if [ -e "$TRAPFILE" ]; then
|
|
tfnum=$(cat $TRAPFILE)
|
|
log $DEBUG "exit_on_trap: removing $TRAPFILE"
|
|
rm "$TRAPFILE" # for workdir on PVC
|
|
if [ -z "$tfnum" ]; then
|
|
# an empty trap file is the default expected behaviour
|
|
log $INFO "exit_on_trap: ($trap)"
|
|
exit
|
|
# handle trap debugging feature - a developer specifies the
|
|
# trap number to target a specific exit_on_trap call.
|
|
# Setting a value of 0 (zero) disables the debugging trap
|
|
elif [ "$tfnum" -eq 0 ]; then
|
|
log $DEBUG "exit_on_trap: ($trap):" \
|
|
"disable debug trap ($DEBUGGING_TRAP)"
|
|
DEBUGGING_TRAP=0
|
|
# there is no trap with value zero
|
|
return
|
|
else
|
|
DEBUGGING_TRAP="$tfnum"
|
|
log $DEBUG "exit_on_trap: ($trap): " \
|
|
"enable debug trap ($DEBUGGING_TRAP)"
|
|
# check now just in case it matches
|
|
if [ "$DEBUGGING_TRAP" -eq "$trap" ]; then
|
|
log $INFO "exit_on_trap: ($trap): matching"
|
|
exit
|
|
fi
|
|
fi
|
|
# check if there is a matching debug trap set
|
|
elif [ "$DEBUGGING_TRAP" -eq "$trap" ]; then
|
|
log $INFO "exit_on_trap: ($trap): matching"
|
|
exit
|
|
else
|
|
log $DEBUG "exit_on_trap: ($trap): no trap file, no exit"
|
|
fi
|
|
}
|
|
|
|
# splits keys into separate json documents. Each document contains the key and its base64 encoded version.
|
|
# root token will be stored separately
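#
# For example, to extract shard 0 from a vault init response
# (a sketch; "$keys" holds the JSON returned by /sys/init):
#   echo -n "$keys" | splitShard 0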
|
|
|
|
function splitShard {
|
|
local index="$1"
|
|
jq '{"keys": [.keys['$index']], "keys_base64": [.keys_base64['$index']]}'
|
|
}
|
|
|
|
# merges two split keys
|
|
function mergeKeyJson {
|
|
# the two parameters are names for variables
|
|
local jstr1="$1"
|
|
local jstr2="$2"
|
|
|
|
mkfifo "$WORKDIR"/s1
|
|
mkfifo "$WORKDIR"/s2
|
|
|
|
(
|
|
jq -Mn --argfile file1 $WORKDIR/s1 --argfile file2 $WORKDIR/s2 '
|
|
def mergek: ($file1, $file2) | .keys as $k | $k;
|
|
def mergeb: ($file1, $file2) | .keys_base64 as $b | $b;
|
|
{keys: (reduce mergek as $x ([]; . + $x)),
|
|
keys_base64: (reduce mergeb as $x ([]; . + $x))}
|
|
' & ) 2>/dev/null
|
|
|
|
echo -n "${!jstr1}" > "$WORKDIR"/s1
|
|
echo -n "${!jstr2}" > "$WORKDIR"/s2
|
|
|
|
rm -f "$WORKDIR"/s1 "$WORKDIR"/s2
|
|
}
|
|
|
|
# Prepare a json document from the k8s secrets prefixed with
|
|
# prefix, and the root token
|
|
#
|
|
# Required parameter: The prefix of the k8s secrets containing
|
|
# the shards
|
|
#
|
|
# Outputs the json document which is comparable to the original
|
|
# response for vault initialization. The calling function is
|
|
# responsible for validating the document content.
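#
# For example (as used by validateSecrets below):
#   saved="$( reconstructInitResponse cluster-key )"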
|
|
#
|
|
function reconstructInitResponse {
|
|
local prefix="$1"
|
|
local index
|
|
local keys
|
|
local mkeys
|
|
|
|
# pull secrets from k8s and merge into one json file.
|
|
for index in $( seq 0 $(( KEY_SECRET_SHARES - 1 )) ); do
|
|
keys="$( get_secret "${prefix}-$index" )"
|
|
if [ "$index" -eq 0 ]; then
|
|
mkeys="$keys"
|
|
continue
|
|
fi
|
|
mkeys=$( mergeKeyJson mkeys keys )
|
|
done
|
|
|
|
# append the root secret and echo the document
|
|
echo "$mkeys" | jq -c '{keys: .keys,
|
|
keys_base64: .keys_base64,
|
|
root_token: "'$( get_secret "cluster-key-root" )'"}'
|
|
}
|
|
|
|
# Check the structure of json data and confirm equivalence of
|
|
# the stdin with stored secrets
|
|
#
|
|
# Required parameter: The prefix of the k8s secrets containing
|
|
# the shards in stored secrets
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
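#
# For example (as used after vault initialization):
#   echo "$keys" | validateSecrets cluster-key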
|
|
function validateSecrets {
|
|
local keyprefix="$1"
|
|
local text
|
|
local keys
|
|
local keys_base64
|
|
local root_token
|
|
local count
|
|
local saved
|
|
local shaA
|
|
local shaB
|
|
|
|
text=$( cat )
|
|
keys=$( echo "$text" | jq '.keys' )
|
|
keys_base64=$( echo "$text" | jq '.keys_base64' )
|
|
root_token=$( echo "$text" | jq -r '.root_token' )
|
|
# response is 'null' if the dict key is missing
|
|
# response is empty (-z) if the source document is empty
|
|
if [ -z "$keys" -o "$keys" == "null" \
|
|
-o -z "$keys_base64" -o "$keys_base64" == "null" \
|
|
-o -z "$root_token" -o "$root_token" == "null" ]; then
|
|
log $ERROR "one or more missing keys"
|
|
return 1
|
|
fi
|
|
|
|
count=$( echo "$keys" | jq '. | length' )
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "jq did not parse keys length"
|
|
return 1
|
|
fi
|
|
if [ -z "$count" ] || [ "$count" -ne "$KEY_SECRET_SHARES" ]; then
|
|
log $ERROR "Incorrect array length for keys:" \
|
|
"$count instead of $KEY_SECRET_SHARES"
|
|
return 1
|
|
fi
|
|
count=$( echo "$keys_base64" | jq '. | length' )
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "jq did not parse keys_base64 length"
|
|
return 1
|
|
fi
|
|
if [ -z "$count" ] || [ "$count" -ne "$KEY_SECRET_SHARES" ]; then
|
|
log $ERROR "Incorrect array length for keys_base64:" \
|
|
"$count instead of $KEY_SECRET_SHARES"
|
|
return 1
|
|
fi
|
|
|
|
saved="$( reconstructInitResponse "${keyprefix}" )"
|
|
|
|
# finally ensure that the saved secrets are the same as the
|
|
# supplied text
|
|
shaA=$( echo "$text" | sha256sum )
|
|
shaB=$( echo "$saved" | sha256sum )
|
|
if [ "$shaA" != "$shaB" ]; then
|
|
log $ERROR "saved data differs from source data"
|
|
return 1
|
|
fi
|
|
|
|
log $INFO "Verified stored secrets are the same as supplied data"
|
|
return 0
|
|
}
|
|
|
|
# Creates a list of all k8s vault pods and stores in text file.
|
|
# Converts ips from X.X.X.X or a:b:c::d to X-X-X-X for use as pod
|
|
# dns names
|
|
#
|
|
# Optional parameter:
|
|
# --ha : append vault server active/standby status (boolean)
|
|
#
|
|
# Example output with --ha
|
|
# sva-vault-0 172-16-226-97 true
|
|
function getVaultPods {
|
|
local ha="$1"
|
|
local jpath
|
|
local meta='{.metadata.name}'
|
|
local ip='{.status.podIPs[].ip}'
|
|
local active='{.metadata.labels.vault-active}'
|
|
local jfields=${meta}'{"\t"}'${ip}
|
|
|
|
if [ "$ha" == "--ha" ]; then
|
|
jfields=${jfields}'{"\t"}'${active}
|
|
fi
|
|
jpath='{range .items[*]}'"$jfields"'{"\n"}{end}'
|
|
|
|
$KUBECTL get pods \
|
|
-n "$VAULT_NS" \
|
|
-l component=server,app.kubernetes.io/name=vault \
|
|
-o=jsonpath="$jpath" \
|
|
| sed 's/\.\|:/-/g'
|
|
}
|
|
|
|
# Wait for the vault servers in the stateful set to be
|
|
# created before initializing
|
|
function waitForPods {
|
|
local jsonPath='{range .items[*]}{.metadata.name}{"\t"} \
|
|
{.status.podIPs[].ip}{"\t"}{.status.phase}{"\n"} \
|
|
{end}'
|
|
|
|
CURRENT_PODS=$($KUBECTL get pods \
|
|
-l component=server,app.kubernetes.io/name=vault \
|
|
-o=jsonpath="$jsonPath" \
|
|
| grep Running \
|
|
| wc -l)
|
|
DESIRED_PODS=$1
|
|
|
|
if ! [[ "$CURRENT_PODS" =~ ^[0-9]+$ ]]; then
|
|
log $ERROR "Invalid Running pod number ($CURRENT_PODS) from kubectl get pods"
|
|
CURRENT_PODS=0
|
|
fi
|
|
|
|
while [ $CURRENT_PODS -lt $DESIRED_PODS ]; do
|
|
sleep "$STATEFULSET_RATE"
|
|
log $INFO "Waiting for ${VAULT_FN}" \
|
|
"statefulset running pods ($CURRENT_PODS) to equal" \
|
|
"desired pods ($DESIRED_PODS)"
|
|
CURRENT_PODS=$($KUBECTL get pods \
|
|
-l component=server,app.kubernetes.io/name=vault \
|
|
-o=jsonpath="$jsonPath" \
|
|
| grep Running \
|
|
| wc -l)
|
|
done
|
|
}
|
|
|
|
# Takes the json document output from vault initialization
|
|
# and stores it into secrets for key shards and the root token
|
|
#
|
|
# Required parameter: The prefix of the k8s secrets into which to
|
|
# store the shards
|
|
#
|
|
# This only works if the secrets are not pre-existing. An error
|
|
# is printed by set_secret.
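#
# For example (as used by initVault and convertPVC):
#   echo "$keys" | storeVaultInitSecrets cluster-key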
|
|
function storeVaultInitSecrets {
|
|
local keyprefix="$1"
|
|
local secrets
|
|
local index
|
|
local split_json
|
|
|
|
secrets=$( cat )
|
|
|
|
for index in $(seq 0 $((KEY_SECRET_SHARES - 1 ))); do
|
|
split_json=$( echo -n "$secrets" | splitShard "$index" )
|
|
set_secret "${keyprefix}-$index" /dev/stdin <<< "$split_json"
|
|
done
|
|
|
|
# if the data contains root_token, save it as well
|
|
split_json=$( echo "$secrets" | jq -r '.root_token' )
|
|
if [ -n "$split_json" -a "$split_json" != 'null' ]; then
|
|
set_secret "${keyprefix}-root" /dev/stdin <<< "$split_json"
|
|
fi
|
|
}
|
|
|
|
# Address a vault server with REST API request. Capture stderr,
|
|
# stdout and result of curl commands. Print error and debug logs
|
|
#
|
|
# Required positional parameters, in order:
|
|
# Response variable : variable in which to store the response
|
|
# from vault
|
|
# http request type : GET, POST, DELETE
|
|
# vault server : FQDN
|
|
# vault REST API path : e.g., /sys/health
|
|
#
|
|
# Optional final parameter : a quoted string of data
|
|
#
|
|
# Examples:
|
|
# # get health status query for the active vault status
|
|
# vaultAPI myvar GET $ACTIVE_TARGET /sys/health
|
|
#
|
|
# # post rekey initialization with shares 5 and threshold 3
|
|
# data='{"secret_shares": 5,"secret_threshold": 3}'
|
|
# vaultAPI myvar POST $ACTIVE_TARGET /sys/rekey/init "$data"
|
|
#
|
|
# Overridable ENV variables:
|
|
# API_TMOUT: the curl timeout
|
|
# NO_HEADER: omit header (the root token) if not empty
|
|
#
|
|
# Output:
|
|
# Return the stdout and command result code
|
|
#
|
|
# Print log messages for errors. The responses from vault are
|
|
# restricted to DEBUG level log in case there's secret information
|
|
# in them. But a non-specific ERROR message is printed in all
|
|
# cases of errors.
|
|
function vaultAPI {
|
|
local answer="$1"
|
|
local reqarg="$2"
|
|
local server="$3"
|
|
local apipath="$4"
|
|
local data="$5"
|
|
local cmderr=""
|
|
local cmdout=""
|
|
local cmdres=1
|
|
local header=""
|
|
local errors=""
|
|
|
|
if [ -z "$NO_HEADER" ]; then
|
|
header="X-Vault-Token:$( get_secret cluster-key-root )"
|
|
fi
|
|
|
|
log $DEBUG "Executing: [curl -s -S --cacert \""$CERT"\"" \
|
|
${API_TMOUT:+"--connect-timeout" "$API_TMOUT"} \
|
|
${header:+"--header" "xxxx"} \
|
|
"--request \"$reqarg\"" \
|
|
${data:+"--data" "xxxx"} \
|
|
"\"https://${server}:${TARGET_PORT}/v1${apipath}\"]"
|
|
|
|
health_excuse_create "$HEALTH_EXCUSE_NETWORK" "$HC_MSG_NETWORK"
|
|
# Capture stderr and stdout (pattern adapted from a Stack
|
|
# Overflow example). Also capture the command result code
|
|
{
|
|
IFS=$'\n' read -r -d '' cmderr;
|
|
IFS=$'\n' read -r -d '' cmdout;
|
|
cmdres="$( echo "$cmdout" | tail -n1 )"
|
|
cmdout="$( echo "$cmdout" | head -n-1 )"
|
|
} < <((printf '\0%s\0' "$(
|
|
curl -s -S --cacert "$CERT" \
|
|
${API_TMOUT:+"--connect-timeout" "$API_TMOUT"} \
|
|
${header:+"--header" "$header"} \
|
|
--request "$reqarg" \
|
|
${data:+"--data" "$data"} \
|
|
"https://${server}:${TARGET_PORT}/v1${apipath}"
|
|
echo "$?"
|
|
)" 1>&2) 2>&1)
|
|
|
|
health_excuse_remove "$HEALTH_EXCUSE_NETWORK"
|
|
|
|
if [ "$cmdres" -ne 0 ]; then
|
|
log $ERROR "curl returns non-zero result: $cmdres"
|
|
fi
|
|
if [ -n "$cmderr" ]; then
|
|
log $ERROR "curl returns stderr"
|
|
log $DEBUG "curl returns stderr: [$cmderr]"
|
|
fi
|
|
|
|
if [ -n "$cmdout" ]; then
|
|
# errors from the REST API
|
|
errors=$( echo "$cmdout" | jq -cr '.errors' )
|
|
if [[ "$errors" != 'null' ]] && [ -n "$errors" ]; then
|
|
log $ERROR "vault REST API error"
|
|
log $DEBUG "vault REST API error: $errors"
|
|
if [ "$cmdres" -eq 0 ]; then
|
|
# this code wants to know if there was an error
|
|
cmdres=1
|
|
fi
|
|
fi
|
|
fi
|
|
eval "$answer"='$cmdout'
|
|
return $cmdres
|
|
}
|
|
|
|
# Initializes the first vault pod, only needs to be performed once
|
|
# after deploying the helm chart
|
|
# Stores the root token and master key shards in k8s secrets
|
|
function initVault {
|
|
local V0 # the zeroeth vault pod
|
|
local keys
|
|
local key_error
|
|
local shares
|
|
local threshold
|
|
|
|
V0=$(awk 'NR==1{print $2}' $WORKDIR/pods.txt)
|
|
log $INFO "Initializing $V0"
|
|
shares='"secret_shares": '$KEY_SECRET_SHARES
|
|
threshold='"secret_threshold": '$KEY_REQUIRED_THRESHOLD
|
|
|
|
NO_HEADER=true \
|
|
vaultAPI keys POST $V0.$POD_TARGET_BASE \
|
|
/sys/init "{$shares, $threshold}"
|
|
|
|
key_error=$(echo -n "$keys"| jq -r '.errors[]?')
|
|
if [ -n "$key_error" ]; then
|
|
log $ERROR "vault init request failed: $key_error"
|
|
fi
|
|
|
|
echo "$keys" | storeVaultInitSecrets cluster-key
|
|
|
|
# check if the secrets match vault's REST API response
|
|
echo "$keys" | validateSecrets cluster-key
|
|
}
|
|
|
|
# Uses the master key shards to unseal vault
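#
# For example, unsealing the server whose pod DNS name is
# 172-16-226-97 (an illustrative address) with the default
# "cluster-key" shard secrets:
#   unsealVault 172-16-226-97 cluster-key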
|
|
function unsealVault {
|
|
local server="$1"
|
|
local prefix="$2"
|
|
local index
|
|
local b64key
|
|
local data
|
|
local response
|
|
local value
|
|
local autherror
|
|
|
|
if [ -z "$prefix" ]; then
|
|
prefix='cluster-key'
|
|
fi
|
|
|
|
# always abort an unseal in progress
|
|
data='{"reset": true}'
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_UNSEAL_OP_TMOUT \
|
|
vaultAPI response POST $server.$POD_TARGET_BASE \
|
|
/sys/unseal "$data"
|
|
if [ $? -ne 0 ]; then
|
|
# error is already printed
|
|
# Including if vault is already unsealed.
|
|
if [[ "$response" == *"vault is unsealed"* ]]; then
|
|
log $WARNING "unsealVault: server $server is" \
|
|
"already unsealed"
|
|
fi
|
|
return 1
|
|
fi
|
|
|
|
for index in $(seq 0 $((KEY_SECRET_SHARES - 1 ))); do
|
|
b64key=$( get_secret "${prefix}-$index" \
|
|
| jq -r '.keys_base64[]' )
|
|
data="{\"key\": \"$b64key\"}"
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_UNSEAL_OP_TMOUT \
|
|
vaultAPI response POST $server.$POD_TARGET_BASE \
|
|
/sys/unseal "$data"
|
|
if [ $? -ne 0 ]; then
|
|
# error is already printed, including errors from the
|
|
# vault REST API; but for debugging purposes, highlight
|
|
# the authentication error
|
|
autherror="cipher: message authentication failed"
|
|
if [[ "$response" == *"$autherror"* ]]; then
|
|
log $ERROR "Failed to authenticate /sys/unseal" \
|
|
"with $prefix"
|
|
# perhaps use this info in the future
|
|
return 2
|
|
fi
|
|
log $DEBUG "Unknown failure authenticating unseal" \
|
|
"$response"
|
|
return 1
|
|
fi
|
|
|
|
# when the unseal completes with KEY_REQUIRED_THRESHOLD then
|
|
# the response will indicate sealed=false
|
|
value="$( echo "$response" | jq -r ".sealed" )"
|
|
if [ "$value" == "false" ]; then
|
|
log $DEBUG "Success authenticating unseal"
|
|
return 0
|
|
fi
|
|
|
|
value="$( echo "$response" | jq -r ".progress" )"
|
|
log $DEBUG "Success authenticating unseal" \
|
|
"(${value}/${KEY_REQUIRED_THRESHOLD})"
|
|
# Some sleep is required to allow Raft convergence
|
|
sleep "$UNSEAL_CONVERGE_TIME"
|
|
done
|
|
|
|
log $ERROR "unsealVault completes without unseal or error"
|
|
return 1
|
|
}
|
|
|
|
# Unseal a vault server under conditions of recovery,
|
|
# including selecting and remembering alternate shard
|
|
# secrets.
|
|
#
|
|
# This algorithm remembers the last shards used to unseal the vault,
|
|
# to prioritize using those again the next time.
|
|
function unsealVaultRecover {
|
|
local server="$1"
|
|
local attempted
|
|
local use_secrets=""
|
|
|
|
if [ -n "$SHARDS_LAST_SUCCESSFUL" ]; then
|
|
# double check the keys we were using are not deleted
|
|
if assertShardSecrets "$SHARDS_LAST_SUCCESSFUL"; then
|
|
use_secrets="$SHARDS_LAST_SUCCESSFUL"
|
|
fi
|
|
fi
|
|
|
|
use_secrets="$use_secrets $( \
|
|
getOtherShardSecrets "$SHARDS_LAST_SUCCESSFUL" )"
|
|
for attempted in $use_secrets; do
|
|
log $INFO "Attempt unseal with $attempted"
|
|
unsealVault "$server" "$attempted"
|
|
case $? in
|
|
0)
|
|
SHARDS_LAST_SUCCESSFUL="$attempted"
|
|
return 0
|
|
;;
|
|
2)
|
|
# an error is already printed
|
|
# try a different set of shards
|
|
continue
|
|
;;
|
|
*)
|
|
# failure is not clear, try again later
|
|
log $ERROR "Fail to unseal $server with" \
|
|
"$attempted; try later"
|
|
return 1
|
|
;;
|
|
esac
|
|
done
|
|
|
|
log $ERROR "No set of shards unseal the server $server:" \
|
|
"attempted: $use_secrets"
|
|
return 1
|
|
}
|
|
|
|
# Takes the address of vault-0 as the cluster leader and
|
|
# joins other nodes to raft
|
|
function joinRaft {
|
|
local dnsname="$1"
|
|
local activeLink="https://${ACTIVE_TARGET}:${TARGET_PORT}"
|
|
local dataJson="{\"leader_api_addr\": \"$activeLink\", \"leader_ca_cert\": \"$CA_ONELINE\"}"
|
|
RAFT_STATUS=""
|
|
while [ "$RAFT_STATUS" != "true" ]; do
|
|
|
|
vaultAPI RAFT_STATUS POST $dnsname.$POD_TARGET_BASE \
|
|
/sys/storage/raft/join "$dataJson"
|
|
|
|
log $INFO "$dnsname $RAFT_STATUS"
|
|
RAFT_STATUS=$(echo $RAFT_STATUS | jq -r .joined)
|
|
sleep "$JOIN_CONVERGE_TIME"
|
|
done
|
|
}
|
|
|
|
function runStateMachine {
|
|
local host="$1"
|
|
local dns_name="$2"
|
|
local sealed="$3"
|
|
local status_rec
|
|
local old_rec
|
|
local counter
|
|
|
|
status_rec="/$host/$dns_name/$sealed/"
|
|
|
|
# log compression: do not print logs when status is unchanged
|
|
# omit counter when checking vault server state change
|
|
old_rec="$( grep "$status_rec" "$PODREC_F" )"
|
|
if [ $? -ne 0 ]; then
|
|
log $DEBUG "$( grep "$dns_name" $WORKDIR/pods.txt )"
|
|
log $INFO "Sealed status of $dns_name is now: $sealed"
|
|
|
|
# reread the record by hostname only
|
|
old_rec="$( grep "^/$host/" "$PODREC_F" )"
|
|
else
|
|
log $DEBUG "There is no change in pod seal status"
|
|
fi
|
|
|
|
if [ "$sealed" != "true" ]; then
|
|
# There is nothing more to do: the vault is unsealed
|
|
# or the sealed status is unclear
|
|
echo "$status_rec" >> "$PODREC_TMP_F"
|
|
return
|
|
fi
|
|
|
|
# The vault is sealed
|
|
#
|
|
# Check if there is a countdown in progress
|
|
#
|
|
# else -z old_rec: "the pod didn't have an IP address the last
|
|
# iteration, but now it does" - treat the same as "sealed
|
|
# without a countdown"
|
|
counter=""
|
|
if [ -n "$old_rec" ]; then
|
|
counter="$( echo "$old_rec" | awk -F/ '{print $5}' )"
|
|
fi
|
|
|
|
if [ -z "$counter" ]; then
|
|
# sealed without a countdown: start counting
|
|
log $DEBUG "Sealed vault $host: begin unseal delay:" \
|
|
"$( expr "$STATUS_RATE" \* "$STATEMACH_START" )s"
|
|
echo "${status_rec}${STATEMACH_START}" >> "$PODREC_TMP_F"
|
|
return
|
|
fi
|
|
|
|
# Check for end of period: 1 means "zero at this interval"
|
|
# "less than 1" for resilience
|
|
if [ "$counter" -le 1 -o "$STATEMACH_START" -eq 0 ]; then
|
|
# We've waited (STATUS_RATE * STATEMACH_START) seconds
|
|
# Or, STATEMACH_START == 0 means do not delay
|
|
log $INFO "Unsealing $dns_name"
|
|
unsealVaultRecover "$dns_name"
|
|
echo "$status_rec" >> "$PODREC_TMP_F"
|
|
return
|
|
fi
|
|
|
|
# finally, continue to countdown
|
|
counter="$( expr "$counter" - 1 )"
|
|
echo "${status_rec}${counter}" >> "$PODREC_TMP_F"
|
|
}
|
|
|
|
function vaultInitialized {
|
|
local response
|
|
local dnsname
|
|
local initialized
|
|
local text
|
|
|
|
# Wait for the pod to respond with a positive vault API response
|
|
# (i.e., not just a curl failure, and not a vault API failure)
|
|
while true; do
|
|
dnsname=$(awk 'NR==1{print $2}' $WORKDIR/pods.txt)
|
|
if [ -z "$dnsname" ]; then
|
|
log $INFO "waiting..."
|
|
sleep $STATUS_RATE
|
|
getVaultPods > $WORKDIR/pods.txt
|
|
continue
|
|
fi
|
|
|
|
log $INFO "Query server $dnsname for initialization status"
|
|
NO_HEADER=true \
|
|
API_TMOUT=$QUERY_TMOUT \
|
|
vaultAPI response GET $dnsname.$POD_TARGET_BASE /sys/health
|
|
if [ $? -ne 0 ]; then
|
|
log $INFO "waiting..."
|
|
sleep $STATUS_RATE
|
|
getVaultPods > $WORKDIR/pods.txt
|
|
continue
|
|
fi
|
|
break
|
|
done
|
|
|
|
echo -n "$response" > $WORKDIR/healthcheck.txt
|
|
initialized=$( echo "$response" | jq -r .initialized )
|
|
|
|
text="$( grep $dnsname $WORKDIR/pods.txt )"
|
|
if [ $? -eq 0 ]; then
|
|
log $DEBUG "$text"
|
|
log $DEBUG "Initialized status is $initialized"
|
|
fi
|
|
|
|
# The empty check is here as an extra safety net, but an
|
|
# investigation into the exact conditions in which the result
|
|
# would be empty would be helpful.
|
|
if [ ! -z $initialized ] && [ $initialized = false ]; then
|
|
return 1
|
|
else
|
|
return 0
|
|
fi
|
|
}
|
|
|
|
function set_secret {
|
|
local secret="$1"
|
|
local contentf="$2"
|
|
local output
|
|
local result
|
|
|
|
output="$( $KUBECTL create secret generic -n "$VAULT_NS" \
|
|
"$secret" "--from-file=strdata=$contentf" 2>&1 )"
|
|
result=$?
|
|
if [ "$result" -ne 0 ]; then
|
|
log $ERROR "Failed to create secret $secret"
|
|
log $DEBUG "Output: [$output]"
|
|
fi
|
|
return $result
|
|
}
|
|
|
|
function get_secret {
|
|
local secret="$1"
|
|
|
|
$KUBECTL get secrets -n "$VAULT_NS" "$secret" \
|
|
-o jsonpath='{.data.strdata}' \
|
|
| base64 -d
|
|
}
|
|
|
|
# When vault-manager is run in "MOUNT_HELPER" mode, this function
|
|
# will not return. Instead the function will exit_on_trap or exit
|
|
# when it times-out.
|
|
#
|
|
# Basically: this function doesn't do anything except wait to be
|
|
# terminated.
|
|
#
|
|
# Vault-manager in MOUNT_HELPER has PVC mounted, allowing the real
|
|
# vault-manager to read secrets from cluster_keys.json
|
|
function mountHelper {
|
|
local count
|
|
|
|
# omit this function if this pod is not the mount helper
|
|
if [ -z "$MANAGER_MODE" -o "$MANAGER_MODE" != "MOUNT_HELPER" ]; then
|
|
log $INFO "Mode is VAULT_MANAGER"
|
|
return
|
|
fi
|
|
|
|
# When vault-manager is running in this mode, it should be
|
|
# deleted by vault-manager running in the default mode, which
|
|
# is using this pod to read secrets from mounted PVC
|
|
log $INFO "Mode is $MANAGER_MODE"
|
|
|
|
# start with some debug/error logs
|
|
if [ -f "$PVC_DIR/cluster_keys.json" ]; then
|
|
log $DEBUG "Successfully mounted secrets file"
|
|
else
|
|
log $WARNING "Secrets file not found"
|
|
fi
|
|
|
|
# sleep for MOUNT_HELPER_MAX_TIME, expecting SIGTERM signal
|
|
log $INFO "Waiting for termination request via SIGTERM"
|
|
count=0
|
|
while [ "$count" -lt "$MOUNT_HELPER_MAX_TIME" ]; do
|
|
exit_on_trap
|
|
count=$((count+1))
|
|
sleep 1
|
|
done
|
|
|
|
# Normally should exit by exit_on_trap, but here we timeout
|
|
# waiting for the real vault-manager to delete this job/pod.
|
|
log $INFO "Exiting without receiving SIGTERM request"
|
|
exit 0
|
|
}
|
|
|
|
# Check if a secret exists
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Prints the name of the secret
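#
# For example (as used by requestRekey):
#   secretExists cluster-rekey-request >/dev/null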
|
|
function secretExists {
|
|
local name="$1"
|
|
$KUBECTL get secrets -n "$VAULT_NS" "$name" \
|
|
-o jsonpath='{.metadata.name}' 2>/dev/null \
|
|
| grep "$name"
|
|
}
|
|
|
|
# Return linux success=0 if any of the secrets exist
|
|
function secretsExistAny {
|
|
local list="$@"
|
|
local name
|
|
|
|
for name in $list; do
|
|
secretExists $name >/dev/null
|
|
if [ $? -eq 0 ]; then
|
|
return 0
|
|
fi
|
|
done
|
|
|
|
return 1
|
|
}
|
|
|
|
# Assert that the shard secrets starting with prefix exist
|
|
#
|
|
# Parameter: prefix for k8s secrets, such as 'cluster-key'
|
|
#
|
|
# Optional second parameter:
|
|
# --nokeys : fails if at least one exists
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
#
|
|
# When --nokeys is selected, the failure return code is the number
|
|
# of secrets found. Zero secrets were expected.
|
|
#
|
|
# When --nokeys is omitted, the failure return code is either the
|
|
# number of secrets found or if the number of secrets found was
|
|
# zero, KEY_SECRET_SHARES is returned as error code
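#
# Usage sketch (secret prefixes as used elsewhere in this script):
#   assertShardSecrets cluster-key           # all shards present?
#   assertShardSecrets cluster-key --nokeys  # no shards at all?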
|
|
function assertShardSecrets {
|
|
local prefix="$1"
|
|
local nokey="$2"
|
|
local i
|
|
local count=0
|
|
|
|
for i in $( seq 0 $((KEY_SECRET_SHARES-1)) ); do
|
|
secretExists "${prefix}-$i" >/dev/null
|
|
if [ $? -eq 0 ]; then
|
|
count=$((count+1))
|
|
fi
|
|
done
|
|
if [ "$nokey" == "--nokeys" ]; then
|
|
# 0 secrets == true (0)
|
|
# Else return the number of secrets
|
|
return $count
|
|
fi
|
|
if [ "$count" -eq "$KEY_SECRET_SHARES" ]; then
|
|
return 0
|
|
elif [ "$count" -eq 0 ]; then
|
|
return "$KEY_SECRET_SHARES" # an error result
|
|
fi
|
|
return "$count"
|
|
}
|
|
|
|
# Return a list of other existing Shard secrets other than the set
|
|
# specified
|
|
#
|
|
# Sort by priority order:
|
|
# cluster-key
|
|
# cluster-rekey
|
|
# cluster-key-bk
|
|
#
|
|
function getOtherShardSecrets {
|
|
local omit="$1"
|
|
local secrets="cluster-key cluster-rekey cluster-key-bk"
|
|
local secret
|
|
local others=""
|
|
|
|
for secret in $secrets; do
|
|
if [ "$secret" == "$omit" ]; then
|
|
continue
|
|
fi
|
|
if assertShardSecrets $secret; then
|
|
others="$others $secret"
|
|
fi
|
|
done
|
|
echo $others
|
|
}
|
|
|
|
# Delete the specified list of secrets
|
|
#
|
|
# Uses a single kubectl command
|
|
function deleteSecrets {
|
|
local secrets="$@"
|
|
local text
|
|
text="$( $KUBECTL delete secrets -n "$VAULT_NS" \
|
|
$secrets 2>&1 )"
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Error deleting secrets: ["$text"]"
|
|
return 1
|
|
fi
|
|
log $INFO $text
|
|
return 0
|
|
}
|
|
|
|
# Check if the PVC resource exists
|
|
#
|
|
# Returns 0 if pvc does not exist
|
|
# Returns 1 if pvc exists but is terminating
|
|
# Returns 2 if pvc exists and is not terminating
|
|
# Prints the name of the PVC resource
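#
# For example (as used by deletePVC below):
#   name="$( pvcRemoved )"
#   # return code 2 means the PVC exists and is not terminating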
|
|
function pvcRemoved {
|
|
local text
|
|
local jqscript
|
|
|
|
jqscript='.items
|
|
| map(select(.metadata.name | test("^manager-pvc")))
|
|
| "\(.[0].metadata.name) \(.[0].status.phase)"'
|
|
|
|
# using jq since kubernetes does not support regex
|
|
# the grep makes sure the result contains the 'manager-pvc'
|
|
# string (as opposed to 'null' for example)
|
|
text="$(
|
|
$KUBECTL get persistentvolumeclaims -n "$VAULT_NS" -o json \
|
|
| jq -r "$jqscript" 2>/dev/null \
|
|
| grep manager-pvc )"
|
|
|
|
if [ -n "$text" ]; then
|
|
readarray -d " " -t pvcInfo <<< "$text"
|
|
pvcName="${pvcInfo[0]}"
|
|
pvcStatus="${pvcInfo[1]}"
|
|
echo "$pvcName"
|
|
if [ "$pvcStatus" = "Terminating" ]; then
|
|
return 1
|
|
else
|
|
return 2
|
|
fi
|
|
fi
|
|
|
|
return 0
|
|
}
|
|
|
|
# Check if the PVC is mounted to any pod in vault namespace
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Prints the name of the PVC resource
|
|
function testPVCMount {
|
|
local result
|
|
local cspec
|
|
local vspec
|
|
|
|
cspec=".items[*].spec.containers[*]"
|
|
vspec="volumeMounts[?(@.name=='manager-pvc')].name"
|
|
|
|
# this kubectl query returns zero whether manager-pvc is
|
|
# found or not
|
|
# result variable is either empty or 'manager-pvc'
|
|
result="$( $KUBECTL get pods -n "$VAULT_NS" \
|
|
-o jsonpath="{${cspec}.${vspec}}" )"
|
|
|
|
if [ -n "$result" ]; then
|
|
return 0
|
|
fi
|
|
return 1 # assertion 'fails'
|
|
}
|
|
|
|
# This function prints a DEBUG log of kubectl delete
|
|
function deleteMountHelper {
|
|
local text
|
|
local result
|
|
|
|
log $DEBUG "Waiting for delete of mount-helper job"
|
|
text="$( $KUBECTL delete --ignore-not-found=true --wait=true \
|
|
-f /opt/yaml/pvc-attach.yaml 2>&1 )"
|
|
result=$?
|
|
log $DEBUG "Output of deleting mount-helper: [$text]"
|
|
return $result
|
|
}
|
|
|
|
# Run shred on the file content of PVC
|
|
#
|
|
# All files are shredded, and the result is an error if
|
|
# - command return code is non-zero
|
|
# - file comparison shows unchanged file(s)
|
|
#
|
|
# A warning is issued if shred/kubectl command has any stdout or
|
|
# stderr
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
function securelyWipePVC {
|
|
local helper="$1"
|
|
|
|
if [ -z "$helper" ]; then
|
|
log $ERROR "No pod specified for shredding"
|
|
return 1
|
|
fi
|
|
|
|
# get profile of the files before shredding
|
|
$KUBECTL exec -n "$VAULT_NS" "$helper" -- \
|
|
bash -c 'find /mnt/data -type f \
|
|
| sort | xargs wc | head -n-1' \
|
|
>/tmp/shred_before.txt 2>&1
|
|
log $DEBUG "Original files: [$( cat /tmp/shred_before.txt )]"
|
|
|
|
# run the shred command
|
|
#
|
|
# Shred all the files in mounted /mnt/data/
|
|
#
|
|
# The shred by default has three randomized passes, and with -z
|
|
# option will finalize with zeros. -f prompts shred to work
|
|
# around any unexpected file permissions
|
|
text="$( $KUBECTL exec -n "$VAULT_NS" "$helper" -- \
|
|
bash -c '\
|
|
result=0; \
|
|
while read fname; do \
|
|
shred -f -z "$fname"; \
|
|
[ $? -ne 0 ] && result=1; \
|
|
done <<<"$(find /mnt/data -type f )"; \
|
|
exit $result' 2>&1 )"
|
|
result=$?
|
|
|
|
# get profile of the files after shredding
|
|
$KUBECTL exec -n "$VAULT_NS" "$helper" -- \
|
|
bash -c 'find /mnt/data -type f \
|
|
| sort | xargs wc | head -n-1' \
|
|
>/tmp/shred_after.txt 2>&1
|
|
log $DEBUG "Shredded files: [$( cat /tmp/shred_after.txt )]"
|
|
|
|
# compare the profiles for error reporting
|
|
#
|
|
# If the file lists, pushed through wc, have files with the same
|
|
# character, word, and line counts then report an error: a file
|
|
# has not been shredded
|
|
#
|
|
# Ignore files that were empty
|
|
difftext="$( diff -wuU100000 /tmp/shred_before.txt \
|
|
/tmp/shred_after.txt )"
|
|
unchanged="$( echo "$difftext" | grep "^ " \
|
|
| grep -v "^\([ ]\{1,\}0\)\{3\} /" )"
|
|
|
|
# Report the errors/success
|
|
if [ "$result" -ne 0 ]; then
|
|
log $ERROR "Error on shred: [$text]"
|
|
if [ -n "$unchanged" ]; then
|
|
log $ERROR "Unchanged: [$unchanged]"
|
|
fi
|
|
return 1
|
|
fi
|
|
if [ -n "$text" ]; then
|
|
log $WARNING "Output of shred is not empty: [$text]"
|
|
fi
|
|
if [ -n "$unchanged" ]; then
|
|
log $ERROR "Shred did not shred some files"
|
|
log $ERROR "Unchanged: [$unchanged]"
|
|
return 1
|
|
fi
|
|
|
|
log $INFO "Shredding of PVC data verified"
|
|
|
|
return 0
|
|
}
|
|
|
|
# Delete the PVC resource
|
|
#
|
|
# The delete will succeed even if attached to a pod, such as a
|
|
# terminating vault-manager or mount-helper - the PVC remains
|
|
# in terminating status until the pod is also terminated.
|
|
function deletePVC {
|
|
local text
|
|
local name
|
|
|
|
name="$( pvcRemoved )"
|
|
if [ $? -eq 2 ] && [[ "$name" =~ ^manager-pvc ]]; then
|
|
text="$( $KUBECTL delete persistentvolumeclaims \
|
|
-n "$VAULT_NS" "$name" 2>&1 )"
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Error deleting PVC: [$text]"
|
|
else
|
|
log $INFO "$text"
|
|
fi
|
|
else
|
|
log $WARNING "Request to delete PVC but PVC not found"
|
|
fi
|
|
}
|
|
|
|
# Run a job/pod, to mount the PVC resource, and retrieve the secrets
|
|
# from PVC.
|
|
#
|
|
# See also the function mountHelper and the ConfigMap named:
|
|
# {{ .Values.vault.name }}-mount-helper
|
|
#
|
|
# This function does not support overwriting an existing
|
|
# cluster-key-* secret, but it does support validating those secrets
|
|
# if they exist
|
|
function convertPVC {
|
|
local output
|
|
local pod
|
|
local count
|
|
local text
|
|
local PVCtext
|
|
local result
|
|
local waitPVCterm
|
|
|
|
if testPVCMount; then
|
|
log $ERROR "Cannot mount PVC already mounted"
|
|
return 1
|
|
fi
|
|
|
|
# run the pod
|
|
output="$( $KUBECTL apply -f /opt/yaml/pvc-attach.yaml 2>&1 )"
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Failed to apply mount-helper"
|
|
log $DEBUG "Output: [$output]"
|
|
deleteMountHelper
|
|
return 1
|
|
fi
|
|
|
|
# wait for pod
|
|
pod=''
|
|
count=0
|
|
log $INFO "Waiting for mount-helper pod to run"
|
|
while [ -z "$pod" -a "$count" -le "$MAX_POD_RUN_TRIES" ]; do
|
|
count=$((count+1))
|
|
text="$( $KUBECTL get pods -n "$VAULT_NS" \
|
|
| grep "mount-helper" )"
|
|
pod="$( echo "$text" | grep "Running" | awk '{print $1}' )"
|
|
if [ -z "$pod" ]; then
|
|
sleep 1
|
|
fi
|
|
done
|
|
|
|
if [ -z "$pod" ]; then
|
|
log $ERROR "Failed to run mount-helper pod"
|
|
log $DEBUG "Pod state: [$( echo $text )]"
|
|
deleteMountHelper
|
|
return 1
|
|
fi
|
|
|
|
# get the pvc data
|
|
PVCtext="$( $KUBECTL exec -n "$VAULT_NS" "$pod" \
|
|
-- cat /mnt/data/cluster_keys.json )"
|
|
if [ $? -ne 0 -o -z "$PVCtext" ]; then
|
|
log $ERROR "Failed to read cluster_keys.json"
|
|
deleteMountHelper
|
|
return 1
|
|
fi
|
|
log $INFO "Data retrieved from PVC"
|
|
|
|
# if the Root secret is pre-existing, compare the existing
|
|
# shard secrets and root secret before deleting the PVC
|
|
$KUBECTL get secrets -n "$VAULT_NS" \
|
|
cluster-key-root >/dev/null 2>&1
|
|
if [ $? -eq 0 ]; then
|
|
log $INFO "Cluster secrets exist:" \
|
|
"validating"
|
|
else
|
|
# create a secret from the data
|
|
echo "$PVCtext" | storeVaultInitSecrets cluster-key
|
|
fi
|
|
|
|
# verify the data stored versus text from PVC
|
|
echo "$PVCtext" | validateSecrets cluster-key
|
|
result=$?
|
|
if [ "$result" -eq 0 ]; then
|
|
securelyWipePVC "$pod"
|
|
# omit deleting the PVC for manual analysis and shred
|
|
# when the wipe fails
|
|
if [ $? -eq 0 ]; then
|
|
deletePVC
|
|
fi
|
|
fi
|
|
|
|
# clean up but do not care about the result
|
|
deleteMountHelper
|
|
|
|
# Sleep before finishing conversion, so that pvc termination process has started
|
|
waitPVCterm=5
|
|
sleep $waitPVCterm
|
|
|
|
return $result
|
|
}
|
|
|
|
function convertBootstrapSecrets {
|
|
local text
|
|
local count
|
|
|
|
text="$( get_secret cluster-key-bootstrap )"
|
|
echo "$text" | storeVaultInitSecrets cluster-key
|
|
|
|
# verify the split secrets versus the bootstrap text
|
|
echo "$text" | validateSecrets cluster-key
|
|
if [ $? -ne 0 ]; then
|
|
# an error is already printed
|
|
return 1
|
|
fi
|
|
|
|
deleteSecrets cluster-key-bootstrap
|
|
|
|
# Also validate and delete the PVC resource
|
|
# This procedure depends on waiting for the old version
|
|
# of vault-manager pod to exit
|
|
count="$TERMINATE_TRIES_MAX"
|
|
log $INFO "Waiting for vault-manager pod to exit"
|
|
while testPVCMount && [ "$count" -gt 0 ]; do
|
|
sleep "$TERMINATE_TRIES_SLEEP"
|
|
count=$((count-1))
|
|
done
|
|
|
|
if [ $count -eq 0 ]; then
|
|
log $WARNING "Maximum time reached waiting" \
|
|
"for the previous pod to be terminated."
|
|
fi
|
|
|
|
convertPVC
|
|
}
|
|
|
|
# When enabled, after conversion of storage from PVC to k8s secrets,
|
|
# Vault-manager will prompt itself to rekey the vault server
|
|
# storage.
|
|
function requestRekey {
|
|
local value
|
|
|
|
if [ "$AUTO_REKEY_CONVERT" != "true" ]; then
|
|
return
|
|
fi
|
|
log $INFO "Auto rekey enabled: [$AUTO_REKEY_CONVERT]"
|
|
|
|
secretExists cluster-rekey-request >/dev/null
|
|
if [ $? -eq 0 ]; then
|
|
value="$( get_secret cluster-rekey-request )"
|
|
log $WARNING "Auto rekey: rekey request exists: $value"
|
|
return
|
|
fi
|
|
|
|
value=$( uuidgen )
|
|
set_secret cluster-rekey-request /dev/stdin <<<"$value"
|
|
if [ $? -eq 0 ]; then
|
|
log $INFO "Rekey requested: $value"
|
|
else
|
|
log $ERROR "Failed to request rekey: $value"
|
|
fi
|
|
return
|
|
}
|
|
|
|
function runConversion {
|
|
if [ -n "$K8S_SECRETS_PREEXIST" ]; then
|
|
log $INFO "Cluster secrets exist"
|
|
return
|
|
elif [ -n "$BOOTSTRAP_PREEXISTS" ]; then
|
|
# this is the normal application update procedure; the
|
|
# lifecycle code retrieved the secrets from previous version
|
|
# of the application.
|
|
log $INFO "Using secrets provided in $BOOTSTRAP_PREEXISTS"
|
|
convertBootstrapSecrets
|
|
requestRekey
|
|
return
|
|
elif [ -z "$PVC_PREEXISTS" ]; then
|
|
log $INFO "No pre-existing secrets exist"
|
|
return
|
|
fi
|
|
|
|
# Finally, read the pre-existing PVC. This occurs if the
|
|
# application updates outside of application-update. For
|
|
# example if the old application is removed and deleted, and the
|
|
# new application is uploaded and applied.
|
|
convertPVC
|
|
requestRekey
|
|
}
|
|
|
|
# Test whether the specified vault server(s) agree with the
|
|
# specified status of the specified endpoint
|
|
#
|
|
# Print DEBUG logs when status is non-conforming (the function will
|
|
# be used to wait for conformance).
|
|
#
|
|
# The first parameter is the vault API endpoint to check status
|
|
# of, either /sys/rekey/init or /sys/rekey/verify
|
|
# The second parameter is the quoted string of json data returned
|
|
# from vault REST API call. The data should include these fields,
|
|
# which are tested for conformance:
|
|
# {"nonce": "S", "started": B, "progress": N,
|
|
# "verification_required": B}
|
|
#
|
|
# The other parameters are the servers to test, specified as
|
|
# dash-separated IP address output of getVaultPods (XX-XX-XX-XX)
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
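#
# For example (a sketch; the pod DNS name is illustrative):
#   data='{"nonce": "", "started": false, "progress": 0,
#          "verification_required": false}'
#   assertRekeyStatus /sys/rekey/init "$data" 172-16-226-97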
|
|
function assertRekeyStatus {
|
|
local endpoint="$1"
|
|
local data="$2"
|
|
shift 2
|
|
local -a servers=($@)
|
|
local -a key_arr
|
|
local required
|
|
local jscript
|
|
local key
|
|
local index
|
|
local error
|
|
local server
|
|
local response
|
|
local record
|
|
|
|
required="nonce progress started verification_required"
|
|
jscript=".nonce, .progress, .started, .verification_required"
|
|
if [ "$endpoint" == "/sys/rekey/verify" ]; then
|
|
required="nonce progress started"
|
|
jscript=".nonce, .progress, .started"
|
|
fi
|
|
|
|
# quick check to assure the data parameter is sane
|
|
key_arr=($(echo "$data" | jq -r 'keys[]' | sort))
|
|
for key in $required; do
|
|
if [[ " ${key_arr[*]} " != *" $key "* ]]; then
|
|
log $ERROR "assertRekeyStatus requires: [$required]," \
|
|
"received: ${key_arr[*]}"
|
|
return 1
|
|
fi
|
|
done
|
|
|
|
required="$( echo "$data" | jq -r "$jscript" )"
|
|
|
|
index=0
|
|
error=0
|
|
while [ "$index" -lt "${#servers[@]}" ]; do
|
|
server="${servers[$index]}"
|
|
index=$((index+1))
|
|
server="${server}.$POD_TARGET_BASE"
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI response GET "$server" "$endpoint"
|
|
if [ $? -ne 0 -o -z "$response" ]; then
|
|
# failing the REST API call is not the same
|
|
# as non-conformance
|
|
return 2
|
|
continue
|
|
fi
|
|
|
|
record="$( echo "$response" | jq -r "$jscript" )"
|
|
if [ "$record" != "$required" ]; then
|
|
log $ERROR "$server does not conform to:" \
|
|
"$( echo "$data" | jq -c '.' )"
|
|
log $DEBUG "$server does not confirm: $response"
|
|
error=1
|
|
continue
|
|
fi
|
|
log $DEBUG "$server conforms: $response"
|
|
done
|
|
|
|
return $error
|
|
}
|
|
|
|
# Test whether the vault server(s) agree about rekey status
|
|
#
|
|
# The parameter is the quoted string of json data to pass to
|
|
# assertRekeyStatus
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
function assertServerStatus {
|
|
local reference="$1"
|
|
local pods
|
|
local count
|
|
|
|
pods="$( getVaultPods | awk '{print $2}' )"
|
|
count="$( echo $pods | wc -w )"
|
|
if [ "$count" -ne "$HA_REPLICAS" ]; then
|
|
log $ERROR "server without IP does not conform"
|
|
return 1
|
|
fi
|
|
assertRekeyStatus "/sys/rekey/init" "$reference" $pods
|
|
}
|
|
|
|
# Test whether the vault server(s) agree about rekey validation
|
|
# status. Warn when the active vault server changes
|
|
#
|
|
# The parameter is the quoted string of json data to pass to
|
|
# assertRekeyStatus
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
function assertVerifyStatus {
|
|
local reference="$1"
|
|
local response
|
|
local pods
|
|
local result
|
|
local count
|
|
|
|
# first assert the rekey status; /sys/rekey/verify returns
|
|
# error if a server does not have rekey in progress
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI response GET $ACTIVE_TARGET /sys/rekey/init
|
|
result=$?
|
|
if [ "$result" -ne 0 ]; then
|
|
return $result
|
|
fi
|
|
assertServerStatus "$response"
|
|
result=$?
|
|
if [ $result -ne 0 ]; then
|
|
return $result
|
|
fi
|
|
|
|
pods="$( getVaultPods | awk '{print $2}' )"
|
|
count="$( echo $pods | wc -w )"
|
|
if [ "$count" -ne "$HA_REPLICAS" ]; then
|
|
log $ERROR "server without IP does not conform"
|
|
return 1
|
|
fi
|
|
assertRekeyStatus "/sys/rekey/verify" "$reference" $pods
|
|
}
|
|
|
|
# Assert that the /sys/rekey/init endpoint reports no
|
|
# rekey procedure in progress on any server
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
function assertNoRekey {
|
|
local data
|
|
|
|
data='{"nonce": "", "started": false, "progress": 0'
|
|
data="$data"', "verification_required": false}'
|
|
assertServerStatus "$data"
|
|
}
|
|
|
|
# Retrieve the rekey status from the active vault server
# and assert that all servers conform to the status
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
function assertServersConform {
|
|
local response
|
|
local value
|
|
local result
|
|
local pods
|
|
local count
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI response GET $ACTIVE_TARGET /sys/rekey/init
|
|
if [ $? -ne 0 ]; then
|
|
# cannot check conformance
|
|
log $ERROR "Cannot check server conformance to" \
|
|
"/sys/rekey/init"
|
|
return 2
|
|
fi
|
|
|
|
assertServerStatus "$response"
|
|
result="$?"
|
|
if [ "$result" -ne 0 ]; then
|
|
return $result
|
|
fi
|
|
|
|
value="$( echo "$response" | jq -r '.verification_nonce' )"
|
|
if [ -z "$value" -o "$value" == "null" ]; then
|
|
return 0
|
|
fi
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI response GET $ACTIVE_TARGET /sys/rekey/verify
|
|
if [ $? -ne 0 ]; then
|
|
# cannot check conformance
|
|
log $ERROR "Cannot check server conformance to" \
|
|
"/sys/rekey/verify"
|
|
return 2
|
|
fi
|
|
|
|
pods="$( getVaultPods | awk '{print $2}' )"
|
|
count="$( echo $pods | wc -w )"
|
|
if [ "$count" -ne "$HA_REPLICAS" ]; then
|
|
log $ERROR "server without IP does not conform"
|
|
return 1
|
|
fi
|
|
assertRekeyStatus "/sys/rekey/verify" "$response" $pods
|
|
}
|
|
|
|
# This function is used during the pre-rekey assertions
|
|
# Testing if the main loop (via PODREC_F) indicates a server
|
|
# is not running.
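# A PODREC_F record is a '/'-delimited line; an illustrative record
# (the pod name and field values are examples only) looks like:
#   /sva-vault-0/<pod dns name>/<sealed>/
# The second field is the pod name, which is what is counted here.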
|
|
function allServersRunning {
|
|
local records
|
|
local count
|
|
|
|
records="$( grep "^/$VAULT_FN" "$PODREC_F" )"
|
|
count="$( awk -F/ '{print $2}' <<<"$records" | wc -w )"
|
|
if [ "$count" -ne "$HA_REPLICAS" ]; then
|
|
return 1
|
|
fi
|
|
return 0
|
|
}
|
|
|
|
# This function is used during the pre-rekey assertions
|
|
# Testing if the main loop (via PODREC_F) indicates a server
|
|
# is sealed
|
|
function allServersUnsealed {
|
|
local records
|
|
local count
|
|
|
|
records="$( grep "^/$VAULT_FN" "$PODREC_F" )"
|
|
count="$( grep "/false/" <<<"$records" \
|
|
| awk -F/ '{print $2}' | wc -w )"
|
|
if [ "$count" -ne "$HA_REPLICAS" ]; then
|
|
return 1
|
|
fi
|
|
|
|
return 0
|
|
}
|
|
|
|
# This function is used during the pre-rekey assertions
|
|
# Testing if the main loop (via PODREC_F) indicates a server
|
|
# is missing an IP address
|
|
function allServersHaveIP {
|
|
local records
|
|
local count
|
|
|
|
records="$( grep "^/$VAULT_FN" "$PODREC_F" )"
|
|
count="$( echo "$records" | awk -F/ '{print $3}' | wc -w )"
|
|
if [ "$count" -ne "$HA_REPLICAS" ]; then
|
|
return 1
|
|
fi
|
|
return 0
|
|
}
|
|
|
|
# Check the vault server pods' metadata label "vault-version",
|
|
# and assert that all servers are running the expected version
|
|
# which is coded in vault-manager values.yaml server.version
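# For illustration, the per-pod check below is roughly equivalent to
# (the pod name sva-vault-0 is only an example):
#   kubectl get pod -n "$VAULT_NS" sva-vault-0 \
#       -o jsonpath='{.metadata.labels.vault-version}'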
|
|
function allServersCurrent {
|
|
local jdata
|
|
local podcount
|
|
local i
|
|
local poddata
|
|
local name
|
|
local version
|
|
|
|
jdata="$( kubectl get pods -n "$VAULT_NS" -o json )"
|
|
podcount="$( echo "$jdata" | jq ".items | length" )"
|
|
|
|
for i in $( seq 0 $((podcount -1 )) ); do
|
|
poddata="$( echo "$jdata" | jq ".items[$i]" )"
|
|
name="$( echo "$poddata" | jq -r ".metadata.name" )"
|
|
if ! [[ "$name" =~ ^${VAULT_FN}-[0-9]$ ]]; then
|
|
# this is not a vault server pod
|
|
continue
|
|
fi
|
|
|
|
version="$( echo "$poddata" \
|
|
| jq -r '.metadata.labels["vault-version"]' )"
|
|
if [ "$version" != "$VAULT_VERSION" ]; then
|
|
log $INFO "Vault server pod $name is version $version"
|
|
return 1
|
|
fi
|
|
|
|
log $DEBUG "Vault server pod $name is version $version"
|
|
done
|
|
return 0
|
|
}
|
|
|
|
# Test the status of rekey procedure 'started' during pre-rekey
# tests for procedure progress selection (sharing a single vaultAPI
# call to GET /sys/rekey/init)
|
|
#
|
|
# Return linux true (0) if the status of /sys/rekey/init includes
|
|
# started == true
|
|
#
|
|
# Optional argument --not inverts the logic, but maintains
|
|
# error response 2
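# For example:
#   assertRekeyStarted        # 0 when a rekey is in progress
#   assertRekeyStarted --not  # 0 when no rekey is in progress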
|
|
function assertRekeyStarted {
|
|
local started
|
|
local not="$1"
|
|
|
|
# assert that a rekey is in progress
|
|
started="$( echo "$REKEY_STATUS_JSON" | jq -r '.started' )"
|
|
if [ "$started" == "true" ]; then
|
|
started=0
|
|
elif [ "$started" != "false" ]; then
|
|
# the rekey status is unclear
|
|
# an error is probably printed
|
|
log $DEBUG "unclear response for /sys/rekey/init:" \
|
|
"$( jq -c <<<"$REKEY_STATUS_JSON" )"
|
|
return 2
|
|
else
|
|
started=1
|
|
fi
|
|
|
|
if [ "$started" -eq 0 ]; then
|
|
if [ "$not" == "--not" ]; then
|
|
return 1
|
|
fi
|
|
return 0
|
|
fi
|
|
if [ "$not" == "--not" ]; then
|
|
return 0
|
|
fi
|
|
return 1
|
|
}
|
|
|
|
# Delete the shard secrets with the specified prefix
#
# The secrets are deleted with a single kubectl command
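# For example, with KEY_SECRET_SHARES=5 (an illustrative value),
#   deleteShardSecrets cluster-rekey
# removes cluster-rekey-0 through cluster-rekey-4 in one call to
# deleteSecrets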
|
|
function deleteShardSecrets {
|
|
local prefix="$1"
|
|
local i
|
|
local list=''
|
|
|
|
for i in $( seq 0 $((KEY_SECRET_SHARES-1)) ); do
|
|
if [ -n "$( secretExists "${prefix}-$i" )" ]; then
|
|
list="$list ${prefix}-$i"
|
|
fi
|
|
done
|
|
if [ -n "$list" ]; then
|
|
deleteSecrets $list
|
|
return $?
|
|
fi
|
|
return 0
|
|
}
|
|
|
|
# Make a copy of the shard secrets with specified prefix
|
|
#
|
|
# The calling function needs to verify the result
|
|
function copyShardSecrets {
|
|
local from="$1"
|
|
local to="$2"
|
|
local i
|
|
|
|
for i in $( seq 0 $((KEY_SECRET_SHARES-1))); do
|
|
get_secret "${from}-$i" \
|
|
| set_secret "${to}-$i" /dev/stdin
|
|
if [ $? -ne 0 ]; then
|
|
# don't try anything else
|
|
log $ERROR "Failed to copy ${from}-$i to ${to}-$i"
|
|
break
|
|
fi
|
|
done
|
|
}
|
|
|
|
# Just log the content of cluster-rekey-request again
|
|
#
|
|
# Keeps track of whether vault-manager has been restarted
|
|
# with REKEY_STARTED variable, so that the rekey procedure
|
|
# status is documented in the log
|
|
function rekeyResuming {
|
|
if [ "$REKEY_STARTED" -ne 0 ]; then
|
|
log $INFO "Resuming rekey:" \
|
|
"$( get_secret cluster-rekey-request )"
|
|
REKEY_STARTED=0
|
|
fi
|
|
}
|
|
|
|
# Return linux true (0) if a rekey is requested and the vault
|
|
# server pods are in a stable condition
|
|
#
|
|
# If the vault servers are not "stable" then the rekey operation
|
|
# needs that stability first. vault-manager's main runStateMachine
|
|
# will monitor pods and restore unsealed status.
|
|
function needsRekey {
|
|
local pods
|
|
local sealed
|
|
local response
|
|
local apiversion
|
|
|
|
# the first milestone to be created is cluster-rekey-request;
|
|
# the last milestone to be deleted is cluster-rekey-audit;
|
|
# proceed if any exists
|
|
secretsExistAny cluster-rekey-request \
|
|
cluster-rekey-verified \
|
|
cluster-rekey-shuffle \
|
|
cluster-rekey-audit
|
|
if [ $? -ne 0 ]; then
|
|
# rekey is not requested
|
|
return 1
|
|
fi
|
|
|
|
# progress the rekey procedure only if the servers are all
|
|
# running
|
|
if ! allServersRunning; then
|
|
log $INFO "Rekey: wait for vault servers to equal" \
|
|
"$HA_REPLICAS"
|
|
return 1
|
|
fi
|
|
|
|
# progress the rekey procedure only if the servers were
|
|
# previously unsealed.
|
|
if ! allServersUnsealed; then
|
|
log $INFO "Rekey: wait for unsealed vault servers to" \
|
|
"equal $HA_REPLICAS"
|
|
return 1
|
|
fi
|
|
|
|
# progress the rekey procedure only if the servers all have
|
|
# DNS names (IP addresses) provided by k8s
|
|
if ! allServersHaveIP; then
|
|
log $INFO "Rekey: wait for $HA_REPLICAS vault servers" \
|
|
"to have IP addresses"
|
|
return 1
|
|
fi
|
|
|
|
# progress a rekey if all server pods are running the expected
|
|
# server version
|
|
if ! allServersCurrent; then
|
|
log $INFO "Rekey: wait for vault servers to be updated" \
|
|
"to the current version $VAULT_VERSION"
|
|
return 1
|
|
fi
|
|
|
|
# The above four tests are based on the output of the kubectl get
# pods command. Double-check with a REST API call to each server
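# An abbreviated /sys/health response looks roughly like this
# (values are illustrative):
#   {"initialized": true, "sealed": false, "version": "1.14.8", ...}
# Only .sealed and .version are inspected below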
|
|
pods="$( getVaultPods | grep "^$VAULT_FN" | awk '{print $2}' )"
|
|
for pod in $pods; do
|
|
NO_HEADER=true \
|
|
API_TMOUT=$QUERY_TMOUT \
|
|
vaultAPI response GET ${pod}.$POD_TARGET_BASE /sys/health
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "$pod fails health check during rekey"
|
|
return 1
|
|
fi
|
|
sealed="$( echo "$response" | jq -r '.sealed' )"
|
|
if [ "$sealed" != "false" ]; then
|
|
log $ERROR "$pod is sealed during rekey"
|
|
return 1
|
|
fi
|
|
apiversion="$( echo "$response" | jq -r '.version' )"
|
|
if [ "$apiversion" != "$VAULT_VERSION" ]; then
|
|
log $ERROR "$pod is not version $VAULT_VERSION"
|
|
return 1
|
|
fi
|
|
done
|
|
|
|
assertServersConform
|
|
return $?
|
|
}
|
|
|
|
# Return linux true (0) if the current step of the rekey procedure
|
|
# is to send the initialize request to /sys/rekey/init
|
|
#
|
|
# Initialize is the first step
|
|
#
|
|
# Will not begin initialization if there are stale cluster-rekey or
|
|
# cluster-key-bk secrets
|
|
function needsInitialization {
|
|
local progress
|
|
local count
|
|
local error=0
|
|
|
|
assertRekeyStarted --not
|
|
progress=$?
|
|
if [ "$progress" -ne 0 ]; then
|
|
return "$progress"
|
|
fi
|
|
|
|
# skip if this represents a recovery path
|
|
secretsExistAny cluster-rekey-verified \
|
|
cluster-rekey-shuffle \
|
|
cluster-rekey-audit
|
|
if [ $? -eq 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
# make assertions about the artifacts left behind by previous
|
|
# rekey procedure attempts
|
|
# assert that there are no stale keys before starting rekey
|
|
assertShardSecrets cluster-rekey --nokeys
|
|
count=$?
|
|
if [ "$count" -ne 0 ]; then
|
|
log $ERROR "Stale cluster-rekey secrets ($count) present"
|
|
# there was a possibility that vault had cancelled the rekey
|
|
# due to active server failure, so fall through to
|
|
# rekeyRecovery
|
|
return 1
|
|
fi
|
|
|
|
assertShardSecrets cluster-key-bk --nokeys
|
|
count=$?
|
|
if [ "$count" -ne 0 ]; then
|
|
log $ERROR "cluster-key-bk secrets ($count) present"
|
|
return 2
|
|
fi
|
|
|
|
return 0
|
|
}
|
|
|
|
# Start the rekey procedure
|
|
#
|
|
# Send the initialize request to /sys/rekey/init
|
|
#
|
|
# Initialize is the first step
|
|
#
|
|
# Will not begin initialization if there are stale cluster-rekey or
|
|
# cluster-key-bk secrets
|
|
function rekeyInitialize {
|
|
local shares
|
|
local threshold
|
|
local verify
|
|
local data
|
|
local response
|
|
local value
|
|
|
|
log $INFO "Initializing vault rekey"
|
|
|
|
REKEY_STARTED=0
|
|
|
|
shares='"secret_shares": '$KEY_SECRET_SHARES
|
|
threshold='"secret_threshold": '$KEY_REQUIRED_THRESHOLD
|
|
verify='"require_verification": true'
|
|
data="{$shares,$threshold,$verify}"
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response POST $ACTIVE_TARGET /sys/rekey/init "$data"
|
|
if [ $? -ne 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
value="$( echo "$response" | jq -r ".started" )"
|
|
if [ 'false' == "$value" ]; then
|
|
log $ERROR "Rekey not started"
|
|
return 1
|
|
fi
|
|
|
|
# log the nonce
|
|
value="$( echo "$response" | jq -r ".nonce" )"
|
|
verify="$( echo "$response" | jq -r ".verification_required" )"
|
|
log $INFO "Rekey started: $value" \
|
|
"(verification_required==$verify)"
|
|
|
|
# just a sanity check
|
|
if [ 'true' != "$verify" ]; then
|
|
log $ERROR "Rekey started without verification_required:" \
|
|
"aborting"
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
|
|
return 1
|
|
fi
|
|
|
|
assertServerStatus "$response"
|
|
return $?
|
|
}
|
|
|
|
# The rekey authentication should happen when
# - there is a rekey in progress
# - there is no verification_nonce yet
#
# Authentication of the rekey request is the second step
#
# Omit rekey authentication if:
# - there are existing cluster-rekey secrets
# - Verification is complete: cluster-rekey-verified or any later
# stage is complete
#
# Return linux true (0) if the current stage of rekey
# is to authenticate the rekey request
|
|
function needsAuthentication {
|
|
local progress
|
|
|
|
assertRekeyStarted
|
|
progress=$?
|
|
if [ "$progress" -ne 0 ]; then
|
|
return "$progress"
|
|
fi
|
|
|
|
progress="$( echo "$REKEY_STATUS_JSON" \
|
|
| jq -r '.verification_nonce' )"
|
|
if ! [ -z "$progress" -o "$progress" == "null" ]; then
|
|
# There is a rekey in progress with a verification nonce
|
|
# pass through to recovery
|
|
return 1
|
|
fi
|
|
|
|
# this represents a recovery path
|
|
assertShardSecrets cluster-rekey --nokeys
|
|
if [ $? -ne 0 ]; then
|
|
# There are already cluster-rekey secrets
|
|
return 1
|
|
fi
|
|
|
|
# skip if this represents a recovery path
|
|
secretsExistAny cluster-rekey-verified \
|
|
cluster-rekey-shuffle \
|
|
cluster-rekey-audit
|
|
if [ $? -eq 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
return 0
|
|
}
|
|
|
|
# Submits a keyshard for the rekey procedure
|
|
# Returns 0 on success
|
|
# Returns 1 on failure
|
|
# Returns KEY_SECRET_SHARES when authentication completes
|
|
function rekeySubmitShard {
|
|
local nonce="$1"
|
|
local index="$2"
|
|
local verifyauth="$3"
|
|
local prefix="$4"
|
|
local shard
|
|
local dnonce
|
|
local key
|
|
local data
|
|
local response
|
|
local progress
|
|
local root_token
|
|
local new_doc
|
|
|
|
if [ -z "$prefix" ]; then
|
|
prefix=cluster-key
|
|
fi
|
|
|
|
shard="$( get_secret "${prefix}-$index" | jq -r .keys[0] )"
|
|
dnonce='"nonce": "'$nonce'"'
|
|
key='"key": "'$shard'"'
|
|
data="{$dnonce,$key}"
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response POST $ACTIVE_TARGET /sys/rekey/update "$data"
|
|
if [ $? -ne 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
# Check the response for verification_nonce, which
|
|
# indicates completion
|
|
progress="$( echo "$response" | jq -r '.verification_nonce' )"
|
|
if [ -n "$progress" -a "$progress" != 'null' ]; then
|
|
log $INFO "Success authenticating:" \
|
|
"$((index+1)) of $KEY_REQUIRED_THRESHOLD"
|
|
|
|
if [ "$verifyauth" == "--verify-auth" ]; then
|
|
# delete the rekey and return success
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
|
|
return "$KEY_SECRET_SHARES"
|
|
fi
|
|
|
|
# Procedure to ensure that the old and new shards are
|
|
# secured in k8s secrets. Deletion of old shards will only
|
|
# occur when verification is successful.
|
|
root_token="$( get_secret cluster-key-root )"
|
|
new_doc="$( echo "$response" \
|
|
| jq -c '{"keys": .keys,
|
|
"keys_base64": .keys_base64,
|
|
"root_token": "'"$root_token"'"}' )"
|
|
# store the new shards
|
|
echo "$response" \
|
|
| jq -c '{"keys": .keys, "keys_base64": .keys_base64}' \
|
|
| storeVaultInitSecrets cluster-rekey
|
|
|
|
# check that the secrets match vault's rekey response
|
|
echo "$new_doc" | validateSecrets cluster-rekey
|
|
if [ $? -ne 0 ]; then
|
|
# calling function will abort the rekey
|
|
# and any cluster-rekey secrets
|
|
log $ERROR "Failed to store and verify shards" \
|
|
"after rekey authentication complete"
|
|
return 1
|
|
fi
|
|
|
|
# authentication of the rekey request is completed
|
|
# successfully
|
|
log $INFO "Rekey authentication successful"
|
|
return "$KEY_SECRET_SHARES"
|
|
fi
|
|
|
|
# Otherwise verify the response
|
|
progress="$( echo "$response" | jq -r '.progress' )"
|
|
index="$((index+1))"
|
|
if [ "$progress" -ne "$index" ]; then
|
|
log $ERROR "Authentication sequence mismatching" \
|
|
"($progress, $index)"
|
|
return 1
|
|
fi
|
|
|
|
# assert that the servers agree
|
|
assertServerStatus "$response"
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Vault server rekey status fails during" \
|
|
"authentication at $index of $KEY_REQUIRED_THRESHOLD"
|
|
return 1
|
|
fi
|
|
|
|
log $INFO "Success authenticating:" \
|
|
"$index of $KEY_REQUIRED_THRESHOLD"
|
|
return 0
|
|
}
|
|
|
|
# Return linux true (0) if the current step of the rekey procedure
|
|
# is to authenticate the request
|
|
#
|
|
# Authentication of the rekey request is the second step
|
|
#
|
|
function rekeyAuthenticate {
|
|
local verifyauth="$1"
|
|
local prefix="$2"
|
|
local response
|
|
local index
|
|
local value
|
|
local nonce
|
|
local progress
|
|
local result
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI response GET $ACTIVE_TARGET /sys/rekey/init
|
|
if [ $? -ne 0 ]; then
|
|
# an error is already printed
|
|
return 1
|
|
fi
|
|
|
|
value="$( echo "$response" | jq -r '.started' )"
|
|
if [ 'true' != "$value" ]; then
|
|
log $ERROR "Rekey authentication, but rekey not in progress"
|
|
return 1
|
|
fi
|
|
|
|
nonce="$( echo "$response" | jq -r '.nonce' )"
|
|
progress="$( echo "$response" | jq -r '.progress' )"
|
|
if ! [[ "$progress" =~ ^[0-9]{1,}$ ]]; then
|
|
log $ERROR "Rekey authentication progress not integer:" \
|
|
"$response"
|
|
return 1
|
|
elif [ "$progress" -ge "$KEY_SECRET_SHARES" ]; then
|
|
log $ERROR "Rekey authentication progress out of range:" \
|
|
"$response"
|
|
return 1
|
|
fi
|
|
|
|
if [ "$progress" -ne 0 ]; then
|
|
log $WARNING "Continue authenticating rekey at: $progress"
|
|
fi
|
|
|
|
# authenticate and store the new keys
|
|
for index in $( seq $progress $((KEY_SECRET_SHARES-1)) ); do
|
|
rekeySubmitShard "$nonce" "$index" $verifyauth $prefix
|
|
result="$?"
|
|
if [ "$result" -eq "$KEY_SECRET_SHARES" ]; then
|
|
# start the verify procedure now
|
|
if [ "$verifyauth" != "--verify-auth" ]; then
|
|
log $INFO "Starting rekey verify"
|
|
fi
|
|
break
|
|
elif [ "$result" -ne 0 ]; then
|
|
return $result
|
|
fi
|
|
done
|
|
return 0
|
|
}
|
|
|
|
# The rekey verification should happen when
|
|
# - there is a rekey in progress
|
|
# - there is a verification_nonce
|
|
#
|
|
# Omit rekey verification if:
|
|
# - there are existing cluster-rekey secrets
|
|
# - Verification is complete: cluster-rekey-verified or any later
|
|
# stage is complete
|
|
#
|
|
# Return linux true (0) if the current stage of rekey
|
|
# is to complete the rekey verification
|
|
function needsVerify {
|
|
local progress
|
|
|
|
assertRekeyStarted
|
|
progress=$?
|
|
if [ "$progress" -ne 0 ]; then
|
|
return "$progress"
|
|
fi
|
|
|
|
progress="$( echo "$REKEY_STATUS_JSON" \
|
|
| jq -r '.verification_nonce' )"
|
|
if [ -z "$progress" -o "$progress" == "null" ]; then
|
|
# There is a rekey in progress, but not with a
|
|
# verification nonce
|
|
return 1
|
|
fi
|
|
|
|
# Assert that the nonce is UUID-ish
|
|
if ! [[ "$progress" =~ ^[a-f0-9-]{36}$ ]]; then
|
|
log $ERROR "The verification_nonce is not UUID-ish:" \
|
|
"$REKEY_STATUS_JSON"
|
|
return 2
|
|
fi
|
|
|
|
assertShardSecrets cluster-rekey
|
|
if [ $? -ne 0 ]; then
|
|
# this should not happen: verify in progress but no
|
|
# cluster-rekey secrets
|
|
log $ERROR "rekey verify in progress but no cluster-rekey"
|
|
return 1
|
|
fi
|
|
|
|
# skip if this represents a recovery path
|
|
secretsExistAny cluster-rekey-verified \
|
|
cluster-rekey-shuffle \
|
|
cluster-rekey-audit
|
|
if [ $? -eq 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
return 0
|
|
}
|
|
|
|
# Submits a keyshard for the rekey verification procedure
|
|
# Returns 0 on success
|
|
# Returns 1 on failure
|
|
# Returns KEY_REQUIRED_THRESHOLD when verification completes
|
|
function rekeyVerifySubmitShard {
|
|
local nonce="$1"
|
|
local index="$2"
|
|
local shard
|
|
local dnonce
|
|
local key
|
|
local data
|
|
local response
|
|
local progress
|
|
|
|
shard="$( get_secret cluster-rekey-$index \
|
|
| jq -r .keys[0] )"
|
|
dnonce='"nonce": "'$nonce'"'
|
|
key='"key": "'$shard'"'
|
|
data="{$dnonce,$key}"
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response POST $ACTIVE_TARGET \
|
|
/sys/rekey/verify "$data"
|
|
if [ $? -ne 0 ]; then
|
|
# an error is printed
|
|
return 1
|
|
fi
|
|
|
|
progress="$( echo "$response" | jq -r ".complete" )"
|
|
if [ "$progress" == 'true' ]; then
|
|
log $INFO "Success verifying: using new shards"
|
|
set_secret cluster-rekey-verified /dev/stdin \
|
|
<<<"$( get_secret cluster-rekey-request )"
|
|
return $KEY_REQUIRED_THRESHOLD
|
|
fi
|
|
progress="$( echo "$response" | jq -r ".progress" )"
|
|
if [ -z "$progress" -o "$progress" == "null" ]; then
|
|
log $ERROR "Expecting rekey verify progress" \
|
|
"[$((index+1))] instead of [$progress]"
|
|
return 1
|
|
fi
|
|
# Print the progress of rekey verify.
|
|
if [ "$((index+1))" -eq "$progress" ]; then
|
|
log $INFO "Success verifying:" \
|
|
"$progress of $KEY_REQUIRED_THRESHOLD"
|
|
elif [ "$((index+1))" -gt "$progress" ]; then
|
|
# A sanity check only
|
|
log $WARNING "Verify progress [$progress] less" \
|
|
"than expected [$((index+1))]"
|
|
else
|
|
# A sanity check only
|
|
log $WARNING "Verify progress [$progress]" \
|
|
"greater than expected [$((index+1))]"
|
|
fi
|
|
assertVerifyStatus "$response"
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Vault server verify status fails during" \
|
|
"authentication at" \
|
|
"$index of $KEY_REQUIRED_THRESHOLD"
|
|
return 1
|
|
fi
|
|
}
|
|
|
|
# Return linux true (0) if the current step of the rekey procedure
|
|
# is to verify shard secrets
|
|
#
|
|
# This step confirms that vault manager has correctly stored the
|
|
# shards received from the vault server. This allows failures of
|
|
# the procedure to be recovered:
|
|
# - receive the shards from vault
|
|
# - store the shards in k8s secrets
|
|
# - play the shards back to vault
|
|
# - upon successful verification the new shards are effective
|
|
#
|
|
# Verification of the rekey request is the third step
|
|
#
|
|
function rekeyVerify {
|
|
local value
|
|
local nonce
|
|
local progress
|
|
local response
|
|
local shard
|
|
local dnonce
|
|
local key
|
|
local data
|
|
local index
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI response GET $ACTIVE_TARGET /sys/rekey/verify
|
|
if [ $? -ne 0 ]; then
|
|
# an error is already printed
|
|
return 1
|
|
fi
|
|
|
|
value="$( echo "$response" | jq -r '.started' )"
|
|
if [ 'true' != "$value" ]; then
|
|
log $ERROR "Rekey verify, but rekey not in progress"
|
|
return 1
|
|
fi
|
|
|
|
nonce="$( echo "$response" | jq -r '.nonce' )"
|
|
progress="$( echo "$response" | jq -r '.progress' )"
|
|
if ! [[ "$progress" =~ ^[0-9]{1,}$ ]]; then
|
|
log $ERROR "Rekey authentication progress not integer:" \
|
|
"$response"
|
|
return 1
|
|
elif [ "$progress" -ge "$KEY_SECRET_SHARES" ]; then
|
|
log $ERROR "Rekey authentication progress out of range:" \
|
|
"$response"
|
|
return 1
|
|
fi
|
|
if [ "$progress" -ne 0 ]; then
|
|
log $WARNING "Continue verifying rekey at: $progress"
|
|
fi
|
|
|
|
# assert that the servers agree on verify status
|
|
assertVerifyStatus "$response"
|
|
if [ $? -ne 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
# authenticate the verify procedure
|
|
for index in $( seq $progress $((KEY_SECRET_SHARES-1)) ); do
|
|
rekeyVerifySubmitShard "$nonce" "$index"
|
|
result=$?
|
|
if [ "$result" -eq "$KEY_REQUIRED_THRESHOLD" ]; then
|
|
# rekeyVerifySubmitShard returns KEY_REQUIRED_THRESHOLD
|
|
# when .complete == true was received
|
|
return 0
|
|
elif [ "$result" -ne 0 ]; then
|
|
# any other non-zero result is a failure
|
|
return 1
|
|
fi
|
|
done
|
|
|
|
log $ERROR "Verify procedure ended without completion"
|
|
return 1
|
|
}
|
|
|
|
# The shuffling of key shards in k8s secrets should happen when
# the cluster-rekey-verified procedure step is completed.
|
|
#
|
|
# Omit shuffling if:
|
|
# - vault server reports rekey in progress (unclear status)
|
|
# - shuffling is already complete: cluster-rekey-shuffle or later
|
|
# stage is complete
|
|
# - there are no cluster-rekey secrets
|
|
# - there are cluster-key-bk secrets
|
|
#
|
|
# Return linux true (0) if the current stage of rekey
|
|
# is to complete the swapping of validated shards
|
|
function needsShuffle {
|
|
local progress
|
|
|
|
# assert that a rekey is not in progress
|
|
assertRekeyStarted --not
|
|
progress=$?
|
|
if [ "$progress" -ne 0 ]; then
|
|
# 1 - maintain the status of rekey in progress
|
|
# 2 - api error, try again later
|
|
return "$progress"
|
|
fi
|
|
|
|
secretExists cluster-rekey-verified >/dev/null
|
|
if [ $? -ne 0 ]; then
|
|
# proceeds to next procedure step
|
|
return 1
|
|
fi
|
|
|
|
# skip if this represents a recovery path
|
|
secretsExistAny cluster-rekey-shuffle \
|
|
cluster-rekey-audit
|
|
if [ $? -eq 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
assertShardSecrets cluster-rekey
|
|
case $? in
|
|
0)
|
|
# There is no rekey in progress, and there is a set
|
|
# of cluster-rekey shards recorded
|
|
;;
|
|
$KEY_SECRET_SHARES)
|
|
# There is no rekey in progress, and there are no
|
|
# cluster-rekey shards recorded
|
|
return 1
|
|
;;
|
|
*)
|
|
# with cluster-rekey-verified, an incomplete set of
|
|
# cluster-rekey indicates partial deletion after copying
|
|
# to cluster-key
|
|
# will want to audit the cluster-key secrets before
|
|
# deleting cluster-rekey
|
|
log $WARNING "The number key shard secrets for" \
|
|
"cluster-rekey is not complete"
|
|
return 1
|
|
;;
|
|
esac
|
|
|
|
# otherwise allow rekeyShuffleKeys to be re-entrant with respect
# to the existence or absence of cluster-key and cluster-key-bk;
# cluster-rekey is only deleted when confirmed to be copied to
# cluster-key
|
|
return 0
|
|
}
|
|
|
|
# This procedure shuffles the shard secrets from cluster-rekey to
# cluster-key to cluster-key-bk
|
|
#
|
|
# The function is intended to resolve failures of the vault manager
# process when it is interrupted abruptly, such as with kill -9.
# In combination with needsShuffle it can be re-run until it
# completes the shuffle:
# - cluster-key shards are copied to cluster-key-bk
# - cluster-key shards are deleted
# - cluster-rekey is copied to cluster-key
# - cluster-rekey is deleted
|
|
#
|
|
# A subsequent step audits the new keys before deleting the
|
|
# cluster-key-bk secrets
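# Illustrative end state of a successful shuffle, assuming 5 shares:
#   cluster-key-0..4     - the new shards (copied from cluster-rekey)
#   cluster-key-bk-0..4  - the previous shards, kept until audited
#   cluster-rekey-*      - deleted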
|
|
function rekeyShuffleKeys {
|
|
local key_exists
|
|
local rekey_exists
|
|
local bk_exists
|
|
local key_doc=""
|
|
local rekey_doc=""
|
|
|
|
assertShardSecrets cluster-key
|
|
key_exists=$?
|
|
assertShardSecrets cluster-rekey
|
|
rekey_exists=$?
|
|
assertShardSecrets cluster-key-bk
|
|
bk_exists=$?
|
|
|
|
if [ "$key_exists" -eq 0 ]; then
|
|
key_doc="$( reconstructInitResponse cluster-key )"
|
|
echo "$key_doc" | validateSecrets cluster-key
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Failed to read cluster-key"
|
|
return 1
|
|
fi
|
|
fi
|
|
|
|
if [ "$rekey_exists" -eq 0 ]; then
|
|
rekey_doc="$( reconstructInitResponse cluster-rekey )"
|
|
echo "$rekey_doc" | validateSecrets cluster-rekey
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Failed to read cluster-rekey"
|
|
return 1
|
|
fi
|
|
else
|
|
# this is recovery path
|
|
if [ -n "key_doc" ]; then
|
|
log $WARNING "Progress cluster-rekey-shuffle without" \
|
|
"cluster-rekey"
|
|
set_secret cluster-rekey-shuffle /dev/stdin \
|
|
<<<"$( get_secret cluster-rekey-request )"
|
|
return
|
|
fi
|
|
log $ERROR "No cluster-key or cluster-rekey"
|
|
return 1
|
|
fi
|
|
|
|
if [ "$bk_exists" -lt "$KEY_SECRET_SHARES" \
|
|
-a "$bk_exists" -ne 0 ]; then
|
|
# this is a recovery path
|
|
# an incomplete copy of cluster-key secrets
|
|
if [ -n "$key_doc" ]; then
|
|
deleteShardSecrets cluster-key-bk
|
|
assertShardSecrets cluster-key-bk
|
|
bk_exists=$?
|
|
if [ "$bk_exists" -lt "$KEY_SECRET_SHARES" ]; then
|
|
log $ERROR "Failed to delete incomplete" \
|
|
"cluster-key-bk"
|
|
return 1
|
|
fi
|
|
else
|
|
# this shouldn't happen;
# both failing at once is not anticipated
|
|
log $ERROR "Sanity: incomplete both cluster-key-bk" \
|
|
"and missing/incomplete cluster-key secrets"
|
|
return 1
|
|
fi
|
|
fi
|
|
if [ "$bk_exists" -eq 0 ]; then
|
|
# this is a recovery path
|
|
if [ -n "$key_doc" ]; then
|
|
# Assert that cluster-key and cluster-key-bk are the
|
|
# same
|
|
log $INFO "Recovering from pre-existing cluster-key-bk"
|
|
echo "$key_doc" | validateSecrets cluster-key-bk
|
|
if [ $? -eq 0 ]; then
|
|
# cluster-key-bk == cluster-key
|
|
deleteShardSecrets cluster-key
|
|
assertShardSecrets cluster-key
|
|
key_exists=$?
|
|
key_doc=""
|
|
else
|
|
echo "$key_doc" | validateSecrets cluster-rekey
|
|
if [ $? -eq 0 ]; then
|
|
# Recovering cluster-key == cluster-rekey
|
|
log $INFO "Recovering with cluster-key"
|
|
deleteShardSecrets cluster-rekey
|
|
set_secret cluster-rekey-shuffle /dev/stdin \
|
|
<<<"$( get_secret cluster-rekey-request )"
|
|
return 0
|
|
else
|
|
log $ERROR "Three different sets of keys" \
|
|
"in k8s secrets"
|
|
return 1
|
|
fi
|
|
fi
|
|
fi
|
|
# else: there is no cluster-key to backup
|
|
else
|
|
# this is the normal procedure path
|
|
log $INFO "Copying cluster-key secrets to cluster-key-bk"
|
|
copyShardSecrets cluster-key cluster-key-bk
|
|
echo "$key_doc" | validateSecrets cluster-key-bk
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Failed to copy cluster-key to cluster-key-bk"
|
|
deleteShardSecrets cluster-key-bk
|
|
return 1
|
|
fi
|
|
deleteShardSecrets cluster-key
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Failed to delete cluster-key secrets"
|
|
return 1
|
|
fi
|
|
assertShardSecrets cluster-key
|
|
key_exists=$?
|
|
key_doc=""
|
|
fi
|
|
|
|
# cluster-key-bk exists here
|
|
# cluster-rekey rekey_doc is valid here
|
|
|
|
# if cluster-key exists, such as number of secrets less than
|
|
# KEY_SECRET_SHARES, then delete them; deleteShardSecrets is a
|
|
# no-op if there are none there
|
|
deleteShardSecrets cluster-key
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Failed to delete cluster-key"
|
|
return 1
|
|
# try again later
|
|
fi
|
|
|
|
log $INFO "Copying cluster-rekey secrets to cluster-key"
|
|
copyShardSecrets cluster-rekey cluster-key
|
|
echo "$rekey_doc" | validateSecrets cluster-key
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Failed to copy cluster-rekey to cluster-key"
|
|
return 1
|
|
fi
|
|
|
|
deleteShardSecrets cluster-rekey
|
|
set_secret cluster-rekey-shuffle /dev/stdin \
|
|
<<<"$( get_secret cluster-rekey-request )"
|
|
|
|
return 0
|
|
}
|
|
|
|
# The audit of cluster-key should happen when these other procedure
|
|
# steps are completed:
|
|
# - cluster-rekey-verified
|
|
# - cluster-rekey-shuffle
|
|
#
|
|
# Omit audit if:
|
|
# - vault server reports rekey in progress (failed previous audit?)
|
|
# - audit is already complete: cluster-rekey-audit exists
|
|
#
|
|
# Return linux true (0) if the current stage of rekey
|
|
# is to run the audit
|
|
function needsAudit {
|
|
local progress
|
|
|
|
# assert that a rekey is not in progress
|
|
assertRekeyStarted --not
|
|
progress=$?
|
|
if [ "$progress" -ne 0 ]; then
|
|
return "$progress"
|
|
fi
|
|
|
|
# Select recovery path with response '3'
|
|
secretExists cluster-rekey-audit >/dev/null
|
|
if [ $? -eq 0 ]; then
|
|
# this path indicates a failure to complete
|
|
# finalizeRekey. cluster-rekey-audit is the last
|
|
# milestone to be deleted
|
|
log $INFO "rekey audit already completed"
|
|
return 3
|
|
fi
|
|
|
|
secretExists cluster-rekey-request >/dev/null
|
|
if [ $? -ne 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
secretExists cluster-rekey-verified >/dev/null
|
|
if [ $? -ne 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
secretExists cluster-rekey-shuffle >/dev/null
|
|
if [ $? -ne 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
assertShardSecrets cluster-key
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "rekey audit requested but cluster-keys absent"
|
|
return 1
|
|
fi
|
|
}
|
|
|
|
# Audit that the active vault server authenticates with the cluster
|
|
# keys specified by prefix
|
|
#
|
|
# Returns 0 on success
|
|
# Returns 1 if the audit fails
|
|
# Returns 2 if there was a failure unrelated to authentication
|
|
function rekeyAudit {
|
|
local prefix="$1"
|
|
local value
|
|
local response
|
|
|
|
if [ -z "$prefix" ]; then
|
|
prefix="cluster-key"
|
|
fi
|
|
|
|
log $INFO "Auditing the shards in $prefix secrets"
|
|
assertNoRekey
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Cannot audit with rekey in progress"
|
|
return 2
|
|
fi
|
|
|
|
assertShardSecrets "$prefix"
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Audit fails with absent $prefix secrets"
|
|
return 1
|
|
fi
|
|
|
|
rekeyInitialize
|
|
if [ $? -ne 0 ]; then
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
|
|
return 2
|
|
fi
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI response GET $ACTIVE_TARGET /sys/rekey/init
|
|
if [ $? -ne 0 ]; then
|
|
# There's no reason to believe this one will succeed where
|
|
# the other hadn't
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
|
|
return 2
|
|
fi
|
|
|
|
value="$( echo "$response" | jq -r ".verification_required" )"
|
|
if [ "$value" != "true" ]; then
|
|
log $ERROR "Audit sanity: verification_required not set:" \
|
|
"$response"
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
|
|
return 1
|
|
fi
|
|
|
|
rekeyAuthenticate --verify-auth "$prefix"
|
|
result="$?"
|
|
if [ "$result" -eq 0 ]; then
|
|
log $INFO "Audit of cluster-key secrets passes"
|
|
else
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
|
|
fi
|
|
|
|
return $result
|
|
}
|
|
|
|
# clean up the artifacts from rekey procedure
|
|
# The audit procedure proves the shards in cluster-key
|
|
# secrets will unseal the vault.
|
|
#
|
|
# If vault-manager is killed during this procedure step it should
|
|
# continue to try to delete the artifacts until finally deleting
|
|
# cluster-rekey-audit
|
|
function finalizeRekey {
|
|
local secrettext
|
|
secrettext="$( get_secret cluster-rekey-audit )"
|
|
|
|
log $INFO "removing artifacts of the rekey procedure:" \
|
|
"$secrettext"
|
|
assertShardSecrets cluster-rekey --nokeys
|
|
if [ $? -ne 0 ]; then
|
|
log $WARNING "removing cluster-rekey secrets" \
|
|
"after audit"
|
|
deleteShardSecrets cluster-rekey
|
|
fi
|
|
deleteShardSecrets cluster-key-bk
|
|
deleteSecrets cluster-rekey-verified
|
|
deleteSecrets cluster-rekey-shuffle
|
|
deleteSecrets cluster-rekey-request
|
|
deleteSecrets cluster-rekey-audit
|
|
|
|
log $INFO "Rekey request complete: $secrettext"
|
|
}
|
|
|
|
# This procedure handles a few cases where the active vault server
# or vault-manager was killed:
#
# - rekey authentication completed, but vault-manager was killed
# before the shards could be stored
# - rekey verification may be cancelled by the failure of the active
# vault server
|
|
#
|
|
function rekeyRecovery {
|
|
local key_exists
|
|
local rekey_exists
|
|
local bk_exists
|
|
local verified_exists
|
|
local shuffle_exists
|
|
local audit_exists
|
|
local inprogress
|
|
local verifyprogress
|
|
|
|
log $INFO "Recovering the rekey procedure"
|
|
|
|
# assert that the vault servers are all up and agree
|
|
# about the rekey status
|
|
allServersRunning \
|
|
&& allServersHaveIP \
|
|
&& allServersUnsealed \
|
|
|| return 1
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI REKEY_STATUS_JSON GET $ACTIVE_TARGET /sys/rekey/init
|
|
if [ $? -ne 0 ]; then
|
|
# an error is printed
|
|
# wait for recovery
|
|
REKEY_STATUS_JSON=''
|
|
return 1
|
|
fi
|
|
assertServerStatus "$REKEY_STATUS_JSON"
|
|
if [ $? -ne 0 ]; then
|
|
# wait for the vault servers to sync
|
|
return 1
|
|
fi
|
|
|
|
inprogress="$( echo "$REKEY_STATUS_JSON" | jq -r '.started' )"
|
|
verifyprogress="$( echo "$REKEY_STATUS_JSON" \
|
|
| jq -r '.verification_nonce' )"
|
|
if [ "$inprogress" == "true" ]; then
|
|
# If a rekey is in progress, then cancel it
|
|
# - an authentication will reinitialize
|
|
# - a verification will reinitialize
|
|
# - a rekeyAudit will retry
|
|
log $INFO "Cancelling rekey in progress"
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_OP_TMOUT \
|
|
vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
|
|
if [ $? -ne 0 ]; then
|
|
# retry later
|
|
return 1
|
|
fi
|
|
fi
|
|
|
|
assertShardSecrets cluster-key
|
|
key_exists=$?
|
|
assertShardSecrets cluster-rekey
|
|
rekey_exists=$?
|
|
assertShardSecrets cluster-key-bk
|
|
bk_exists=$?
|
|
|
|
secretExists cluster-rekey-verified >/dev/null
|
|
verified_exists=$?
|
|
secretExists cluster-rekey-shuffle >/dev/null
|
|
shuffle_exists=$?
|
|
secretExists cluster-rekey-audit >/dev/null
|
|
audit_exists=$?
|
|
|
|
# review each of the milestones to discern the failure point
|
|
if [ "$audit_exists" -eq 0 ]; then
|
|
true
|
|
# no recovery options here
|
|
# pass through
|
|
elif [ "$shuffle_exists" -eq 0 ]; then
|
|
true
|
|
# no recovery options here
|
|
# pass through
|
|
elif [ "$verified_exists" -eq 0 ]; then
|
|
if [ "$rekey_exists" -gt 0 ]; then
|
|
if [ "$rekey_exists" -lt "$KEY_SECRET_SHARES" ]; then
|
|
# with verified_exists, indicates partial deletion
|
|
# of the cluster-rekey secrets after copying to
|
|
# cluster-key. Audit the cluster-key secrets before
|
|
# deleting rekey
|
|
rekeyAudit cluster-key
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "Audit cluster-key fails with a" \
|
|
"partial set of cluster-rekey"
|
|
return 1
|
|
fi
|
|
|
|
deleteShardSecrets cluster-rekey
|
|
fi
|
|
|
|
# Handle condition where secrets were shuffled but
|
|
# vault-manager failed before recording the
|
|
# milestone cluster-rekey-shuffle
|
|
|
|
# rekeyAudit will double-check that cluster-key is
|
|
# in use
|
|
set_secret cluster-rekey-shuffle /dev/stdin \
|
|
<<<"$( get_secret cluster-rekey-request )"
|
|
log $INFO "Continuing rekey procedure with audit" \
|
|
"of cluster-key"
|
|
return 0
|
|
fi
|
|
# else: pass through
|
|
else
|
|
if [ "$rekey_exists" -eq 0 ]; then
|
|
# Handle condition where an active server fails during
|
|
# verification: vault may have cancelled the rekey procedure
|
|
|
|
# The question is: which shards are the vault servers
|
|
# using?
|
|
log $INFO "Recovering from mismatch of cluster-rekey" \
|
|
"and verified status"
|
|
|
|
# Audit the existing shards to see which ones the
|
|
# vault servers are keyed for.
|
|
# Most likely that the verification failed due to
|
|
# active server failing, start with cluster-key
|
|
rekeyAudit cluster-key
|
|
if [ $? -eq 0 ]; then
|
|
# The rekey verification did not complete
|
|
# remove cluster-rekey secrets
|
|
# The rekey procedure should restart
|
|
deleteShardSecrets cluster-rekey
|
|
log $INFO "Restart rekey procedure"
|
|
return 0
|
|
fi
|
|
|
|
# this happens when vault-manager process is killed
|
|
rekeyAudit cluster-rekey
|
|
if [ $? -eq 0 ]; then
|
|
set_secret cluster-rekey-verified /dev/stdin \
<<<"$( get_secret cluster-rekey-request )"
|
|
log $INFO "Continue rekey procedure with cluster-rekey"
|
|
return 0
|
|
fi
|
|
# else: pass through
|
|
elif [ "$rekey_exists" -eq 5 ]; then
|
|
# There are no cluster-rekey secrets; and the rekey is
|
|
# cancelled: the rekey procedure will restart
|
|
log $INFO "Continue rekey procedure with initialization"
|
|
return 0
|
|
else # cluster-rekey secrets are incomplete
|
|
# Handle condition where verification is needed but
|
|
# vault-manager did not store shards. The rekey was
|
|
# canceled above
|
|
|
|
# assert cluster-key before deleting rekey
|
|
rekeyAudit cluster-key
|
|
if [ $? -eq 0 ]; then
|
|
# the rekey procedure will restart
|
|
log $INFO "Deleting partial set of" \
|
|
"cluster-rekey secrets"
|
|
deleteShardSecrets cluster-rekey
|
|
return 0
|
|
fi
|
|
# else: pass through
|
|
fi
|
|
fi
|
|
|
|
log $ERROR "Did not recover from current rekey status"
|
|
}
|
|
|
|
# The state machine for rekeying the vault server
|
|
#
|
|
# The overall procedure for rekey request includes:
|
|
# - wait for stability of vault servers
|
|
# - initialize the procedure
|
|
# - authenticate the rekey procedure by supplying shards
|
|
# - store the new shards
|
|
# - verify the rekey with the new shards read from k8s secrets
|
|
# - rotate the shard secrets:
|
|
# cluster-rekey - cluster-key - cluster-key-bk
|
|
# - Audit the new shards with active vault server
|
|
# - Remove artifacts of rekey procedure:
|
|
# cluster-key-bk, milestone secrets
|
|
#
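# Milestone secrets, in the order they are normally created
# (a summary of the steps above):
#   cluster-rekey-request -> cluster-rekey-verified
#     -> cluster-rekey-shuffle -> cluster-rekey-audit
# cluster-rekey-request starts the procedure and cluster-rekey-audit
# is the last one removed by finalizeRekey
#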
|
|
function vaultRekey {
|
|
local records
|
|
local count
|
|
local result
|
|
local secrettext
|
|
|
|
if ! needsRekey; then
|
|
return
|
|
fi
|
|
|
|
# Retrieve and record the rekey status once for the tests that
|
|
# follow
|
|
NO_HEADER=true \
|
|
API_TMOUT=$API_REKEY_QUERY_TMOUT \
|
|
vaultAPI REKEY_STATUS_JSON GET $ACTIVE_TARGET /sys/rekey/init
|
|
if [ $? -ne 0 ]; then
|
|
# an error is printed
|
|
REKEY_STATUS_JSON=''
|
|
return
|
|
fi
|
|
|
|
needsAudit
|
|
case $? in
|
|
0)
|
|
rekeyResuming
|
|
rekeyAudit
|
|
if [ $? -eq 0 ]; then
|
|
set_secret cluster-rekey-audit /dev/stdin \
|
|
<<<$( get_secret cluster-rekey-request )
|
|
|
|
finalizeRekey
|
|
fi
|
|
return
|
|
;;
|
|
1) # continue to procedure step
|
|
;;
|
|
3) # audit is already completed
|
|
secretExists cluster-rekey-audit >/dev/null
|
|
if [ $? -eq 0 ]; then
|
|
# the cluster-key secrets were audited, but vault
# manager didn't get a chance to finish
# finalizeRekey; resume the cleanup
|
|
finalizeRekey
|
|
return
|
|
fi
|
|
log $ERROR "Discrepancy between needsAudit and" \
|
|
"rekeyVault"
|
|
return
|
|
;;
|
|
*)
|
|
# an error occurs for which the procedure should not
|
|
# continue
|
|
return
|
|
;;
|
|
esac
|
|
|
|
needsShuffle
|
|
case $? in
|
|
0)
|
|
rekeyResuming
|
|
rekeyShuffleKeys
|
|
return
|
|
;;
|
|
1) # continue to procedure step
|
|
;;
|
|
*)
|
|
# an error occurs for which the procedure should not
|
|
# continue
|
|
return
|
|
;;
|
|
esac
|
|
|
|
needsVerify
|
|
case $? in
|
|
0)
|
|
rekeyResuming
|
|
rekeyVerify
|
|
return
|
|
;;
|
|
1) # continue to procedure step
|
|
;;
|
|
*)
|
|
# an error occurs for which the procedure should not
|
|
# continue
|
|
return
|
|
;;
|
|
esac
|
|
|
|
needsAuthentication
|
|
case $? in
|
|
0)
|
|
rekeyResuming
|
|
rekeyAuthenticate
|
|
return
|
|
;;
|
|
1) # continue to procedure step
|
|
;;
|
|
*)
|
|
# an error occurs for which the procedure should not
|
|
# continue
|
|
return
|
|
;;
|
|
esac
|
|
|
|
needsInitialization
|
|
case $? in
|
|
0)
|
|
secrettext="$( get_secret cluster-rekey-request )"
|
|
log $INFO "Rekey request started: $secrettext"
|
|
rekeyInitialize
|
|
return
|
|
;;
|
|
1) # continue to failure
|
|
;;
|
|
*)
|
|
# an error occurs for which the procedure should not
|
|
# continue
|
|
return
|
|
;;
|
|
esac
|
|
|
|
# falling through the case statements requires remediation
|
|
rekeyResuming
|
|
rekeyRecovery
|
|
}
|
|
|
|
# Return 0 (true) if either the vault server status shows a rekey
|
|
# is in progress, or if vault-manager is engaged in the process of
|
|
# rekeying the vault
|
|
#
|
|
# Vault manager rekey is in progress if either of these secrets
|
|
# exists:
|
|
# cluster-rekey-request - the first to be created
|
|
# cluster-rekey-audit - the last to be removed
|
|
function rekeyInProgress {
|
|
# query the vault server
|
|
assertNoRekey
|
|
if [ $? -ne 0 ]; then
|
|
return 0
|
|
fi
|
|
|
|
# look for vault-manager's milestone secrets
|
|
secretsExistAny cluster-rekey-request cluster-rekey-audit
|
|
return $?
|
|
}
|
|
|
|
# Check conditions that need to be met before taking a snapshot of
|
|
# the vault. The same conditions apply for snapshot restore.
|
|
#
|
|
# The required conditions are:
|
|
# - vault server pods matches HA_REPLICAS
|
|
# - vault server pods are unsealed
|
|
# - there is no rekey in progress
|
|
#
|
|
# Returns 0 for success, or >0 for conditions not met
|
|
# The fail conditions are logged to stdout/stderr
|
|
function snapshotPreCheck {
|
|
local errors=0
|
|
local pods
|
|
local podcount
|
|
local host
|
|
local dnsname
|
|
local server_status
|
|
local sealed
|
|
|
|
pods="$( getVaultPods | grep "^$VAULT_FN" )"
|
|
podcount="$( echo "$pods" | awk '{print $1}' | wc -w )"
|
|
|
|
if [ "$podcount" -ne "$HA_REPLICAS" ]; then
|
|
log $ERROR "snapshotPreCheck: vault pods ($podcount)" \
|
|
"does not match replicas ($HA_REPLICAS)"
|
|
errors=$(( errors + 1 ))
|
|
fi
|
|
|
|
while read host dnsname; do
|
|
NO_HEADER=true \
|
|
API_TMOUT=$QUERY_TMOUT \
|
|
vaultAPI server_status GET $dnsname.$POD_TARGET_BASE \
|
|
/sys/health
|
|
sealed="$( echo "$server_status" | jq .sealed )"
|
|
if [ "$sealed" != "false" ]; then
|
|
log $ERROR "snapshotPreCheck: $host ($dnsname)" \
|
|
"sealed status is [$sealed]"
|
|
errors=$(( errors + 1 ))
|
|
else
|
|
log $DEBUG "snapshotPreCheck: $host ($dnsname)" \
|
|
"sealed status is [$sealed]"
|
|
fi
|
|
done <<<"$pods"
|
|
|
|
if rekeyInProgress; then
|
|
log $ERROR "snapshotPreCheck: a rekey is in progress"
|
|
errors=$(( errors + 1 ))
|
|
fi
|
|
|
|
return $errors
|
|
}
|
|
|
|
# Take a snapshot of the vault, which is output to stdout
|
|
function snapshotCreate {
|
|
local apipath=/sys/storage/raft/snapshot
|
|
|
|
curl -s -S --cacert "$CERT" \
|
|
--connect-timeout $QUERY_TMOUT \
|
|
--header "X-Vault-Token:$( get_secret cluster-key-root )" \
|
|
--request "GET" \
|
|
"https://$ACTIVE_TARGET:${TARGET_PORT}/v1${apipath}"
|
|
}
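# Example usage (the output path is illustrative):
#   snapshotCreate > /tmp/vault-snapshot.snap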
|
|
|
|
# Store the init response and metadata associated with a vault
|
|
# snapshot into the specified k8s secret.
|
|
#
|
|
# metadata should be a dictionary type structure in this form:
|
|
# {"date":"xxx","snapshot_sum":"yyy","secret":"zzz"}
|
|
#
|
|
# The 'snapshot' of the init response should be taken promptly with
|
|
# the snapshot of the vault. In particular, consider pausing vault
|
|
# manager, in addition to using snapshotPreCheck, to ensure the
|
|
# two are consistent.
|
|
#
|
|
# In practice the metadata can contain any information; the
|
|
# procedure only requires the value of 'secret', as in:
|
|
# echo "$metadata" | jq -r .secret
|
|
function snapshotSetSecret {
|
|
local secret="$1"
|
|
local metadata="$2"
|
|
local jqlog
|
|
local result
|
|
local keys
|
|
local data
|
|
|
|
# make sure the user supplied data is ok
|
|
jqlog="$( echo "$metadata" | jq . 2>&1 >/dev/null )"
|
|
result=$?
|
|
if [ $result -ne 0 ]; then
|
|
log $ERROR "snapshotSetSecret: error parsing metadata:" \
|
|
"[$result] [$jqlog]"
|
|
return 1
|
|
fi
|
|
|
|
# check that the user supplied metadata contains 'secret',
|
|
# which is the only value the procedure requires.
|
|
jqlog="$( echo "$metadata" | jq -r .secret 2>&1 )"
|
|
if [ $? -ne 0 -o -z "$jqlog" -o "$jqlog" == "null" ]; then
|
|
log $WARNING "snapshotSetSecret: metadata omits 'secret'"
|
|
fi
|
|
|
|
keys="$( reconstructInitResponse cluster-key )"
|
|
data="{\"metadata\":$metadata,\"init\":$keys}"
|
|
|
|
# make sure the assembled secret data is ok
|
|
echo "$data" | jq . >/dev/null 2>&1
|
|
result=$?
|
|
if [ $result -ne 0 ]; then
|
|
log $ERROR "snapshotSetSecret: error parsing secret data:" \
|
|
"[$result]"
|
|
return 1
|
|
fi
|
|
|
|
echo "$data" | jq -c . | set_secret "$secret" /dev/stdin
|
|
|
|
# verify the copy of shards secrets
|
|
get_secret "$secret" | jq -c .init | validateSecrets cluster-key
|
|
if [ $? -ne 0 ]; then
|
|
return 1
|
|
fi
|
|
|
|
return 0
|
|
}
|
|
|
|
# POST stdin to the active vault server API endpoint for restoring
|
|
# the snapshot. stdin is the snapshot file of the vault cluster.
|
|
#
|
|
# The required parameter is the metadata associated with the
|
|
# snapshot, which contains the name of the k8s secret which has
|
|
# the unseal shards for the vault data being restored. The metadata
|
|
# needs to contain at least '{"secret":"xxx"}', and this secret
|
|
# needs to exist in the vault namespace.
|
|
#
|
|
# The content of the secret will be used to restore the unseal
|
|
# shards for the vault that is being restored.
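# One possible invocation, assuming the metadata was stored with
# snapshotSetSecret under the (illustrative) name vault-snapshot-keys:
#   snapshotRestore "$( get_secret vault-snapshot-keys | jq -c .metadata )" \
#       < /tmp/vault-snapshot.snap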
|
|
function snapshotRestore {
|
|
local metadata="$1"
|
|
local secret
|
|
local logs
|
|
local result
|
|
local initdata
|
|
local apipath="/sys/storage/raft/snapshot-force"
|
|
|
|
# check that the associated secret exists
|
|
secret="$( echo "$metadata" | jq -r .secret 2>/dev/null )"
|
|
if [ -z "$secret" -o "$secret" == "null" ]; then
|
|
log $ERROR "Metadata omits the k8s secret associated with" \
|
|
"the snapshot"
|
|
return 1
|
|
fi
|
|
|
|
secretExists "$secret" >/dev/null
|
|
if [ $? -ne 0 ]; then
|
|
log $ERROR "K8s secret [$secret] associated with the" \
|
|
"snapshot does not exist"
|
|
return 1
|
|
fi
|
|
|
|
# check the init response associated with the snapshot
|
|
initdata="$( get_secret "$secret" | jq -c .init 2>/dev/null )"
|
|
if [ -z "$initdata" -o "$initdata" == 'null' ]; then
|
|
log $ERROR "Failed to retrieve init response from" \
|
|
"k8s secret [$secret]"
|
|
return 1
|
|
fi
|
|
|
|
# A successful snapshot restore gives no response body. On a vault
# API error curl still returns 0, so any captured output indicates
# an error.
|
|
logs="$( curl -s -S --cacert "$CERT" \
|
|
--connect-timeout $QUERY_TMOUT \
|
|
--header "X-Vault-Token:$( get_secret cluster-key-root )" \
|
|
--request POST \
|
|
--data-binary @/dev/stdin \
|
|
"https://$ACTIVE_TARGET:${TARGET_PORT}/v1${apipath}" 2>&1 )"
|
|
|
|
result=$?
|
|
log $INFO "Snapshot restore API response: $result"
|
|
if [ "$result" -ne 0 -o -n "$logs" ]; then
|
|
log $ERROR "Snapshot restore: [$logs]"
|
|
return 1
|
|
fi
|
|
|
|
# Restore the secrets associated with the snapshot
|
|
# We're done if the secrets haven't changed.
|
|
echo "$initdata" | validateSecrets cluster-key
|
|
if [ $? -eq 0 ]; then
|
|
return 0
|
|
fi
|
|
|
|
# replace vault's init response in k8s secrets
|
|
deleteShardSecrets cluster-key
|
|
deleteSecrets cluster-key-root
|
|
echo "$initdata" | storeVaultInitSecrets cluster-key
|
|
|
|
# finally, verify the storage was successful
|
|
echo "$initdata" | validateSecrets cluster-key
|
|
return $?
|
|
}
|
|
|
|
# function that calls exit_on_trap for every second of sleep
|
|
# takes total sleep time as parameter
|
|
function trap_sleep {
|
|
local sleep_time="$1"
|
|
|
|
for i in $(seq 1 $sleep_time); do
|
|
sleep 1
|
|
exit_on_trap 22
|
|
done
|
|
}
|
|
|
|
|
|
#
|
|
# LOGIC
|
|
#
|
|
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
|
|
# This script was sourced
|
|
return 0
|
|
fi
|
|
|
|
health_excuse_create "$HEALTH_EXCUSE_INIT" "$HC_MSG_INIT"
|
|
if [ -n "$EARLY_PAUSE" ]; then
|
|
echo -n "$EARLY_PAUSE" > $PAUSEFILE
|
|
fi
|
|
|
|
exit_on_trap 1
|
|
|
|
# Match kubectl version to server version (or etc)
|
|
pickK8sVersion
|
|
|
|
# check if this pod is helping to convert storage from pvc to k8s
|
|
# secrets
|
|
mountHelper
|
|
exit_on_trap 15
|
|
|
|
# check if there are existing key shard secrets, bootstrap secret,
|
|
# or pre-existing resource
|
|
K8S_SECRETS_PREEXIST="$( secretExists cluster-key-root )"
|
|
exit_on_trap 16
|
|
BOOTSTRAP_PREEXISTS="$( secretExists cluster-key-bootstrap )"
|
|
exit_on_trap 17
|
|
PVC_PREEXISTS="$( pvcRemoved )"
|
|
exit_on_trap 18
|
|
|
|
runConversion
|
|
exit_on_trap 19
|
|
|
|
# check if the PVC still exists after conversion, and if so issue a warning.
|
|
PVC_PREEXISTS="$( pvcRemoved )"
|
|
PVC_STATUS=$?
|
|
if [ $PVC_STATUS -eq 1 ]; then
|
|
log $DEBUG "PVC storage $PVC_PREEXISTS is currently terminating"
|
|
elif [ $PVC_STATUS -eq 2 ]; then
|
|
log $WARNING "PVC storage $PVC_PREEXISTS deletion has failed during conversion"
|
|
fi
|
|
|
|
# Waiting for at least one vault server, to check initialization
|
|
waitForPods 1
|
|
exit_on_trap 2
|
|
|
|
log $DEBUG "Putting a list of vault pods and ip in $WORKDIR/pods.txt"
|
|
getVaultPods > $WORKDIR/pods.txt
|
|
exit_on_trap 3
|
|
|
|
vaultInitialized
|
|
IS_VAULT_INITIALIZED=$?
|
|
if [ $IS_VAULT_INITIALIZED -eq 1 ]; then
|
|
exit_on_trap 4
|
|
desired_pods=$HA_REPLICAS
|
|
|
|
# Waiting for vault servers to come up
|
|
waitForPods $desired_pods
|
|
exit_on_trap 5
|
|
|
|
log $INFO "Putting a list of vault pods and IPs in $WORKDIR/pods.txt"
|
|
getVaultPods > $WORKDIR/pods.txt
|
|
exit_on_trap 6
|
|
|
|
log $DEBUG "Initializing the vault on vault-0 and" \
|
|
"storing keys in k8s secrets"
|
|
initVault
|
|
|
|
# Some sleep is required to allow convergence
|
|
sleep "$INIT_CONVERGE_TIME"
|
|
|
|
log $DEBUG "Unsealing vault-0 using the init shards"
|
|
for row in $(awk 'NR==1{print $2}' $WORKDIR/pods.txt); do
|
|
unsealVault "$row"
|
|
done
|
|
|
|
log $DEBUG "Joining other vault servers to the HA Raft cluster"
|
|
for row in $(awk 'NR>1{print $2}' $WORKDIR/pods.txt); do
|
|
log $DEBUG "$( grep $row $WORKDIR/pods.txt )"
|
|
joinRaft "$row"
|
|
sleep "$JOIN_RATE"
|
|
done
|
|
|
|
exit_on_trap 7
|
|
log $INFO "Unsealing the remaining vaults"
|
|
for row in $(awk 'NR>1{print $2}' $WORKDIR/pods.txt); do
|
|
log $DEBUG "$( grep $row $WORKDIR/pods.txt )"
|
|
unsealVault "$row"
|
|
sleep "$UNSEAL_RATE"
|
|
exit_on_trap 8
|
|
done
|
|
else
|
|
log $INFO "Vault is initialized"
|
|
fi
|
|
|
|
exit_on_trap 9
|
|
# initialize the state machine - vault server status records
|
|
echo "" > "$PODREC_F"
|
|
while read host dns_name; do
|
|
if [ -z "$host" ]; then
|
|
continue
|
|
fi
|
|
status_rec="/$host/$dns_name//"
|
|
echo "$status_rec" >> "$PODREC_F"
|
|
done <$WORKDIR/pods.txt
|
|
|
|
health_excuse_remove "$HEALTH_EXCUSE_INIT"
|
|
|
|
# Loop forever to check the seal status of vaults and
|
|
# unseal if required
|
|
log $INFO "Checking vault pods seal status in perpetuity..."
|
|
while true; do
|
|
exit_on_trap 10
|
|
trap_sleep "$STATUS_RATE"
|
|
exit_on_trap 20
|
|
pickK8sVersion # check if the k8s server version is changed
|
|
|
|
count=$( kubectl get pods -n "${VAULT_NS}" \
|
|
-o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
|
|
| grep "^${VAULT_FN}-manager" | wc -w )
|
|
if [ "$count" -gt 1 ]; then
|
|
log $ERROR "Multiple instances of vault manager detected. Waiting until one left"
|
|
exit_on_trap 21
|
|
continue
|
|
fi
|
|
|
|
rm $WORKDIR/pods.txt
|
|
echo "" > "$PODREC_TMP_F"
|
|
exit_on_trap 11
|
|
getVaultPods > $WORKDIR/pods.txt
|
|
exit_on_trap 12
|
|
|
|
while read host dnsname; do
|
|
if [ -z "$dnsname" ]; then
|
|
# probably a recovering pod waiting for an IP address
|
|
log $DEBUG "pod list has empty data: [$host] [$dnsname]"
|
|
continue
|
|
fi
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$QUERY_TMOUT \
|
|
vaultAPI server_status GET $dnsname.$POD_TARGET_BASE \
|
|
/sys/health
|
|
echo -n "$server_status" > $WORKDIR/healthcheck.txt
|
|
|
|
TEMP=$( echo "$server_status" | jq -r .sealed )
|
|
|
|
exit_on_trap 13
|
|
# Decide when to unseal the vault server; includes
|
|
# Adding records to new_pods_status.txt
|
|
runStateMachine "$host" "$dnsname" "$TEMP"
|
|
exit_on_trap 14
|
|
done <$WORKDIR/pods.txt
|
|
mv "$PODREC_TMP_F" "$PODREC_F"
|
|
|
|
vaultRekey
|
|
done
|
|
kind: ConfigMap
|
|
metadata:
|
|
managedFields:
|
|
- apiVersion: v1
|
|
fieldsType: FieldsV1
|
|
fieldsV1:
|
|
f:data:
|
|
.: {}
|
|
f:init.sh: {}
|
|
manager: vault-init-unseal
|
|
name: vault-init-unseal-3
|
|
namespace: {{ .Release.Namespace }}
|
|
---
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
managedFields:
|
|
- apiVersion: v1
|
|
fieldsType: FieldsV1
|
|
fieldsV1:
|
|
f:data:
|
|
.: {}
|
|
f:pvc-attach.yaml: {}
|
|
manager: {{ .Values.vault.name }}-mount-helper
|
|
name: {{ .Values.vault.name }}-mount-helper
|
|
namespace: {{ .Release.Namespace }}
|
|
data:
|
|
pvc-attach.yaml: |
|
|
---
|
|
apiVersion: batch/v1
|
|
kind: Job
|
|
metadata:
|
|
name: {{ .Values.vault.fullname }}-mount-helper
|
|
namespace: vault
|
|
spec:
|
|
activeDeadlineSeconds: 600
|
|
completions: 1
|
|
parallelism: 1
|
|
ttlSecondsAfterFinished: 0
|
|
template:
|
|
spec:
|
|
restartPolicy: Never
|
|
serviceAccountName: "{{ .Values.vault.fullname }}-manager-1"
|
|
{{- if .Values.manager.imagePullSecrets }}
|
|
imagePullSecrets:
|
|
{{- toYaml .Values.manager.imagePullSecrets | nindent 12 }}
|
|
{{- end }}
|
|
{{- if .Values.manager.tolerations }}
|
|
tolerations:
|
|
{{- tpl .Values.manager.tolerations . | nindent 12 }}
|
|
{{- end }}
|
|
securityContext:
|
|
runAsUser: 0
|
|
runAsGroup: 0
|
|
containers:
|
|
- name: mount
|
|
image: "{{ .Values.manager.image.repository }}:{{ .Values.manager.image.tag }}"
|
|
imagePullPolicy: "{{ .Values.manager.image.pullPolicy }}"
|
|
args:
|
|
- bash
|
|
- /opt/script/init.sh
|
|
env:
|
|
- name: MANAGER_MODE
|
|
value: MOUNT_HELPER
|
|
- name: PVC_DIR
|
|
value: /mnt/data
|
|
volumeMounts:
|
|
- name: mount-helper
|
|
mountPath: /opt/script
|
|
readOnly: true
|
|
- name: manager-pvc
|
|
mountPath: /mnt/data
|
|
readOnly: false
|
|
volumes:
|
|
- name: mount-helper
|
|
configMap:
|
|
name: vault-init-unseal-3
|
|
- name: manager-pvc
|
|
persistentVolumeClaim:
|
|
claimName: manager-pvc-sva-vault-manager-0
|
|
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ .Values.vault.fullname }}-manager-1
rules:
- apiGroups: [""] # "" indicates the core API group
  resources: ["pods"]
  verbs: ["get", "watch", "list"]
- apiGroups: [""] # "" indicates the core API group
  resources: ["pods/exec"]
  verbs: ["create"]
- apiGroups: [""] # "" indicates the core API group
  resources: ["secrets"]
  verbs: ["get", "create", "delete"]
- apiGroups: ["batch"]
  resources: ["jobs"]
  verbs: ["get", "create", "delete"]
- apiGroups: [""] # "" indicates the core API group
  resources: ["persistentvolumeclaims"]
  verbs: ["list", "delete"]
---
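# ServiceAccount and RoleBinding granting the Role above to the manager
# StatefulSet and the mount-helper Job, which both run under the
# -manager-1 service account.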
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ .Values.vault.fullname }}-manager-1
  namespace: {{ .Release.Namespace }}
  labels:
    helm.sh/chart: {{ .Values.manager.chart }}
    app.kubernetes.io/name: {{ .Values.vault.name }}-manager
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: {{ .Values.vault.fullname }}-manager-1
  namespace: {{ .Release.Namespace }}
subjects:
- kind: ServiceAccount
  name: {{ .Values.vault.fullname }}-manager-1
roleRef:
  kind: Role
  name: {{ .Values.vault.fullname }}-manager-1
  apiGroup: rbac.authorization.k8s.io
---
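# The vault-manager StatefulSet: a single replica running init.sh from the
# vault-init-unseal-3 ConfigMap, with a liveness probe that sources the same
# script and invokes health_check.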
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ .Values.vault.fullname }}-manager-3
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/name: {{ .Values.vault.name }}-manager
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
    component: webhook
spec:
  serviceName: {{ .Values.vault.fullname }}
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/instance: {{ .Release.Name }}
      component: webhook
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ .Values.vault.name }}-manager
        app.kubernetes.io/instance: {{ .Release.Name }}
        component: webhook
        {{- if .Values.manager.extraLabels }}
        {{- toYaml .Values.manager.extraLabels | nindent 8 -}}
        {{- end }}
    spec:
      serviceAccountName: "{{ .Values.vault.fullname }}-manager-1"
      {{- if .Values.manager.imagePullSecrets }}
      imagePullSecrets:
      {{- toYaml .Values.manager.imagePullSecrets | nindent 8 }}
      {{- end }}
      {{- if .Values.manager.tolerations }}
      tolerations:
      {{- tpl .Values.manager.tolerations . | nindent 8 }}
      {{- end }}
      containers:
      - name: manager
        image: "{{ .Values.manager.image.repository }}:{{ .Values.manager.image.tag }}"
        imagePullPolicy: "{{ .Values.manager.image.pullPolicy }}"
        args:
        - bash
        - /opt/script/init.sh
        env:
        - name: CA_CERT
          value: /mnt/data/ca/tls.crt
        livenessProbe:
          exec:
            command:
            - bash
            - -c
            - "source /opt/script/init.sh; health_check"
          initialDelaySeconds: {{ .Values.manager.livenessProbe.initialDelaySeconds }}
          periodSeconds: {{ .Values.manager.livenessProbe.periodSeconds }}
          timeoutSeconds: {{ .Values.manager.livenessProbe.timeoutSeconds }}
          successThreshold: {{ .Values.manager.livenessProbe.successThreshold }}
          failureThreshold: {{ .Values.manager.livenessProbe.failureThreshold }}
          terminationGracePeriodSeconds: {{ .Values.manager.livenessProbe.terminationGracePeriodSeconds }}
        volumeMounts:
        - name: vault-init-unseal-3
          mountPath: /opt/script
          readOnly: false
        - name: mount-helper-yaml
          mountPath: /opt/yaml
          readOnly: true
        - name: vault-ca
          mountPath: /mnt/data/ca
          readOnly: true
      volumes:
      - name: vault-init-unseal-3
        configMap:
          name: vault-init-unseal-3
      - name: mount-helper-yaml
        configMap:
          name: {{ .Values.vault.name }}-mount-helper
      - name: vault-ca
        secret:
          secretName: vault-ca