Merge "[CEPH] Journal automation and disk cleanup updates"

This commit is contained in:
Zuul 2019-01-28 06:05:45 +00:00 committed by Gerrit Code Review
commit f0f1b57b3c
4 changed files with 116 additions and 57 deletions

View File

@ -78,6 +78,10 @@ if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
exit 1
else
OSD_JOURNAL="${OSD_JOURNAL_DISK}"
if [ -e "${OSD_PATH}/run_mkjournal" ]; then
ceph-osd -i ${OSD_ID} --mkjournal
rm -rf ${OSD_PATH}/run_mkjournal
fi
fi
fi
if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then

View File

@ -23,6 +23,7 @@ set -ex
: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}"
: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
: "${OSD_JOURNAL_UUID:=$(uuidgen)}"
: "${OSD_JOURNAL_SIZE:=$(awk '/^osd_journal_size/{print $3}' ${CEPH_CONF}.template)}"
# Load the "osd_pg_interval_fix" key from /etc/ceph/storage.json: python parses the file,
# re-serialises that one value with json.dumps, and eval assigns the resulting text to
# OSD_PG_INTERVAL_FIX (eval strips the JSON quoting of string values).
# NOTE(review): eval runs on text derived from the JSON file, so a value containing shell
# metacharacters would be interpreted by the shell - confirm storage.json is trusted.
eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))')
# Same pipeline for the "failure_domain" key, stored in CRUSH_FAILURE_DOMAIN_TYPE.
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
@ -142,6 +143,16 @@ function dev_part {
fi
}
function disk_zap {
  # Run all the commands that ceph-disk zap uses to clear a disk.
  # Arguments: $1 - path of the device to wipe (required).
  # Returns:   1, without running any wipe command, when no device is given;
  #            otherwise the status of the last sgdisk call.
  local device=${1}
  # Refuse to hand an empty target to the destructive tools below.
  if [ -z "${device}" ]; then
    echo "disk_zap: no device specified, refusing to zap" >&2
    return 1
  fi
  wipefs --all "${device}"
  # Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise
  dd if=/dev/zero of="${device}" bs=1M count=200
  sgdisk --zap-all -- "${device}"
  sgdisk --clear --mbrtogpt -- "${device}"
}
function osd_pg_interval_fix {
# NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running
if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then
@ -154,7 +165,9 @@ function osd_pg_interval_fix {
function udev_settle {
partprobe "${OSD_DEVICE}"
if [ "x$JOURNAL_TYPE" == "xblock-logical" ]; then
partprobe "${OSD_JOURNAL}"
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
partprobe "${JDEV}"
fi
# watch the udev event queue, and exit if all current events are handled
udevadm settle --timeout=600

View File

@ -20,7 +20,7 @@ set -ex
source /tmp/osd-common.sh
: "${OSD_FORCE_ZAP:=1}"
: "${OSD_FORCE_REPAIR:=1}"
# We do not want to zap the journal disk, so this option is tracked separately.
: "${JOURNAL_FORCE_ZAP:=0}"
@ -55,62 +55,91 @@ function osd_disk_prepare {
# check device status first
if ! parted --script ${OSD_DEVICE} print > /dev/null 2>&1; then
if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then
echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_ZAP is enabled so we are zapping the device anyway"
sgdisk -Z ${OSD_DEVICE}
if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
disk_zap ${OSD_DEVICE}
else
echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
echo "It would be too dangerous to destroy it without any notification."
echo "Please set OSD_FORCE_ZAP to '1' if you really want to zap this disk."
echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk."
exit 1
fi
fi
udev_settle
# then search for some ceph metadata on the disk
if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then
if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then
if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
if [ -b "${OSD_DEVICE}1" ]; then
local cephFSID=$(ceph-conf --lookup fsid)
if [ ! -z "${cephFSID}" ]; then
local tmpmnt=$(mktemp -d)
mount ${OSD_DEVICE}1 ${tmpmnt}
if [ "${OSD_BLUESTORE:-0}" -ne 1 ] && [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
# we only care about journals for filestore.
if [ -f "${tmpmnt}/whoami" ]; then
OSD_JOURNAL_DISK=$(readlink -f "${tmpmnt}/journal")
local osd_id=$(cat "${tmpmnt}/whoami")
if [ ! -b "${OSD_JOURNAL_DISK}" ]; then
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
if [ ${jdev} == ${OSD_JOURNAL} ]; then
echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL}."
echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
rm -rf ${tmpmnt}/ceph_fsid
else
echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL_DISK}."
echo "Because OSD_FORCE_REPAIR is set and paritions are manually defined, we will"
echo "attempt to recreate the missing journal device partitions."
osd_journal_create ${OSD_JOURNAL}
ln -sf /dev/disk/by-partuuid/${OSD_JOURNAL_UUID} ${tmpmnt}/journal
echo ${OSD_JOURNAL_UUID} | tee ${tmpmnt}/journal_uuid
chown ceph. ${OSD_JOURNAL}
# During OSD start we will format the journal and set the fsid
touch ${tmpmnt}/run_mkjournal
fi
fi
else
echo "It looks like ${OSD_DEVICE} has a ceph data partition but is missing it's metadata."
echo "The device may contain inconsistent metadata or be corrupted."
echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
rm -rf ${tmpmnt}/ceph_fsid
fi
fi
if [ -f "${tmpmnt}/ceph_fsid" ]; then
osdFSID=$(cat "${tmpmnt}/ceph_fsid")
umount ${tmpmnt}
if [ ${osdFSID} != ${cephFSID} ]; then
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
echo "Because OSD_FORCE_ZAP was set, we will zap this device."
sgdisk -Z ${OSD_DEVICE}
echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
disk_zap ${OSD_DEVICE}
else
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped."
echo "OSD_FORCE_REPAIR is set, but will be ignored and the device will not be zapped."
echo "Moving on, trying to activate the OSD now."
return
fi
else
umount ${tmpmnt}
echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
echo "Because OSD_FORCE_ZAP was set, we will zap this device."
sgdisk -Z ${OSD_DEVICE}
echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
disk_zap ${OSD_DEVICE}
fi
else
echo "Unable to determine the FSID of the current cluster."
echo "OSD_FORCE_ZAP is set, but this OSD will not be zapped."
echo "OSD_FORCE_REPAIR is set, but this OSD will not be zapped."
echo "Moving on, trying to activate the OSD now."
return
fi
else
echo "parted says ${OSD_DEVICE}1 should exist, but we do not see it."
echo "We will ignore OSD_FORCE_ZAP and try to use the device as-is"
echo "We will ignore OSD_FORCE_REPAIR and try to use the device as-is"
echo "Moving on, trying to activate the OSD now."
return
fi
else
echo "INFO- It looks like ${OSD_DEVICE} is an OSD, set OSD_FORCE_ZAP=1 to use this device anyway and zap its content"
echo "You can also use the zap_device scenario on the appropriate device to zap it"
echo "INFO- It looks like ${OSD_DEVICE} is an OSD, set OSD_FORCE_REPAIR=1 to use this device anyway and zap its content"
echo "You can also use the disk_zap scenario on the appropriate device to zap it"
echo "Moving on, trying to activate the OSD now."
return
fi
@ -118,54 +147,60 @@ function osd_disk_prepare {
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
# we only care about journals for filestore.
if [ -n "${OSD_JOURNAL}" ]; then
if [ -b $OSD_JOURNAL ]; then
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
OSD_JOURNAL_PARTITION=$(echo $OSD_JOURNAL_PARTITION | sed 's/[^0-9]//g')
if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
# maybe they specified the journal as a /dev path like '/dev/sdc12':
local JDEV=$(echo ${OSD_JOURNAL} | sed 's/\(.*[^0-9]\)[0-9]*$/\1/')
if [ -d /sys/block/$(basename ${JDEV})/$(basename ${OSD_JOURNAL}) ]; then
OSD_JOURNAL=$(dev_part ${JDEV} `echo ${OSD_JOURNAL} |\
sed 's/.*[^0-9]\([0-9]*\)$/\1/'`)
OSD_JOURNAL_PARTITION=${JDEV}
fi
else
OSD_JOURNAL=$(dev_part ${OSD_JOURNAL} ${OSD_JOURNAL_PARTITION})
fi
fi
chown ceph. ${OSD_JOURNAL}
else
echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
echo "For better performance on HDD, consider moving your journal to a separate device"
fi
CLI_OPTS="${CLI_OPTS} --filestore"
osd_journal_prepare
else
OSD_JOURNAL=''
CLI_OPTS="${CLI_OPTS} --bluestore"
fi
if [ -b "${OSD_JOURNAL}" -a "${JOURNAL_FORCE_ZAP:-0}" -eq 1 ]; then
# if we got here and zap is set, it's ok to wipe the journal.
echo "OSD_FORCE_ZAP is set, so we will erase the journal device ${OSD_JOURNAL}"
if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
# it's a raw block device. nuke any existing partition table.
sgdisk -Z ${OSD_JOURNAL}
else
# we are likely working on a partition. Just make a filesystem on
# the device, as other partitions may be in use so nuking the whole
# disk isn't safe.
wipefs ${OSD_JOURNAL}
fi
fi
udev_settle
if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then
export OSD_JOURNAL="--journal-file"
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} --journal-file
else
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL}
fi
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL}
}
udev_settle
function osd_journal_create {
  # Create a journal partition on the backing device of the given journal path.
  # Arguments: $1 - journal partition path, e.g. /dev/sdf1
  # Globals:   OSD_JOURNAL_SIZE (read, partition size in MiB as passed to sgdisk),
  #            OSD_JOURNAL_UUID (read, used as the partition GUID),
  #            OSD_JOURNAL (written with the result of dev_part)
  # Exits with status 1 when the backing device is not a block device.
  local osd_journal=${1}
  # Every digit in the path is treated as the partition number and everything
  # else as the backing device (e.g. /dev/sdf1 -> 1 and /dev/sdf).
  # NOTE(review): device names that contain digits themselves (nvme0n1p1 style)
  # are not split correctly by this scheme - confirm whether they must be supported.
  local osd_journal_partition=${osd_journal//[!0-9]/}
  local jdev=${osd_journal//[0-9]/}
  if [ -b "${jdev}" ]; then
    # --change-name must be double-quoted: in single quotes the shell passes the
    # literal text ${osd_journal_partition} to sgdisk instead of the number.
    # The typecode GUID is presumably the Ceph journal partition type - confirm.
    sgdisk --new="${osd_journal_partition}:0:+${OSD_JOURNAL_SIZE}M" \
      --change-name="${osd_journal_partition}:ceph journal" \
      --partition-guid="${osd_journal_partition}:${OSD_JOURNAL_UUID}" \
      --typecode="${osd_journal_partition}:45b0969e-9b03-4f30-b4c6-b4b80ceff106" --mbrtogpt -- "${jdev}"
    OSD_JOURNAL=$(dev_part "${jdev}" "${osd_journal_partition}")
  else
    echo "The backing device ${jdev} for ${OSD_JOURNAL} does not exist on this system."
    exit 1
  fi
}
function osd_journal_prepare {
  # Resolve (and, if needed, create) the filestore journal device, hand it to
  # the ceph user and append --filestore to CLI_OPTS.
  # Globals: OSD_JOURNAL (read/written), OSD_JOURNAL_PARTITION (written),
  #          JOURNAL_TYPE (read), OSD_DEVICE (read), CLI_OPTS (appended to)
  if [ -n "${OSD_JOURNAL}" ]; then
    if [ -b "${OSD_JOURNAL}" ]; then
      OSD_JOURNAL=$(readlink -f "${OSD_JOURNAL}")
      # All digits in the resolved path are taken as the partition number and
      # the remainder as the backing device.
      OSD_JOURNAL_PARTITION=${OSD_JOURNAL//[!0-9]/}
      local jdev=${OSD_JOURNAL//[0-9]/}
      if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
        # No partition number in the path: pass only the device through
        # dev_part (the partition argument is empty in this branch).
        OSD_JOURNAL=$(dev_part "${jdev}")
      fi
    elif [ "x${JOURNAL_TYPE}" != "xdirectory" ]; then
      # The journal path is not (yet) a block device, so create the journal
      # partition on its backing device.
      OSD_JOURNAL=$(readlink -f "${OSD_JOURNAL}")
      osd_journal_create "${OSD_JOURNAL}"
    fi
    chown ceph. "${OSD_JOURNAL}"
  elif [ "x${JOURNAL_TYPE}" != "xdirectory" ]; then
    echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
    echo "For better performance on HDD, consider moving your journal to a separate device"
  fi
  CLI_OPTS="${CLI_OPTS} --filestore"
}
if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then

View File

@ -165,10 +165,17 @@ conf:
location: /var/lib/openstack-helm/ceph/osd/journal-one
# - data:
# type: block-logical
# location: /dev/sdd
# journal:
# type: block-logical
# location: /dev/sdf1
# - data:
# type: block-logical
# location: /dev/sde
# journal:
# type: block-logical
# location: /dev/sdf
# location: /dev/sdf2
# - data:
# type: block-logical
# location: /dev/sdg