
After the system upgrade, the Ceph MDS data was deleted and we could no longer
find the files stored with CephFS. This procedure resets and recreates the MDS
so that the files appear again after the volume is mounted.

Test Plan:
- Tested the upgrade procedure from 21.05 on AIO-SX
- Tested the B&R procedure on AIO-SX

Closes-bug: 1955823
Signed-off-by: Thiago Miranda <ThiagoOliveira.Miranda@windriver.com>
Change-Id: Ic146dda35c3c71cf11c73a4f972b274ec3404d07
---
#
# Copyright (c) 2019-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# ROLE DESCRIPTION:
#   This role is to restore the Ceph cluster data
#

- name: Restore Ceph cluster data
  block:

    # Check if Ceph osd partitions are present
    - name: Run ceph-disk cmd
      command: ceph-disk list --format=json
      register: ceph_disk_out

    - name: Parse output of ceph-disk
      set_fact:
        ceph_disk_parsed: "{{ ceph_disk_out.stdout | from_json | selectattr('partitions', 'defined') | list }}"

    - name: Get Ceph partitions
      set_fact:
        ceph_osd_parts: "{% for e in ceph_disk_parsed if 'cluster' in e.partitions.0
                         and e.partitions.0.cluster == 'ceph' and e.partitions.0.type == 'data' %}{{ e }}{% endfor %}"

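    # Illustrative only: 'ceph-disk list --format=json' reports one entry per
    # device, roughly of the form
    #   {"path": "/dev/sdb", "partitions": [{"type": "data", "cluster": "ceph", ...}]}
    # so the two tasks above keep just the devices whose first partition is a
    # Ceph data partition; ceph_osd_parts stays empty when none are found.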
    - name: Fail if there are no Ceph osd partitions
      fail:
        msg: "No Ceph osd partition found, aborting restore operation"
      when: not ceph_osd_parts

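    # Note: member paths inside the backup tarball are relative (no leading '/'),
    # hence the '{{ ceph_conf[1:] }}' slice that strips the leading slash from
    # the ceph.conf path below.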
    - name: Restore ceph.conf file
      command: >-
        tar -C / -xpf {{ restore_data_file }}
        '{{ ceph_conf[1:] }}'
      args:
        warn: false

    - name: Get list of OSDs defined in {{ ceph_conf }}
      shell: grep "\[osd\.[0-9]*\]" {{ ceph_conf }} | grep -oh "osd\.[0-9]*"
      register: ceph_conf_osd_list
      failed_when: false

    # Allow starting Ceph with a ceph.conf taken from a backup made on controller-1.
    # Without this, Ceph would try to initialize OSDs that are not configured on controller-0.
    - name: Remove "[osd.*]" sections from {{ ceph_conf }}
      ini_file:
        path: "{{ ceph_conf }}"
        section: "{{ item }}"
        state: absent
      with_items: "{{ ceph_conf_osd_list.stdout_lines }}"

    - name: Set initial ceph-mon name
      set_fact:
        mon_name: 'controller-0'

    # On AIO-DX we have a single, floating monitor whose data partition is
    # DRBD-synced between the controllers. So, before restoring the ceph-mon
    # data, we first have to set up DRBD for the ceph-mon partition.
    - name: Enable DRBD for ceph-mon on AIO-DX
      block:
        - name: Update host config data to get ceph-mon size
          command: "/usr/bin/sysinv-puppet create-host-config"
          failed_when: false
          register: host_config_result

        - name: Fail if host hieradata cannot be generated
          fail:
            msg: "Failed to create puppet hiera host config."
          when: host_config_result.rc != 0

        - name: Create list of Ceph DRBD classes to pass to puppet
          copy:
            dest: "/tmp/ceph-mon.yml"
            content: |
              classes:
              - platform::drbd::cephmon::runtime

        - name: Apply puppet ceph-mon DRBD manifest
          command: >
            /usr/local/bin/puppet-manifest-apply.sh
            {{ puppet_permdir }}/hieradata/
            {{ derived_network_params.controller_0_address }}
            controller runtime /tmp/ceph-mon.yml
            > {{ ceph_mon_manifest_apply_log }}
          register: ceph_mon_manifest
          environment:
            INITIAL_CONFIG_PRIMARY: "true"
            LC_ALL: "en_US.UTF-8"

        - name: Fail if puppet manifest apply script returns an error
          fail:
            msg: >-
              Failed to apply ceph-mon DRBD manifest. See /var/log/puppet/latest/puppet.log
              for details.
          when: ceph_mon_manifest.rc != 0

        - name: Set mode to floating controller
          set_fact:
            mon_name: 'controller'
      when: (restore_system_mode != 'simplex') and
            (system_type == 'All-in-one')

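    # On a Standard deployment the restored ceph.conf normally lists several
    # monitors; restricting mon_initial_members to controller-0 lets the single
    # restored monitor form quorum on its own while the other hosts are still
    # down. The option is removed again near the end of this role.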
    - name: Allow Ceph to start with a single monitor on a Standard deployment
      ini_file:
        path: "{{ ceph_conf }}"
        section: global
        option: mon_initial_members
        value: controller-0
      when: system_type == 'Standard'

    - name: Check if ceph-mon processes are running
      command: pgrep ceph-mon
      register: ceph_mons
      failed_when: false

    - name: Shut down Ceph monitor and OSDs if they are running
      command: "{{ item }}"
      with_items:
        - /etc/init.d/ceph stop osd
        - /etc/init.d/ceph stop mon
      when: ceph_mons.stdout != ""

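    # sed idiom used below: '/mgr.<mon_name>/!b' skips every line that does not
    # match the [mgr.<mon_name>] section header, 'n' then loads the following
    # line, and the substitution prints only the address from its
    # 'public_addr = ...' entry.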
    - name: Grab the mgr IP address
      command: "sed -nr -e '/mgr.'$MON_NAME'/!b;n;s/public_addr = (.*)/\\1/p' /etc/ceph/ceph.conf"
      args:
        warn: false
      register: result
      environment:
        MON_NAME: "{{ mon_name }}"

    - debug: msg="IP address is {{ result.stdout }}"

    - name: Determine if IPv6 encapsulation is needed
      set_fact:
        ipv6_encap: "{{ result.stdout | ipv6 }}"

    - debug: msg="IPv6 addressing is {{ ipv6_encap }}"

    # Four sed patterns for the monitor public_addr: the first two remove the
    # MSGR V1 port if it is already present, the last two add it when it is
    # missing. This keeps the role re-entrant, i.e. the port is never appended
    # more than once.
    #
    # Two layouts of the restored ceph.conf are handled, for IPv4 and IPv6:
    #
    # 1) AIO-SX/STD
    #
    #    [mon.controller-0]
    #    public_addr = 192.168.204.2
    #    host = controller-0
    #
    #    or:
    #
    #    [mon.controller-0]
    #    public_addr = fd00::1
    #    host = controller-0
    #
    # 2) AIO-DX
    #
    #    [mon.controller]
    #    host = controller-0
    #    public_addr = 192.168.204.2
    #
    #    or:
    #
    #    [mon.controller]
    #    host = controller-0
    #    public_addr = fd00::1
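    #
    # Illustrative net effect of the commands below:
    #
    #    public_addr = 192.168.204.2   ->  public_addr = 192.168.204.2:6789
    #    public_addr = fd00::1         ->  public_addr = [fd00::1]:6789
    #
    # For each case there is one pattern matching public_addr on the first line
    # after the section header (AIO-SX/STD layout) and one matching it on the
    # second line (AIO-DX layout).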
    - name: Explicitly enable Msgr V1 port on Ceph monitor public address
      command: "{{ item }}"
      args:
        warn: false
      with_items:
        - "sed -ri -e '/mon.'$MON_NAME'/!b;n;s/public_addr = '$ESTART'(.*)'$EEND':'$MON_PORT'$
           /public_addr = \\1/' /etc/ceph/ceph.conf"
        - "sed -ri -e '/mon.'$MON_NAME'/!b;n;!b;n;s/public_addr = '$ESTART'(.*)'$EEND':'$MON_PORT'$
           /public_addr = \\1/' /etc/ceph/ceph.conf"
        - "sed -ri -e '/mon.'$MON_NAME'/!b;n;s/public_addr = (.*)$/
           public_addr = '$ESTART'\\1'$EEND':'$MON_PORT'/' /etc/ceph/ceph.conf"
        - "sed -ri -e '/mon.'$MON_NAME'/!b;n;!b;n;s/public_addr = (.*)$/
           public_addr = '$ESTART'\\1'$EEND':'$MON_PORT'/' /etc/ceph/ceph.conf"
      environment:
        MON_PORT: 6789
        MON_NAME: "{{ mon_name }}"
        ESTART: "{{ '' if ipv6_encap == False else '\\[' }}"
        EEND: "{{ '' if ipv6_encap == False else '\\]' }}"

    # On a partial restore the ceph-osds are not wiped.
    # The 'ceph-disk list' command returns the list of Ceph OSDs.
    # This task:
    # 1. Parses the output of 'ceph-disk list', extracts the Ceph OSDs,
    #    creates a folder under /var/lib/ceph/osd for every OSD and mounts
    #    the OSD in there.
    # 2. Gets the ceph-mon size from sysinv, creates ceph-mon-lv, formats it
    #    and mounts it under /var/lib/ceph/mon, then populates the data
    #    structure for the controller-0 monitor so that Ceph can be started.
    - name: Mount ceph-osds and format ceph-mon
      script: prepare_ceph_partitions.py
      register: prepare_ceph_partitions

    - debug: var=prepare_ceph_partitions.stdout_lines

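    # If the script above succeeded, the OSD data partitions should now be
    # mounted under /var/lib/ceph/osd/ceph-<id> and the freshly formatted
    # monitor filesystem under /var/lib/ceph/mon/ceph-<mon_name>, so the Ceph
    # services can be brought up next.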
    - name: Bring up ceph-mon
      command: /etc/init.d/ceph start mon

    # Recover the Ceph data from every OSD with ceph-objectstore-tool
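    # A rough sketch of what the script is expected to do (the exact invocation
    # lives in recover_ceph_data.py): rebuild the monitor store from each
    # mounted OSD, e.g. something along the lines of
    #   ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-<id> \
    #       --op update-mon-db --mon-store-path /tmp/mon-store
    # leaving the rebuilt store in /tmp/mon-store, which is copied over the
    # monitor's store.db a few tasks below.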
    - name: Recover ceph-data
      script: recover_ceph_data.py
      register: ceph_data_out

    - debug: var=ceph_data_out.stdout_lines

    - name: Bring down ceph-mon
      command: /etc/init.d/ceph stop mon

    - name: Delete store.db file from ceph-mon
      file:
        path: "/var/lib/ceph/mon/ceph-{{ mon_name }}/store.db"
        state: absent

    # Cannot use the 'copy' module with 'remote_src: yes' for
    # recursive copy until Ansible 2.8.
    - name: Restore store.db from mon-store
      shell: cp -ar /tmp/mon-store/store.db /var/lib/ceph/mon/ceph-{{ mon_name }}

    - name: Bring up ceph Monitor
      command: /etc/init.d/ceph start mon

    - name: Wait for ceph monitor to be up
      shell: ceph -s
      register: result
      until: result.rc == 0
      retries: 5
      delay: 2

    # During initialization an OSD requests the monmap from the monitor before
    # sending it the commands to be run. Since different threads are involved,
    # the OSD may send a command before the monmap has been received, causing a
    # "wrong fsid" error and making the OSD stop. Retry up to 5 times so the
    # monmap is received when it should be.
    - name: Bring up ceph OSDs
      command: /etc/init.d/ceph start osd
      retries: 5
      delay: 3
      register: result
      until: result.rc == 0

    - name: Enable Ceph Msgr v2 protocol
      shell: ceph mon enable-msgr2
      register: result
      until: result.rc == 0
      retries: 5
      delay: 2

    - name: Wait for V2 protocol to be enabled
      shell: ceph -s
      register: result
      until: "'1 monitors have not enabled msgr2' not in result.stdout"
      retries: 30
      delay: 10

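    # ceph-mgr is what reports pool and usage statistics through 'ceph -s', so
    # a manager instance has to be running before the pool check below can see
    # the restored pools.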
    - name: Start Ceph manager
      command: /usr/bin/ceph-mgr --cluster ceph --id controller-0  # start ceph-mgr

    - name: Wait for ceph-mgr to detect Ceph's pools
      shell: ceph -s
      register: result
      until: "'0 pools' not in result.stdout"
      retries: 30
      delay: 10

    - name: Wait {{ ceph_wait_time }} for Ceph to detect that peer OSDs on controller-1 are down
      wait_for:
        timeout: "{{ ceph_wait_time }}"
      when: restore_system_mode != 'simplex'

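    # After the upgrade the CephFS MDS data may be gone; this script is expected
    # to check the CephFS filesystem map and, if needed, reset and recreate the
    # MDS state so that the existing data and metadata pools are served again
    # and the files re-appear once the volume is mounted.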
    - name: Check and recover CephFS filesystem
      script: recover_cephfs.sh
      register: cephfs_recovery_out

    - name: Create ceph.client.guest.keyring to allow ceph mount again
      command: touch /etc/ceph/ceph.client.guest.keyring

    - debug: var=cephfs_recovery_out.stdout_lines

    - name: Restart ceph one more time to pick up the latest changes
      command: /etc/init.d/ceph restart

    - name: Remove Ceph option to start with a single monitor on a Standard deployment
      ini_file:
        path: "{{ ceph_conf }}"
        section: global
        option: mon_initial_members
        state: absent
      when: system_type == 'Standard'

    # Return to the settings in place before puppet was run, to allow node
    # unlock without IPs on the management network. The same change is required
    # for both IPv4 and IPv6. Puppet will then set the correct values as per the
    # DB on its first run.
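    # Illustrative effect of the awk rewrite on the DRBD ceph-mon resource file:
    #   address ipv4 192.168.204.2:7788;  ->  address ipv4 127.0.0.1:7788;
    #   address ipv4 192.168.204.3:7788;  ->  address ipv4 127.0.0.2:7788;
    # The counter 'c' gives each endpoint a distinct loopback address, and IPv6
    # endpoints are rewritten to IPv4 loopback addresses in the same way.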
    - name: Adjust DRBD ceph-mon resource for AIO-DX to start without networking
      shell: |
        awk '/address ipv/{c++; { sub("ipv.*$","ipv4 127.0.0." c ":7788;")}}1' \
          {{ drbd_cephmon_res }} > {{ tmp_drbd_cephmon_res }}
        cp -f {{ tmp_drbd_cephmon_res }} {{ drbd_cephmon_res }}
      when: (restore_system_mode != 'simplex') and
            (system_type == 'All-in-one')

  become: yes
  become_user: root