Add test: remove all grastate.dat for OSP18 / refactor

Change-Id: I041821ab4229b9f20b16def5ca95170908785e5a
dabarzil 2024-07-09 15:52:48 +03:00 committed by Eduardo Olivares
parent 7bfd11555a
commit 35c34ea980
4 changed files with 143 additions and 88 deletions

View File

@@ -32,5 +32,8 @@ skip_if_podified = _topology.skip_if_podified
 get_dataplane_ssh_keypair = _openshift.get_dataplane_ssh_keypair
 has_podified_cp = _openshift.has_podified_cp
 get_ovndbcluter = _openshift.get_ovndbcluter
+execute_in_pod = _openshift.execute_in_pod
+get_openstack_config_secret = _openshift.get_openstack_config_secret
+get_pods = _openshift.get_pods
 get_container_runtime_name = containers.get_container_runtime_name

View File

@@ -27,6 +27,7 @@ LOG = log.getLogger(__name__)
 OSP_CONTROLPLANE = 'openstackcontrolplane'
 OSP_DP_NODESET = 'openstackdataplanenodeset'
 DP_SSH_SECRET_NAME = 'secret/dataplane-ansible-ssh-private-key-secret'
+OSP_CONFIG_SECRET_NAME = 'secret/openstack-config-secret'
 OSP_BM_HOST = 'baremetalhost.metal3.io'
 OSP_BM_CRD = 'baremetalhosts.metal3.io'
 OCP_WORKERS = 'nodes'
@@ -149,6 +150,17 @@ def get_dataplane_ssh_keypair():
     return private_key, public_key
 
 
+def get_openstack_config_secret():
+    with oc.project(CONF.tobiko.podified.osp_project):
+        try:
+            secret_object = oc.selector(OSP_CONFIG_SECRET_NAME).object()
+        except oc.OpenShiftPythonException as err:
+            LOG.info("Error while trying to get openstack config secret "
+                     f"{OSP_CONFIG_SECRET_NAME} from Openshift. Error: {err}")
+            return
+        return secret_object.as_dict()
+
+
 def list_edpm_nodes():
     nodes = []
     with oc.project(CONF.tobiko.podified.osp_project):
@@ -398,3 +410,9 @@ def _check_ping_results(pod):
     else:
         tobiko.fail("Failed to copy ping log files from the POD "
                     f"{pod.name()}. Error: {cp.err}")
+
+
+def execute_in_pod(pod_name, command, container_name=None):
+    with oc.project(CONF.tobiko.podified.osp_project):
+        return oc.selector(f'pod/{pod_name}').object().execute(
+            ['sh', '-c', command], container_name=container_name)
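
A minimal usage sketch of the two helpers added above, as they become reachable through tobiko.podified after this change (not part of the commit; the pod name and the secret's 'data' key are illustrative assumptions):

from tobiko import podified

# run a shell command inside the 'galera' container of a hypothetical pod
result = podified.execute_in_pod('openstack-cell1-galera-0',
                                 'pidof mysqld',
                                 container_name='galera')
print(result.out())

# fetch the openstack config secret as a dict; returns None when the
# secret cannot be read
secret = podified.get_openstack_config_secret()
if secret:
    print(sorted(secret.get('data', {}).keys()))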

View File

@@ -15,8 +15,9 @@
 # under the License.
 
 from __future__ import absolute_import
 
+import functools
 import re
-import time
+import random
 
 import openshift_client as oc
@@ -24,115 +25,138 @@ from oslo_log import log
 
 import tobiko
 from tobiko import config
 from tobiko import podified
-# from tobiko.openstack import glance
-# from tobiko.openstack import keystone
-# from tobiko.openstack import neutron
-# from tobiko.openstack import stacks
-# from tobiko.openstack import tests
-# from tobiko.openstack import topology
-# from tobiko.tests.faults.ha import test_cloud_recovery
-# from tobiko.shell import ping
-# from tobiko.shell import sh
+from tobiko.openstack import keystone
 
 CONF = config.CONF
 LOG = log.getLogger(__name__)
 
 
+kill_galera = 'kill -9 $(pidof mysqld)'
+rm_grastate = 'rm -rf /var/lib/mysql/grastate.dat'
+galera_cluster_size = 'mysql -u root --password={passwd} -e \'SHOW STATUS ' \
+                      'LIKE "wsrep_cluster_size"\''
+check_bootstrap = """
+ps -eo lstart,cmd | grep -v grep|
+grep wsrep-cluster-address=gcomm://
+"""
+
+
+class GaleraBoostrapException(tobiko.TobikoException):
+    message = "Bootstrap has not been activated"
+
+
+class DownException(tobiko.TobikoException):
+    message = "The resource is not down"
+
+
+class RestoredException(tobiko.TobikoException):
+    message = "The resource is not restored"
+
+
+@functools.lru_cache()
+def get_galera_pods_per_service(galera_service):
+    # the aim of this function is just to cache results and avoid sending
+    # oc requests every time
+    return podified.get_pods({'service': galera_service})
+
+
+@podified.skip_if_not_podified
 def kill_all_galera_services():
     """kill all galera processes,
     check in pacemaker it is down"""
-    galera_pods_num = sum(
-        1 for node_name in oc.selector('nodes').qnames()
-        for pod_obj in oc.get_pods_by_node(node_name)
-        if 'cell1-galera' in pod_obj.fqname()
-    )
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'kill -9 $(pidof mysqld)'],
-                     container_name='galera')
-        LOG.info('kill galera cell-{}'.format(i))
-    retry = tobiko.retry(timeout=30, interval=5)
-    for _ in retry:
-        try:
-            # checks wsrep cluster size is now unavailable
-            result = oc.selector('pod/openstack-cell1-galera-0').object(
-                ).execute(['sh', '-c', """mysql -u root --password=12345678
-                -e 'SHOW STATUS LIKE "wsrep_cluster_size"'"""])
-            # Capture and filter the error output
-            error_output = result.err()
-            non_error_message = """
-            Defaulted container "galera" out of: galera,
-            mysql-bootstrap (init)\n"""
-            filtered_err_output = error_output.replace(non_error_message, '')
-            if not filtered_err_output.strip():
-                continue
-        except oc.OpenShiftPythonException:
-            LOG.info('all galera cells down')
-            break
-    time.sleep(60)
-    for _ in retry:
-        try:
-            if int(re.search(r'wsrep_cluster_size\s+(\d+)', oc.selector(
-                'pod/openstack-cell1-galera-0').object().execute(
-                ['sh', '-c', """mysql -u root --password=12345678 -e 'SHOW
-                STATUS LIKE "wsrep_cluster_size"'"""], container_name='galera'
-                ).out()).group(1)) == galera_pods_num:
-                LOG.info('all galera cells are restored')
-                return
-        except oc.OpenShiftPythonException:
-            continue
-    return False
+    # get galera pods sorted into 2 different lists:
+    # one with 'cell-galera' an one without
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
 
 
+@podified.skip_if_not_podified
 def remove_all_grastate_galera():
     """shut down galera properly,
     remove all grastate"""
-    galera_pods_num = sum(
-        1 for node_name in oc.selector('nodes').qnames()
-        for pod_obj in oc.get_pods_by_node(node_name)
-        if 'cell1-galera' in pod_obj.fqname()
-    )
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'rm -rf /var/lib/mysql/grastate.dat '],
-                     container_name='galera')
-        LOG.info('delete grastate.dat cell-{}'.format(i))
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'kill -9 $(pidof mysqld)'],
-                     container_name='galera')
-        LOG.info('kill galera cell-{}'.format(i))
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        for pod in pods:
+            remove_grastate(pod.name())
+        # TODO: change kill to graceful stop/ scale down
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
+
+
+def remove_one_grastate_galera():
+    """shut down galera properly,
+    delete /var/lib/mysql/grastate.dat in a random node,
+    check that bootstrap is done from a node with grastate"""
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        random_pod_name = random.choice(pods).name()
+        remove_grastate(random_pod_name)
+        # TODO: change kill to graceful stop/ scale down
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
+        # gcomm:// without args means that bootstrap is done from this node
+        bootstrap = podified.execute_in_pod(
+            random_pod_name, check_bootstrap, 'galera').out().strip()
+        if len(pods) > 1:
+            if re.search(r'wsrep-cluster-address=gcomm://(?:\s|$)', bootstrap
+                         ) is None:
+                raise GaleraBoostrapException()
+        elif re.search(r'wsrep-cluster-address=gcomm://', bootstrap) is None:
+            raise GaleraBoostrapException()
+        lastDate = re.findall(r"\w{,3}\s*\w{,3}\s*\d{,2}\s*\d{,2}:\d{,2}"
+                              r":\d{,2}\s*\d{4}", bootstrap)[-1]
+        LOG.info(f'last boostrap required at {lastDate}')
+
+
+def remove_grastate(pod_name):
+    podified.execute_in_pod(pod_name, rm_grastate, 'galera')
+    LOG.info(f'grastate.dat removed from {pod_name}')
+
+
+def kill_all_galera_pods(galera_pods):
+    for pod in galera_pods:
+        podified.execute_in_pod(pod.name(), kill_galera, 'galera')
+        LOG.info(f'kill galera pod {pod}')
+
+
+def check_all_galera_cells_down(pod_name):
+    pw = keystone.keystone_credentials().password
     retry = tobiko.retry(timeout=30, interval=5)
     for _ in retry:
         try:
-            # checks wsrep cluster size is now unavailable
-            result = oc.selector('pod/openstack-cell1-galera-0').object(
-                ).execute(['sh', '-c', """mysql -u root --password=12345678
-                -e 'SHOW STATUS LIKE "wsrep_cluster_size"'"""])
-            # Capture and filter the error output
-            error_output = result.err()
-            non_error_message = """
-            Defaulted container "galera" out of: galera,
-            mysql-bootstrap (init)\n"""
+            cluster_size = podified.execute_in_pod(
+                pod_name, galera_cluster_size.format(passwd=pw), 'galera')
+            error_output = cluster_size.err()
+            non_error_message = "Defaulted container \"galera\" out of:"\
+                                "galera, mysql-bootstrap (init)\n"
             filtered_err_output = error_output.replace(non_error_message, '')
             if not filtered_err_output.strip():
                 continue
         except oc.OpenShiftPythonException:
             LOG.info('all galera cells down')
-            break
-    time.sleep(60)
+            return
+    raise DownException()
+
+
+def verify_all_galera_cells_restored(pods):
+    pw = keystone.keystone_credentials().password
+    retry = tobiko.retry(timeout=60, interval=10)
     for _ in retry:
+        pod_name = pods[0].name()
         try:
-            if int(re.search(r'wsrep_cluster_size\s+(\d+)', oc.selector(
-                'pod/openstack-cell1-galera-0').object().execute(
-                ['sh', '-c', """mysql -u root --password=12345678 -e 'SHOW
-                STATUS LIKE "wsrep_cluster_size"'"""], container_name='galera'
-                ).out()).group(1)) == galera_pods_num:
-                LOG.info('all galera cells are restored')
-                return
+            cluster_size = podified.execute_in_pod(
+                pod_name, galera_cluster_size.format(passwd=pw), 'galera')
        except oc.OpenShiftPythonException:
             continue
-    return False
+        wsrep_cluster_size = int(re.search(r'wsrep_cluster_size\s+(\d+)',
+                                           cluster_size.out()).group(1))
+        if wsrep_cluster_size == len(pods):
+            LOG.info('all galera cells are restored')
+            return
+    raise RestoredException()
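
For readers following the refactor, a small sketch of how the two shell probes defined above are interpreted (not part of the commit; the sample mysql and ps output lines are assumptions about typical formatting):

import re

# verify_all_galera_cells_restored() extracts the cluster size from the
# 'SHOW STATUS LIKE "wsrep_cluster_size"' output
sample_status = 'Variable_name\tValue\nwsrep_cluster_size\t3\n'
size = int(re.search(r'wsrep_cluster_size\s+(\d+)', sample_status).group(1))
assert size == 3  # all three galera pods rejoined the cluster

# remove_one_grastate_galera() treats a bare 'gcomm://' (no address list)
# in the mysqld command line as the marker of the bootstrap node
sample_ps = 'Tue Jul  9 15:52:48 2024 /usr/libexec/mysqld --wsrep-cluster-address=gcomm://'
assert re.search(r'wsrep-cluster-address=gcomm://(?:\s|$)', sample_ps)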

View File

@@ -31,3 +31,13 @@ class DisruptPodifiedNodesTest(testtools.TestCase):
         # HealthCheck.run_before()
         cloud_disruptions.kill_all_galera_services()
         # HealthCheck.run_after()
+
+    def test_remove_all_grastate_galera(self):
+        # HealthCheck.run_before()
+        cloud_disruptions.remove_all_grastate_galera()
+        # HealthCheck.run_before()
+
+    def test_remove_one_grastate_galera(self):
+        # HealthCheck.run_before()
+        cloud_disruptions.remove_one_grastate_galera()
+        # HealthCheck.run_after()