From 35c34ea980625a4946ae4e52d416056ca54209fb Mon Sep 17 00:00:00 2001
From: dabarzil
Date: Tue, 9 Jul 2024 15:52:48 +0300
Subject: [PATCH] Add tests that remove galera grastate.dat for OSP18 and
 refactor

Change-Id: I041821ab4229b9f20b16def5ca95170908785e5a
---
 tobiko/podified/__init__.py                   |   3 +
 tobiko/podified/_openshift.py                 |  18 ++
 .../faults/podified/ha/cloud_disruptions.py   | 200 ++++++++++--------
 .../faults/podified/ha/test_cloud_recovery.py |  10 +
 4 files changed, 143 insertions(+), 88 deletions(-)

diff --git a/tobiko/podified/__init__.py b/tobiko/podified/__init__.py
index a48d493c3..c2141545e 100644
--- a/tobiko/podified/__init__.py
+++ b/tobiko/podified/__init__.py
@@ -32,5 +32,8 @@ skip_if_podified = _topology.skip_if_podified
 get_dataplane_ssh_keypair = _openshift.get_dataplane_ssh_keypair
 has_podified_cp = _openshift.has_podified_cp
 get_ovndbcluter = _openshift.get_ovndbcluter
+execute_in_pod = _openshift.execute_in_pod
+get_openstack_config_secret = _openshift.get_openstack_config_secret
+get_pods = _openshift.get_pods
 
 get_container_runtime_name = containers.get_container_runtime_name
diff --git a/tobiko/podified/_openshift.py b/tobiko/podified/_openshift.py
index e3c7bb46d..ec4de1aa5 100644
--- a/tobiko/podified/_openshift.py
+++ b/tobiko/podified/_openshift.py
@@ -27,6 +27,7 @@ LOG = log.getLogger(__name__)
 OSP_CONTROLPLANE = 'openstackcontrolplane'
 OSP_DP_NODESET = 'openstackdataplanenodeset'
 DP_SSH_SECRET_NAME = 'secret/dataplane-ansible-ssh-private-key-secret'
+OSP_CONFIG_SECRET_NAME = 'secret/openstack-config-secret'
 OSP_BM_HOST = 'baremetalhost.metal3.io'
 OSP_BM_CRD = 'baremetalhosts.metal3.io'
 OCP_WORKERS = 'nodes'
@@ -149,6 +150,17 @@ def get_dataplane_ssh_keypair():
     return private_key, public_key
 
 
+def get_openstack_config_secret():
+    with oc.project(CONF.tobiko.podified.osp_project):
+        try:
+            secret_object = oc.selector(OSP_CONFIG_SECRET_NAME).object()
+        except oc.OpenShiftPythonException as err:
+            LOG.info("Error while trying to get openstack config secret "
+                     f"{OSP_CONFIG_SECRET_NAME} from OpenShift. Error: {err}")
+            return
+    return secret_object.as_dict()
+
+
 def list_edpm_nodes():
     nodes = []
     with oc.project(CONF.tobiko.podified.osp_project):
@@ -398,3 +410,9 @@ def _check_ping_results(pod):
     else:
         tobiko.fail("Failed to copy ping log files from the POD "
                     f"{pod.name()}. Error: {cp.err}")
+
+
+def execute_in_pod(pod_name, command, container_name=None):
+    with oc.project(CONF.tobiko.podified.osp_project):
+        return oc.selector(f'pod/{pod_name}').object().execute(
+            ['sh', '-c', command], container_name=container_name)
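Note: a minimal usage sketch of the two helpers added above; the pod name
below is illustrative and not part of this patch:

    from tobiko import podified

    # run a shell command inside a pod's container in the configured OSP
    # project; .out()/.err() expose the command's stdout/stderr
    result = podified.execute_in_pod(
        'openstack-cell1-galera-0',     # hypothetical pod name
        'mysql --version',
        container_name='galera')
    print(result.out())

    # returns the secret as a dict, or None if the lookup failed
    secret = podified.get_openstack_config_secret()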
diff --git a/tobiko/tests/faults/podified/ha/cloud_disruptions.py b/tobiko/tests/faults/podified/ha/cloud_disruptions.py
index 24855b1d8..982a3effe 100644
--- a/tobiko/tests/faults/podified/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/podified/ha/cloud_disruptions.py
@@ -15,8 +15,9 @@
 #    under the License.
 
 from __future__ import absolute_import
 
+import functools
 import re
-import time
+import random
 
 import openshift_client as oc
 from oslo_log import log
@@ -24,115 +25,138 @@ from oslo_log import log
 import tobiko
 from tobiko import config
 from tobiko import podified
-# from tobiko.openstack import glance
-# from tobiko.openstack import keystone
-# from tobiko.openstack import neutron
-# from tobiko.openstack import stacks
-# from tobiko.openstack import tests
-# from tobiko.openstack import topology
-# from tobiko.tests.faults.ha import test_cloud_recovery
-# from tobiko.shell import ping
-# from tobiko.shell import sh
+from tobiko.openstack import keystone
 
 
 CONF = config.CONF
 LOG = log.getLogger(__name__)
 
+kill_galera = 'kill -9 $(pidof mysqld)'
+rm_grastate = 'rm -rf /var/lib/mysql/grastate.dat'
+galera_cluster_size = 'mysql -u root --password={passwd} -e \'SHOW STATUS ' \
+                      'LIKE "wsrep_cluster_size"\''
+check_bootstrap = """
+ps -eo lstart,cmd | grep -v grep |
+grep wsrep-cluster-address=gcomm://
+"""
+
+
+class GaleraBootstrapException(tobiko.TobikoException):
+    message = "Bootstrap has not been activated"
+
+
+class DownException(tobiko.TobikoException):
+    message = "The resource is not down"
+
+
+class RestoredException(tobiko.TobikoException):
+    message = "The resource is not restored"
+
+
+@functools.lru_cache()
+def get_galera_pods_per_service(galera_service):
+    # cache the results to avoid sending an oc request on every call
+    return podified.get_pods({'service': galera_service})
+
 
-@podified.skip_if_not_podified
 def kill_all_galera_services():
     """kill all galera processes, check in pacemaker it is down"""
-    galera_pods_num = sum(
-        1 for node_name in oc.selector('nodes').qnames()
-        for pod_obj in oc.get_pods_by_node(node_name)
-        if 'cell1-galera' in pod_obj.fqname()
-    )
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'kill -9 $(pidof mysqld)'],
-                     container_name='galera')
-        LOG.info('kill galera cell-{}'.format(i))
-
-    retry = tobiko.retry(timeout=30, interval=5)
-    for _ in retry:
-        try:
-            # checks wsrep cluster size is now unavailable
-            result = oc.selector('pod/openstack-cell1-galera-0').object(
-                ).execute(['sh', '-c', """mysql -u root --password=12345678
-                -e 'SHOW STATUS LIKE "wsrep_cluster_size"'"""])
-            # Capture and filter the error output
-            error_output = result.err()
-            non_error_message = """
-            Defaulted container "galera" out of: galera,
-            mysql-bootstrap (init)\n"""
-            filtered_err_output = error_output.replace(non_error_message, '')
-            if not filtered_err_output.strip():
-                continue
-        except oc.OpenShiftPythonException:
-            LOG.info('all galera cells down')
-            break
-    time.sleep(60)
-    for _ in retry:
-        try:
-            if int(re.search(r'wsrep_cluster_size\s+(\d+)', oc.selector(
-                'pod/openstack-cell1-galera-0').object().execute(
-                ['sh', '-c', """mysql -u root --password=12345678 -e 'SHOW
-                STATUS LIKE "wsrep_cluster_size"'"""], container_name='galera'
-                ).out()).group(1)) == galera_pods_num:
-                LOG.info('all galera cells are restored')
-                return
-        except oc.OpenShiftPythonException:
-            continue
-    return False
+    # run the disruption separately for each galera service:
+    # 'openstack-cell1-galera' and 'openstack-galera'
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
 
 
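+# Note: get_galera_pods_per_service above is memoized with
+# functools.lru_cache, so each service's pod list is fetched from
+# OpenShift once and reused by all the disruption helpers; this assumes
+# the galera pods keep their names across restarts (StatefulSet
+# behaviour), so the cached names stay valid after a kill.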
-@podified.skip_if_not_podified
 def remove_all_grastate_galera():
     """shut down galera properly, remove all grastate"""
-    galera_pods_num = sum(
-        1 for node_name in oc.selector('nodes').qnames()
-        for pod_obj in oc.get_pods_by_node(node_name)
-        if 'cell1-galera' in pod_obj.fqname()
-    )
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'rm -rf /var/lib/mysql/grastate.dat '],
-                     container_name='galera')
-        LOG.info('delete grastate.dat cell-{}'.format(i))
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'kill -9 $(pidof mysqld)'],
-                     container_name='galera')
-        LOG.info('kill galera cell-{}'.format(i))
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        for pod in pods:
+            remove_grastate(pod.name())
+        # TODO: change kill to graceful stop / scale down
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
+
+
+def remove_one_grastate_galera():
+    """shut down galera properly,
+    delete /var/lib/mysql/grastate.dat in a random node,
+    check that bootstrap is done from a node with grastate"""
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        random_pod_name = random.choice(pods).name()
+        remove_grastate(random_pod_name)
+        # TODO: change kill to graceful stop / scale down
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
+        # a bare gcomm:// (no peer addresses) means that bootstrap was done
+        # from this node
+        bootstrap = podified.execute_in_pod(
+            random_pod_name, check_bootstrap, 'galera').out().strip()
+        if len(pods) > 1:
+            if re.search(r'wsrep-cluster-address=gcomm://(?:\s|$)', bootstrap
+                         ) is None:
+                raise GaleraBootstrapException()
+        elif re.search(r'wsrep-cluster-address=gcomm://', bootstrap) is None:
+            raise GaleraBootstrapException()
+        last_date = re.findall(r"\w{,3}\s*\w{,3}\s*\d{,2}\s*\d{,2}:\d{,2}"
                               r":\d{,2}\s*\d{4}", bootstrap)[-1]
+        LOG.info(f'last bootstrap was at {last_date}')
+
+
+def remove_grastate(pod_name):
+    podified.execute_in_pod(pod_name, rm_grastate, 'galera')
+    LOG.info(f'grastate.dat removed from {pod_name}')
+
+
+def kill_all_galera_pods(galera_pods):
+    for pod in galera_pods:
+        podified.execute_in_pod(pod.name(), kill_galera, 'galera')
+        LOG.info(f'killed galera pod {pod.name()}')
+
+
+def check_all_galera_cells_down(pod_name):
+    pw = keystone.keystone_credentials().password
 
     retry = tobiko.retry(timeout=30, interval=5)
     for _ in retry:
         try:
-            # checks wsrep cluster size is now unavailable
-            result = oc.selector('pod/openstack-cell1-galera-0').object(
-                ).execute(['sh', '-c', """mysql -u root --password=12345678
-                -e 'SHOW STATUS LIKE "wsrep_cluster_size"'"""])
-            # Capture and filter the error output
-            error_output = result.err()
-            non_error_message = """
-            Defaulted container "galera" out of: galera,
-            mysql-bootstrap (init)\n"""
+            # if mysql still answers, the cluster is not down yet
+            cluster_size = podified.execute_in_pod(
+                pod_name, galera_cluster_size.format(passwd=pw), 'galera')
+            error_output = cluster_size.err()
+            non_error_message = "Defaulted container \"galera\" out of: "\
+                "galera, mysql-bootstrap (init)\n"
             filtered_err_output = error_output.replace(non_error_message, '')
             if not filtered_err_output.strip():
                 continue
         except oc.OpenShiftPythonException:
             LOG.info('all galera cells down')
-            break
-    time.sleep(60)
+            return
+    raise DownException()
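+
+
+# Note (illustration): the bootstrap check in remove_one_grastate_galera
+# relies on mysqld's command line; a bare '--wsrep-cluster-address=gcomm://'
+# (nothing after '//') marks the bootstrap node, for example:
+#   re.search(r'wsrep-cluster-address=gcomm://(?:\s|$)',
+#             'mysqld --wsrep-cluster-address=gcomm://')       # matches
+#   re.search(r'wsrep-cluster-address=gcomm://(?:\s|$)',
+#             'mysqld --wsrep-cluster-address=gcomm://n0,n1')  # None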
+
+
+def verify_all_galera_cells_restored(pods):
+    pw = keystone.keystone_credentials().password
+    pod_name = pods[0].name()
+
+    retry = tobiko.retry(timeout=60, interval=10)
     for _ in retry:
         try:
-            if int(re.search(r'wsrep_cluster_size\s+(\d+)', oc.selector(
-                'pod/openstack-cell1-galera-0').object().execute(
-                ['sh', '-c', """mysql -u root --password=12345678 -e 'SHOW
-                STATUS LIKE "wsrep_cluster_size"'"""], container_name='galera'
-                ).out()).group(1)) == galera_pods_num:
-                LOG.info('all galera cells are restored')
-                return
+            cluster_size = podified.execute_in_pod(
+                pod_name, galera_cluster_size.format(passwd=pw), 'galera')
         except oc.OpenShiftPythonException:
             continue
-    return False
+
+        # guard against partial output while the cluster is still coming up
+        match = re.search(r'wsrep_cluster_size\s+(\d+)', cluster_size.out())
+        if match is not None and int(match.group(1)) == len(pods):
+            LOG.info('all galera cells are restored')
+            return
+
+    raise RestoredException()
diff --git a/tobiko/tests/faults/podified/ha/test_cloud_recovery.py b/tobiko/tests/faults/podified/ha/test_cloud_recovery.py
index 5f8e85caa..b81fb6b9b 100644
--- a/tobiko/tests/faults/podified/ha/test_cloud_recovery.py
+++ b/tobiko/tests/faults/podified/ha/test_cloud_recovery.py
@@ -31,3 +31,13 @@ class DisruptPodifiedNodesTest(testtools.TestCase):
         # HealthCheck.run_before()
         cloud_disruptions.kill_all_galera_services()
         # HealthCheck.run_after()
+
+    def test_remove_all_grastate_galera(self):
+        # HealthCheck.run_before()
+        cloud_disruptions.remove_all_grastate_galera()
+        # HealthCheck.run_after()
+
+    def test_remove_one_grastate_galera(self):
+        # HealthCheck.run_before()
+        cloud_disruptions.remove_one_grastate_galera()
+        # HealthCheck.run_after()
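Note: verify_all_galera_cells_restored parses the output of
mysql -e 'SHOW STATUS LIKE "wsrep_cluster_size"'; a minimal sketch of that
parsing, with sample output invented for the example:

    import re

    sample = 'Variable_name\tValue\nwsrep_cluster_size\t3\n'
    match = re.search(r'wsrep_cluster_size\s+(\d+)', sample)
    assert match is not None and int(match.group(1)) == 3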