diff --git a/tobiko/tests/faults/ha/cloud_disruptions.py b/tobiko/tests/faults/ha/cloud_disruptions.py index 44bf9b795..a684a6a37 100644 --- a/tobiko/tests/faults/ha/cloud_disruptions.py +++ b/tobiko/tests/faults/ha/cloud_disruptions.py @@ -4,6 +4,7 @@ from __future__ import absolute_import import time import random import urllib.parse +import re from oslo_log import log @@ -35,15 +36,28 @@ network_disruption = """ undisrupt_network = """ sudo iptables-restore /root/working.iptables.rules """ -ovn_db_pcs_resource_restart = """sudo pcs resource restart ovn-dbs-bundle""" -kill_rabbit = """sudo kill -9 $(pgrep beam.smp)""" -kill_galera = """sudo kill -9 $(pgrep mysqld)""" +ovn_db_pcs_resource_restart = "sudo pcs resource restart ovn-dbs-bundle" +kill_rabbit = "sudo kill -9 $(pgrep beam.smp)" +kill_galera = "sudo kill -9 $(pgrep mysqld)" +remove_grastate = "sudo rm -rf /var/lib/mysql/grastate.dat" +check_grastate = """ps ax | grep -v grep| +grep wsrep-cluster-address=gcomm://""" +disable_galera = "sudo pcs resource disable galera --wait=60" +enable_galera = "sudo pcs resource enable galera --wait=90" +disable_haproxy = "sudo pcs resource disable haproxy-bundle --wait=30" +enable_haproxy = "sudo pcs resource enable haproxy-bundle --wait=60" -def get_node(node_name): - node_name = node_name.split('.')[0] - return [node for node in topology.list_openstack_nodes() if - node.name == node_name][0] +class PcsDisableException(tobiko.TobikoException): + message = "pcs disable didn't shut down the resource" + + +class PcsEnableException(tobiko.TobikoException): + message = "pcs enable didn't start the resource" + + +class GaleraBoostrapException(tobiko.TobikoException): + message = "Bootstrap should not be done from node without grastate.dat" def network_disrupt_node(node_name, disrupt_method=network_disruption): @@ -62,7 +76,7 @@ def disrupt_node(node_name, disrupt_method=network_disruption): # container_restart # using ssh_client.connect we use a fire and forget reboot method - node = get_node(node_name) + node = tripleo_topology.get_node(node_name) node.ssh_client.connect().exec_command(disrupt_method) LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method, node.name)) @@ -76,7 +90,7 @@ def reboot_node(node_name, wait=True, reboot_method=sh.hard_reset_method): # method : method of disruption to use : reset | network_disruption # using ssh_client.connect we use a fire and forget reboot method - node = get_node(node_name) + node = tripleo_topology.get_node(node_name) sh.reboot_host(ssh_client=node.ssh_client, wait=wait, method=reboot_method) LOG.info('disrupt exec: {} on server: {}'.format(reboot_method, node.name)) @@ -310,11 +324,10 @@ def reset_ovndb_master_container(): def kill_rabbitmq_service(): """kill a rabbit process on a random controller, check in pacemaker it is down""" - if 'messaging' in topology.list_openstack_node_groups(): - group = 'messaging' + if tripleo_topology.is_composable_roles_env(): + nodes = topology.list_openstack_nodes(group='messaging') else: - group = 'controller' - nodes = topology.list_openstack_nodes(group=group) + nodes = topology.list_openstack_nodes(group='controller') node = random.choice(nodes) sh.execute(kill_rabbit, ssh_client=node.ssh_client) LOG.info('kill rabbit: {} on server: {}'.format(kill_rabbit, @@ -329,11 +342,10 @@ def kill_rabbitmq_service(): def kill_all_galera_services(): """kill all galera processes, check in pacemaker it is down""" - if 'database' in topology.list_openstack_node_groups(): - group = 'database' + if tripleo_topology.is_composable_roles_env(): + nodes = topology.list_openstack_nodes(group='database') else: - group = 'controller' - nodes = topology.list_openstack_nodes(group=group) + nodes = topology.list_openstack_nodes(group='controller') for node in nodes: sh.execute(kill_galera, ssh_client=node.ssh_client) LOG.info('kill galera: {} on server: {}'.format(kill_galera, @@ -345,6 +357,65 @@ def kill_all_galera_services(): return +def remove_all_grastate_galera(): + """shut down galera properly, + remove all grastate""" + if tripleo_topology.is_composable_roles_env(): + nodes = topology.list_openstack_nodes(group='database') + else: + nodes = topology.list_openstack_nodes(group='controller') + LOG.info('shut down galera: {} on all servers: {}'. + format(disable_galera, nodes)) + if "resource 'galera' is not running on any node" not in\ + sh.execute(disable_galera, ssh_client=nodes[0].ssh_client).stdout: + raise PcsDisableException() + for node in nodes: + sh.execute(remove_grastate, ssh_client=node.ssh_client) + LOG.info('enable back galera: {} on all servers: {}'. + format(enable_galera, nodes)) + if "resource 'galera' is master on node" not in\ + sh.execute(enable_galera, ssh_client=nodes[0].ssh_client).stdout: + raise PcsEnableException() + + +def remove_one_grastate_galera(): + """shut down galera properly, + delete /var/lib/mysql/grastate.dat in a random node, + check that bootstrap is done from a node with grastate""" + if tripleo_topology.is_composable_roles_env(): + nodes = topology.list_openstack_nodes(group='database') + else: + nodes = topology.list_openstack_nodes(group='controller') + node = random.choice(nodes) + LOG.info('disable haproxy-bunble') + if "resource 'haproxy-bundle' is not running on any node" not in\ + sh.execute(disable_haproxy, ssh_client=node.ssh_client).stdout: + raise PcsDisableException() + LOG.info('shut down galera: {} on all servers: {}'. + format(disable_galera, nodes)) + if "resource 'galera' is not running on any node" not in\ + sh.execute(disable_galera, ssh_client=node.ssh_client).stdout: + raise PcsDisableException() + LOG.info('remove grastate: {} on server: {}'.format(remove_grastate, + node.name)) + sh.execute(remove_grastate, ssh_client=node.ssh_client) + LOG.info('enable back galera: {} on all servers: {}'. + format(enable_galera, nodes)) + if "resource 'galera' is master on node" not in\ + sh.execute(enable_galera, ssh_client=node.ssh_client).stdout: + raise PcsEnableException() + LOG.info('enable haproxy-bundle') + if "resource 'haproxy-bundle' is running on node" not in\ + sh.execute(enable_haproxy, ssh_client=node.ssh_client).stdout: + raise PcsEnableException() + # gcomm:// without args means that bootstrap is done from this node + if re.search('wsrep-cluster-address=gcomm:// --', + sh.execute(check_grastate, + ssh_client=node.ssh_client).stdout) is not None: + raise GaleraBoostrapException() + return node + + def evac_failover_compute(compute_host, failover_type=sh.hard_reset_method): """disrupt a compute, to trigger it's instance-HA evacuation failover_type=hard_reset_method etc..""" diff --git a/tobiko/tests/faults/ha/test_cloud_recovery.py b/tobiko/tests/faults/ha/test_cloud_recovery.py index b43b6047a..bffc8c1af 100644 --- a/tobiko/tests/faults/ha/test_cloud_recovery.py +++ b/tobiko/tests/faults/ha/test_cloud_recovery.py @@ -130,6 +130,16 @@ class DisruptTripleoNodesTest(testtools.TestCase): cloud_disruptions.kill_all_galera_services() overcloud_health_checks() + def test_remove_all_grastate_galera(self): + overcloud_health_checks() + cloud_disruptions.remove_all_grastate_galera() + overcloud_health_checks() + + def test_remove_one_grastate_galera(self): + overcloud_health_checks() + cloud_disruptions.remove_one_grastate_galera() + overcloud_health_checks() + # [..] # more tests to follow # run health checks diff --git a/tobiko/tripleo/_topology.py b/tobiko/tripleo/_topology.py index addd3f58f..05e5fe554 100644 --- a/tobiko/tripleo/_topology.py +++ b/tobiko/tripleo/_topology.py @@ -161,3 +161,17 @@ def ip_to_hostname(oc_ip): def actual_node_groups(groups): """return only existing node groups""" return set(groups).intersection(topology.list_openstack_node_groups()) + + +def get_node(node_name): + node_name = node_name.split('.')[0] + return [node for node in topology.list_openstack_nodes() if + node.name == node_name][0] + + +def is_composable_roles_env(): + composable_nodes = ['messaging', 'database', 'networker'] + for nodes in composable_nodes: + if nodes in topology.list_openstack_node_groups(): + return True + return False