diff --git a/tobiko/tests/faults/ha/cloud_disruptions.py b/tobiko/tests/faults/ha/cloud_disruptions.py
index 92d8af109..14d3d2adf 100644
--- a/tobiko/tests/faults/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/ha/cloud_disruptions.py
@@ -37,6 +37,8 @@ network_disruption = """
 undisrupt_network = """
 sudo iptables-restore /home/heat-admin/working.iptables.rules
 """
+ban_resource = "sudo pcs resource ban {} {}"
+clear_resource = "sudo pcs resource clear {} {}"
 ovn_db_pcs_resource_restart = "sudo pcs resource restart ovn-dbs-bundle"
 kill_rabbit = "sudo kill -9 $(pgrep beam.smp)"
 kill_galera = "sudo kill -9 $(pgrep mysqld)"
@@ -59,6 +61,10 @@ class PcsEnableException(tobiko.TobikoException):
     message = "pcs enable didn't start the resource"
 
 
+class PcsBanException(tobiko.TobikoException):
+    message = "the resource wasn't banned"
+
+
 class GaleraBoostrapException(tobiko.TobikoException):
     message = "Bootstrap should not be done from node without grastate.dat"
 
@@ -337,6 +343,54 @@ def reset_ovndb_master_container():
             container_host=node)
 
 
+def ban_master_resource(resource_type, resource_name):
+    """Ban the master resource and check that it stopped
+    and that another node was promoted to master."""
+    nodes = topology.list_openstack_nodes(group='controller')
+    resource_num = pacemaker.PacemakerResourcesStatus().resource_count(
+        resource_type)
+    # repeat the process for all nodes except one
+    for i in range(resource_num - 1):
+        master_node_name = pacemaker.get_resource_master_node(resource_type)
+        if not master_node_name:
+            break
+        else:
+            sh.execute(ban_resource.format(resource_name,
+                                           master_node_name[0]),
+                       ssh_client=topology.get_openstack_node(
+                           master_node_name[0]).ssh_client)
+        for attempt_number in range(60):
+            try:
+                # check that the resource was banned and a slave was promoted
+                if pacemaker.PacemakerResourcesStatus().resource_banned(
+                        resource_type):
+                    # only one (master) resource left: the test succeeded
+                    if i == resource_num - 2:
+                        clear_resources(nodes, resource_name)
+                        time.sleep(10)
+                        return
+                    # more resources left, so repeat the process
+                    else:
+                        time.sleep(20)
+                        break
+                else:
+                    raise PcsBanException()
+            except PcsBanException:
+                LOG.info('Retrying pacemaker resource checks attempt '
+                         '{} of 60'.format(attempt_number))
+                time.sleep(1)
+    clear_resources(nodes, resource_name)
+    tobiko.fail('The resource {} was not promoted to master'.format(
+        resource_name))
+
+
+def clear_resources(nodes, resource_name):
+    for cont in range(len(nodes)):
+        sh.execute(clear_resource.format(resource_name,
+                                         'controller-{}'.format(cont)),
+                   ssh_client=nodes[0].ssh_client)
+
+
 def kill_rabbitmq_service():
     """kill a rabbit process on a random controller,
     check in pacemaker it is down"""
diff --git a/tobiko/tests/faults/ha/test_cloud_recovery.py b/tobiko/tests/faults/ha/test_cloud_recovery.py
index 3981ca344..8d3e8f803 100644
--- a/tobiko/tests/faults/ha/test_cloud_recovery.py
+++ b/tobiko/tests/faults/ha/test_cloud_recovery.py
@@ -245,6 +245,17 @@ class DisruptTripleoNodesTest(testtools.TestCase):
         LOG.info("Verify can create VMs after controllers power on...")
         tests.test_server_creation()
 
+    def test_ban_redis(self):
+        OvercloudHealthCheck.run_before()
+        cloud_disruptions.ban_master_resource("(ocf::heartbeat:redis):",
+                                              "redis-bundle")
+        OvercloudHealthCheck.run_after()
+
+    def test_ban_ovs(self):
+        OvercloudHealthCheck.run_before()
+        cloud_disruptions.ban_master_resource("(ocf::ovn:ovndb-servers):",
+                                              "ovn-dbs-bundle")
+        OvercloudHealthCheck.run_after()

    # [..]
    # more tests to follow
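For context, the ban/verify/clear cycle that the new ban_master_resource() helper performs boils down to the standalone sketch below. The pcs commands are the real ones used by the patch, but run_on_node() and the find_master()/is_banned() callbacks are hypothetical stand-ins for tobiko's sh.execute()/pacemaker plumbing, not the project's actual API.

```python
import time

BAN_CMD = "sudo pcs resource ban {resource} {node}"
CLEAR_CMD = "sudo pcs resource clear {resource} {node}"


def run_on_node(node, command):
    # Stand-in for sh.execute(..., ssh_client=...): just print the command.
    print("[{}] {}".format(node, command))


def ban_until_single_master(resource, nodes, find_master, is_banned,
                            retries=60, wait=1.0):
    """Ban the current master on every node but one, checking after each ban
    that the resource stopped there and another replica was promoted."""
    try:
        for _ in range(len(nodes) - 1):
            master = find_master(resource)
            if master is None:
                break
            run_on_node(master, BAN_CMD.format(resource=resource, node=master))
            for _attempt in range(retries):
                if is_banned(resource):
                    break          # banned and a new master was promoted
                time.sleep(wait)
            else:
                raise RuntimeError(
                    "{}: no replica was promoted to master".format(resource))
    finally:
        # Always drop the ban constraints so the cluster returns to normal.
        for node in nodes:
            run_on_node(node, CLEAR_CMD.format(resource=resource, node=node))
```

Clearing the ban constraints in a finally block is one way to guarantee the cluster is restored even when promotion never happens; the patch achieves the same effect by calling clear_resources() on both the success and the failure paths.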
diff --git a/tobiko/tripleo/pacemaker.py b/tobiko/tripleo/pacemaker.py
index 19247f5c5..4902ee6dc 100644
--- a/tobiko/tripleo/pacemaker.py
+++ b/tobiko/tripleo/pacemaker.py
@@ -224,6 +224,27 @@ class PacemakerResourcesStatus(object):
         # exhausted all retries
         tobiko.fail('pcs cluster is not in a healthy state')
 
+    def resource_banned(self, resource_type):
+        self.pcs_df = get_pcs_resources_table()
+        nodes_num = self.resource_count(resource_type)
+        master_num = self.resource_count_in_state(
+            resource_type, "Master")
+        slave_num = self.resource_count_in_state(
+            resource_type, "Slave")
+        banned_num = self.resource_count_in_state(
+            resource_type, "Stopped")
+        if (master_num == 1 and banned_num >= 1) and\
+                (slave_num == nodes_num - master_num - banned_num):
+            LOG.info("""pcs status check: resource has been banned successfully
+            and another one has been promoted""")
+            return True
+        elif banned_num == 0:
+            LOG.info("pcs status check: resource has not been banned")
+            return False
+        else:
+            LOG.info("pcs status check: resource is not in a healthy state")
+            return False
+
 
 def get_overcloud_nodes_running_pcs_resource(resource=None,
                                              resource_type=None,
@@ -253,7 +274,7 @@ def get_overcloud_nodes_running_pcs_resource(resource=None,
 
 
 def get_resource_master_node(resource_type=None):
-    get_overcloud_nodes_running_pcs_resource(
+    return get_overcloud_nodes_running_pcs_resource(
         resource_type=resource_type,
         resource_state='Master')
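The new resource_banned() check is essentially a state count over the pcs status table. A rough standalone illustration of that counting, assuming get_pcs_resources_table() returns a pandas DataFrame with one row per bundle replica (the column names used here are assumptions for the example, not necessarily tobiko's exact schema):

```python
import pandas as pd

# Toy stand-in for get_pcs_resources_table(): one row per bundle replica.
pcs_df = pd.DataFrame({
    "resource": ["(ocf::heartbeat:redis):"] * 3,
    "resource_state": ["Master", "Slave", "Stopped"],
})


def count_in_state(df, resource_type, state):
    # Number of replicas of the given resource currently in the given state.
    return len(df[(df["resource"] == resource_type)
                  & (df["resource_state"] == state)])


resource = "(ocf::heartbeat:redis):"
total = len(pcs_df[pcs_df["resource"] == resource])
masters = count_in_state(pcs_df, resource, "Master")
slaves = count_in_state(pcs_df, resource, "Slave")
stopped = count_in_state(pcs_df, resource, "Stopped")

# Banned successfully: exactly one master left, at least one replica stopped,
# and every remaining replica is a slave.
banned_ok = (masters == 1 and stopped >= 1
             and slaves == total - masters - stopped)
print(banned_ok)  # True for the sample table above
```

With the three-replica sample above the check passes: one master remains, one replica is stopped (banned), and the remaining replica is a slave, which mirrors the condition resource_banned() evaluates after each ban.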