From 204fcc27b4747119a72df03f22110215e049225f Mon Sep 17 00:00:00 2001 From: Roman Safronov Date: Wed, 17 Jul 2024 18:03:43 +0300 Subject: [PATCH] L3HA tests fixes and improvements Since some tests can be disruptive and can break some important nodes, an option for using only safe nodes was introduced. With this option enabled, tests that perform disruptive actions will not use OSP controller nodes for this but will retry creating objects for testing on a safer node. The option is enabled by default. In order to make test_l3ha_reboot_node work properly, discover_nodes() was changed to support rescanning nodes when some nodes are shut off. test_l3ha_bring_down_interface was changed to use node_tenant_interface defined in the config file instead of letting the test discover it. Changed the default value of hypervisor_host to match the current value used on environments deployed by ci-framework. Moved all power operations functions to a separate class that disruptive tests should inherit from. Fixed incorrect calls of find_host_virsh_name by adding the required parameter, and made some other fixes in the power operations functions. 
Change-Id: Iafe9b0210972e0c59529c4a09edbe11d68f91514 --- .../common/utils.py | 1 + whitebox_neutron_tempest_plugin/config.py | 12 +- .../tests/scenario/base.py | 192 ++++++++++-------- .../tests/scenario/test_internal_dns.py | 12 +- .../tests/scenario/test_l3ha_ovn.py | 73 ++++--- 5 files changed, 173 insertions(+), 117 deletions(-) diff --git a/whitebox_neutron_tempest_plugin/common/utils.py b/whitebox_neutron_tempest_plugin/common/utils.py index 462d89c..81c2ef5 100644 --- a/whitebox_neutron_tempest_plugin/common/utils.py +++ b/whitebox_neutron_tempest_plugin/common/utils.py @@ -228,6 +228,7 @@ def remote_service_action(client, service, action): action=action, service=service) LOG.debug("Running '{}' on {}".format(cmd, client.host)) client.exec_command(cmd) + time.sleep(5) def retry_on_assert_fail(max_retries): diff --git a/whitebox_neutron_tempest_plugin/config.py b/whitebox_neutron_tempest_plugin/config.py index 287b4e7..b29265c 100644 --- a/whitebox_neutron_tempest_plugin/config.py +++ b/whitebox_neutron_tempest_plugin/config.py @@ -85,6 +85,12 @@ WhiteboxNeutronPluginOptions = [ 'power operations, like shutdown/startup openstack nodes.' 'These tests can be disruptive and not suitable for some ' 'environments.'), + cfg.BoolOpt('avoid_disrupting_controllers', + default=True, + help='Whether to avoid executing disruptive operations on ' + 'OSP controller or OCP master/worker nodes which can be ' + 'hosting neutron routers. With this option test will try ' + 'to use other nodes.'), cfg.IntOpt('broadcast_receivers_count', default=2, help='How many receivers to use in broadcast tests. Default ' @@ -142,8 +148,8 @@ WhiteboxNeutronPluginOptions = [ 'Can be used for tweaking iperf in case default value ' 'does not work for some reason, e.g. MTU issues.'), cfg.StrOpt('node_tenant_interface', - default='eth1', - help='Physical interface of a node that intended to pass tenant' + default='vlan22', + help='Interface of a node that intended to pass tenant' 'network traffic. 
Note: currently only environments with ' 'the same name of the tenant interface are supported'), cfg.IntOpt('capture_timeout', @@ -154,7 +160,7 @@ WhiteboxNeutronPluginOptions = [ 'remote process in case test or connection was ' 'interrupted unexpectedly.'), cfg.StrOpt('hypervisor_host', - default='hypervisor-1', + default='hypervisor-1.utility', help='Hypervisor host for podified environment based on libvirt' 'virtual machines, typically deployed by ci-framework: ' 'https://github.com/openstack-k8s-operators/ci-framework'), diff --git a/whitebox_neutron_tempest_plugin/tests/scenario/base.py b/whitebox_neutron_tempest_plugin/tests/scenario/base.py index 728459a..48e04b6 100644 --- a/whitebox_neutron_tempest_plugin/tests/scenario/base.py +++ b/whitebox_neutron_tempest_plugin/tests/scenario/base.py @@ -39,6 +39,7 @@ from tempest import config from tempest.lib.common import fixed_network from tempest.lib.common.utils import data_utils from tempest.lib.common.utils import test_utils +from tempest.lib import exceptions as lib_exceptions from whitebox_neutron_tempest_plugin.common import constants as local_constants from whitebox_neutron_tempest_plugin.common import tcpdump_capture as capture @@ -297,7 +298,7 @@ class BaseTempestWhiteboxTestCase(base.BaseTempestTestCase): host['hypervisor_hostname'] for host in cls.os_admin.hv_client.list_hypervisors()['hypervisors']] if WB_CONF.openstack_type == 'podified': - cls.nodes = cls.get_podified_nodes_data() + cls.nodes_data = cls.get_podified_nodes_data() with open(WB_CONF.proxy_host_key_file, 'r') as file: id_cifw_key = file.read() cls.keys_data = { @@ -306,17 +307,20 @@ class BaseTempestWhiteboxTestCase(base.BaseTempestTestCase): devscripts_key = cls.proxy_host_client.exec_command( 'cat ' + cls.ocp_nodes_key_path) cls.keys_data['devscripts_key'] = devscripts_key - for host in cls.nodes: + for host in cls.nodes_data: client = cls.get_node_client( host=host['ip'], username=host['user'], pkey=f"{cls.keys_data[host['key']]}") 
host['client'] = client else: - cls.nodes = [] + cls.nodes_data = [] for host in set([*l3_agent_hosts, *compute_hosts]): - cls.nodes.append( + cls.nodes_data.append( {'ip': host, 'client': cls.get_node_client(host)}) - for host in cls.nodes: + cls.nodes = [] + for host in cls.nodes_data: + if not local_utils.host_responds_to_ping(host['ip']): + continue host['name'] = cls.get_full_name( host['client'].exec_command('hostname').strip()) host['full_name'] = cls.get_full_name( @@ -328,6 +332,13 @@ class BaseTempestWhiteboxTestCase(base.BaseTempestTestCase): host['is_controller'] = (output.strip() != "") host['is_compute'] = (host['full_name'] in compute_hosts) host['is_networker'] = (host['full_name'] in l3_agent_hosts) + cls.nodes.append(host) + + @classmethod + def get_standalone_networkers(cls): + return [node['name'] for node in cls.nodes + if node['is_networker'] and not + (node['is_controller'] or node['is_compute'])] @classmethod def is_setup_single_node(cls): @@ -908,70 +919,6 @@ class BaseTempestWhiteboxTestCase(base.BaseTempestTestCase): LOG.debug('Command for resource creation succeeded') return _id - @classmethod - def find_host_virsh_name(cls, host): - cmd = ("timeout 10 ssh {} sudo virsh list --name | grep -w {}").format( - WB_CONF.hypervisor_host, host) - return cls.proxy_host_client.exec_command(cmd).strip() - - @classmethod - def is_host_state_is_shut_off(cls, host): - cmd = ("timeout 10 ssh {} virsh list --state-shutoff | grep -w {} " - "|| true".format(WB_CONF.hypervisor_host, host)) - output = cls.proxy_host_client.exec_command(cmd) - return True if host in output else False - - @classmethod - def is_host_loginable(cls, host): - cmd = "timeout 10 ssh {} ssh {} hostname || true".format( - WB_CONF.hypervisor_host, host) - output = cls.proxy_host_client.exec_command(cmd) - return True if host in output else False - - @classmethod - def power_off_host(cls, host): - if not WB_CONF.run_power_operations_tests: - raise cls.skipException("Power operations 
are not allowed") - cmd = "timeout 10 ssh {} sudo virsh destroy {}".format( - WB_CONF.hypervisor_host, cls.find_host_virsh_name()) - cls.proxy_host_client.exec_command(cmd) - common_utils.wait_until_true( - lambda: cls.is_host_state_is_shut_off(host), - timeout=30, sleep=5) - - @classmethod - def power_on_host(cls, host): - if not WB_CONF.run_power_operations_tests: - raise cls.skipException("Power operations are not allowed") - cmd = "timeout 10 ssh {} sudo virsh start {}".format( - WB_CONF.hypervisor_host, cls.find_host_virsh_name()) - cls.proxy_host_client.exec_command(cmd) - # TODO(rsafrono): implement and apply additional health checks - common_utils.wait_until_true( - lambda: cls.is_host_loginable(host), - timeout=120, sleep=5) - - @classmethod - def reboot_host(cls, host): - if not WB_CONF.run_power_operations_tests: - raise cls.skipException("Power operations are not allowed") - cmd = "timeout 10 ssh {} sudo virsh reboot {}".format( - WB_CONF.hypervisor_host, cls.find_host_virsh_name()) - cls.proxy_host_client.exec_command(cmd) - common_utils.wait_until_true( - lambda: cls.is_host_loginable(host), - timeout=120, sleep=5) - - def ensure_overcloud_nodes_active(self): - """Checks all openstack nodes are up, otherwise activates them. 
- """ - # get overcloud nodes info if it doesn't exist - if not hasattr(self, 'nodes'): - self.discover_nodes() - for node in self.nodes: - if self.is_host_state_is_shut_off(node['name']): - self.power_on_host(node['name']) - class BaseTempestTestCaseAdvanced(BaseTempestWhiteboxTestCase): """Base class skips test suites unless advanced image is available, @@ -1027,15 +974,19 @@ class TrafficFlowTest(BaseTempestWhiteboxTestCase): "is not configured.") cls.discover_nodes() if WB_CONF.openstack_type == 'podified': - cmd = ("{} get pods --field-selector=status.phase=Running " - "-o custom-columns=NODE:.spec.nodeName,NAME:.metadata.name " - "| grep ovn-controller-ovs".format(cls.OC)) - output = cls.proxy_host_client.exec_command( - cmd).strip().split('\n') - for line in output: - for node in cls.nodes: - if node['name'] == line.split()[0]: - node['ovs_pod'] = line.split()[1] + cls.set_ovs_pods_for_nodes() + + @classmethod + def set_ovs_pods_for_nodes(cls): + cmd = ("{} get pods --field-selector=status.phase=Running " + "-o custom-columns=NODE:.spec.nodeName,NAME:.metadata.name " + "-l service=ovn-controller-ovs".format(cls.OC)) + output = cls.proxy_host_client.exec_command( + cmd).strip().splitlines() + for line in output: + for node in cls.nodes: + if node['name'] == line.split()[0]: + node['ovs_pod'] = line.split()[1] def _start_captures(self, filters, interface=None): def get_interface(client): @@ -1057,9 +1008,9 @@ class TrafficFlowTest(BaseTempestWhiteboxTestCase): return ','.join(interfaces) for node in self.nodes: - if not(node['is_controller'] or - node['is_compute'] or - node['is_networker']): + if not (node['is_controller'] or + node['is_compute'] or + node['is_networker']): LOG.debug('Traffic is not captured on node %s because it is ' 'not: controller, compute, networker', node['name']) continue @@ -1357,6 +1308,85 @@ class BaseTempestTestCaseOvn(BaseTempestWhiteboxTestCase): self.assertEqual(output, '') +class 
BaseDisruptiveTempestTestCase(BaseTempestWhiteboxTestCase): + @classmethod + def resource_setup(cls): + super(BaseDisruptiveTempestTestCase, cls).resource_setup() + try: + cls.proxy_host_client.exec_command( + "timeout 10 ssh {} virsh list".format(WB_CONF.hypervisor_host)) + except lib_exceptions.SSHExecCommandFailed: + raise cls.skipException( + "No access to virsh tool on hypervisor node. Please make sure " + "that hypervisor_host is configured properly and/or virsh " + "is deployed there.") + + @classmethod + def find_host_virsh_name(cls, host): + cmd = ("timeout 10 ssh {} sudo virsh list --all --name " + "| grep -w {}").format( + WB_CONF.hypervisor_host, host) + return cls.proxy_host_client.exec_command(cmd).strip() + + @classmethod + def is_host_state_is_shut_off(cls, host): + cmd = ("timeout 10 ssh {} virsh list --state-shutoff | grep -w {} " + "|| true".format(WB_CONF.hypervisor_host, host)) + output = cls.proxy_host_client.exec_command(cmd) + return True if host in output else False + + @classmethod + def is_host_loginable(cls, host): + cmd = "timeout 10 ssh {} ssh {} hostname || true".format( + WB_CONF.hypervisor_host, host) + output = cls.proxy_host_client.exec_command(cmd) + return True if host in output else False + + @classmethod + def power_off_host(cls, host): + if not WB_CONF.run_power_operations_tests: + raise cls.skipException("Power operations are not allowed") + cmd = "timeout 10 ssh {} sudo virsh destroy {}".format( + WB_CONF.hypervisor_host, cls.find_host_virsh_name(host)) + cls.proxy_host_client.exec_command(cmd) + common_utils.wait_until_true( + lambda: cls.is_host_state_is_shut_off(host), + timeout=30, sleep=5) + + @classmethod + def power_on_host(cls, host): + if not WB_CONF.run_power_operations_tests: + raise cls.skipException("Power operations are not allowed") + cmd = "timeout 10 ssh {} sudo virsh start {}".format( + WB_CONF.hypervisor_host, cls.find_host_virsh_name(host)) + cls.proxy_host_client.exec_command(cmd) + # TODO(rsafrono): 
implement and apply additional health checks + common_utils.wait_until_true( + lambda: cls.is_host_loginable(host), + timeout=120, sleep=5) + + @classmethod + def reboot_host(cls, host): + if not WB_CONF.run_power_operations_tests: + raise cls.skipException("Power operations are not allowed") + cmd = "timeout 10 ssh {} sudo virsh reboot {}".format( + WB_CONF.hypervisor_host, cls.find_host_virsh_name(host)) + cls.proxy_host_client.exec_command(cmd) + common_utils.wait_until_true( + lambda: cls.is_host_loginable(host), + timeout=120, sleep=5) + + def ensure_overcloud_nodes_active(self): + """Checks all openstack nodes are up, otherwise activates them. + """ + hosts = self.proxy_host_client.exec_command( + "timeout 10 ssh {} sudo virsh list --all --name".format( + WB_CONF.hypervisor_host)).strip().split() + for host in hosts: + if self.is_host_state_is_shut_off(host): + self.power_on_host(host) + + # user_data_cmd is used to generate a VLAN interface on VM instances with PF # ports user_data_cmd = """ diff --git a/whitebox_neutron_tempest_plugin/tests/scenario/test_internal_dns.py b/whitebox_neutron_tempest_plugin/tests/scenario/test_internal_dns.py index 86152de..a2f1556 100644 --- a/whitebox_neutron_tempest_plugin/tests/scenario/test_internal_dns.py +++ b/whitebox_neutron_tempest_plugin/tests/scenario/test_internal_dns.py @@ -13,7 +13,6 @@ # License for the specific language governing permissions and limitations # under the License. import re -import testtools from neutron_lib import constants as lib_constants from neutron_tempest_plugin.common import ssh @@ -290,7 +289,8 @@ class InternalDNSInterruptionsTestOvn(InternalDNSBaseOvn): class InternalDNSInterruptionsAdvancedTestOvn( InternalDNSBaseOvn, - base.BaseTempestTestCaseAdvanced): + base.BaseTempestTestCaseAdvanced, + base.BaseDisruptiveTempestTestCase): """Tests internal DNS capabilities with interruptions in overcloud, on advanced image only. 
""" @@ -318,8 +318,6 @@ class InternalDNSInterruptionsAdvancedTestOvn( @decorators.attr(type='slow') @utils.requires_ext(extension="dns-integration", service="network") - @testtools.skipIf(WB_CONF.openstack_type == 'podified', - 'Not yet adapted for podified environment') @decorators.idempotent_id('e6c5dbea-d704-4cda-bb92-a5bfd0aa1bb2') def test_ovn_dns_name_after_networker_reboot(self): """Tests that OpenStack port, guest VM and OVN NB database have correct @@ -355,7 +353,13 @@ class InternalDNSInterruptionsAdvancedTestOvn( vm_1['fip'] = self.create_floatingip(port=dns_port) vm_1['ssh_client'] = self._create_ssh_client( vm_1['fip']['floating_ip_address']) + self._get_router_and_nodes_info() + if self.get_node_setting(self.router_gateway_chassis, 'is_controller'): + raise self.skipException( + "The test currently does not support a required action " + "when gateway chassis is on a node with OSP control plane " + "services rather than on a standalone networker node.") # soft shutdown master networker node self.power_off_host(self.router_gateway_chassis) # validate hostname (dns-name) using API, guest VM, diff --git a/whitebox_neutron_tempest_plugin/tests/scenario/test_l3ha_ovn.py b/whitebox_neutron_tempest_plugin/tests/scenario/test_l3ha_ovn.py index 0cfe243..27ecc82 100644 --- a/whitebox_neutron_tempest_plugin/tests/scenario/test_l3ha_ovn.py +++ b/whitebox_neutron_tempest_plugin/tests/scenario/test_l3ha_ovn.py @@ -34,14 +34,12 @@ WB_CONF = config.CONF.whitebox_neutron_plugin_options LOG = log.getLogger(__name__) -@testtools.skipIf(WB_CONF.openstack_type == 'podified', - 'Not yet adapted for podified environment') -class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn): +class L3haOvnCommon(base.TrafficFlowTest, base.BaseTempestTestCaseOvn): credentials = ['primary', 'admin'] @classmethod def resource_setup(cls): - super(L3haOvnTest, cls).resource_setup() + super(L3haOvnCommon, cls).resource_setup() cls.setup_api_microversion_fixture( 
compute_microversion='2.74') @@ -95,17 +93,34 @@ class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn): "when gateway chassis is on controller.") def _setup(self): - router = self.create_router_by_client() - self.router_port = self.os_admin.network_client.list_ports( - device_id=router['id'], - device_owner=lib_constants.DEVICE_OWNER_ROUTER_GW)['ports'][0] - self.chassis_list = self.get_router_gateway_chassis_list( - self.router_port['id']) - self._validate_gateway_chassis(self.chassis_list[0]) - chassis_name = self.get_router_gateway_chassis_by_id( - self.chassis_list[0]) - LOG.debug("router chassis name = {}".format(chassis_name)) + def create_router_candidate(): + router = self.create_router_by_client() + self.router_port = self.os_admin.network_client.list_ports( + device_id=router['id'], + device_owner=lib_constants.DEVICE_OWNER_ROUTER_GW)['ports'][0] + self.chassis_list = self.get_router_gateway_chassis_list( + self.router_port['id']) + chassis_name = self.get_router_gateway_chassis_by_id( + self.chassis_list[0]) + LOG.debug("router chassis name = {}".format(chassis_name)) + return router, chassis_name + if (WB_CONF.avoid_disrupting_controllers and + self.get_standalone_networkers()): + attempts = 5 + controller_nodes = [node['name'] for node in self.nodes + if node['is_controller']] + for i in range(1, attempts): + LOG.debug("Router creation attempt {}".format(i)) + router, chassis_name = create_router_candidate() + if chassis_name in controller_nodes: + continue + else: + break + else: + router, chassis_name = create_router_candidate() + + self._validate_gateway_chassis(self.chassis_list[0]) # Since we are going to spawn VMs with 'host' option which # is available only for admin user, we create security group # and keypair also as admin @@ -160,9 +175,16 @@ class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn): self.ignore_outbound = False self.verify_routing_via_chassis(self.chassis_list[0]) - 
@testtools.skipUnless(WB_CONF.run_power_operations_tests, - "run_power_operations_tests conf value is not " - "enabled.") + def refresh_nodes_data(self): + self.discover_nodes() + if WB_CONF.openstack_type == 'podified': + self.set_ovs_pods_for_nodes() + + +@testtools.skipUnless(WB_CONF.run_power_operations_tests, + "run_power_operations_tests conf value is not " + "enabled.") +class L3haOvnDisruptiveTest(L3haOvnCommon, base.BaseDisruptiveTempestTestCase): @decorators.idempotent_id('cf47a5e3-35cb-423c-84af-4cc6d389cfbd') @decorators.attr(type='slow') def test_l3ha_reboot_node(self): @@ -197,13 +219,15 @@ class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn): gateway_node = self.router_gateway_chassis self.power_off_host(gateway_node) - self.discover_nodes() + self.refresh_nodes_data() self.verify_routing_via_chassis(self.chassis_list[1]) self.power_on_host(gateway_node) - self.discover_nodes() + self.refresh_nodes_data() self.verify_routing_via_chassis(self.chassis_list[0]) + +class L3haOvnTest(L3haOvnCommon): @decorators.idempotent_id('f8fe1f69-a87f-41d8-ac6e-ed7905438338') @decorators.attr(type='slow') def test_l3ha_bring_down_interface(self): @@ -234,16 +258,7 @@ class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn): """ self._setup() node_client = self.find_node_client(self.router_gateway_chassis) - # We need to find a physical interface that is passing tenant traffic - bridge = node_client.exec_command( - "sudo ovs-vsctl get open . 
external_ids:ovn-bridge-mappings | " - r"sed 's/^\".*tenant:\(.*\).*\"$/\1/'").rstrip() - physical_interfaces = node_client.exec_command( - "find /sys/class/net -type l -not -lname '*virtual*' " - "-printf '%f\n'").rstrip().split('\n') - bridge_interfaces = node_client.exec_command( - "sudo ovs-vsctl list-ifaces " + bridge).rstrip().split('\n') - interface = (set(physical_interfaces) & set(bridge_interfaces)).pop() + interface = WB_CONF.node_tenant_interface self.addCleanup( utils.interface_state_set, node_client, interface, constants.STATE_UP)