L3HA tests fixes and improvements

Since some tests can be disruptive and can break important nodes,
an option for using only safe nodes (avoid_disrupting_controllers)
was introduced. With this option enabled, tests that perform
disruptive actions will not use OSP controller nodes for them and
will instead retry creating the test objects until they land on a
safer node. The option is enabled by default.

In order to make test_l3ha_reboot_node work properly,
discover_nodes() was changed to support rescanning nodes while
some of them are shut off.

test_l3ha_bring_down_interface was changed to use the
node_tenant_interface value defined in the config file instead of
letting the test discover it.

Changed default value for hypervisor_host to match the
current value used on environments deployed by ci-framework.

Moved all power operation functions to a separate class that
disruptive tests should inherit from.

Fixed incorrect calls of find_host_virsh_name by adding the
required parameter, and made some other fixes in the power
operation functions.

Change-Id: Iafe9b0210972e0c59529c4a09edbe11d68f91514
This commit is contained in:
Roman Safronov 2024-07-17 18:03:43 +03:00
parent 17351a7865
commit 204fcc27b4
5 changed files with 173 additions and 117 deletions

View File

@ -228,6 +228,7 @@ def remote_service_action(client, service, action):
action=action, service=service)
LOG.debug("Running '{}' on {}".format(cmd, client.host))
client.exec_command(cmd)
time.sleep(5)
def retry_on_assert_fail(max_retries):

View File

@ -85,6 +85,12 @@ WhiteboxNeutronPluginOptions = [
'power operations, like shutdown/startup openstack nodes.'
'These tests can be disruptive and not suitable for some '
'environments.'),
cfg.BoolOpt('avoid_disrupting_controllers',
default=True,
help='Whether to avoid executing disruptive operations on '
'OSP controller or OCP master/worker nodes which can be '
'hosting neutron routers. With this option test will try '
'to use other nodes.'),
cfg.IntOpt('broadcast_receivers_count',
default=2,
help='How many receivers to use in broadcast tests. Default '
@ -142,8 +148,8 @@ WhiteboxNeutronPluginOptions = [
'Can be used for tweaking iperf in case default value '
'does not work for some reason, e.g. MTU issues.'),
cfg.StrOpt('node_tenant_interface',
default='eth1',
help='Physical interface of a node that intended to pass tenant'
default='vlan22',
help='Interface of a node that intended to pass tenant'
'network traffic. Note: currently only environments with '
'the same name of the tenant interface are supported'),
cfg.IntOpt('capture_timeout',
@ -154,7 +160,7 @@ WhiteboxNeutronPluginOptions = [
'remote process in case test or connection was '
'interrupted unexpectedly.'),
cfg.StrOpt('hypervisor_host',
default='hypervisor-1',
default='hypervisor-1.utility',
help='Hypervisor host for podified environment based on libvirt'
'virtual machines, typically deployed by ci-framework: '
'https://github.com/openstack-k8s-operators/ci-framework'),

View File

@ -39,6 +39,7 @@ from tempest import config
from tempest.lib.common import fixed_network
from tempest.lib.common.utils import data_utils
from tempest.lib.common.utils import test_utils
from tempest.lib import exceptions as lib_exceptions
from whitebox_neutron_tempest_plugin.common import constants as local_constants
from whitebox_neutron_tempest_plugin.common import tcpdump_capture as capture
@ -297,7 +298,7 @@ class BaseTempestWhiteboxTestCase(base.BaseTempestTestCase):
host['hypervisor_hostname'] for host
in cls.os_admin.hv_client.list_hypervisors()['hypervisors']]
if WB_CONF.openstack_type == 'podified':
cls.nodes = cls.get_podified_nodes_data()
cls.nodes_data = cls.get_podified_nodes_data()
with open(WB_CONF.proxy_host_key_file, 'r') as file:
id_cifw_key = file.read()
cls.keys_data = {
@ -306,17 +307,20 @@ class BaseTempestWhiteboxTestCase(base.BaseTempestTestCase):
devscripts_key = cls.proxy_host_client.exec_command(
'cat ' + cls.ocp_nodes_key_path)
cls.keys_data['devscripts_key'] = devscripts_key
for host in cls.nodes:
for host in cls.nodes_data:
client = cls.get_node_client(
host=host['ip'], username=host['user'],
pkey=f"{cls.keys_data[host['key']]}")
host['client'] = client
else:
cls.nodes = []
cls.nodes_data = []
for host in set([*l3_agent_hosts, *compute_hosts]):
cls.nodes.append(
cls.nodes_data.append(
{'ip': host, 'client': cls.get_node_client(host)})
for host in cls.nodes:
cls.nodes = []
for host in cls.nodes_data:
if not local_utils.host_responds_to_ping(host['ip']):
continue
host['name'] = cls.get_full_name(
host['client'].exec_command('hostname').strip())
host['full_name'] = cls.get_full_name(
@ -328,6 +332,13 @@ class BaseTempestWhiteboxTestCase(base.BaseTempestTestCase):
host['is_controller'] = (output.strip() != "")
host['is_compute'] = (host['full_name'] in compute_hosts)
host['is_networker'] = (host['full_name'] in l3_agent_hosts)
cls.nodes.append(host)
@classmethod
def get_standalone_networkers(cls):
    """Return the names of nodes that act only as networkers.

    A node from ``cls.nodes`` qualifies when its 'is_networker' flag is
    set while both its 'is_controller' and 'is_compute' flags are unset.

    :returns: list of 'name' values of the qualifying nodes, in the
              order they appear in ``cls.nodes``.
    """
    standalone = []
    for candidate in cls.nodes:
        if not candidate['is_networker']:
            continue
        if candidate['is_controller'] or candidate['is_compute']:
            continue
        standalone.append(candidate['name'])
    return standalone
@classmethod
def is_setup_single_node(cls):
@ -908,70 +919,6 @@ class BaseTempestWhiteboxTestCase(base.BaseTempestTestCase):
LOG.debug('Command for resource creation succeeded')
return _id
@classmethod
def find_host_virsh_name(cls, host):
cmd = ("timeout 10 ssh {} sudo virsh list --name | grep -w {}").format(
WB_CONF.hypervisor_host, host)
return cls.proxy_host_client.exec_command(cmd).strip()
@classmethod
def is_host_state_is_shut_off(cls, host):
cmd = ("timeout 10 ssh {} virsh list --state-shutoff | grep -w {} "
"|| true".format(WB_CONF.hypervisor_host, host))
output = cls.proxy_host_client.exec_command(cmd)
return True if host in output else False
@classmethod
def is_host_loginable(cls, host):
cmd = "timeout 10 ssh {} ssh {} hostname || true".format(
WB_CONF.hypervisor_host, host)
output = cls.proxy_host_client.exec_command(cmd)
return True if host in output else False
@classmethod
def power_off_host(cls, host):
if not WB_CONF.run_power_operations_tests:
raise cls.skipException("Power operations are not allowed")
cmd = "timeout 10 ssh {} sudo virsh destroy {}".format(
WB_CONF.hypervisor_host, cls.find_host_virsh_name())
cls.proxy_host_client.exec_command(cmd)
common_utils.wait_until_true(
lambda: cls.is_host_state_is_shut_off(host),
timeout=30, sleep=5)
@classmethod
def power_on_host(cls, host):
if not WB_CONF.run_power_operations_tests:
raise cls.skipException("Power operations are not allowed")
cmd = "timeout 10 ssh {} sudo virsh start {}".format(
WB_CONF.hypervisor_host, cls.find_host_virsh_name())
cls.proxy_host_client.exec_command(cmd)
# TODO(rsafrono): implement and apply additional health checks
common_utils.wait_until_true(
lambda: cls.is_host_loginable(host),
timeout=120, sleep=5)
@classmethod
def reboot_host(cls, host):
if not WB_CONF.run_power_operations_tests:
raise cls.skipException("Power operations are not allowed")
cmd = "timeout 10 ssh {} sudo virsh reboot {}".format(
WB_CONF.hypervisor_host, cls.find_host_virsh_name())
cls.proxy_host_client.exec_command(cmd)
common_utils.wait_until_true(
lambda: cls.is_host_loginable(host),
timeout=120, sleep=5)
def ensure_overcloud_nodes_active(self):
"""Checks all openstack nodes are up, otherwise activates them.
"""
# get overcloud nodes info if it doesn't exist
if not hasattr(self, 'nodes'):
self.discover_nodes()
for node in self.nodes:
if self.is_host_state_is_shut_off(node['name']):
self.power_on_host(node['name'])
class BaseTempestTestCaseAdvanced(BaseTempestWhiteboxTestCase):
"""Base class skips test suites unless advanced image is available,
@ -1027,15 +974,19 @@ class TrafficFlowTest(BaseTempestWhiteboxTestCase):
"is not configured.")
cls.discover_nodes()
if WB_CONF.openstack_type == 'podified':
cmd = ("{} get pods --field-selector=status.phase=Running "
"-o custom-columns=NODE:.spec.nodeName,NAME:.metadata.name "
"| grep ovn-controller-ovs".format(cls.OC))
output = cls.proxy_host_client.exec_command(
cmd).strip().split('\n')
for line in output:
for node in cls.nodes:
if node['name'] == line.split()[0]:
node['ovs_pod'] = line.split()[1]
cls.set_ovs_pods_for_nodes()
@classmethod
def set_ovs_pods_for_nodes(cls):
    """Store the running ovn-controller-ovs pod name on each known node.

    Runs ``<cls.OC> get pods`` through ``cls.proxy_host_client``, asking
    for NODE/NAME columns of running pods labelled
    ``service=ovn-controller-ovs``, and sets ``node['ovs_pod']`` on every
    entry of ``cls.nodes`` whose 'name' equals the first column of an
    output line. Nodes without a matching line are left untouched.
    """
    cmd = ("{} get pods --field-selector=status.phase=Running "
           "-o custom-columns=NODE:.spec.nodeName,NAME:.metadata.name "
           "-l service=ovn-controller-ovs".format(cls.OC))
    output = cls.proxy_host_client.exec_command(cmd)

    # Parse the output once. Blank or single-column lines are skipped
    # instead of raising IndexError. When a node appears more than once,
    # the last line wins, same as with the previous nested loops.
    pod_by_node = {}
    for line in output.strip().splitlines():
        fields = line.split()
        if len(fields) < 2:
            continue
        pod_by_node[fields[0]] = fields[1]

    for node in cls.nodes:
        pod = pod_by_node.get(node.get('name'))
        if pod is not None:
            node['ovs_pod'] = pod
def _start_captures(self, filters, interface=None):
def get_interface(client):
@ -1057,9 +1008,9 @@ class TrafficFlowTest(BaseTempestWhiteboxTestCase):
return ','.join(interfaces)
for node in self.nodes:
if not(node['is_controller'] or
node['is_compute'] or
node['is_networker']):
if not (node['is_controller'] or
node['is_compute'] or
node['is_networker']):
LOG.debug('Traffic is not captured on node %s because it is '
'not: controller, compute, networker', node['name'])
continue
@ -1357,6 +1308,85 @@ class BaseTempestTestCaseOvn(BaseTempestWhiteboxTestCase):
self.assertEqual(output, '')
class BaseDisruptiveTempestTestCase(BaseTempestWhiteboxTestCase):
    """Base class for tests that power nodes off, on, or reboot them.

    All power operations are executed with ``virsh`` on
    ``WB_CONF.hypervisor_host``, reached over ssh from the proxy host.
    """

    @classmethod
    def resource_setup(cls):
        """Skip the whole class when virsh is not reachable."""
        super(BaseDisruptiveTempestTestCase, cls).resource_setup()
        try:
            cls.proxy_host_client.exec_command(
                "timeout 10 ssh {} virsh list".format(WB_CONF.hypervisor_host))
        except lib_exceptions.SSHExecCommandFailed:
            raise cls.skipException(
                "No access to virsh tool on hypervisor node. Please make sure "
                "that hypervisor_host is configured properly and/or virsh "
                "is deployed there.")

    @classmethod
    def _skip_unless_power_operations_allowed(cls):
        """Raise skipException unless run_power_operations_tests is set."""
        if not WB_CONF.run_power_operations_tests:
            raise cls.skipException("Power operations are not allowed")

    @classmethod
    def find_host_virsh_name(cls, host):
        """Return the virsh domain name line(s) matching ``host``.

        Lists all domains (running or not) by name and filters them with
        ``grep -w``; the stripped command output is returned as is.
        """
        # NOTE(review): presumably a non-matching grep makes exec_command
        # raise and several matches yield a multi-line string - confirm.
        cmd = ("timeout 10 ssh {} sudo virsh list --all --name "
               "| grep -w {}").format(
            WB_CONF.hypervisor_host, host)
        return cls.proxy_host_client.exec_command(cmd).strip()

    @classmethod
    def is_host_state_is_shut_off(cls, host):
        """Return True if ``host`` shows up among shut-off virsh domains."""
        # '|| true' keeps the remote exit status zero when grep finds
        # nothing, so an absent host yields empty output, not an error.
        # NOTE(review): virsh is run without sudo here while
        # find_host_virsh_name uses sudo - confirm both see the same
        # domains.
        cmd = ("timeout 10 ssh {} virsh list --state-shutoff | grep -w {} "
               "|| true".format(WB_CONF.hypervisor_host, host))
        output = cls.proxy_host_client.exec_command(cmd)
        return host in output

    @classmethod
    def is_host_loginable(cls, host):
        """Return True if running ``hostname`` on ``host`` over ssh works.

        The check passes when ``host`` is a substring of the command output.
        """
        cmd = "timeout 10 ssh {} ssh {} hostname || true".format(
            WB_CONF.hypervisor_host, host)
        output = cls.proxy_host_client.exec_command(cmd)
        return host in output

    @classmethod
    def power_off_host(cls, host):
        """Destroy the virsh domain of ``host`` and wait until it is off.

        :raises: skipException when power operations are disabled in config.
        """
        cls._skip_unless_power_operations_allowed()
        cmd = "timeout 10 ssh {} sudo virsh destroy {}".format(
            WB_CONF.hypervisor_host, cls.find_host_virsh_name(host))
        cls.proxy_host_client.exec_command(cmd)
        common_utils.wait_until_true(
            lambda: cls.is_host_state_is_shut_off(host),
            timeout=30, sleep=5)

    @classmethod
    def power_on_host(cls, host):
        """Start the virsh domain of ``host`` and wait until ssh works.

        :raises: skipException when power operations are disabled in config.
        """
        cls._skip_unless_power_operations_allowed()
        cmd = "timeout 10 ssh {} sudo virsh start {}".format(
            WB_CONF.hypervisor_host, cls.find_host_virsh_name(host))
        cls.proxy_host_client.exec_command(cmd)
        # TODO(rsafrono): implement and apply additional health checks
        common_utils.wait_until_true(
            lambda: cls.is_host_loginable(host),
            timeout=120, sleep=5)

    @classmethod
    def reboot_host(cls, host):
        """Reboot the virsh domain of ``host`` and wait until ssh works.

        :raises: skipException when power operations are disabled in config.
        """
        cls._skip_unless_power_operations_allowed()
        cmd = "timeout 10 ssh {} sudo virsh reboot {}".format(
            WB_CONF.hypervisor_host, cls.find_host_virsh_name(host))
        cls.proxy_host_client.exec_command(cmd)
        common_utils.wait_until_true(
            lambda: cls.is_host_loginable(host),
            timeout=120, sleep=5)

    def ensure_overcloud_nodes_active(self):
        """Checks all openstack nodes are up, otherwise activates them.

        Every domain name reported by ``virsh list --all --name`` on the
        hypervisor is checked, and each one found shut off is powered on.
        """
        # NOTE(review): names used here are virsh domain names, and
        # power_on_host later waits for an ssh login to that same name -
        # confirm domain names are resolvable hostnames on the hypervisor.
        hosts = self.proxy_host_client.exec_command(
            "timeout 10 ssh {} sudo virsh list --all --name".format(
                WB_CONF.hypervisor_host)).strip().split()
        for host in hosts:
            if self.is_host_state_is_shut_off(host):
                self.power_on_host(host)
# user_data_cmd is used to generate a VLAN interface on VM instances with PF
# ports
user_data_cmd = """

View File

@ -13,7 +13,6 @@
# License for the specific language governing permissions and limitations
# under the License.
import re
import testtools
from neutron_lib import constants as lib_constants
from neutron_tempest_plugin.common import ssh
@ -290,7 +289,8 @@ class InternalDNSInterruptionsTestOvn(InternalDNSBaseOvn):
class InternalDNSInterruptionsAdvancedTestOvn(
InternalDNSBaseOvn,
base.BaseTempestTestCaseAdvanced):
base.BaseTempestTestCaseAdvanced,
base.BaseDisruptiveTempestTestCase):
"""Tests internal DNS capabilities with interruptions in overcloud,
on advanced image only.
"""
@ -318,8 +318,6 @@ class InternalDNSInterruptionsAdvancedTestOvn(
@decorators.attr(type='slow')
@utils.requires_ext(extension="dns-integration", service="network")
@testtools.skipIf(WB_CONF.openstack_type == 'podified',
'Not yet adapted for podified environment')
@decorators.idempotent_id('e6c5dbea-d704-4cda-bb92-a5bfd0aa1bb2')
def test_ovn_dns_name_after_networker_reboot(self):
"""Tests that OpenStack port, guest VM and OVN NB database have correct
@ -355,7 +353,13 @@ class InternalDNSInterruptionsAdvancedTestOvn(
vm_1['fip'] = self.create_floatingip(port=dns_port)
vm_1['ssh_client'] = self._create_ssh_client(
vm_1['fip']['floating_ip_address'])
self._get_router_and_nodes_info()
if self.get_node_setting(self.router_gateway_chassis, 'is_controller'):
raise self.skipException(
"The test currently does not support a required action "
"when gateway chassis is on a node with OSP control plane "
"services rather than on a standalone networker node.")
# soft shutdown master networker node
self.power_off_host(self.router_gateway_chassis)
# validate hostname (dns-name) using API, guest VM,

View File

@ -34,14 +34,12 @@ WB_CONF = config.CONF.whitebox_neutron_plugin_options
LOG = log.getLogger(__name__)
@testtools.skipIf(WB_CONF.openstack_type == 'podified',
'Not yet adapted for podified environment')
class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn):
class L3haOvnCommon(base.TrafficFlowTest, base.BaseTempestTestCaseOvn):
credentials = ['primary', 'admin']
@classmethod
def resource_setup(cls):
super(L3haOvnTest, cls).resource_setup()
super(L3haOvnCommon, cls).resource_setup()
cls.setup_api_microversion_fixture(
compute_microversion='2.74')
@ -95,17 +93,34 @@ class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn):
"when gateway chassis is on controller.")
def _setup(self):
router = self.create_router_by_client()
self.router_port = self.os_admin.network_client.list_ports(
device_id=router['id'],
device_owner=lib_constants.DEVICE_OWNER_ROUTER_GW)['ports'][0]
self.chassis_list = self.get_router_gateway_chassis_list(
self.router_port['id'])
self._validate_gateway_chassis(self.chassis_list[0])
chassis_name = self.get_router_gateway_chassis_by_id(
self.chassis_list[0])
LOG.debug("router chassis name = {}".format(chassis_name))
def create_router_candidate():
router = self.create_router_by_client()
self.router_port = self.os_admin.network_client.list_ports(
device_id=router['id'],
device_owner=lib_constants.DEVICE_OWNER_ROUTER_GW)['ports'][0]
self.chassis_list = self.get_router_gateway_chassis_list(
self.router_port['id'])
chassis_name = self.get_router_gateway_chassis_by_id(
self.chassis_list[0])
LOG.debug("router chassis name = {}".format(chassis_name))
return router, chassis_name
if (WB_CONF.avoid_disrupting_controllers and
self.get_standalone_networkers()):
attempts = 5
controller_nodes = [node['name'] for node in self.nodes
if node['is_controller']]
for i in range(1, attempts):
LOG.debug("Router creation attempt {}".format(i))
router, chassis_name = create_router_candidate()
if chassis_name in controller_nodes:
continue
else:
break
else:
router, chassis_name = create_router_candidate()
self._validate_gateway_chassis(self.chassis_list[0])
# Since we are going to spawn VMs with 'host' option which
# is available only for admin user, we create security group
# and keypair also as admin
@ -160,9 +175,16 @@ class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn):
self.ignore_outbound = False
self.verify_routing_via_chassis(self.chassis_list[0])
@testtools.skipUnless(WB_CONF.run_power_operations_tests,
"run_power_operations_tests conf value is not "
"enabled.")
def refresh_nodes_data(self):
    """Rediscover the nodes and, on podified setups, their OVS pods."""
    self.discover_nodes()
    if WB_CONF.openstack_type != 'podified':
        return
    # Per-node 'ovs_pod' values are only set for podified deployments.
    self.set_ovs_pods_for_nodes()
@testtools.skipUnless(WB_CONF.run_power_operations_tests,
"run_power_operations_tests conf value is not "
"enabled.")
class L3haOvnDisruptiveTest(L3haOvnCommon, base.BaseDisruptiveTempestTestCase):
@decorators.idempotent_id('cf47a5e3-35cb-423c-84af-4cc6d389cfbd')
@decorators.attr(type='slow')
def test_l3ha_reboot_node(self):
@ -197,13 +219,15 @@ class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn):
gateway_node = self.router_gateway_chassis
self.power_off_host(gateway_node)
self.discover_nodes()
self.refresh_nodes_data()
self.verify_routing_via_chassis(self.chassis_list[1])
self.power_on_host(gateway_node)
self.discover_nodes()
self.refresh_nodes_data()
self.verify_routing_via_chassis(self.chassis_list[0])
class L3haOvnTest(L3haOvnCommon):
@decorators.idempotent_id('f8fe1f69-a87f-41d8-ac6e-ed7905438338')
@decorators.attr(type='slow')
def test_l3ha_bring_down_interface(self):
@ -234,16 +258,7 @@ class L3haOvnTest(base.TrafficFlowTest, base.BaseTempestTestCaseOvn):
"""
self._setup()
node_client = self.find_node_client(self.router_gateway_chassis)
# We need to find a physical interface that is passing tenant traffic
bridge = node_client.exec_command(
"sudo ovs-vsctl get open . external_ids:ovn-bridge-mappings | "
r"sed 's/^\".*tenant:\(.*\).*\"$/\1/'").rstrip()
physical_interfaces = node_client.exec_command(
"find /sys/class/net -type l -not -lname '*virtual*' "
"-printf '%f\n'").rstrip().split('\n')
bridge_interfaces = node_client.exec_command(
"sudo ovs-vsctl list-ifaces " + bridge).rstrip().split('\n')
interface = (set(physical_interfaces) & set(bridge_interfaces)).pop()
interface = WB_CONF.node_tenant_interface
self.addCleanup(
utils.interface_state_set, node_client, interface,
constants.STATE_UP)