Improve faults tests' neutron agent health checks

One of the verifications that are part of the cloud health checks
executed before and after the disruptions in the faults tests was to
check that all the neutron agents are alive.
This patch adds an extra verification on top of that: the agents have to
stay consistently alive over time.

Besides that, the number of reruns of the faults test
test_controllers_shutdown is set to 0 because rerunning it makes
analysing its failures more complicated. The test is still marked as
flaky because we need to determine whether there are more issues with it.

Change-Id: I354c66453493339622f99c0d18e1ff98f9f609e0
This commit is contained in:
Eduardo Olivares 2023-03-10 08:41:20 +01:00
parent 35083361eb
commit 92a7c7275a
4 changed files with 40 additions and 17 deletions

View File

@ -22,6 +22,8 @@ from tobiko.openstack.tests import _nova
InvalidDBConnString = _neutron.InvalidDBConnString InvalidDBConnString = _neutron.InvalidDBConnString
RAFTStatusError = _neutron.RAFTStatusError RAFTStatusError = _neutron.RAFTStatusError
test_neutron_agents_are_alive = _neutron.test_neutron_agents_are_alive test_neutron_agents_are_alive = _neutron.test_neutron_agents_are_alive
test_alive_agents_are_consistent_along_time = (
_neutron.test_alive_agents_are_consistent_along_time)
test_ovn_dbs_validations = _neutron.test_ovn_dbs_validations test_ovn_dbs_validations = _neutron.test_ovn_dbs_validations
test_ovs_bridges_mac_table_size = _neutron.test_ovs_bridges_mac_table_size test_ovs_bridges_mac_table_size = _neutron.test_ovs_bridges_mac_table_size
test_ovs_namespaces_are_absent = _neutron.test_ovs_namespaces_are_absent test_ovs_namespaces_are_absent = _neutron.test_ovs_namespaces_are_absent

View File

@ -94,6 +94,41 @@ def test_neutron_agents_are_alive(timeout=420., interval=5.) \
return agents return agents
def test_alive_agents_are_consistent_along_time(previous_alive_agents=None):
    """Check that the alive neutron agents stay the same over time.

    The neutron agents are listed repeatedly (``tobiko.retry`` is called
    with ``count=5`` and ``sleep_time=5.``) and every listing is compared
    with a baseline of agents known to be alive.

    The current test case fails as soon as a listed agent is reported as not
    alive or an agent of the baseline is missing from the listing. When the
    listing only grew (new agents appeared), the whole check is restarted
    using the bigger listing as the new baseline.

    :param previous_alive_agents: optional baseline, a dict mapping agent id
        to agent. When it is None the baseline is built from the agents
        returned by ``test_neutron_agents_are_alive()``.
    """
    test_case = tobiko.get_test_case()
    if previous_alive_agents is None:
        # the following dict of agents is obtained when:
        # - the list_agents request is replied with 200
        # - the list is not empty
        # - no agents are dead
        alive_agents = {agent['id']: agent
                        for agent in test_neutron_agents_are_alive()}
    else:
        alive_agents = previous_alive_agents

    for attempt in tobiko.retry(sleep_time=5., count=5):
        agents = neutron.list_agents()
        actual = {agent['id']: agent
                  for agent in agents}

        # any dead agents? If yes, fail now
        dead_agents = agents.with_items(alive=False)
        test_case.assertEqual(
            [], dead_agents, "Some neutron agents died")

        # any agent disappeared? If yes, fail now
        # This is checked before looking for new agents: otherwise a listing
        # that gained more agents than it lost would silently become the new
        # baseline and the disappeared agents would never be reported.
        disappeared_agents = set(alive_agents) - set(actual)
        test_case.assertEqual(
            set(), disappeared_agents, 'Some agents disappeared')

        # No baseline agent is missing at this point, so a bigger listing
        # means that it is a strict superset of the baseline.
        if len(actual) > len(alive_agents):
            LOG.debug('Some new agents appeared! It seems not all the agents '
                      'had been started yet, so let\'s restart this check')
            return test_alive_agents_are_consistent_along_time(actual)

        if attempt.is_last:
            break
def ovn_dbs_vip_bindings(test_case): def ovn_dbs_vip_bindings(test_case):
ovn_conn_str = get_ovn_db_connections() ovn_conn_str = get_ovn_db_connections()
# ovn db sockets might be centrillized or distributed # ovn db sockets might be centrillized or distributed

View File

@ -45,7 +45,7 @@ def overcloud_health_checks(passive_checks_only=False,
check_pacemaker_resources_health() check_pacemaker_resources_health()
check_overcloud_processes_health() check_overcloud_processes_health()
nova.check_nova_services_health() nova.check_nova_services_health()
tests.test_neutron_agents_are_alive() tests.test_alive_agents_are_consistent_along_time()
if not passive_checks_only: if not passive_checks_only:
# create a uniq stack # create a uniq stack
check_vm_create() check_vm_create()
@ -265,7 +265,7 @@ class DisruptTripleoNodesTest(testtools.TestCase):
cloud_disruptions.request_galera_sst() cloud_disruptions.request_galera_sst()
OvercloudHealthCheck.run_after() OvercloudHealthCheck.run_after()
@pytest.mark.flaky(reruns=3, reruns_delay=60) @pytest.mark.flaky(reruns=0)
def test_controllers_shutdown(self): def test_controllers_shutdown(self):
OvercloudHealthCheck.run_before() OvercloudHealthCheck.run_before()
cloud_disruptions.test_controllers_shutdown() cloud_disruptions.test_controllers_shutdown()

View File

@ -17,8 +17,6 @@ from __future__ import absolute_import
import pytest import pytest
import testtools import testtools
import tobiko
from tobiko.openstack import neutron
from tobiko.openstack import tests from tobiko.openstack import tests
@ -29,16 +27,4 @@ class NeutronAgentTest(testtools.TestCase):
tests.test_neutron_agents_are_alive() tests.test_neutron_agents_are_alive()
def test_alive_agents_are_consistent_along_time(self): def test_alive_agents_are_consistent_along_time(self):
alive_agents = {agent['id']: agent tests.test_alive_agents_are_consistent_along_time()
for agent in tests.test_neutron_agents_are_alive()}
for attempt in tobiko.retry(sleep_time=5., count=5):
agents = neutron.list_agents()
actual = {agent['id']: agent
for agent in agents}
self.assertEqual(set(alive_agents), set(actual),
'Agents appeared or disappeared')
dead_agents = agents.with_items(alive=False)
self.assertEqual([], dead_agents,
"Neutron agent(s) no more alive")
if attempt.is_last:
break