Merge "Add first health checks to faults podified ha tests"

Zuul 2025-03-19 10:58:41 +00:00 committed by Gerrit Code Review
commit 8a3b7ea932
9 changed files with 277 additions and 245 deletions

@@ -13,6 +13,7 @@
# under the License.
from __future__ import absolute_import
from tobiko.openstack.nova import _checks
from tobiko.openstack.nova import _client
from tobiko.openstack.nova import _cloud_init
from tobiko.openstack.nova import _hypervisor
@@ -85,5 +86,12 @@ find_server_ip_address = _server.find_server_ip_address
HasServerMixin = _server.HasServerMixin
get_server_id = _server.get_server_id
list_server_ip_addresses = _server.list_server_ip_addresses
action_on_all_instances = _server.action_on_all_instances
wait_for_services_up = _service.wait_for_services_up
check_nova_services_health = _checks.check_nova_services_health
check_virsh_domains_running = _checks.check_virsh_domains_running
wait_for_all_instances_status = _checks.wait_for_all_instances_status
check_vms_ping = _checks.check_vms_ping
check_vm_evacuations = _checks.check_vm_evacuations

@@ -0,0 +1,147 @@
# Copyright (c) 2025 Red Hat, Inc.
#
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log
import tobiko
from tobiko.openstack.nova import _client
from tobiko.openstack.nova import _server
from tobiko.openstack.nova import _service
from tobiko.openstack import topology
from tobiko.shell import ping
from tobiko.shell import sh
LOG = log.getLogger(__name__)
def check_nova_services_health(timeout=600., interval=2.):
retry = tobiko.retry(timeout=timeout, interval=interval)
_service.wait_for_services_up(retry=retry)
def check_virsh_domains_running():
"""check all vms are running via virsh list command"""
for compute in topology.list_openstack_nodes(group='compute'):
hostname = sh.get_hostname(ssh_client=compute.ssh_client,
fqdn=True)
param = {'OS-EXT-SRV-ATTR:hypervisor_hostname': hostname}
vm_list_per_compute = _client.list_servers(**param)
for vm in vm_list_per_compute:
for attempt in tobiko.retry(timeout=120, interval=5):
if check_vm_running_via_virsh(compute, vm.id):
LOG.info(f"{vm.id} is running ok on {hostname}")
break
else:
msg = f"{vm.id} is not in running state on {hostname}"
if attempt.is_last:
tobiko.fail("timeout!! " + msg)
LOG.error(f"{vm.id} is not in running state on "
f"{hostname} ... Retrying")
def check_vms_ping(vm_list):
for vm in vm_list:
fip = _server.list_server_ip_addresses(vm,
address_type='floating').first
ping.ping_until_received(fip).assert_replied()
def check_vm_evacuations(vms_old=None, compute_host=None, timeout=600,
interval=2, check_no_evacuation=False):
"""check evacuation of vms
input: old vm status and expected new compute"""
for attempt in tobiko.retry(timeout=timeout, interval=interval):
failures = []
param = ({} if compute_host is None
else {'OS-EXT-SRV-ATTR:hypervisor_hostname': compute_host})
vms_new = _client.list_servers(**param)
for vm_old in vms_old or []:
old_bm_host = vm_old._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname']
new_vm_host = vms_new.with_attributes( # pylint: disable=W0212
id=vm_old.id).uniq._info[
'OS-EXT-SRV-ATTR:hypervisor_hostname']
if check_no_evacuation:
cond = bool(old_bm_host != new_vm_host)
else:
cond = bool(old_bm_host == new_vm_host)
if cond:
failures.append(
'Failed vm evacuations: {}\n\n'.format(vm_old))
if not failures:
LOG.debug(vms_old.to_string())
LOG.debug('All vms were evacuated!')
return
if attempt.is_last:
tobiko.fail(
'Timeout checking VM evacuations:\n{!s}', '\n'.join(failures))
else:
LOG.error('Failed nova evacuation:\n {}'.format(failures))
LOG.error('Retrying...')
def check_vm_running_via_virsh(topology_compute, vm_id):
"""check that a vm is in running state via virsh command,
return false if not"""
if vm_id in get_vm_uuid_list_running_via_virsh(topology_compute):
return True
else:
return False
def get_vm_uuid_list_running_via_virsh(topology_compute):
from tobiko import podified
from tobiko.tripleo import containers
from tobiko.tripleo import overcloud
get_uuid_loop = ("for i in `virsh list --name --state-running`; do "
"virsh domuuid $i; done")
containerized_libvirt_cmd = \
"{container_runtime} exec -u root {nova_libvirt} sh -c '{get_uuids}'"
if podified.has_podified_cp():
command = containerized_libvirt_cmd.format(
container_runtime=podified.CONTAINER_RUNTIME,
nova_libvirt=podified.NOVA_LIBVIRT_CONTAINER,
get_uuids=get_uuid_loop)
elif overcloud.has_overcloud():
command = containerized_libvirt_cmd.format(
container_runtime=containers.get_container_runtime_name(),
nova_libvirt=containers.get_libvirt_container_name(),
get_uuids=get_uuid_loop)
else:
command = get_uuid_loop
return sh.execute(command,
ssh_client=topology_compute.ssh_client,
sudo=True).stdout.split()
def wait_for_all_instances_status(status, timeout=None):
"""wait for all instances for a certain status or raise an exception"""
for instance in _client.list_servers():
_client.wait_for_server_status(server=instance.id, status=status,
timeout=timeout)
instance_info = 'instance {nova_instance} is {state} on {host}'.format(
nova_instance=instance.name,
state=status,
host=instance._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname'])
LOG.info(instance_info)
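For orientation, a minimal usage sketch of how these checks are meant to be consumed through the tobiko.openstack.nova re-exports added in the first hunk; the compute hostname below is a hypothetical placeholder.
from tobiko.openstack import nova

nova.check_nova_services_health()     # every nova service reports "up"
nova.check_virsh_domains_running()    # every server shows as running in virsh

# 'compute-0.example.com' stands in for a real hypervisor hostname.
vms = nova.list_servers(
    **{'OS-EXT-SRV-ATTR:hypervisor_hostname': 'compute-0.example.com'})
nova.check_vms_ping(vms)              # ping each server's floating IP

# Typically called after disrupting the compute: verify the servers moved
# away (or, with check_no_evacuation=True, that they stayed put).
nova.check_vm_evacuations(vms_old=vms,
                          compute_host='compute-0.example.com',
                          timeout=600)
nova.wait_for_all_instances_status('ACTIVE')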

@@ -136,3 +136,24 @@ def get_server_id(server: _client.ServerType) -> str:
return server
else:
return server.id
def action_on_all_instances(action):
"""try to start/stop all instances"""
if action not in ('active', 'shutoff'):
tobiko.fail(f'Wrong action on VM instances: {action}')
client_action_method = (_client.activate_server if action == 'active'
else _client.shutoff_server)
expected_vm_status = 'ACTIVE' if action == 'active' else 'SHUTOFF'
for instance in _client.list_servers():
activated_instance = client_action_method(instance)
instance_info = 'instance {nova_instance} is {state} on {host}'.format(
nova_instance=activated_instance.name,
state=activated_instance.status,
host=activated_instance._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname'])
LOG.info(instance_info)
if activated_instance.status != expected_vm_status:
tobiko.fail(instance_info)
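A short usage sketch of the new helper, mirroring the calls made elsewhere in this change through the tobiko.openstack.nova re-export:
from tobiko.openstack import nova

nova.action_on_all_instances('shutoff')   # stop every server, expect SHUTOFF
nova.action_on_all_instances('active')    # start them again, expect ACTIVE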

@@ -18,6 +18,9 @@ from tobiko.podified import _openshift
from tobiko.podified import containers
NOVA_LIBVIRT_CONTAINER = 'nova_compute'
CONTAINER_RUNTIME = 'podman'
EDPM_NODE = _topology.EDPM_NODE
OCP_WORKER = _topology.OCP_WORKER
EDPM_COMPUTE_GROUP = _openshift.EDPM_COMPUTE_GROUP
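For reference, a sketch of how these podified constants feed the containerized virsh check added in tobiko/openstack/nova/_checks.py; the expanded command in the trailing comment is illustrative only.
# Rebuilds the command produced by get_vm_uuid_list_running_via_virsh()
# when a podified control plane is detected.
get_uuid_loop = ("for i in `virsh list --name --state-running`; do "
                 "virsh domuuid $i; done")
command = ("{container_runtime} exec -u root {nova_libvirt} "
           "sh -c '{get_uuids}'").format(
    container_runtime='podman',        # podified.CONTAINER_RUNTIME
    nova_libvirt='nova_compute',       # podified.NOVA_LIBVIRT_CONTAINER
    get_uuids=get_uuid_loop)
# -> podman exec -u root nova_compute sh -c 'for i in `virsh list
#    --name --state-running`; do virsh domuuid $i; done'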

@@ -31,10 +31,14 @@ class HostnameError(tobiko.TobikoException):
HOSTNAMES_CACHE: typing.MutableMapping[typing.Optional[ssh.SSHClientFixture],
str] = weakref.WeakKeyDictionary()
HOSTNAMES_FQDN_CACHE: \
typing.MutableMapping[typing.Optional[ssh.SSHClientFixture],
str] = weakref.WeakKeyDictionary()
def get_hostname(ssh_client: ssh.SSHClientType = None,
cached=True,
fqdn=False,
**execute_params) -> str:
ssh_client = ssh.ssh_client_fixture(ssh_client)
if ssh_client is None:
@@ -42,25 +46,36 @@ def get_hostname(ssh_client: ssh.SSHClientType = None,
if cached:
try:
hostname = HOSTNAMES_CACHE[ssh_client]
if not fqdn:
hostname = HOSTNAMES_CACHE[ssh_client]
else:
hostname = HOSTNAMES_FQDN_CACHE[ssh_client]
except KeyError:
pass
else:
return hostname
hostname = ssh_hostname(ssh_client=ssh_client,
fqdn=fqdn,
**execute_params)
if cached:
HOSTNAMES_CACHE[ssh_client] = hostname
if not fqdn:
HOSTNAMES_CACHE[ssh_client] = hostname
else:
HOSTNAMES_FQDN_CACHE[ssh_client] = hostname
return hostname
def ssh_hostname(ssh_client: ssh.SSHClientFixture,
fqdn=False,
**execute_params) \
-> str:
tobiko.check_valid_type(ssh_client, ssh.SSHClientFixture)
command = 'hostname'
if fqdn:
command += ' -f'
try:
result = _execute.execute('hostname',
result = _execute.execute(command,
ssh_client=ssh_client,
**execute_params)
except _exception.ShellCommandFailed as ex:

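A brief sketch of the new FQDN-aware behaviour: the short and fully-qualified hostnames are now cached independently per SSH client (node selection below is illustrative).
from tobiko.openstack import topology
from tobiko.shell import sh

node = topology.list_openstack_nodes(group='compute')[0]

# The first call runs `hostname -f` on the node and fills
# HOSTNAMES_FQDN_CACHE; the short-name cache is left untouched.
fqdn = sh.get_hostname(ssh_client=node.ssh_client, fqdn=True)

# Cached lookups return the entry matching the requested form.
short = sh.get_hostname(ssh_client=node.ssh_client)   # plain `hostname`
assert sh.get_hostname(ssh_client=node.ssh_client, fqdn=True) == fqdn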
@@ -31,6 +31,7 @@ from tobiko import config
from tobiko.openstack import glance
from tobiko.openstack import keystone
from tobiko.openstack import neutron
from tobiko.openstack import nova
from tobiko.openstack import stacks
from tobiko.openstack import tests
from tobiko.openstack import topology
@@ -38,7 +39,6 @@ from tobiko.tests.faults.ha import test_cloud_recovery
from tobiko.shell import ping
from tobiko.shell import sh
from tobiko.tripleo import containers
from tobiko.tripleo import nova
from tobiko.tripleo import pacemaker
from tobiko.tripleo import topology as tripleo_topology
from tobiko import tripleo
@@ -760,6 +760,15 @@ def evac_failover_compute(compute_host, failover_type=sh.hard_reset_method):
disrupt_node(compute_host, disrupt_method=failover_type)
def get_random_compute_with_vms():
for compute in nova.list_hypervisors():
param = {'OS-EXT-SRV-ATTR:hypervisor_hostname':
compute.hypervisor_hostname}
vm_list_per_compute = nova.list_servers(**param)
if len(vm_list_per_compute) > 0:
return compute.hypervisor_hostname
def check_iha_evacuation(failover_type=None, vm_type=None):
"""check vms on compute host,disrupt compute host,
check all vms evacuated and pingable"""
@@ -767,36 +776,37 @@ def check_iha_evacuation(failover_type=None, vm_type=None):
LOG.info(f'Begin IHA tests iteration {iteration}')
LOG.info('create 2 vms')
tests.test_servers_creation(number_of_servers=2)
compute_host = nova.get_random_compute_with_vms_name()
vms_starting_state_df = nova.get_compute_vms_df(compute_host)
compute_host = get_random_compute_with_vms()
vms_starting_state = nova.list_servers(
**{'OS-EXT-SRV-ATTR:hypervisor_hostname': compute_host})
if vm_type == 'shutoff':
nova.stop_all_instances()
nova.action_on_all_instances('shutoff')
if vm_type == 'evac_image_vm':
evac_vm_stack = tests.test_evacuable_server_creation()
evac_vm_id = nova.get_stack_server_id(evac_vm_stack)
org_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
evac_vm_id = evac_vm_stack.server_details.id
old_nova_evac = nova.get_server(server_id=evac_vm_id)
if not vm_type == 'shutoff':
nova.check_df_vms_ping(vms_starting_state_df)
nova.check_vms_ping(vms_starting_state)
LOG.info(f'perform a failover on {compute_host}')
evac_failover_compute(compute_host, failover_type=failover_type)
test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
if vm_type == 'evac_image_vm':
nova.check_vm_evacuations(vms_df_old=org_nova_evac_df,
nova.check_vm_evacuations(vms_old=old_nova_evac,
compute_host=compute_host,
timeout=600,
check_no_evacuation=True)
# delete evacuable tagged image because it prevents
# non tagged evacuations if exists
delete_evacuable_tagged_image()
new_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
nova.check_vm_evacuations(org_nova_evac_df, new_nova_evac_df)
new_nova_evac = nova.get_server(server_id=evac_vm_id)
nova.check_vm_evacuations(old_nova_evac, new_nova_evac)
else:
nova.check_vm_evacuations(vms_df_old=vms_starting_state_df,
nova.check_vm_evacuations(vms_old=vms_starting_state,
compute_host=compute_host,
timeout=600)
LOG.info('check evac is Done')
if not vm_type == 'shutoff':
nova.check_df_vms_ping(vms_starting_state_df)
nova.check_vms_ping(vms_starting_state)
def check_iha_evacuation_evac_image_vm():

@@ -50,16 +50,15 @@ has_external_lb = CONF.tobiko.rhosp.has_external_load_balancer
def overcloud_health_checks(passive_checks_only=False,
skip_mac_table_size_test=False):
# this method will be changed in future commit
check_pacemaker_resources_health()
check_overcloud_processes_health()
nova.check_nova_services_health()
nova_osp.check_nova_services_health()
tests.test_alive_agents_are_consistent_along_time()
if not passive_checks_only:
# create a uniq stack
# create a unique stack that will be cleaned up at the end of each test
check_vm_create()
nova.start_all_instances()
nova.check_computes_vms_running_via_virsh()
nova_osp.action_on_all_instances('active')
nova_osp.check_virsh_domains_running()
containers.list_node_containers.cache_clear()
containers.assert_all_tripleo_containers_running()
containers.assert_equal_containers_state()
@@ -231,10 +230,10 @@ class DisruptTripleoNodesTest(testtools.TestCase):
hard_reset=False,
sequentially=sequentially)
# verify VM status is updated after reboot
nova.wait_for_all_instances_status('SHUTOFF')
nova_osp.wait_for_all_instances_status('SHUTOFF')
# start all VM instance
# otherwise sidecar containers will not run after computes reboot
nova.start_all_instances()
nova_osp.action_on_all_instances('active')
OvercloudHealthCheck.run_after(passive_checks_only=True)
_run_test()
@@ -247,7 +246,7 @@ class DisruptTripleoNodesTest(testtools.TestCase):
# nova.wait_for_all_instances_status('SHUTOFF')
# # start all VM instance
# # otherwise sidecar containers will not run after computes reboot
# nova.start_all_instances()
# nova_osp.action_on_all_instances('active')
# OvercloudHealthCheck.run_after(passive_checks_only=True)
@testtools.skipIf(has_external_lb, SKIP_MESSAGE_EXTLB)

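Note: the nova_osp name used above is not defined within this hunk; presumably it aliases the OpenStack nova package roughly as follows (assumption, not visible in the diff):
from tobiko.openstack import nova as nova_osp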
@@ -14,11 +14,37 @@
# License for the specific language governing permissions and limitations
from __future__ import absolute_import
from oslo_log import log
import testtools
from tobiko.tests.faults.ha import test_cloud_recovery
from tobiko.tests.faults.podified.ha import cloud_disruptions
from tobiko.openstack import tests
from tobiko import podified
from tobiko.openstack import nova
LOG = log.getLogger(__name__)
def podified_health_checks():
nova.check_nova_services_health()
tests.test_alive_agents_are_consistent_along_time()
# create a unique stack that will be cleaned up at the end of each test
# TODO(eolivare) add tests.test_server_creation_no_fip() when BGP is
# configured with expose_tenant_networks
tests.test_server_creation()
nova.action_on_all_instances('active')
nova.check_virsh_domains_running()
test_cloud_recovery.octavia_health_checks()
class PodifiedCloudHealthCheck(test_cloud_recovery.OvercloudHealthCheck):
def setup_fixture(self):
# run validations
LOG.info("Start executing Podified health checks.")
podified_health_checks()
LOG.info("Podified health checks successfully executed.")
@podified.skip_if_not_podified
@@ -27,17 +53,20 @@ class DisruptPodifiedNodesTest(testtools.TestCase):
disruptive_action: a function that runs some
disruptive scenario on a node"""
def test_0vercloud_health_check(self):
PodifiedCloudHealthCheck.run_before()
def test_kill_all_galera_services(self):
# HealthCheck.run_before()
PodifiedCloudHealthCheck.run_before()
cloud_disruptions.kill_all_galera_services()
# HealthCheck.run_after()
PodifiedCloudHealthCheck.run_after()
def test_remove_all_grastate_galera(self):
# HealthCheck.run_before()
PodifiedCloudHealthCheck.run_before()
cloud_disruptions.remove_all_grastate_galera()
# HealthCheck.run_before()
PodifiedCloudHealthCheck.run_after()
def test_remove_one_grastate_galera(self):
# HealthCheck.run_before()
PodifiedCloudHealthCheck.run_before()
cloud_disruptions.remove_one_grastate_galera()
# HealthCheck.run_after()
PodifiedCloudHealthCheck.run_after()

@@ -1,13 +1,24 @@
# Copyright (c) 2025 Red Hat, Inc.
#
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import
import time
import typing # noqa
from functools import wraps
import netaddr
from oslo_log import log
import pandas
import tobiko
from tobiko.tripleo import overcloud
@@ -15,217 +26,6 @@ from tobiko.shell import iperf3
from tobiko.shell import ping
from tobiko.shell import sh
from tobiko.shell import ssh
from tobiko.openstack import nova
from tobiko.openstack import topology
from tobiko.tripleo import containers
LOG = log.getLogger(__name__)
def check_nova_services_health(timeout=600., interval=2.):
retry = tobiko.retry(timeout=timeout, interval=interval)
nova.wait_for_services_up(retry=retry)
def start_all_instances():
"""try to start all stopped overcloud instances"""
for instance in nova.list_servers():
activated_instance = nova.activate_server(instance)
time.sleep(3)
instance_info = 'instance {nova_instance} is {state} on {host}'.format(
nova_instance=activated_instance.name,
state=activated_instance.status,
host=activated_instance._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname'])
LOG.info(instance_info)
if activated_instance.status != 'ACTIVE':
tobiko.fail(instance_info)
def stop_all_instances():
"""try to start all stopped overcloud instances"""
for instance in nova.list_servers():
activated_instance = nova.shutoff_server(instance)
time.sleep(3)
instance_info = 'instance {nova_instance} is {state} on {host}'.format(
nova_instance=activated_instance.name,
state=activated_instance.status,
host=activated_instance._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname'])
LOG.info(instance_info)
if activated_instance.status != 'SHUTOFF':
tobiko.fail(instance_info)
def wait_for_all_instances_status(status, timeout=None):
"""wait for all instances for a certain status or raise an exception"""
for instance in nova.list_servers():
nova.wait_for_server_status(server=instance.id, status=status,
timeout=timeout)
instance_info = 'instance {nova_instance} is {state} on {host}'.format(
nova_instance=instance.name,
state=status,
host=instance._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname'])
LOG.info(instance_info)
def get_vms_table():
"""populate a dataframe with vm host,id,status"""
vms_data = [(vm._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname'], vm.id,
vm.status) for vm in nova.list_servers()]
vms_df = pandas.DataFrame(vms_data, columns=['vm_host', 'vm_id',
'vm_state'])
return vms_df
def list_computes():
"""list compute host names"""
return [compute.hypervisor_hostname for compute in nova.list_hypervisors()]
def get_compute_vms_df(compute_host):
"""input: compute hostname (can be short)
output: dataframe with vms of that host"""
return get_vms_table().query(f"vm_host=='{compute_host}'")
def get_random_compute_with_vms_name():
"""get a randomcompute holding vm/s"""
for compute in list_computes():
if not get_compute_vms_df(compute).empty:
return compute
def vm_info(vm_id, vms_df):
"""input: vm and a vms df
output: host string"""
return vms_df.query(f"vm_id == '{vm_id}'").to_string()
def vm_df(vm_id, vms_df):
"""input: vm and a vms df
output: host string"""
return vms_df.query(f"vm_id == '{vm_id}'")
def vm_floating_ip(vm_id):
"""input: vm_id
output it's floating ip"""
vm = nova.get_server(vm_id)
floating_ip = nova.list_server_ip_addresses(
vm, address_type='floating').first
return floating_ip
def check_ping_vm_fip(fip):
ping.ping_until_received(fip).assert_replied()
def check_df_vms_ping(df):
"""input: dataframe with vms_ids
try to ping all vms in df"""
for vm_id in df.vm_id.to_list():
check_ping_vm_fip(vm_floating_ip(vm_id))
def vm_location(vm_id, vms_df):
"""input: vm and a vms df
output: host string"""
return vms_df.query(f"vm_id == '{vm_id}'")['vm_host'].to_string(
index=False)
def check_vm_evacuations(vms_df_old=None, compute_host=None, timeout=600,
interval=2, check_no_evacuation=False):
"""check evacuation of vms
input: old and new vms_state_tables dfs"""
failures = []
start = time.time()
while time.time() - start < timeout:
failures = []
vms_df_new = get_compute_vms_df(compute_host)
for vm_id in vms_df_old.vm_id.to_list():
old_bm_host = vm_location(vm_id, vms_df_old)
new_vm_host = vm_location(vm_id, vms_df_new)
if check_no_evacuation:
cond = bool(old_bm_host != new_vm_host)
else:
cond = bool(old_bm_host == new_vm_host)
if cond:
failures.append(
'failed vm evacuations: {}\n\n'.format(vm_info(vm_id,
vms_df_old)))
if failures:
LOG.info('Failed nova evacuation:\n {}'.format(failures))
LOG.info('Not all nova vms evacuated ..')
LOG.info('Retrying , timeout at: {}'
.format(timeout-(time.time() - start)))
time.sleep(interval)
else:
LOG.info(vms_df_old.to_string())
LOG.info('All vms were evacuated!')
return
# exhausted all retries
if failures:
tobiko.fail(
'failed vm evacuations:\n{!s}', '\n'.join(failures))
def get_stack_server_id(stack):
return stack.server_details.id
def get_fqdn_from_topology_node(topology_node):
return sh.execute("hostname -f", ssh_client=topology_node.ssh_client,
expect_exit_status=None).stdout.strip()
def check_vm_running_via_virsh(topology_compute, vm_id):
"""check that a vm is in running state via virsh command,
return false if not"""
if vm_id in get_vm_uuid_list_running_via_virsh(topology_compute):
return True
else:
return False
def get_vm_uuid_list_running_via_virsh(topology_compute):
if overcloud.has_overcloud():
container_runtime = containers.get_container_runtime_name()
nova_libvirt = containers.get_libvirt_container_name()
command = f"sudo {container_runtime} exec {nova_libvirt} " \
f"sh -c 'for i in `virsh list --name --state-running` " \
f";do virsh domuuid $i;done'"
else:
command = "for i in `sudo virsh list --name --state-running` " \
";do virsh domuuid $i;done'"
return sh.execute(command,
ssh_client=topology_compute.ssh_client).stdout.split()
def check_computes_vms_running_via_virsh():
"""check all vms are running via virsh list command"""
for compute in topology.list_openstack_nodes(group='compute'):
hostname = get_fqdn_from_topology_node(compute)
retry = tobiko.retry(timeout=120, interval=5)
vms_df = get_compute_vms_df(hostname)
for vm_id in vms_df.vm_id.to_list():
for _ in retry:
if check_vm_running_via_virsh(compute, vm_id):
LOG.info(f"{vm_id} is running ok on "
f"{compute.hostname}")
break
else:
LOG.info(f"{vm_id} is not in running state on "
f"{compute.hostname}")
# Test is intended for D/S env
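To summarize the API moves in this hunk, based only on the call-site changes above, the removed tobiko.tripleo.nova helpers map roughly to:
# Removed tripleo helper                          Replacement in this change
# nova.start_all_instances()                  ->  nova.action_on_all_instances('active')
# nova.stop_all_instances()                   ->  nova.action_on_all_instances('shutoff')
# nova.check_computes_vms_running_via_virsh() ->  nova.check_virsh_domains_running()
# nova.check_df_vms_ping(vms_df)              ->  nova.check_vms_ping(vm_list)
# nova.check_vm_evacuations(vms_df_old=...)   ->  nova.check_vm_evacuations(vms_old=...)
# nova.check_nova_services_health()           ->  same name, now in tobiko.openstack.nova
# nova.wait_for_all_instances_status(...)     ->  same name, now in tobiko.openstack.nova
# nova.get_random_compute_with_vms_name()     ->  get_random_compute_with_vms() (local helper)
# nova.get_stack_server_id(stack)             ->  stack.server_details.id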