Merge "Add first health checks to faults podified ha tests"
This commit is contained in:
commit 8a3b7ea932
@@ -13,6 +13,7 @@
 # under the License.
 from __future__ import absolute_import

+from tobiko.openstack.nova import _checks
 from tobiko.openstack.nova import _client
 from tobiko.openstack.nova import _cloud_init
 from tobiko.openstack.nova import _hypervisor
@@ -85,5 +86,12 @@ find_server_ip_address = _server.find_server_ip_address
 HasServerMixin = _server.HasServerMixin
 get_server_id = _server.get_server_id
 list_server_ip_addresses = _server.list_server_ip_addresses
+action_on_all_instances = _server.action_on_all_instances

 wait_for_services_up = _service.wait_for_services_up
+
+check_nova_services_health = _checks.check_nova_services_health
+check_virsh_domains_running = _checks.check_virsh_domains_running
+wait_for_all_instances_status = _checks.wait_for_all_instances_status
+check_vms_ping = _checks.check_vms_ping
+check_vm_evacuations = _checks.check_vm_evacuations
tobiko/openstack/nova/_checks.py (new file, 147 lines)
@@ -0,0 +1,147 @@
+# Copyright (c) 2025 Red Hat, Inc.
+#
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+from oslo_log import log
+
+import tobiko
+from tobiko.openstack.nova import _client
+from tobiko.openstack.nova import _server
+from tobiko.openstack.nova import _service
+from tobiko.openstack import topology
+from tobiko.shell import ping
+from tobiko.shell import sh
+
+
+LOG = log.getLogger(__name__)
+
+
+def check_nova_services_health(timeout=600., interval=2.):
+    retry = tobiko.retry(timeout=timeout, interval=interval)
+    _service.wait_for_services_up(retry=retry)
+
+
+def check_virsh_domains_running():
+    """check all vms are running via virsh list command"""
+    for compute in topology.list_openstack_nodes(group='compute'):
+        hostname = sh.get_hostname(ssh_client=compute.ssh_client,
+                                   fqdn=True)
+        param = {'OS-EXT-SRV-ATTR:hypervisor_hostname': hostname}
+        vm_list_per_compute = _client.list_servers(**param)
+        for vm in vm_list_per_compute:
+            for attempt in tobiko.retry(timeout=120, interval=5):
+                if check_vm_running_via_virsh(compute, vm.id):
+                    LOG.info(f"{vm.id} is running ok on {hostname}")
+                    break
+                else:
+                    msg = f"{vm.id} is not in running state on {hostname}"
+                    if attempt.is_last:
+                        tobiko.fail("timeout!! " + msg)
+                    LOG.error(f"{vm.id} is not in running state on "
+                              f"{hostname} ... Retrying")
+
+
+def check_vms_ping(vm_list):
+    for vm in vm_list:
+        fip = _server.list_server_ip_addresses(
+            vm, address_type='floating').first
+        ping.ping_until_received(fip).assert_replied()
+
+
+def check_vm_evacuations(vms_old=None, compute_host=None, timeout=600,
+                         interval=2, check_no_evacuation=False):
+    """check evacuation of vms
+    input: old vm status and expected new compute"""
+
+    for attempt in tobiko.retry(timeout=timeout, interval=interval):
+        failures = []
+        param = ({} if compute_host is None
+                 else {'OS-EXT-SRV-ATTR:hypervisor_hostname': compute_host})
+        vms_new = _client.list_servers(**param)
+        for vm_old in vms_old or []:
+            old_bm_host = vm_old._info[  # pylint: disable=W0212
+                'OS-EXT-SRV-ATTR:hypervisor_hostname']
+            new_vm_host = vms_new.with_attributes(  # pylint: disable=W0212
+                id=vm_old.id).uniq._info[
+                    'OS-EXT-SRV-ATTR:hypervisor_hostname']
+
+            if check_no_evacuation:
+                cond = bool(old_bm_host != new_vm_host)
+            else:
+                cond = bool(old_bm_host == new_vm_host)
+
+            if cond:
+                failures.append(
+                    'Failed vm evacuations: {}\n\n'.format(vm_old))
+        if not failures:
+            LOG.debug(vms_old.to_string())
+            LOG.debug('All vms were evacuated!')
+            return
+
+        if attempt.is_last:
+            tobiko.fail(
+                'Timeout checking VM evacuations:\n{!s}', '\n'.join(failures))
+        else:
+            LOG.error('Failed nova evacuation:\n {}'.format(failures))
+            LOG.error('Retrying...')
+
+
+def check_vm_running_via_virsh(topology_compute, vm_id):
+    """check that a vm is in running state via virsh command,
+    return false if not"""
+    if vm_id in get_vm_uuid_list_running_via_virsh(topology_compute):
+        return True
+    else:
+        return False
+
+
+def get_vm_uuid_list_running_via_virsh(topology_compute):
+    from tobiko import podified
+    from tobiko.tripleo import containers
+    from tobiko.tripleo import overcloud
+
+    get_uuid_loop = ("for i in `virsh list --name --state-running`; do "
+                     "virsh domuuid $i; done")
+    containerized_libvirt_cmd = \
+        "{container_runtime} exec -u root {nova_libvirt} sh -c '{get_uuids}'"
+
+    if podified.has_podified_cp():
+        command = containerized_libvirt_cmd.format(
+            container_runtime=podified.CONTAINER_RUNTIME,
+            nova_libvirt=podified.NOVA_LIBVIRT_CONTAINER,
+            get_uuids=get_uuid_loop)
+    elif overcloud.has_overcloud():
+        command = containerized_libvirt_cmd.format(
+            container_runtime=containers.get_container_runtime_name(),
+            nova_libvirt=containers.get_libvirt_container_name(),
+            get_uuids=get_uuid_loop)
+    else:
+        command = get_uuid_loop
+
+    return sh.execute(command,
+                      ssh_client=topology_compute.ssh_client,
+                      sudo=True).stdout.split()
+
+
+def wait_for_all_instances_status(status, timeout=None):
+    """wait for all instances for a certain status or raise an exception"""
+    for instance in _client.list_servers():
+        _client.wait_for_server_status(server=instance.id, status=status,
+                                       timeout=timeout)
+        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
+            nova_instance=instance.name,
+            state=status,
+            host=instance._info[  # pylint: disable=W0212
+                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        LOG.info(instance_info)
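The new checks module is re-exported through tobiko.openstack.nova (see the __init__ hunk above), so callers compose them directly from the package. A minimal usage sketch, assuming tobiko is already configured against a running cloud and the servers have floating IPs:

    from tobiko.openstack import nova

    # passive checks: nova services report up, every libvirt domain is running
    nova.check_nova_services_health(timeout=600., interval=2.)
    nova.check_virsh_domains_running()

    # active checks: start every server, wait for ACTIVE, then ping floating IPs
    nova.action_on_all_instances('active')
    nova.wait_for_all_instances_status('ACTIVE')
    nova.check_vms_ping(nova.list_servers())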
@@ -136,3 +136,24 @@ def get_server_id(server: _client.ServerType) -> str:
         return server
     else:
         return server.id
+
+
+def action_on_all_instances(action):
+    """try to start/stop all instances"""
+    if action not in ('active', 'shutoff'):
+        tobiko.fail(f'Wrong action on VM instances: {action}')
+
+    client_action_method = (_client.activate_server if action == 'active'
+                            else _client.shutoff_server)
+    expected_vm_status = 'ACTIVE' if action == 'active' else 'SHUTOFF'
+
+    for instance in _client.list_servers():
+        activated_instance = client_action_method(instance)
+        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
+            nova_instance=activated_instance.name,
+            state=activated_instance.status,
+            host=activated_instance._info[  # pylint: disable=W0212
+                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        LOG.info(instance_info)
+        if activated_instance.status != expected_vm_status:
+            tobiko.fail(instance_info)
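A short sketch of how the new helper behaves, assuming at least one server exists; any action other than 'active' or 'shutoff' fails immediately via tobiko.fail():

    from tobiko.openstack import nova

    nova.action_on_all_instances('shutoff')  # stop every server, expect SHUTOFF
    nova.action_on_all_instances('active')   # start them again, expect ACTIVE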
@@ -18,6 +18,9 @@ from tobiko.podified import _openshift
 from tobiko.podified import containers


+NOVA_LIBVIRT_CONTAINER = 'nova_compute'
+CONTAINER_RUNTIME = 'podman'
+
 EDPM_NODE = _topology.EDPM_NODE
 OCP_WORKER = _topology.OCP_WORKER
 EDPM_COMPUTE_GROUP = _openshift.EDPM_COMPUTE_GROUP
@@ -31,10 +31,14 @@ class HostnameError(tobiko.TobikoException):

 HOSTNAMES_CACHE: typing.MutableMapping[typing.Optional[ssh.SSHClientFixture],
                                        str] = weakref.WeakKeyDictionary()
+HOSTNAMES_FQDN_CACHE: \
+    typing.MutableMapping[typing.Optional[ssh.SSHClientFixture],
+                          str] = weakref.WeakKeyDictionary()


 def get_hostname(ssh_client: ssh.SSHClientType = None,
                  cached=True,
+                 fqdn=False,
                  **execute_params) -> str:
     ssh_client = ssh.ssh_client_fixture(ssh_client)
     if ssh_client is None:
@@ -42,25 +46,36 @@ def get_hostname(ssh_client: ssh.SSHClientType = None,

     if cached:
         try:
-            hostname = HOSTNAMES_CACHE[ssh_client]
+            if not fqdn:
+                hostname = HOSTNAMES_CACHE[ssh_client]
+            else:
+                hostname = HOSTNAMES_FQDN_CACHE[ssh_client]
         except KeyError:
             pass
         else:
             return hostname

     hostname = ssh_hostname(ssh_client=ssh_client,
+                            fqdn=fqdn,
                             **execute_params)
     if cached:
-        HOSTNAMES_CACHE[ssh_client] = hostname
+        if not fqdn:
+            HOSTNAMES_CACHE[ssh_client] = hostname
+        else:
+            HOSTNAMES_FQDN_CACHE[ssh_client] = hostname
     return hostname


 def ssh_hostname(ssh_client: ssh.SSHClientFixture,
+                 fqdn=False,
                  **execute_params) \
         -> str:
     tobiko.check_valid_type(ssh_client, ssh.SSHClientFixture)
+    command = 'hostname'
+    if fqdn:
+        command += ' -f'
     try:
-        result = _execute.execute('hostname',
+        result = _execute.execute(command,
                                   ssh_client=ssh_client,
                                   **execute_params)
     except _exception.ShellCommandFailed as ex:
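With the new fqdn flag, short and fully qualified hostnames are cached independently (HOSTNAMES_CACHE vs HOSTNAMES_FQDN_CACHE), so mixing both call styles on the same SSH client is safe. A minimal sketch, assuming SSH-reachable compute nodes as in check_virsh_domains_running:

    from tobiko.openstack import topology
    from tobiko.shell import sh

    for node in topology.list_openstack_nodes(group='compute'):
        short_name = sh.get_hostname(ssh_client=node.ssh_client)
        fqdn = sh.get_hostname(ssh_client=node.ssh_client, fqdn=True)  # runs 'hostname -f'
        print(short_name, fqdn)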
@@ -31,6 +31,7 @@ from tobiko import config
 from tobiko.openstack import glance
 from tobiko.openstack import keystone
 from tobiko.openstack import neutron
+from tobiko.openstack import nova
 from tobiko.openstack import stacks
 from tobiko.openstack import tests
 from tobiko.openstack import topology
@@ -38,7 +39,6 @@ from tobiko.tests.faults.ha import test_cloud_recovery
 from tobiko.shell import ping
 from tobiko.shell import sh
 from tobiko.tripleo import containers
-from tobiko.tripleo import nova
 from tobiko.tripleo import pacemaker
 from tobiko.tripleo import topology as tripleo_topology
 from tobiko import tripleo
@@ -760,6 +760,15 @@ def evac_failover_compute(compute_host, failover_type=sh.hard_reset_method):
     disrupt_node(compute_host, disrupt_method=failover_type)


+def get_random_compute_with_vms():
+    for compute in nova.list_hypervisors():
+        param = {'OS-EXT-SRV-ATTR:hypervisor_hostname':
+                 compute.hypervisor_hostname}
+        vm_list_per_compute = nova.list_servers(**param)
+        if len(vm_list_per_compute) > 0:
+            return compute.hypervisor_hostname
+
+
 def check_iha_evacuation(failover_type=None, vm_type=None):
     """check vms on compute host,disrupt compute host,
     check all vms evacuated and pingable"""
@@ -767,36 +776,37 @@ def check_iha_evacuation(failover_type=None, vm_type=None):
         LOG.info(f'Begin IHA tests iteration {iteration}')
         LOG.info('create 2 vms')
         tests.test_servers_creation(number_of_servers=2)
-        compute_host = nova.get_random_compute_with_vms_name()
-        vms_starting_state_df = nova.get_compute_vms_df(compute_host)
+        compute_host = get_random_compute_with_vms()
+        vms_starting_state = nova.list_servers(
+            **{'OS-EXT-SRV-ATTR:hypervisor_hostname': compute_host})
         if vm_type == 'shutoff':
-            nova.stop_all_instances()
+            nova.action_on_all_instances('shutoff')
         if vm_type == 'evac_image_vm':
             evac_vm_stack = tests.test_evacuable_server_creation()
-            evac_vm_id = nova.get_stack_server_id(evac_vm_stack)
-            org_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
+            evac_vm_id = evac_vm_stack.server_details.id
+            old_nova_evac = nova.get_server(server_id=evac_vm_id)
         if not vm_type == 'shutoff':
-            nova.check_df_vms_ping(vms_starting_state_df)
+            nova.check_vms_ping(vms_starting_state)
         LOG.info(f'perform a failover on {compute_host}')
         evac_failover_compute(compute_host, failover_type=failover_type)
         test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
         if vm_type == 'evac_image_vm':
-            nova.check_vm_evacuations(vms_df_old=org_nova_evac_df,
+            nova.check_vm_evacuations(vms_old=old_nova_evac,
                                       compute_host=compute_host,
                                       timeout=600,
                                       check_no_evacuation=True)
             # delete evacuable tagged image because it prevents
             # non tagged evacuations if exists
             delete_evacuable_tagged_image()
-            new_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
-            nova.check_vm_evacuations(org_nova_evac_df, new_nova_evac_df)
+            new_nova_evac = nova.get_server(server_id=evac_vm_id)
+            nova.check_vm_evacuations(old_nova_evac, new_nova_evac)
         else:
-            nova.check_vm_evacuations(vms_df_old=vms_starting_state_df,
+            nova.check_vm_evacuations(vms_old=vms_starting_state,
                                       compute_host=compute_host,
                                       timeout=600)
         LOG.info('check evac is Done')
         if not vm_type == 'shutoff':
-            nova.check_df_vms_ping(vms_starting_state_df)
+            nova.check_vms_ping(vms_starting_state)


 def check_iha_evacuation_evac_image_vm():
@@ -50,16 +50,15 @@ has_external_lb = CONF.tobiko.rhosp.has_external_load_balancer

 def overcloud_health_checks(passive_checks_only=False,
                             skip_mac_table_size_test=False):
-    # this method will be changed in future commit
     check_pacemaker_resources_health()
     check_overcloud_processes_health()
-    nova.check_nova_services_health()
+    nova_osp.check_nova_services_health()
     tests.test_alive_agents_are_consistent_along_time()
     if not passive_checks_only:
-        # create a uniq stack
+        # create a unique stack that will be cleaned up at the end of each test
         check_vm_create()
-        nova.start_all_instances()
-        nova.check_computes_vms_running_via_virsh()
+        nova_osp.action_on_all_instances('active')
+        nova_osp.check_virsh_domains_running()
         containers.list_node_containers.cache_clear()
         containers.assert_all_tripleo_containers_running()
         containers.assert_equal_containers_state()
@@ -231,10 +230,10 @@ class DisruptTripleoNodesTest(testtools.TestCase):
                 hard_reset=False,
                 sequentially=sequentially)
             # verify VM status is updated after reboot
-            nova.wait_for_all_instances_status('SHUTOFF')
+            nova_osp.wait_for_all_instances_status('SHUTOFF')
             # start all VM instance
             # otherwise sidecar containers will not run after computes reboot
-            nova.start_all_instances()
+            nova_osp.action_on_all_instances('active')
             OvercloudHealthCheck.run_after(passive_checks_only=True)

         _run_test()
@@ -247,7 +246,7 @@ class DisruptTripleoNodesTest(testtools.TestCase):
         # nova.wait_for_all_instances_status('SHUTOFF')
         # # start all VM instance
         # # otherwise sidecar containers will not run after computes reboot
-        # nova.start_all_instances()
+        # nova_osp.action_on_all_instances('active')
         # OvercloudHealthCheck.run_after(passive_checks_only=True)

     @testtools.skipIf(has_external_lb, SKIP_MESSAGE_EXTLB)
@@ -14,11 +14,37 @@
 # License for the specific language governing permissions and limitations
 from __future__ import absolute_import

+from oslo_log import log
 import testtools

+from tobiko.tests.faults.ha import test_cloud_recovery
 from tobiko.tests.faults.podified.ha import cloud_disruptions
+from tobiko.openstack import tests
 from tobiko import podified
+from tobiko.openstack import nova
+
+
+LOG = log.getLogger(__name__)
+
+
+def podified_health_checks():
+    nova.check_nova_services_health()
+    tests.test_alive_agents_are_consistent_along_time()
+    # create a unique stack that will be cleaned up at the end of each test
+    # TODO(eolivare) add tests.test_server_creation_no_fip() when BGP is
+    # configured with expose_tenant_networks
+    tests.test_server_creation()
+    nova.action_on_all_instances('active')
+    nova.check_virsh_domains_running()
+    test_cloud_recovery.octavia_health_checks()
+
+
+class PodifiedCloudHealthCheck(test_cloud_recovery.OvercloudHealthCheck):
+    def setup_fixture(self):
+        # run validations
+        LOG.info("Start executing Podified health checks.")
+        podified_health_checks()
+        LOG.info("Podified health checks successfully executed.")
+
+
 @podified.skip_if_not_podified
@@ -27,17 +53,20 @@ class DisruptPodifiedNodesTest(testtools.TestCase):
    disruptive_action: a function that runs some
    disruptive scenario on a node"""

+    def test_0vercloud_health_check(self):
+        PodifiedCloudHealthCheck.run_before()
+
     def test_kill_all_galera_services(self):
-        # HealthCheck.run_before()
+        PodifiedCloudHealthCheck.run_before()
         cloud_disruptions.kill_all_galera_services()
-        # HealthCheck.run_after()
+        PodifiedCloudHealthCheck.run_after()

     def test_remove_all_grastate_galera(self):
-        # HealthCheck.run_before()
+        PodifiedCloudHealthCheck.run_before()
         cloud_disruptions.remove_all_grastate_galera()
-        # HealthCheck.run_before()
+        PodifiedCloudHealthCheck.run_after()

     def test_remove_one_grastate_galera(self):
-        # HealthCheck.run_before()
+        PodifiedCloudHealthCheck.run_before()
         cloud_disruptions.remove_one_grastate_galera()
-        # HealthCheck.run_after()
+        PodifiedCloudHealthCheck.run_after()
@@ -1,13 +1,24 @@
+# Copyright (c) 2025 Red Hat, Inc.
+#
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
 from __future__ import absolute_import

-import time
 import typing  # noqa
 from functools import wraps

 import netaddr
-from oslo_log import log
-import pandas

 import tobiko
 from tobiko.tripleo import overcloud
@@ -15,217 +26,6 @@ from tobiko.shell import iperf3
 from tobiko.shell import ping
 from tobiko.shell import sh
 from tobiko.shell import ssh
-from tobiko.openstack import nova
-from tobiko.openstack import topology
-from tobiko.tripleo import containers
-
-
-LOG = log.getLogger(__name__)
-
-
-def check_nova_services_health(timeout=600., interval=2.):
-    retry = tobiko.retry(timeout=timeout, interval=interval)
-    nova.wait_for_services_up(retry=retry)
-
-
-def start_all_instances():
-    """try to start all stopped overcloud instances"""
-    for instance in nova.list_servers():
-        activated_instance = nova.activate_server(instance)
-        time.sleep(3)
-        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
-            nova_instance=activated_instance.name,
-            state=activated_instance.status,
-            host=activated_instance._info[  # pylint: disable=W0212
-                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
-        LOG.info(instance_info)
-        if activated_instance.status != 'ACTIVE':
-            tobiko.fail(instance_info)
-
-
-def stop_all_instances():
-    """try to start all stopped overcloud instances"""
-    for instance in nova.list_servers():
-        activated_instance = nova.shutoff_server(instance)
-        time.sleep(3)
-        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
-            nova_instance=activated_instance.name,
-            state=activated_instance.status,
-            host=activated_instance._info[  # pylint: disable=W0212
-                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
-        LOG.info(instance_info)
-        if activated_instance.status != 'SHUTOFF':
-            tobiko.fail(instance_info)
-
-
-def wait_for_all_instances_status(status, timeout=None):
-    """wait for all instances for a certain status or raise an exception"""
-    for instance in nova.list_servers():
-        nova.wait_for_server_status(server=instance.id, status=status,
-                                    timeout=timeout)
-        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
-            nova_instance=instance.name,
-            state=status,
-            host=instance._info[  # pylint: disable=W0212
-                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
-        LOG.info(instance_info)
-
-
-def get_vms_table():
-    """populate a dataframe with vm host,id,status"""
-    vms_data = [(vm._info[  # pylint: disable=W0212
-        'OS-EXT-SRV-ATTR:hypervisor_hostname'], vm.id,
-        vm.status) for vm in nova.list_servers()]
-    vms_df = pandas.DataFrame(vms_data, columns=['vm_host', 'vm_id',
-                                                 'vm_state'])
-    return vms_df
-
-
-def list_computes():
-    """list compute host names"""
-    return [compute.hypervisor_hostname for compute in nova.list_hypervisors()]
-
-
-def get_compute_vms_df(compute_host):
-    """input: compute hostname (can be short)
-    output: dataframe with vms of that host"""
-    return get_vms_table().query(f"vm_host=='{compute_host}'")
-
-
-def get_random_compute_with_vms_name():
-    """get a randomcompute holding vm/s"""
-    for compute in list_computes():
-        if not get_compute_vms_df(compute).empty:
-            return compute
-
-
-def vm_info(vm_id, vms_df):
-    """input: vm and a vms df
-    output: host string"""
-    return vms_df.query(f"vm_id == '{vm_id}'").to_string()
-
-
-def vm_df(vm_id, vms_df):
-    """input: vm and a vms df
-    output: host string"""
-    return vms_df.query(f"vm_id == '{vm_id}'")
-
-
-def vm_floating_ip(vm_id):
-    """input: vm_id
-    output it's floating ip"""
-
-    vm = nova.get_server(vm_id)
-    floating_ip = nova.list_server_ip_addresses(
-        vm, address_type='floating').first
-    return floating_ip
-
-
-def check_ping_vm_fip(fip):
-    ping.ping_until_received(fip).assert_replied()
-
-
-def check_df_vms_ping(df):
-    """input: dataframe with vms_ids
-    try to ping all vms in df"""
-
-    for vm_id in df.vm_id.to_list():
-        check_ping_vm_fip(vm_floating_ip(vm_id))
-
-
-def vm_location(vm_id, vms_df):
-    """input: vm and a vms df
-    output: host string"""
-    return vms_df.query(f"vm_id == '{vm_id}'")['vm_host'].to_string(
-        index=False)
-
-
-def check_vm_evacuations(vms_df_old=None, compute_host=None, timeout=600,
-                         interval=2, check_no_evacuation=False):
-    """check evacuation of vms
-    input: old and new vms_state_tables dfs"""
-    failures = []
-    start = time.time()
-
-    while time.time() - start < timeout:
-        failures = []
-        vms_df_new = get_compute_vms_df(compute_host)
-        for vm_id in vms_df_old.vm_id.to_list():
-            old_bm_host = vm_location(vm_id, vms_df_old)
-            new_vm_host = vm_location(vm_id, vms_df_new)
-
-            if check_no_evacuation:
-                cond = bool(old_bm_host != new_vm_host)
-            else:
-                cond = bool(old_bm_host == new_vm_host)
-
-            if cond:
-                failures.append(
-                    'failed vm evacuations: {}\n\n'.format(vm_info(vm_id,
-                                                                   vms_df_old)))
-        if failures:
-            LOG.info('Failed nova evacuation:\n {}'.format(failures))
-            LOG.info('Not all nova vms evacuated ..')
-            LOG.info('Retrying , timeout at: {}'
-                     .format(timeout-(time.time() - start)))
-            time.sleep(interval)
-        else:
-            LOG.info(vms_df_old.to_string())
-            LOG.info('All vms were evacuated!')
-            return
-    # exhausted all retries
-    if failures:
-        tobiko.fail(
-            'failed vm evacuations:\n{!s}', '\n'.join(failures))
-
-
-def get_stack_server_id(stack):
-    return stack.server_details.id
-
-
-def get_fqdn_from_topology_node(topology_node):
-    return sh.execute("hostname -f", ssh_client=topology_node.ssh_client,
-                      expect_exit_status=None).stdout.strip()
-
-
-def check_vm_running_via_virsh(topology_compute, vm_id):
-    """check that a vm is in running state via virsh command,
-    return false if not"""
-    if vm_id in get_vm_uuid_list_running_via_virsh(topology_compute):
-        return True
-    else:
-        return False
-
-
-def get_vm_uuid_list_running_via_virsh(topology_compute):
-    if overcloud.has_overcloud():
-        container_runtime = containers.get_container_runtime_name()
-        nova_libvirt = containers.get_libvirt_container_name()
-        command = f"sudo {container_runtime} exec {nova_libvirt} " \
-                  f"sh -c 'for i in `virsh list --name --state-running` " \
-                  f";do virsh domuuid $i;done'"
-    else:
-        command = "for i in `sudo virsh list --name --state-running` " \
-                  ";do virsh domuuid $i;done'"
-    return sh.execute(command,
-                      ssh_client=topology_compute.ssh_client).stdout.split()
-
-
-def check_computes_vms_running_via_virsh():
-    """check all vms are running via virsh list command"""
-    for compute in topology.list_openstack_nodes(group='compute'):
-        hostname = get_fqdn_from_topology_node(compute)
-        retry = tobiko.retry(timeout=120, interval=5)
-        vms_df = get_compute_vms_df(hostname)
-        for vm_id in vms_df.vm_id.to_list():
-            for _ in retry:
-                if check_vm_running_via_virsh(compute, vm_id):
-                    LOG.info(f"{vm_id} is running ok on "
-                             f"{compute.hostname}")
-                    break
-                else:
-                    LOG.info(f"{vm_id} is not in running state on "
-                             f"{compute.hostname}")
-
-
 # Test is inteded for D/S env