From 19be771767d716b105873eadbdbc87fdec59a3f3 Mon Sep 17 00:00:00 2001
From: Eduardo Olivares
Date: Mon, 30 Dec 2024 16:07:49 +0100
Subject: [PATCH] Add first health checks to faults podified ha tests

This patch adds mostly black-box health checks: they use the OpenStack
component APIs (Neutron, Nova), create a workload and check connectivity
to it.

To be implemented in follow-up patches:
- Health checks using the OpenShift client (pod status)
- Connect to EDPM nodes and verify they are healthy

Change-Id: I412cb9933c4cdd2662561fafcfc1c18fceccc0c3
---
 tobiko/openstack/nova/__init__.py             |   8 +
 tobiko/openstack/nova/_checks.py              | 147 +++++++++++
 tobiko/openstack/nova/_server.py              |  21 ++
 tobiko/podified/__init__.py                   |   3 +
 tobiko/shell/sh/_hostname.py                  |  21 +-
 tobiko/tests/faults/ha/cloud_disruptions.py   |  34 ++-
 tobiko/tests/faults/ha/test_cloud_recovery.py |  15 +-
 .../faults/podified/ha/test_cloud_recovery.py |  43 +++-
 tobiko/tripleo/nova.py                        | 230 ++----------------
 9 files changed, 277 insertions(+), 245 deletions(-)
 create mode 100644 tobiko/openstack/nova/_checks.py

diff --git a/tobiko/openstack/nova/__init__.py b/tobiko/openstack/nova/__init__.py
index 0ab197bd9..5e37df98e 100644
--- a/tobiko/openstack/nova/__init__.py
+++ b/tobiko/openstack/nova/__init__.py
@@ -13,6 +13,7 @@
# under the License.
from __future__ import absolute_import

+from tobiko.openstack.nova import _checks
from tobiko.openstack.nova import _client
from tobiko.openstack.nova import _cloud_init
from tobiko.openstack.nova import _hypervisor
@@ -85,5 +86,12 @@ find_server_ip_address = _server.find_server_ip_address
HasServerMixin = _server.HasServerMixin
get_server_id = _server.get_server_id
list_server_ip_addresses = _server.list_server_ip_addresses
+action_on_all_instances = _server.action_on_all_instances

wait_for_services_up = _service.wait_for_services_up
+
+check_nova_services_health = _checks.check_nova_services_health
+check_virsh_domains_running = _checks.check_virsh_domains_running
+wait_for_all_instances_status = _checks.wait_for_all_instances_status
+check_vms_ping = _checks.check_vms_ping
+check_vm_evacuations = _checks.check_vm_evacuations

diff --git a/tobiko/openstack/nova/_checks.py b/tobiko/openstack/nova/_checks.py
new file mode 100644
index 000000000..347e048ca
--- /dev/null
+++ b/tobiko/openstack/nova/_checks.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2025 Red Hat, Inc.
+#
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
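+#
+# Illustrative usage sketch (assumes only the re-exports added to
+# tobiko/openstack/nova/__init__.py above): a test would typically drive
+# these checks through the package namespace, roughly like:
+#
+#     from tobiko.openstack import nova
+#
+#     nova.check_nova_services_health()        # nova services report up
+#     nova.action_on_all_instances('active')   # start every VM instance
+#     nova.check_virsh_domains_running()       # virsh agrees on each compute
+#     nova.wait_for_all_instances_status('ACTIVE')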
+from oslo_log import log

import tobiko
from tobiko.openstack.nova import _client
from tobiko.openstack.nova import _server
from tobiko.openstack.nova import _service
from tobiko.openstack import topology
from tobiko.shell import ping
from tobiko.shell import sh


LOG = log.getLogger(__name__)


def check_nova_services_health(timeout=600., interval=2.):
    retry = tobiko.retry(timeout=timeout, interval=interval)
    _service.wait_for_services_up(retry=retry)


def check_virsh_domains_running():
    """check that all vms are running via the virsh list command"""
    for compute in topology.list_openstack_nodes(group='compute'):
        hostname = sh.get_hostname(ssh_client=compute.ssh_client,
                                   fqdn=True)
        param = {'OS-EXT-SRV-ATTR:hypervisor_hostname': hostname}
        vm_list_per_compute = _client.list_servers(**param)
        for vm in vm_list_per_compute:
            for attempt in tobiko.retry(timeout=120, interval=5):
                if check_vm_running_via_virsh(compute, vm.id):
                    LOG.info(f"{vm.id} is running ok on {hostname}")
                    break
                msg = f"{vm.id} is not in running state on {hostname}"
                if attempt.is_last:
                    tobiko.fail("timeout!! " + msg)
                LOG.error(msg + " ... Retrying")


def check_vms_ping(vm_list):
    """ping the floating ip of every vm in the list until it replies"""
    for vm in vm_list:
        fip = _server.list_server_ip_addresses(vm,
                                               address_type='floating').first
        ping.ping_until_received(fip).assert_replied()


def check_vm_evacuations(vms_old=None, compute_host=None, timeout=600,
                         interval=2, check_no_evacuation=False):
    """check evacuation of vms
    input: servers as they were before the disruption and, optionally,
    the compute host the vms are expected to have been evacuated from"""

    for attempt in tobiko.retry(timeout=timeout, interval=interval):
        failures = []
        param = ({} if compute_host is None
                 else {'OS-EXT-SRV-ATTR:hypervisor_hostname': compute_host})
        vms_new = _client.list_servers(**param)
        for vm_old in vms_old or []:
            old_bm_host = vm_old._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname']
            new_vm_host = vms_new.with_attributes(  # pylint: disable=W0212
                id=vm_old.id).uniq._info[
                'OS-EXT-SRV-ATTR:hypervisor_hostname']

            if check_no_evacuation:
                cond = bool(old_bm_host != new_vm_host)
            else:
                cond = bool(old_bm_host == new_vm_host)

            if cond:
                failures.append(
                    'Failed vm evacuations: {}\n\n'.format(vm_old))
        if not failures:
            LOG.debug('All vms were evacuated: %s', vms_old)
            return

        if attempt.is_last:
            tobiko.fail(
                'Timeout checking VM evacuations:\n{!s}', '\n'.join(failures))
        else:
            LOG.error('Failed nova evacuation:\n {}'.format(failures))
            LOG.error('Retrying...')


def check_vm_running_via_virsh(topology_compute, vm_id):
    """check whether a vm is in running state according to virsh,
    return False if it is not"""
    return vm_id in get_vm_uuid_list_running_via_virsh(topology_compute)


def get_vm_uuid_list_running_via_virsh(topology_compute):
    from tobiko import podified
    from tobiko.tripleo import containers
    from tobiko.tripleo import overcloud

    get_uuid_loop = ("for i in `virsh list --name --state-running`; do "
                     "virsh domuuid $i; done")
    # on podified deployments the command below expands to something like:
    #   podman exec -u root nova_compute sh -c \
    #       'for i in `virsh list --name --state-running`; do virsh domuuid $i; done'
    containerized_libvirt_cmd = \
        "{container_runtime} exec -u root {nova_libvirt} sh -c '{get_uuids}'"

    if podified.has_podified_cp():
        command = containerized_libvirt_cmd.format(
            container_runtime=podified.CONTAINER_RUNTIME,
            nova_libvirt=podified.NOVA_LIBVIRT_CONTAINER,
            get_uuids=get_uuid_loop)
    elif overcloud.has_overcloud():
        command = containerized_libvirt_cmd.format(
            container_runtime=containers.get_container_runtime_name(),
            nova_libvirt=containers.get_libvirt_container_name(),
            get_uuids=get_uuid_loop)
    else:
        command = get_uuid_loop

    return sh.execute(command,
                      ssh_client=topology_compute.ssh_client,
                      sudo=True).stdout.split()


def wait_for_all_instances_status(status, timeout=None):
    """wait for all instances to reach a certain status or raise an exception"""
    for instance in _client.list_servers():
        _client.wait_for_server_status(server=instance.id, status=status,
                                       timeout=timeout)
        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=instance.name,
            state=status,
            host=instance._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
        LOG.info(instance_info)
diff --git a/tobiko/openstack/nova/_server.py b/tobiko/openstack/nova/_server.py
index 4aa4283b4..061e8eaec 100644
--- a/tobiko/openstack/nova/_server.py
+++ b/tobiko/openstack/nova/_server.py
@@ -136,3 +136,24 @@ def get_server_id(server: _client.ServerType) -> str:
        return server
    else:
        return server.id
+
+
+def action_on_all_instances(action):
+    """try to start/stop all instances"""
+    if action not in ('active', 'shutoff'):
+        tobiko.fail(f'Wrong action on VM instances: {action}')
+
+    client_action_method = (_client.activate_server if action == 'active'
+                            else _client.shutoff_server)
+    expected_vm_status = 'ACTIVE' if action == 'active' else 'SHUTOFF'
+
+    for instance in _client.list_servers():
+        updated_instance = client_action_method(instance)
+        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
+            nova_instance=updated_instance.name,
+            state=updated_instance.status,
+            host=updated_instance._info[  # pylint: disable=W0212
+                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        LOG.info(instance_info)
+        if updated_instance.status != expected_vm_status:
+            tobiko.fail(instance_info)
diff --git a/tobiko/podified/__init__.py b/tobiko/podified/__init__.py
index c2141545e..585d9303a 100644
--- a/tobiko/podified/__init__.py
+++ b/tobiko/podified/__init__.py
@@ -18,6 +18,9 @@ from tobiko.podified import _openshift
from tobiko.podified import containers

+NOVA_LIBVIRT_CONTAINER = 'nova_compute'
+CONTAINER_RUNTIME = 'podman'
+
EDPM_NODE = _topology.EDPM_NODE
OCP_WORKER = _topology.OCP_WORKER
EDPM_COMPUTE_GROUP = _openshift.EDPM_COMPUTE_GROUP
diff --git a/tobiko/shell/sh/_hostname.py b/tobiko/shell/sh/_hostname.py
index 9fb9b6c3f..200ca6834 100644
--- a/tobiko/shell/sh/_hostname.py
+++ b/tobiko/shell/sh/_hostname.py
@@ -31,10 +31,14 @@ class HostnameError(tobiko.TobikoException):
HOSTNAMES_CACHE: typing.MutableMapping[typing.Optional[ssh.SSHClientFixture],
                                       str] = weakref.WeakKeyDictionary()
+HOSTNAMES_FQDN_CACHE: \
+    typing.MutableMapping[typing.Optional[ssh.SSHClientFixture],
+                          str] = weakref.WeakKeyDictionary()


def get_hostname(ssh_client: ssh.SSHClientType = None,
                 cached=True,
+                fqdn=False,
                 **execute_params) -> str:
    ssh_client = ssh.ssh_client_fixture(ssh_client)
    if ssh_client is None:
@@ -42,25 +46,36 @@ def get_hostname(ssh_client: ssh.SSHClientType = None,
    if cached:
        try:
-            hostname = HOSTNAMES_CACHE[ssh_client]
+            if not fqdn:
+                hostname = HOSTNAMES_CACHE[ssh_client]
+            else:
+                hostname = HOSTNAMES_FQDN_CACHE[ssh_client]
        except KeyError:
            pass
        else:
            return hostname

    hostname = ssh_hostname(ssh_client=ssh_client,
+                            fqdn=fqdn,
                            **execute_params)
    if cached:
-        HOSTNAMES_CACHE[ssh_client] = hostname
+        if not fqdn:
+            HOSTNAMES_CACHE[ssh_client] = hostname
+        else:
+            HOSTNAMES_FQDN_CACHE[ssh_client] = hostname

    return hostname
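
# Behaviour sketch (hypothetical host names): short and FQDN hostnames are
# kept in separate per-SSH-client caches, so each variant runs the remote
# command at most once:
#
#     get_hostname(ssh_client=client)             # e.g. 'compute-0'
#     get_hostname(ssh_client=client, fqdn=True)  # e.g. 'compute-0.example.com'
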
def ssh_hostname(ssh_client: ssh.SSHClientFixture,
+                 fqdn=False,
                 **execute_params) \
        -> str:
    tobiko.check_valid_type(ssh_client, ssh.SSHClientFixture)
+    command = 'hostname'
+    if fqdn:
+        command += ' -f'
    try:
-        result = _execute.execute('hostname',
+        result = _execute.execute(command,
                                   ssh_client=ssh_client,
                                   **execute_params)
    except _exception.ShellCommandFailed as ex:
diff --git a/tobiko/tests/faults/ha/cloud_disruptions.py b/tobiko/tests/faults/ha/cloud_disruptions.py
index dd82d5b84..271b6262d 100644
--- a/tobiko/tests/faults/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/ha/cloud_disruptions.py
@@ -31,6 +31,7 @@ from tobiko import config
from tobiko.openstack import glance
from tobiko.openstack import keystone
from tobiko.openstack import neutron
+from tobiko.openstack import nova
from tobiko.openstack import stacks
from tobiko.openstack import tests
from tobiko.openstack import topology
@@ -38,7 +39,6 @@ from tobiko.tests.faults.ha import test_cloud_recovery
from tobiko.shell import ping
from tobiko.shell import sh
from tobiko.tripleo import containers
-from tobiko.tripleo import nova
from tobiko.tripleo import pacemaker
from tobiko.tripleo import topology as tripleo_topology
from tobiko import tripleo
@@ -760,6 +760,15 @@ def evac_failover_compute(compute_host, failover_type=sh.hard_reset_method):
    disrupt_node(compute_host, disrupt_method=failover_type)


+def get_random_compute_with_vms():
+    for compute in nova.list_hypervisors():
+        param = {'OS-EXT-SRV-ATTR:hypervisor_hostname':
+                 compute.hypervisor_hostname}
+        vm_list_per_compute = nova.list_servers(**param)
+        if len(vm_list_per_compute) > 0:
+            return compute.hypervisor_hostname
+
+
 def check_iha_evacuation(failover_type=None, vm_type=None):
    """check vms on compute host,disrupt compute host,
    check all vms evacuated and pingable"""
    LOG.info(f'Begin IHA tests iteration {iteration}')
    LOG.info('create 2 vms')
    tests.test_servers_creation(number_of_servers=2)
-    compute_host = nova.get_random_compute_with_vms_name()
-    vms_starting_state_df = nova.get_compute_vms_df(compute_host)
+    compute_host = get_random_compute_with_vms()
+    vms_starting_state = nova.list_servers(
+        **{'OS-EXT-SRV-ATTR:hypervisor_hostname': compute_host})
    if vm_type == 'shutoff':
-        nova.stop_all_instances()
+        nova.action_on_all_instances('shutoff')
    if vm_type == 'evac_image_vm':
        evac_vm_stack = tests.test_evacuable_server_creation()
-        evac_vm_id = nova.get_stack_server_id(evac_vm_stack)
-        org_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
+        evac_vm_id = evac_vm_stack.server_details.id
+        old_nova_evac = nova.get_server(server_id=evac_vm_id)
    if not vm_type == 'shutoff':
-        nova.check_df_vms_ping(vms_starting_state_df)
+        nova.check_vms_ping(vms_starting_state)
    LOG.info(f'perform a failover on {compute_host}')
    evac_failover_compute(compute_host, failover_type=failover_type)
    test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
    if vm_type == 'evac_image_vm':
-        nova.check_vm_evacuations(vms_df_old=org_nova_evac_df,
+        nova.check_vm_evacuations(vms_old=[old_nova_evac],
                                  compute_host=compute_host,
                                  timeout=600,
                                  check_no_evacuation=True)
        # delete evacuable tagged image because it prevents
        # non tagged evacuations if exists
        delete_evacuable_tagged_image()
-        new_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
-        nova.check_vm_evacuations(org_nova_evac_df, new_nova_evac_df)
+        nova.check_vm_evacuations(vms_old=[old_nova_evac])
    else:
-        nova.check_vm_evacuations(vms_df_old=vms_starting_state_df,
+        nova.check_vm_evacuations(vms_old=vms_starting_state,
                                  compute_host=compute_host,
                                  timeout=600)
    LOG.info('check evac is Done')
    if not vm_type == 'shutoff':
-        nova.check_df_vms_ping(vms_starting_state_df)
+        nova.check_vms_ping(vms_starting_state)


def check_iha_evacuation_evac_image_vm():
diff --git a/tobiko/tests/faults/ha/test_cloud_recovery.py b/tobiko/tests/faults/ha/test_cloud_recovery.py
index 768138ed8..d57117b8b 100644
--- a/tobiko/tests/faults/ha/test_cloud_recovery.py
+++ b/tobiko/tests/faults/ha/test_cloud_recovery.py
@@ -50,16 +50,15 @@ has_external_lb = CONF.tobiko.rhosp.has_external_load_balancer
def overcloud_health_checks(passive_checks_only=False,
                            skip_mac_table_size_test=False):
-    # this method will be changed in future commit
    check_pacemaker_resources_health()
    check_overcloud_processes_health()
-    nova.check_nova_services_health()
+    nova_osp.check_nova_services_health()
    tests.test_alive_agents_are_consistent_along_time()
    if not passive_checks_only:
-        # create a uniq stack
+        # create a unique stack that will be cleaned up at the end of each test
        check_vm_create()
-        nova.start_all_instances()
-        nova.check_computes_vms_running_via_virsh()
+        nova_osp.action_on_all_instances('active')
+        nova_osp.check_virsh_domains_running()
        containers.list_node_containers.cache_clear()
        containers.assert_all_tripleo_containers_running()
        containers.assert_equal_containers_state()
@@ -231,10 +230,10 @@ class DisruptTripleoNodesTest(testtools.TestCase):
                                      hard_reset=False, sequentially=sequentially)
        # verify VM status is updated after reboot
-        nova.wait_for_all_instances_status('SHUTOFF')
+        nova_osp.wait_for_all_instances_status('SHUTOFF')
        # start all VM instance
        # otherwise sidecar containers will not run after computes reboot
-        nova.start_all_instances()
+        nova_osp.action_on_all_instances('active')
        OvercloudHealthCheck.run_after(passive_checks_only=True)
    _run_test()
@@ -247,7 +246,7 @@ class DisruptTripleoNodesTest(testtools.TestCase):
    #     nova.wait_for_all_instances_status('SHUTOFF')
    #     # start all VM instance
    #     # otherwise sidecar containers will not run after computes reboot
-    #     nova.start_all_instances()
+    #     nova_osp.action_on_all_instances('active')
    #     OvercloudHealthCheck.run_after(passive_checks_only=True)

    @testtools.skipIf(has_external_lb, SKIP_MESSAGE_EXTLB)
diff --git a/tobiko/tests/faults/podified/ha/test_cloud_recovery.py b/tobiko/tests/faults/podified/ha/test_cloud_recovery.py
index b81fb6b9b..d215127c6 100644
--- a/tobiko/tests/faults/podified/ha/test_cloud_recovery.py
+++ b/tobiko/tests/faults/podified/ha/test_cloud_recovery.py
@@ -14,11 +14,37 @@
# License for the specific language governing permissions and limitations
from __future__ import absolute_import

+from oslo_log import log
import testtools
-
+from tobiko.tests.faults.ha import test_cloud_recovery
from tobiko.tests.faults.podified.ha import cloud_disruptions
+from tobiko.openstack import tests
from tobiko import podified
+from tobiko.openstack import nova
+
+
+LOG = log.getLogger(__name__)
+
+
+def podified_health_checks():
+    nova.check_nova_services_health()
+    tests.test_alive_agents_are_consistent_along_time()
+    # create a unique stack that will be cleaned up at the end of each test
+    # TODO(eolivare) add tests.test_server_creation_no_fip() when BGP is
+    # configured with expose_tenant_networks
+    tests.test_server_creation()
+    nova.action_on_all_instances('active')
+    nova.check_virsh_domains_running()
test_cloud_recovery.octavia_health_checks() + + +class PodifiedCloudHealthCheck(test_cloud_recovery.OvercloudHealthCheck): + def setup_fixture(self): + # run validations + LOG.info("Start executing Podified health checks.") + podified_health_checks() + LOG.info("Podified health checks successfully executed.") @podified.skip_if_not_podified @@ -27,17 +53,20 @@ class DisruptPodifiedNodesTest(testtools.TestCase): disruptive_action: a function that runs some disruptive scenario on a node""" + def test_0vercloud_health_check(self): + PodifiedCloudHealthCheck.run_before() + def test_kill_all_galera_services(self): - # HealthCheck.run_before() + PodifiedCloudHealthCheck.run_before() cloud_disruptions.kill_all_galera_services() - # HealthCheck.run_after() + PodifiedCloudHealthCheck.run_after() def test_remove_all_grastate_galera(self): - # HealthCheck.run_before() + PodifiedCloudHealthCheck.run_before() cloud_disruptions.remove_all_grastate_galera() - # HealthCheck.run_before() + PodifiedCloudHealthCheck.run_after() def test_remove_one_grastate_galera(self): - # HealthCheck.run_before() + PodifiedCloudHealthCheck.run_before() cloud_disruptions.remove_one_grastate_galera() - # HealthCheck.run_after() + PodifiedCloudHealthCheck.run_after() diff --git a/tobiko/tripleo/nova.py b/tobiko/tripleo/nova.py index e25d278c3..ee49ec2fb 100644 --- a/tobiko/tripleo/nova.py +++ b/tobiko/tripleo/nova.py @@ -1,13 +1,24 @@ +# Copyright (c) 2025 Red Hat, Inc. +# +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
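+#
+# Note: the generic VM health-check helpers that used to live in this module
+# (start/stop all instances, virsh domain checks, evacuation checks) have
+# moved to tobiko.openstack.nova._checks and are re-exported from
+# tobiko.openstack.nova.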
from __future__ import absolute_import -import time import typing # noqa from functools import wraps - import netaddr -from oslo_log import log -import pandas import tobiko from tobiko.tripleo import overcloud @@ -15,217 +26,6 @@ from tobiko.shell import iperf3 from tobiko.shell import ping from tobiko.shell import sh from tobiko.shell import ssh -from tobiko.openstack import nova -from tobiko.openstack import topology -from tobiko.tripleo import containers - - -LOG = log.getLogger(__name__) - - -def check_nova_services_health(timeout=600., interval=2.): - retry = tobiko.retry(timeout=timeout, interval=interval) - nova.wait_for_services_up(retry=retry) - - -def start_all_instances(): - """try to start all stopped overcloud instances""" - for instance in nova.list_servers(): - activated_instance = nova.activate_server(instance) - time.sleep(3) - instance_info = 'instance {nova_instance} is {state} on {host}'.format( - nova_instance=activated_instance.name, - state=activated_instance.status, - host=activated_instance._info[ # pylint: disable=W0212 - 'OS-EXT-SRV-ATTR:hypervisor_hostname']) - LOG.info(instance_info) - if activated_instance.status != 'ACTIVE': - tobiko.fail(instance_info) - - -def stop_all_instances(): - """try to start all stopped overcloud instances""" - for instance in nova.list_servers(): - activated_instance = nova.shutoff_server(instance) - time.sleep(3) - instance_info = 'instance {nova_instance} is {state} on {host}'.format( - nova_instance=activated_instance.name, - state=activated_instance.status, - host=activated_instance._info[ # pylint: disable=W0212 - 'OS-EXT-SRV-ATTR:hypervisor_hostname']) - LOG.info(instance_info) - if activated_instance.status != 'SHUTOFF': - tobiko.fail(instance_info) - - -def wait_for_all_instances_status(status, timeout=None): - """wait for all instances for a certain status or raise an exception""" - for instance in nova.list_servers(): - nova.wait_for_server_status(server=instance.id, status=status, - timeout=timeout) - instance_info = 'instance {nova_instance} is {state} on {host}'.format( - nova_instance=instance.name, - state=status, - host=instance._info[ # pylint: disable=W0212 - 'OS-EXT-SRV-ATTR:hypervisor_hostname']) - LOG.info(instance_info) - - -def get_vms_table(): - """populate a dataframe with vm host,id,status""" - vms_data = [(vm._info[ # pylint: disable=W0212 - 'OS-EXT-SRV-ATTR:hypervisor_hostname'], vm.id, - vm.status) for vm in nova.list_servers()] - vms_df = pandas.DataFrame(vms_data, columns=['vm_host', 'vm_id', - 'vm_state']) - return vms_df - - -def list_computes(): - """list compute host names""" - return [compute.hypervisor_hostname for compute in nova.list_hypervisors()] - - -def get_compute_vms_df(compute_host): - """input: compute hostname (can be short) - output: dataframe with vms of that host""" - return get_vms_table().query(f"vm_host=='{compute_host}'") - - -def get_random_compute_with_vms_name(): - """get a randomcompute holding vm/s""" - for compute in list_computes(): - if not get_compute_vms_df(compute).empty: - return compute - - -def vm_info(vm_id, vms_df): - """input: vm and a vms df - output: host string""" - return vms_df.query(f"vm_id == '{vm_id}'").to_string() - - -def vm_df(vm_id, vms_df): - """input: vm and a vms df - output: host string""" - return vms_df.query(f"vm_id == '{vm_id}'") - - -def vm_floating_ip(vm_id): - """input: vm_id - output it's floating ip""" - - vm = nova.get_server(vm_id) - floating_ip = nova.list_server_ip_addresses( - vm, address_type='floating').first - return floating_ip 
- - -def check_ping_vm_fip(fip): - ping.ping_until_received(fip).assert_replied() - - -def check_df_vms_ping(df): - """input: dataframe with vms_ids - try to ping all vms in df""" - - for vm_id in df.vm_id.to_list(): - check_ping_vm_fip(vm_floating_ip(vm_id)) - - -def vm_location(vm_id, vms_df): - """input: vm and a vms df - output: host string""" - return vms_df.query(f"vm_id == '{vm_id}'")['vm_host'].to_string( - index=False) - - -def check_vm_evacuations(vms_df_old=None, compute_host=None, timeout=600, - interval=2, check_no_evacuation=False): - """check evacuation of vms - input: old and new vms_state_tables dfs""" - failures = [] - start = time.time() - - while time.time() - start < timeout: - failures = [] - vms_df_new = get_compute_vms_df(compute_host) - for vm_id in vms_df_old.vm_id.to_list(): - old_bm_host = vm_location(vm_id, vms_df_old) - new_vm_host = vm_location(vm_id, vms_df_new) - - if check_no_evacuation: - cond = bool(old_bm_host != new_vm_host) - else: - cond = bool(old_bm_host == new_vm_host) - - if cond: - failures.append( - 'failed vm evacuations: {}\n\n'.format(vm_info(vm_id, - vms_df_old))) - if failures: - LOG.info('Failed nova evacuation:\n {}'.format(failures)) - LOG.info('Not all nova vms evacuated ..') - LOG.info('Retrying , timeout at: {}' - .format(timeout-(time.time() - start))) - time.sleep(interval) - else: - LOG.info(vms_df_old.to_string()) - LOG.info('All vms were evacuated!') - return - # exhausted all retries - if failures: - tobiko.fail( - 'failed vm evacuations:\n{!s}', '\n'.join(failures)) - - -def get_stack_server_id(stack): - return stack.server_details.id - - -def get_fqdn_from_topology_node(topology_node): - return sh.execute("hostname -f", ssh_client=topology_node.ssh_client, - expect_exit_status=None).stdout.strip() - - -def check_vm_running_via_virsh(topology_compute, vm_id): - """check that a vm is in running state via virsh command, - return false if not""" - if vm_id in get_vm_uuid_list_running_via_virsh(topology_compute): - return True - else: - return False - - -def get_vm_uuid_list_running_via_virsh(topology_compute): - if overcloud.has_overcloud(): - container_runtime = containers.get_container_runtime_name() - nova_libvirt = containers.get_libvirt_container_name() - command = f"sudo {container_runtime} exec {nova_libvirt} " \ - f"sh -c 'for i in `virsh list --name --state-running` " \ - f";do virsh domuuid $i;done'" - else: - command = "for i in `sudo virsh list --name --state-running` " \ - ";do virsh domuuid $i;done'" - return sh.execute(command, - ssh_client=topology_compute.ssh_client).stdout.split() - - -def check_computes_vms_running_via_virsh(): - """check all vms are running via virsh list command""" - for compute in topology.list_openstack_nodes(group='compute'): - hostname = get_fqdn_from_topology_node(compute) - retry = tobiko.retry(timeout=120, interval=5) - vms_df = get_compute_vms_df(hostname) - for vm_id in vms_df.vm_id.to_list(): - for _ in retry: - if check_vm_running_via_virsh(compute, vm_id): - LOG.info(f"{vm_id} is running ok on " - f"{compute.hostname}") - break - else: - LOG.info(f"{vm_id} is not in running state on " - f"{compute.hostname}") # Test is inteded for D/S env
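
Helper migration summary (a sketch derived from the diff above, for callers
that still use tobiko.tripleo.nova; note that the new check_vm_evacuations
takes a list of Nova server objects instead of a pandas DataFrame):

    # old helper (tobiko.tripleo.nova)          new helper (tobiko.openstack.nova)
    start_all_instances()                    -> action_on_all_instances('active')
    stop_all_instances()                     -> action_on_all_instances('shutoff')
    check_computes_vms_running_via_virsh()   -> check_virsh_domains_running()
    check_df_vms_ping(vms_df)                -> check_vms_ping(vm_list)
    check_vm_evacuations(vms_df_old=vms_df)  -> check_vm_evacuations(vms_old=vm_list)
    wait_for_all_instances_status(status)    -> wait_for_all_instances_status(status)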