Merge "Add first health checks to faults podified ha tests"
This commit is contained in:
commit 8a3b7ea932
@@ -13,6 +13,7 @@
 # under the License.
 from __future__ import absolute_import

+from tobiko.openstack.nova import _checks
 from tobiko.openstack.nova import _client
 from tobiko.openstack.nova import _cloud_init
 from tobiko.openstack.nova import _hypervisor
@@ -85,5 +86,12 @@ find_server_ip_address = _server.find_server_ip_address
 HasServerMixin = _server.HasServerMixin
 get_server_id = _server.get_server_id
 list_server_ip_addresses = _server.list_server_ip_addresses
+action_on_all_instances = _server.action_on_all_instances

 wait_for_services_up = _service.wait_for_services_up
+
+check_nova_services_health = _checks.check_nova_services_health
+check_virsh_domains_running = _checks.check_virsh_domains_running
+wait_for_all_instances_status = _checks.wait_for_all_instances_status
+check_vms_ping = _checks.check_vms_ping
+check_vm_evacuations = _checks.check_vm_evacuations
tobiko/openstack/nova/_checks.py (new file, 147 lines)
@@ -0,0 +1,147 @@
+# Copyright (c) 2025 Red Hat, Inc.
+#
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+from oslo_log import log
+
+import tobiko
+from tobiko.openstack.nova import _client
+from tobiko.openstack.nova import _server
+from tobiko.openstack.nova import _service
+from tobiko.openstack import topology
+from tobiko.shell import ping
+from tobiko.shell import sh
+
+
+LOG = log.getLogger(__name__)
+
+
+def check_nova_services_health(timeout=600., interval=2.):
+    retry = tobiko.retry(timeout=timeout, interval=interval)
+    _service.wait_for_services_up(retry=retry)
+
+
+def check_virsh_domains_running():
+    """check all vms are running via virsh list command"""
+    for compute in topology.list_openstack_nodes(group='compute'):
+        hostname = sh.get_hostname(ssh_client=compute.ssh_client,
+                                   fqdn=True)
+        param = {'OS-EXT-SRV-ATTR:hypervisor_hostname': hostname}
+        vm_list_per_compute = _client.list_servers(**param)
+        for vm in vm_list_per_compute:
+            for attempt in tobiko.retry(timeout=120, interval=5):
+                if check_vm_running_via_virsh(compute, vm.id):
+                    LOG.info(f"{vm.id} is running ok on {hostname}")
+                    break
+                else:
+                    msg = f"{vm.id} is not in running state on {hostname}"
+                    if attempt.is_last:
+                        tobiko.fail("timeout!! " + msg)
+                    LOG.error(f"{vm.id} is not in running state on "
+                              f"{hostname} ... Retrying")
+
+
+def check_vms_ping(vm_list):
+    for vm in vm_list:
+        fip = _server.list_server_ip_addresses(
+            vm, address_type='floating').first
+        ping.ping_until_received(fip).assert_replied()
+
+
+def check_vm_evacuations(vms_old=None, compute_host=None, timeout=600,
+                         interval=2, check_no_evacuation=False):
+    """check evacuation of vms
+    input: old vm status and expected new compute"""
+
+    for attempt in tobiko.retry(timeout=timeout, interval=interval):
+        failures = []
+        param = ({} if compute_host is None
+                 else {'OS-EXT-SRV-ATTR:hypervisor_hostname': compute_host})
+        vms_new = _client.list_servers(**param)
+        for vm_old in vms_old or []:
+            old_bm_host = vm_old._info[  # pylint: disable=W0212
+                'OS-EXT-SRV-ATTR:hypervisor_hostname']
+            new_vm_host = vms_new.with_attributes(  # pylint: disable=W0212
+                id=vm_old.id).uniq._info[
+                    'OS-EXT-SRV-ATTR:hypervisor_hostname']
+
+            if check_no_evacuation:
+                cond = bool(old_bm_host != new_vm_host)
+            else:
+                cond = bool(old_bm_host == new_vm_host)
+
+            if cond:
+                failures.append(
+                    'Failed vm evacuations: {}\n\n'.format(vm_old))
+        if not failures:
+            LOG.debug(vms_old.to_string())
+            LOG.debug('All vms were evacuated!')
+            return
+
+        if attempt.is_last:
+            tobiko.fail(
+                'Timeout checking VM evacuations:\n{!s}', '\n'.join(failures))
+        else:
+            LOG.error('Failed nova evacuation:\n {}'.format(failures))
+            LOG.error('Retrying...')
+
+
+def check_vm_running_via_virsh(topology_compute, vm_id):
+    """check that a vm is in running state via virsh command,
+    return false if not"""
+    if vm_id in get_vm_uuid_list_running_via_virsh(topology_compute):
+        return True
+    else:
+        return False
+
+
+def get_vm_uuid_list_running_via_virsh(topology_compute):
+    from tobiko import podified
+    from tobiko.tripleo import containers
+    from tobiko.tripleo import overcloud
+
+    get_uuid_loop = ("for i in `virsh list --name --state-running`; do "
+                     "virsh domuuid $i; done")
+    containerized_libvirt_cmd = \
+        "{container_runtime} exec -u root {nova_libvirt} sh -c '{get_uuids}'"
+
+    if podified.has_podified_cp():
+        command = containerized_libvirt_cmd.format(
+            container_runtime=podified.CONTAINER_RUNTIME,
+            nova_libvirt=podified.NOVA_LIBVIRT_CONTAINER,
+            get_uuids=get_uuid_loop)
+    elif overcloud.has_overcloud():
+        command = containerized_libvirt_cmd.format(
+            container_runtime=containers.get_container_runtime_name(),
+            nova_libvirt=containers.get_libvirt_container_name(),
+            get_uuids=get_uuid_loop)
+    else:
+        command = get_uuid_loop
+
+    return sh.execute(command,
+                      ssh_client=topology_compute.ssh_client,
+                      sudo=True).stdout.split()
+
+
+def wait_for_all_instances_status(status, timeout=None):
+    """wait for all instances for a certain status or raise an exception"""
+    for instance in _client.list_servers():
+        _client.wait_for_server_status(server=instance.id, status=status,
+                                       timeout=timeout)
+        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
+            nova_instance=instance.name,
+            state=status,
+            host=instance._info[  # pylint: disable=W0212
+                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        LOG.info(instance_info)
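The new checks module is re-exported through tobiko.openstack.nova (see the __init__ hunk above), so callers compose them directly from the package. A minimal usage sketch, assuming tobiko is already configured against a running cloud and the servers have floating IPs:

    from tobiko.openstack import nova

    # passive checks: nova services report up, every libvirt domain is running
    nova.check_nova_services_health(timeout=600., interval=2.)
    nova.check_virsh_domains_running()

    # active checks: start every server, wait for ACTIVE, then ping floating IPs
    nova.action_on_all_instances('active')
    nova.wait_for_all_instances_status('ACTIVE')
    nova.check_vms_ping(nova.list_servers())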
@@ -136,3 +136,24 @@ def get_server_id(server: _client.ServerType) -> str:
         return server
     else:
         return server.id
+
+
+def action_on_all_instances(action):
+    """try to start/stop all instances"""
+    if action not in ('active', 'shutoff'):
+        tobiko.fail(f'Wrong action on VM instances: {action}')
+
+    client_action_method = (_client.activate_server if action == 'active'
+                            else _client.shutoff_server)
+    expected_vm_status = 'ACTIVE' if action == 'active' else 'SHUTOFF'
+
+    for instance in _client.list_servers():
+        activated_instance = client_action_method(instance)
+        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
+            nova_instance=activated_instance.name,
+            state=activated_instance.status,
+            host=activated_instance._info[  # pylint: disable=W0212
+                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        LOG.info(instance_info)
+        if activated_instance.status != expected_vm_status:
+            tobiko.fail(instance_info)
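A short sketch of how the new helper behaves, assuming at least one server exists; any action other than 'active' or 'shutoff' fails immediately via tobiko.fail():

    from tobiko.openstack import nova

    nova.action_on_all_instances('shutoff')  # stop every server, expect SHUTOFF
    nova.action_on_all_instances('active')   # start them again, expect ACTIVE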
@@ -18,6 +18,9 @@ from tobiko.podified import _openshift
 from tobiko.podified import containers


+NOVA_LIBVIRT_CONTAINER = 'nova_compute'
+CONTAINER_RUNTIME = 'podman'
+
 EDPM_NODE = _topology.EDPM_NODE
 OCP_WORKER = _topology.OCP_WORKER
 EDPM_COMPUTE_GROUP = _openshift.EDPM_COMPUTE_GROUP
@@ -31,10 +31,14 @@ class HostnameError(tobiko.TobikoException):

 HOSTNAMES_CACHE: typing.MutableMapping[typing.Optional[ssh.SSHClientFixture],
                                        str] = weakref.WeakKeyDictionary()
+HOSTNAMES_FQDN_CACHE: \
+    typing.MutableMapping[typing.Optional[ssh.SSHClientFixture],
+                          str] = weakref.WeakKeyDictionary()


 def get_hostname(ssh_client: ssh.SSHClientType = None,
                  cached=True,
+                 fqdn=False,
                  **execute_params) -> str:
     ssh_client = ssh.ssh_client_fixture(ssh_client)
     if ssh_client is None:
@@ -42,25 +46,36 @@ def get_hostname(ssh_client: ssh.SSHClientType = None,

     if cached:
         try:
-            hostname = HOSTNAMES_CACHE[ssh_client]
+            if not fqdn:
+                hostname = HOSTNAMES_CACHE[ssh_client]
+            else:
+                hostname = HOSTNAMES_FQDN_CACHE[ssh_client]
         except KeyError:
             pass
         else:
             return hostname

     hostname = ssh_hostname(ssh_client=ssh_client,
+                            fqdn=fqdn,
                             **execute_params)
     if cached:
-        HOSTNAMES_CACHE[ssh_client] = hostname
+        if not fqdn:
+            HOSTNAMES_CACHE[ssh_client] = hostname
+        else:
+            HOSTNAMES_FQDN_CACHE[ssh_client] = hostname
     return hostname


 def ssh_hostname(ssh_client: ssh.SSHClientFixture,
+                 fqdn=False,
                  **execute_params) \
         -> str:
     tobiko.check_valid_type(ssh_client, ssh.SSHClientFixture)
+    command = 'hostname'
+    if fqdn:
+        command += ' -f'
     try:
-        result = _execute.execute('hostname',
+        result = _execute.execute(command,
                                   ssh_client=ssh_client,
                                   **execute_params)
     except _exception.ShellCommandFailed as ex:
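With the new fqdn flag, short and fully qualified hostnames are cached independently (HOSTNAMES_CACHE vs HOSTNAMES_FQDN_CACHE), so mixing both call styles on the same SSH client is safe. A minimal sketch, assuming SSH-reachable compute nodes as in check_virsh_domains_running:

    from tobiko.openstack import topology
    from tobiko.shell import sh

    for node in topology.list_openstack_nodes(group='compute'):
        short_name = sh.get_hostname(ssh_client=node.ssh_client)
        fqdn = sh.get_hostname(ssh_client=node.ssh_client, fqdn=True)  # runs 'hostname -f'
        print(short_name, fqdn)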
@@ -31,6 +31,7 @@ from tobiko import config
 from tobiko.openstack import glance
 from tobiko.openstack import keystone
 from tobiko.openstack import neutron
+from tobiko.openstack import nova
 from tobiko.openstack import stacks
 from tobiko.openstack import tests
 from tobiko.openstack import topology
@@ -38,7 +39,6 @@ from tobiko.tests.faults.ha import test_cloud_recovery
 from tobiko.shell import ping
 from tobiko.shell import sh
 from tobiko.tripleo import containers
-from tobiko.tripleo import nova
 from tobiko.tripleo import pacemaker
 from tobiko.tripleo import topology as tripleo_topology
 from tobiko import tripleo
@@ -760,6 +760,15 @@ def evac_failover_compute(compute_host, failover_type=sh.hard_reset_method):
     disrupt_node(compute_host, disrupt_method=failover_type)


+def get_random_compute_with_vms():
+    for compute in nova.list_hypervisors():
+        param = {'OS-EXT-SRV-ATTR:hypervisor_hostname':
+                 compute.hypervisor_hostname}
+        vm_list_per_compute = nova.list_servers(**param)
+        if len(vm_list_per_compute) > 0:
+            return compute.hypervisor_hostname
+
+
 def check_iha_evacuation(failover_type=None, vm_type=None):
     """check vms on compute host,disrupt compute host,
     check all vms evacuated and pingable"""
@@ -767,36 +776,37 @@ def check_iha_evacuation(failover_type=None, vm_type=None):
         LOG.info(f'Begin IHA tests iteration {iteration}')
         LOG.info('create 2 vms')
         tests.test_servers_creation(number_of_servers=2)
-        compute_host = nova.get_random_compute_with_vms_name()
-        vms_starting_state_df = nova.get_compute_vms_df(compute_host)
+        compute_host = get_random_compute_with_vms()
+        vms_starting_state = nova.list_servers(
+            **{'OS-EXT-SRV-ATTR:hypervisor_hostname': compute_host})
         if vm_type == 'shutoff':
-            nova.stop_all_instances()
+            nova.action_on_all_instances('shutoff')
         if vm_type == 'evac_image_vm':
             evac_vm_stack = tests.test_evacuable_server_creation()
-            evac_vm_id = nova.get_stack_server_id(evac_vm_stack)
-            org_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
+            evac_vm_id = evac_vm_stack.server_details.id
+            old_nova_evac = nova.get_server(server_id=evac_vm_id)
         if not vm_type == 'shutoff':
-            nova.check_df_vms_ping(vms_starting_state_df)
+            nova.check_vms_ping(vms_starting_state)
         LOG.info(f'perform a failover on {compute_host}')
         evac_failover_compute(compute_host, failover_type=failover_type)
         test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
         if vm_type == 'evac_image_vm':
-            nova.check_vm_evacuations(vms_df_old=org_nova_evac_df,
+            nova.check_vm_evacuations(vms_old=old_nova_evac,
                                       compute_host=compute_host,
                                       timeout=600,
                                       check_no_evacuation=True)
             # delete evacuable tagged image because it prevents
             # non tagged evacuations if exists
             delete_evacuable_tagged_image()
-            new_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
-            nova.check_vm_evacuations(org_nova_evac_df, new_nova_evac_df)
+            new_nova_evac = nova.get_server(server_id=evac_vm_id)
+            nova.check_vm_evacuations(old_nova_evac, new_nova_evac)
         else:
-            nova.check_vm_evacuations(vms_df_old=vms_starting_state_df,
+            nova.check_vm_evacuations(vms_old=vms_starting_state,
                                       compute_host=compute_host,
                                       timeout=600)
         LOG.info('check evac is Done')
         if not vm_type == 'shutoff':
-            nova.check_df_vms_ping(vms_starting_state_df)
+            nova.check_vms_ping(vms_starting_state)


 def check_iha_evacuation_evac_image_vm():
@@ -50,16 +50,15 @@ has_external_lb = CONF.tobiko.rhosp.has_external_load_balancer

 def overcloud_health_checks(passive_checks_only=False,
                             skip_mac_table_size_test=False):
-    # this method will be changed in future commit
     check_pacemaker_resources_health()
     check_overcloud_processes_health()
-    nova.check_nova_services_health()
+    nova_osp.check_nova_services_health()
     tests.test_alive_agents_are_consistent_along_time()
     if not passive_checks_only:
-        # create a uniq stack
+        # create a unique stack that will be cleaned up at the end of each test
         check_vm_create()
-        nova.start_all_instances()
-        nova.check_computes_vms_running_via_virsh()
+        nova_osp.action_on_all_instances('active')
+        nova_osp.check_virsh_domains_running()
         containers.list_node_containers.cache_clear()
         containers.assert_all_tripleo_containers_running()
         containers.assert_equal_containers_state()
@@ -231,10 +230,10 @@ class DisruptTripleoNodesTest(testtools.TestCase):
                 hard_reset=False,
                 sequentially=sequentially)
             # verify VM status is updated after reboot
-            nova.wait_for_all_instances_status('SHUTOFF')
+            nova_osp.wait_for_all_instances_status('SHUTOFF')
             # start all VM instance
             # otherwise sidecar containers will not run after computes reboot
-            nova.start_all_instances()
+            nova_osp.action_on_all_instances('active')
             OvercloudHealthCheck.run_after(passive_checks_only=True)

         _run_test()
@@ -247,7 +246,7 @@ class DisruptTripleoNodesTest(testtools.TestCase):
         # nova.wait_for_all_instances_status('SHUTOFF')
         # # start all VM instance
         # # otherwise sidecar containers will not run after computes reboot
-        # nova.start_all_instances()
+        # nova_osp.action_on_all_instances('active')
         # OvercloudHealthCheck.run_after(passive_checks_only=True)

     @testtools.skipIf(has_external_lb, SKIP_MESSAGE_EXTLB)
@@ -14,11 +14,37 @@
 # License for the specific language governing permissions and limitations
 from __future__ import absolute_import

+from oslo_log import log
 import testtools

+from tobiko.tests.faults.ha import test_cloud_recovery
 from tobiko.tests.faults.podified.ha import cloud_disruptions
+from tobiko.openstack import tests
 from tobiko import podified
+from tobiko.openstack import nova
+
+
+LOG = log.getLogger(__name__)
+
+
+def podified_health_checks():
+    nova.check_nova_services_health()
+    tests.test_alive_agents_are_consistent_along_time()
+    # create a unique stack that will be cleaned up at the end of each test
+    # TODO(eolivare) add tests.test_server_creation_no_fip() when BGP is
+    # configured with expose_tenant_networks
+    tests.test_server_creation()
+    nova.action_on_all_instances('active')
+    nova.check_virsh_domains_running()
+    test_cloud_recovery.octavia_health_checks()
+
+
+class PodifiedCloudHealthCheck(test_cloud_recovery.OvercloudHealthCheck):
+    def setup_fixture(self):
+        # run validations
+        LOG.info("Start executing Podified health checks.")
+        podified_health_checks()
+        LOG.info("Podified health checks successfully executed.")
+
+
 @podified.skip_if_not_podified
@@ -27,17 +53,20 @@ class DisruptPodifiedNodesTest(testtools.TestCase):
    disruptive_action: a function that runs some
    disruptive scenario on a node"""

+    def test_0vercloud_health_check(self):
+        PodifiedCloudHealthCheck.run_before()
+
     def test_kill_all_galera_services(self):
-        # HealthCheck.run_before()
+        PodifiedCloudHealthCheck.run_before()
         cloud_disruptions.kill_all_galera_services()
-        # HealthCheck.run_after()
+        PodifiedCloudHealthCheck.run_after()

     def test_remove_all_grastate_galera(self):
-        # HealthCheck.run_before()
+        PodifiedCloudHealthCheck.run_before()
         cloud_disruptions.remove_all_grastate_galera()
-        # HealthCheck.run_before()
+        PodifiedCloudHealthCheck.run_after()

     def test_remove_one_grastate_galera(self):
-        # HealthCheck.run_before()
+        PodifiedCloudHealthCheck.run_before()
         cloud_disruptions.remove_one_grastate_galera()
-        # HealthCheck.run_after()
+        PodifiedCloudHealthCheck.run_after()
@@ -1,13 +1,24 @@
+# Copyright (c) 2025 Red Hat, Inc.
+#
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
 from __future__ import absolute_import

-import time
 import typing  # noqa
 from functools import wraps

 import netaddr
-from oslo_log import log
-import pandas

 import tobiko
 from tobiko.tripleo import overcloud
@@ -15,217 +26,6 @@ from tobiko.shell import iperf3
 from tobiko.shell import ping
 from tobiko.shell import sh
 from tobiko.shell import ssh
-from tobiko.openstack import nova
-from tobiko.openstack import topology
-from tobiko.tripleo import containers
-
-
-LOG = log.getLogger(__name__)
-
-
-def check_nova_services_health(timeout=600., interval=2.):
-    retry = tobiko.retry(timeout=timeout, interval=interval)
-    nova.wait_for_services_up(retry=retry)
-
-
-def start_all_instances():
-    """try to start all stopped overcloud instances"""
-    for instance in nova.list_servers():
-        activated_instance = nova.activate_server(instance)
-        time.sleep(3)
-        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
-            nova_instance=activated_instance.name,
-            state=activated_instance.status,
-            host=activated_instance._info[  # pylint: disable=W0212
-                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
-        LOG.info(instance_info)
-        if activated_instance.status != 'ACTIVE':
-            tobiko.fail(instance_info)
-
-
-def stop_all_instances():
-    """try to start all stopped overcloud instances"""
-    for instance in nova.list_servers():
-        activated_instance = nova.shutoff_server(instance)
-        time.sleep(3)
-        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
-            nova_instance=activated_instance.name,
-            state=activated_instance.status,
-            host=activated_instance._info[  # pylint: disable=W0212
-                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
-        LOG.info(instance_info)
-        if activated_instance.status != 'SHUTOFF':
-            tobiko.fail(instance_info)
-
-
-def wait_for_all_instances_status(status, timeout=None):
-    """wait for all instances for a certain status or raise an exception"""
-    for instance in nova.list_servers():
-        nova.wait_for_server_status(server=instance.id, status=status,
-                                    timeout=timeout)
-        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
-            nova_instance=instance.name,
-            state=status,
-            host=instance._info[  # pylint: disable=W0212
-                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
-        LOG.info(instance_info)
-
-
-def get_vms_table():
-    """populate a dataframe with vm host,id,status"""
-    vms_data = [(vm._info[  # pylint: disable=W0212
-        'OS-EXT-SRV-ATTR:hypervisor_hostname'], vm.id,
-        vm.status) for vm in nova.list_servers()]
-    vms_df = pandas.DataFrame(vms_data, columns=['vm_host', 'vm_id',
-                                                 'vm_state'])
-    return vms_df
-
-
-def list_computes():
-    """list compute host names"""
-    return [compute.hypervisor_hostname for compute in nova.list_hypervisors()]
-
-
-def get_compute_vms_df(compute_host):
-    """input: compute hostname (can be short)
-    output: dataframe with vms of that host"""
-    return get_vms_table().query(f"vm_host=='{compute_host}'")
-
-
-def get_random_compute_with_vms_name():
-    """get a randomcompute holding vm/s"""
-    for compute in list_computes():
-        if not get_compute_vms_df(compute).empty:
-            return compute
-
-
-def vm_info(vm_id, vms_df):
-    """input: vm and a vms df
-    output: host string"""
-    return vms_df.query(f"vm_id == '{vm_id}'").to_string()
-
-
-def vm_df(vm_id, vms_df):
-    """input: vm and a vms df
-    output: host string"""
-    return vms_df.query(f"vm_id == '{vm_id}'")
-
-
-def vm_floating_ip(vm_id):
-    """input: vm_id
-    output it's floating ip"""
-
-    vm = nova.get_server(vm_id)
-    floating_ip = nova.list_server_ip_addresses(
-        vm, address_type='floating').first
-    return floating_ip
-
-
-def check_ping_vm_fip(fip):
-    ping.ping_until_received(fip).assert_replied()
-
-
-def check_df_vms_ping(df):
-    """input: dataframe with vms_ids
-    try to ping all vms in df"""
-
-    for vm_id in df.vm_id.to_list():
-        check_ping_vm_fip(vm_floating_ip(vm_id))
-
-
-def vm_location(vm_id, vms_df):
-    """input: vm and a vms df
-    output: host string"""
-    return vms_df.query(f"vm_id == '{vm_id}'")['vm_host'].to_string(
-        index=False)
-
-
-def check_vm_evacuations(vms_df_old=None, compute_host=None, timeout=600,
-                         interval=2, check_no_evacuation=False):
-    """check evacuation of vms
-    input: old and new vms_state_tables dfs"""
-    failures = []
-    start = time.time()
-
-    while time.time() - start < timeout:
-        failures = []
-        vms_df_new = get_compute_vms_df(compute_host)
-        for vm_id in vms_df_old.vm_id.to_list():
-            old_bm_host = vm_location(vm_id, vms_df_old)
-            new_vm_host = vm_location(vm_id, vms_df_new)
-
-            if check_no_evacuation:
-                cond = bool(old_bm_host != new_vm_host)
-            else:
-                cond = bool(old_bm_host == new_vm_host)
-
-            if cond:
-                failures.append(
-                    'failed vm evacuations: {}\n\n'.format(vm_info(vm_id,
-                                                                   vms_df_old)))
-        if failures:
-            LOG.info('Failed nova evacuation:\n {}'.format(failures))
-            LOG.info('Not all nova vms evacuated ..')
-            LOG.info('Retrying , timeout at: {}'
-                     .format(timeout-(time.time() - start)))
-            time.sleep(interval)
-        else:
-            LOG.info(vms_df_old.to_string())
-            LOG.info('All vms were evacuated!')
-            return
-    # exhausted all retries
-    if failures:
-        tobiko.fail(
-            'failed vm evacuations:\n{!s}', '\n'.join(failures))
-
-
-def get_stack_server_id(stack):
-    return stack.server_details.id
-
-
-def get_fqdn_from_topology_node(topology_node):
-    return sh.execute("hostname -f", ssh_client=topology_node.ssh_client,
-                      expect_exit_status=None).stdout.strip()
-
-
-def check_vm_running_via_virsh(topology_compute, vm_id):
-    """check that a vm is in running state via virsh command,
-    return false if not"""
-    if vm_id in get_vm_uuid_list_running_via_virsh(topology_compute):
-        return True
-    else:
-        return False
-
-
-def get_vm_uuid_list_running_via_virsh(topology_compute):
-    if overcloud.has_overcloud():
-        container_runtime = containers.get_container_runtime_name()
-        nova_libvirt = containers.get_libvirt_container_name()
-        command = f"sudo {container_runtime} exec {nova_libvirt} " \
-                  f"sh -c 'for i in `virsh list --name --state-running` " \
-                  f";do virsh domuuid $i;done'"
-    else:
-        command = "for i in `sudo virsh list --name --state-running` " \
-                  ";do virsh domuuid $i;done'"
-    return sh.execute(command,
-                      ssh_client=topology_compute.ssh_client).stdout.split()
-
-
-def check_computes_vms_running_via_virsh():
-    """check all vms are running via virsh list command"""
-    for compute in topology.list_openstack_nodes(group='compute'):
-        hostname = get_fqdn_from_topology_node(compute)
-        retry = tobiko.retry(timeout=120, interval=5)
-        vms_df = get_compute_vms_df(hostname)
-        for vm_id in vms_df.vm_id.to_list():
-            for _ in retry:
-                if check_vm_running_via_virsh(compute, vm_id):
-                    LOG.info(f"{vm_id} is running ok on "
-                             f"{compute.hostname}")
-                    break
-                else:
-                    LOG.info(f"{vm_id} is not in running state on "
-                             f"{compute.hostname}")
-
-
 # Test is inteded for D/S env