Improve reset_all_compute_nodes method

The faults/ha tests use this method to reboot compute nodes, either sequentially or in parallel. To verify that the reboot had completed, the method executed a 'hostname' command on the rebooted computes, but this did not really guarantee that the reboot had completed. This change replaces the 'hostname' command with the existing RebootHostOperation.wait_for_operation method, which obtains the output of the 'uptime' command and compares it with the previous value: the reboot is done when the uptime value has been updated. Change-Id: I4f6a68eabec509b36e915fec7f4dfbd763c343d4
2023-07-13 10:44:10 +02:00 · 2023-07-13 10:44:10 +02:00 · a86c116782
commit a86c116782
parent 5d2a5574ea
1 changed files with 12 additions and 15 deletions
--- a/tobiko/tests/faults/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/ha/cloud_disruptions.py
@@ -434,33 +434,30 @@ def network_undisrupt_controllers_non_main_vip():
 def reset_all_compute_nodes(hard_reset=False, sequentially=False):

    # reboot all computes and wait for ssh Up on them
-    def _check_compute(ssh_client):
-        compute_checked = sh.execute(
-            "hostname", ssh_client=ssh_client, expect_exit_status=None).stdout
-        LOG.info('{} is up '.format(compute_checked))
-
    if hard_reset:
        reset_method = sh.hard_reset_method
    else:
        reset_method = sh.soft_reset_method

    nodes = topology.list_openstack_nodes(group='compute')
+    compute_reboot_operation_list = []
    for compute in nodes:
        # using ssh_client.connect we use a fire and forget reboot method
-        sh.reboot_host(ssh_client=compute.ssh_client, wait=False,
-                       method=reset_method)
+        # if sequentially, then wait and check uptime has changed
+        # else, do not wait (uptime will be checked later)
+        reboot_operation = sh.reboot_host(ssh_client=compute.ssh_client,
+                                          wait=sequentially,
+                                          method=reset_method)
+        compute_reboot_operation_list.append(reboot_operation)
        LOG.info('reboot exec:  {} on server: {}'.format(reset_method,
                                                         compute.name))
-        tobiko.cleanup_fixture(compute.ssh_client)
-        if sequentially:
-            # without the sleep, the command succeeds right after the reboot
-            # - i.e. the node had not been stopped yet
-            time.sleep(5)
-            _check_compute(compute.ssh_client)

    if not sequentially:
-        for compute in nodes:
-            _check_compute(compute.ssh_client)
+        for reboot_operation in compute_reboot_operation_list:
+            # checking uptime on each compute - it should have been updated
+            # after the reboot is done
+            reboot_operation.wait_for_operation()
+            LOG.info(f'{reboot_operation.hostname} is up')


 def reset_ovndb_pcs_master_resource():