Artom Lifshitz 6679a3853a Stop silently overriding clients with admin clients
Previously, our compute base class overwrote some clients with their
admin versions. This is needlessly confusing. This patch just removes
the setup_clients() method altogether, along with the
create_test_server() override, and forces tests to explicitly use
either self.os_primary or self.os_admin as the clients.

This change has a few simple but widespread consequences:

* We need to start using self.get_host_for_server() in a few places,
  instead of looking up the 'OS-EXT-SRV-ATTR:host' attribute in the
  server dict, as that's only present in the admin response.

* We can drop the public visibility in copy_default_image(), as that's
  only allowed for admins, and the default shared visibility should
  work just as well for us.

* The unit test for list_compute_hosts() would need to get fixed to
  account for the use of self.os_admin.services_client instead of
  self.services_client. Rather than do that, just drop the test
  entirely, it adds no value as list_compute_hosts() is used in
  whitebox tests themselves.

* We need to start explicitly passing wait_until='ACTIVE' to every
  one of our create_test_server calls, as the override used to do that
  for us.

* Our live_migrate() helper now needs to be passed a clients manager
  so that it can pass that through to the waiter when waiting for the
  server to reach a particular status after the live migration.

Depends-on: https://review.opendev.org/c/openstack/tempest/+/820062
Change-Id: I8d5be63275bd8a28b7012e14b99cadafdea53a47
2021-12-02 13:39:07 -05:00

495 lines
22 KiB
Python

# Copyright 2020 Red Hat Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from tempest.common.utils.linux import remote_client
from tempest.common import waiters
from tempest import config
from tempest.lib.common.utils import data_utils
from tempest.lib.common.utils import test_utils
from whitebox_tempest_plugin.api.compute import base
from oslo_log import log as logging
CONF = config.CONF
LOG = logging.getLogger(__name__)
def get_pci_address(domain, bus, slot, func):
    """Assemble PCI address components into a fully-specified PCI address.

    NOTE(jparker): This has been lifted from nova.pci.utils with no
    adjustments

    No validation is performed; components may be hex strings or wildcards.

    :param domain, bus, slot, func: Hex or wildcard strings.
    :return: A string of the form "<domain>:<bus>:<slot>.<function>".
    """
    return '%s:%s:%s.%s' % (domain, bus, slot, func)
class VGPUTest(base.BaseWhiteboxComputeTest):
    """Base class for vGPU whitebox tests.

    Creates a shared vGPU flavor for the test class and provides helpers to
    query placement for VGPU resource-class usage and to validate vGPU
    (mdev) devices both in the guest's domain XML and inside the guest via
    ssh.
    """

    # NOTE(jparker) as of Queens all hypervisors that support vGPUs accept
    # a single vGPU per instance, so this value is not exposed as a whitebox
    # hardware configurable at this time.
    vgpu_amount_per_instance = 1

    @classmethod
    def skip_checks(cls):
        """Skip the whole class unless a vGPU vendor id is configured."""
        super(VGPUTest, cls).skip_checks()
        if CONF.whitebox_hardware.vgpu_vendor_id is None:
            msg = "CONF.whitebox_hardware.vgpu_vendor_id needs to be set."
            raise cls.skipException(msg)

    @classmethod
    def resource_setup(cls):
        """Create the vGPU flavor shared by every test in the class."""
        # NOTE(jparker) Currently the inheritance tree for Whitebox test
        # classes for the test method create_flavor() does not resolve to a
        # classmethod for Whitebox. resource_setup expects setup methods to be
        # classmethods, so directly calling the flavors client to create the
        # necessary flavor for the tests
        super(VGPUTest, cls).resource_setup()
        flavor_name = data_utils.rand_name('vgpu_test_flavor')
        extra_specs = {"resources:VGPU": str(cls.vgpu_amount_per_instance)}
        cls.vgpu_flavor = cls.admin_flavors_client.create_flavor(
            name=flavor_name,
            ram=64,
            vcpus=2,
            disk=CONF.whitebox.flavor_volume_size,
            is_public='True')['flavor']
        cls.flavors_client.set_flavor_extra_spec(cls.vgpu_flavor['id'],
                                                 **extra_specs)
        # Class cleanups run LIFO: the flavor is deleted first, then we wait
        # for the deletion to actually complete.
        cls.addClassResourceCleanup(
            cls.admin_flavors_client.wait_for_resource_deletion,
            cls.vgpu_flavor['id'])
        cls.addClassResourceCleanup(cls.admin_flavors_client.delete_flavor,
                                    cls.vgpu_flavor['id'])

    def _create_ssh_client(self, server, validation_resources):
        """Create an ssh client to execute commands on the guest instance

        :param server: the ssh client will be setup to interface with the
            provided server instance
        :param validation_resources: necessary validation information to
            setup an ssh session
        :return linux_client: the ssh client that allows for guest command
            execution
        """
        linux_client = remote_client.RemoteClient(
            self.get_server_ip(server, validation_resources),
            self.image_ssh_user,
            self.image_ssh_password,
            validation_resources['keypair']['private_key'],
            server=server,
            servers_client=self.servers_client)
        # Fail fast if the guest is not actually reachable over ssh.
        linux_client.validate_authentication()
        return linux_client

    @classmethod
    def setup_credentials(cls):
        # A network is needed so that guests are reachable for validation.
        cls.prepare_instance_network()
        super(VGPUTest, cls).setup_credentials()

    def _get_rp_uuid_from_hostname(self, hostname):
        """Given a provided compute host return its associated rp uuid

        :param hostname: str, compute hostname to check
        :return parent_rp_uuid: str, string representation of the rp uuid
            found on the compute host
        """
        resp = self.os_admin.resource_providers_client.list_resource_providers(
            name=hostname)
        # The hostname uniquely identifies one provider, so take the first
        # (and only) entry.
        return resp.get('resource_providers')[0].get('uuid')

    def _get_all_children_of_resource_provider(self, rp_uuid):
        """List all child RP UUIDs of provided resource provider

        Given a parent resource provider uuid, get all in-tree child RP UUID
        that can provide the request resource amount

        API Reference:
        https://docs.openstack.org/api-ref/placement/#resource-providers

        :param rp_uuid: str, string representation of rp uuid to be searched
        :return rp_children_uuids: list of str, all rp uuids that match the
            resource=amount request
        """
        params = {'in_tree': rp_uuid}
        resp = self.os_admin.resource_providers_client.list_resource_providers(
            **params)
        # Create a list of uuids from the returned resource providers,
        # excluding the parent uuid from the list. NOTE: compare each
        # provider's 'uuid' field to rp_uuid; comparing the provider dict
        # itself to the uuid string never matches and would leave the parent
        # in the result.
        child_uuids = [x.get('uuid') for x in resp.get('resource_providers')
                       if x.get('uuid') != rp_uuid]
        return child_uuids

    def _get_usage_for_resource_class_vgpu(self, rp_uuids):
        """Total usage of resource class vGPU from provided list of RP UUIDs

        :param rp_uuids: list, comprised of str representing all RP UUIDs to
            query
        :return total_vgpu_usage: int, total usage of resource class VGPU
            from all provided RP UUIDS
        """
        total_vgpu_usage = 0
        for rp_uuid in rp_uuids:
            resp = self.os_admin.resource_providers_client.\
                list_resource_provider_usages(rp_uuid=rp_uuid)
            rp_usages = resp.get('usages')
            # A provider with no VGPU inventory simply contributes zero.
            vgpu_usage = rp_usages.get('VGPU', 0)
            total_vgpu_usage += vgpu_usage
        return total_vgpu_usage

    def _get_vgpu_util_for_host(self, hostname):
        """Get the total usage of a vGPU resource class from the compute host

        :param hostname: str, compute hostname to gather usage data from
        :return resource_usage_count: int, the current total usage for the
            vGPU resource class
        """
        # VGPU inventory lives on nested (child) providers under the compute
        # node's root provider, so sum usage across all children.
        rp_uuid = self._get_rp_uuid_from_hostname(hostname)
        rp_children = self._get_all_children_of_resource_provider(
            rp_uuid=rp_uuid)
        resource_usage_count = \
            self._get_usage_for_resource_class_vgpu(rp_children)
        return resource_usage_count

    def _get_pci_addr_from_device(self, xml_element):
        """Return pci address value from provided domain device xml element

        :param xml_element: Etree XML element device from guest instance
        :return str: the pci address found from the xml element in the format
            sys:bus:slot:function
        """
        pci_addr_element = xml_element.find(".address[@type='pci']")
        # libvirt reports the components as 0x-prefixed hex; strip the
        # prefix before assembling the canonical address string.
        domain = pci_addr_element.get('domain').replace('0x', '')
        bus = pci_addr_element.get('bus').replace('0x', '')
        slot = pci_addr_element.get('slot').replace('0x', '')
        func = pci_addr_element.get('function').replace('0x', '')
        pci_address = get_pci_address(domain, bus, slot, func)
        return pci_address

    def _assert_vendor_id_in_guest(self, pci_address, linux_client):
        """Confirm vgpu vendor id is present in server instance sysfs

        :param pci_address: str when searching the guest's pci devices use
            provided pci_address value to parse for vendor id
        :param linux_client: ssh client used to run the check in the guest
        """
        cmd = "cat /sys/bus/pci/devices/%s/vendor" % pci_address
        sys_out = linux_client.exec_command(cmd)
        self.assertIn(CONF.whitebox_hardware.vgpu_vendor_id, sys_out,
                      "Vendor ID %s not found in output %s" %
                      (CONF.whitebox_hardware.vgpu_vendor_id, sys_out))

    def _cold_migrate_server(self, server_id, target_host, revert=False):
        """Cold migrate a server with the option to revert the migration

        :param server_id: str, uuid of the server to migrate
        :param target_host: str, the compute host to migrate the server to
        :param revert: bool, revert server migration action if true
        """
        src_host = self.get_host_for_server(server_id)
        self.admin_servers_client.migrate_server(server_id, host=target_host)
        waiters.wait_for_server_status(self.servers_client, server_id,
                                       'VERIFY_RESIZE')
        if revert:
            self.admin_servers_client.revert_resize_server(server_id)
            # After a revert the server must end up back on the source host.
            assert_func = self.assertEqual
        else:
            self.admin_servers_client.confirm_resize_server(server_id)
            # After a confirmed migration the host must have changed.
            assert_func = self.assertNotEqual
        waiters.wait_for_server_status(self.servers_client,
                                       server_id, 'ACTIVE')
        dst_host = self.get_host_for_server(server_id)
        assert_func(src_host, dst_host)

    def _validate_vgpu_instance(self, server, linux_client,
                                expected_device_count):
        """Confirm vgpu guest XML is correct and vendor id is present in guest

        :param server: dict, attributes describing the guest instance
        :param linux_client: ssh client capable of interacting with guest,
            may be None when expected_device_count is 0
        :param expected_device_count: int, expected number of XML vgpu
            devices in the guest
        """
        # Find all hostdev devices on the instance of type mdev in the
        # provided instance's XML
        vgpu_devices = self.get_server_xml(server['id']).findall(
            "./devices/hostdev[@type='mdev']"
        )
        # Validate the number of mdev host devices is equal to the expected
        # count provided to the method
        self.assertEqual(
            expected_device_count, len(vgpu_devices), "Expected %d "
            "xml hostdev vgpu element(s) on instance %s but instead found %d" %
            (expected_device_count, server['id'],
             len(vgpu_devices)))
        # If there are no expected mdev devices, additional verification of
        # the vgpu device is not necessary
        if expected_device_count == 0:
            return
        # Determine the pci address of the vgpu hostdev element and use this
        # address to search for the vendor id in the guest sysfs
        for vgpu_xml_element in vgpu_devices:
            pci_address = self._get_pci_addr_from_device(vgpu_xml_element)
            # Validate the vendor id is present in guest instance
            self._assert_vendor_id_in_guest(pci_address, linux_client)

    def create_validateable_instance(self, flavor, validation_resources):
        """Create a validateable instance based on provided flavor

        :param flavor: dict, attributes describing flavor
        :param validation_resources: dict, parameters necessary to setup ssh
            client and validate the guest
        :return server: dict, the created server
        """
        server = self.create_test_server(
            flavor=flavor['id'],
            validatable=True,
            validation_resources=validation_resources,
            wait_until='ACTIVE')
        # NOTE(jparker) Order of operations for clean attempts to remove
        # validation resources before server is removed. Because of this
        # cleanup of validation fails since the server is still present.
        # Need to explicitly add cleanup of server first in order to remove
        # validations at end of test.
        self.addCleanup(waiters.wait_for_server_termination,
                        self.servers_client, server['id'])
        self.addCleanup(test_utils.call_and_ignore_notfound_exc,
                        self.servers_client.delete_server, server['id'])
        return server
class VGPUSanity(VGPUTest):
    """Basic smoke test for booting a guest with a vGPU resource."""

    def test_boot_instance_with_vgpu(self):
        """Test creating an instance with a vGPU resource"""
        # Boot a validatable guest from the vGPU flavor, then confirm its
        # domain XML holds the expected number of vgpu devices and that the
        # vgpu vendor id is visible in the guest's sysfs.
        validation_resources = self.get_test_validation_resources(
            self.os_primary)
        server = self.create_validateable_instance(
            self.vgpu_flavor, validation_resources)
        ssh_client = self._create_ssh_client(server, validation_resources)
        self._validate_vgpu_instance(
            server,
            linux_client=ssh_client,
            expected_device_count=self.vgpu_amount_per_instance)
class VGPUColdMigration(VGPUTest):
    """Cold migration tests for vGPU guests.

    Verifies that VGPU resource-class usage in placement follows the guest
    across a cold migration, and returns to the source host when the
    migration is reverted.
    """

    # Requires at least placement microversion 1.14 in order to search
    # through nested resource providers via the 'in_tree=<UUID>' parameter
    placement_min_microversion = '1.14'
    placement_max_microversion = 'latest'

    @classmethod
    def skip_checks(cls):
        super(VGPUColdMigration, cls).skip_checks()
        if CONF.compute.min_compute_nodes < 2:
            msg = "Need two or more compute nodes to execute cold migration"
            raise cls.skipException(msg)
        if not CONF.whitebox_hardware.vgpu_cold_migration_supported:
            msg = "vGPU Cold Migration support needed in order to run tests"
            raise cls.skipException(msg)

    def test_vgpu_cold_migration(self):
        """Cold migrate a vGPU guest and confirm usage moves with it."""
        validation_resources = self.get_test_validation_resources(
            self.os_primary)
        server = self.create_validateable_instance(
            self.vgpu_flavor, validation_resources)
        linux_client = self._create_ssh_client(server, validation_resources)
        # Determine the host the vGPU enabled guest is currently on. Next
        # get another potential compute host to serve as the migration target
        src_host = self.get_host_for_server(server['id'])
        dest_host = self.get_host_other_than(server['id'])
        # Get the current VGPU usage from the resource providers on
        # the source and destination compute hosts.
        pre_src_usage = self._get_vgpu_util_for_host(src_host)
        pre_dest_usage = self._get_vgpu_util_for_host(dest_host)
        # Cold migrate the instance to the target host
        self._cold_migrate_server(server['id'],
                                  target_host=dest_host,
                                  revert=False)
        LOG.info('Guest %(server)s was just cold migrated to %(dest_host)s, '
                 'guest will now be validated after operation',
                 {'server': server['id'], 'dest_host': dest_host})
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)
        # Regather the VGPU resource usage on both compute hosts involved in
        # the cold migration. Confirm the original source host's VGPU usage
        # has updated to no longer report original usage for the vGPU
        # resource and the destination is now accounting for the resource.
        post_src_usage = self._get_vgpu_util_for_host(src_host)
        post_dest_usage = self._get_vgpu_util_for_host(dest_host)
        expected_src_usage = pre_src_usage - self.vgpu_amount_per_instance
        self.assertEqual(
            expected_src_usage,
            post_src_usage, 'After migration, host %s expected to have %s '
            'usage for resource class VGPU but instead found %d' %
            (src_host, expected_src_usage, post_src_usage))
        expected_dest_usage = pre_dest_usage + self.vgpu_amount_per_instance
        self.assertEqual(
            expected_dest_usage, post_dest_usage, 'After migration, Host '
            '%s expected to have resource class VGPU usage totaling %d but '
            'instead found %d' %
            (dest_host, expected_dest_usage, post_dest_usage))

    def test_revert_vgpu_cold_migration(self):
        """Revert a cold migration and confirm usage stays on the source."""
        validation_resources = self.get_test_validation_resources(
            self.os_primary)
        server = self.create_validateable_instance(
            self.vgpu_flavor,
            validation_resources)
        linux_client = self._create_ssh_client(server, validation_resources)
        # Determine the host the vGPU enabled guest is currently on. Next
        # get another potential compute host to serve as the migration target
        src_host = self.get_host_for_server(server['id'])
        dest_host = self.get_host_other_than(server['id'])
        # Get the current VGPU usage from the resource providers on
        # the source and destination compute hosts.
        pre_src_usage = self._get_vgpu_util_for_host(src_host)
        pre_dest_usage = self._get_vgpu_util_for_host(dest_host)
        # Cold migrate the instance to the target host and then revert it
        self._cold_migrate_server(server['id'],
                                  target_host=dest_host,
                                  revert=True)
        # Sanity check the guest, confirming the vgpu XML device is present
        # and the vendor id is present in the sysfs
        LOG.info(
            'Cold migration of guest %(server)s has been reverted back to '
            '%(src_host)s, vGPU guest will now be validated after revert '
            'operation', {'server': server['id'], 'src_host': src_host})
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)
        # Regather the VGPU resource usage on both compute hosts involved in
        # the cold migration. Due to the migration revert, confirm the target
        # host's usage is back to its pre-migration value and the source
        # host accurately reports current usage based on flavor request
        current_src_usage = self._get_vgpu_util_for_host(src_host)
        current_dest_usage = self._get_vgpu_util_for_host(dest_host)
        self.assertEqual(
            pre_dest_usage, current_dest_usage, 'After migration revert, '
            'host %s expected to return to its pre-migration resource class '
            'VGPU usage but instead found %d' %
            (dest_host, current_dest_usage))
        # NOTE: the failure message must report the source host's current
        # usage (current_src_usage), not the destination's.
        self.assertEqual(
            pre_src_usage, current_src_usage, 'After migration revert '
            'host %s expected to have resource class VGPU usage totaling %d '
            'but instead found %d' %
            (src_host, pre_src_usage, current_src_usage))
        # Do a final sanity check of the guest after the revert to confirm
        # the vgpu device is present in the XML and vendor id is present in
        # sysfs
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)
class VGPUResizeInstance(VGPUTest):
    """Resize tests between vGPU and standard flavors.

    Verifies the guest XML and placement VGPU usage are updated correctly
    when resizing to and from a flavor that requests a vGPU resource.
    """

    # Requires at least placement microversion 1.14 in order to search
    # through nested resource providers via the 'in_tree=<UUID>' parameter
    placement_min_microversion = '1.14'
    placement_max_microversion = 'latest'

    @classmethod
    def skip_checks(cls):
        super(VGPUResizeInstance, cls).skip_checks()
        if not CONF.whitebox_hardware.vgpu_cold_migration_supported:
            msg = "vGPU Cold Migration support needed in order to run " \
                  "resize tests"
            raise cls.skipException(msg)
        if not CONF.compute_feature_enabled.resize:
            msg = 'Resize not available.'
            raise cls.skipException(msg)

    def test_vgpu_to_standard_resize(self):
        """Resize a vGPU guest to a standard flavor; usage must drop."""
        # Create a vGPU instance and get the vGPU resource utilization from
        # its compute host
        server = self.create_test_server(flavor=self.vgpu_flavor['id'],
                                         wait_until='ACTIVE')
        host = self.get_host_for_server(server['id'])
        pre_resize_usage = self._get_vgpu_util_for_host(host)
        standard_flavor = self.create_flavor()
        self.resize_server(server['id'], standard_flavor['id'])
        # Check the guest's XML and confirm the mdev device is no longer
        # present
        self._validate_vgpu_instance(
            server,
            linux_client=None,
            expected_device_count=0)
        if CONF.compute_feature_enabled.console_output:
            # Confirm there are no errors when interacting with the guest
            # after it was resized from vgpu to standard
            self.servers_client.get_console_output(server['id'])
        # Gather the vGPU resource utilization from the compute host. The
        # instance will either land on a new compute host or remain on
        # the same source host but will be resized to a standard flavor.
        # In either case the source host should always report a vGPU usage
        # that is less than what the guest was originally utilizing
        post_resize_usage = self._get_vgpu_util_for_host(host)
        expected_usage = pre_resize_usage - self.vgpu_amount_per_instance
        # Confirm the original host's vGPU resource usage now accounts for
        # the guest resizing to a flavor without any vGPU resources. NOTE:
        # the message needs a space between 'is' and 'reporting'; implicit
        # string concatenation previously produced "isreporting".
        self.assertEqual(
            expected_usage, post_resize_usage, 'After guest resize, host '
            '%s should be reporting total vGPU usage of %d, but instead is '
            'reporting %d' % (host, expected_usage, post_resize_usage))

    def test_standard_to_vgpu_resize(self):
        """Resize a standard guest to a vGPU flavor; device must appear."""
        # Create a standard instance and then resize the instance to a flavor
        # that uses a vGPU resource
        validation_resources = self.get_test_validation_resources(
            self.os_primary)
        standard_flavor = self.create_flavor()
        server = self.create_validateable_instance(
            standard_flavor,
            validation_resources)
        linux_client = self._create_ssh_client(server, validation_resources)
        self.resize_server(server['id'], self.vgpu_flavor['id'])
        # Check the guest's XML and confirm that the correct number of vGPU
        # devices are present and the devices are present in the guest sysfs
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)