
Previously, our compute base class overwrote some clients with their
admin versions. This is needlessly confusing. This patch just removes
the setup_clients() method altogether, along with the
create_test_server() override, and forces tests to explicitly use
either self.os_primary or self.os_admin as the clients. This change
has a few simple but widespread consequences:

* We need to start using self.get_host_for_server() in a few places,
  instead of looking up the 'OS-EXT-SRV-ATTR:host' attribute in the
  server dict, as that's only present in the admin response.

* We can drop the public visibility in copy_default_image(), as that's
  only allowed for admins, and the default shared visibility should
  work just as well for us.

* The unit test for list_compute_hosts() would need to be fixed to
  account for the use of self.os_admin.services_client instead of
  self.services_client. Rather than do that, just drop the test
  entirely; it adds no value, as list_compute_hosts() is exercised by
  the whitebox tests themselves.

* We need to start explicitly passing wait_until='ACTIVE' to every one
  of our create_test_server() calls, as the override used to do that
  for us. A sketch of the resulting call pattern follows below.

* Our live_migrate() helper now needs to be passed a clients manager so
  that it can pass that through to the waiter when waiting for the
  server to reach a particular status after the live migration.

Depends-On: https://review.opendev.org/c/openstack/tempest/+/820062
Change-Id: I8d5be63275bd8a28b7012e14b99cadafdea53a47
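
For illustration only, a minimal sketch of the call pattern this change
expects (self.os_admin, create_test_server() and get_host_for_server()
are the existing helpers; the test body itself is hypothetical):

    # Boot via the primary credentials and wait explicitly, since the
    # removed create_test_server() override no longer does it for us.
    server = self.create_test_server(wait_until='ACTIVE')

    # Look the host up via the helper rather than reading the
    # 'OS-EXT-SRV-ATTR:host' key, which only admin responses contain.
    host = self.get_host_for_server(server['id'])

    # Admin-only operations are spelled out against self.os_admin.
    services = self.os_admin.services_client.list_services(
        binary='nova-compute')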
# Copyright 2020 Red Hat Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from tempest.common.utils.linux import remote_client
from tempest.common import waiters
from tempest import config
from tempest.lib.common.utils import data_utils
from tempest.lib.common.utils import test_utils
from whitebox_tempest_plugin.api.compute import base

from oslo_log import log as logging

CONF = config.CONF
LOG = logging.getLogger(__name__)


def get_pci_address(domain, bus, slot, func):
    """Assembles PCI address components into a fully-specified PCI address.

    NOTE(jparker): This has been lifted from nova.pci.utils with no
    adjustments

    Does not validate that the components are valid hex or wildcard values.
    :param domain, bus, slot, func: Hex or wildcard strings.
    :return: A string of the form "<domain>:<bus>:<slot>.<function>".
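    For example, get_pci_address('0000', '06', '12', '3') returns
    '0000:06:12.3' (illustrative values, not from a real host).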
"""
|
|
return '%s:%s:%s.%s' % (domain, bus, slot, func)
|
|
|
|
|
|


class VGPUTest(base.BaseWhiteboxComputeTest):

    # NOTE(jparker) as of Queens all hypervisors that support vGPUs accept
    # a single vGPU per instance, so this value is not exposed as a whitebox
    # hardware configurable at this time.
    vgpu_amount_per_instance = 1

    @classmethod
    def skip_checks(cls):
        super(VGPUTest, cls).skip_checks()
        if CONF.whitebox_hardware.vgpu_vendor_id is None:
            msg = "CONF.whitebox_hardware.vgpu_vendor_id needs to be set."
            raise cls.skipException(msg)

    @classmethod
    def resource_setup(cls):
        # NOTE(jparker) Currently the inheritance tree for Whitebox test
        # classes does not resolve create_flavor() to a classmethod.
        # resource_setup() expects setup methods to be classmethods, so
        # call the flavors client directly to create the flavor the tests
        # need.
        super(VGPUTest, cls).resource_setup()
        flavor_name = data_utils.rand_name('vgpu_test_flavor')
        extra_specs = {"resources:VGPU": str(cls.vgpu_amount_per_instance)}
        cls.vgpu_flavor = cls.admin_flavors_client.create_flavor(
            name=flavor_name,
            ram=64,
            vcpus=2,
            disk=CONF.whitebox.flavor_volume_size,
            is_public='True')['flavor']
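        # Setting flavor extra specs is an admin-only action under Nova's
        # default policy, so use the admin flavors client here as well.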
        cls.admin_flavors_client.set_flavor_extra_spec(cls.vgpu_flavor['id'],
                                                       **extra_specs)

        cls.addClassResourceCleanup(
            cls.admin_flavors_client.wait_for_resource_deletion,
            cls.vgpu_flavor['id'])
        cls.addClassResourceCleanup(cls.admin_flavors_client.delete_flavor,
                                    cls.vgpu_flavor['id'])

    def _create_ssh_client(self, server, validation_resources):
        """Create an ssh client to execute commands on the guest instance

        :param server: the ssh client will be set up to interface with the
            provided server instance
        :param validation_resources: necessary validation information to set
            up an ssh session
        :return linux_client: the ssh client that allows for guest command
            execution
        """
        linux_client = remote_client.RemoteClient(
            self.get_server_ip(server, validation_resources),
            self.image_ssh_user,
            self.image_ssh_password,
            validation_resources['keypair']['private_key'],
            server=server,
            servers_client=self.servers_client)
        linux_client.validate_authentication()
        return linux_client

    @classmethod
    def setup_credentials(cls):
        cls.prepare_instance_network()
        super(VGPUTest, cls).setup_credentials()

    def _get_rp_uuid_from_hostname(self, hostname):
        """Given a provided compute host return its associated rp uuid

        :param hostname: str, compute hostname to check
        :return parent_rp_uuid: str, string representation of the rp uuid
            found on the compute host
        """
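        # Assume the hostname matches exactly one resource provider and
        # return that provider's uuid.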
        resp = self.os_admin.resource_providers_client.list_resource_providers(
            name=hostname)
        return resp.get('resource_providers')[0].get('uuid')

    def _get_all_children_of_resource_provider(self, rp_uuid):
        """List all child RP UUIDs of the provided resource provider

        Given a parent resource provider uuid, get all in-tree child RP
        UUIDs that can provide the requested resource amount.
        API Reference:
        https://docs.openstack.org/api-ref/placement/#resource-providers

        :param rp_uuid: str, string representation of rp uuid to be searched
        :return rp_children_uuids: list of str, all rp uuids that match the
            resource=amount request
        """
        params = {'in_tree': rp_uuid}
        resp = self.os_admin.resource_providers_client.list_resource_providers(
            **params)
        # Create a list of uuids based on the resource providers in the
        # response from the rp client, excluding the parent uuid from this
        # list.
        child_uuids = [x.get('uuid') for x in resp.get('resource_providers')
                       if x.get('uuid') != rp_uuid]
        return child_uuids

    def _get_usage_for_resource_class_vgpu(self, rp_uuids):
        """Total usage of resource class VGPU from provided list of RP UUIDs

        :param rp_uuids: list, comprised of str representing all RP UUIDs to
            query
        :return total_vgpu_usage: int, total usage of resource class VGPU from
            all provided RP UUIDs
        """
        total_vgpu_usage = 0
        for rp_uuid in rp_uuids:
            resp = self.os_admin.resource_providers_client.\
                list_resource_provider_usages(rp_uuid=rp_uuid)
            rp_usages = resp.get('usages')
            vgpu_usage = rp_usages.get('VGPU', 0)
            total_vgpu_usage += vgpu_usage
        return total_vgpu_usage

    def _get_vgpu_util_for_host(self, hostname):
        """Get the total usage of the VGPU resource class on a compute host

        :param hostname: str, compute hostname to gather usage data from
        :return resource_usage_count: int, the current total usage for the
            VGPU resource class
        """
        rp_uuid = self._get_rp_uuid_from_hostname(hostname)
        rp_children = self._get_all_children_of_resource_provider(
            rp_uuid=rp_uuid)
        resource_usage_count = \
            self._get_usage_for_resource_class_vgpu(rp_children)
        return resource_usage_count

    def _get_pci_addr_from_device(self, xml_element):
        """Return pci address value from provided domain device xml element

        :param xml_element: Etree XML element device from guest instance
        :return str: the pci address found from the xml element in the format
            domain:bus:slot.function
        """
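        # The pci address element in the guest XML looks something like
        # <address type='pci' domain='0x0000' bus='0x00' slot='0x05'
        # function='0x0'/>, so strip the '0x' prefix from each component
        # before assembling the address.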
        pci_addr_element = xml_element.find(".address[@type='pci']")
        domain = pci_addr_element.get('domain').replace('0x', '')
        bus = pci_addr_element.get('bus').replace('0x', '')
        slot = pci_addr_element.get('slot').replace('0x', '')
        func = pci_addr_element.get('function').replace('0x', '')
        pci_address = get_pci_address(domain, bus, slot, func)
        return pci_address

    def _assert_vendor_id_in_guest(self, pci_address, linux_client):
        """Confirm vgpu vendor id is present in server instance sysfs

        :param pci_address: str, pci address used to search the guest's pci
            devices for the vendor id
        :param linux_client: ssh client capable of interacting with the guest
        """
        cmd = "cat /sys/bus/pci/devices/%s/vendor" % pci_address
        sys_out = linux_client.exec_command(cmd)
        self.assertIn(CONF.whitebox_hardware.vgpu_vendor_id, sys_out,
                      "Vendor ID %s not found in output %s" %
                      (CONF.whitebox_hardware.vgpu_vendor_id, sys_out))

    def _cold_migrate_server(self, server_id, target_host, revert=False):
        """Cold migrate a server with the option to revert the migration

        :param server_id: str, uuid of the server to migrate
        :param target_host: str, the host the server should be migrated to
        :param revert: bool, revert server migration action if true
        """
        src_host = self.get_host_for_server(server_id)
        self.admin_servers_client.migrate_server(server_id, host=target_host)
        waiters.wait_for_server_status(self.servers_client, server_id,
                                       'VERIFY_RESIZE')

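        # A reverted migration returns the server to its original host, so
        # the source and destination hostnames should match afterwards; a
        # confirmed migration should leave them different.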
        if revert:
            self.admin_servers_client.revert_resize_server(server_id)
            assert_func = self.assertEqual
        else:
            self.admin_servers_client.confirm_resize_server(server_id)
            assert_func = self.assertNotEqual

        waiters.wait_for_server_status(self.servers_client,
                                       server_id, 'ACTIVE')
        dst_host = self.get_host_for_server(server_id)
        assert_func(src_host, dst_host)

    def _validate_vgpu_instance(self, server, linux_client,
                                expected_device_count):
        """Confirm vgpu guest XML is correct and vendor id is present in guest

        :param server: dict, attributes describing the guest instance
        :param linux_client: ssh client capable of interacting with guest
        :param expected_device_count: int, expected number of XML vgpu devices
            in the guest
        """
        # Find all hostdev devices of type mdev in the provided instance's
        # XML
        vgpu_devices = self.get_server_xml(server['id']).findall(
            "./devices/hostdev[@type='mdev']"
        )

        # Validate the number of mdev host devices is equal to the expected
        # count provided to the method
        self.assertEqual(
            expected_device_count, len(vgpu_devices), "Expected %d "
            "xml hostdev vgpu element(s) on instance %s but instead found %d" %
            (expected_device_count, server['id'],
             len(vgpu_devices)))

        # If there are no expected mdev devices, additional verification of
        # the vgpu device is not necessary
        if expected_device_count == 0:
            return

        # Determine the pci address of each vgpu hostdev element and use this
        # address to search for the vendor id in the guest sysfs
        for vgpu_xml_element in vgpu_devices:
            pci_address = self._get_pci_addr_from_device(vgpu_xml_element)

            # Validate the vendor id is present in the guest instance
            self._assert_vendor_id_in_guest(pci_address, linux_client)

    def create_validateable_instance(self, flavor, validation_resources):
        """Create a validatable instance based on the provided flavor

        :param flavor: dict, attributes describing the flavor
        :param validation_resources: dict, parameters necessary to set up an
            ssh client and validate the guest
        """
        server = self.create_test_server(
            flavor=flavor['id'],
            validatable=True,
            validation_resources=validation_resources,
            wait_until='ACTIVE')

        # NOTE(jparker) The default order of operations for cleanup attempts
        # to remove the validation resources before the server is removed.
        # Because of this, cleanup of the validation resources fails since
        # the server is still present. Explicitly add cleanup of the server
        # first in order to remove the validation resources at the end of
        # the test.
        self.addCleanup(waiters.wait_for_server_termination,
                        self.servers_client, server['id'])
        self.addCleanup(test_utils.call_and_ignore_notfound_exc,
                        self.servers_client.delete_server, server['id'])
        return server


class VGPUSanity(VGPUTest):
    def test_boot_instance_with_vgpu(self):
        """Test creating an instance with a vGPU resource"""
        # Confirm vGPU guest XML contains correct number of vgpu devices. Then
        # confirm the vgpu vendor id is present in the sysfs for the guest
        validation_resources = self.get_test_validation_resources(
            self.os_primary)
        server = self.create_validateable_instance(
            self.vgpu_flavor,
            validation_resources)
        linux_client = self._create_ssh_client(server, validation_resources)
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)


class VGPUColdMigration(VGPUTest):

    # Requires at least placement microversion 1.14 in order to search
    # through nested resource providers via the 'in_tree=<UUID>' parameter
    placement_min_microversion = '1.14'
    placement_max_microversion = 'latest'

    @classmethod
    def skip_checks(cls):
        super(VGPUColdMigration, cls).skip_checks()
        if CONF.compute.min_compute_nodes < 2:
            msg = "Need two or more compute nodes to execute cold migration"
            raise cls.skipException(msg)
        if not CONF.whitebox_hardware.vgpu_cold_migration_supported:
            msg = "vGPU Cold Migration support needed in order to run tests"
            raise cls.skipException(msg)

    def test_vgpu_cold_migration(self):
        validation_resources = self.get_test_validation_resources(
            self.os_primary)
        server = self.create_validateable_instance(
            self.vgpu_flavor, validation_resources)
        linux_client = self._create_ssh_client(server, validation_resources)

        # Determine the host the vGPU enabled guest is currently on. Next
        # get another potential compute host to serve as the migration target
        src_host = self.get_host_for_server(server['id'])
        dest_host = self.get_host_other_than(server['id'])

        # Get the current VGPU usage from the resource providers on
        # the source and destination compute hosts.
        pre_src_usage = self._get_vgpu_util_for_host(src_host)
        pre_dest_usage = self._get_vgpu_util_for_host(dest_host)

        # Cold migrate the instance to the target host
        self._cold_migrate_server(server['id'],
                                  target_host=dest_host,
                                  revert=False)

        LOG.info('Guest %(server)s was just cold migrated to %(dest_host)s, '
                 'guest will now be validated after operation',
                 {'server': server['id'], 'dest_host': dest_host})
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)

        # Regather the VGPU resource usage on both compute hosts involved in
        # the cold migration. Confirm the original source host no longer
        # reports its original usage for the vGPU resource and that the
        # destination is now accounting for the resource.
        post_src_usage = self._get_vgpu_util_for_host(src_host)
        post_dest_usage = self._get_vgpu_util_for_host(dest_host)
        expected_src_usage = pre_src_usage - self.vgpu_amount_per_instance
        self.assertEqual(
            expected_src_usage,
            post_src_usage, 'After migration, host %s expected to have %d '
            'usage for resource class VGPU but instead found %d' %
            (src_host, expected_src_usage, post_src_usage))
        expected_dest_usage = pre_dest_usage + self.vgpu_amount_per_instance
        self.assertEqual(
            expected_dest_usage, post_dest_usage, 'After migration, host '
            '%s expected to have resource class VGPU usage totaling %d but '
            'instead found %d' %
            (dest_host, expected_dest_usage, post_dest_usage))

    def test_revert_vgpu_cold_migration(self):
        validation_resources = self.get_test_validation_resources(
            self.os_primary)
        server = self.create_validateable_instance(
            self.vgpu_flavor,
            validation_resources)
        linux_client = self._create_ssh_client(server, validation_resources)

        # Determine the host the vGPU enabled guest is currently on. Next
        # get another potential compute host to serve as the migration target
        src_host = self.get_host_for_server(server['id'])
        dest_host = self.get_host_other_than(server['id'])

        # Get the current VGPU usage from the resource providers on
        # the source and destination compute hosts.
        pre_src_usage = self._get_vgpu_util_for_host(src_host)
        pre_dest_usage = self._get_vgpu_util_for_host(dest_host)

        # Cold migrate the instance to the target host and then revert
        self._cold_migrate_server(server['id'],
                                  target_host=dest_host,
                                  revert=True)

        # Sanity check the guest, confirming the vgpu XML device is present
        # and the vendor id is present in the sysfs
        LOG.info(
            'Cold migration of guest %(server)s has been reverted back to '
            '%(src_host)s, vGPU guest will now be validated after revert '
            'operation', {'server': server['id'], 'src_host': src_host})
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)

        # Regather the VGPU resource usage on both compute hosts involved in
        # the cold migration. Due to the migration revert, confirm the target
        # host's vGPU usage is back to its pre-migration value and the source
        # host accurately reports current usage based on the flavor request
        current_src_usage = self._get_vgpu_util_for_host(src_host)
        current_dest_usage = self._get_vgpu_util_for_host(dest_host)
        self.assertEqual(
            pre_dest_usage, current_dest_usage, 'After migration revert, '
            'host %s expected to have %d usage for resource class VGPU but '
            'instead found %d' %
            (dest_host, pre_dest_usage, current_dest_usage))
        self.assertEqual(
            pre_src_usage, current_src_usage, 'After migration revert, '
            'host %s expected to have resource class VGPU usage totaling %d '
            'but instead found %d' %
            (src_host, pre_src_usage, current_src_usage))

        # Do a final sanity check of the guest after the revert to confirm
        # the vgpu device is present in the XML and vendor id is present in
        # sysfs
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)


class VGPUResizeInstance(VGPUTest):

    # Requires at least placement microversion 1.14 in order to search
    # through nested resource providers via the 'in_tree=<UUID>' parameter
    placement_min_microversion = '1.14'
    placement_max_microversion = 'latest'

    @classmethod
    def skip_checks(cls):
        super(VGPUResizeInstance, cls).skip_checks()
        if not CONF.whitebox_hardware.vgpu_cold_migration_supported:
            msg = "vGPU Cold Migration support needed in order to run " \
                  "resize tests"
            raise cls.skipException(msg)
        if not CONF.compute_feature_enabled.resize:
            msg = 'Resize not available.'
            raise cls.skipException(msg)

    def test_vgpu_to_standard_resize(self):
        # Create a vGPU instance and get the vGPU resource utilization from
        # its compute host
        server = self.create_test_server(flavor=self.vgpu_flavor['id'],
                                         wait_until='ACTIVE')
        host = self.get_host_for_server(server['id'])
        pre_resize_usage = self._get_vgpu_util_for_host(host)
        standard_flavor = self.create_flavor()
        self.resize_server(server['id'], standard_flavor['id'])

        # Check the guest's XML and confirm the mdev device is no longer
        # present
        self._validate_vgpu_instance(
            server,
            linux_client=None,
            expected_device_count=0)

        if CONF.compute_feature_enabled.console_output:
            # Confirm there are no errors when interacting with the guest
            # after it was resized from vgpu to standard
            self.servers_client.get_console_output(server['id'])

        # Gather the vGPU resource utilization from the compute host. The
        # instance will either land on a new compute host or remain on
        # the same source host but will be resized to a standard flavor.
        # In either case the source host should always report a vGPU usage
        # that is less than what the guest was originally utilizing
        post_resize_usage = self._get_vgpu_util_for_host(host)
        expected_usage = pre_resize_usage - self.vgpu_amount_per_instance

        # Confirm the original host's vGPU resource usage now accounts for
        # the guest resizing to a flavor without any vGPU resources
        self.assertEqual(
            expected_usage, post_resize_usage, 'After guest resize, host '
            '%s should be reporting total vGPU usage of %d, but instead is '
            'reporting %d' % (host, expected_usage, post_resize_usage))

    def test_standard_to_vgpu_resize(self):
        # Create a standard instance and then resize the instance to a flavor
        # that uses a vGPU resource
        validation_resources = self.get_test_validation_resources(
            self.os_primary)
        standard_flavor = self.create_flavor()
        server = self.create_validateable_instance(
            standard_flavor,
            validation_resources)
        linux_client = self._create_ssh_client(server, validation_resources)
        self.resize_server(server['id'], self.vgpu_flavor['id'])

        # Check the guest's XML and confirm that the correct number of vGPU
        # devices are present and the devices are present in the guest sysfs
        self._validate_vgpu_instance(
            server,
            linux_client=linux_client,
            expected_device_count=self.vgpu_amount_per_instance)