Walter A. Boring IV 912809b0fa Add an Online check for the FC HBA
Fibre Channel devices can go into Linkdown or Offline mode after
being PCI passed through to virsh domains many times.  We need
to make sure that the HBA we are attempting to pass through to the
virsh domain is actually in port_state="Online".  If the HBA
isn't online, then every volume attachment via that HBA will fail
100% of the time.

This patch adds a test against the requested HBA(s) and makes sure
that they are Online.  If all the requested HBA(s) are not Online,
then the script will fail.

Change-Id: Icf05bc3ed6adb842006852f1804696fc416c0d26
2015-11-16 08:39:55 -08:00

245 lines
8.3 KiB
Bash

#!/usr/bin/env bash
# Copyright (C) 2015 Hewlett-Packard Development Company, L.P.
# Copyright (C) 2015 Pure Storage, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
# Shell commands to get virsh the information it
# needs to successfully pass through a Fibre Channel PCI Card to the virtual
# machine this script is running on. The instance only knows its IP address,
# while its Virsh name is required for pass through. This script uses Nova on
# the provider blade as an intermediary to find the name. Meanwhile, this
# script finds the Fibre Channel PCI card on the provider and generates the
# information Virsh needs to attach it.
#
# Expect four env variables, the provider hostname (optionally user if needed)
# the private key file we should use to connect to the provider, and the file
# that should be sourced for OpenStack credentials.
#
# export FC_PROVIDER=my.provider.hostname
# export FC_PROVIDER_USER=root
# export FC_PROVIDER_KEY=/opt/nodepool-scripts/passthrough
# export FC_PROVIDER_RC=/root/keystonerc_jenkins
#
# The maximum number of FC devices to passthrough, failing if they cannot all be
# acquired
# export FC_NUM=2 (default 1)
#
# For single node setups where the hypervisor is the same as the provider, and dns
# is not configured, export this variable to use the provider ip as the hypervisor
# export FC_SINGLE_NODE=1
# Number of FC devices that must be attached (default 1) and the name of the
# variable, looked up on the hypervisor, that lists the candidate PCI devices.
FC_NUM=${FC_NUM:-1}
FC_PCI_VAR_NAME=${FC_PCI_VAR_NAME:-"fc_pci_device"}
echo "Planning to passthrough $FC_NUM pci devices"

# First address reported by `hostname -I`; used later to find this instance
# in the `nova list` output.
eth0_ip=$(hostname -I | cut -f1 -d' ')

# Without FC_PROVIDER, fall back to the first three octets of our own address
# with ".1" appended.
PROVIDER=${FC_PROVIDER}
if [[ -z $PROVIDER ]]; then
  eth0_ip_base=$(echo "$eth0_ip" | cut -f1,2,3 -d.)
  PROVIDER="${eth0_ip_base}.1"
fi

# The default matches the path documented in the header of this script
# (it previously read "/opt-nodepool-scripts/passthrough").
PROVIDER_KEY=${FC_PROVIDER_KEY:-"/opt/nodepool-scripts/passthrough"}
PROVIDER_RC=${FC_PROVIDER_RC:-"keystonerc_jenkins"}
CURRENT_USER=$(whoami)
PROVIDER_USER=${FC_PROVIDER_USER:-$CURRENT_USER}

# Passthrough is a private key that needs to be setup for the provider
# and any compute nodes that might end up hosting the VM we want passthrough on.
# We will assume ownership of the key (probably as the jenkins user..), also
# assuming the group is the same name as the user...
sudo chown "$CURRENT_USER:$CURRENT_USER" "$PROVIDER_KEY"
chmod 0400 "$PROVIDER_KEY"
# Get our NOVA_ID: ask nova on the provider for the instance list and pick the
# ACTIVE, non-deleting row that carries our own IP address.
NOVA_LIST=$(ssh -i "$PROVIDER_KEY" "$PROVIDER_USER@$PROVIDER" "source $PROVIDER_RC && nova list")
nova_result=$?
NOVA_ID=""
# Skip the lookup when we have no address; an empty pattern must not be
# allowed to select arbitrary rows.
if [[ -n "$eth0_ip" ]]; then
  # -F treats the address literally (dots are not wildcards) and -w stops
  # e.g. 10.0.0.1 from matching 10.0.0.11.
  NOVA_ID=$(echo "$NOVA_LIST" \
    | grep ACTIVE \
    | grep -v deleting \
    | grep -wF -- "$eth0_ip" \
    | cut -d '|' -f 2 \
    | tr -d '[:space:]')
fi
echo "NOVA_ID result: $nova_result"
if [[ $nova_result -ne 0 || -z "$NOVA_ID" ]]; then
  echo "Unable to get Nova ID. Aborting. Debug info:"
  # Quoted so the table keeps its line structure in the log.
  echo "$NOVA_LIST"
  echo "NOVA_ID: $NOVA_ID"
  exit 2
fi
# Get instance details
NOVA_DETAILS=$(ssh -i "$PROVIDER_KEY" "$PROVIDER_USER@$PROVIDER" "source $PROVIDER_RC && nova show $NOVA_ID")
# Status of the `nova show` call itself. The abort check below previously
# tested the status of the earlier `nova list` call by mistake.
nova_show_result=$?

# Get our Virsh name
VIRSH_NAME=$(echo "$NOVA_DETAILS" | grep instance_name | cut -d '|' -f 3 | tr -d '[:space:]')
# Status of the last stage (tr) of the pipeline above.
virsh_result=$?
echo "VIRSH_NAME result: $virsh_result"
if [[ $nova_show_result -ne 0 || $virsh_result -ne 0 || -z "$VIRSH_NAME" ]]; then
  echo "Unable to get Virsh Name. Aborting. Debug info:"
  echo "NOVA_LIST:"
  # Quoted so the tables keep their line structure in the log.
  echo "$NOVA_LIST"
  echo "NOVA_DETAILS:"
  echo "$NOVA_DETAILS"
  echo "VIRSH_NAME: $VIRSH_NAME"
  exit 2
fi
# Work out which host to talk to for the virsh commands. With FC_SINGLE_NODE
# set the provider doubles as the hypervisor; otherwise the name is taken from
# the hypervisor_hostname row of the instance details.
if [[ -n $FC_SINGLE_NODE ]]; then
  HYPERVISOR=$PROVIDER
else
  HYPERVISOR=$(grep hypervisor_hostname <<<"$NOVA_DETAILS" | cut -d '|' -f 3 | tr -d '[:space:]')
  hypervisor_result=$?
  echo "HYPERVISOR result: $hypervisor_result"
  if [[ -z "$HYPERVISOR" || $hypervisor_result -ne 0 ]]; then
    echo "Unable to get Hypervisor Host Name. Aborting. Debug info:"
    echo "NOVA_LIST:"
    echo $NOVA_LIST
    echo "NOVA_DETAILS:"
    echo $NOVA_DETAILS
    echo "HYPERVISOR: $HYPERVISOR"
    exit 2
  fi
fi
echo "Found Hypervisor hostname: $HYPERVISOR"
# Ask the hypervisor for the value of the variable named by FC_PCI_VAR_NAME;
# it is expected to hold the list of candidate PCI devices.
fc_pci_device_cmd="echo \$$FC_PCI_VAR_NAME"
fc_pci_device=$(ssh -i "$PROVIDER_KEY" "$PROVIDER_USER@$HYPERVISOR" "$fc_pci_device_cmd")
if [[ -z $fc_pci_device ]]; then
  # Name the variable that was actually queried, which may differ from the
  # default "fc_pci_device" when FC_PCI_VAR_NAME is overridden.
  echo "No FC device known. Set $FC_PCI_VAR_NAME in your /etc/profile.d or /etc/environment (depending on distro and ssh configuration) to the desired 'Class Device path', e.g. '0000:21:00.2'"
  exit 2
fi
echo "Found pci devices: $fc_pci_device"
#######################################
# Check on the hypervisor whether an FC HBA shows up next to an 'Online'
# entry in the `systool -c fc_host -v` output.
# NOTE: the return value is inverted relative to the usual shell convention;
# the caller in this script compares the status against 1.
# Globals:   PROVIDER_KEY, PROVIDER_USER, HYPERVISOR (read)
# Arguments: $1 - PCI address of the HBA, e.g. '0000:21:00.2'
# Outputs:   writes "online result='<matching line(s)>'" to stdout
# Returns:   1 if the device was found Online, 0 otherwise
#######################################
is_device_online() {
  local fc_device=$1
  local test_fc_online
  local online_match

  # If a device is not "Online" we'll get an empty string as a result of the
  # following command. The address is matched literally (-F) so its dots are
  # not treated as regex wildcards.
  test_fc_online="systool -c fc_host -v | grep -B12 'Online' | grep 'Class Device path' | grep -F '$fc_device'"
  # A single ssh round trip; the filtered output is all that is needed.
  online_match=$(ssh -i "$PROVIDER_KEY" "$PROVIDER_USER@$HYPERVISOR" "$test_fc_online")
  echo "online result='$online_match'"

  if [[ -z "$online_match" ]]; then
    return 0
  fi
  return 1
}
# Overall result of the passthrough attempt; set to 0 once at least one
# device has been attached.
exit_code=1
# Remember the current errexit setting so it can be restored after the loop.
errexit=$(set +o | grep errexit)
#Ignore errors
set +e
num_attached=0
# Intentionally unquoted: the variable holds a whitespace-separated list.
for pci in $fc_pci_device; do
  echo "Trying passthrough for $pci"
  # is_device_online returns 1 for an Online device and 0 otherwise.
  is_device_online "$pci"
  online=$?
  if [[ $online -eq 1 ]]; then
    echo "Device($pci) is Online"
  else
    echo "Device($pci) is NOT Online"
    # It does no good to passthrough an HBA that isn't Online.
    # When an HBA goes into 'Linkdown' or 'Offline' mode, the
    # host typically needs to get rebooted.
    continue
  fi

  # Split the 'DDDD:BB:SS.F' address into its parts. The domain is taken from
  # the address instead of being assumed to be 0000.
  DOMAIN=$(echo "$pci" | cut -d : -f1)
  BUS=$(echo "$pci" | cut -d : -f2)
  SLOT=$(echo "$pci" | cut -d : -f3 | cut -d . -f1)
  FUNCTION=$(echo "$pci" | cut -d : -f3 | cut -d . -f2)
  XML="<hostdev mode='subsystem' type='pci' managed='yes'><source><address domain='0x$DOMAIN' bus='0x$BUS' slot='0x$SLOT' function='0x$FUNCTION'/></source></hostdev>"
  echo "$XML"

  fcoe=$(mktemp --suffix=_fcoe.xml)
  if [[ -z "$fcoe" ]]; then
    echo "Unable to create a temporary file for the device XML. Trying next device..."
    continue
  fi
  echo "$XML" > "$fcoe"
  fc_virsh_device="pci_${DOMAIN}_${BUS}_${SLOT}_${FUNCTION}"

  # The copy lands in /tmp on the hypervisor whatever directory mktemp used
  # locally, so build the remote path explicitly.
  remote_xml="/tmp/${fcoe##*/}"
  scp -i "$PROVIDER_KEY" "$fcoe" "$PROVIDER_USER@$HYPERVISOR:$remote_xml"
  scp_result=$?
  # Only the remote copy is used from here on.
  rm -f -- "$fcoe"
  if [[ $scp_result -ne 0 ]]; then
    # Without the XML the attach cannot succeed, so leave the device alone.
    echo "Copying device XML failed ($scp_result). Trying next device..."
    continue
  fi

  # Run passthrough and clean up.
  # TODO: At the point where we can do more than one node on a provider we
  # will need to do this cleanup at the end of the job and not *before* attaching
  # since we won't know which ones are still in use
  echo $(sudo lspci | grep -i fib)
  ssh -i "$PROVIDER_KEY" "$PROVIDER_USER@$HYPERVISOR" "virsh nodedev-dettach $fc_virsh_device"
  detach_result=$?
  echo "Detach result: $detach_result"
  if [[ $detach_result -ne 0 ]]; then
    echo "Detach failed ($detach_result). Trying next device..."
    continue
  fi

  # Reattach the device to the host.
  # This will hopefully reset the device
  echo $(sudo lspci | grep -i fib)
  ssh -i "$PROVIDER_KEY" "$PROVIDER_USER@$HYPERVISOR" "virsh nodedev-reattach $fc_virsh_device"
  reattach_result=$?
  echo "reattach result: $reattach_result"
  if [[ $reattach_result -ne 0 ]]; then
    echo "Reattach failed ($reattach_result). Trying next device..."
    continue
  fi

  echo $(sudo lspci | grep -i fib)
  ssh -i "$PROVIDER_KEY" "$PROVIDER_USER@$HYPERVISOR" "virsh attach-device $VIRSH_NAME $remote_xml"
  attach_result=$?
  echo "Attach result: $attach_result"
  if [[ $attach_result -eq 0 ]]; then
    echo "Attached succeed. Trying next device..."
    (( num_attached += 1 ))
    exit_code=0
  fi

  echo $(sudo lspci | grep -i fib)
  echo $num_attached
  # Stop as soon as the requested number of devices has been attached.
  if [[ $num_attached -eq $FC_NUM ]]; then
    echo "Attached $num_attached devices. Stopping"
    break
  fi
done
# Restore the errexit setting saved above (unquoted so it runs as a command).
$errexit
# Abort unless the loop attached something, and then only carry on if it
# attached exactly the number of devices that was requested.
if [[ $exit_code -eq 0 ]]; then
  if [[ $num_attached -ne $FC_NUM ]]; then
    echo "FC requested $FC_NUM, but only attached $num_attached. Aborting."
    exit 1
  fi
else
  echo "FC Passthrough failed. Aborting."
  exit $exit_code
fi
# Final sanity check from inside the instance: load the lpfc module, dump the
# fc_host class (printing each command's exit status), list the matching
# lspci entries, and require systool to report at least one "Device path".
sudo modprobe lpfc
echo $?
sudo systool -c fc_host -v
echo $?
# Left unquoted on purpose so the listing is echoed on one line, as before.
fib_devices=$(sudo lspci | grep -i fib)
echo $fib_devices
device_path=$(sudo systool -c fc_host -v | grep "Device path")
if [[ -z "$device_path" ]]; then
  echo "Failed to install FC Drivers. Aborting."
  exit 1
fi