
The introduction of the new pxeboot network requires maintenance verify and report on messaging failures over that network. Towards that, this update introduces periodic mtcAlive messaging between the mtcAgent and mtcClinet. Test Plan: PASS: Verify install and provision each system type with a mix of networking modes ; ethernet, bond and vlan - AIO SX, AIO DX, AIO DX plus - Standard System 2+1 - Storage System 2+1+1 PASS: Verify feature with physical on management interface PASS: Verify feature with vlan on management interface PASS: Verify feature with bonded management interface PASS: Verify feature with bonded vlans on management interface PASS: Verify in bonded cases handling with 2, 1 or no slaves found PASS: Verify mgmt-combined or separate cluster-host network PASS: Verify mtcClient pxeboot interface address learning - for worker and storage nodes ; dhcp leases file - for controller nodes before unlock ; dhcp leases file - for controller nodes after unlock ; static from ifcfg - from controller within 10 seconds of process restart PASS: Verify mtcAgent pxeboot interface address learning from dnsmasq.hosts file PASS: Verify pxeboot mtcAlive initiation, handling, loss detection and recovery PASS: Verify success and failure handling of all new pxeboot ip address learning functions ; - dhcp - all system node installs. - dnsmasq.hosts - active controller for all hosts. - interfaces.d - controller's mtcClient pxeboot address. - pxeboot req mtcAlive - mtcAgent mtcAlive request message. PASS: Verify mtcClient pxeboot network 'mtcAlive request' and 'reboot' command handling for ethernet, vlan and bond configs. PASS: Verify mtcAlive sequence number monitoring, out-of-sequence detection, handling and logging. PASS: Verify pxeboot rx socket binding and non-blocking attribute PASS: Verify mtcAgent handling stress soaking of sustained incoming 500+ msgs/sec ; batch handling and logging. PASS: Verify mtcAgent and mtcClient pxeboot tx and rx socket messaging, failure recovery handling and logging. PASS: Verify pxeboot receiver is not setup on the oam interface on controller-0 first install until after initial config complete. Regression: PASS: Verify mtcAgent/mtcClient online and offline state management PASS: Verify mtcAgent/mtcClient command handling - over management network - over cluster-host network PASS: Verify mtcClient interface chain log for all iface types - bond : vlan123 -> pxeboot0 (802.3ad 4) -> enp0s8 and enp0s9 - vlan : vlan123 -> enp0s8 - ethernet: enp0s8 PASS: Verify mtcAgent/mtcClient handling and logging including debug logging for standard operations - node install and unlock - node lock and unlock - node reinstall, reboot, reset PASS: Verify graceful recovery handling of heartbeat loss failure. - node reboot - management interface down PASS: Verify systemcontroller and subcloud install with dc-libvirt PASS: Verify no log flooding, coredumps, memory leaks Story: 2010940 Task: 49541 Change-Id: Ibc87b85e3e0e07c3b8c40b5291bd3372506fbdfb Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
436 lines
17 KiB
C++
Executable File
436 lines
17 KiB
C++
Executable File
/*
|
|
* Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
*/
|
|
|
|
/***************************************************************************
|
|
*
|
|
* @file
|
|
* Wind River CGTS Platform Node Maintenance "Finite State Machine"
|
|
*
|
|
* Description: This FSM follows the X.731 specification.
|
|
*
|
|
* The FSM manages nodes based on the following three perspectives
|
|
*
|
|
* Administrative: action taken on node (mtc_nodeAdministrative_action_type)
|
|
* Operational : state of the node mtc_nodeOperational_state_type)
|
|
* Availability : status of current node state (mtc_nodeAvailability_status_type)
|
|
*
|
|
*/
|
|
|
|
using namespace std;
|
|
|
|
#define __AREA__ "fsm"
|
|
|
|
#include "nodeClass.h"
|
|
#include "tokenUtil.h"
|
|
#include "mtcNodeFsm.h"
|
|
#include "mtcInvApi.h"
|
|
#include "mtcNodeMsg.h"
|
|
#include "mtcNodeHdlrs.h" /* for ... mtcTimer_handl */
|
|
|
|
int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( node_ptr == NULL )
|
|
{
|
|
slog ("Null Node Pointer\n");
|
|
return FAIL ;
|
|
}
|
|
|
|
/* handle clear task request */
|
|
if ( node_ptr->clear_task == true )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, "" );
|
|
node_ptr->clear_task = false ;
|
|
}
|
|
|
|
/* Service the libEvent work queue */
|
|
workQueue_process ( node_ptr ) ;
|
|
|
|
/* Service the maintenance command queue if there are commands waiting */
|
|
if ( node_ptr->mtcCmd_work_fifo.size())
|
|
{
|
|
rc = nodeLinkClass::cmd_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
return (rc);
|
|
}
|
|
}
|
|
|
|
/* Monitor and Manage active threads */
|
|
thread_handler ( node_ptr->bmc_thread_ctrl, node_ptr->bmc_thread_info );
|
|
if ( node_ptr->bmc_thread_ctrl.stage == THREAD_STAGE__KILL )
|
|
{
|
|
/* do nothing while thread is being killed */
|
|
return RETRY ;
|
|
}
|
|
|
|
/* manage the host connected state and board management alarms */
|
|
nodeLinkClass::bmc_handler ( node_ptr );
|
|
|
|
/* manage host's degrade state */
|
|
nodeLinkClass::degrade_handler ( node_ptr );
|
|
|
|
/*
|
|
* Always run the offline handler
|
|
*
|
|
* - does nothing unless in fault handling mode
|
|
* - looks for offline state during fault handling
|
|
*/
|
|
nodeLinkClass::offline_handler ( node_ptr );
|
|
|
|
/*
|
|
* Always run the online handler.
|
|
*
|
|
* - handles offline/online state transitions based on periodic audit
|
|
* with mtcAlive debouncing
|
|
*/
|
|
nodeLinkClass::online_handler ( node_ptr );
|
|
|
|
/*
|
|
* Always run the mtcAlive handler.
|
|
*
|
|
* - monitor host's mtcAlive messaging
|
|
* - manage host's mtcAlive missing alarm
|
|
*/
|
|
nodeLinkClass::pxeboot_mtcAlive_monitor ( node_ptr );
|
|
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__DELETE )
|
|
{
|
|
flog ("%s -> Delete Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::delete_handler ( node_ptr );
|
|
return (PASS);
|
|
}
|
|
|
|
|
|
/* Run the config FSM if the configAction bool is set.
|
|
* We keep this as a separate action unto itself so that
|
|
* mtce can continue to service all other actions for the
|
|
* same host while it handles configuration commands */
|
|
if (( node_ptr->configAction == MTC_CONFIG_ACTION__INSTALL_PASSWD ) ||
|
|
( node_ptr->configAction == MTC_CONFIG_ACTION__CHANGE_PASSWD ) ||
|
|
( node_ptr->configAction == MTC_CONFIG_ACTION__CHANGE_PASSWD_AGAIN ))
|
|
{
|
|
nodeLinkClass::cfg_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* No Op: Do nothing for this Healthy Enabled Running Host
|
|
* This block of code was added to resolve an issue. With this change:
|
|
* the insv_test_handler gets run as soon as a host's main function is enabled.
|
|
****************************************************************************
|
|
*/
|
|
if (( node_ptr->ar_disabled == false ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
|
((node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
|
(node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
|
|
{
|
|
// flog ("%s -> insv_test_handler\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::insv_test_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Add Host Services:
|
|
****************************************************************************
|
|
*/
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__ADD )
|
|
{
|
|
flog ("%s -> Add Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::add_handler ( node_ptr );
|
|
}
|
|
|
|
|
|
/****************************************************************************
|
|
* No Op: Do nothing for this Healthy Enabled Running Host
|
|
****************************************************************************
|
|
*/
|
|
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
|
((node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
|
(node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
|
|
{
|
|
// flog ("%s -> oos_test_handler\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::oos_test_handler ( node_ptr );
|
|
}
|
|
|
|
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERCYCLE )
|
|
{
|
|
nodeLinkClass::powercycle_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Reset Host: Run the Reset handler for this Reset Action on Locked Host
|
|
****************************************************************************
|
|
*/
|
|
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RESET )
|
|
{
|
|
flog ("%s -> Reset Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::reset_handler ( node_ptr );
|
|
nodeLinkClass::oos_test_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Reboot Host: Run the Reboot handler for this Reboot Action on Locked Host
|
|
****************************************************************************
|
|
*/
|
|
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__REBOOT )
|
|
{
|
|
flog ("%s -> Reboot Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::reboot_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Recovering Host: Run Enable handler for failed or recovering host
|
|
****************************************************************************
|
|
*/
|
|
else if ((( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED )) ||
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE))
|
|
{
|
|
flog ("%s -> Run Enable Handler\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::enable_handler ( node_ptr );
|
|
}
|
|
|
|
/* Do nothing with locked disabled offline state */
|
|
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) &&
|
|
( node_ptr->operState == MTC_OPER_STATE__DISABLED ) &&
|
|
(( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFDUTY ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF )))
|
|
{
|
|
flog ("%s -> Run OOS Test Handler\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::oos_test_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Recovering Host: Run Recovery handler for failed or recovering host
|
|
****************************************************************************
|
|
*/
|
|
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ))
|
|
{
|
|
flog ("%s -> Run Recovery\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::recovery_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Recovering Host: Run Enable handler for failed or recovering host
|
|
****************************************************************************
|
|
*/
|
|
else if ( ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->operState == MTC_OPER_STATE__DISABLED ) &&
|
|
(( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__INTEST ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )))
|
|
{
|
|
flog ("%s -> Run Enable\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::enable_handler ( node_ptr );
|
|
}
|
|
|
|
/* Try and recover an accidentally powered of host */
|
|
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) &&
|
|
( node_ptr->hwmon_powercycle.attempts == 0 ) &&
|
|
( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__INIT ))
|
|
{
|
|
ilog ("%s auto-poweron for unlocked host\n", node_ptr->hostname.c_str());
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__POWERON );
|
|
|
|
/* FSM sanity check below will reject this operation, need exit now */
|
|
return (PASS);
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Unlock Host: Run Enable handler for the Unlock Action
|
|
***************************************************************************/
|
|
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
|
|
{
|
|
flog ("%s -> Unlock Action\n", node_ptr->hostname.c_str());
|
|
|
|
/* Proceed to unlock host */
|
|
nodeLinkClass::enable_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Run the Subfunction FSM, usually after the ADD or at the end of the enable
|
|
* in a small system.
|
|
****************************************************************************/
|
|
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE_SUBF )
|
|
{
|
|
flog ("%s -> Running SubFunction Enable handler (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->enableStage );
|
|
|
|
nodeLinkClass::enable_subf_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Lock Host: Run Disable handler for the Lock Action
|
|
****************************************************************************
|
|
*/
|
|
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__LOCK ) ||
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK ))
|
|
{
|
|
// flog ("%s -> Lock Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::disable_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Semantic Handling: Reject Recovery Actions Against In-Service Host
|
|
****************************************************************************
|
|
*/
|
|
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
(( node_ptr->adminAction == MTC_ADMIN_ACTION__POWEROFF ) ||
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__RESET ) ||
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__REBOOT ) ||
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__REINSTALL )))
|
|
{
|
|
flog ("%s -> OOS Action Check\n", node_ptr->hostname.c_str());
|
|
|
|
elog ("%s Administrative '%s' Operation Rejected\n",
|
|
node_ptr->hostname.c_str(),
|
|
get_adminAction_str (node_ptr->adminAction) );
|
|
|
|
elog ("%s Cannot perform out-of-service action against in-service host\n",
|
|
node_ptr->hostname.c_str());
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
/* Clear the UI task since we are not really taking this action */
|
|
mtcInvApi_update_task ( node_ptr, "" );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Reload Host: Run the Reload handler to Nuke the disk on Locked Host
|
|
****************************************************************************
|
|
*/
|
|
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__REINSTALL )
|
|
{
|
|
flog ("%s -> Reload Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::reinstall_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* No Op: Do nothing for this Healthy Enabled Locked AIO Simplex Host
|
|
****************************************************************************
|
|
*/
|
|
else if (( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX ) &&
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ))
|
|
{
|
|
nodeLinkClass::insv_test_handler ( node_ptr );
|
|
nodeLinkClass::oos_test_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Power-Off Host:
|
|
****************************************************************************
|
|
*/
|
|
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWEROFF )
|
|
{
|
|
flog ("%s -> Power-Off Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::power_handler ( node_ptr );
|
|
nodeLinkClass::oos_test_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Power-On Host:
|
|
****************************************************************************
|
|
*/
|
|
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERON )
|
|
{
|
|
flog ("%s -> Power-On Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::power_handler ( node_ptr );
|
|
nodeLinkClass::oos_test_handler ( node_ptr );
|
|
}
|
|
|
|
/****************************************************************************
|
|
* Swact Host Services:
|
|
****************************************************************************
|
|
*/
|
|
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__SWACT ) ||
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_SWACT ))
|
|
|
|
{
|
|
flog ("%s -> Swact Action\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::swact_handler ( node_ptr );
|
|
}
|
|
|
|
/***** DEGRADED Cases *******/
|
|
|
|
/* Handle the degrade action */
|
|
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
|
|
{
|
|
/* We do nothing, the in service test catches this */
|
|
// flog ("%s -> Degrade Recovery\n", node_ptr->hostname.c_str());
|
|
; // nodeLinkClass::degrade_handler ( node_ptr );
|
|
}
|
|
|
|
else
|
|
{
|
|
if (( node_ptr->adminState >= MTC_ADMIN_STATES ) ||
|
|
( node_ptr->operState >= MTC_OPER_STATES ) ||
|
|
( node_ptr->availStatus >= MTC_AVAIL_STATUS ))
|
|
{
|
|
elog ("Unhandled FSM Case: %s %d-%d-%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->adminState,
|
|
node_ptr->operState,
|
|
node_ptr->availStatus );
|
|
}
|
|
else
|
|
{
|
|
wlog ("Unsupported FSM State: %s Action:%s %s-%s-%s ; auto-correcting ...\n",
|
|
node_ptr->hostname.c_str(),
|
|
get_adminAction_str ( node_ptr->adminAction ),
|
|
adminState_enum_to_str (node_ptr->adminState).c_str(),
|
|
operState_enum_to_str (node_ptr->operState).c_str(),
|
|
availStatus_enum_to_str (node_ptr->availStatus).c_str());
|
|
|
|
}
|
|
/* Unlocked state overrides unsupported oper-avail states
|
|
* Try to recover the host */
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
|
|
{
|
|
/* Reset the state in the database for these error states */
|
|
node_ptr->adminState = MTC_ADMIN_STATE__UNLOCKED ;
|
|
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
|
|
node_ptr->availStatus = MTC_AVAIL_STATUS__ONLINE ;
|
|
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled" , "online" );
|
|
|
|
/* Force the action */
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__UNLOCK );
|
|
|
|
}
|
|
else
|
|
{
|
|
/* Reset the state in the database for these error states */
|
|
node_ptr->adminState = MTC_ADMIN_STATE__LOCKED ;
|
|
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
|
|
node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ;
|
|
mtcInvApi_update_states ( node_ptr, "locked", "disabled" , "offline" );
|
|
|
|
/* Force the action */
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__FORCE_LOCK );
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
return (rc) ;
|
|
}
|