metal/mtce/src/maintenance/mtcNodeFsm.cpp
Eric MacDonald 14bb67789e Add pxeboot network mtcAlive messaging to Maintenance
The introduction of the new pxeboot network requires that maintenance
verify and report on messaging failures over that network.

Towards that, this update introduces periodic mtcAlive messaging
between the mtcAgent and mtcClient.

Test Plan:

PASS: Verify install and provision each system type with a mix
             of networking modes ; ethernet, bond and vlan
             - AIO SX, AIO DX, AIO DX plus
             - Standard System 2+1
             - Storage System 2+1+1
PASS: Verify feature with physical on management interface
PASS: Verify feature with vlan on management interface
PASS: Verify feature with bonded management interface
PASS: Verify feature with bonded vlans on management interface
PASS: Verify in bonded cases handling with 2, 1 or no slaves found
PASS: Verify mgmt-combined or separate cluster-host network
PASS: Verify mtcClient pxeboot interface address learning
             - for worker and storage nodes       ; dhcp leases file
             - for controller nodes before unlock ; dhcp leases file
             - for controller nodes after unlock  ; static from ifcfg
             - from controller within 10 seconds of process restart
PASS: Verify mtcAgent pxeboot interface address learning from
             dnsmasq.hosts file
PASS: Verify pxeboot mtcAlive initiation, handling, loss detection
             and recovery
PASS: Verify success and failure handling of all new pxeboot ip
             address learning functions ;
             - dhcp - all system node installs.
             - dnsmasq.hosts - active controller for all hosts.
             - interfaces.d - controller's mtcClient pxeboot address.
             - pxeboot req mtcAlive - mtcAgent mtcAlive request message.
PASS: Verify mtcClient pxeboot network 'mtcAlive request' and 'reboot'
             command handling for ethernet, vlan and bond configs.
PASS: Verify mtcAlive sequence number monitoring, out-of-sequence
             detection, handling and logging.
PASS: Verify pxeboot rx socket binding and non-blocking attribute
PASS: Verify mtcAgent handling stress soaking of sustained incoming
             500+ msgs/sec ; batch handling and logging.
PASS: Verify mtcAgent and mtcClient pxeboot tx and rx socket messaging,
             failure recovery handling and logging.
PASS: Verify pxeboot receiver is not setup on the oam interface on
             controller-0 first install until after initial config
             complete.

Regression:

PASS: Verify mtcAgent/mtcClient online and offline state management
PASS: Verify mtcAgent/mtcClient command handling
      - over management network
      - over cluster-host network
PASS: Verify mtcClient interface chain log for all iface types
      - bond    : vlan123 -> pxeboot0 (802.3ad 4) -> enp0s8 and enp0s9
      - vlan    : vlan123 -> enp0s8
      - ethernet: enp0s8
PASS: Verify mtcAgent/mtcClient handling and logging including debug
      logging for standard operations
      - node install and unlock
      - node lock and unlock
      - node reinstall, reboot, reset
PASS: Verify graceful recovery handling of heartbeat loss failure.
      - node reboot
      - management interface down
PASS: Verify systemcontroller and subcloud install with dc-libvirt
PASS: Verify no log flooding, coredumps, memory leaks

Story: 2010940
Task: 49541
Change-Id: Ibc87b85e3e0e07c3b8c40b5291bd3372506fbdfb
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2024-03-28 15:28:27 +00:00

436 lines
17 KiB
C++
Executable File

/*
* Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/***************************************************************************
*
* @file
* Wind River CGTS Platform Node Maintenance "Finite State Machine"
*
* Description: This FSM follows the X.731 specification.
*
* The FSM manages nodes based on the following three perspectives
*
* Administrative: action taken on node (mtc_nodeAdministrative_action_type)
* Operational : state of the node (mtc_nodeOperational_state_type)
* Availability : status of current node state (mtc_nodeAvailability_status_type)
*
*/
using namespace std;
#define __AREA__ "fsm"
#include "nodeClass.h"
#include "tokenUtil.h"
#include "mtcNodeFsm.h"
#include "mtcInvApi.h"
#include "mtcNodeMsg.h"
#include "mtcNodeHdlrs.h" /* for ... mtcTimer_handl */
/*****************************************************************************
 *
 * Name       : nodeLinkClass::fsm
 *
 * Purpose    : Run one pass of the maintenance FSM against a single host.
 *
 * Description: A pass is made up of the following steps, in this order.
 *
 *   1. Per-pass servicing ; clear task request, work queue, maintenance
 *      command queue, bmc thread, bmc, degrade, offline, online and
 *      pxeboot mtcAlive handling. The command queue and bmc thread
 *      steps can end the pass early with RETRY.
 *   2. Delete action handling, which ends the pass.
 *   3. Config action handling.
 *   4. The in-service test handler, for an unlocked-enabled host that is
 *      available or degraded and does not have ar_disabled set.
 *   5. Exactly one case of an if / else-if chain selected from
 *      adminAction, adminState, operState and availStatus.
 *
 *      The cases of that chain overlap, so their order is significant ;
 *      the first case that matches hides every later one.
 *
 *      The return values of the handlers called from the chain are
 *      not looked at.
 *
 * Parameters : node_ptr - the host to service
 *
 * Returns    : FAIL  - node_ptr is NULL
 *              RETRY - cmd_handler returned RETRY, or the bmc thread is
 *                      in the THREAD_STAGE__KILL stage
 *              PASS  - after delete handling, after requesting the
 *                      auto power-on and after auto-correcting an
 *                      unsupported state
 *              rc    - in all other cases ; PASS unless cmd_handler was
 *                      run, in which case it is cmd_handler's return.
 *
 *****************************************************************************/
int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
{
int rc = PASS ;
if ( node_ptr == NULL )
{
slog ("Null Node Pointer\n");
return FAIL ;
}
/* handle clear task request ; the task is updated to an empty string
 * and the request flag is cleared so this is done once per request */
if ( node_ptr->clear_task == true )
{
mtcInvApi_update_task ( node_ptr, "" );
node_ptr->clear_task = false ;
}
/* Service the libEvent work queue */
workQueue_process ( node_ptr ) ;
/* Service the maintenance command queue if there are commands waiting.
 * A RETRY from the command handler ends this pass right here ; any other
 * return is kept in 'rc' and becomes the return value of a pass that
 * runs to the bottom of this function. */
if ( node_ptr->mtcCmd_work_fifo.size())
{
rc = nodeLinkClass::cmd_handler ( node_ptr );
if ( rc == RETRY )
{
return (rc);
}
}
/* Monitor and Manage active threads */
thread_handler ( node_ptr->bmc_thread_ctrl, node_ptr->bmc_thread_info );
if ( node_ptr->bmc_thread_ctrl.stage == THREAD_STAGE__KILL )
{
/* do nothing while thread is being killed */
return RETRY ;
}
/* manage the host connected state and board management alarms */
nodeLinkClass::bmc_handler ( node_ptr );
/* manage host's degrade state */
nodeLinkClass::degrade_handler ( node_ptr );
/*
* Always run the offline handler
*
* - does nothing unless in fault handling mode
* - looks for offline state during fault handling
*/
nodeLinkClass::offline_handler ( node_ptr );
/*
* Always run the online handler.
*
* - handles offline/online state transitions based on periodic audit
* with mtcAlive debouncing
*/
nodeLinkClass::online_handler ( node_ptr );
/*
* Always run the pxeboot mtcAlive monitor.
*
* - monitor host's mtcAlive messaging
* - manage host's mtcAlive missing alarm
*/
nodeLinkClass::pxeboot_mtcAlive_monitor ( node_ptr );
/* A delete action is serviced ahead of the config action, the in-service
 * test and the whole action/state chain below ; none of them run for a
 * host that is being deleted. */
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__DELETE )
{
flog ("%s -> Delete Action\n", node_ptr->hostname.c_str());
nodeLinkClass::delete_handler ( node_ptr );
return (PASS);
}
/* Run the config FSM if the configAction is one of the password actions.
* We keep this as a separate action unto itself so that
* mtce can continue to service all other actions for the
* same host while it handles configuration commands */
if (( node_ptr->configAction == MTC_CONFIG_ACTION__INSTALL_PASSWD ) ||
( node_ptr->configAction == MTC_CONFIG_ACTION__CHANGE_PASSWD ) ||
( node_ptr->configAction == MTC_CONFIG_ACTION__CHANGE_PASSWD_AGAIN ))
{
nodeLinkClass::cfg_handler ( node_ptr );
}
/****************************************************************************
* In-Service Test: Run the in-service test handler for an unlocked-enabled
* host that is available or degraded, unless ar_disabled is set.
*
* This is a standalone 'if' ; it runs in addition to, and before, whichever
* case of the chain below is selected, and it does not look at adminAction.
*
* This block of code was added to resolve an issue. With this change:
* the insv_test_handler gets run as soon as a host's main function is enabled.
****************************************************************************
*/
if (( node_ptr->ar_disabled == false ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
((node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
(node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
{
// flog ("%s -> insv_test_handler\n", node_ptr->hostname.c_str());
nodeLinkClass::insv_test_handler ( node_ptr );
}
/****************************************************************************
* Add Host Services:
*
* First case of the action/state chain. From here to the final 'else'
* only the first matching case is run.
****************************************************************************
*/
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__ADD )
{
flog ("%s -> Add Action\n", node_ptr->hostname.c_str());
nodeLinkClass::add_handler ( node_ptr );
}
/****************************************************************************
* Out-Of-Service Test: Only the out-of-service test handler is run for an
* unlocked-enabled host that is available or degraded and has no action.
****************************************************************************
*/
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
((node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
(node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
{
// flog ("%s -> oos_test_handler\n", node_ptr->hostname.c_str());
nodeLinkClass::oos_test_handler ( node_ptr );
}
/* Powercycle Host: Run the powercycle handler ; no state is checked here */
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERCYCLE )
{
nodeLinkClass::powercycle_handler ( node_ptr );
}
/****************************************************************************
* Reset Host: Run the Reset handler followed by the out-of-service test
* handler for this Reset Action.
*
* NOTE(review): adminState is not checked by this case, so it is selected
* for an unlocked host as well ; see the note on the 'Semantic Handling'
* case further down.
****************************************************************************
*/
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RESET )
{
flog ("%s -> Reset Action\n", node_ptr->hostname.c_str());
nodeLinkClass::reset_handler ( node_ptr );
nodeLinkClass::oos_test_handler ( node_ptr );
}
/****************************************************************************
* Reboot Host: Run the Reboot handler for this Reboot Action.
*
* NOTE(review): adminState is not checked by this case, so it is selected
* for an unlocked host as well ; see the note on the 'Semantic Handling'
* case further down.
****************************************************************************
*/
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__REBOOT )
{
flog ("%s -> Reboot Action\n", node_ptr->hostname.c_str());
nodeLinkClass::reboot_handler ( node_ptr );
}
/****************************************************************************
* Recovering Host: Run Enable handler for an unlocked-enabled-failed host
* that has no action, or for any host with the Enable action.
****************************************************************************
*/
else if ((( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED )) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE))
{
flog ("%s -> Run Enable Handler\n", node_ptr->hostname.c_str());
nodeLinkClass::enable_handler ( node_ptr );
}
/* Locked-disabled host that is offline, online, offduty or powered off
 * and has no action ; only the out-of-service test handler is run */
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__DISABLED ) &&
(( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFDUTY ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF )))
{
flog ("%s -> Run OOS Test Handler\n", node_ptr->hostname.c_str());
nodeLinkClass::oos_test_handler ( node_ptr );
}
/****************************************************************************
* Recovering Host: Run Recovery handler for the Recover action against an
* unlocked host.
****************************************************************************
*/
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ))
{
flog ("%s -> Run Recovery\n", node_ptr->hostname.c_str());
nodeLinkClass::recovery_handler ( node_ptr );
}
/****************************************************************************
* Recovering Host: Run Enable handler for an unlocked-disabled host that is
* failed, intest, offline or online and has no action.
****************************************************************************
*/
else if ( ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__DISABLED ) &&
(( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__INTEST ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )))
{
flog ("%s -> Run Enable\n", node_ptr->hostname.c_str());
nodeLinkClass::enable_handler ( node_ptr );
}
/* Try and recover an accidentally powered off host.
 *
 * Only done for an unlocked host with no action whose hwmon powercycle
 * control shows zero attempts and the init recovery state. operState is
 * not checked by this case. The power-on itself is not done here ; the
 * action is changed to power-on and this pass ends. */
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) &&
( node_ptr->hwmon_powercycle.attempts == 0 ) &&
( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__INIT ))
{
ilog ("%s auto-poweron for unlocked host\n", node_ptr->hostname.c_str());
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__POWERON );
/* FSM sanity check below will reject this operation, need exit now
 *
 * NOTE(review): as this is one case of the else-if chain, no later case
 * can run in this pass anyway, and the rejection case further down does
 * not list POWERON. The visible effect of returning here is that PASS is
 * returned rather than 'rc' ; confirm whether the comment above is stale. */
return (PASS);
}
/****************************************************************************
* Unlock Host: Run Enable handler for the Unlock Action
***************************************************************************/
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
{
flog ("%s -> Unlock Action\n", node_ptr->hostname.c_str());
/* Proceed to unlock host */
nodeLinkClass::enable_handler ( node_ptr );
}
/****************************************************************************
* Run the Subfunction FSM, usually after the ADD or at the end of the enable
* in a small system.
****************************************************************************/
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE_SUBF )
{
flog ("%s -> Running SubFunction Enable handler (%d)\n",
node_ptr->hostname.c_str(),
node_ptr->enableStage );
nodeLinkClass::enable_subf_handler ( node_ptr );
}
/****************************************************************************
* Lock Host: Run Disable handler for the Lock and Force Lock Actions
****************************************************************************
*/
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__LOCK ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK ))
{
// flog ("%s -> Lock Action\n", node_ptr->hostname.c_str());
nodeLinkClass::disable_handler ( node_ptr );
}
/****************************************************************************
* Semantic Handling: Reject Recovery Actions Against In-Service Host
*
* The action is logged as rejected, changed back to none and the UI task
* is cleared.
*
* NOTE(review): the RESET and REBOOT comparisons below can never be true
* at this point ; both actions are matched unconditionally by the
* 'Reset Host' and 'Reboot Host' cases earlier in this chain. As written
* only POWEROFF and REINSTALL are rejected for an unlocked host. Confirm
* whether reset / reboot of an unlocked host is meant to be rejected here.
****************************************************************************
*/
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
(( node_ptr->adminAction == MTC_ADMIN_ACTION__POWEROFF ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__RESET ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__REBOOT ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__REINSTALL )))
{
flog ("%s -> OOS Action Check\n", node_ptr->hostname.c_str());
elog ("%s Administrative '%s' Operation Rejected\n",
node_ptr->hostname.c_str(),
get_adminAction_str (node_ptr->adminAction) );
elog ("%s Cannot perform out-of-service action against in-service host\n",
node_ptr->hostname.c_str());
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
/* Clear the UI task since we are not really taking this action */
mtcInvApi_update_task ( node_ptr, "" );
}
/****************************************************************************
* Reload Host: Run the Reinstall handler for the Reinstall Action.
*
* An unlocked host with this action was already rejected by the case
* above, so adminState is not UNLOCKED when this case is selected.
****************************************************************************
*/
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__REINSTALL )
{
flog ("%s -> Reload Action\n", node_ptr->hostname.c_str());
nodeLinkClass::reinstall_handler ( node_ptr );
}
/****************************************************************************
* AIO Simplex: Run both the in-service and out-of-service test handlers for
* a locked host with no action on an AIO Simplex system.
*
* A locked-disabled host that is offline, online, offduty or powered off
* is matched by an earlier case and does not get here.
****************************************************************************
*/
else if (( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX ) &&
( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ))
{
nodeLinkClass::insv_test_handler ( node_ptr );
nodeLinkClass::oos_test_handler ( node_ptr );
}
/****************************************************************************
* Power-Off Host: Run the power handler followed by the out-of-service
* test handler.
*
* An unlocked host with this action was already rejected by the
* 'Semantic Handling' case above.
****************************************************************************
*/
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWEROFF )
{
flog ("%s -> Power-Off Action\n", node_ptr->hostname.c_str());
nodeLinkClass::power_handler ( node_ptr );
nodeLinkClass::oos_test_handler ( node_ptr );
}
/****************************************************************************
* Power-On Host: Run the power handler followed by the out-of-service
* test handler. adminState is not checked by this case.
****************************************************************************
*/
else if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERON )
{
flog ("%s -> Power-On Action\n", node_ptr->hostname.c_str());
nodeLinkClass::power_handler ( node_ptr );
nodeLinkClass::oos_test_handler ( node_ptr );
}
/****************************************************************************
* Swact Host Services: Run the swact handler for Swact and Force Swact
****************************************************************************
*/
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__SWACT ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_SWACT ))
{
flog ("%s -> Swact Action\n", node_ptr->hostname.c_str());
nodeLinkClass::swact_handler ( node_ptr );
}
/***** DEGRADED Cases *******/
/* Handle the degrade action
 *
 * NOTE(review): this case cannot be selected. The 'Out-Of-Service Test'
 * case near the top of this chain tests the same action, admin and oper
 * state and accepts both available and degraded, so it always matches
 * first. The body is an empty statement either way. */
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
{
/* We do nothing, the in service test catches this */
// flog ("%s -> Degrade Recovery\n", node_ptr->hostname.c_str());
; // nodeLinkClass::degrade_handler ( node_ptr );
}
/* No case above matched ; log the action/state combination and force the
 * host into a state and action that the chain does handle. */
else
{
/* Log raw numbers if any state is at or beyond its limit value, otherwise
 * log the state names.
 * presumably MTC_ADMIN_STATES, MTC_OPER_STATES and MTC_AVAIL_STATUS are
 * the entry counts of their enums ; verify against their declarations */
if (( node_ptr->adminState >= MTC_ADMIN_STATES ) ||
( node_ptr->operState >= MTC_OPER_STATES ) ||
( node_ptr->availStatus >= MTC_AVAIL_STATUS ))
{
elog ("Unhandled FSM Case: %s %d-%d-%d\n",
node_ptr->hostname.c_str(),
node_ptr->adminState,
node_ptr->operState,
node_ptr->availStatus );
}
else
{
wlog ("Unsupported FSM State: %s Action:%s %s-%s-%s ; auto-correcting ...\n",
node_ptr->hostname.c_str(),
get_adminAction_str ( node_ptr->adminAction ),
adminState_enum_to_str (node_ptr->adminState).c_str(),
operState_enum_to_str (node_ptr->operState).c_str(),
availStatus_enum_to_str (node_ptr->availStatus).c_str());
}
/* Unlocked state overrides unsupported oper-avail states
* Try to recover the host */
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
/* Reset the state in the database for these error states ;
 * unlocked-disabled-online with the unlock action forced.
 * The adminState assignment restates the value just tested above. */
node_ptr->adminState = MTC_ADMIN_STATE__UNLOCKED ;
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus = MTC_AVAIL_STATUS__ONLINE ;
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled" , "online" );
/* Force the action */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__UNLOCK );
}
else
{
/* Reset the state in the database for these error states ;
 * every admin state other than unlocked, including an out of range
 * value, becomes locked-disabled-offline with force lock forced. */
node_ptr->adminState = MTC_ADMIN_STATE__LOCKED ;
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ;
mtcInvApi_update_states ( node_ptr, "locked", "disabled" , "offline" );
/* Force the action */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__FORCE_LOCK );
}
return (PASS);
}
/* PASS, or whatever non-RETRY value cmd_handler returned earlier in this pass */
return (rc) ;
}