
This update adds Maintenance support for receiving host degrade assert and clear messages from collectd. It also disables platform memory, CPU and filesystem resource monitoring in the maintenance resource monitor process (rmon); those resources are now monitored by collectd and therefore should no longer be monitored by rmond.

Change-Id: I13fd033bb1d14f299dcb97fa80296641c958d0a9
Signed-off-by: Jack Ding <jack.ding@windriver.com>
6806 lines
281 KiB
C++
Executable File
/*
 * Copyright (c) 2013-2016 Wind River Systems, Inc.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 */

/****************************************************************************
 * @file
 * Wind River CGTS Platform Node "Handlers" Implementation
 *
 * Description: This file contains the handlers that implement the X.731 FSM.
 *
 * Interfaces:
 *
 *    nodeLinkClass::timer_handler
 *    nodeLinkClass::enable_handler
 *    nodeLinkClass::disable_handler
 *    nodeLinkClass::delete_handler
 *    nodeLinkClass::degrade_handler
 *    nodeLinkClass::reset_handler
 *    nodeLinkClass::reinstall_handler
 *    nodeLinkClass::event_handler
 *    nodeLinkClass::power_handler
 *    nodeLinkClass::recovery_handler
 *    nodeLinkClass::cfg_handler
 *
 ****************************************************************************/
using namespace std;

#define __AREA__ "hdl"

#include "nodeBase.h"      /* for ... basic definitions        */
#include "mtcAlarm.h"      /* for ... mtcAlarm_<severity>      */
#include "nodeTimers.h"    /* for ... mtcTimer_start/stop      */

#include "jsonUtil.h"      /* for ... jsonApi_array_value      */
#include "tokenUtil.h"
#include "regexUtil.h"     /* for ... regexUtil_pattern_match  */

#include "nodeClass.h"     /* All base stuff                   */
#include "ipmiUtil.h"      /* for ... power and reset support  */

#include "mtcNodeMsg.h"    /* for ... send_mtc_cmd             */
#include "mtcInvApi.h"     /* for ... SYSINV API               */
#include "mtcSmgrApi.h"    /* for ... SM API                   */
#include "mtcVimApi.h"     /* for ... VIm API                  */

#include "daemon_ini.h"    /* for ... ini_parse                */
#include "daemon_common.h"

#define LOAD_NODETYPE_TIMERS                                                                \
    if ( is_controller(node_ptr) )                                                          \
    {                                                                                       \
        node_ptr->mtcalive_timeout = daemon_get_cfg_ptr()->controller_mtcalive_timeout ;    \
    }                                                                                       \
    else                                                                                    \
    {                                                                                       \
        node_ptr->mtcalive_timeout = daemon_get_cfg_ptr()->compute_mtcalive_timeout ;       \
    }                                                                                       \
    this->goenabled_timeout = daemon_get_cfg_ptr()->goenabled_timeout + 3 ;                 \
    // Adding 3 seconds to the timeout so that the agent timeout is a
    // little longer than the client.
/*************************************************************
 *
 * Name    : calc_reset_prog_timeout
 *
 * Purpose : Calculate the overall reset progression timeout
 *
 *************************************************************/
int nodeLinkClass::calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr,
                                             int retries )
{
    /* for the management interface */
    int to = MTC_RESET_PROG_OFFLINE_TIMEOUT ;

    /* and add on for the bmc interface if it's provisioned */
    if ( node_ptr->bm_provisioned == true )
        to += MTC_RESET_PROG_OFFLINE_TIMEOUT ;

    /* add a small buffer */
    to += (MTC_ENABLED_TIMER*4) ;

    /* factor in the number of retries */
    to *= (retries+1) ;

    ilog ("%s Reboot/Reset progression has %d sec 'wait for offline' timeout\n",
              node_ptr->hostname.c_str(), to );
    ilog ("%s ... sources - mgmnt:Yes  infra:%s  bmc:%s\n",
              node_ptr->hostname.c_str(),
              infra_network_provisioned ? "Yes" : "No",
              node_ptr->bm_provisioned ? "Yes" : "No" );
    return (to);
}
void mtcTimer_handler ( int sig, siginfo_t *si, void *uc);
|
|
|
|
/* Looks up the timer ID and asserts the corresponding node's ringer */
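/* Each clause below matches the ringing timer id (tid) against one of the node's
 * timers, stops that timer in an interrupt-safe way and sets its 'ring' flag for
 * the owning FSM to act on in process context ; a tid that matches no known
 * timer is simply cancelled at the end of the handler. */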
|
|
void nodeLinkClass::timer_handler ( int sig, siginfo_t *si, void *uc)
|
|
{
|
|
struct nodeLinkClass::node * node_ptr ;
|
|
timer_t * tid_ptr = (void**)si->si_value.sival_ptr ;
|
|
|
|
/* Avoid compiler errors/warnings for parms we must
|
|
* have but currently do nothing with */
|
|
sig=sig ; uc = uc ;
|
|
|
|
if ( !(*tid_ptr) )
|
|
{
|
|
// tlog ("Called with a NULL Timer ID\n");
|
|
return ;
|
|
}
|
|
|
|
/* Is this an offline timer */
|
|
node_ptr = get_offline_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s offline timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->offline_timer );
|
|
node_ptr->offline_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID a mtcAlive timer TID ? */
|
|
node_ptr = get_mtcAlive_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s MtcAlive 'offline' timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcAlive_timer );
|
|
node_ptr->mtcAlive_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID for the command FSM ? */
|
|
node_ptr = get_mtcCmd_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Mtc Command FSM timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcCmd_timer );
|
|
node_ptr->mtcCmd_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID an in-service test timer TID ? */
|
|
node_ptr = get_insvTestTimer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Insv Test timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->insvTestTimer );
|
|
node_ptr->insvTestTimer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID an out-of-service test timer TID ? */
|
|
node_ptr = get_oosTestTimer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Oos Test timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->oosTestTimer );
|
|
node_ptr->oosTestTimer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID a swact timer TID ? */
|
|
node_ptr = get_mtcSwact_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Swact Timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcSwact_timer );
|
|
node_ptr->mtcSwact_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Dead Office Recovery (DOR) Mode Timer */
|
|
if ( *tid_ptr == mtcTimer_dor.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( mtcTimer_dor );
|
|
mtcTimer_dor.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Multi-Node Failure Avoidance Timer ? */
|
|
if ( *tid_ptr == mtcTimer_mnfa.tid )
|
|
{
|
|
// tlog ("%s Mnfa timer ring\n", mtcTimer_mnfa.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( mtcTimer_mnfa );
|
|
mtcTimer_mnfa.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is base mtc timer */
|
|
if ( *tid_ptr == mtcTimer.tid )
|
|
{
|
|
// tlog ("%s Mtc timer ring\n", mtcTimer.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( mtcTimer );
|
|
mtcTimer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is uptime refresh timer ? */
|
|
if ( *tid_ptr == mtcTimer_uptime.tid )
|
|
{
|
|
// tlog ("%s Uptime 'refresh' timer ring\n", mtcTimer_uptime.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( mtcTimer_uptime );
|
|
mtcTimer_uptime.ring = true ;
|
|
|
|
/* This timer also provides a self-corrective action handler as a secondary service.
 * Currently it looks for the following ...
 *
 * 1. A stuck libevent smgrEvent.mutex gate, which it frees after 5 uptime intervals.
 *
 **/
|
|
if ( smgrEvent.mutex )
|
|
{
|
|
daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
|
|
|
|
/* Clear this mutex flag if stuck for more than 5 minutes */
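/* Note: 'stuck' is incremented once per uptime refresh interval while the
 * mutex remains held, so this threshold scales with the configured
 * swact_timeout (converted to minutes). */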
|
|
if ( ++smgrEvent.stuck > ((cfg_ptr->swact_timeout/60)+1))
|
|
{
|
|
// wlog ("Swact Mutex found stuck and has been auto cleared\n");
|
|
smgrEvent.stuck = 0 ;
|
|
smgrEvent.mutex = false ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Clear the stuck count */
|
|
smgrEvent.stuck = 0 ;
|
|
}
|
|
return ;
|
|
}
|
|
/* is keystone token refresh timer ? */
|
|
if (( *tid_ptr == mtcTimer_token.tid ) )
|
|
{
|
|
// tlog ("%s Token 'refresh' timer ring\n", mtcTimer_token.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( mtcTimer_token );
|
|
mtcTimer_token.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is the http request timer ? */
|
|
node_ptr = get_http_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Http timer ring\n", node_ptr->http_timer.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->http_timer );
|
|
node_ptr->http_timer.ring = true ;
|
|
|
|
if ( node_ptr->http_timer.mutex == true )
|
|
node_ptr->http_timer.error = true ;
|
|
|
|
return ;
|
|
}
|
|
|
|
/* get the node */
|
|
node_ptr = get_mtcTimer_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcTimer );
|
|
node_ptr->mtcTimer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID a config timer TID ? */
|
|
node_ptr = get_mtcConfig_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Config Timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcConfig_timer );
|
|
node_ptr->mtcConfig_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is the thread timer ? */
|
|
node_ptr = get_thread_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->ipmitool_thread_ctrl.timer );
|
|
node_ptr->ipmitool_thread_ctrl.timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is the ping timer ? */
|
|
node_ptr = get_ping_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
/* is this the bm ping timer */
|
|
if ( *tid_ptr == node_ptr->bm_ping_info.timer.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->bm_ping_info.timer );
|
|
node_ptr->bm_ping_info.timer.ring = true ;
|
|
return ;
|
|
}
|
|
/* there may be other ping timers introduced later */
|
|
}
|
|
|
|
/* is the bm handler timer ? */
|
|
node_ptr = get_bm_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
/* is this the bm timer */
|
|
if ( *tid_ptr == node_ptr->bm_timer.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->bm_timer );
|
|
node_ptr->bm_timer.ring = true ;
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* is the bmc handler timer ? */
|
|
node_ptr = get_bmc_access_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
/* is this the bmc access timer */
|
|
if ( *tid_ptr == node_ptr->bmc_access_timer.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->bmc_access_timer );
|
|
node_ptr->bmc_access_timer.ring = true ;
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* is the host services handler timer ? */
|
|
node_ptr = get_host_services_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
/* is this the host services timer */
|
|
if ( *tid_ptr == node_ptr->host_services_timer.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->host_services_timer );
|
|
node_ptr->host_services_timer.ring = true ;
|
|
return ;
|
|
}
|
|
}
|
|
|
|
node_ptr = get_powercycle_recovery_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
if (( *tid_ptr == node_ptr->hwmon_powercycle.recovery_timer.tid ) )
|
|
{
|
|
if ( node_ptr->hwmon_powercycle.attempts )
|
|
{
|
|
tlog ("%s powercycle monitor completed successfully after attempt %d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.attempts);
|
|
}
|
|
|
|
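/* the powercycle recovery hold-off period has expired ; re-initialize the
 * powercycle recovery control context for any future events */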
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
|
|
|
|
if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->availStatus != MTC_AVAIL_STATUS__POWERED_OFF ))
|
|
{
|
|
node_ptr->clear_task = true ;
|
|
}
|
|
|
|
/* cancel the timer */
|
|
mtcTimer_stop_int_safe ( node_ptr->hwmon_powercycle.recovery_timer );
|
|
|
|
node_ptr->hwmon_powercycle.recovery_timer.ring = true ;
|
|
|
|
return ;
|
|
}
|
|
}
|
|
|
|
node_ptr = get_powercycle_control_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
if (( *tid_ptr == node_ptr->hwmon_powercycle.control_timer.tid ) )
|
|
{
|
|
/* cancel the timer */
|
|
mtcTimer_stop_int_safe ( node_ptr->hwmon_powercycle.control_timer );
|
|
|
|
node_ptr->hwmon_powercycle.control_timer.ring = true ;
|
|
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* Is this TID a reset recovery timer TID ? */
|
|
node_ptr = get_reset_recovery_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
if (( *tid_ptr == node_ptr->hwmon_reset.recovery_timer.tid ) )
|
|
{
|
|
tlog ("%s clearing hwmon reset holdoff timer\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
|
|
mtcTimer_stop_int_safe ( node_ptr->hwmon_reset.recovery_timer );
|
|
|
|
node_ptr->hwmon_reset.recovery_timer.ring = true ;
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* Is this TID a reset control timer TID ? */
|
|
node_ptr = get_reset_control_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
if (( *tid_ptr == node_ptr->hwmon_reset.control_timer.tid ) )
|
|
{
|
|
tlog ("%s ringing hwmon reset control timer\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_stop_int_safe ( node_ptr->hwmon_reset.control_timer );
|
|
|
|
node_ptr->hwmon_reset.control_timer.ring = true ;
|
|
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* cancel the timer by tid */
|
|
mtcTimer_stop_tid_int_safe ( tid_ptr );
|
|
}
|
|
|
|
/* Inventory Object wrapper - does a node lookup and calls the timer handler */
|
|
void mtcTimer_handler ( int sig, siginfo_t *si, void *uc)
|
|
{
|
|
nodeLinkClass * object_ptr = get_mtcInv_ptr() ;
|
|
object_ptr->timer_handler ( sig, si, uc );
|
|
}
|
|
|
|
/** Responsible for recovering a host into its enabled state
 *
 *  Steps: availability is either unavailable, failed, or intest if a previous enable failed
 *   1. enable Start
 *        operational = disabled
 *   2. Notify VM Manager (signal)
 *   3. send disabled message to heartbeat service (message)
 *   4. reboot host (message)
 *        availability = intest
 *   5. wait for mtc alive (timer)
 *   6. wait for go enabled (timer)
 *   7. send enabled message to heartbeat service (message)
 *   8. change state to enabled
 *        availability = available
 */
|
|
|
|
int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( THIS_HOST )
|
|
{
|
|
/******************************************************************
|
|
*
|
|
* Intercept the unlock action for self.
|
|
* 1. change the admin state to unlocked,
|
|
* 2. send a lazy reboot and
|
|
* 3. wait for the reboot
|
|
*
|
|
******************************************************************/
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
|
|
{
|
|
bool aio = false ;
|
|
if ( SIMPLEX_CPE_SYSTEM )
|
|
aio = true ;
|
|
else
|
|
aio = false ;
|
|
|
|
mtcInvApi_update_states_now ( node_ptr, "unlocked", "disabled" , "offline", "disabled", "offline" );
|
|
mtcInvApi_update_task_now ( node_ptr, aio ? MTC_TASK_CPE_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG );
|
|
|
|
wlog ("%s unlocking %s with reboot\n",
|
|
my_hostname.c_str(),
|
|
aio ? "Simplex System" : "Active Controller" );
|
|
|
|
/* should not return */
|
|
return ( lazy_graceful_fs_reboot ( node_ptr ));
|
|
}
|
|
}
|
|
|
|
switch ( (int)node_ptr->handlerStage.enable )
|
|
{
|
|
case MTC_ENABLE__FAILURE:
|
|
{
|
|
/**************************************************************
 * Failure of the active controller has special handling.
 *
 * Condition 1: There is no in-service backup controller
 *              to swact to. In this case the active controller
 *              - is only degraded to avoid a system outage.
 *              - the CPE subfunction is failed
 *              - compute SubFunction Alarm is raised
 *              - Enable alarm is raised
 *              - A process monitor alarm may also be raised if
 *                the failure was that of a critical process.
 *
 * Condition 2: There is another controller to Swact to.
 *              In this case the active controller is failed
 *              and maintenance will trigger SM to Swact and
 *              the failing active controller will get
 *              auto-recovered by the takeover controller.
 *
 * Condition 3: AIO Simplex failures can request thresholded
 *              auto-recovery. In doing so maintenance will
 *              increment the count in an auto recovery counter
 *              file and self reboot if that count does not exceed
 *              the auto recovery threshold. After 3 retries the
 *              threshold is exceeded and then maintenance stops
 *              self rebooting and enters the state specified by
 *              condition 1 above.
 *
 ***************************************************************/
|
|
bool degrade_only = false ;
|
|
|
|
elog ("%s Main Enable FSM (from failed)\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* Stop heartbeat */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
|
{
|
|
hbs_minor_clear ( node_ptr, (iface_enum)iface );
|
|
}
|
|
|
|
node_ptr->cmdReq = MTC_CMD_NONE ;
|
|
node_ptr->cmdRsp = MTC_CMD_NONE ;
|
|
node_ptr->cmdRsp_status = 0 ;
|
|
|
|
/* Raise Critical Enable Alarm */
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
/* Handle active controller failures */
|
|
if ( THIS_HOST )
|
|
{
|
|
/* Don't fail the only controller, degrade instead */
|
|
degrade_only = true ;
|
|
|
|
/* If the inactive controller is enabled then try to swact to it.
 * SM will reject till it's ready ; until then just run degraded */
|
|
if ( is_inactive_controller_main_insv() == true )
|
|
{
|
|
wlog ("%s has critical failure\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... requesting swact to in-service inactive controller\n", node_ptr->hostname.c_str());
|
|
|
|
mtcInvApi_update_task_now ( node_ptr, MTC_TASK_FAILED_SWACT_REQ );
|
|
|
|
/* Inform the VIM of the failure */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
|
|
|
/* ask SM to swact to the backup controller */
|
|
mtcSmgrApi_request ( node_ptr, CONTROLLER_SWACT, 0 );
|
|
|
|
for ( int i = 0 ; i < SMGR_MAX_RETRIES ; i++ )
|
|
{
|
|
daemon_signal_hdlr ();
|
|
sleep (1);
|
|
|
|
/* Try and receive the response */
|
|
if ( mtcHttpUtil_receive ( nodeLinkClass::smgrEvent ) != RETRY )
|
|
{
|
|
wlog ("%s SM Swact Request Response: %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
smgrEvent.response.c_str());
|
|
break ;
|
|
}
|
|
}
|
|
if ( nodeLinkClass::smgrEvent.active == true )
|
|
{
|
|
slog ("%s freeing smgrEvent activity state\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::smgrEvent.active = false ;
|
|
}
|
|
|
|
/* if we get here then proceed to delay for another swact attempt */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_SWACT_WAIT );
|
|
|
|
/* force ourselves into the enable handler */
|
|
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE) &&
|
|
( node_ptr->adminAction != MTC_ADMIN_ACTION__SWACT) &&
|
|
( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK) &&
|
|
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK))
|
|
{
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__ENABLE );
|
|
}
|
|
|
|
/* Wait 30 seconds before trying the Swact again */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_30 );
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
this->autorecovery_enabled = true ;
|
|
|
|
/* use thresholded auto recovery for the simplex failure case */
|
|
manage_autorecovery ( node_ptr );
|
|
|
|
if ( this->autorecovery_disabled == false )
|
|
{
|
|
wlog ("%s has critical failure.\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... downgrading to degrade with auto recovery disabled\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... to avoid disabling only enabled controller\n", node_ptr->hostname.c_str());
|
|
this->autorecovery_disabled = true ;
|
|
}
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
/* Raise Critical Compute Function Alarm */
|
|
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL );
|
|
}
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
}
|
|
}
|
|
|
|
/* Start fresh the next time we enter graceful recovery handler */
|
|
node_ptr->graceful_recovery_counter = 0 ;
|
|
node_ptr->health_threshold_counter = 0 ;
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
node_ptr->inservice_failed_subf = true ;
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
|
|
if ( degrade_only == true )
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
else
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
|
|
/* Inform the VIM of the failure */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
|
|
|
/* if we get here in controller simplex mode then go degraded
|
|
* if we are not already degraded. Otherwise, fail. */
|
|
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
|
|
{
|
|
/* autorecovery must be disabled */
|
|
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
|
|
( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
|
|
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED))
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
/* adminAction state is already changed to NONE. */
|
|
}
|
|
else
|
|
{
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT );
|
|
}
|
|
|
|
break;
|
|
}
|
|
case MTC_ENABLE__FAILURE_SWACT_WAIT:
|
|
{
|
|
if (( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
|
|
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED ))
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
|
|
/* wait for the swact, or re-enter MTC_ENABLE__FAILURE and likely
 * try the swact request again */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__FAILURE_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) == false )
|
|
{
|
|
break ;
|
|
}
|
|
/* Stop the enable sequence if the host is now locked ;
 * this might occur if the unlock failed from inventory */
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
|
{
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
mtcInvApi_update_task ( node_ptr, "" );
|
|
}
|
|
enableStageChange ( node_ptr, MTC_ENABLE__START );
|
|
node_ptr->mtcTimer.ring = false ;
|
|
break ;
|
|
/* Fall through */
|
|
}
|
|
case MTC_ENABLE__START:
|
|
{
|
|
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
|
|
|
plog ("%s Main Enable FSM (from start)%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (from DOR)" : "" );
|
|
|
|
/* clear all the past enable failure bools */
|
|
clear_main_failed_bools ( node_ptr );
|
|
clear_subf_failed_bools ( node_ptr );
|
|
clear_hostservices_ctls ( node_ptr );
|
|
|
|
/* Clear all degrade flags except for the HWMON one */
|
|
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
|
node_ptr->degraded_resources_list.clear();
|
|
|
|
/* Purge this hosts work and done queues */
|
|
workQueue_purge ( node_ptr );
|
|
doneQueue_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
/* Assert the mtc alive gate */
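/* While gated (true), incoming mtcAlive indications for this node are
 * ignored ; the gate is reopened later in the enable sequence. */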
|
|
node_ptr->mtcAlive_gate = true ;
|
|
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_offline = true ;
|
|
node_ptr->health_threshold_counter = 0 ;
|
|
node_ptr->graceful_recovery_counter = 0 ;
|
|
node_ptr->http_retries_cur = 0 ;
|
|
node_ptr->insv_test_count = 0 ;
|
|
node_ptr->mnfa_graceful_recovery = false ;
|
|
|
|
node_ptr->goEnabled = false ;
|
|
node_ptr->goEnabled_subf = false ;
|
|
|
|
mtc_nodeAvailStatus_enum availStatus_temp = node_ptr->availStatus ;
|
|
switch ( node_ptr->availStatus )
|
|
{
|
|
case MTC_AVAIL_STATUS__INTEST:
|
|
case MTC_AVAIL_STATUS__FAILED:
|
|
|
|
/* enable auto recovery if the inactive controller
|
|
* is out of service */
|
|
if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST ))
|
|
this->autorecovery_enabled = true ;
|
|
|
|
/* fall through */
|
|
|
|
case MTC_AVAIL_STATUS__DEGRADED:
|
|
case MTC_AVAIL_STATUS__AVAILABLE:
|
|
{
|
|
if (( is_active_controller ( node_ptr->hostname )) &&
|
|
( is_inactive_controller_main_insv() == false ))
|
|
{
|
|
wlog ("%s recovering active controller from %s-%s-%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
get_adminState_str(node_ptr->adminState).c_str(),
|
|
get_operState_str(node_ptr->operState).c_str(),
|
|
get_availStatus_str(node_ptr->availStatus).c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr, "" );
|
|
|
|
/* Special case */
|
|
// alarm_enabled_clear ( node_ptr, false );
|
|
|
|
//mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
|
|
//node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
|
|
|
|
//allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
// MTC_OPER_STATE__ENABLED,
|
|
// MTC_AVAIL_STATUS__DEGRADED );
|
|
|
|
// adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
|
|
// return (PASS);
|
|
}
|
|
else
|
|
{
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )
|
|
{
|
|
if ( node_ptr->operState != MTC_OPER_STATE__DISABLED )
|
|
{
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_FAILED );
|
|
}
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
/* Lets make any availability state corrections */
|
|
|
|
case MTC_AVAIL_STATUS__OFFDUTY:
|
|
case MTC_AVAIL_STATUS__ONLINE:
|
|
availStatus_temp = MTC_AVAIL_STATUS__ONLINE;
|
|
break ;
|
|
case MTC_AVAIL_STATUS__OFFLINE:
|
|
case MTC_AVAIL_STATUS__NOT_INSTALLED:
|
|
availStatus_temp = MTC_AVAIL_STATUS__OFFLINE;
|
|
break ;
|
|
|
|
default:
|
|
slog ("Unknown availability state (%d)\n", availStatus_temp);
|
|
break ;
|
|
}
|
|
|
|
/* Never send a disable request to SM for this controller
|
|
* or SM will shut us down. */
|
|
if ( is_controller ( node_ptr ) && NOT_THIS_HOST )
|
|
{
|
|
mtcSmgrApi_request ( node_ptr,
|
|
CONTROLLER_DISABLED,
|
|
SMGR_MAX_RETRIES );
|
|
}
|
|
rc = allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
availStatus_temp );
|
|
|
|
if (( rc != PASS ) && ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ))
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__LOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
availStatus_temp );
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_UNLOCK_FAILED );
|
|
|
|
elog ("%s 'unlock' failed by System Inventory (rc:%d)\n",
|
|
node_ptr->hostname.c_str(), rc ) ;
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, 15 );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( NOT_THIS_HOST )
|
|
{
|
|
/* lets stop heartbeat */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_STOP_CMD );
|
|
}
|
|
else
|
|
{
|
|
/* skip over the reset part as that was taken care of and we are
 * in the reboot recovery phase now. Look for the mtcAlive */

/* In self-enable we don't need to purge mtcAlive ; we just need
 * to wait for one more. Assume offline, not online, and open
 * the mtcAlive gate. */
|
|
node_ptr->mtcAlive_gate = false ;
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_offline = true ;
|
|
/* set mtcAlive timeout */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_30 ) ;
|
|
|
|
/* timer is started ok so we can do the stage transition */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__MTCALIVE_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__HEARTBEAT_STOP_CMD:
|
|
{
|
|
/* Stop heartbeat */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
|
|
/* Clear the minor and failure flags if they are set for this host */
|
|
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
|
{
|
|
hbs_minor_clear ( node_ptr, (iface_enum)iface );
|
|
node_ptr->heartbeat_failed[iface] = false ;
|
|
}
|
|
|
|
/* now reset/reboot the node by running reset progression */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION );
|
|
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__RECOVERY_TIMER:
|
|
{
|
|
/* start the recovery wait timer */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECOVERY_TIMEOUT );
|
|
ilog ("%s Delaying Recovery for %d seconds\n",
|
|
node_ptr->hostname.c_str(),MTC_RECOVERY_TIMEOUT);
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RECOVERY_WAIT );
|
|
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__RECOVERY_WAIT:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION );
|
|
|
|
node_ptr->mtcTimer.ring = false ;
|
|
}
|
|
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )
|
|
{
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
break;
|
|
}
|
|
case MTC_ENABLE__RESET_PROGRESSION:
|
|
{
|
|
int overall_timeout = 0 ;
|
|
|
|
plog ("%s reboot\n", node_ptr->hostname.c_str() );
|
|
|
|
/* Health will get updated in the first
|
|
* mtcAlive message after reset */
|
|
node_ptr->health = NODE_HEALTH_UNKNOWN ;
|
|
|
|
node_ptr->mtcCmd_work_fifo.clear();
|
|
mtcCmd_init ( node_ptr->cmd );
|
|
node_ptr->cmd.stage = MTC_CMD_STAGE__START ;
|
|
node_ptr->cmd.cmd = MTC_OPER__RESET_PROGRESSION ;
|
|
node_ptr->cmd.parm1 = 0 ; /* retries */
|
|
node_ptr->cmd.task = true ; /* send task updates */
|
|
node_ptr->mtcCmd_work_fifo.push_front(node_ptr->cmd);
|
|
|
|
/* calculate the overall timeout period taking into account
|
|
* all the reboot/reset sources that will be tried */
|
|
overall_timeout = calc_reset_prog_timeout ( node_ptr , node_ptr->cmd.parm1 ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, overall_timeout ) ;
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RESET_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__RESET_WAIT:
|
|
{
|
|
/* Wait for the reset progression FSM to complete */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
wlog ("%s Reset Progression Timeout\n", node_ptr->hostname.c_str());
|
|
|
|
/* trigger some delay before another attempt */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RECOVERY_TIMER );
|
|
|
|
/* if we timeout then remove the reset progression command
|
|
* and cleanup the done queue ; just in case */
|
|
if ( node_ptr->mtcCmd_done_fifo.size() )
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
if ( node_ptr->mtcCmd_work_fifo.size() )
|
|
node_ptr->mtcCmd_work_fifo.pop_front();
|
|
}
|
|
else if ( node_ptr->mtcCmd_done_fifo.size() )
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
node_ptr->mtcCmd_done_fifo_ptr =
|
|
node_ptr->mtcCmd_done_fifo.begin();
|
|
if ( node_ptr->mtcCmd_done_fifo_ptr->status != PASS )
|
|
{
|
|
wlog ("%s Reset Unsuccessful (retries:%d) (rc:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->cmd.parm1,
|
|
node_ptr->mtcCmd_done_fifo_ptr->status );
|
|
|
|
/* trigger some delay before another attempt */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RECOVERY_TIMER );
|
|
}
|
|
else /* ... we got the reset or reboot */
|
|
{
|
|
/* Set the FSM task state to booting */
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_BOOTING );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__INTEST_START );
|
|
}
|
|
/* Remove the reset progression command now that it is done */
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__INTEST_START:
|
|
{
|
|
plog ("%s Booting (timeout: %d secs) (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcalive_timeout,
|
|
node_ptr->node_unlocked_counter);
|
|
|
|
node_ptr->cmdReq = MTC_CMD_NONE ;
|
|
node_ptr->cmdRsp = MTC_CMD_NONE ;
|
|
node_ptr->unknown_health_reported = false ;
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_offline = true ;
|
|
node_ptr->goEnabled = false ;
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
/* Set uptime to zero in mtce and in the database */
|
|
node_ptr->uptime_save = 0 ;
|
|
set_uptime ( node_ptr, 0 , false );
|
|
|
|
/* start the timer that waits for MTC READY */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, node_ptr->mtcalive_timeout );
|
|
|
|
node_ptr->mtcAlive_purge = 0 ;
|
|
|
|
/* timer is started ok so we can do the stage transition */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__MTCALIVE_PURGE );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__MTCALIVE_PURGE:
|
|
{
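/* Purge stage: spend a number of FSM passes discarding stale mtcAlive
 * indications received around the reboot request before opening the
 * mtcAlive gate and waiting for a fresh mtcAlive message. */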
|
|
node_ptr->mtcAlive_purge += 1 ;
|
|
|
|
if ( node_ptr->mtcAlive_purge >= 20 )
|
|
{
|
|
/* open gate */
|
|
node_ptr->mtcAlive_gate = false ;
|
|
|
|
node_ptr->mtcAlive_purge = 0 ;
|
|
/* timer is started ok so we can do the stage transition */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__MTCALIVE_WAIT );
|
|
}
|
|
#ifdef WANT_PURGE_LOG
|
|
else
|
|
{
|
|
dlog2 ("%s purging (%d) ...\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcAlive_purge );
|
|
}
|
|
#endif
|
|
/* Clear out any mtcAlive messages that may
|
|
* have come in while we were purging */
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_offline = true ;
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__MTCALIVE_WAIT:
|
|
{
|
|
/* search for the mtc alive message */
|
|
if ( node_ptr->mtcAlive_online == true )
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* Check to see if the host is/got configured correctly */
|
|
if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 )
|
|
{
|
|
elog ("%s configuration incomplete or failed (oob:%x:%x)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtce_flags,
|
|
MTC_FLAG__I_AM_CONFIGURED);
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
else
|
|
{
|
|
plog ("%s is MTCALIVE (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
|
|
|
|
/* Set the node mtcAlive timer to configured value.
|
|
* This will revert back to the normal timeout after any first
|
|
* unlock value that may be in effect. */
|
|
LOAD_NODETYPE_TIMERS ;
|
|
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_ONLINE );
|
|
node_ptr->offline_log_reported = false ;
|
|
node_ptr->online_log_reported = true ;
|
|
|
|
/* Request Out-Of-Service test execution */
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE );
|
|
|
|
/* now officially in the In-Test state */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__INTEST );
|
|
|
|
/* O.K. Clear the alive */
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
/* Go to the goEnabled stage */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER );
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_TESTING );
|
|
}
|
|
break ;
|
|
}
|
|
else if ( mtcTimer_expired ( node_ptr->mtcTimer) )
|
|
{
|
|
elog ("%s Timeout waiting for MTCALIVE\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the enable failure */
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
/* go back and issue reboot again */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION );
|
|
|
|
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )
|
|
{
|
|
/* no longer In-Test ; we are 'Failed' again */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
|
|
/* Set the FSM task state to init failed */
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_BOOT_FAIL );
|
|
|
|
break ;
|
|
}
|
|
else if ( node_ptr->mtcAlive_gate == true )
|
|
{
|
|
slog ("%s mtcAlive gate unexpectedly set, correcting ...\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
node_ptr->mtcAlive_gate = false ;
|
|
}
|
|
|
|
/* wait some more */
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__GOENABLED_TIMER:
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer,
|
|
mtcTimer_handler, this->goenabled_timeout);
|
|
|
|
ilog ("%s waiting for GOENABLED (timeout: %d secs)\n",
|
|
node_ptr->hostname.c_str(), this->goenabled_timeout );
|
|
|
|
node_ptr->goEnabled = false ;
|
|
|
|
/* start waiting for the ENABLE READY message */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__GOENABLED_WAIT:
|
|
{
|
|
/* The healthy code comes from the host in the mtcAlive message.
|
|
* This 'if' clause was introduced to detect failure of the host
|
|
* without having to wait for the GOENABLED phase to timeout.
|
|
*
|
|
* This case is particularly important in the DOR case where
|
|
* computes may have come up and fail to run their manifests
|
|
* and sit there in an unconfigured state. We don't want them to
|
|
* be gracefully recovered to enabled in that case. Instead
|
|
* we want to recover the card through a reset as quickly as
|
|
* possible. */
|
|
if ( node_ptr->health == NODE_UNHEALTHY )
|
|
{
|
|
elog ("%s is UNHEALTHY\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled_failed == true )
|
|
{
|
|
elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled == true )
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
plog ("%s got GOENABLED\n", node_ptr->hostname.c_str());
|
|
// plog ("%s main configured OK\n", node_ptr->hostname.c_str());
|
|
|
|
/* O.K. clearing the state now that we got it */
|
|
node_ptr->goEnabled = false ;
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_INITIALIZING );
|
|
|
|
/* ok. great, got the go-enabled message, lets move on */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START );
|
|
}
|
|
else if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
|
{
|
|
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... the out-of-service tests took too long to complete\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL_TO_ );
|
|
node_ptr->mtcTimer.ring = false ;
|
|
|
|
/* raise an alarm for the enable failure */
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
/* go back and issue reboot again */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
|
|
/* no longer In-Test ; we are 'Failed' again */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
else
|
|
{
|
|
; /* wait some more */
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__HOST_SERVICES_START:
|
|
{
|
|
bool start = true ;
|
|
|
|
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
|
|
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
|
{
|
|
node_ptr->hostservices_failed = true ;
|
|
|
|
elog ("%s %s failed ; launch\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_START_SERVICE_FAIL );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
else
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING );
|
|
|
|
/* Only run hardware monitor if board management is provisioned */
|
|
if ( node_ptr->bm_provisioned == true )
|
|
{
|
|
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
}
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__HOST_SERVICES_WAIT:
|
|
{
|
|
/* Wait for host services to complete - pass or fail.
|
|
* The host_services_handler manages timeout. */
|
|
rc = this->host_services_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait for the mtcClient's response ... */
|
|
break ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
node_ptr->hostservices_failed = true ;
|
|
/* distinguish 'timeout' from other 'execution' failures */
|
|
if ( rc == FAIL_TIMEOUT )
|
|
{
|
|
elog ("%s %s failed ; timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_TO );
|
|
}
|
|
else
|
|
{
|
|
elog ("%s %s failed ; rc:%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str(),
|
|
rc);
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_FAIL );
|
|
}
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
else /* success path */
|
|
{
|
|
/* Don't start the self heartbeat for the active controller.
|
|
* Also, in AIO , hosts that have a controller function also
|
|
* have a compute function and the heartbeat for those hosts
|
|
* are started at the end of the subfunction handler. */
|
|
if (( THIS_HOST ) ||
|
|
(( CPE_SYSTEM ) && ( is_controller(node_ptr)) ))
|
|
{
|
|
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
|
|
}
|
|
else
|
|
{
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__HEARTBEAT_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
wlog ("%s hbsClient ready event timeout\n", node_ptr->hostname.c_str());
|
|
}
|
|
else if ( node_ptr->hbsClient_ready == false )
|
|
{
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
}
|
|
|
|
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
|
|
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
|
|
|
|
/* Start Monitoring Services - heartbeat, process and hardware */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
|
|
/* allow heartbeat to run for 10 seconds before we declare enable */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__HEARTBEAT_SOAK:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
plog ("%s heartbeating\n", node_ptr->hostname.c_str() );
|
|
/* if heartbeat is not working then we will
|
|
* never get here and enable the host */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__STATE_CHANGE:
|
|
{
|
|
/* Check the work queue complete and done status's */
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
|
|
if ( node_ptr->degrade_mask )
|
|
{
|
|
/* Allow host to enable in the degraded state */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
else
|
|
{
|
|
/* Set node as unlocked-enabled */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__AVAILABLE );
|
|
}
|
|
|
|
/* Now that we have posted the unlocked-enabled-available state we need
|
|
* to force the final part of the enable sequence through */
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE )
|
|
{
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__ENABLE );
|
|
}
|
|
|
|
/* Start a timer that fails the enable if the work queue
|
|
* does not empty or if commands in the done queue have failed */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, work_queue_timeout );
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__WORKQUEUE_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__WORKQUEUE_WAIT:
|
|
{
|
|
bool fail = false ;
|
|
rc = workQueue_done ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait longer */
|
|
break ;
|
|
}
|
|
else if ( rc == FAIL_WORKQ_TIMEOUT )
|
|
{
|
|
elog ("%s enable failed ; Enable workQueue timeout, purging ...\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_WORK_TO );
|
|
fail = true ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
elog ("%s Enable failed ; Enable doneQueue has failed commands\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_WORK_FAIL );
|
|
fail = true ;
|
|
}
|
|
else if ( NOT_THIS_HOST )
|
|
{
|
|
/* Loop over the heartbeat interfaces and fail the Enable if any of them are failing */
|
|
for ( int i = 0 ; i < MAX_IFACES ; i++ )
|
|
{
|
|
if ( node_ptr->heartbeat_failed[i] == true )
|
|
{
|
|
elog ("%s Enable failure due to %s Network *** Heartbeat Loss ***\n",
|
|
node_ptr->hostname.c_str(),
|
|
get_iface_name_str ((iface_enum)i));
|
|
|
|
fail = true ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( fail == false )
|
|
{
|
|
/* Go enabled */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__ENABLED );
|
|
}
|
|
else
|
|
{
|
|
workQueue_purge ( node_ptr );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__ENABLED:
|
|
{
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
/* Defer telling SM the controller state if
|
|
* this is a CPE and this is the only controller */
|
|
if ( CPE_SYSTEM && ( num_controllers_enabled() > 0 ))
|
|
{
|
|
wlog ("%s deferring SM enable notification till subfunction-enable complete\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
mtc_cmd_enum cmd = CONTROLLER_ENABLED ;
|
|
|
|
/* Override cmd of ENABLED if action is UNLOCK */
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
|
|
{
|
|
cmd = CONTROLLER_UNLOCKED ;
|
|
}
|
|
|
|
if ( mtcSmgrApi_request ( node_ptr, cmd, SMGR_MAX_RETRIES ) != PASS )
|
|
{
|
|
wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager (%d) ; enabling anyway\n",
|
|
node_ptr->hostname.c_str(), cmd );
|
|
}
|
|
}
|
|
}
|
|
|
|
alarm_enabled_clear ( node_ptr, false );
|
|
|
|
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
|
|
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
|
|
node_ptr->degrade_mask &= ~DEGRADE_MASK_CONFIG ;
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__START );
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr)))
|
|
{
|
|
ilog ("%s running compute sub-function enable handler\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF );
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__ENABLE_SUBF );
|
|
}
|
|
else
|
|
{
|
|
|
|
node_ptr->enabled_count++ ;
|
|
|
|
/* Inform the VIM that this host is enabled */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
|
|
|
|
plog ("%s is ENABLED%s\n", node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (from DOR)" : "");
|
|
node_ptr->dor_recovery_mode = false ;
|
|
node_ptr->was_dor_recovery_mode = false ;
|
|
node_ptr->http_retries_cur = 0 ;
|
|
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
|
|
node_ptr->health_threshold_counter = 0 ;
|
|
}
|
|
|
|
break ;
|
|
}
|
|
|
|
default:
|
|
rc = FAIL_BAD_CASE ;
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
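/* Tracks the last graceful recovery stage observed by recovery_handler below
 * so that stage transitions can be detected ; -1 means no stage seen yet. */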
int recovery_state_gate = -1 ;
|
|
|
|
/* Graceful Recovery handler
|
|
* -------------------------
|
|
* Tries to recover a failed host back in service
|
|
* - auto recovery if it only disappeared for 5 seconds
|
|
* - avoiding a double reset if it was gone for longer or was known to reset */
|
|
int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( node_ptr->recoveryStage != recovery_state_gate )
|
|
{
|
|
recovery_state_gate = node_ptr->recoveryStage ;
|
|
}
|
|
switch ( (int)node_ptr->recoveryStage )
|
|
{
|
|
case MTC_RECOVERY__FAILURE:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == false )
|
|
{
|
|
break ;
|
|
}
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
|
node_ptr->mtcTimer.ring = false ;
|
|
|
|
break ;
|
|
}
|
|
|
|
case MTC_RECOVERY__START:
|
|
{
|
|
/* Purge this hosts work queues */
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
node_ptr->http_retries_cur = 0 ;
|
|
node_ptr->unknown_health_reported = false ;
|
|
|
|
plog ("%s %sGraceful Recovery (uptime was %d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mnfa_graceful_recovery ? "MNFA " : "",
|
|
node_ptr->uptime );
|
|
|
|
/* Cancel any outstanding timers */
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* clear all the past enable failure bools */
|
|
clear_main_failed_bools ( node_ptr );
|
|
clear_subf_failed_bools ( node_ptr );
|
|
clear_hostservices_ctls ( node_ptr );
|
|
|
|
/* Disable the heartbeat service for Graceful Recovery */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
|
|
/* Clear the minor and failure flags if it is set for this host */
|
|
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
|
{
|
|
hbs_minor_clear ( node_ptr, (iface_enum)iface );
|
|
node_ptr->heartbeat_failed[iface] = false ;
|
|
}
|
|
|
|
/* Have we reached the maximum allowed fast recovery attempts.
|
|
*
|
|
* If we have then force the full enable by
|
|
* 1. clearing the recovery action
|
|
* 2. Setting the node operational state to Disabled
|
|
* 3. Setting the Enable action
|
|
*/
|
|
if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
|
|
{
|
|
/* gate off further mtcAlive messaging until the offline
 * handler runs. This prevents stale messages from making it
 * in and prolonging the offline detection time */
|
|
node_ptr->mtcAlive_gate = true ;
|
|
|
|
elog ("%s Graceful Recovery Failed (retries=%d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->graceful_recovery_counter );
|
|
|
|
/* This forces exit from the recovery handler and entry into the
 * enable_handler via the FAILED availability state and no action. */
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
/* TODO: Consider taking this log out as writing to the database
|
|
* during a fast graceful recovery might not be the best idea */
|
|
if ( node_ptr->graceful_recovery_counter > 1 )
|
|
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
|
|
else
|
|
mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
|
|
}
|
|
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__REQ_MTCALIVE:
|
|
{
|
|
/* Clear any recent mtcAlive notification ; start a new :) */
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
/* Clear any recent goEnable notification ; start a new :) */
|
|
node_ptr->goEnabled = false ;
|
|
|
|
/* Save the node's last recorded uptime and request mtcAlive from
|
|
* seemingly failed host. Uptime is saved because when the next
|
|
* mtcAlive comes in, the uptime will be overwritten and we need
* it to compare as a decision point later on in recovery handling */
|
|
node_ptr->uptime_save = node_ptr->uptime ;
|
|
|
|
/* A host is considered failed if it goes away for more
|
|
* than a Loss Of Communication Recovery Timeout specified as mtc.ini
|
|
* configuration option 'loc_recovery_timeout' time in seconds. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, loc_recovery_timeout );
|
|
|
|
ilog ("%s requesting mtcAlive with %d sec timeout\n",
|
|
node_ptr->hostname.c_str(), loc_recovery_timeout);
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE_WAIT ) ;
|
|
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__REQ_MTCALIVE_WAIT:
|
|
{
|
|
if ( node_ptr->mtcAlive_online == true )
|
|
{
|
|
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
|
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
ilog ("%s got requested mtcAlive%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
|
|
|
|
/* Check to see if the host is/got configured correctly */
|
|
if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 )
|
|
{
|
|
elog ("%s Not Configured (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
|
|
/* Check to see if the host is/got configured correctly */
|
|
else if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) )
|
|
{
|
|
elog ("%s Configuration Failure (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
|
|
else if ( node_ptr->mnfa_graceful_recovery == true )
|
|
{
|
|
if ( node_ptr->uptime > MTC_MINS_10 )
|
|
{
|
|
/* did not reboot case */
|
|
wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
|
|
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START ) ;
|
|
}
|
|
else
|
|
{
|
|
/* did reboot case */
|
|
wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... without additional reboot %s\n",
|
|
node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
|
|
|
|
/* now officially in the In-Test state */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__INTEST );
|
|
|
|
/* O.K. Clear the alive */
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
/* Go to the goEnabled stage */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
|
|
|
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
|
{
|
|
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
}
|
|
break ;
|
|
}
|
|
}
|
|
else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
|
|
{
|
|
/* did not reboot case */
|
|
wlog ("%s Connectivity Recovered ; host did not reset%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
|
|
|
|
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
|
|
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START ) ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... continuing%sgraceful recovery ; (OOB: %08x)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (DOR) " : " ",
|
|
node_ptr->mtce_flags);
|
|
ilog ("%s ... without additional reboot %s (uptime:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_ip.empty() ? "or reset" : "",
|
|
node_ptr->uptime );
|
|
|
|
/* now officially in the In-Test state */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__INTEST );
|
|
|
|
/* Go to the goEnabled stage */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
|
|
|
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
|
{
|
|
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
}
|
|
}
|
|
}
|
|
/* A timer ring indicates that the host is not up */
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
/* So now this means the node is failed
|
|
* we need to stop services and transition into
|
|
* a longer 'waiting' for the asynchronous mtcAlive
|
|
* that should come as part of the automatic reboot
|
|
* Steps are
|
|
* 1. Stop Services
|
|
* 2. Create mtcAlive timer
|
|
* 2a. MtcAlive indicating reset ; run start services and recover
|
|
* 2b. MtcAlive indicating no reset ; force full enable
|
|
* 2c MtcAlive Timeout: force full enable
|
|
*/
|
|
wlog ("%s Loss Of Communication for %d seconds ; disabling host%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
loc_recovery_timeout,
|
|
node_ptr->dor_recovery_mode ? " (DOR)" : "" );
|
|
wlog ("%s ... stopping host services\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
|
|
|
|
/* clear all mtc flags. Will be updated on the next/first
|
|
* mtcAlive message upon recovery */
|
|
node_ptr->mtce_flags = 0 ;
|
|
|
|
/* Set node as unlocked-disabled-failed */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
|
|
/* Inform the VIM that this host has failed */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
|
|
|
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
|
{
|
|
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
}
|
|
/* Clear all degrade flags except for the HWMON one */
|
|
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
|
node_ptr->degraded_resources_list.clear();
|
|
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
if ( mtcSmgrApi_request ( node_ptr, CONTROLLER_DISABLED , SMGR_MAX_RETRIES ) != PASS )
|
|
{
|
|
wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager\n",
|
|
node_ptr->hostname.c_str() );
|
|
}
|
|
}
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_TIMER );
|
|
}
|
|
break ;
|
|
}
|
|
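/* MTC_RECOVERY__MTCALIVE_TIMER
 * ----------------------------
 * Declare the host as booting, start the offline handler and, when not
 * in Dead Office Recovery (DOR) mode, try to expedite recovery with a
 * one time reboot over the infra network and/or a board management
 * reset before arming the mtcAlive wait timer. */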
case MTC_RECOVERY__MTCALIVE_TIMER:
|
|
{
|
|
int timeout = 0 ;
|
|
|
|
/* Set the FSM task state to booting */
|
|
node_ptr->uptime = 0 ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
|
|
|
|
start_offline_handler ( node_ptr );
|
|
|
|
timeout = node_ptr->mtcalive_timeout ;
|
|
|
|
/* Only try and issue in-line recovery reboot or reset if
|
|
* NOT in Dead Office Recovery (DOR) mode. */
|
|
if ( node_ptr->dor_recovery_mode == false )
|
|
{
|
|
/* If the infrastructure network is provisioned then try to
 * issue a reboot over it to expedite recovery for the case
 * where the management heartbeat has failed but the infra
 * heartbeat has not.
 * Keep it simple ; just issue the command once without looping on it */
|
|
if (( node_ptr->infra_ip.length () > 5 ) &&
|
|
( node_ptr->heartbeat_failed[MGMNT_IFACE] == true ) &&
|
|
( node_ptr->heartbeat_failed[INFRA_IFACE] == false ))
|
|
{
|
|
ilog ("%s issuing one time graceful recovery reboot over infra network\n", node_ptr->hostname.c_str());
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_REBOOT, INFRA_INTERFACE ) ;
|
|
}
|
|
|
|
if ((node_ptr->bm_provisioned) && (node_ptr->bm_accessible))
|
|
{
|
|
ilog ("%s issuing one time board management graceful recovery reset\n", node_ptr->hostname.c_str());
|
|
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_RESET );
|
|
if ( rc )
|
|
{
|
|
wlog ("%s board management reset failed\n", node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__RESET_RECV_WAIT );
|
|
break ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s cannot issue Reset\n", node_ptr->hostname.c_str() );
|
|
wlog ("%s ... board management not provisioned or accessible\n", node_ptr->hostname.c_str() );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Just allow Graceful Recovery to take its course. */
|
|
/* Load configured mtcAlive and goEnabled timers */
|
|
LOAD_NODETYPE_TIMERS ;
|
|
|
|
/* extend the mtcAlive timeout to accommodate DOR recovery */
|
|
timeout = node_ptr->mtcalive_timeout + daemon_get_cfg_ptr()->dor_recovery_timeout_ext ;
|
|
}
|
|
|
|
/* start the timer that waits for MTCALIVE */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, timeout );
|
|
|
|
plog ("%s %s (%d secs)%s(uptime was %d) \n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_TASK_RECOVERY_WAIT,
|
|
timeout,
|
|
node_ptr->dor_recovery_mode ? " (DOR) " : " " ,
|
|
node_ptr->uptime_save );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
|
|
break ;
|
|
}
|
|
|
|
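/* MTC_RECOVERY__RESET_RECV_WAIT
 * -----------------------------
 * Collect the result of the board management reset issued above and
 * then arm the mtcAlive wait timer regardless of the outcome. */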
case MTC_RECOVERY__RESET_RECV_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s Reset command failed\n", node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Resetting\n", node_ptr->hostname.c_str());
|
|
}
|
|
|
|
/* start the timer that waits for MTCALIVE */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, node_ptr->mtcalive_timeout );
|
|
|
|
plog ("%s %s (%d secs) (uptime was %d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_TASK_RECOVERY_WAIT,
|
|
node_ptr->mtcalive_timeout,
|
|
node_ptr->uptime_save );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
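/* MTC_RECOVERY__MTCALIVE_WAIT
 * ---------------------------
 * Wait for the mtcAlive message. A host that did not actually reset,
 * or that reports itself unconfigured or unhealthy, is forced through
 * a full enable ; otherwise recovery proceeds to the out-of-service
 * (goEnabled) tests. A mtcAlive timeout also forces a full enable. */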
case MTC_RECOVERY__MTCALIVE_WAIT:
|
|
{
|
|
/* search for the mtc alive message */
|
|
if ( node_ptr->mtcAlive_online == true )
|
|
{
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
|
|
|
/* If the host's uptime is bigger than the saved uptime then the
 * host has not reset. Since services have already been disabled,
 * the host must be reset to prevent VM duplication ; do so by
 * forcing a full enable */
|
|
if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
|
|
{
|
|
ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->uptime );
|
|
ilog ("%s ... uptimes before:%d after:%d\n", node_ptr->hostname.c_str(), node_ptr->uptime_save, node_ptr->uptime );
|
|
ilog ("%s ... exiting graceful recovery\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... forcing full enable with reset\n", node_ptr->hostname.c_str());
|
|
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
}
|
|
/* Check to see if the host is/got configured */
|
|
else if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 )
|
|
{
|
|
elog ("%s Not Configured (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
|
|
/* Check to see if the host is/got configured correctly */
|
|
else if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) )
|
|
{
|
|
elog ("%s Configuration Failure (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s regained MTCALIVE from host that has rebooted (uptime curr:%d save:%d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->uptime_save );
|
|
ilog ("%s ... continuing with graceful recovery %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->dor_recovery_mode ? "(DOR)" : " ");
|
|
ilog ("%s ... without additional reboot %s\n",
|
|
node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
|
|
|
|
/* now officially in the In-Test state */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__INTEST );
|
|
|
|
/* O.K. Clear the alive */
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
/* Go to the goEnabled stage */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
|
}
|
|
break ;
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
|
|
|
/* Set the FSM task state to init failed */
|
|
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Failed" );
|
|
|
|
node_ptr->mtcTimer.ring = false ;
|
|
|
|
elog ("%s has MTCALIVE Timeout\n", node_ptr->hostname.c_str());
|
|
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
|
|
break ;
|
|
}
|
|
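/* While waiting, an unlocked host that is powered off with provisioned
 * and accessible board management is powered back on ; the first clause
 * below sends the power-on command and the second collects its result. */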
else if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->bm_provisioned == true ) &&
|
|
( node_ptr->bm_accessible == true ) &&
|
|
( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__INIT ) &&
|
|
( thread_idle ( node_ptr->ipmitool_thread_ctrl )) &&
|
|
( node_ptr->ipmitool_thread_info.command != IPMITOOL_THREAD_CMD__POWER_ON ))
|
|
{
|
|
ilog ("%s powering on unlocked powered off host\n", node_ptr->hostname.c_str());
|
|
if ( ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON ) != PASS )
|
|
{
|
|
node_ptr->ipmitool_thread_ctrl.done = true ;
|
|
thread_kill ( node_ptr->ipmitool_thread_ctrl , node_ptr->ipmitool_thread_info ) ;
|
|
}
|
|
}
|
|
else if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->bm_provisioned == true ) &&
|
|
( node_ptr->bm_accessible == true ) &&
|
|
( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__INIT ) &&
|
|
( thread_done ( node_ptr->ipmitool_thread_ctrl )) &&
|
|
( node_ptr->ipmitool_thread_info.command == IPMITOOL_THREAD_CMD__POWER_ON ))
|
|
{
|
|
if ( ipmi_command_recv ( node_ptr ) == PASS )
|
|
{
|
|
ilog ("%s powered on\n", node_ptr->hostname.c_str());
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
|
}
|
|
}
|
|
else if ( node_ptr->mtcAlive_gate == true )
|
|
{
|
|
slog ("%s mtcAlive gate unexpectedly set, auto-correcting ...\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
node_ptr->mtcAlive_gate = false ;
|
|
}
|
|
|
|
/* wait some more */
|
|
break ;
|
|
}
|
|
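/* MTC_RECOVERY__GOENABLED_TIMER
 * -----------------------------
 * Request the main-function out-of-service (goEnabled) tests and start
 * the goenabled timeout timer before waiting for the result. */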
case MTC_RECOVERY__GOENABLED_TIMER:
|
|
{
|
|
node_ptr->goEnabled = false ;
|
|
|
|
/* See if the host is there and already in the go enabled state */
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE );
|
|
|
|
/* start the goenabled timer ; expiry is caught in the GOENABLED_WAIT stage */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, this->goenabled_timeout );
|
|
|
|
/* ok, timer started */
|
|
ilog ("%s waiting for GOENABLED ; with %d sec timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
this->goenabled_timeout );
|
|
|
|
|
|
/* Default to unknown health */
|
|
node_ptr->health = NODE_HEALTH_UNKNOWN ;
|
|
|
|
/* start waiting for the GOENABLED message */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_WAIT );
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__GOENABLED_WAIT:
|
|
{
|
|
/* The health code comes from the host in the mtcAlive message.
 * This 'if' clause was introduced to detect failure of a host
 * without having to wait for the GOENABLED phase to timeout.
 *
 * This case is particularly important in the DOR case where
 * computes may have come up but failed to run their manifests
 * and sit there in an unconfigured state. We don't want them to
 * be gracefully recovered to enabled in that case. Instead
 * we want to recover the card through a reset as quickly as
 * possible. */
|
|
if ( node_ptr->health == NODE_UNHEALTHY )
|
|
{
|
|
elog ("%s is UNHEALTHY\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled_failed == true )
|
|
{
|
|
elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled == true )
|
|
{
|
|
plog ("%s got GOENABLED (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* O.K. clearing the state now that we got it */
|
|
node_ptr->goEnabled = false ;
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_START );
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
|
|
|
|
node_ptr->mtcTimer.ring = false ;
|
|
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
break;
|
|
}
|
|
|
|
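/* MTC_RECOVERY__HOST_SERVICES_START / _WAIT
 * -----------------------------------------
 * Launch the 'start host services' command and then poll
 * host_services_handler for the result ; a launch or execution
 * failure forces a full enable. */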
case MTC_RECOVERY__HOST_SERVICES_START:
|
|
{
|
|
bool start = true ;
|
|
|
|
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
|
|
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
|
{
|
|
elog ("%s %s failed ; launch\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
node_ptr->hostservices_failed = true ;
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else
|
|
{
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__HOST_SERVICES_WAIT:
|
|
{
|
|
/* Wait for host services to complete - pass or fail.
|
|
* The host_services_handler manages timeout. */
|
|
rc = this->host_services_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait for the mtcClient's response ... */
|
|
break ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
node_ptr->hostservices_failed = true ;
|
|
if ( rc == FAIL_TIMEOUT )
|
|
{
|
|
elog ("%s %s failed ; timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_TO );
|
|
}
|
|
else
|
|
{
|
|
elog ("%s %s failed ; rc=%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str(),
|
|
rc);
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_FAIL );
|
|
}
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else /* success path */
|
|
{
|
|
/* The active controller would never get/be here but
|
|
* if it did then just fall through to change state. */
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
/* Here we need to run the sub-function goenabled tests and start
 * host services if this is the other controller in an AIO
 * system. */
|
|
if ( NOT_THIS_HOST )
|
|
{
|
|
/* start a timer that waits for the /var/run/.compute_config_complete flag */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_COMPUTE_CONFIG_TIMEOUT );
|
|
|
|
/* We will come back to MTC_RECOVERY__HEARTBEAT_START
|
|
* after we enable the compute subfunction */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__CONFIG_COMPLETE_WAIT );
|
|
}
|
|
else
|
|
{
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
|
|
}
|
|
}
|
|
/* Otherwise in a normal system and not the active controller,
|
|
* just start the heartbeat soak */
|
|
else if ( NOT_THIS_HOST )
|
|
{
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
|
|
}
|
|
else
|
|
{
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
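/* MTC_RECOVERY__CONFIG_COMPLETE_WAIT
 * ----------------------------------
 * AIO only ; wait (up to MTC_COMPUTE_CONFIG_TIMEOUT) for the mtce flag
 * indicating the compute subfunction configuration is complete before
 * running its out-of-service tests. */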
case MTC_RECOVERY__CONFIG_COMPLETE_WAIT:
|
|
{
|
|
/* look for file */
|
|
if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )
|
|
{
|
|
plog ("%s-compute configured\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_GOENABLED_TIMER );
|
|
}
|
|
|
|
/* timeout handling */
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s-compute configuration timeout\n", node_ptr->hostname.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
}
|
|
else
|
|
{
|
|
; /* wait longer */
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__SUBF_GOENABLED_TIMER:
|
|
{
|
|
ilog ("%s-compute running out-of-service tests\n", node_ptr->hostname.c_str());
|
|
|
|
/* See if the host is there and already in the go enabled state */
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_SUBF_GOENABLED, MGMNT_INTERFACE );
|
|
|
|
/* start the goenabled timer ; expiry is caught in the SUBF_GOENABLED_WAIT stage */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, this->goenabled_timeout );
|
|
|
|
node_ptr->goEnabled_subf = false ;
|
|
|
|
/* start waiting for the GOENABLED message */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_GOENABLED_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__SUBF_GOENABLED_WAIT:
|
|
{
|
|
/* search for the Go Enable message */
|
|
if ( node_ptr->goEnabled_failed_subf == true )
|
|
{
|
|
elog ("%s-compute one or more out-of-service tests failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled_subf == true )
|
|
{
|
|
/* stop the timer */
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
plog ("%s-compute passed out-of-service tests\n", node_ptr->hostname.c_str());
|
|
|
|
/* O.K. clearing the state now that we got it */
|
|
node_ptr->goEnabled_subf = false ;
|
|
|
|
/* ok. great, got the go-enabled message, lets move on */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_START );
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s-compute out-of-service test execution timeout\n", node_ptr->hostname.c_str());
|
|
node_ptr->mtcTimer.ring = false ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else
|
|
{
|
|
; /* wait some more */
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RECOVERY__SUBF_SERVICES_START:
|
|
{
|
|
bool start = true ;
|
|
bool subf = true ;
|
|
|
|
plog ("%s-compute Starting Host Services\n", node_ptr->hostname.c_str());
|
|
|
|
if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
|
|
{
|
|
elog ("%s-compute %s failed ; launch\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
node_ptr->hostservices_failed_subf = true ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else
|
|
{
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__SUBF_SERVICES_WAIT:
|
|
{
|
|
/* Wait for host services to complete - pass or fail.
|
|
* The host_services_handler manages timeout. */
|
|
rc = this->host_services_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait for the mtcClient's response ... */
|
|
break ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
node_ptr->hostservices_failed_subf = true ;
|
|
if ( rc == FAIL_TIMEOUT )
|
|
{
|
|
elog ("%s-compute %s failed ; timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_TO );
|
|
}
|
|
else
|
|
{
|
|
elog ("%s-compute %s failed ; rc=%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str(),
|
|
rc);
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_FAIL );
|
|
}
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else /* success path */
|
|
{
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
|
|
}
|
|
break ;
|
|
}
|
|
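/* MTC_RECOVERY__HEARTBEAT_START / _SOAK
 * -------------------------------------
 * Wait up to one minute for the hbsClient ready event, enable
 * heartbeating of the host and then soak it for
 * MTC_HEARTBEAT_SOAK_BEFORE_ENABLE seconds before declaring the
 * host enabled. */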
case MTC_RECOVERY__HEARTBEAT_START:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
wlog ("%s hbsClient ready event timeout\n", node_ptr->hostname.c_str());
|
|
}
|
|
else if ( node_ptr->hbsClient_ready == false )
|
|
{
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
}
|
|
|
|
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
|
|
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
|
|
|
|
/* Enable the heartbeat service for Graceful Recovery */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
|
|
/* allow heartbeat to run for 10 seconds before we declare enable */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
|
|
|
|
/* if heartbeat is not working then we will
|
|
* never get here and enable the host */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_SOAK );
|
|
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__HEARTBEAT_SOAK:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
/* if heartbeat is not working then we will
|
|
* never get here and enable the host */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__STATE_CHANGE:
|
|
{
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
/* Set node as unlocked-enabled */
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__AVAILABLE );
|
|
}
|
|
|
|
if ( node_ptr->degrade_mask )
|
|
{
|
|
/* Allow host to enable in the degraded state */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
else
|
|
{
|
|
/* Set node as unlocked-enabled */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__AVAILABLE );
|
|
}
|
|
|
|
/* Only run the hardware monitor if board management is provisioned */
|
|
if ( node_ptr->bm_provisioned == true )
|
|
{
|
|
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
}
|
|
|
|
/* Inform the VIM that this host is enabled */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
|
|
|
|
/* Start a timer that fails the enable if the work queue
 * does not empty or if commands in the done queue have failed */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, work_queue_timeout );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__WORKQUEUE_WAIT ) ;
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__WORKQUEUE_WAIT:
|
|
{
|
|
rc = workQueue_done ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait longer */
|
|
break ;
|
|
}
|
|
else if ( rc == PASS )
|
|
{
|
|
/* Start Graceful Recovery */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ;
|
|
break ;
|
|
}
|
|
else if ( rc == FAIL_WORKQ_TIMEOUT )
|
|
{
|
|
wlog ("%s Graceful Recovery failed ; workQueue empty timeout, purging ...\n", node_ptr->hostname.c_str());
|
|
workQueue_purge ( node_ptr );
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
wlog ("%s Graceful Recovery failed ; doneQueue contains failed commands\n", node_ptr->hostname.c_str());
|
|
}
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__ENABLE_START:
|
|
{
|
|
/* Create the recovery enable timer. This timer is short.
 * A node needs to stay enabled with the heartbeat service
 * running for a period of time before it is declared enabled */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ;
|
|
break;
|
|
}
|
|
case MTC_RECOVERY__ENABLE_WAIT:
|
|
{
|
|
/* When this timer fires the host has been up for enough time */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
if ( mtcSmgrApi_request ( node_ptr,
|
|
CONTROLLER_ENABLED,
|
|
SMGR_MAX_RETRIES ) != PASS )
|
|
{
|
|
wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
/* Node Has Recovered */
|
|
node_ptr->graceful_recovery_counter = 0 ;
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
node_ptr->health_threshold_counter = 0 ;
|
|
node_ptr->enabled_count++ ;
|
|
node_ptr->http_retries_cur = 0 ;
|
|
|
|
doneQueue_purge ( node_ptr );
|
|
if ( node_ptr->was_dor_recovery_mode )
|
|
{
|
|
report_dor_recovery ( node_ptr , "is ENABLED" );
|
|
}
|
|
else
|
|
{
|
|
plog ("%s is ENABLED (Gracefully Recovered)\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
alarm_enabled_clear ( node_ptr, false );
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
rc = FAIL_BAD_CASE ;
|
|
break ;
|
|
}
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/*
 * Start Stop Host Services Handler
 * --------------------------------
 * Waits for the specified host services command to complete.
 *
 * Returns   PASS      - command completed successfully
 *           RETRY     - command still running
 *           FAIL_xxxx - command failure for reason
 *
 */
|
|
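/*
 * Illustrative caller pattern only ; not part of the build. The recovery
 * and disable FSM stages above poll this handler once per FSM pass until
 * it stops returning RETRY :
 *
 *     rc = this->host_services_handler ( node_ptr );
 *     if ( rc == RETRY )
 *     {
 *         break ;                  // still running ; check again next pass
 *     }
 *     else if ( rc != PASS )
 *     {
 *         ... handle FAIL_TIMEOUT or other failure ...
 *     }
 *     else
 *     {
 *         ... proceed to the next stage ...
 *     }
 */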
int nodeLinkClass::host_services_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = FAIL ;
|
|
|
|
if ( node_ptr && ( is_host_services_cmd ( node_ptr->host_services_req.cmd ) == true ))
|
|
{
|
|
/* Handle command overall umbrella timeout */
|
|
if ( mtcTimer_expired ( node_ptr->host_services_timer ) )
|
|
{
|
|
elog ("%s %s timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
/* treat as command failure */
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
rc = FAIL_TIMEOUT ;
|
|
}
|
|
|
|
/* Handle the case where both the done and work fifos are empty
 * even though we are still waiting for a command result */
|
|
else if (( node_ptr->mtcCmd_done_fifo.size() == 0 ) &&
|
|
( node_ptr->mtcCmd_work_fifo.size() == 0 ))
|
|
{
|
|
mtcTimer_reset ( node_ptr->host_services_timer );
|
|
slog ("%s %s command missing\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
rc = FAIL_BAD_STATE ;
|
|
}
|
|
|
|
/* look for 'done' case - pass and failed */
|
|
else if (( node_ptr->mtcCmd_done_fifo.size() != 0 ) &&
|
|
( node_ptr->mtcCmd_work_fifo.size() == 0 ))
|
|
{
|
|
mtcTimer_reset ( node_ptr->host_services_timer );
|
|
if ( node_ptr->host_services_req.status == PASS )
|
|
{
|
|
ilog ("%s %s completed\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
rc = PASS ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s %s ; rc:%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.status_string.c_str(),
|
|
node_ptr->host_services_req.status);
|
|
|
|
rc = FAIL_OPERATION ;
|
|
}
|
|
/* Purge the done command fifo now that we have consumed the result.
|
|
* The work fifo is already empty or we would not be in this case */
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
}
|
|
/* still working ... */
|
|
else
|
|
{
|
|
/* wait longer */
|
|
rc = RETRY ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
slog ("%s invalid host services command (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->cmd.parm1 );
|
|
|
|
rc = FAIL_BAD_PARM ;
|
|
}
|
|
|
|
return (rc);
|
|
}
|
|
|
|
|
|
/* Disable handler
 * ---------------
 * Algorithm that puts a node into the operationally disabled state */
|
|
int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
switch ( (int)node_ptr->handlerStage.disable )
|
|
{
|
|
case MTC_DISABLE__START:
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* Purge this hosts work and done queues */
|
|
workQueue_purge ( node_ptr );
|
|
doneQueue_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
/* clear all the enable failure bools */
|
|
clear_main_failed_bools ( node_ptr );
|
|
clear_subf_failed_bools ( node_ptr );
|
|
clear_hostservices_ctls ( node_ptr );
|
|
|
|
disableStageChange ( node_ptr, MTC_DISABLE__DIS_SERVICES_WAIT) ;
|
|
|
|
stop_offline_handler ( node_ptr );
|
|
|
|
if (( node_ptr->bm_provisioned == true ) &&
|
|
( node_ptr->bm_accessible == true ) &&
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ))
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
|
|
if ( rc )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_POWERON_SEND) ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_POWERON_RECV) ;
|
|
}
|
|
|
|
if ( rc == PASS )
|
|
{
|
|
ilog ("%s Power On request sent\n", node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK )
|
|
{
|
|
mtc_nodeAvailStatus_enum locked_status = MTC_AVAIL_STATUS__OFFLINE ;
|
|
plog ("%s Administrative 'force-lock' Operation\n", node_ptr->hostname.c_str());
|
|
|
|
/* If the host was in-service then set its locked state as ONLINE for now.
 * Otherwise it is defaulted to offline */
|
|
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__INTEST ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED ))
|
|
{
|
|
locked_status = MTC_AVAIL_STATUS__ONLINE ;
|
|
}
|
|
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
locked_status );
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
|
locked_status );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
plog ("%s Administrative 'lock' Operation\n", node_ptr->hostname.c_str());
|
|
}
|
|
|
|
/* reset retries counter in prep for next stage */
|
|
node_ptr->retries = 0 ;
|
|
node_ptr->http_retries_cur = 0 ;
|
|
node_ptr->pmond_ready = false ;
|
|
|
|
/* Clear all degrade flags except for the HWMON one */
|
|
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
|
node_ptr->degraded_resources_list.clear();
|
|
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_DISABLE_CONTROL );
|
|
}
|
|
// else
|
|
// {
|
|
// consider putting in the host type
|
|
// }
|
|
|
|
if ( NOT_THIS_HOST )
|
|
{
|
|
/* Disable path for Controllers */
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
if ( mtcSmgrApi_request ( node_ptr,
|
|
CONTROLLER_LOCKED,
|
|
SMGR_MAX_RETRIES ) != PASS )
|
|
{
|
|
wlog ("%s Failed to send 'locked-disabled' to HA Service Manager\n",
|
|
node_ptr->hostname.c_str() );
|
|
}
|
|
}
|
|
|
|
/* Clear the minor flag if it is set for this host */
|
|
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
|
{
|
|
hbs_minor_clear ( node_ptr, (iface_enum)iface );
|
|
}
|
|
|
|
/* Turn off Heartbeat to that host */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
}
|
|
|
|
/* If the stage is still MTC_DISABLE__DIS_SERVICES_WAIT then the
|
|
* host should already be powered on so lets send the stop
|
|
* services command */
|
|
if ( node_ptr->handlerStage.disable == MTC_DISABLE__DIS_SERVICES_WAIT )
|
|
{
|
|
bool start = false ;
|
|
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
|
{
|
|
wlog ("%s %s failed ; launch\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
/* proceed to handle force lock if the launch fails */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__DIS_SERVICES_WAIT:
|
|
{
|
|
/* manage host services stop command to this target */
|
|
rc = this->host_services_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
break ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
if ( rc == FAIL_TIMEOUT )
|
|
{
|
|
wlog ("%s %s failed ; timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s %s failed ; rc:%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str(),
|
|
rc);
|
|
}
|
|
}
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK) ;
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__HANDLE_POWERON_SEND:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
|
|
if ( rc )
|
|
{
|
|
elog ("%s failed to send Power On request\n", node_ptr->hostname.c_str());
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK) ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_POWERON_RECV) ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__HANDLE_POWERON_RECV:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
if ( rc )
|
|
{
|
|
elog ("%s auto power-on failed\n", node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Powering On\n", node_ptr->hostname.c_str());
|
|
}
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK) ;
|
|
}
|
|
break ;
|
|
}
|
|
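/* MTC_DISABLE__HANDLE_FORCE_LOCK
 * ------------------------------
 * A force-lock requires the host to be reset ; queue a reset
 * progression command (with retries) and wait for it, or for the
 * host to go offline, in MTC_DISABLE__RESET_HOST_WAIT. */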
case MTC_DISABLE__HANDLE_FORCE_LOCK:
|
|
{
|
|
/* If this is a force lock against a compute then we have to reset it */
|
|
if (( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK ))
|
|
{
|
|
/* Stop the timer if it is active coming into this case */
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* purge in support of retries */
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
|
|
ilog ("%s Issuing Force-Lock Reset\n", node_ptr->hostname.c_str());
|
|
mtcCmd_init ( node_ptr->cmd );
|
|
node_ptr->cmd.stage = MTC_CMD_STAGE__START ;
|
|
node_ptr->cmd.cmd = MTC_OPER__RESET_PROGRESSION ;
|
|
node_ptr->cmd.parm1 = 2 ; /* 2 retries */
|
|
node_ptr->mtcCmd_work_fifo.push_back(node_ptr->cmd);
|
|
|
|
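/* Overall reset progression wait :
 * MTC_RESET_PROG_TIMEOUT x (retries + 1), doubled for margin ;
 * e.g. with the 2 retries requested above this is 6 x MTC_RESET_PROG_TIMEOUT */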
int timeout = ((MTC_RESET_PROG_TIMEOUT*(node_ptr->cmd.parm1+1))*2) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, timeout ) ;
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_DISABLE_FORCE );
|
|
|
|
/* Force instance evacuation */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__RESET_HOST_WAIT );
|
|
}
|
|
else
|
|
{
|
|
disableStageChange ( node_ptr, MTC_DISABLE__TASK_STATE_UPDATE ) ;
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__RESET_HOST_WAIT:
|
|
{
|
|
/* Check for the operation timeout - should not occur */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
wlog ("%s Reset Progression Timeout ; aborting ...\n", node_ptr->hostname.c_str());
|
|
|
|
/* Purge this hosts work and done queues */
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
|
|
/* aborting after timeout ; need to avoid a stuck FSM
|
|
* reset progression already did retries */
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_TIMEOUT );
|
|
|
|
disableStageChange ( node_ptr, MTC_DISABLE__TASK_STATE_UPDATE );
|
|
}
|
|
|
|
/* Handle the case where the done fifo is empty ; avoid the segfault */
|
|
else if ( node_ptr->mtcCmd_done_fifo.size() == 0 )
|
|
{
|
|
/* Should never get here but .....
|
|
* Handle the case where the work queue is also empty.
|
|
* Avoid stuck FSM */
|
|
if ( node_ptr->mtcCmd_work_fifo.size() == 0 )
|
|
{
|
|
slog ("%s unexpected empty work queue ; trying reboot/reset again\n",
|
|
node_ptr->hostname.c_str() );
|
|
|
|
/* reset progression failed so try again */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
else
|
|
{
|
|
; /* typical wait path - wait some more */
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* TODO: Future: get the specific command rather than just the head */
|
|
node_ptr->mtcCmd_done_fifo_ptr = node_ptr->mtcCmd_done_fifo.begin();
|
|
|
|
/* defensive programming */
|
|
if ( node_ptr->mtcCmd_done_fifo_ptr != node_ptr->mtcCmd_done_fifo.end())
|
|
{
|
|
/* exit reset progression and any retries once the host is offline */
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
|
{
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
stop_offline_handler ( node_ptr );
|
|
disableStageChange ( node_ptr, MTC_DISABLE__TASK_STATE_UPDATE ) ;
|
|
}
|
|
else if ( node_ptr->mtcCmd_done_fifo_ptr->cmd != MTC_OPER__RESET_PROGRESSION )
|
|
{
|
|
slog ("%s purging front entry of done cmdQueue\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
/* reset progression failed so try again */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s host still not offline ; trying reboot/reset again ....\n", node_ptr->hostname.c_str() );
|
|
|
|
/* reset progression failed so try again */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
slog ("%s unexpected empty work queue ; trying force lock\n", node_ptr->hostname.c_str() );
|
|
|
|
/* reset progression failed so try again */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_DISABLE__TASK_STATE_UPDATE:
|
|
{
|
|
mtc_nodeAvailStatus_enum avail ;
|
|
|
|
/* Tell the host that it is locked */
|
|
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE );
|
|
if ( infra_network_provisioned )
|
|
{
|
|
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, INFRA_INTERFACE );
|
|
}
|
|
|
|
/* Change the oper and avail states in the database */
|
|
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ))
|
|
{
|
|
avail = MTC_AVAIL_STATUS__OFFLINE ;
|
|
}
|
|
else
|
|
{
|
|
avail = MTC_AVAIL_STATUS__ONLINE ;
|
|
}
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__LOCKED, MTC_OPER_STATE__DISABLED, avail );
|
|
mtcInvApi_subf_states (node_ptr,"disabled",get_availStatus_str(avail));
|
|
|
|
/* Inform the VIM that this host is disabled */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_DISABLED, 3 );
|
|
|
|
/* Inform the VIM that the dataports are offline */
|
|
update_dport_states (node_ptr, MTC_EVENT_AVS_OFFLINE );
|
|
mtcVimApi_state_change ( node_ptr, VIM_DPORT_OFFLINE, 3 );
|
|
|
|
/* Start a timer that waits for the work queue to complete */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, work_queue_timeout );
|
|
disableStageChange( node_ptr, MTC_DISABLE__WORKQUEUE_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__WORKQUEUE_WAIT:
|
|
{
|
|
rc = workQueue_done ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait longer */
|
|
break ;
|
|
}
|
|
else if ( rc == FAIL_WORKQ_TIMEOUT )
|
|
{
|
|
wlog ("%s Disable warning ; workQueue empty timeout, purging ...\n", node_ptr->hostname.c_str());
|
|
workQueue_purge ( node_ptr );
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
wlog ("%s Disable warning ; doneQueue contained failed commands\n", node_ptr->hostname.c_str());
|
|
}
|
|
disableStageChange( node_ptr, MTC_DISABLE__DISABLED );
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__DISABLED:
|
|
{
|
|
/* Stop the timer if it is active coming into this case */
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* This will get updated during the next
 * mtcAlive message from this blade */
|
|
node_ptr->health = NODE_HEALTH_UNKNOWN ;
|
|
|
|
/* Set the lock alarm */
|
|
if (( node_ptr->adminAction == MTC_ADMIN_ACTION__LOCK ) ||
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK ))
|
|
{
|
|
mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
|
|
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
|
|
}
|
|
|
|
/* open the mtcAlive gate while we are disabled */
|
|
node_ptr->mtcAlive_gate = false ;
|
|
|
|
disableStageChange( node_ptr, MTC_DISABLE__START );
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
node_ptr->mtcCmd_work_fifo.clear();
|
|
node_ptr->mtcCmd_done_fifo.clear();
|
|
node_ptr->http_retries_cur = 0 ;
|
|
|
|
/***** Powercycle FSM Stuff *****/
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
|
|
|
|
/* Load configured mtcAlive and goEnabled timers */
|
|
LOAD_NODETYPE_TIMERS ;
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
|
|
plog ("%s Disable Complete\n", node_ptr->hostname.c_str());
|
|
|
|
break ;
|
|
}
|
|
|
|
default:
|
|
{
|
|
elog ("%s Bad Case (%d)\n", node_ptr->hostname.c_str(),
|
|
node_ptr->handlerStage.disable );
|
|
rc = FAIL_BAD_CASE ;
|
|
}
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/* Uptime handler
 * -------------- */
|
|
int nodeLinkClass::uptime_handler ( void )
|
|
{
|
|
/* Service uptime refresh timer */
|
|
if ( this->mtcTimer_uptime.ring == true )
|
|
{
|
|
int rc = PASS ;
|
|
unsigned int uptime = 0;
|
|
|
|
/* Send uptime values to inventory */
|
|
for ( this->host = this->hostname_inventory.begin () ;
|
|
this->host != this->hostname_inventory.end () ;
|
|
this->host++ )
|
|
{
|
|
bool do_uptime_update = false ;
|
|
string hostname = "" ;
|
|
|
|
hostname.append( this->host->c_str()) ;
|
|
|
|
/* only update every 5 minutes after being up for an hour */
|
|
uptime = this->get_uptime ( hostname ) ;
|
|
if ( uptime < 3600 )
|
|
{
|
|
do_uptime_update = true ;
|
|
}
|
|
else
|
|
{
|
|
int ctr = this->get_uptime_refresh_ctr ( hostname );
|
|
|
|
/* Update uptime only every 5 minutes after the
|
|
* host has been up for more than one hour */
|
|
if (( uptime > 3600 ) && ( (ctr*(this->uptime_period)) >= MTC_MINS_5 ))
|
|
{
|
|
do_uptime_update = true ;
|
|
}
|
|
else
|
|
{
|
|
this->set_uptime_refresh_ctr ( hostname , (ctr+1) ) ;
|
|
}
|
|
}
|
|
/* Handle update if required */
|
|
if (( rc != PASS ) && ( do_uptime_update == true ))
|
|
{
|
|
wlog ("%s Uptime refresh bypassed due to previous error\n", hostname.c_str());
|
|
}
|
|
else if (( do_uptime_update == true ) || ( uptime == 0 ))
|
|
{
|
|
/* Send the uptime update request.
 * But exit this iteration if we get an error as we
 * don't want to stall mtce for all hosts on such a
 * simple operation */
|
|
|
|
// ilog ("%s - %d\n", hostname.c_str(), uptime );
|
|
if ( uptime == 0 )
|
|
{
|
|
this->set_uptime ( hostname, uptime , false ) ;
|
|
}
|
|
else
|
|
{
|
|
this->set_uptime ( hostname, uptime , true ) ;
|
|
}
|
|
}
|
|
}
|
|
/* Re-Start the uptime timer */
|
|
mtcTimer_start ( this->mtcTimer_uptime, mtcTimer_handler,
|
|
(this->uptime_period+(rand()%10)));
|
|
}
|
|
return PASS ;
|
|
}
|
|
|
|
/* Offline handler
 * ---------------
 * Algorithm that manages the offline/online state for a locked host */
|
|
int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
switch ( (int)node_ptr->offlineStage )
|
|
{
|
|
case MTC_OFFLINE__IDLE:
|
|
{
|
|
return (PASS) ; /* typical path */
|
|
}
|
|
case MTC_OFFLINE__START:
|
|
{
|
|
node_ptr->mtcAlive_mgmnt = false ;
|
|
node_ptr->mtcAlive_infra = false ;
|
|
|
|
node_ptr->offline_search_count = 0 ;
|
|
|
|
mtcTimer_reset ( node_ptr->offline_timer );
|
|
ilog ("%s starting %d msec offline audit (%s-%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
offline_period,
|
|
operState_enum_to_str(node_ptr->operState).c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus).c_str());
|
|
|
|
node_ptr->offlineStage = MTC_OFFLINE__SEND_MTCALIVE ;
|
|
/* fall through on start */
|
|
}
|
|
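/* MTC_OFFLINE__SEND_MTCALIVE / __WAIT
 * -----------------------------------
 * Request mtcAlive over the management (and infra, if provisioned)
 * network and then wait 'offline_period' msecs ; after
 * 'offline_threshold' consecutive misses the host is declared
 * offline and the VIM is notified. */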
case MTC_OFFLINE__SEND_MTCALIVE:
|
|
{
|
|
alog2 ("%s searching for offline (%s-%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
operState_enum_to_str(node_ptr->operState).c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus).c_str());
|
|
|
|
node_ptr->mtcAlive_gate = false ;
|
|
node_ptr->mtcAlive_mgmnt = false ;
|
|
node_ptr->mtcAlive_infra = false ;
|
|
|
|
/* Request a mtcAlive from host from Mgmnt and Infra (if provisioned) */
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
|
|
if ( infra_network_provisioned )
|
|
{
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, INFRA_INTERFACE );
|
|
}
|
|
|
|
/* reload the timer */
|
|
mtcTimer_start_msec ( node_ptr->offline_timer, mtcTimer_handler, offline_period );
|
|
|
|
node_ptr->offlineStage = MTC_OFFLINE__WAIT ;
|
|
|
|
break ;
|
|
}
|
|
case MTC_OFFLINE__WAIT:
|
|
{
|
|
/* be sure the mtcAlive gate is open */
|
|
node_ptr->mtcAlive_gate = false ;
|
|
if ( mtcTimer_expired ( node_ptr->offline_timer ) == true )
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
|
{
|
|
plog ("%s offline (external)\n", node_ptr->hostname.c_str());
|
|
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
|
}
|
|
else if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
|
|
{
|
|
slog ("%s offline search while 'enabled' ; invalid\n", node_ptr->hostname.c_str());
|
|
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
|
}
|
|
else if ( !node_ptr->mtcAlive_mgmnt && !node_ptr->mtcAlive_infra )
|
|
{
|
|
if ( ++node_ptr->offline_search_count > offline_threshold )
|
|
{
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
plog ("%s going offline ; (threshold (%d msec * %d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
offline_period,
|
|
offline_threshold );
|
|
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
|
|
|
/* Inform the VIM that this host is offline */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_OFFLINE, 1 );
|
|
|
|
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
|
}
|
|
else
|
|
{
|
|
alog ("%s missed mtcAlive %d of %d times\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->offline_search_count,
|
|
offline_threshold );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
node_ptr->mtcAlive_online = true ;
|
|
if ( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_infra )
|
|
{
|
|
ilog ("%s still seeing mtcAlive (%c:%c)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
|
|
node_ptr->mtcAlive_infra ? 'Y' : 'n');
|
|
}
|
|
else
|
|
{
|
|
alog ("%s still seeing mtcAlive (%c:%c)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
|
|
node_ptr->mtcAlive_infra ? 'Y' : 'n');
|
|
}
|
|
}
|
|
|
|
if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE )
|
|
{
|
|
ilog ("%s exiting offline handling\n", node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
node_ptr->offlineStage = MTC_OFFLINE__SEND_MTCALIVE ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
slog ("%s unexpected stage ; correcting to idle\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
/* Online handler
 * --------------
 * Algorithm that manages the offline/online state for a locked host */
|
|
int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
/* don't need to manage the offline or online state
|
|
* for the following availability states */
|
|
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFDUTY ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__INTEST ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__NOT_INSTALLED ))
|
|
{
|
|
return (PASS);
|
|
}
|
|
|
|
switch ( (int)node_ptr->onlineStage )
|
|
{
|
|
case MTC_ONLINE__START:
|
|
{
|
|
alog3 ("%s Offline Handler (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->onlineStage );
|
|
|
|
if ( node_ptr->mtcAlive_gate == true )
|
|
{
|
|
alog ("%s mtcAlive gate unexpectedly set, correcting ...\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
node_ptr->mtcAlive_gate = false ;
|
|
}
|
|
|
|
/* Start with a zero count. This counter is incremented every
|
|
* time we get a mtc alive message from that host */
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_misses = 0 ;
|
|
|
|
/* Start mtcAlive message timer */
|
|
mtcTimer_start ( node_ptr->mtcAlive_timer, mtcTimer_handler, online_period );
|
|
node_ptr->onlineStage = MTC_ONLINE__WAITING ;
|
|
break ;
|
|
}
|
|
case MTC_ONLINE__RETRYING:
|
|
{
|
|
/* Start mtcAlive message timer */
|
|
mtcTimer_start ( node_ptr->mtcAlive_timer, mtcTimer_handler, online_period );
|
|
node_ptr->onlineStage = MTC_ONLINE__WAITING ;
|
|
break ;
|
|
}
|
|
case MTC_ONLINE__WAITING:
|
|
{
|
|
if ( node_ptr->mtcAlive_timer.ring == false )
|
|
break ;
|
|
|
|
alog ("%s mtcAlive [%s] [ misses:%d]\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcAlive_online ? "Yes" : "No",
|
|
node_ptr->mtcAlive_misses );
|
|
|
|
if ( node_ptr->mtcAlive_online == false )
|
|
{
|
|
node_ptr->mtcAlive_hits = 0 ;
|
|
if ( node_ptr->mtcAlive_misses++ > MTC_OFFLINE_MISSES )
|
|
{
|
|
/* If not already offline or powered-off and there are no counts then the node is not up - go offline */
|
|
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__OFFLINE ) &&
|
|
( node_ptr->availStatus != MTC_AVAIL_STATUS__POWERED_OFF ))
|
|
{
|
|
ilog ("%s mtcAlive lost ; going 'offline'\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
/* otherwise change state */
|
|
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL,"offline" );
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL_SUBF,"offline" );
|
|
}
|
|
|
|
/* Inform the VIM that this host is offline */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_OFFLINE, 1 );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* handle retries < MTC_OFFLINE_MISSES */
|
|
node_ptr->mtcAlive_timer.ring = false ;
|
|
node_ptr->onlineStage = MTC_ONLINE__RETRYING ;
|
|
break ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
bool gate_online = false ;
|
|
|
|
/* if we are getting counts then the node is up so change status */
|
|
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__ONLINE )
|
|
{
|
|
node_ptr->mtcAlive_hits++ ;
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF )
|
|
{
|
|
/* need 5 mtcAlive messages before we allow a powered-off host to go online */
|
|
if ( node_ptr->mtcAlive_hits < MTC_MTCALIVE_HITS_TO_GO_ONLINE )
|
|
{
|
|
gate_online = true ;
|
|
dlog ("%s ... %d\n", node_ptr->hostname.c_str(), node_ptr->mtcAlive_hits );
|
|
}
|
|
}
|
|
|
|
if ( gate_online == false )
|
|
{
|
|
ilog ("%s mtcAlive ; going 'online'\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL, "online" );
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL_SUBF, "online" );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* While the host is locked ... */
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
|
{
|
|
/* ... keep the 'host locked' file on this host refreshed while in the locked state
|
|
* ... send it on both interfaces just in case */
|
|
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE );
|
|
// send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, INFRA_INTERFACE );
|
|
}
|
|
|
|
/* Start over */
|
|
node_ptr->mtcAlive_timer.ring = false ;
|
|
node_ptr->onlineStage = MTC_ONLINE__START ;
|
|
break ;
|
|
}
|
|
default:
|
|
node_ptr->onlineStage = MTC_ONLINE__START ;
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
|
|
/* Controller Swact Handler
 * ------------------------
 * Using a REST API into the HA Service Manager through Inventory, this
 * handler is responsible for querying for active services on the specified
 * controller and then, if services are found to be running, requesting
 * migration of those active services away from that controller */
|
|
|
|
#define SWACT_DONE                                                                            \
{                                                                                             \
    if ( node_ptr->mtcSwact_timer.tid )                                                       \
    {                                                                                         \
        mtcTimer_stop ( node_ptr->mtcSwact_timer );                                           \
    }                                                                                         \
    mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );     \
    node_ptr->swactStage = MTC_SWACT__DONE ;                                                  \
}
|
|
|
|
#define SWACT_FAIL_THRESHOLD  (3)
#define SWACT_RETRY_THRESHOLD (10)
#define SWACT_FAIL_MSEC_DELAY (250)
#define SWACT_RECV_MSEC_DELAY (50)
#define SWACT_POLL_DELAY      (10)
#define SWACT_TIMEOUT_DELAY   (50)
|
|
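/* swact_handler FSM summary :
 *   START         - post the 'Swact: Request' task and reset counters
 *   QUERY / _RECV - ask SM whether this controller is running active services
 *   SWACT / _RECV - request migration of those services away from it
 *   SWACT_POLL    - re-query until the services report inactive or timeout
 *   DONE          - clear the task and the admin action */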
|
|
int nodeLinkClass::swact_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == true )
|
|
{
|
|
slog ("%s rejecting Swact request in simplex mode\n", node_ptr->hostname.c_str());
|
|
node_ptr->swactStage = MTC_SWACT__START ;
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
return (PASS);
|
|
}
|
|
switch ( (int)node_ptr->swactStage )
|
|
{
|
|
/* Start / Init Stage */
|
|
case MTC_SWACT__START:
|
|
{
|
|
plog ("%s Administrative SWACT Requested\n", node_ptr->hostname.c_str() );
|
|
|
|
/* Cleanup and init the swact timer - start fresh */
|
|
if ( node_ptr->mtcSwact_timer.tid )
|
|
{
|
|
wlog ("%s Cancelling outstanding Swact timer\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop ( node_ptr->mtcSwact_timer );
|
|
}
|
|
mtcTimer_init ( node_ptr->mtcSwact_timer );
|
|
|
|
/* reset error / control Counters to zero */
|
|
nodeLinkClass::smgrEvent.count = 0 ;
|
|
nodeLinkClass::smgrEvent.fails = 0 ;
|
|
nodeLinkClass::smgrEvent.cur_retries = 0 ;
|
|
|
|
/* Empty the event message strings */
|
|
nodeLinkClass::smgrEvent.payload = "" ;
|
|
nodeLinkClass::smgrEvent.response = "" ;
|
|
|
|
/* Post a user message 'Swact: Request' and
|
|
* then delay to allow it to be displayed */
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_SWACT_REQUEST );
|
|
mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, (MTC_TASK_UPDATE_DELAY/2) );
|
|
node_ptr->swactStage = MTC_SWACT__QUERY ;
|
|
break ;
|
|
}
|
|
|
|
/* Handle and threshold all Query Failures */
|
|
case MTC_SWACT__QUERY_FAIL:
|
|
{
|
|
if ( ++nodeLinkClass::smgrEvent.fails >= SWACT_FAIL_THRESHOLD )
|
|
{
|
|
wlog ("%s Query Services Failed: Max Retries (max:%d)\n",
|
|
node_ptr->hostname.c_str(), nodeLinkClass::smgrEvent.fails);
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAIL_QUERY);
|
|
SWACT_DONE ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s Query Services: Retrying (cnt:%d)\n",
|
|
node_ptr->hostname.c_str(), nodeLinkClass::smgrEvent.fails);
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_FAIL_MSEC_DELAY );
|
|
node_ptr->swactStage = MTC_SWACT__QUERY ;
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* Query Services on this host */
|
|
case MTC_SWACT__QUERY:
|
|
{
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
|
|
rc = mtcSmgrApi_request ( node_ptr, CONTROLLER_QUERY, 0 );
|
|
if ( rc )
|
|
{
|
|
nodeLinkClass::smgrEvent.status = rc ;
|
|
node_ptr->swactStage = MTC_SWACT__QUERY_FAIL ;
|
|
}
|
|
else
|
|
{
|
|
/* Ok, we got a successful send request ;
|
|
* delay a bit and check for the response */
|
|
nodeLinkClass::smgrEvent.cur_retries = 0 ;
|
|
nodeLinkClass::smgrEvent.fails = 0 ;
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY );
|
|
node_ptr->swactStage = MTC_SWACT__QUERY_RECV ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_SWACT__QUERY_RECV:
|
|
{
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
|
|
/* Try and receive the response */
|
|
rc = mtcHttpUtil_receive ( nodeLinkClass::smgrEvent );
|
|
if ( rc == RETRY )
|
|
{
|
|
if ( ++nodeLinkClass::smgrEvent.cur_retries > SWACT_RETRY_THRESHOLD )
|
|
{
|
|
wlog ("%s Too many receive retries (cnt:%d)\n",
|
|
node_ptr->hostname.c_str(), nodeLinkClass::smgrEvent.cur_retries );
|
|
rc = FAIL ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY );
|
|
break ;
|
|
}
|
|
}
|
|
if (( rc != PASS ) && ( rc != RETRY ))
|
|
{
|
|
elog ("%s Service Query Failed: Receive Error (rc:%d)\n",
|
|
node_ptr->hostname.c_str(), rc );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAILED);
|
|
SWACT_DONE ;
|
|
}
|
|
else
|
|
{
|
|
/* Parse through the response - no retries on response string errors */
|
|
bool active = false ;
|
|
rc = mtcSmgrApi_service_state ( nodeLinkClass::smgrEvent, active );
|
|
if ( rc )
|
|
{
|
|
/* Setup common error message for the user*/
|
|
ilog ("%s Swact: Service Query Failed\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAILED);
|
|
SWACT_DONE ;
|
|
}
|
|
else if ( active == true )
|
|
{
|
|
/* O.K. We need to Swact */
|
|
nodeLinkClass::smgrEvent.fails = 0 ;
|
|
nodeLinkClass::smgrEvent.cur_retries = 0 ;
|
|
node_ptr->swactStage = MTC_SWACT__SWACT ;
|
|
|
|
/* Tell the user what we are doing */
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_SWACT_INPROGRESS );
|
|
}
|
|
else
|
|
{
|
|
/* If not true then somehow we are being asked to
|
|
* Swact a controller that is not running any services */
|
|
ilog ("%s %s\n", node_ptr->hostname.c_str(), MTC_TASK_SWACT_NOSERVICE);
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_NOSERVICE);
|
|
SWACT_DONE ;
|
|
}
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* Phase 2: Perform Swact */
|
|
case MTC_SWACT__SWACT:
|
|
{
|
|
rc = mtcSmgrApi_request ( node_ptr, CONTROLLER_SWACT, 0 );
|
|
if ( rc )
|
|
{
|
|
/* Abort after SWACT_FAIL_THRESHOLD retries - verified */
|
|
if ( ++nodeLinkClass::smgrEvent.fails >= SWACT_FAIL_THRESHOLD )
|
|
{
|
|
elog ( "%s Swact: Failed Request (rc:%d) (max:%d)\n",
|
|
node_ptr->hostname.c_str(), rc,
|
|
nodeLinkClass::smgrEvent.fails);
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAILED );
|
|
SWACT_DONE ;
|
|
}
|
|
else
|
|
{
|
|
elog ( "%s Swact: Retrying Request (rc:%d) (cnt:%d)\n",
|
|
node_ptr->hostname.c_str(), rc,
|
|
nodeLinkClass::smgrEvent.fails);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
plog ("%s Swact: In Progress\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::smgrEvent.status = PASS ;
|
|
nodeLinkClass::smgrEvent.fails = 0 ;
|
|
nodeLinkClass::smgrEvent.cur_retries = 0 ;
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY );
|
|
node_ptr->swactStage = MTC_SWACT__SWACT_RECV ;
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_SWACT__SWACT_RECV:
|
|
{
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
|
|
/* Try and receive the response */
|
|
rc = mtcHttpUtil_receive ( nodeLinkClass::smgrEvent );
|
|
if ( rc == RETRY )
|
|
{
|
|
if ( ++nodeLinkClass::smgrEvent.cur_retries > SWACT_RETRY_THRESHOLD )
|
|
{
|
|
wlog ("%s Too many receive retries (cnt:%d)\n",
|
|
node_ptr->hostname.c_str(), nodeLinkClass::smgrEvent.cur_retries );
|
|
rc = FAIL ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY );
|
|
break ;
|
|
}
|
|
}
|
|
if (( rc != PASS ) && ( rc != RETRY ))
|
|
{
|
|
elog ("%s Swact Failed: Receive Error (rc:%d)\n",
|
|
node_ptr->hostname.c_str(), rc );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAILED);
|
|
SWACT_DONE ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, MTC_SWACT_POLL_TIMER );
|
|
mtcSmgrApi_request ( node_ptr, CONTROLLER_QUERY, 0 );
|
|
node_ptr->swactStage = MTC_SWACT__SWACT_POLL ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_SWACT__SWACT_POLL:
|
|
{
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
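                /* Each expiry of the poll timer counts as one query attempt ;
                 * the overall swact timeout is expressed as the number of
                 * MTC_SWACT_POLL_TIMER polls that fit into swact_timeout. */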
|
|
if (++nodeLinkClass::smgrEvent.count >=
|
|
(nodeLinkClass::swact_timeout/MTC_SWACT_POLL_TIMER))
|
|
{
|
|
elog ("%s Swact Failed: Timeout\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_TIMEOUT);
|
|
SWACT_DONE ;
|
|
}
|
|
rc = mtcHttpUtil_receive ( smgrEvent );
|
|
if ( rc != RETRY )
|
|
{
|
|
bool active = true ;
|
|
mtcSmgrApi_service_state ( smgrEvent, active );
|
|
if ( active == false )
|
|
{
|
|
dlog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
|
|
SWACT_DONE ;
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
mtcSmgrApi_request ( node_ptr, CONTROLLER_QUERY, 0 );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
plog ("%s Swact: In-Progress\n", node_ptr->hostname.c_str());
|
|
}
|
|
mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, MTC_SWACT_POLL_TIMER );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_SWACT__DONE:
|
|
{
|
|
/* Wait for the done timer to expire.
|
|
* When it does ; exit the SWACT FSM after clearing
|
|
* the task and setting it back to the start. */
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
|
|
mtcInvApi_force_task ( node_ptr, "");
|
|
nodeLinkClass::smgrEvent.active = false ;
|
|
nodeLinkClass::smgrEvent.mutex = false ;
|
|
node_ptr->mtcSwact_timer.ring = false ;
|
|
node_ptr->swactStage = MTC_SWACT__START ;
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
if ( smgrEvent.status )
|
|
{
|
|
wlog ("%s Swact: Failed\n", node_ptr->hostname.c_str());
|
|
|
|
}
|
|
else
|
|
{
|
|
plog ("%s Swact: Completed\n", node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
default:
|
|
node_ptr->swactStage = MTC_SWACT__START ;
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/* Reset Handler
|
|
* ------------
|
|
* Issue a reset to a host */
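/*
 * Stage flow (as implemented in the cases below):
 *
 *   START -> REQ_SEND -> RESP_WAIT -> OFFLINE_WAIT -> DONE
 *
 * Failed sends or receives pass through MTC_RESET__QUEUE, which waits
 * MTC_POWER_ACTION_RETRY_DELAY seconds between attempts and gives up
 * through MTC_RESET__FAIL once the MTC_RESET_ACTION_RETRY_COUNT retry
 * budget is exhausted. */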
|
|
int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
switch ( node_ptr->resetStage )
|
|
{
|
|
case MTC_RESET__FAIL:
|
|
{
|
|
elog ("%s Reset failed ; aborting after max retries\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_FAIL);
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL_WAIT );
|
|
break ;
|
|
}
|
|
case MTC_RESET__FAIL_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
resetStageChange ( node_ptr , MTC_RESET__DONE );
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
mtcTimer_reset ( node_ptr->hwmon_reset.recovery_timer );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RESET__START:
|
|
{
|
|
plog ("%s Administrative 'Reset' Action\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, "Reset Requested" );
|
|
node_ptr->retries = 0 ;
|
|
|
|
start_offline_handler ( node_ptr );
|
|
|
|
if ( hostUtil_is_valid_ip_addr (node_ptr->bm_ip ) == false )
|
|
{
|
|
/**
|
|
* New working provisioning is learned from the
|
|
* dnsmasq.bmc_hosts file changes through inotify watch so
|
|
* it is entirely possible that the retries in this fsm
|
|
* eventually succeed.
|
|
**/
|
|
wlog ("%s bm_ip (%s) is invalid (%d) \n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_ip.c_str(),
|
|
rc );
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL );
|
|
break ;
|
|
}
|
|
node_ptr->power_action_retries = MTC_RESET_ACTION_RETRY_COUNT ;
|
|
/* the fall through is intentional */
|
|
}
|
|
case MTC_RESET__REQ_SEND:
|
|
{
|
|
node_ptr->power_action_retries--;
|
|
|
|
/* Handle loss of connectivity over retries */
|
|
if ( node_ptr->bm_provisioned == false )
|
|
{
|
|
elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str() );
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV );
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL );
|
|
break ;
|
|
}
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds \n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWER_ACTION_RETRY_DELAY);
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
resetStageChange ( node_ptr , MTC_RESET__QUEUE );
|
|
break ;
|
|
}
|
|
|
|
else
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_RESET );
|
|
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
|
resetStageChange ( node_ptr , MTC_RESET__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
blog ("%s Reset requested\n", node_ptr->hostname.c_str());
|
|
resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT );
|
|
}
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RESET__RESP_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
resetStageChange ( node_ptr, MTC_RESET__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Resetting\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, "Resetting: waiting for offline" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
|
|
resetStageChange ( node_ptr, MTC_RESET__OFFLINE_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RESET__QUEUE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
if ( node_ptr->power_action_retries > 0 )
|
|
{
|
|
char buffer[64] ;
|
|
int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
|
snprintf ( buffer, 64, MTC_TASK_RESET_QUEUE, attempts, MTC_RESET_ACTION_RETRY_COUNT);
|
|
mtcInvApi_update_task ( node_ptr, buffer);
|
|
|
|
/* check the thread error status if there is one */
|
|
if ( node_ptr->ipmitool_thread_info.status )
|
|
{
|
|
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.status_string.c_str(),
|
|
node_ptr->ipmitool_thread_info.status );
|
|
}
|
|
|
|
resetStageChange ( node_ptr , MTC_RESET__REQ_SEND );
|
|
}
|
|
else
|
|
{
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RESET__OFFLINE_WAIT:
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
|
{
|
|
if (node_ptr->mtcTimer.tid)
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
plog ("%s Reset Successful\n", node_ptr->hostname.c_str());
|
|
resetStageChange ( node_ptr , MTC_RESET__DONE );
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s Reset operation timeout - host did not go offline\n", node_ptr->hostname.c_str());
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RESET__DONE:
|
|
default:
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
hwmon_recovery_monitor ( node_ptr, MTC_EVENT_HWMON_RESET );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
plog ("%s Reset Completed\n", node_ptr->hostname.c_str());
|
|
break ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
/* Reinstall handler
|
|
* --------------
|
|
* Manage reinstall operations for a locked-disabled host */
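/*
 * Stage flow (as implemented in the cases below):
 *
 *   START (send MTC_CMD_WIPEDISK) -> RESP_WAIT (ack) ->
 *   OFFLINE_WAIT -> ONLINE_WAIT -> MSG_DISPLAY -> DONE
 *
 * An ack, offline or online timeout takes the MTC_REINSTALL__FAIL path,
 * which reports the failure before cleaning up through MSG_DISPLAY/DONE. */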
|
|
int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
switch ( node_ptr->reinstallStage )
|
|
{
|
|
case MTC_REINSTALL__START:
|
|
{
|
|
int host_reinstall_wait_timer = node_ptr->mtcalive_timeout + node_reinstall_timeout ;
|
|
node_ptr->retries = host_reinstall_wait_timer / MTC_REINSTALL_WAIT_TIMER ;
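            /* retries is the number of MTC_REINSTALL_WAIT_TIMER polls that
             * cover the mtcalive plus node reinstall timeout ; it is counted
             * down in the ONLINE_WAIT stage below. */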
|
|
|
|
start_offline_handler ( node_ptr );
|
|
|
|
node_ptr->cmdReq = MTC_CMD_WIPEDISK ;
|
|
|
|
plog ("%s Administrative Reinstall Requested\n", node_ptr->hostname.c_str());
|
|
if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) != PASS )
|
|
{
|
|
elog ("Failed to send 'reinstall' request to %s\n", node_ptr->hostname.c_str());
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
|
}
|
|
else
|
|
{
|
|
node_ptr->cmdRsp = MTC_CMD_NONE ;
|
|
|
|
if ( node_ptr->mtcTimer.tid )
|
|
{
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
}
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );
|
|
|
|
ilog ("%s waiting for REINSTALL ACK \n", node_ptr->hostname.c_str() );
|
|
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__RESP_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__RESP_WAIT:
|
|
{
|
|
if ( node_ptr->cmdRsp != MTC_CMD_WIPEDISK )
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s REINSTALL ACK Timeout\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* declare successful reinstall request */
|
|
plog ("%s REINSTALL Request Succeeded\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
/* We need to wait for the host to go offline */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
|
|
|
|
/* Wait for the host to go offline */
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__OFFLINE_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__OFFLINE_WAIT:
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
|
{
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
ilog ("%s Reinstall Progress: host is offline ; waiting for host to come back\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__ONLINE_WAIT );
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s offline timeout - reinstall failed\n", node_ptr->hostname.c_str());
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__ONLINE_WAIT:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS);
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE );
|
|
}
|
|
else
|
|
{
|
|
if ( --node_ptr->retries < 0 )
|
|
{
|
|
elog ("%s online timeout - reinstall failed\n", node_ptr->hostname.c_str());
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case MTC_REINSTALL__FAIL:
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL);
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED );
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__MSG_DISPLAY:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__DONE:
|
|
default:
|
|
{
|
|
plog ("%s Reinstall Completed\n", node_ptr->hostname.c_str());
|
|
|
|
/* Default timeout values */
|
|
LOAD_NODETYPE_TIMERS ;
|
|
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
break ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
/* Reboot handler
|
|
* --------------
|
|
* Manage reboot operations for a disabled host */
|
|
int nodeLinkClass::reboot_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
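    /* For the active controller (THIS_HOST) a lazy reboot is issued with a
     * forked sysreq reboot as a failsafe backstop ; for all other hosts a
     * reset progression command is queued on the mtcCmd work fifo and this
     * fsm waits for it to complete or for the overall timeout calculated
     * by calc_reset_prog_timeout(). */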
|
|
// ilog ("%s Administrative 'reboot' Action (%d)\n", node_ptr->hostname.c_str(), node_ptr->resetProgStage );
|
|
|
|
switch ( node_ptr->resetProgStage )
|
|
{
|
|
case MTC_RESETPROG__START:
|
|
{
|
|
plog ("%s Administrative Reboot Requested\n", node_ptr->hostname.c_str() );
|
|
|
|
/* start with a clean command slate */
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_PROG );
|
|
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REBOOT )
|
|
{
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_REBOOT );
|
|
}
|
|
node_ptr->retries = 0 ;
|
|
|
|
/* If this is a simplex all-in-one system then issue the lazy reboot and just wait */
|
|
if ( THIS_HOST )
|
|
{
|
|
mtcInvApi_update_task_now ( node_ptr, "Please stand-by while the active controller gracefully reboots" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_2 ) ;
|
|
node_ptr->resetProgStage = MTC_RESETPROG__WAIT ;
|
|
|
|
/* Launch a backup sysreq thread */
|
|
fork_sysreq_reboot ( daemon_get_cfg_ptr()->failsafe_shutdown_delay );
|
|
|
|
/* Tell SM we are unhealthy so that it shuts down all its services */
|
|
daemon_log ( SMGMT_UNHEALTHY_FILE, "Active Controller Reboot request" );
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, MGMNT_INTERFACE ) ;
|
|
}
|
|
else
|
|
{
|
|
node_ptr->resetProgStage = MTC_RESETPROG__REBOOT ;
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RESETPROG__REBOOT:
|
|
{
|
|
#define REBOOT_RETRIES (0)
|
|
node_ptr->mtcCmd_work_fifo.clear();
|
|
mtcCmd_init ( node_ptr->cmd );
|
|
node_ptr->cmd.stage = MTC_CMD_STAGE__START ;
|
|
node_ptr->cmd.cmd = MTC_OPER__RESET_PROGRESSION ;
|
|
node_ptr->cmd.parm1 = REBOOT_RETRIES ; /* retries */
|
|
node_ptr->cmd.task = false ; /* send task updates */
|
|
node_ptr->mtcCmd_work_fifo.push_front(node_ptr->cmd);
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* calculate the overall timeout period taking into account
|
|
* all the reboot/reset sources that will be tried */
|
|
int overall_timeout = calc_reset_prog_timeout ( node_ptr, REBOOT_RETRIES ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, overall_timeout ) ;
|
|
node_ptr->resetProgStage = MTC_RESETPROG__WAIT ;
|
|
|
|
break ;
|
|
}
|
|
case MTC_RESETPROG__WAIT:
|
|
{
|
|
/* Look for the command handler FSM timeout and abort in that case */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
ilog ("%s reboot (progression) timeout\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ) ;
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_REBOOT_ABORT );
|
|
node_ptr->resetProgStage = MTC_RESETPROG__FAIL ;
|
|
}
|
|
else if ( THIS_HOST )
|
|
{
|
|
; /* wait for the reboot or FSM timeout */
|
|
}
|
|
else if ( node_ptr->mtcCmd_work_fifo.empty())
|
|
{
|
|
slog ("%s unexpected empty cmd queue\n", node_ptr->hostname.c_str());
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ) ;
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_REBOOT_ABORT );
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
node_ptr->resetProgStage = MTC_RESETPROG__FAIL ;
|
|
}
|
|
else
|
|
{
|
|
node_ptr->mtcCmd_work_fifo_ptr = node_ptr->mtcCmd_work_fifo.begin() ;
|
|
if ( node_ptr->mtcCmd_work_fifo_ptr->stage == MTC_CMD_STAGE__DONE )
|
|
{
|
|
if ( node_ptr->mtcTimer.tid )
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
if ( node_ptr->mtcCmd_work_fifo_ptr->status == PASS )
|
|
{
|
|
plog ("%s Reboot Completed\n", node_ptr->hostname.c_str() );
|
|
node_ptr->mtcTimer.ring = true ;
|
|
node_ptr->resetProgStage = MTC_RESETPROG__FAIL ; /* not really a failure ; reuse its cleanup path */
|
|
}
|
|
else if ( ++node_ptr->retries <= 5 )
|
|
{
|
|
char buffer[255] ;
|
|
snprintf ( buffer, 255, MTC_TASK_REBOOT_FAIL_RETRY, node_ptr->retries, 5 );
|
|
wlog ("%s %s\n", node_ptr->hostname.c_str(), buffer );
|
|
mtcInvApi_update_task ( node_ptr, buffer );
|
|
if ( node_ptr->mtcCmd_done_fifo.size() )
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
node_ptr->resetProgStage = MTC_RESETPROG__REBOOT ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s %s\n", node_ptr->hostname.c_str(), MTC_TASK_REBOOT_ABORT );
|
|
if ( node_ptr->mtcCmd_done_fifo.size() )
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_REBOOT_ABORT );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ) ;
|
|
node_ptr->resetProgStage = MTC_RESETPROG__FAIL ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RESETPROG__FAIL:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
if ( !node_ptr->mtcCmd_work_fifo.empty() )
|
|
node_ptr->mtcCmd_work_fifo.pop_front();
|
|
if ( !node_ptr->mtcCmd_work_fifo.empty() )
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
|
|
if ( !node_ptr->mtcCmd_done_fifo.empty() )
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
if ( !node_ptr->mtcCmd_done_fifo.empty() )
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
node_ptr->resetProgStage = MTC_RESETPROG__START ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
slog ("%s unsupported reboot stage (%d) ; clearing action\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->resetProgStage );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
}
|
|
}
|
|
|
|
return (PASS);
|
|
}
|
|
|
|
/* Power Handler
|
|
* ----------------- */
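/*
 * Implements the administrative Power-Off and Power-On actions over the
 * BMC using ipmitool send/recv command pairs. Both directions share the
 * same pattern : REQ_SEND -> RESP_WAIT with a QUEUE stage spacing out up
 * to MTC_POWER_ACTION_RETRY_COUNT retries before declaring failure.
 * Power-On additionally queries the current power state first and skips
 * the power-on request if the host is already powered on. */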
|
|
int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
switch ( node_ptr->powerStage )
|
|
{
|
|
case MTC_POWEROFF__FAIL:
|
|
{
|
|
elog ("%s Power-Off failed ; aborting after max retries\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWEROFF_FAIL);
|
|
mtcTimer_reset ( node_ptr->mtcTimer ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL_WAIT );
|
|
break ;
|
|
}
|
|
case MTC_POWEROFF__FAIL_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWEROFF__START:
|
|
{
|
|
plog ("%s Administrative 'Power-Off' Action\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_force_task ( node_ptr, "Power-Off Requested" );
|
|
|
|
start_offline_handler ( node_ptr );
|
|
|
|
if ( hostUtil_is_valid_ip_addr (node_ptr->bm_ip ) == false )
|
|
{
|
|
/**
|
|
* New working provisioning is learned from the
|
|
* dnsmasq.bmc_hosts file changes through inotify watch so
|
|
* it is entirely possible that the retries in this fsm
|
|
* eventually succeed.
|
|
**/
|
|
wlog ("%s bm_ip (%s) is invalid (%d) \n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_ip.c_str(),
|
|
rc );
|
|
}
|
|
else
|
|
{
|
|
; // send_hwmon_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
}
|
|
|
|
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
|
//the fall through to MTC_POWEROFF__REQ_SEND is intentional
|
|
}
|
|
case MTC_POWEROFF__REQ_SEND:
|
|
{
|
|
node_ptr->power_action_retries--;
|
|
|
|
/* Handle loss of connectivity over retries */
|
|
if ( node_ptr->bm_provisioned == false )
|
|
{
|
|
elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
|
break ;
|
|
}
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWER_ACTION_RETRY_DELAY);
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
|
break ;
|
|
}
|
|
|
|
else
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_OFF );
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
blog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
|
|
}
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWEROFF__RESP_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Powering Off\n", node_ptr->hostname.c_str() );
|
|
mtcInvApi_update_task ( node_ptr, "Powering Off" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__DONE );
|
|
node_ptr->power_on = false ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWEROFF__QUEUE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
if ( node_ptr->power_action_retries > 0 )
|
|
{
|
|
char buffer[255] ;
|
|
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
|
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
|
|
mtcInvApi_update_task ( node_ptr, buffer);
|
|
|
|
/* check the thread error status if there is one */
|
|
if ( node_ptr->ipmitool_thread_info.status )
|
|
{
|
|
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.status_string.c_str(),
|
|
node_ptr->ipmitool_thread_info.status );
|
|
}
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
|
}
|
|
else
|
|
{
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWEROFF__DONE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
plog ("%s Power-Off Completed\n", node_ptr->hostname.c_str());
|
|
|
|
stop_offline_handler ( node_ptr );
|
|
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
|
|
|
|
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* ----------------------- */
|
|
/* POWER ON Group of Cases */
|
|
/* ----------------------- */
|
|
|
|
case MTC_POWERON__FAIL:
|
|
{
|
|
elog ("%s Power-On failed ; aborting after max retries\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERON_FAIL);
|
|
mtcTimer_reset ( node_ptr->mtcTimer ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__FAIL_WAIT );
|
|
break ;
|
|
}
|
|
case MTC_POWERON__FAIL_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__START:
|
|
{
|
|
plog ("%s Administrative 'Power-On' Action\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, "Power-On Requested" );
|
|
|
|
if ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip ) == false )
|
|
{
|
|
/**
|
|
* New working provisioning is learned from the
|
|
* dnsmasq.bmc_hosts file changes through inotify watch so
|
|
* it is entirely possible that the retries in this fsm
|
|
* eventually succeed.
|
|
**/
|
|
wlog ("%s bm_ip (%s) is invalid (%d) \n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_ip.c_str(),
|
|
rc );
|
|
}
|
|
|
|
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
|
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS );
|
|
//the fall through to MTC_POWERON__POWER_STATUS is intentional
|
|
}
|
|
case MTC_POWERON__POWER_STATUS:
|
|
{
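            /* Query the current power state before requesting power-on ;
             * if the response already reports 'on' then the DONE stage is
             * entered directly and no power-on command is sent. */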
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s Power On request rejected ; BMC not accessible ; retry in %d seconds\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWER_ACTION_RETRY_DELAY);
|
|
|
|
node_ptr->power_action_retries-- ;
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
break ;
|
|
}
|
|
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_STATUS ) ;
|
|
if ( rc )
|
|
{
|
|
node_ptr->power_action_retries-- ;
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS_WAIT );
|
|
}
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
break ;
|
|
}
|
|
case MTC_POWERON__POWER_STATUS_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
}
|
|
else if ( rc == PASS )
|
|
{
|
|
if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos )
|
|
{
|
|
ilog ("%s power is already on ; no action required\n", node_ptr->hostname.c_str());
|
|
node_ptr->power_on = true ;
|
|
mtcInvApi_update_task ( node_ptr, "Power Already On" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__DONE );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s power is off ; powering on ...\n", node_ptr->hostname.c_str() );
|
|
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__REQ_SEND:
|
|
{
|
|
node_ptr->power_action_retries--;
|
|
|
|
/* Ensure that mtce is updated with the latest board
|
|
* management ip address for this host */
|
|
if ( node_ptr->bm_provisioned == false )
|
|
{
|
|
elog ("%s BMC not provisioned or accessible (%d:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_provisioned,
|
|
node_ptr->bm_accessible );
|
|
|
|
powerStageChange ( node_ptr , MTC_POWERON__FAIL );
|
|
break ;
|
|
}
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n",
|
|
node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY);
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Power-On request failed (%d)\n",
|
|
node_ptr->hostname.c_str(), rc );
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
blog ("%s Power-On requested\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
|
|
powerStageChange ( node_ptr , MTC_POWERON__RESP_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__RESP_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s Power-On command failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Powering On\n", node_ptr->hostname.c_str() );
|
|
mtcInvApi_update_task ( node_ptr, "Powering On" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__DONE );
|
|
node_ptr->power_on = true ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__QUEUE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
if ( node_ptr->power_action_retries > 0 )
|
|
{
|
|
char buffer[64] ;
|
|
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
|
snprintf ( buffer, 64, MTC_TASK_POWERON_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
|
|
mtcInvApi_update_task ( node_ptr, buffer);
|
|
|
|
/* check the thread error status if there is one */
|
|
if ( node_ptr->ipmitool_thread_info.status )
|
|
{
|
|
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.status_string.c_str(),
|
|
node_ptr->ipmitool_thread_info.status );
|
|
}
|
|
|
|
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS );
|
|
}
|
|
else
|
|
{
|
|
powerStageChange ( node_ptr , MTC_POWERON__FAIL );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__DONE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
plog ("%s Power-On Completed\n", node_ptr->hostname.c_str());
|
|
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
|
|
|
// send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
|
|
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWER__DONE:
|
|
default:
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
break ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
|
|
/* Power Cycle Handler
|
|
* ------------------- */
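/*
 * Handles the hwmon driven 'powercycle' recovery action. Each attempt
 * fails the host (if unlocked), powers it off, waits out a cool-off
 * period, powers it back on, verifies the power state and then holds
 * off before declaring the attempt complete. If the attempt count
 * exceeds MAX_POWERCYCLE_ATTEMPT_RETRIES the host is left powered down
 * and recovery requires manual action. */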
|
|
int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s 'powercycle' abort ; not accessible to BMC\n", node_ptr->hostname.c_str() );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
|
|
/* Manage max retries */
|
|
if ( node_ptr->hwmon_powercycle.retries >= MAX_POWERCYCLE_STAGE_RETRIES )
|
|
{
|
|
wlog ("%s 'powercycle' abort ; max retries reached\n", node_ptr->hostname.c_str() );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
|
|
/* Manage max retries */
|
|
if ( node_ptr->hwmon_powercycle.queries >= MAX_POWERCYCLE_QUERY_RETRIES )
|
|
{
|
|
wlog ("%s power state query retries exceeded ; failing current iteration\n", node_ptr->hostname.c_str());
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
|
|
switch ( node_ptr->powercycleStage )
|
|
{
|
|
case MTC_POWERCYCLE__FAIL:
|
|
{
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
|
|
wlog ("%s entering 'powercycle' failed stage ATTEMPT: %d\n",
|
|
node_ptr->hostname.c_str() ,
|
|
node_ptr->hwmon_powercycle.attempts );
|
|
|
|
/* Note: hwmon will continue to send powercycle requests to restart once it is accessible */
|
|
|
|
/* TODO: RELEASE NOTE: Node may be left in the disabled state
|
|
* - need to track power state and raise logs or alarms if host is stuck in power off state.
|
|
* - The ipmitool update does add tracking of the power state but does not introduce the alarm */
|
|
|
|
// send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
|
|
/* Let the next event perform another power-cycle retry */
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__DONE );
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_FAIL , node_ptr->hwmon_powercycle.attempts );
|
|
|
|
hwmon_recovery_monitor ( node_ptr, MTC_EVENT_HWMON_POWERCYCLE );
|
|
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__START:
|
|
{
|
|
switch ( node_ptr->subStage )
|
|
{
|
|
case MTC_SUBSTAGE__START:
|
|
{
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
|
|
{
|
|
ilog ("%s failing host for powercycle\n", node_ptr->hostname.c_str() );
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
/* Set node as unlocked-disabled-failed */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
ilog ("%s is %s-%s-%s\n", node_ptr->hostname.c_str(),
|
|
get_adminState_str (node_ptr->adminState).c_str(),
|
|
get_operState_str (node_ptr->operState).c_str(),
|
|
get_availStatus_str(node_ptr->availStatus).c_str());
|
|
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__ACTION ;
|
|
|
|
node_ptr->hwmon_powercycle.attempts++ ;
|
|
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
|
|
/***********************************************************************************
|
|
*
|
|
* Permanent Power-Down Case
|
|
* -------------------------
|
|
* If we exceed the maximum power cycle attempt retries then we
|
|
* give up and power the unit down and leave it that way.
|
|
*
|
|
***********************************************************************************/
|
|
if ( node_ptr->hwmon_powercycle.attempts > MAX_POWERCYCLE_ATTEMPT_RETRIES )
|
|
{
|
|
ilog ("%s -------------------------------------------------------------\n", node_ptr->hostname.c_str());
|
|
wlog ("%s critical event is persistent ; too many failed attempts (%d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->hwmon_powercycle.attempts );
|
|
ilog ("%s -------------------------------------------------------------\n", node_ptr->hostname.c_str());
|
|
|
|
/* terminate any in progress work, likely auto recovery if unlocked, for this host */
|
|
mtcTimer_reset ( node_ptr->mtcCmd_timer );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
// node_ptr->powercycle_completed = true ;
|
|
node_ptr->hwmon_powercycle.retries = 0 ;
|
|
node_ptr->hwmon_powercycle.queries = 0 ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_DOWN, node_ptr->hwmon_powercycle.attempts );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s starting 'powercycle' recovery ATTEMPT: %d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.attempts );
|
|
|
|
// send_hwmon_command ( node_ptr->hostname, MTC_CMD_STOP_HOST);
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_HOST, node_ptr->hwmon_powercycle.attempts );
|
|
|
|
node_ptr->hwmon_powercycle.retries = 0 ; /* remove for back to back power cycles */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, 1 );
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__SEND );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* Query current power state */
|
|
case MTC_SUBSTAGE__SEND:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
int delay = MTC_IPMITOOL_REQUEST_DELAY ;
|
|
ilog ("%s querying current power state\n", node_ptr->hostname.c_str());
|
|
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_STATUS );
|
|
if ( rc )
|
|
{
|
|
node_ptr->hwmon_powercycle.retries++ ;
|
|
wlog ("%s failed to send 'power state query' ; retrying %d of %d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.retries,
|
|
MAX_POWERCYCLE_STAGE_RETRIES );
|
|
|
|
node_ptr->hwmon_powercycle.queries++ ;
|
|
|
|
/* Retry the send */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
}
|
|
else
|
|
{
|
|
node_ptr->hwmon_powercycle.queries = 0 ;
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__RECV );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, delay );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* Interpret current power state query */
|
|
case MTC_SUBSTAGE__RECV:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
node_ptr->hwmon_powercycle.retries++ ;
|
|
elog ("%s 'power query' command failed ; retrying %d or %d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.retries,
|
|
MAX_POWERCYCLE_STAGE_RETRIES );
|
|
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__SEND );
|
|
}
|
|
else
|
|
{
|
|
bool on = false ;
|
|
|
|
ilog ("%s Power Status: %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.data.c_str());
|
|
|
|
if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos )
|
|
{
|
|
on = true ;
|
|
}
|
|
if ( rc == PASS )
|
|
{
|
|
/* maintain current power state */
|
|
node_ptr->power_on = on ;
|
|
|
|
if ( on == true )
|
|
{
|
|
ilog ("%s invoking 'powerdown' phase\n", node_ptr->hostname.c_str());
|
|
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__DONE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s is already powered-off ; starting powercycle with power-on\n", node_ptr->hostname.c_str() );
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__DONE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* TODO: use FAIL handler */
|
|
node_ptr->hwmon_powercycle.retries = MAX_POWERCYCLE_STAGE_RETRIES+1 ;
|
|
// powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
elog ("%s failed to query power status ; aborting powercycle action\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
slog ("%s %s.%s stage\n", node_ptr->hostname.c_str(),
|
|
get_powercycleStages_str(node_ptr->powercycleStage).c_str(),
|
|
get_subStages_str(node_ptr->subStage).c_str());
|
|
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__DONE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__START );
|
|
break ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWERCYCLE__POWEROFF:
|
|
{
|
|
int delay = MTC_IPMITOOL_REQUEST_DELAY ;
|
|
|
|
/* Stop heartbeat if we are powering off the host */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_OFF );
|
|
if ( rc )
|
|
{
|
|
elog ("%s failed to send power-off command to BMC (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
rc );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, delay );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF_CMND_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWEROFF_CMND_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s power-off command failed (rc:%d:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
rc , node_ptr->ipmitool_thread_info.status);
|
|
|
|
if ( node_ptr->ipmitool_thread_info.status )
|
|
{
|
|
wlog ("%s ... %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.status_string.c_str());
|
|
}
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s waiting up to %d seconds for 'offline'\n", node_ptr->hostname.c_str(), MTC_POWEROFF_TO_OFFLINE_TIMEOUT );
|
|
|
|
/* Set the power-off timeout */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer,
|
|
mtcTimer_handler,
|
|
MTC_POWEROFF_TO_OFFLINE_TIMEOUT );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWEROFF_WAIT:
|
|
{
|
|
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ))
|
|
{
|
|
/* since the host is powered down lets reflect that in the database */
|
|
node_ptr->uptime = 0 ;
|
|
mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
|
|
if ( node_ptr->hwmon_powercycle.attempts > MAX_POWERCYCLE_ATTEMPT_RETRIES )
|
|
{
|
|
wlog ("%s -------------------------------------------------------------------\n",
|
|
node_ptr->hostname.c_str() );
|
|
wlog ("%s ... Leaving server POWERED DOWN to protect hardware from damage ...\n",
|
|
node_ptr->hostname.c_str() );
|
|
wlog ("%s -------------------------------------------------------------------\n",
|
|
node_ptr->hostname.c_str() );
|
|
|
|
/* Cancelling the recovery timer prevents auto-recovery.
|
|
* Recovery must be through manual actions. */
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.recovery_timer );
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__BLOCKED ;
|
|
|
|
/* Block Auto-Recovery Path
|
|
* ------------------------
|
|
* If we have reached the max retries and are unlocked then
|
|
* leave the powercycle action active so that the enable
|
|
* and graceful recovery handlers don't recover this host.
|
|
* -------------------------
|
|
* Manual action is required to recover a host that has
|
|
* exceeded the maximum powercycle retries */
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
|
{
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
}
|
|
|
|
/* While the node_ptr->hwmon_powercycle.control_timer is
|
|
* inactive the MTC_POWERCYCLE__DONE stage is a NOOP
|
|
* thereby keeping us doing nothing till the next manual
|
|
* action */
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__DONE );
|
|
}
|
|
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF )
|
|
{
|
|
ilog ("%s already powered-off, skipping cool-off\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, 10 );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s waiting %d seconds before power-on ; cool down time\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWERCYCLE_COOLDOWN_DELAY );
|
|
|
|
node_ptr->hwmon_powercycle.holdoff = MTC_POWERCYCLE_COOLDOWN_DELAY/60 ;
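                /* holdoff is tracked in minutes ; the COOLOFF stage counts
                 * it down one MTC_MINS_1 tick at a time before moving on
                 * to power-on. */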
|
|
|
|
/* Set the power-off timeout */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, COMMAND_DELAY );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__COOLOFF );
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__COOLOFF ;
|
|
}
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
|
|
}
|
|
|
|
/* handle timeout case */
|
|
else if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
/* TODO: manage the retry count */
|
|
elog ("%s timeout waiting for 'offline' state ; retrying ...\n", node_ptr->hostname.c_str() );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWERCYCLE__COOLOFF:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_POWERCYCLE_COOL,
|
|
node_ptr->hwmon_powercycle.attempts,
|
|
node_ptr->hwmon_powercycle.holdoff);
|
|
ilog ("%s Power-Cycle cool-off (%d minutes remaining)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.holdoff );
|
|
|
|
if ( node_ptr->hwmon_powercycle.holdoff > 1 )
|
|
{
|
|
node_ptr->hwmon_powercycle.holdoff-- ;
|
|
}
|
|
else
|
|
{
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON );
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__ACTION ;
|
|
}
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_MINS_1 );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWERCYCLE__POWERON:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
int delay = MTC_IPMITOOL_REQUEST_DELAY ;
|
|
clog ("%s %s stage\n", node_ptr->hostname.c_str(),
|
|
get_powercycleStages_str(node_ptr->powercycleStage).c_str());
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer,
|
|
mtcTimer_handler,
|
|
MTC_POWERCYCLE_COOLDOWN_DELAY );
|
|
|
|
wlog ("%s not accessible ; waiting another %d seconds before power-on\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWERCYCLE_COOLDOWN_DELAY );
|
|
}
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
|
|
if ( rc )
|
|
{
|
|
elog ("%s failed to send power-on command to BMC (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
rc );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s Power-On requested\n", node_ptr->hostname.c_str() );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_ON, node_ptr->hwmon_powercycle.attempts );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, delay );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON_CMND_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWERON_CMND_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Power-On request failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s Power-On response: %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.data.c_str() );
|
|
|
|
/* Give the power on request time to execute */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );
|
|
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON_VERIFY );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWERON_VERIFY:
|
|
{
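            /* Re-query the BMC power state to confirm the host actually
             * powered on before waiting for it to come online. */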
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_STATUS );
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Power-On command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s power status query requested\n", node_ptr->hostname.c_str() );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON_VERIFY_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWERON_VERIFY_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
bool on = false ;
|
|
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
if ( rc == PASS )
|
|
{
|
|
if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos )
|
|
{
|
|
on = true ;
|
|
}
|
|
}
|
|
|
|
ilog ("%s power state query result: %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.data.c_str() );
|
|
|
|
if (( rc == PASS ) && ( on == true ))
|
|
{
|
|
node_ptr->power_on = true ;
|
|
ilog ("%s is Powered On - waiting for 'online' (%d sec timeout)\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWERON_TO_ONLINE_TIMEOUT);
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_BOOT, node_ptr->hwmon_powercycle.attempts );
|
|
|
|
/* Set the online timeout */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_POWERON_TO_ONLINE_TIMEOUT );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON_WAIT );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s Power-On failed or did not occur ; retrying (rc:%d:%d)\n", node_ptr->hostname.c_str(), rc, on );
|
|
node_ptr->power_on = false ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_RETRY, node_ptr->hwmon_powercycle.attempts );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_BM_POWERON_TIMEOUT );
|
|
node_ptr->hwmon_powercycle.queries++ ;
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON );
|
|
break ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWERON_WAIT:
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )
|
|
{
|
|
ilog ("%s online (after powercycle)\n", node_ptr->hostname.c_str());
|
|
|
|
node_ptr->hwmon_powercycle.holdoff = MTC_POWERCYCLE_BACK2BACK_DELAY/60 ;
|
|
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, 1 );
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__HOLDOFF ;
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__HOLDOFF );
|
|
}
|
|
else if ( node_ptr->hwmon_powercycle.control_timer.ring == true )
|
|
{
|
|
elog ("%s timeout waiting for 'online' state\n", node_ptr->hostname.c_str() );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__HOLDOFF:
|
|
{
|
|
if ( node_ptr->hwmon_powercycle.control_timer.ring == true )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_POWERCYCLE_HOLD,
|
|
node_ptr->hwmon_powercycle.attempts,
|
|
node_ptr->hwmon_powercycle.holdoff);
|
|
ilog ("%s Power-Cycle hold-off (%d minutes remaining) (uptime:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.holdoff,
|
|
node_ptr->uptime );
|
|
|
|
if ( node_ptr->hwmon_powercycle.holdoff > 1 )
|
|
{
|
|
node_ptr->hwmon_powercycle.holdoff--;
|
|
}
|
|
else
|
|
{
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__DONE );
|
|
}
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_MINS_1 );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__DONE:
|
|
{
|
|
if ( node_ptr->hwmon_powercycle.control_timer.ring == true )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, "" );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
node_ptr->addStage = MTC_ADD__START ;
|
|
|
|
hwmon_recovery_monitor ( node_ptr, MTC_EVENT_HWMON_POWERCYCLE );
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__START );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__START); /* reset the fsm */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__START); /* reset the fsm */
|
|
|
|
plog ("%s Power-Cycle Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
default:
|
|
{
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__DONE );
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
break ;
|
|
}
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/* Delete Handler
|
|
* ----------------- */
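/*
 * Tears down a host : requests a disk wipe, deprovisions board management,
 * kills any in-flight ipmitool thread (waiting briefly for it to exit),
 * tells the heartbeat, hwmon and guest services to drop the host, clears
 * its alarms and finally removes it from inventory in MTC_DEL__DONE. */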
|
|
int nodeLinkClass::delete_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
switch ( node_ptr->delStage )
|
|
{
|
|
case MTC_DEL__START:
|
|
{
|
|
ilog ("%s Delete Operation Started (%s)\n", node_ptr->hostname.c_str(), node_ptr->uuid.c_str());
|
|
node_ptr->retries = 0 ;
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) ;
|
|
|
|
if ( node_ptr->bm_provisioned == true )
|
|
{
|
|
set_bm_prov ( node_ptr, false);
|
|
}
|
|
|
|
if ( node_ptr->ipmitool_thread_ctrl.stage != THREAD_STAGE__IDLE )
|
|
{
|
|
int delay = THREAD_POST_KILL_WAIT ;
|
|
thread_kill ( node_ptr->ipmitool_thread_ctrl , node_ptr->ipmitool_thread_info ) ;
|
|
|
|
ilog ("%s thread active ; sending kill ; waiting %d seconds\n",
|
|
node_ptr->hostname.c_str(), delay );
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, delay );
|
|
node_ptr->delStage = MTC_DEL__WAIT ;
|
|
}
|
|
else
|
|
{
|
|
node_ptr->delStage = MTC_DEL__DONE ;
|
|
}
|
|
|
|
|
|
/* Send delete commands to monitor services */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
|
|
send_hwmon_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
|
|
send_guest_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
|
|
|
|
/* Clear all the alarms for this host and generate a customer delete log */
|
|
alarmUtil_clear_all ( node_ptr->hostname );
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_DELETE );
|
|
|
|
break ;
|
|
}
|
|
case MTC_DEL__WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
if ( node_ptr->ipmitool_thread_ctrl.stage != THREAD_STAGE__IDLE )
|
|
{
|
|
if ( node_ptr->retries++ < 3 )
|
|
{
|
|
wlog ("%s still waiting on active thread ; sending another kill signal (try %d or %d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->retries, 3 );
|
|
|
|
thread_kill ( node_ptr->ipmitool_thread_ctrl, node_ptr->ipmitool_thread_info ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, THREAD_POST_KILL_WAIT );
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
elog ("%s thread refuses to stop ; giving up ...\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
node_ptr->delStage = MTC_DEL__DONE ;
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
case MTC_DEL__DONE:
|
|
{
|
|
dlog ("%s delete almost done !!\n", node_ptr->hostname.c_str());
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
del_host ( node_ptr->hostname );
|
|
this->host_deleted = true ;
|
|
break ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
|
|
/* Add Handler
|
|
* ----------------- */
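/*
 * Runs when a host is added, or re-added over a process restart. It begins
 * by requesting a fresh mtcAlive for uptime, reconciles the stored
 * admin/oper/avail states with the enable and lock alarms, and for
 * controllers reports the appropriate state to SM. */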
|
|
int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
switch ( node_ptr->addStage )
|
|
{
|
|
case MTC_ADD__START:
|
|
case MTC_ADD__START_DELAY:
|
|
{
|
|
bool timer_set = false ;
|
|
plog ("%s Host Add\n", node_ptr->hostname.c_str());
|
|
|
|
/* Request a mtcAlive message ; gives us uptime ; don't trust what is in the database */
|
|
node_ptr->uptime = 0 ;
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
|
|
|
|
ilog ("%s %s %s-%s-%s (%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ip.c_str(),
|
|
adminState_enum_to_str (node_ptr->adminState).c_str(),
|
|
operState_enum_to_str (node_ptr->operState).c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus).c_str(),
|
|
|
|
node_ptr->uuid.length() ? node_ptr->uuid.c_str() : "" );
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
if ( daemon_is_file_present ( CONFIG_COMPLETE_COMPUTE ) == false )
|
|
{
|
|
if ( node_ptr->operState_subf != MTC_OPER_STATE__DISABLED )
|
|
{
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__OFFLINE );
|
|
}
|
|
}
|
|
ilog ("%s-%s %s-%s-%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->subfunction_str.c_str(),
|
|
adminState_enum_to_str (node_ptr->adminState).c_str(),
|
|
operState_enum_to_str (node_ptr->operState_subf).c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus_subf).c_str());
|
|
}
|
|
|
|
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
|
|
{
|
|
wlog ("%s Add with availability status 'unlocked-enabled-%s' ; overriding to 'available'\n",
|
|
node_ptr->hostname.c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus).c_str());
|
|
mtcInvApi_update_state ( node_ptr, "availability", "available" );
|
|
}
|
|
|
|
/* handle other cases */
|
|
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
|
|
MTC_ALARM_ID__ENABLE);
|
|
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
|
{
|
|
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
|
|
|
|
/* If the node is locked then the Enable alarm
|
|
* should not be present */
|
|
if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
|
{
|
|
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
sev = FM_ALARM_SEVERITY_CLEAR ;
|
|
}
|
|
}
|
|
|
|
/* Manage enable alarm over process restart.
|
|
*
|
|
* - clear the alarm in the active controller case
|
|
* - maintain the alarm, set degrade state in MAJOR and CRIT cases
|
|
* - clear alarm for all other severities.
|
|
*/
|
|
if ( THIS_HOST )
|
|
{
|
|
if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
|
{
|
|
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
|
|
( sev == FM_ALARM_SEVERITY_MAJOR ))
|
|
{
|
|
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
|
|
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
|
|
}
|
|
else if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
|
{
|
|
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
}
|
|
}
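
            /* Controller hosts also need SM to be told their current state.
             * The mapping implemented below: unlocked-enabled-(available or
             * degraded) is reported as CONTROLLER_UNLOCKED, locked is reported
             * as CONTROLLER_LOCKED and everything else defaults to
             * CONTROLLER_DISABLED. */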
            if ( is_controller(node_ptr) )
            {
                mtc_cmd_enum state = CONTROLLER_DISABLED ;

                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                    (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                     ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
                {
                    state = CONTROLLER_UNLOCKED ;
                }
                else if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
                {
                    state = CONTROLLER_LOCKED ;
                }

                if ( THIS_HOST )
                {
                    nodeLinkClass::set_active_controller_hostname(node_ptr->hostname);
                    if ( !node_ptr->task.compare(MTC_TASK_SWACT_INPROGRESS) )
                    {
                        ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_NO_COMPLETE);
                        mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_NO_COMPLETE);
                        mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, 20 );
                        timer_set = true ;
                    }
                }
                else
                {
                    nodeLinkClass::set_inactive_controller_hostname(node_ptr->hostname);

                    if ( !node_ptr->task.compare(MTC_TASK_SWACT_INPROGRESS) )
                    {
                        ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );

                        /* Work Around for issue: */
                        mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );

                        mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
                        mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, 10 );
                        timer_set = true ;
                    }
                }

                /*************************************************************
                 * Don't send a disable to SM if we are in simplex and locked ;
                 * doing so would cause SM to shut down all services.
                 *
                 * Including a hostname check just in case simplex mode
                 * is ever or still true with a second controller provisioned
                 * but not unlocked. Defensive code.
                 *
                 * TODO: This should exist in AIO. Without it services will
                 *       not be running if you lock controller and then
                 *       reboot while this controller is disabled.
                 */
                if (( THIS_HOST ) &&
                    ( is_inactive_controller_main_insv() == false ) &&
                    ( node_ptr->operState == MTC_OPER_STATE__DISABLED ))
                {
                    ilog ("%s recovering from %s-disabled\n",
                              node_ptr->hostname.c_str(),
                              get_adminState_str (node_ptr->adminState).c_str());
                }
                else
                {
                    mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
                }
            }

            if ( daemon_get_cfg_ptr()->debug_level & 1 )
                nodeLinkClass::host_print (node_ptr);

            if ( timer_set == false )
            {
                node_ptr->mtcTimer.ring = true ;
            }
            node_ptr->addStage = MTC_ADD__CLEAR_TASK ;
            break ;
        }
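
        /* MTC_ADD__CLEAR_TASK: clear any stale task banner left over from a
         * prior process run. For controllers the clear is held off until the
         * swact notification timer started above rings ; all other host
         * types have their task cleared immediately. */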
        case MTC_ADD__CLEAR_TASK:
        {
            if ( is_controller(node_ptr) )
            {
                if ( node_ptr->mtcTimer.ring == true )
                {
                    if ( !node_ptr->task.empty () )
                    {
                        mtcInvApi_force_task ( node_ptr, "" );
                    }
                }
                else
                {
                    break ;
                }
            }
            else
            {
                /* do it immediately for all other server types */
                mtcInvApi_force_task ( node_ptr, "" );
            }
            /* default retries counter to zero before START_SERVICES */
            node_ptr->retries = 0 ;
            node_ptr->addStage = MTC_ADD__START_SERVICES ;
            break ;
        }
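
        /* MTC_ADD__START_SERVICES: start host services is not launched from
         * here directly ; the start_services_needed flag is posted and the
         * in-service test handler later picks it up and issues the actual
         * 'start host services' command in the background. */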
        case MTC_ADD__START_SERVICES:
        {
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                 ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
            {
                ilog ("%s scheduling start host services\n",
                          node_ptr->hostname.c_str());

                node_ptr->start_services_needed  = true ;
                node_ptr->start_services_retries = 0 ;
            }

            node_ptr->addStage = MTC_ADD__MTC_SERVICES ;
            break ;
        }
        case MTC_ADD__MTC_SERVICES:
        {
            if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
            {
                /* Inform the VIM that this host is enabled */
                mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
            }
            else
            {
                if ( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED )
                {
                    mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
                }
                else
                {
                    mtcVimApi_state_change ( node_ptr, VIM_HOST_DISABLED, 3 );
                }
            }

            send_hbs_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );

            /* Add this host to other maintenance services */
            if (( ! SIMPLEX_CPE_SYSTEM ) && ( node_ptr->bm_provisioned ))
            {
                send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
            }
            if ( ( CPE_SYSTEM ) || ( is_compute (node_ptr) == true ))
            {
                send_guest_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
            }

            /* Start a timer that fails the enable if the work queue
             * does not empty or if commands in the done queue have failed */
            mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, work_queue_timeout );

            node_ptr->addStage = MTC_ADD__WORKQUEUE_WAIT ;
            break ;
        }
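
        /* MTC_ADD__WORKQUEUE_WAIT: drain the work queue posted above.
         * workQueue_done returns RETRY while commands are still in flight,
         * FAIL_WORKQ_TIMEOUT if the queue never empties (in which case the
         * queue is purged) and a failure code if any done-queue command
         * failed. */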
        case MTC_ADD__WORKQUEUE_WAIT:
        {
            rc = workQueue_done ( node_ptr );
            if ( rc == RETRY )
            {
                /* wait longer */
                break ;
            }
            else if ( rc == FAIL_WORKQ_TIMEOUT )
            {
                wlog ("%s Add failed ; workQueue empty timeout, purging ...\n", node_ptr->hostname.c_str());
                workQueue_purge ( node_ptr );
            }
            else if ( rc != PASS )
            {
                wlog ("%s Add failed ; doneQueue contains failed commands\n", node_ptr->hostname.c_str());
            }

            /* Stop the work queue wait timer */
            mtcTimer_reset ( node_ptr->mtcTimer );

            /* Only start the heartbeat service on this add operation if the
             * host is already unlocked and enabled and not the active controller */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /* start the heartbeat service in all cases except for
                 * THIS host and CPE controller hosts */
                if ( NOT_THIS_HOST )
                {
                    if (( LARGE_SYSTEM ) ||
                        (( CPE_SYSTEM ) && ( is_controller(node_ptr) == false )))
                    {
                        send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
                    }
                }
            }
            /* Only run hardware monitor if the bm ip is provisioned */
            if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
                ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )))
            {
                set_bm_prov ( node_ptr, true ) ;
                send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
            }

            node_ptr->mtcAlive_gate = false ;
            node_ptr->addStage = MTC_ADD__DONE ;
            break;
        }
        case MTC_ADD__DONE:
        default:
        {
            adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );

            /* Send sysinv the wrsroot password hash
             * and aging data as an install command */
            if ( SIMPLEX && THIS_HOST &&
               ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ))
            {
                node_ptr->configStage  = MTC_CONFIG__START ;
                node_ptr->configAction = MTC_CONFIG_ACTION__INSTALL_PASSWD ;
            }

            if (( ! SIMPLEX_CPE_SYSTEM ) &&
                ( node_ptr->bm_provisioned == true ))
            {
                mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
                node_ptr->alarms[MTC_ALARM_ID__BM] = FM_ALARM_SEVERITY_CLEAR ;
            }

            /* Special Add handling for the AIO system */
            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
                {
                    /* In AIO if in DOR mode and the host is unlocked enabled
                     * we need to run the subfunction handler and request
                     * to start host services. */
                    if ( this->dor_mode_active )
                    {
                        node_ptr->start_services_needed_subf = true ;
                        adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF );
                    }
                }
            }

            node_ptr->addStage = MTC_ADD__START;
            plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
            node_ptr->add_completed = true ;
            break ;
        }
    }
    return (rc);
}
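
/* Board Management (bm) Handler
 * -----------------------------
 * Runs for every host whose board management controller is provisioned.
 * In brief: it monitors BMC connectivity with the ping monitor, runs the
 * mc info / restart cause / power status query sequence once connectivity
 * is (re)established, and raises or clears the BM Access alarm based on
 * how long the BMC has been unreachable. */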
int nodeLinkClass::bm_handler ( struct nodeLinkClass::node * node_ptr )
{
    /* Call the bmc ssh connection monitor if this node's bm is provisioned */
    if ( node_ptr->bm_provisioned == true )
    {
        if (( node_ptr->bm_accessible == true ) && ( node_ptr->bm_ping_info.ok == false ))
        {
            wlog ("%s bmc access lost\n", node_ptr->hostname.c_str());

            /* remove the mc info file in case there is a firmware
             * upgrade in progress. hwmond reads it and gets
             * the bmc fw version from it */
            string mc_info_filename = IPMITOOL_OUTPUT_DIR ;
            mc_info_filename.append(node_ptr->hostname);
            mc_info_filename.append(IPMITOOL_MC_INFO_FILE_SUFFIX);
            daemon_remove_file ( mc_info_filename.data() );

            thread_kill ( node_ptr->ipmitool_thread_ctrl, node_ptr->ipmitool_thread_info );

            bmc_access_data_init ( node_ptr );

            ipmiUtil_mc_info_init ( node_ptr->mc_info );

            node_ptr->bm_ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ;

            /* start a timer that will raise the BM Access alarm
             * if we are not accessible by the time it expires */
            plog ("%s bmc access timer started (%d secs)\n", node_ptr->hostname.c_str(), MTC_MINS_2);
            mtcTimer_reset ( node_ptr->bmc_access_timer );
            mtcTimer_start ( node_ptr->bmc_access_timer, mtcTimer_handler, MTC_MINS_2 );
        }
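
        /* The queries below follow a common pattern : a non-blocking
         * ipmi_command_send followed by ipmi_command_recv polling, with
         * bm_timer pacing the retries (MTC_RETRY_WAIT on the success path,
         * MTC_POWER_ACTION_RETRY_DELAY after a failure). They run in order ;
         * mc info, then restart cause, then power status, and only after all
         * three complete is the BMC declared accessible. */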
        /* This block queries and logs BMC Info, last Reset Cause and Power Status */
        if (( node_ptr->bm_accessible == false ) &&
            ( node_ptr->bm_ping_info.ok == true ) &&
            (( node_ptr->mc_info_query_done      == false ) ||
             ( node_ptr->reset_cause_query_done  == false ) ||
             ( node_ptr->power_status_query_done == false )) &&
            ( mtcTimer_expired (node_ptr->bm_timer ) == true ))
        {
            int rc = PASS ;
            if (( node_ptr->mc_info_query_active == false ) &&
                ( node_ptr->mc_info_query_done   == false ))
            {
                if ( ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__MC_INFO ) != PASS )
                {
                    elog ("%s %s send failed\n",
                              node_ptr->hostname.c_str(),
                              getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                    mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                }
                else
                {
                    dlog ("%s %s\n", node_ptr->hostname.c_str(),
                              getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                    mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                    node_ptr->mc_info_query_active = true ;
                }
            }
            else if (( node_ptr->mc_info_query_active == true ) &&
                     ( node_ptr->mc_info_query_done   == false))
            {
                if ( ( rc = ipmi_command_recv ( node_ptr ) ) == RETRY )
                {
                    mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                }
                else if ( rc != PASS )
                {
                    /* this error is reported by the ipmi receive driver ...
                     * blog ("%s %s command failed\n", node_ptr->hostname.c_str(),
                     *           getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                     */
                    node_ptr->mc_info_query_active = false ;
                    node_ptr->ipmitool_thread_ctrl.done = true ;
                    mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                }
                else
                {
                    node_ptr->mc_info_query_active = false ;
                    node_ptr->mc_info_query_done   = true ;
                    node_ptr->ipmitool_thread_ctrl.done = true ;
                    ipmiUtil_mc_info_load ( node_ptr->hostname, node_ptr->ipmitool_thread_info.data.data(), node_ptr->mc_info );
                }
            }
            else if (( node_ptr->mc_info_query_active == false ) &&
                     ( node_ptr->mc_info_query_done   == true ))
            {
                if (( node_ptr->reset_cause_query_active == false ) &&
                    ( node_ptr->reset_cause_query_done   == false ))
                {
                    if ( ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__RESTART_CAUSE ) != PASS )
                    {
                        elog ("%s %s send failed\n", node_ptr->hostname.c_str(),
                                  getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                        mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                    }
                    else
                    {
                        dlog ("%s %s\n", node_ptr->hostname.c_str(),
                                  getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                        mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                        node_ptr->reset_cause_query_active = true ;
                    }
                }
                else if (( node_ptr->reset_cause_query_active == true ) &&
                         ( node_ptr->reset_cause_query_done   == false ))
                {
                    if ( ( rc = ipmi_command_recv ( node_ptr ) ) == RETRY )
                    {
                        mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                    }
                    else if ( rc != PASS )
                    {
                        elog ("%s %s command failed\n", node_ptr->hostname.c_str(),
                                  getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                        node_ptr->reset_cause_query_active = false ;
                        mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                        node_ptr->ipmitool_thread_ctrl.done = true ;
                    }
                    else
                    {
                        node_ptr->reset_cause_query_active = false ;
                        node_ptr->reset_cause_query_done   = true ;
                        node_ptr->ipmitool_thread_ctrl.done = true ;
                        ilog ("%s %s\n", node_ptr->hostname.c_str(),
                                  node_ptr->ipmitool_thread_info.data.c_str());
                    }
                    node_ptr->ipmitool_thread_ctrl.done = true ;
                }
                else if (( node_ptr->mc_info_query_done      == true ) &&
                         ( node_ptr->reset_cause_query_done  == true ) &&
                         ( node_ptr->power_status_query_done == false ))
                {
                    if ( node_ptr->power_status_query_active == false )
                    {
                        if ( ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_STATUS ) != PASS )
                        {
                            elog ("%s %s send failed\n", node_ptr->hostname.c_str(),
                                      getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                            mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                        }
                        else
                        {
                            dlog ("%s %s\n", node_ptr->hostname.c_str(),
                                      getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                            mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                            node_ptr->power_status_query_active = true ;
                        }
                    }
                    else if ( node_ptr->power_status_query_done == false )
                    {
                        if ( ( rc = ipmi_command_recv ( node_ptr ) ) == RETRY )
                        {
                            mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                        }
                        else if ( rc )
                        {
                            node_ptr->power_status_query_active = false ;
                            mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                            node_ptr->ipmitool_thread_ctrl.done = true ;
                        }
                        else
                        {
                            node_ptr->power_status_query_active = false ;
                            node_ptr->power_status_query_done   = true ;
                            node_ptr->ipmitool_thread_ctrl.done = true ;
                            node_ptr->ipmitool_thread_info.command = 0 ;
                            node_ptr->bm_accessible = true ;
                            mtcTimer_reset ( node_ptr->bmc_access_timer );

                            ilog ("%s %s\n", node_ptr->hostname.c_str(),
                                      node_ptr->ipmitool_thread_info.data.c_str());
                            plog ("%s bmc is accessible\n", node_ptr->hostname.c_str());

                            if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_OFF_STATUS) != std::string::npos )
                            {
                                if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
                                {
                                    availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
                                }
                                else
                                {
                                    wlog ("%s is powered off while in the unlocked state\n", node_ptr->hostname.c_str());
                                    availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
                                }
                            }
                        }
                        node_ptr->ipmitool_thread_ctrl.done = true ;
                    }
                }
            }
        }

        if ( node_ptr->bm_ping_info.ok == false )
        {
            /* Auto correct key ping information ; should never occur but if it does ... */
            if (( node_ptr->bm_ping_info.hostname.empty()) || ( node_ptr->bm_ping_info.ip.empty()))
            {
                /* if the bm ip is not yet learned then this log will flood */
                //slog ("%s host ping info missing ; (%d:%d)\n",
                //          node_ptr->hostname.c_str(),
                //          node_ptr->bm_ping_info.hostname.empty(),
                //          node_ptr->bm_ping_info.ip.empty());
                node_ptr->bm_ping_info.hostname = node_ptr->hostname ;
                node_ptr->bm_ping_info.ip       = node_ptr->bm_ip ;
            }
        }

        /* don't run the ping monitor if the ip address is invalid */
        if ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ping_info.ip ) == true )
        {
            pingUtil_acc_monitor ( node_ptr->bm_ping_info );
        }
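
        /* BM Access alarm model, summarized from the code below: while the
         * BMC is inaccessible a 2 minute grace timer runs ; when it expires
         * a warning alarm is raised (only if not already raised) and the
         * timer is restarted. The alarm is cleared once all three BMC
         * queries above have completed. */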
        /* Manage the Board Management Access Alarm */
        if (( node_ptr->bm_accessible == false ) &&
            ( mtcTimer_expired ( node_ptr->bmc_access_timer ) == true ))
        {
            node_ptr->bm_ping_info.ok = false ;

            node_ptr->bm_ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ;

            /* start a timer that will raise the BM Access alarm
             * if we are not accessible by the time it expires */
            plog ("%s bmc access timer started (%d secs)\n", node_ptr->hostname.c_str(), MTC_MINS_2);
            mtcTimer_reset ( node_ptr->bmc_access_timer );
            mtcTimer_start ( node_ptr->bmc_access_timer, mtcTimer_handler, MTC_MINS_2 );

            if ( node_ptr->alarms[MTC_ALARM_ID__BM] == FM_ALARM_SEVERITY_CLEAR )
            {
                mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__BM );
                node_ptr->alarms[MTC_ALARM_ID__BM] = FM_ALARM_SEVERITY_WARNING ;
            }
        }

        /* if the BM is accessible then see if we need to clear the BM Access alarm. */
        else if (( node_ptr->alarms[MTC_ALARM_ID__BM] != FM_ALARM_SEVERITY_CLEAR ) &&
                 ( node_ptr->mc_info_query_done      == true ) &&
                 ( node_ptr->reset_cause_query_done  == true ) &&
                 ( node_ptr->power_status_query_done == true ))
        {
            mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
            node_ptr->alarms[MTC_ALARM_ID__BM] = FM_ALARM_SEVERITY_CLEAR ;
        }
    }
    else
    {
        if ( node_ptr->alarms[MTC_ALARM_ID__BM] != FM_ALARM_SEVERITY_CLEAR )
        {
            mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
            node_ptr->alarms[MTC_ALARM_ID__BM] = FM_ALARM_SEVERITY_CLEAR ;
        }
    }

    return (PASS);
}
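
/* Out-Of-Service Test Handler
 * ---------------------------
 * Periodic audit FSM that runs for every host. In brief, each cycle it
 * re-asserts the admin/oper/avail states to the database when no FSM
 * action is in progress, re-sends the 'locked' notification to hosts that
 * have not acknowledged it, and drives the optional FIT and goenabled
 * audit hooks. */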
int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
{
    switch (node_ptr->oosTestStage)
    {
        case MTC_OOS_TEST__LOAD_NEXT_TEST:
        {
            oosTestStageChange ( node_ptr, MTC_OOS_TEST__START_WAIT );
            break ;
        }
        case MTC_OOS_TEST__START_WAIT:
        {
            /* Monitor timer errors */
            mtcTimer_dump_data ();

            // blog ("%s Inservice Test Period %d secs\n", node_ptr->hostname.c_str(), oos_test_period);
            mtcTimer_start ( node_ptr->oosTestTimer, mtcTimer_handler, oos_test_period );
            oosTestStageChange ( node_ptr, MTC_OOS_TEST__WAIT );

#ifdef WANT_FIT_TESTING
            if ( daemon_want_fit ( FIT_CODE__CORRUPT_TOKEN, node_ptr->hostname ))
                tokenUtil_fail_token ();

            else if ( daemon_want_fit ( FIT_CODE__STUCK_TASK, node_ptr->hostname ))
                mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_INPROGRESS);

            else if ( daemon_want_fit ( FIT_CODE__STOP_HOST_SERVICES, node_ptr->hostname ))
            {
                bool start = false ;
                this->launch_host_services_cmd ( node_ptr, start );
            }
            else if ( daemon_want_fit ( FIT_CODE__START_HOST_SERVICES, node_ptr->hostname ))
            {
                if (( node_ptr->start_services_needed == false ) &&
                    ( node_ptr->start_services_running_main == false ))
                {
                    node_ptr->start_services_needed  = true ;
                    node_ptr->start_services_retries = 0 ;
                }
                else
                {
                    ilog ("%s start host services (FIT) rejected (%d:%d)\n",
                              node_ptr->hostname.c_str(),
                              node_ptr->start_services_needed,
                              node_ptr->start_services_running_main);
                }
            }
#endif

            /* Avoid forcing the states to the database on the first and second pass.
             * This is because it is likely we just read all the states and
             * if coming out of a DOR or a SWACT we don't need to unnecessarily
             * produce that extra sysinv traffic.
             * Also, there is no point forcing the states while there is an admin
             * action, an enable or a graceful recovery going on because state
             * changes are being done in the FSM already */
            if (( node_ptr->oos_test_count > 1 ) &&
                ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
                ( !node_ptr->handlerStage.raw ) &&
                ( !node_ptr->recoveryStage ))
            {
                /* Change the oper and avail states in the database */
                allStateChange ( node_ptr, node_ptr->adminState,
                                           node_ptr->operState,
                                           node_ptr->availStatus );
            }

#ifdef WANT_CLEAR_ALARM_AUDIT

            /* TODO: Obsolete with new Alarm Strategy */
            /* Self Correct Stuck Failure Alarms */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                 ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
            {
                if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR )
                {
                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
                    node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
                }
                alarm_enabled_clear ( node_ptr , false);
            }
#endif
            /* Make sure the locked status on the host itself is set */
            if (( node_ptr->adminState  == MTC_ADMIN_STATE__LOCKED ) &&
                ( node_ptr->operState   == MTC_OPER_STATE__DISABLED ) &&
                ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) &&
                ( !(node_ptr->mtce_flags & MTC_FLAG__I_AM_LOCKED) ))
            {
                ilog ("%s setting 'locked' status\n", node_ptr->hostname.c_str());

                /* Tell the host that it is locked */
                send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE);
            }

            if (( daemon_is_file_present ( MTC_CMD_FIT__GOENABLE_AUDIT )) &&
                ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /* Request Out-Of-Service test execution */
                send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE );
                if ( node_ptr->operState_subf == MTC_OPER_STATE__ENABLED)
                {
                    send_mtc_cmd ( node_ptr->hostname, MTC_REQ_SUBF_GOENABLED, MGMNT_INTERFACE );
                }
            }

            break ;
        }
        case MTC_OOS_TEST__WAIT:
        {
            if ( node_ptr->oosTestTimer.ring == true )
            {
                oosTestStageChange ( node_ptr, MTC_OOS_TEST__DONE );
            }
            break ;
        }
        case MTC_OOS_TEST__DONE:
        default:
        {
            node_ptr->oos_test_count++ ;
            oosTestStageChange ( node_ptr, MTC_OOS_TEST__LOAD_NEXT_TEST );

            /* clear out the retry counter periodically */
            node_ptr->http_retries_cur = 0 ;

            break ;
        }
    }
    return (PASS);
}


int local_counter = 0 ;
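
/* In-Service Test Handler
 * -----------------------
 * Periodic audit FSM for unlocked-enabled hosts. In brief, it manages the
 * in-service and compute subfunction alarms, launches posted 'start host
 * services' work (main function first, then the subfunction on AIO
 * controllers), tracks the SM degrade flag and acts on the
 * 'I am not healthy' mtce flag. */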
int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
{
    switch (node_ptr->insvTestStage)
    {
        case MTC_INSV_TEST__START:
        {
            mtcTimer_reset ( node_ptr->insvTestTimer );

            /* Run the inservice test more frequently while
             * start_services_needed is true and we are not
             * in failure retry mode */
            if (( node_ptr->start_services_needed == true ) &&
                ( node_ptr->hostservices_failed == false ) &&
                ( node_ptr->hostservices_failed_subf == false ))
            {
                mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, MTC_SECS_2 );
            }
            else
            {
                mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, insv_test_period );
            }
            insvTestStageChange ( node_ptr, MTC_INSV_TEST__WAIT );
            break ;
        }
        case MTC_INSV_TEST__WAIT:
        {
            if ( node_ptr->insvTestTimer.ring == true )
            {
                insvTestStageChange ( node_ptr, MTC_INSV_TEST__RUN );
            }
            /* manage degrade state and alarms */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /************************************************************
                 *               Manage In-Service Alarms                   *
                 ***********************************************************/

                /* Manage Inservice Enable Alarm */
                if ( node_ptr->hostservices_failed )
                {
                    alarm_insv_failure ( node_ptr );
                }
                else
                {
                    alarm_insv_clear ( node_ptr, false );
                }

                /* Manage Compute Subfunction Failure Alarm */
                if ( node_ptr->hostservices_failed_subf )
                {
                    alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
                }
                else
                {
                    alarm_compute_clear ( node_ptr, false );
                }
            }
            break ;
        }
        case MTC_INSV_TEST__RUN:
        {

#ifdef WANT_FIT_TESTING

            daemon_load_fit ();

            if ( daemon_want_fit ( FIT_CODE__UNLOCK_HOST, node_ptr->hostname ) )
            {
                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__UNLOCK );
            }

            if ( daemon_want_fit ( FIT_CODE__LOCK_HOST, node_ptr->hostname ) )
            {
                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__LOCK );
            }

            if ( daemon_want_fit ( FIT_CODE__FORCE_LOCK_HOST, node_ptr->hostname ) )
            {
                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__FORCE_LOCK );
            }

            if (( daemon_want_fit ( FIT_CODE__DO_NOTHING_THREAD, node_ptr->hostname )) ||
                ( daemon_want_fit ( FIT_CODE__STRESS_THREAD    , node_ptr->hostname )))
            {
                node_ptr->ipmitool_thread_ctrl.stage = THREAD_STAGE__IGNORE ;
                node_ptr->ipmitool_thread_ctrl.id = true ;
                node_ptr->ipmitool_thread_info.id = true ;
                node_ptr->ipmitool_thread_info.command = IPMITOOL_THREAD_CMD__POWER_STATUS ;

                /* Update / Setup the BMC access credentials */
                node_ptr->thread_extra_info.bm_ip   = node_ptr->bm_ip ;
                node_ptr->thread_extra_info.bm_un   = node_ptr->bm_un ;
                node_ptr->thread_extra_info.bm_pw   = node_ptr->bm_pw ;
                node_ptr->thread_extra_info.bm_type = node_ptr->bm_type ;
                node_ptr->ipmitool_thread_info.extra_info_ptr = &node_ptr->thread_extra_info ;
                if ( thread_launch_thread ( mtcThread_ipmitool, &node_ptr->ipmitool_thread_info ) == 0 )
                {
                    slog ("%s FIT launching mtcThread_ipmitool power query thread ; launch failed\n", node_ptr->hostname.c_str());
                }
                else
                {
                    slog ("%s FIT launching mtcThread_ipmitool power query thread\n", node_ptr->hostname.c_str());
                }
                node_ptr->ipmitool_thread_ctrl.done = true ;
            }

#endif

            /* Manage active controller auto recovery bool.
             * If the inactive controller is inservice then disable
             * controller autorecovery. Otherwise enable it but in this case
             * don't change the disable bool as that is used to gate auto
             * recovery once the threshold is reached */
            if ( is_controller ( node_ptr ) && NOT_THIS_HOST )
            {
                if (( this->autorecovery_enabled == true ) &&
                    ( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
                {
                    autorecovery_clear ( CONTROLLER_0 );
                    autorecovery_clear ( CONTROLLER_1 );
                    this->autorecovery_enabled  = false ;
                    this->autorecovery_disabled = false ;
                }
                else if (( this->autorecovery_enabled == false ) &&
                         ( node_ptr->operState != MTC_OPER_STATE__ENABLED ))
                {
                    this->autorecovery_enabled = true ;
                }
            }

            /* Monitor the health of the host - no pass file */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /************************************************************
                 * Prevent the start host services from running while in DOR
                 ***********************************************************/
                if ( node_ptr->dor_recovery_mode == true )
                {
                    /* wait longer for the host to boot up */
                    wlog ("%s DOR recovery active ; waiting on host\n",
                              node_ptr->hostname.c_str());
                }
                else if ( this->dor_mode_active == true )
                {
                    ilog_throttled ( this->dor_mode_active_log_throttle, 20,
                                     "DOR mode active\n");
                }

                /*************************************************************
                 * Handle Start Host Services if it's posted for execution
                 ************************************************************/
                else if ( node_ptr->start_services_needed == true )
                {
                    /* If Main Start Host Services is not already running then launch it */
                    if (( node_ptr->start_services_running_main == false ) &&
                        ( node_ptr->start_services_running_subf == false ))
                    {
                        bool start = true ;
                        if ( this->launch_host_services_cmd ( node_ptr , start ) != PASS )
                        {
                            node_ptr->hostservices_failed = true ;
                            node_ptr->start_services_retries++ ;
                        }
                        else
                        {
                            node_ptr->start_services_running_main = true ;
                        }
                    }
                    /* Handle start host services response for both main and
                     * subfunction levels */
                    else
                    {
                        /* Wait for host services to complete - pass or fail.
                         * The host_services_handler manages timeout. */
                        int rc = this->host_services_handler ( node_ptr );
                        if ( rc == RETRY )
                        {
                            /* wait for the mtcClient's response ... */
                            break ;
                        }

                        node_ptr->start_services_running_main = false ;

                        if ( rc != PASS )
                        {
                            /* set the correct failed flag */
                            if ( node_ptr->start_services_needed_subf == true )
                            {
                                node_ptr->start_services_running_subf = false ;
                                node_ptr->hostservices_failed_subf = true ;
                            }
                            else
                            {
                                node_ptr->hostservices_failed = true ;
                            }

                            node_ptr->start_services_retries++ ;

                            wlog ("%s %s request failed ; (retry %d)\n",
                                      node_ptr->hostname.c_str(),
                                      node_ptr->host_services_req.name.c_str(),
                                      node_ptr->start_services_retries);
                        }
                        else /* success path */
                        {
                            /* clear the correct fail flag */
                            if (( node_ptr->start_services_needed_subf == true ) &&
                                ( node_ptr->start_services_running_subf == true ))
                            {
                                node_ptr->start_services_needed_subf  = false ;
                                node_ptr->start_services_running_subf = false ;
                                node_ptr->hostservices_failed_subf    = false ;
                            }
                            else
                            {
                                node_ptr->hostservices_failed = false ;
                            }

                            /*************************************************
                             * Handle running the subfunction start compute
                             * host services command as a background operation
                             * after the controller start result has come in
                             * as a PASS.
                             ************************************************/
                            if ( node_ptr->start_services_needed_subf == true )
                            {
                                bool start = true ;
                                bool subf  = node_ptr->start_services_needed_subf ;
                                if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
                                {
                                    node_ptr->hostservices_failed_subf = true ;

                                    /* try again on next audit */
                                    node_ptr->start_services_retries++ ;
                                }
                                else
                                {
                                    node_ptr->start_services_running_subf = true ;
                                }
                            }
                            else
                            {
                                /* All host service scripts pass ; done */
                                clear_hostservices_ctls ( node_ptr );
                                node_ptr->hostservices_failed_subf = false ;
                                node_ptr->hostservices_failed      = false ;
                            }
                        }
                    }
                }

                if ( NOT_THIS_HOST )
                {
                    if ((( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                         ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )) &&
                        (!(node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY) &&
                         !(node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)))
                    {
                        if ( node_ptr->unknown_health_reported == false )
                        {
                            wlog ( "%s has UNKNOWN HEALTH\n", node_ptr->hostname.c_str());
                            node_ptr->unknown_health_reported = true ;
                        }
                    }
                }
            }
            /** Manage the subfunction goenabled alarm over a mtcAgent restart
             *  In the restart case the subfunction fsm enable handler is not run so
             *  we try to detect the missing goenabled_subf flag as an inservice test.
             *
             *  Only in CPE type
             *  - clear the alarm if the issue goes away -
             *    i.e. the goenabled tests eventually pass. Today
             *    they are not re-run in the background but someday they may be
             *  - raise the alarm and go degraded if the goEnabled_subf flag is not set
             *    and we have only a single enabled controller (which must be this one)
             *    and the alarm is not already raised.
             **/
            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                    ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )) /* handle initial install case */
                {
                    if (( node_ptr->goEnabled_subf           == true  ) &&
                        ( node_ptr->inservice_failed_subf    == false ) &&
                        ( node_ptr->goEnabled_failed_subf    == false ) &&
                        ( node_ptr->hostservices_failed_subf == false ))
                    {
                        if ( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] != FM_ALARM_SEVERITY_CLEAR )
                        {
                            alarm_compute_clear ( node_ptr, false );
                            ilog ("%s cleared alarm %s due to failure recovery (degrade:%x)\n",
                                      node_ptr->hostname.c_str(),
                                      mtcAlarm_getId_str(MTC_ALARM_ID__CH_COMP).c_str(),
                                      node_ptr->degrade_mask);

                            if ( node_ptr->degrade_mask == 0 )
                            {
                                allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
                                                           MTC_OPER_STATE__ENABLED,
                                                           MTC_AVAIL_STATUS__AVAILABLE );

                                subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED,
                                                            MTC_AVAIL_STATUS__AVAILABLE );

                                /* Inform the VIM that this host is enabled */
                                mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
                            }
                        }
                    }
                    /*
                     * Send out-of-service test command and wait for the
                     * next audit interval to see the result.
                     *
                     *   node_ptr->goEnabled_subf == true is pass
                     *   node_ptr->goEnabled_subf_failed == true is fail
                     *
                     **/
                    if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) &&
                        ( this->autorecovery_disabled == false ) &&
                        ( node_ptr->start_services_needed == false ))
                    {
                        if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) &&
                            ( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ))
                        {
                            if (( node_ptr->inservice_failed_subf == false ) &&
                                ( node_ptr->hostservices_failed_subf == false ))
                            {
                                ilog ("%s-compute ... running recovery enable\n", node_ptr->hostname.c_str());

                                alarm_compute_clear ( node_ptr, true );

                                enableStageChange ( node_ptr, MTC_ENABLE__START );
                                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__ENABLE_SUBF );
                            }
                            else
                            {
                                ilog ("%s-compute subfunction is unlocked-disabled (non-operational)\n",
                                          node_ptr->hostname.c_str());
                            }
                        }
                        else
                        {
                            ilog ("%s-compute ... waiting on current goEnable completion\n", node_ptr->hostname.c_str() );
                        }
                    }
                }
                /* Only raise this alarm while in simplex */
                if (( num_controllers_enabled() < 2 ) &&
                    (( node_ptr->goEnabled_failed_subf    == true ) ||
                     ( node_ptr->inservice_failed_subf    == true ) ||
                     ( node_ptr->hostservices_failed_subf == true )))
                {
                    if ( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] == FM_ALARM_SEVERITY_CLEAR )
                    {
                        wlog ("%s insv test detected subfunction failure ; degrading host\n",
                                  node_ptr->hostname.c_str());

                        alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_MAJOR );

                        allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
                                                   MTC_OPER_STATE__ENABLED,
                                                   MTC_AVAIL_STATUS__DEGRADED );

                        subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
                                                    MTC_AVAIL_STATUS__FAILED );
                    }
                }
            }

            /* Monitor the health of the host - no pass file */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                 ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
            {
                /* Manage asserting degrade due to Software Management */
                if (( node_ptr->mtce_flags & MTC_FLAG__SM_DEGRADED ) &&
                    ( !(node_ptr->degrade_mask & DEGRADE_MASK_SM)))
                {
                    /* set the SM degrade flag in the mask */
                    node_ptr->degrade_mask |= DEGRADE_MASK_SM ;

                    ilog ("%s sm degrade\n", node_ptr->hostname.c_str());
                }

                /* Manage de-asserting degrade due to Software Management */
                if ((!(node_ptr->mtce_flags & MTC_FLAG__SM_DEGRADED)) &&
                      (node_ptr->degrade_mask & DEGRADE_MASK_SM))
                {
                    /* clear the SM degrade flag */
                    node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ;

                    ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
                }

                if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
                {
                    /* not healthy .... */
                    if ( THIS_HOST )
                    {
                        /* initial config is complete and last manifest apply failed ... */
                        if (( daemon_is_file_present ( CONFIG_COMPLETE_FILE )) &&
                            ( daemon_is_file_present ( CONFIG_FAIL_FILE )))
                        {
                            wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*3), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
                            if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
                            {
                                node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;

                                /* threshold is reached so raise the config alarm if it is not already raised */
                                if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
                                {
                                    mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
                                    node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
                                }
                            }
                        }
                    }
                    else
                    {
                        if ( ++node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
                        {
                            elog ( "%s is UNHEALTHY ; forcing re-enable\n",
                                       node_ptr->hostname.c_str());

                            force_full_enable ( node_ptr ) ;
                        }
                        else
                        {
                            wlog ( "%s is UNHEALTHY (cnt:%d)\n",
                                       node_ptr->hostname.c_str(),
                                       node_ptr->health_threshold_counter );
                        }
                    }
                }
                else
                {
                    node_ptr->health_threshold_counter = 0 ;
                }
            }

            node_ptr->insv_test_count++ ;
            insvTestStageChange ( node_ptr, MTC_INSV_TEST__START );

            break ;
        }
        default:
        {
            node_ptr->insv_test_count++ ;
            insvTestStageChange ( node_ptr, MTC_INSV_TEST__START );
            break ;
        }
    }
    return (PASS);
}

/************************************************************
 * Manage host degrade state based on degrade mask          *
 * The availability state of degrade only applies when the  *
 * host is unlocked-enabled.                                 *
 ***********************************************************/
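/* Illustrative example (values assumed for clarity): a host carrying only
 * the SM degrade reason has degrade_mask == DEGRADE_MASK_SM and is held in
 * 'unlocked-enabled-degraded' ; once that bit clears the mask returns to
 * DEGRADE_MASK_NONE and the handler below restores 'available'. */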
int nodeLinkClass::degrade_handler ( struct nodeLinkClass::node * node_ptr )
{
    if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
        ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
    {
        if (( node_ptr->degrade_mask == DEGRADE_MASK_NONE ) &&
            ( node_ptr->availStatus  == MTC_AVAIL_STATUS__DEGRADED ))
        {
            availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
        }

        else if (( node_ptr->degrade_mask ) &&
                 ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ))
        {
            availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
        }
    }
    return (PASS);
}
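
/* Configuration Change Handler
 * ----------------------------
 * Tracks root account shadow entry changes. In brief: query sysinv for the
 * stored 'root_sig' (SHOW), compare it against an md5 signature computed
 * from the local shadow file entry, and if they differ push the new
 * signature with a MODIFY whose response is then read back and verified. */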
int nodeLinkClass::cfg_handler ( struct nodeLinkClass::node * node_ptr )
{
    int rc = PASS ;
    switch (node_ptr->configStage )
    {
        case MTC_CONFIG__START:
        {
            ilog ("%s Starting a %s:%s shadow entry change check\n",
                      node_ptr->hostname.c_str(),
                      SHADOW_FILE,
                      USERNAME_ROOT );

            /* Post the show command with a catch-all timeout timer */
            rc = mtcInvApi_cfg_show ( node_ptr->hostname ) ;
            if ( rc )
            {
                elog ("%s Config SHOW command failed\n", node_ptr->hostname.c_str() );
                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
            }
            else
            {
                mtcTimer_start ( node_ptr->mtcConfig_timer, mtcTimer_handler, (sysinv_timeout+1) );
                configStageChange ( node_ptr, MTC_CONFIG__SHOW );
            }
            break ;
        }
        case MTC_CONFIG__SHOW:
        {
            /* timeout yet ? */
            if ( node_ptr->mtcConfig_timer.ring == true )
            {
                elog ("%s timeout\n", node_ptr->cfgEvent.log_prefix.c_str());
                configStageChange ( node_ptr, MTC_CONFIG__TIMEOUT );
                break ;
            }

            /* done yet ? */
            rc = doneQueue_dequeue ( node_ptr->cfgEvent ) ;
            if ( rc == RETRY )
            {
                /* Still waiting */
                break ;
            }
            else if ( rc == PASS )
            {
                string temp_value = "" ;
                mtcTimer_stop ( node_ptr->mtcConfig_timer );
                node_ptr->cfgEvent.value = "" ;
                node_ptr->cfgEvent.uuid  = "" ;
                if (( rc = jsonUtil_get_array_idx ( (char*)node_ptr->cfgEvent.response.data(), "iusers", 0 , temp_value )) == PASS )
                {
                    jsonUtil_get_key_val ( (char*)temp_value.data(), "root_sig", node_ptr->cfgEvent.value);
                }

                if ( node_ptr->cfgEvent.value.empty() ||
                    !node_ptr->cfgEvent.value.compare("null") || rc )
                {
                    elog ("%s null or missing 'root_sig' value (%d:%s)\n",
                              node_ptr->cfgEvent.service.c_str(), rc,
                              node_ptr->cfgEvent.value.empty() ? "empty" : node_ptr->cfgEvent.value.c_str());

                    node_ptr->cfgEvent.status = FAIL_INVALID_DATA ;
                    configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                    break;
                }

                ilog ("%s root_sig:%s\n", node_ptr->cfgEvent.log_prefix.c_str(),
                          node_ptr->cfgEvent.value.c_str());

                dlog ("Database Signature: %s\n", node_ptr->cfgEvent.value.c_str());

                /*
                 * generate an md5 signature for this user's Shadow entry.
                 * We will do so for the entire entry as either the password
                 * or the password age may change and we need to track and notify
                 * for both.
                 */
                char cfgInfo[1024] = {0};
                node_ptr->cfgEvent.key = get_shadow_signature ( (char*)SHADOW_FILE , (char*)USERNAME_ROOT,
                                                                &cfgInfo[0], sizeof(cfgInfo));
                node_ptr->cfgEvent.information = cfgInfo;

                if ( node_ptr->cfgEvent.key.empty() )
                {
                    elog ("failed to get md5sum of username '%s' from '%s'\n", USERNAME_ROOT, SHADOW_FILE );
                    node_ptr->cfgEvent.status = FAIL_INVALID_DATA ;
                    configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                    break ;
                }

                dlog ("File Signature : %s\n", node_ptr->cfgEvent.key.c_str());
                if ( node_ptr->cfgEvent.key.compare(node_ptr->cfgEvent.value))
                {
                    bool install = false ;
                    if ( node_ptr->configAction == MTC_CONFIG_ACTION__INSTALL_PASSWD )
                    {
                        install = true ;
                        ilog ("%s shadow file hash and aging ... install config\n", USERNAME_ROOT );
                    }
                    else
                    {
                        ilog ("%s shadow entry has changed ... updating config\n", USERNAME_ROOT );
                        ilog ("... old signature - %s\n", node_ptr->cfgEvent.value.c_str());
                        ilog ("... new signature - %s\n", node_ptr->cfgEvent.key.c_str());
                    }

                    if ((rc = jsonUtil_get_array_idx ( (char*)node_ptr->cfgEvent.response.data(), "iusers", 0 , temp_value )) == PASS )
                    {
                        jsonUtil_get_key_val ( (char*)temp_value.data(), "uuid", node_ptr->cfgEvent.uuid);
                    }

                    if ( rc || node_ptr->cfgEvent.uuid.empty() || !node_ptr->cfgEvent.uuid.compare("null"))
                    {
                        elog ("%s null or missing reconfig 'uuid' (%d:%s)\n",
                                  node_ptr->cfgEvent.service.c_str(), rc,
                                  node_ptr->cfgEvent.uuid.empty() ? "empty" : node_ptr->cfgEvent.uuid.c_str());
                        return ( FAIL_INVALID_DATA );
                    }
                    ilog ("%s uuid:%s\n",
                              node_ptr->cfgEvent.log_prefix.c_str(),
                              node_ptr->cfgEvent.uuid.c_str());

                    /* Post the modify command */
                    rc = mtcInvApi_cfg_modify ( node_ptr->hostname, install ) ;
                    if ( rc )
                    {
                        elog ("%s Config MODIFY command failed\n", node_ptr->hostname.c_str() );
                        configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                    }
                    else
                    {
                        mtcTimer_start ( node_ptr->mtcConfig_timer, mtcTimer_handler, (sysinv_timeout+1) );
                        configStageChange ( node_ptr, MTC_CONFIG__MODIFY );
                    }
                }
                else
                {
                    ilog ("%s shadow entry has not changed (%s)\n",
                              USERNAME_ROOT, node_ptr->cfgEvent.key.c_str());
                    configStageChange ( node_ptr, MTC_CONFIG__DONE );
                }
            }
            else
            {
                elog ("%s failed (%d:%d)\n", node_ptr->cfgEvent.log_prefix.c_str(), rc,
                          node_ptr->cfgEvent.status );
                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
            }
            break ;
        }

        case MTC_CONFIG__MODIFY:
        {
            /* timeout yet ? */
            if ( node_ptr->mtcConfig_timer.ring == true )
            {
                elog ("%s timeout\n", node_ptr->cfgEvent.log_prefix.c_str());
                configStageChange ( node_ptr, MTC_CONFIG__TIMEOUT );
                break ;
            }

            /* done yet ? */
            rc = doneQueue_dequeue ( node_ptr->cfgEvent ) ;
            if ( rc == RETRY )
            {
                /* Still waiting */
                break ;
            }
            else if ( rc == PASS )
            {
                mtcTimer_stop ( node_ptr->mtcConfig_timer );
                if ( node_ptr->cfgEvent.response_len )
                {
                    configStageChange ( node_ptr, MTC_CONFIG__VERIFY );
                }
                else
                {
                    elog ("%s modify without response (%d:%d)\n",
                              node_ptr->cfgEvent.log_prefix.c_str(), rc,
                              node_ptr->cfgEvent.status );
                    configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                }
            }
            else
            {
                elog ("%s modify failed (%d:%d)\n",
                          node_ptr->cfgEvent.log_prefix.c_str(), rc,
                          node_ptr->cfgEvent.status );
                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
            }
            break ;
        }
        case MTC_CONFIG__VERIFY:
        {
            node_ptr->cfgEvent.value = "" ;
            rc = jsonUtil_get_key_val ( (char*)node_ptr->cfgEvent.response.data(),
                                        "root_sig", node_ptr->cfgEvent.value);
            if ( node_ptr->cfgEvent.value.empty() ||
                !node_ptr->cfgEvent.value.compare("null") || rc )
            {
                elog ("%s null or missing 'root_sig' value (%d:%s)\n",
                          node_ptr->cfgEvent.service.c_str(), rc,
                          node_ptr->cfgEvent.value.empty() ? "empty" : node_ptr->cfgEvent.value.c_str());

                node_ptr->cfgEvent.status = FAIL_INVALID_DATA ;
                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                break;
            }

            if ( node_ptr->cfgEvent.key.compare(node_ptr->cfgEvent.value))
            {
                elog ("%s root_sig modify compare failed\n",
                          node_ptr->cfgEvent.log_prefix.c_str());
                wlog ("... database signature - %s\n", node_ptr->cfgEvent.value.c_str());
                wlog ("... file signature - %s\n", node_ptr->cfgEvent.key.c_str());

                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
            }
            else
            {
                ilog ("%s modify succeeded\n", node_ptr->cfgEvent.log_prefix.c_str());
                configStageChange ( node_ptr, MTC_CONFIG__DONE );
            }
            break ;
        }
        case MTC_CONFIG__FAILURE:
        {
            elog ("%s Command Failure\n", node_ptr->cfgEvent.log_prefix.c_str());

            /* Call to remove this command from the work queue ; if it exists */
            workQueue_del_cmd ( node_ptr, node_ptr->cfgEvent.sequence );

            configStageChange ( node_ptr, MTC_CONFIG__DONE );
            break ;
        }
        case MTC_CONFIG__TIMEOUT:
        {
            elog ("%s Command Timeout\n", node_ptr->cfgEvent.log_prefix.c_str());

            /* Call to remove this command from the work queue ; if it exists */
            workQueue_del_cmd ( node_ptr, node_ptr->cfgEvent.sequence );

            node_ptr->oper_failures++ ;
            mtcHttpUtil_free_conn ( node_ptr->cfgEvent );
            mtcHttpUtil_free_base ( node_ptr->cfgEvent );

            configStageChange ( node_ptr, MTC_CONFIG__DONE );
            break ;
        }
        case MTC_CONFIG__DONE:
        default:
        {
            if (( node_ptr->configAction == MTC_CONFIG_ACTION__INSTALL_PASSWD ) ||
                ( node_ptr->configAction == MTC_CONFIG_ACTION__CHANGE_PASSWD ))
            {
                /* We are done */
                node_ptr->configAction = MTC_CONFIG_ACTION__NONE ;
            }
            if ( node_ptr->configAction == MTC_CONFIG_ACTION__CHANGE_PASSWD_AGAIN )
            {
                /* Run the FSM again */
                node_ptr->configAction = MTC_CONFIG_ACTION__CHANGE_PASSWD ;
            }
            node_ptr->configStage = MTC_CONFIG__START ;
            break ;
        }
    }
    return (PASS);
}