Merge "Improve DOR Recovery banner to include all hosts and their status"
This commit is contained in:
commit
73e3241b6d
@ -159,8 +159,8 @@ typedef struct
|
||||
int work_queue_timeout ; /**< end of action workq complete TO */
|
||||
int loc_recovery_timeout ; /**< loss of comms recovery timeout */
|
||||
int node_reinstall_timeout ; /**< node reinstall timeout */
|
||||
int dor_mode_detect ; /**< dead office recovery detect thld*/
|
||||
int dor_mode_timeout ; /**< dead office recovery timeout */
|
||||
int dor_recovery_timeout_ext ; /**< dor recovery timeout extension */
|
||||
int uptime_period ; /**< Uptime refresh timer period */
|
||||
int online_period ; /**< locked availability refresh */
|
||||
int insv_test_period ; /**< insv test period in secs */
|
||||
|
@ -203,8 +203,9 @@ typedef enum
|
||||
|
||||
#define DEFAULT_MTCALIVE_TIMEOUT (1200)
|
||||
#define DEFAULT_GOENABLE_TIMEOUT (300)
|
||||
#define DEFAULT_DOR_MODE_TIMEOUT (20)
|
||||
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
|
||||
#define DEFAULT_DOR_MODE_TIMEOUT (MTC_MINS_15)
|
||||
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (MTC_MINS_20)
|
||||
#define DEFAULT_DOR_MODE_DETECT (MTC_MINS_20)
|
||||
#define DEFAULT_POWER_OFF_RETRY_WAIT (30)
|
||||
|
||||
/** TODO: Convert names to omit JSON part */
|
||||
@ -962,6 +963,8 @@ typedef enum
|
||||
MTC_ADD__MTC_SERVICES,
|
||||
MTC_ADD__CLEAR_TASK,
|
||||
MTC_ADD__WORKQUEUE_WAIT,
|
||||
MTC_ADD__HEARTBEAT_WAIT,
|
||||
MTC_ADD__HEARTBEAT_SOAK,
|
||||
MTC_ADD__DONE,
|
||||
MTC_ADD__STAGES
|
||||
} mtc_addStages_enum ;
|
||||
|
@ -45,6 +45,7 @@
|
||||
#define MTC_MINS_5 (300)
|
||||
#define MTC_MINS_8 (480)
|
||||
#define MTC_MINS_10 (600)
|
||||
#define MTC_MINS_14 (840)
|
||||
#define MTC_MINS_15 (900)
|
||||
#define MTC_MINS_20 (1200)
|
||||
#define MTC_MINS_30 (1800)
|
||||
@ -71,8 +72,7 @@
|
||||
#define MTC_BM_POWERON_TIMEOUT (30)
|
||||
#define MTC_RESET_PROG_TIMEOUT (20)
|
||||
#define MTC_WORKQUEUE_TIMEOUT (60)
|
||||
#define MTC_WORKER_CONFIG_TIMEOUT (900)
|
||||
#define MTC_EXIT_DOR_MODE_TIMEOUT (60*15)
|
||||
#define MTC_WORKER_CONFIG_TIMEOUT (MTC_MINS_14)
|
||||
#define MTC_RESET_PROG_OFFLINE_TIMEOUT (20)
|
||||
#define MTC_RESET_TO_OFFLINE_TIMEOUT (150)
|
||||
#define MTC_POWEROFF_TO_OFFLINE_TIMEOUT (200)
|
||||
@ -80,6 +80,7 @@
|
||||
#define MTC_POWERCYCLE_COOLDOWN_DELAY (MTC_MINS_5)
|
||||
#define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5)
|
||||
#define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11)
|
||||
#define MTC_HEARTBEAT_SOAK_DURING_ADD (10)
|
||||
#define MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40)
|
||||
#define MTC_REINSTALL_TIMEOUT_BMC_ACC (MTC_MINS_10)
|
||||
#define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1)
|
||||
|
@ -191,10 +191,10 @@ int timeout_config_handler ( void * user,
|
||||
config_ptr->dor_mode_timeout = atoi(value);
|
||||
ilog ("DOR Mode TO : %3d secs\n", config_ptr->dor_mode_timeout );
|
||||
}
|
||||
else if (MATCH("timeouts", "dor_recovery_timeout_ext"))
|
||||
else if (MATCH("timeouts", "dor_mode_detect"))
|
||||
{
|
||||
config_ptr->dor_recovery_timeout_ext = atoi(value);
|
||||
ilog ("DOR Time Ext: %3d secs\n", config_ptr->dor_recovery_timeout_ext );
|
||||
config_ptr->dor_mode_detect = atoi(value);
|
||||
ilog ("DOR Mode Det: %3d secs", config_ptr->dor_mode_detect );
|
||||
}
|
||||
else if (MATCH("timeouts", "bmc_audit_period"))
|
||||
{
|
||||
|
@ -582,8 +582,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
||||
ptr->offline_log_reported = true ;
|
||||
ptr->online_log_reported = false ;
|
||||
|
||||
ptr->dor_recovery_mode = false ;
|
||||
ptr->was_dor_recovery_mode= false ;
|
||||
ptr->dor_recovery_time = 0 ;
|
||||
|
||||
ptr->vim_notified = false ;
|
||||
@ -2134,9 +2132,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
|
||||
/* handle a lock request while unlocked */
|
||||
if ( !inv.action.compare ( "lock" ) )
|
||||
{
|
||||
if ( node_ptr->dor_recovery_mode == true )
|
||||
node_ptr->dor_recovery_mode = false ;
|
||||
|
||||
/* Set action to LOCK and let the FSM run the disable handler */
|
||||
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK );
|
||||
}
|
||||
@ -2183,9 +2178,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
|
||||
/* TODO: Create customer log of this action */
|
||||
ilog ("%s Force Lock Action\n", node_ptr->hostname.c_str());
|
||||
|
||||
if ( node_ptr->dor_recovery_mode == true )
|
||||
node_ptr->dor_recovery_mode = false ;
|
||||
|
||||
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
|
||||
{
|
||||
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK )
|
||||
@ -2210,9 +2202,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
|
||||
{
|
||||
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
|
||||
{
|
||||
if ( node_ptr->dor_recovery_mode == true )
|
||||
node_ptr->dor_recovery_mode = false ;
|
||||
|
||||
/* Set action to LOCK and let the FSM run the disable handler */
|
||||
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK );
|
||||
}
|
||||
@ -3125,13 +3114,13 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
||||
if ( delay > 0 )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, delay );
|
||||
ilog ("Host add delay is %d seconds", delay );
|
||||
node_ptr->addStage = MTC_ADD__START_DELAY ;
|
||||
}
|
||||
else
|
||||
{
|
||||
node_ptr->addStage = MTC_ADD__START ;
|
||||
}
|
||||
ilog ("Host add delay is %d seconds", delay );
|
||||
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ADD );
|
||||
}
|
||||
return (rc);
|
||||
@ -5228,6 +5217,20 @@ int nodeLinkClass::manage_shadow_change ( string hostname )
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/** Returns the number of unlocked nodes */
|
||||
int nodeLinkClass::unlocked_nodes ( void )
|
||||
{
|
||||
int temp_count = 0 ;
|
||||
for ( struct node * ptr = head ; ; ptr = ptr->next )
|
||||
{
|
||||
if (ptr->adminState == MTC_ADMIN_STATE__UNLOCKED)
|
||||
temp_count++ ;
|
||||
if (( ptr->next == NULL ) || ( ptr == tail ))
|
||||
break ;
|
||||
}
|
||||
return (temp_count);
|
||||
}
|
||||
|
||||
/** Returns the number of worker hosts that are operationally 'enabled' */
|
||||
int nodeLinkClass::enabled_compute_nodes ( void )
|
||||
{
|
||||
@ -5462,6 +5465,7 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
|
||||
/* Nothing to do if this host is not in the hbs_minor state */
|
||||
if ( node_ptr->hbs_minor[iface] == true )
|
||||
{
|
||||
dlog ("%s clearing heartbeat minor on %s network", node_ptr->hostname.c_str(), get_iface_name_str(iface));
|
||||
/* clear it - possibly temporarily */
|
||||
node_ptr->hbs_minor[iface] = false ;
|
||||
|
||||
@ -5527,7 +5531,7 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
|
||||
{
|
||||
if ( ptr->operState != MTC_OPER_STATE__ENABLED )
|
||||
{
|
||||
slog ("%s found hbs_minor set for disabled host\n" , ptr->hostname.c_str() );
|
||||
slog ("%s found hbs_minor set for %s network for disabled host\n" , ptr->hostname.c_str(), get_iface_name_str(iface));
|
||||
}
|
||||
temp_count++ ;
|
||||
}
|
||||
@ -5553,56 +5557,6 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : manage_dor_recovery
|
||||
*
|
||||
* Description: Enable DOR recovery mode for this host.
|
||||
* Generate log
|
||||
*
|
||||
* The severity parm is used to enhance the logs to indicate what
|
||||
* severity level this utility was called from ;
|
||||
* minor, major, or critical
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void nodeLinkClass::manage_dor_recovery ( struct nodeLinkClass::node * node_ptr,
|
||||
EFmAlarmSeverityT severity )
|
||||
{
|
||||
if (( severity == FM_ALARM_SEVERITY_CLEAR ) &&
|
||||
( node_ptr->dor_recovery_mode == true ))
|
||||
{
|
||||
node_ptr->dor_recovery_mode = false ;
|
||||
node_ptr->was_dor_recovery_mode = true ;
|
||||
}
|
||||
|
||||
else if (( severity == FM_ALARM_SEVERITY_CRITICAL ) &&
|
||||
( node_ptr->dor_recovery_mode == false ))
|
||||
{
|
||||
struct timespec ts ;
|
||||
clock_gettime (CLOCK_MONOTONIC, &ts );
|
||||
wlog ("%-12s is waiting ; DOR recovery %2ld:%02ld mins (%4ld secs)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
ts.tv_sec/60,
|
||||
ts.tv_sec%60,
|
||||
ts.tv_sec);
|
||||
|
||||
node_ptr->dor_recovery_time = 0 ;
|
||||
node_ptr->dor_recovery_mode = true ;
|
||||
node_ptr->hbsClient_ready = false ;
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
|
||||
|
||||
/* don't restart graceful recovery for this host if its already in that FSM */
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ))
|
||||
{
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Manage heartbeat failure events */
|
||||
void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface, bool clear_event )
|
||||
{
|
||||
@ -5627,11 +5581,7 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
||||
node_ptr->hbs_failure[iface] = false ;
|
||||
}
|
||||
}
|
||||
else if ( this->mtcTimer_dor.tid )
|
||||
{
|
||||
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
|
||||
}
|
||||
else
|
||||
else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
|
||||
{
|
||||
/* handle auto recovery for heartbeat failure during enable */
|
||||
if ( node_ptr->ar_cause == MTC_AR_DISABLE_CAUSE__HEARTBEAT )
|
||||
@ -5663,51 +5613,54 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
||||
|
||||
mnfa_add_host ( node_ptr , iface );
|
||||
|
||||
if ( mnfa_active == false )
|
||||
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
|
||||
{
|
||||
/* if node is already in graceful recovery just ignore the event */
|
||||
if ( node_ptr->graceful_recovery_counter != 0 )
|
||||
if ( mnfa_active == false )
|
||||
{
|
||||
dlog ("%s %s loss event ; already in graceful recovery try %d",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
node_ptr->graceful_recovery_counter );
|
||||
return ;
|
||||
}
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
if ( iface == CLSTR_IFACE )
|
||||
{
|
||||
node_ptr->heartbeat_failed[CLSTR_IFACE] = true ;
|
||||
}
|
||||
else if ( iface == MGMNT_IFACE )
|
||||
{
|
||||
node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
|
||||
}
|
||||
if (mnfa_host_count[iface] < this->mnfa_threshold)
|
||||
{
|
||||
elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
|
||||
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
|
||||
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
|
||||
/* if node is already in graceful recovery just ignore the event */
|
||||
if ( node_ptr->graceful_recovery_counter != 0 )
|
||||
{
|
||||
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER )
|
||||
dlog ("%s %s loss event ; already in graceful recovery try %d",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
node_ptr->graceful_recovery_counter );
|
||||
return ;
|
||||
}
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
if ( iface == CLSTR_IFACE )
|
||||
{
|
||||
node_ptr->heartbeat_failed[CLSTR_IFACE] = true ;
|
||||
}
|
||||
else if ( iface == MGMNT_IFACE )
|
||||
{
|
||||
node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
|
||||
}
|
||||
if (mnfa_host_count[iface] < this->mnfa_threshold)
|
||||
{
|
||||
elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
|
||||
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
|
||||
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
|
||||
{
|
||||
wlog ("%s restarting graceful recovery\n", hostname.c_str() );
|
||||
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER )
|
||||
{
|
||||
wlog ("%s restarting graceful recovery", hostname.c_str() );
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s starting graceful recovery", hostname.c_str() );
|
||||
}
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s starting graceful recovery\n", hostname.c_str() );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
}
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -5802,11 +5755,8 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
|
||||
|
||||
hbs_minor_clear ( node_ptr, iface );
|
||||
}
|
||||
else if ( this->mtcTimer_dor.tid )
|
||||
{
|
||||
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
|
||||
}
|
||||
else
|
||||
/* - we don't care about locked hosts */
|
||||
else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
|
||||
{
|
||||
if ( mnfa_active == false )
|
||||
{
|
||||
@ -5815,7 +5765,7 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
|
||||
|
||||
mnfa_add_host ( node_ptr, iface );
|
||||
|
||||
if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
|
||||
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
|
||||
{
|
||||
if ( iface == MGMNT_IFACE )
|
||||
{
|
||||
@ -5852,16 +5802,11 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface,
|
||||
alog ("%s %s Heartbeat Minor (clear)\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
hbs_minor_clear ( node_ptr, iface );
|
||||
}
|
||||
/* if not a clear then only set if the host is enabled
|
||||
* - we don't care about disabled hosts */
|
||||
else if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
|
||||
/* - we don't care about locked hosts */
|
||||
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
{
|
||||
if ( this->mtcTimer_dor.tid )
|
||||
{
|
||||
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MINOR );
|
||||
}
|
||||
|
||||
else if ( node_ptr->hbs_minor[iface] != true )
|
||||
if ( node_ptr->hbs_minor[iface] != true )
|
||||
{
|
||||
mnfa_add_host ( node_ptr, iface );
|
||||
}
|
||||
@ -7077,8 +7022,7 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr,
|
||||
* other action can take effect.
|
||||
* If its not one of these action then just proceed with it
|
||||
**/
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
|
||||
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK )
|
||||
{
|
||||
clog ("%s Administrative Action '%s' -> '%s'\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
@ -8511,30 +8455,76 @@ int nodeLinkClass::ar_handler ( struct nodeLinkClass::node * node_ptr,
|
||||
* Description: Create a specifically formatted log for the specified
|
||||
* hosts DOR recovery state and timing.
|
||||
*
|
||||
* Parameters : The node and a caller prefix string that states if the node
|
||||
* is ENABELD
|
||||
* Assumptions: Only logged if the active controller has an uptime
|
||||
* less than 20 minutes (default). Configurable in mtce.conf
|
||||
*
|
||||
* Parameters :
|
||||
*
|
||||
* @param node_ptr Pointer to the node in the inventoried node linked list.
|
||||
* @param node_state_log_prefix Prefix for the node's state log messages.
|
||||
* is ENABLED
|
||||
* is DEGRADED
|
||||
* is DISABLED
|
||||
* is FAILED
|
||||
* is ENABLED-degraded
|
||||
* etc.
|
||||
* is OFFLINE
|
||||
* @param extra string representing where this function was called.
|
||||
*
|
||||
***************************************************************************/
|
||||
void nodeLinkClass::report_dor_recovery ( struct nodeLinkClass::node * node_ptr,
|
||||
string node_state_log_prefix )
|
||||
string node_state_log_prefix,
|
||||
string extra )
|
||||
{
|
||||
struct timespec ts ;
|
||||
clock_gettime (CLOCK_MONOTONIC, &ts );
|
||||
node_ptr->dor_recovery_time = ts.tv_sec ;
|
||||
plog ("%-12s %s ; DOR Recovery %2d:%02d mins (%4d secs) (uptime:%2d:%02d mins)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_state_log_prefix.c_str(),
|
||||
node_ptr->dor_recovery_time/60,
|
||||
node_ptr->dor_recovery_time%60,
|
||||
node_ptr->dor_recovery_time,
|
||||
node_ptr->uptime/60,
|
||||
node_ptr->uptime%60 );
|
||||
|
||||
node_ptr->dor_recovery_mode = false ;
|
||||
node_ptr->was_dor_recovery_mode = false ;
|
||||
if ( this->dor_mode_active )
|
||||
{
|
||||
node_ptr->dor_recovery_time = ts.tv_sec ;
|
||||
plog ("%-12s %-11s ; DOR Recovery %2d:%02d mins (%4d secs) (uptime:%2d:%02d mins) %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_state_log_prefix.c_str(),
|
||||
node_ptr->dor_recovery_time/60,
|
||||
node_ptr->dor_recovery_time%60,
|
||||
node_ptr->dor_recovery_time,
|
||||
node_ptr->uptime/60,
|
||||
node_ptr->uptime%60,
|
||||
extra.c_str());
|
||||
|
||||
// Accounting
|
||||
int unlocked_nodes = this->unlocked_nodes() ;
|
||||
if ( ++this->dor_recovered_nodes == unlocked_nodes )
|
||||
{
|
||||
mtcTimer_reset (this->mtcTimer_dor);
|
||||
this->dor_mode_active = false ;
|
||||
this->dor_mode_active_log_throttle = 0 ;
|
||||
ilog ("%-13s %3d of %-3d ; DOR Recovery ; all nodes are recovered ; active controller uptime:%ld",
|
||||
this->my_hostname.c_str(),
|
||||
this->dor_recovered_nodes,
|
||||
unlocked_nodes,
|
||||
ts.tv_sec);
|
||||
}
|
||||
else if ( this->dor_recovered_nodes > this->unlocked_nodes() )
|
||||
{
|
||||
slog ("%s unexpected extra DOR recovery call ; unlocked:%d recovered:%d",
|
||||
node_ptr->hostname.c_str(),
|
||||
unlocked_nodes,
|
||||
this->dor_recovered_nodes);
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s %d of %d DOR nodes recovered",
|
||||
node_ptr->hostname.c_str(),
|
||||
this->dor_recovered_nodes,
|
||||
unlocked_nodes);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
dlog ("%s DOR Recovery called with '%s %s' while dor mode disabled",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_state_log_prefix.c_str(),
|
||||
extra.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
@ -8548,10 +8538,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
return ;
|
||||
}
|
||||
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , "is FAILED " );
|
||||
}
|
||||
report_dor_recovery ( node_ptr , "is FAILED", "full enable" );
|
||||
|
||||
plog ("%s Forcing Full Enable Sequence\n", node_ptr->hostname.c_str());
|
||||
|
||||
@ -8561,9 +8548,8 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
allStateChange ( node_ptr, node_ptr->adminState, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); /* reset the fsm */
|
||||
// don't override the add action or lock actions /
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) &&
|
||||
// don't override the lock actions /
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
|
||||
{
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); // no action
|
||||
@ -9784,10 +9770,8 @@ void nodeLinkClass::mem_log_general ( void )
|
||||
void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s DOR - Active: %c Was: %c Time: %5d (00:%02d:%02d)\n",
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s DOR - Time: %5d (00:%02d:%02d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->dor_recovery_mode ? 'Y' : 'N',
|
||||
node_ptr->was_dor_recovery_mode ? 'Y' : 'N',
|
||||
node_ptr->dor_recovery_time,
|
||||
node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time/60 : 0,
|
||||
node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time%60 : 0);
|
||||
@ -9813,12 +9797,14 @@ void nodeLinkClass::mem_log_mnfa ( void )
|
||||
void nodeLinkClass::mem_log_general_mtce_hosts ( void )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n",
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d Unlocked:%d DOR:Recovered:%d\n",
|
||||
my_hostname.c_str(),
|
||||
num_controllers_enabled(),
|
||||
enabled_compute_nodes(),
|
||||
enabled_storage_nodes(),
|
||||
get_storage_backend());
|
||||
get_storage_backend(),
|
||||
unlocked_nodes(),
|
||||
dor_recovered_nodes);
|
||||
mem_log (str);
|
||||
}
|
||||
|
||||
|
@ -368,10 +368,8 @@ private:
|
||||
/* the fault handling offline handler timer */
|
||||
struct mtc_timer offline_timer ;
|
||||
|
||||
/* Host level DOR recovery mode time and bools */
|
||||
/* Host level DOR recovery time */
|
||||
int dor_recovery_time ;
|
||||
bool dor_recovery_mode ;
|
||||
bool was_dor_recovery_mode ;
|
||||
|
||||
/** Integer code representing the host health */
|
||||
int health ;
|
||||
@ -1275,7 +1273,7 @@ private:
|
||||
|
||||
/* Dead Office Recovery - system level controls */
|
||||
void manage_dor_recovery ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT severity );
|
||||
void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix );
|
||||
void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix, string extra );
|
||||
|
||||
struct {
|
||||
struct node * head_ptr ; /**< Pulse Linked List Head pointer */
|
||||
@ -1398,6 +1396,7 @@ public:
|
||||
bool dor_mode_active ;
|
||||
unsigned int dor_start_time ;
|
||||
int dor_mode_active_log_throttle ;
|
||||
int dor_recovered_nodes = 0; /**< DOR node recovery count */
|
||||
|
||||
bool hbs_disabled ; /**< Control heartbeat service state */
|
||||
bool hbs_state_change ; /**< Flag service state change */
|
||||
@ -1702,6 +1701,9 @@ public:
|
||||
/** Remove a host from Node list */
|
||||
int rem_host ( string & hostname );
|
||||
|
||||
/** Get the number of unlocked nodes */
|
||||
int unlocked_nodes ( void );
|
||||
|
||||
/** Get the number of worker hosts that are operationally 'enabled' */
|
||||
int enabled_compute_nodes ( void );
|
||||
|
||||
|
@ -710,6 +710,15 @@ int daemon_configure ( void )
|
||||
else
|
||||
mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ;
|
||||
|
||||
if ( mtc_config.dor_mode_detect <= 0 )
|
||||
{
|
||||
wlog ("DOR mode detect timeout is invalid (%d), setting to default (%d)",
|
||||
mtc_config.dor_mode_detect,
|
||||
DEFAULT_DOR_MODE_DETECT);
|
||||
|
||||
mtc_config.dor_mode_detect = DEFAULT_DOR_MODE_DETECT ;
|
||||
}
|
||||
|
||||
if ( mtc_config.dor_mode_timeout <= 0 )
|
||||
{
|
||||
slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n",
|
||||
@ -1653,7 +1662,7 @@ void daemon_service_run ( void )
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( ts.tv_sec < MTC_MINS_15 )
|
||||
if ( ts.tv_sec < mtc_config.dor_mode_detect )
|
||||
{
|
||||
/* AIO DOR window is much greater in AIO since heartbeat
|
||||
* cannot start until the inactive AIO has run both manifests */
|
||||
@ -1669,16 +1678,16 @@ void daemon_service_run ( void )
|
||||
mtcInv.dor_mode_active = true ;
|
||||
mtcInv.dor_start_time = ts.tv_sec ;
|
||||
|
||||
ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
|
||||
ilog ("%-12s is ACTIVE ; DOR Recovery %2d:%02d mins (%4d secs) (duration %3d secs)\n",
|
||||
ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
|
||||
ilog ("%-12s is ACTIVE ; DOR Recovery %2d:%02d mins (%4d secs) (dor timeout in %3d secs)\n",
|
||||
mtcInv.my_hostname.c_str(),
|
||||
mtcInv.dor_start_time/60,
|
||||
mtcInv.dor_start_time%60,
|
||||
mtcInv.dor_start_time,
|
||||
timeout );
|
||||
ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
|
||||
ilog ("%-12s host state ; DOR Recovery controller uptime host uptime \n", mtcInv.my_hostname.c_str());
|
||||
ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
|
||||
ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------", mtcInv.my_hostname.c_str());
|
||||
ilog ("%-12s host state ; DOR Recovery controller uptime host uptime ", mtcInv.my_hostname.c_str());
|
||||
ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------", mtcInv.my_hostname.c_str());
|
||||
mtcTimer_start ( mtcInv.mtcTimer_dor, mtcTimer_handler, timeout );
|
||||
}
|
||||
|
||||
@ -1992,7 +2001,12 @@ void daemon_service_run ( void )
|
||||
* then exit DOR mode. We do it here instead of */
|
||||
if (( mtcInv.dor_mode_active == true ) && ( mtcInv.mtcTimer_dor.tid == NULL ))
|
||||
{
|
||||
ilog ("DOR mode disable\n");
|
||||
wlog ("%s DOR mode disabled ; DOR Recovery Timeout ; %d of %d unlocked hosts ; active controller uptime:%d",
|
||||
mtcInv.my_hostname.c_str(),
|
||||
mtcInv.dor_recovered_nodes,
|
||||
mtcInv.unlocked_nodes(),
|
||||
mtcInv.get_uptime(mtcInv.my_hostname));
|
||||
mtcInv.dor_mode_active_log_throttle = 0 ;
|
||||
mtcInv.dor_mode_active = false ;
|
||||
}
|
||||
}
|
||||
|
@ -194,6 +194,7 @@ void nodeLinkClass::timer_handler ( int sig, siginfo_t *si, void *uc)
|
||||
{
|
||||
mtcTimer_stop_int_safe ( mtcTimer_dor );
|
||||
mtcTimer_dor.ring = true ;
|
||||
this->dor_mode_active_log_throttle = 0 ;
|
||||
return ;
|
||||
}
|
||||
|
||||
@ -488,7 +489,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
AR_LOG_THROTTLE_THRESHOLD,
|
||||
"%s auto recovery disabled cause:%d",
|
||||
node_ptr->hostname.c_str(), node_ptr->ar_cause );
|
||||
return (RETRY); ;
|
||||
return (RETRY);
|
||||
}
|
||||
|
||||
if ( THIS_HOST )
|
||||
@ -787,11 +788,10 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
case MTC_ENABLE__START:
|
||||
{
|
||||
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
||||
|
||||
plog ("%s Main Enable FSM (from start)%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->was_dor_recovery_mode ? " (from DOR)" : "" );
|
||||
this->dor_mode_active ? " (DOR active)" : "" );
|
||||
|
||||
/* clear all the past enable failure bools */
|
||||
clear_main_failed_bools ( node_ptr );
|
||||
@ -1547,10 +1547,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* Inform the VIM that this host is enabled */
|
||||
mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
|
||||
|
||||
plog ("%s is ENABLED%s\n", node_ptr->hostname.c_str(),
|
||||
node_ptr->was_dor_recovery_mode ? " (from DOR)" : "");
|
||||
node_ptr->dor_recovery_mode = false ;
|
||||
node_ptr->was_dor_recovery_mode = false ;
|
||||
plog ("%s is ENABLED", node_ptr->hostname.c_str());
|
||||
node_ptr->http_retries_cur = 0 ;
|
||||
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
@ -1718,13 +1715,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if ( node_ptr->mtcAlive_online == true )
|
||||
{
|
||||
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
||||
|
||||
mtcTimer_stop ( node_ptr->mtcTimer );
|
||||
|
||||
ilog ("%s got requested mtcAlive%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
|
||||
this->dor_mode_active ? " (DOR mode)" : "" );
|
||||
|
||||
stop_offline_handler ( node_ptr );
|
||||
|
||||
@ -1793,7 +1789,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* did not reboot case */
|
||||
wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->was_dor_recovery_mode ? " (DOR)" : "",
|
||||
this->dor_mode_active ? " (DOR mode)" : "",
|
||||
node_ptr->uptime);
|
||||
|
||||
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
|
||||
@ -1808,9 +1804,9 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
else
|
||||
{
|
||||
wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
|
||||
ilog ("%s ... continuing%sgraceful recovery ; (OOB: %08x)\n",
|
||||
ilog ("%s ... continuing graceful recovery%s ; (OOB: %08x)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->was_dor_recovery_mode ? " (DOR) " : " ",
|
||||
this->dor_mode_active ? " (DOR mode)" : "",
|
||||
node_ptr->mtce_flags);
|
||||
ilog ("%s ... without additional reboot %s (uptime:%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
@ -1845,7 +1841,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
wlog ("%s Loss Of Communication for %d seconds ; disabling host%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
loc_recovery_timeout,
|
||||
node_ptr->dor_recovery_mode ? " (DOR)" : "" );
|
||||
this->dor_mode_active ? " (DOR mode)" : "" );
|
||||
wlog ("%s ... stopping host services\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
|
||||
|
||||
@ -1898,7 +1894,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
/* Only try and issue in-line recovery reboot or reset if
|
||||
* NOT in Dead Office Recovery (DOR) mode. */
|
||||
if ( node_ptr->dor_recovery_mode == false )
|
||||
if ( this->dor_mode_active )
|
||||
{
|
||||
ilog ("%s issuing one time graceful recovery reboot over management network\n",
|
||||
node_ptr->hostname.c_str());
|
||||
@ -1945,7 +1941,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
LOAD_NODETYPE_TIMERS ;
|
||||
|
||||
/* load the mtcAlive timeout to accommodate for dor recovery */
|
||||
timeout = node_ptr->mtcalive_timeout + daemon_get_cfg_ptr()->dor_recovery_timeout_ext ;
|
||||
timeout = node_ptr->mtcalive_timeout ;
|
||||
}
|
||||
|
||||
/* start the timer that waits for MTCALIVE */
|
||||
@ -1955,7 +1951,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->hostname.c_str(),
|
||||
MTC_TASK_RECOVERY_WAIT,
|
||||
timeout,
|
||||
node_ptr->dor_recovery_mode ? " (DOR) " : " " ,
|
||||
this->dor_mode_active ? " (DOR) " : " " ,
|
||||
node_ptr->uptime_save );
|
||||
|
||||
clear_service_readies ( node_ptr );
|
||||
@ -2024,7 +2020,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->hostname.c_str(),
|
||||
MTC_TASK_RECOVERY_WAIT,
|
||||
timeout,
|
||||
node_ptr->dor_recovery_mode ? " (DOR) " : " " ,
|
||||
this->dor_mode_active ? " (DOR mode) " : " " ,
|
||||
node_ptr->uptime_save );
|
||||
|
||||
clear_service_readies ( node_ptr );
|
||||
@ -2075,7 +2071,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
mtcTimer_stop ( node_ptr->mtcTimer );
|
||||
|
||||
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
||||
|
||||
/* If the host's uptime is bigger than the saved uptime then
|
||||
* the host has not reset yet we have disabled services
|
||||
@ -2084,7 +2079,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if ((( node_ptr->uptime_save != 0 ) &&
|
||||
( node_ptr->uptime >= node_ptr->uptime_save )) ||
|
||||
(( node_ptr->uptime_save == 0 ) &&
|
||||
( node_ptr->uptime > MTC_MINS_15 )))
|
||||
( node_ptr->uptime > MTC_MINS_20 )))
|
||||
{
|
||||
ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n",
|
||||
node_ptr->hostname.c_str(), node_ptr->uptime );
|
||||
@ -2121,7 +2116,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->uptime_save );
|
||||
ilog ("%s ... continuing with graceful recovery %s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->dor_recovery_mode ? "(DOR)" : " ");
|
||||
this->dor_mode_active ? "(DOR mode)" : "");
|
||||
ilog ("%s ... without additional reboot %s\n",
|
||||
node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
|
||||
|
||||
@ -2138,7 +2133,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
else if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
||||
|
||||
/* Set the FSM task state to init failed */
|
||||
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Failed" );
|
||||
@ -2523,15 +2517,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->http_retries_cur = 0 ;
|
||||
|
||||
doneQueue_purge ( node_ptr );
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
if ( this->dor_mode_active )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , "is ENABLED" );
|
||||
}
|
||||
else
|
||||
{
|
||||
plog ("%s is ENABLED (Gracefully Recovered)\n",
|
||||
node_ptr->hostname.c_str());
|
||||
report_dor_recovery ( node_ptr , "is ENABLED", "recovery" );
|
||||
}
|
||||
plog ("%s is ENABLED (Gracefully Recovered%s)",
|
||||
node_ptr->hostname.c_str(),
|
||||
this->dor_mode_active ? " in DOR mode" : "");
|
||||
alarm_enabled_clear ( node_ptr, false );
|
||||
break ;
|
||||
}
|
||||
@ -6023,11 +6015,26 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
case MTC_ADD__START:
|
||||
{
|
||||
bool timer_set = false ;
|
||||
plog ("%s Host Add\n", node_ptr->hostname.c_str());
|
||||
if ( THIS_HOST )
|
||||
{
|
||||
struct timespec ts ;
|
||||
clock_gettime (CLOCK_MONOTONIC, &ts );
|
||||
node_ptr->uptime = ts.tv_sec ;
|
||||
}
|
||||
else if ( ! node_ptr->mtcClient_ready )
|
||||
{
|
||||
/* If we have not received a mtcAlive event from the
|
||||
* mtcClient already then let's request it since that
|
||||
* is how we get its uptime.
|
||||
* Don't trust what is in the database since it will
|
||||
* be stale. Best to default to zero so the logs will
|
||||
* show that there has been no mtcAlive received */
|
||||
node_ptr->uptime = 0 ;
|
||||
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
|
||||
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE );
|
||||
}
|
||||
|
||||
/* Request a mtcAlive message ; gives us uptime ; don't trust what is in the database */
|
||||
node_ptr->uptime = 0 ;
|
||||
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
|
||||
plog ("%s Host Add (uptime:%d)", node_ptr->hostname.c_str(), node_ptr->uptime );
|
||||
|
||||
ilog ("%s %s %s-%s-%s (%s)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
@ -6075,14 +6082,31 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
EFmAlarmSeverityT mtcAlive_alarm_severity =
|
||||
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE);
|
||||
|
||||
/* Clear generic enable alarm over process restart.
|
||||
* Will get reasserted if the cause condition still exists */
|
||||
/* Manage an existing enable alarm */
|
||||
if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
ilog ("%s found enable alarm ; clearing %s",
|
||||
/* Added the unlocked-disabled check to avoid clearing the
|
||||
* enabled alarm when the node is found to be unlocked-disabled
|
||||
* with the enable alarm already asserted.
|
||||
* We don't want to clear it in that case. */
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__DISABLED ))
|
||||
{
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
|
||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = enable_alarm_severity ;
|
||||
wlog ("%s found enable alarm while unlocked-disabled ; loaded %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
alarmUtil_getSev_str(enable_alarm_severity).c_str());
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
alarmUtil_getSev_str(enable_alarm_severity).c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s found enable alarm while %s-%s ; clearing %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
adminState_enum_to_str (node_ptr->adminState).c_str(),
|
||||
operState_enum_to_str (node_ptr->operState_subf).c_str(),
|
||||
alarmUtil_getSev_str(enable_alarm_severity).c_str());
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
}
|
||||
}
|
||||
|
||||
/* The config alarm is maintained if it exists.
|
||||
@ -6230,6 +6254,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
alarm_luks_failure ( node_ptr );
|
||||
}
|
||||
node_ptr->ar_disabled = true ;
|
||||
this->report_dor_recovery ( node_ptr, "is DISABLED" , "auto recovery disabled");
|
||||
|
||||
if ( THIS_HOST )
|
||||
mtcInvApi_update_states ( node_ptr, "unlocked", "enabled", "degraded" );
|
||||
@ -6341,22 +6366,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* Stop the work queue wait timer */
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
|
||||
/* Only start it on this add operation if host is
|
||||
* already unlocked and enabled and not the active controller */
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
{
|
||||
/* start the heartbeat service in all cases except for
|
||||
* THIS host and AIO controller hosts */
|
||||
if ( NOT_THIS_HOST )
|
||||
{
|
||||
if (( LARGE_SYSTEM ) ||
|
||||
(( AIO_SYSTEM ) && ( this->dor_mode_active == false )))
|
||||
{
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Only run hardware monitor if the bm ip is provisioned */
|
||||
if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
|
||||
@ -6367,9 +6376,63 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
|
||||
this->ctl_mtcAlive_gate(node_ptr, false) ;
|
||||
node_ptr->addStage = MTC_ADD__DONE ;
|
||||
if (( NOT_THIS_HOST ) &&
|
||||
((( AIO_SYSTEM ) && ( is_controller(node_ptr) == false )) || ( LARGE_SYSTEM )) &&
|
||||
( this->hbs_failure_action != HBS_FAILURE_ACTION__NONE ) &&
|
||||
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_5 );
|
||||
if ( ! node_ptr->hbsClient_ready )
|
||||
{
|
||||
ilog ("%s waiting for hbsClient ready event (%d secs)", node_ptr->hostname.c_str(), MTC_MINS_5);
|
||||
}
|
||||
node_ptr->addStage = MTC_ADD__HEARTBEAT_WAIT ;
|
||||
}
|
||||
else
|
||||
{
|
||||
node_ptr->addStage = MTC_ADD__DONE ;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case MTC_ADD__HEARTBEAT_WAIT:
|
||||
{
|
||||
/* Wait for hbsClient ready event */
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
{
|
||||
wlog ("%s hbsClient ready event timeout\n", node_ptr->hostname.c_str());
|
||||
}
|
||||
else if ( node_ptr->hbsClient_ready == false )
|
||||
{
|
||||
break ;
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
}
|
||||
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
MTC_HEARTBEAT_SOAK_DURING_ADD,
|
||||
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
|
||||
|
||||
/* allow heartbeat to run for MTC_HEARTBEAT_SOAK_DURING_ADD
|
||||
* seconds before we declare enable */
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_DURING_ADD );
|
||||
node_ptr->addStage = MTC_ADD__HEARTBEAT_SOAK ;
|
||||
break ;
|
||||
}
|
||||
case MTC_ADD__HEARTBEAT_SOAK:
|
||||
{
|
||||
if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
plog ("%s heartbeating", node_ptr->hostname.c_str());
|
||||
/* if heartbeat is not working then we will
|
||||
* never get here */
|
||||
node_ptr->addStage = MTC_ADD__DONE ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_ADD__DONE:
|
||||
default:
|
||||
{
|
||||
@ -6396,16 +6459,55 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
{
|
||||
/* In AIO if in DOR mode and the host is unlocked enabled
|
||||
* we need to run the subfunction handler and request
|
||||
* to start host services. */
|
||||
/* Need to run the subfunction enable handler
|
||||
* for AIO controllers while in DOR mode */
|
||||
if ( this->dor_mode_active )
|
||||
{
|
||||
ilog ("%s running subfunction enable for unlocked-enabled AIO controller (DOR mode)", node_ptr->hostname.c_str());
|
||||
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF );
|
||||
break ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else if ( this->dor_mode_active )
|
||||
{
|
||||
/* The Enable SUBF handler will do this so let's not do it twice */
|
||||
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
|
||||
{
|
||||
string state_str = "" ;
|
||||
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
|
||||
{
|
||||
state_str = "is ENABLED" ;
|
||||
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
|
||||
state_str = "is DEGRADED" ;
|
||||
}
|
||||
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED )
|
||||
{
|
||||
state_str = "is FAILED" ;
|
||||
}
|
||||
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
||||
{
|
||||
state_str = "is OFFLINE" ;
|
||||
}
|
||||
if ( ! state_str.empty() )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , state_str, "" ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%-12s is waiting ; DOR Recovery ; %s-%s-%s ; mtcClient:%c hbsClient:%c uptime:%3d task:%s",
|
||||
node_ptr->hostname.c_str(),
|
||||
adminState_enum_to_str (node_ptr->adminState).c_str(),
|
||||
operState_enum_to_str (node_ptr->operState).c_str(),
|
||||
availStatus_enum_to_str(node_ptr->availStatus).c_str(),
|
||||
node_ptr->mtcClient_ready ? 'Y':'N',
|
||||
node_ptr->hbsClient_ready ? 'Y':'N',
|
||||
node_ptr->uptime,
|
||||
node_ptr->task.empty() ? "empty" : node_ptr->task.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
node_ptr->addStage = MTC_ADD__START;
|
||||
|
||||
plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
|
||||
@ -7597,35 +7699,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
insvTestStageChange ( node_ptr, MTC_INSV_TEST__RUN );
|
||||
}
|
||||
/* manage degrade state and alarms */
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
( node_ptr->ar_disabled == false ))
|
||||
{
|
||||
/************************************************************
|
||||
* Manage In-Service Alarms *
|
||||
***********************************************************/
|
||||
|
||||
/* Manage Inservice Enable Alarm */
|
||||
if ( node_ptr->hostservices_failed )
|
||||
{
|
||||
alarm_insv_failure ( node_ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
alarm_insv_clear ( node_ptr, false );
|
||||
}
|
||||
|
||||
/* Manage Compute Subfunction Failure Alarm */
|
||||
if ( node_ptr->hostservices_failed_subf )
|
||||
{
|
||||
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
|
||||
}
|
||||
else
|
||||
{
|
||||
alarm_compute_clear ( node_ptr, false );
|
||||
}
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_INSV_TEST__RUN:
|
||||
@ -7694,16 +7767,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
{
|
||||
/************************************************************
|
||||
* Prevent the start host services from running while in DOR
|
||||
***********************************************************/
|
||||
if ( node_ptr->dor_recovery_mode == true )
|
||||
{
|
||||
/* wait longer for the host to boot up */
|
||||
wlog ("%s DOR recovery active ; waiting on host\n",
|
||||
node_ptr->hostname.c_str());
|
||||
}
|
||||
else if ( this->dor_mode_active == true )
|
||||
if ( this->dor_mode_active == true )
|
||||
{
|
||||
ilog_throttled ( this->dor_mode_active_log_throttle, 20,
|
||||
"DOR mode active\n");
|
||||
@ -8313,4 +8377,4 @@ int nodeLinkClass::self_fail_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
"force swact to unlocked-enabled standby controller");
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
}
|
||||
|
@ -411,12 +411,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( node_ptr->dor_recovery_mode || node_ptr->was_dor_recovery_mode )
|
||||
{
|
||||
node_ptr->dor_recovery_mode = false ;
|
||||
node_ptr->was_dor_recovery_mode = true ;
|
||||
}
|
||||
|
||||
if (( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] != FM_ALARM_SEVERITY_CLEAR ) ||
|
||||
( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CLEAR ) ||
|
||||
( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR ))
|
||||
@ -454,9 +448,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
node_ptr->subf_enabled = true ;
|
||||
node_ptr->inservice_failed_subf = false ;
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
if ( this->dor_mode_active )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , "is ENABLED" );
|
||||
report_dor_recovery ( node_ptr , "is ENABLED", "subf" );
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -488,9 +482,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
if ( this->dor_mode_active )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , "is ENABLED-degraded" );
|
||||
report_dor_recovery ( node_ptr , "is DEGRADED", "subf" );
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -511,16 +505,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , "is DISABLED-failed" );
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s is DISABLED-failed (subfunction failed)\n",
|
||||
name.c_str() );
|
||||
}
|
||||
this->dor_mode_active = false ;
|
||||
|
||||
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ) ;
|
||||
|
||||
@ -552,9 +536,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->enabled_count++ ;
|
||||
node_ptr->health_threshold_counter = 0 ;
|
||||
|
||||
node_ptr->was_dor_recovery_mode = false ;
|
||||
node_ptr->dor_recovery_mode = false ;
|
||||
this->dor_mode_active = false ;
|
||||
|
||||
ar_enable ( node_ptr );
|
||||
|
||||
|
@ -125,12 +125,12 @@ loc_recovery_timeout = 5 ; Loss Of Communication Recovery Timeout
|
||||
; the max number of seconds that a host can be in
|
||||
; loss of communication state without failing the unit
|
||||
|
||||
dor_mode_timeout = 20 ; The default base time in seconds for how long
|
||||
dor_mode_detect = 1200 ; Controller uptime less than this value puts mtcAgent
|
||||
; into DOR mode active state. Default: 20 minutes
|
||||
|
||||
dor_mode_timeout = 1000 ; The default base time in seconds for how long
|
||||
; maintenance DOR mode is active. This number
|
||||
; is extended by the number of enabled hosts.
|
||||
dor_recovery_timeout_ext = 1800 ; Dor timeout extension. An extra time in seconds
|
||||
; that is added to the host specific recovery time
|
||||
; making the overall host's dor recovery timeout.
|
||||
|
||||
swact_timeout = 120 ; Seconds Mtce waits for HA Service SWACT before failing
|
||||
; the swact operation
|
||||
|
Loading…
x
Reference in New Issue
Block a user