Merge "Improve DOR Recovery banner to include all hosts and their status"

This commit is contained in:
Zuul 2025-04-10 17:36:03 +00:00 committed by Gerrit Code Review
commit 73e3241b6d
10 changed files with 344 additions and 293 deletions

View File

@ -159,8 +159,8 @@ typedef struct
int work_queue_timeout ; /**< end of action workq complete TO */
int loc_recovery_timeout ; /**< loss of comms recovery timeout */
int node_reinstall_timeout ; /**< node reinstall timeout */
int dor_mode_detect ; /**< dead office recovery detect thld*/
int dor_mode_timeout ; /**< dead office recovery timeout */
int dor_recovery_timeout_ext ; /**< dor recovery timeout extension */
int uptime_period ; /**< Uptime refresh timer period */
int online_period ; /**< locked availability refresh */
int insv_test_period ; /**< insv test period in secs */

View File

@ -203,8 +203,9 @@ typedef enum
#define DEFAULT_MTCALIVE_TIMEOUT (1200)
#define DEFAULT_GOENABLE_TIMEOUT (300)
#define DEFAULT_DOR_MODE_TIMEOUT (20)
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
#define DEFAULT_DOR_MODE_TIMEOUT (MTC_MINS_15)
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (MTC_MINS_20)
#define DEFAULT_DOR_MODE_DETECT (MTC_MINS_20)
#define DEFAULT_POWER_OFF_RETRY_WAIT (30)
/** TODO: Convert names to omit JSON part */
@ -962,6 +963,8 @@ typedef enum
MTC_ADD__MTC_SERVICES,
MTC_ADD__CLEAR_TASK,
MTC_ADD__WORKQUEUE_WAIT,
MTC_ADD__HEARTBEAT_WAIT,
MTC_ADD__HEARTBEAT_SOAK,
MTC_ADD__DONE,
MTC_ADD__STAGES
} mtc_addStages_enum ;

View File

@ -45,6 +45,7 @@
#define MTC_MINS_5 (300)
#define MTC_MINS_8 (480)
#define MTC_MINS_10 (600)
#define MTC_MINS_14 (840)
#define MTC_MINS_15 (900)
#define MTC_MINS_20 (1200)
#define MTC_MINS_30 (1800)
@ -71,8 +72,7 @@
#define MTC_BM_POWERON_TIMEOUT (30)
#define MTC_RESET_PROG_TIMEOUT (20)
#define MTC_WORKQUEUE_TIMEOUT (60)
#define MTC_WORKER_CONFIG_TIMEOUT (900)
#define MTC_EXIT_DOR_MODE_TIMEOUT (60*15)
#define MTC_WORKER_CONFIG_TIMEOUT (MTC_MINS_14)
#define MTC_RESET_PROG_OFFLINE_TIMEOUT (20)
#define MTC_RESET_TO_OFFLINE_TIMEOUT (150)
#define MTC_POWEROFF_TO_OFFLINE_TIMEOUT (200)
@ -80,6 +80,7 @@
#define MTC_POWERCYCLE_COOLDOWN_DELAY (MTC_MINS_5)
#define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5)
#define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11)
#define MTC_HEARTBEAT_SOAK_DURING_ADD (10)
#define MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40)
#define MTC_REINSTALL_TIMEOUT_BMC_ACC (MTC_MINS_10)
#define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1)

View File

@ -191,10 +191,10 @@ int timeout_config_handler ( void * user,
config_ptr->dor_mode_timeout = atoi(value);
ilog ("DOR Mode TO : %3d secs\n", config_ptr->dor_mode_timeout );
}
else if (MATCH("timeouts", "dor_recovery_timeout_ext"))
else if (MATCH("timeouts", "dor_mode_detect"))
{
config_ptr->dor_recovery_timeout_ext = atoi(value);
ilog ("DOR Time Ext: %3d secs\n", config_ptr->dor_recovery_timeout_ext );
config_ptr->dor_mode_detect = atoi(value);
ilog ("DOR Mode Det: %3d secs", config_ptr->dor_mode_detect );
}
else if (MATCH("timeouts", "bmc_audit_period"))
{

View File

@ -582,8 +582,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->offline_log_reported = true ;
ptr->online_log_reported = false ;
ptr->dor_recovery_mode = false ;
ptr->was_dor_recovery_mode= false ;
ptr->dor_recovery_time = 0 ;
ptr->vim_notified = false ;
@ -2134,9 +2132,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
/* handle a lock request while unlocked */
if ( !inv.action.compare ( "lock" ) )
{
if ( node_ptr->dor_recovery_mode == true )
node_ptr->dor_recovery_mode = false ;
/* Set action to LOCK and let the FSM run the disable handler */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK );
}
@ -2183,9 +2178,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
/* TODO: Create customer log of this action */
ilog ("%s Force Lock Action\n", node_ptr->hostname.c_str());
if ( node_ptr->dor_recovery_mode == true )
node_ptr->dor_recovery_mode = false ;
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK )
@ -2210,9 +2202,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
{
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
if ( node_ptr->dor_recovery_mode == true )
node_ptr->dor_recovery_mode = false ;
/* Set action to LOCK and let the FSM run the disable handler */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK );
}
@ -3125,13 +3114,13 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
if ( delay > 0 )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, delay );
ilog ("Host add delay is %d seconds", delay );
node_ptr->addStage = MTC_ADD__START_DELAY ;
}
else
{
node_ptr->addStage = MTC_ADD__START ;
}
ilog ("Host add delay is %d seconds", delay );
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ADD );
}
return (rc);
@ -5228,6 +5217,20 @@ int nodeLinkClass::manage_shadow_change ( string hostname )
return (rc);
}
/** Returns the number of unlocked nodes
 *
 *  Walks the inventoried node linked list counting hosts whose
 *  administrative state is 'unlocked'.
 *
 *  @return the number of unlocked hosts ; 0 when the inventory is empty
 */
int nodeLinkClass::unlocked_nodes ( void )
{
    int temp_count = 0 ;
    /* 'ptr != NULL' guards against an empty inventory ; the previous
     * loop form dereferenced 'head' before any NULL check */
    for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
    {
        if (ptr->adminState == MTC_ADMIN_STATE__UNLOCKED)
            temp_count++ ;
        /* stop at the list tail as well, in case the list is malformed */
        if ( ptr == tail )
            break ;
    }
    return (temp_count);
}
/** Returns the number of worker hosts that are operationally 'enabled' */
int nodeLinkClass::enabled_compute_nodes ( void )
{
@ -5462,6 +5465,7 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
/* Nothing to do if this host is not in the hbs_minor state */
if ( node_ptr->hbs_minor[iface] == true )
{
dlog ("%s clearing heartbeat minor on %s network", node_ptr->hostname.c_str(), get_iface_name_str(iface));
/* clear it - possibly temporarily */
node_ptr->hbs_minor[iface] = false ;
@ -5527,7 +5531,7 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
{
if ( ptr->operState != MTC_OPER_STATE__ENABLED )
{
slog ("%s found hbs_minor set for disabled host\n" , ptr->hostname.c_str() );
slog ("%s found hbs_minor set for %s network for disabled host\n" , ptr->hostname.c_str(), get_iface_name_str(iface));
}
temp_count++ ;
}
@ -5553,56 +5557,6 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
}
}
/****************************************************************************
 *
 * Name       : manage_dor_recovery
 *
 * Description: Set or clear DOR (Dead Office Recovery) recovery mode for
 *              this host.
 *
 *              A 'clear' severity takes the host out of DOR recovery mode
 *              (remembering that it was in it) while a 'critical' severity
 *              puts the host into DOR recovery mode, updates its task
 *              banner and (re)starts graceful recovery.
 *
 *              The severity parm indicates what severity level this
 *              utility was called from ; minor, major, or critical.
 *
 ***************************************************************************/
void nodeLinkClass::manage_dor_recovery ( struct nodeLinkClass::node * node_ptr,
                                          EFmAlarmSeverityT severity )
{
    if ( severity == FM_ALARM_SEVERITY_CLEAR )
    {
        /* only transition out of DOR recovery mode if currently in it */
        if ( node_ptr->dor_recovery_mode == true )
        {
            node_ptr->dor_recovery_mode     = false ;
            node_ptr->was_dor_recovery_mode = true  ;
        }
        return ;
    }

    /* anything short of critical, or a host that is already in DOR
     * recovery mode, requires no action */
    if (( severity != FM_ALARM_SEVERITY_CRITICAL ) ||
        ( node_ptr->dor_recovery_mode == true ))
    {
        return ;
    }

    struct timespec now_ts ;
    clock_gettime (CLOCK_MONOTONIC, &now_ts );
    wlog ("%-12s is waiting ; DOR recovery %2ld:%02ld mins (%4ld secs)\n",
              node_ptr->hostname.c_str(),
              now_ts.tv_sec/60,
              now_ts.tv_sec%60,
              now_ts.tv_sec);

    node_ptr->dor_recovery_time = 0     ;
    node_ptr->dor_recovery_mode = true  ;
    node_ptr->hbsClient_ready   = false ;

    mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );

    /* don't restart graceful recovery for this host if its already in that FSM */
    if (( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) &&
        ( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ))
    {
        recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
        adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
    }
}
/** Manage heartbeat failure events */
void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface, bool clear_event )
{
@ -5627,11 +5581,7 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
node_ptr->hbs_failure[iface] = false ;
}
}
else if ( this->mtcTimer_dor.tid )
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
}
else
else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
/* handle auto recovery for heartbeat failure during enable */
if ( node_ptr->ar_cause == MTC_AR_DISABLE_CAUSE__HEARTBEAT )
@ -5663,51 +5613,54 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
mnfa_add_host ( node_ptr , iface );
if ( mnfa_active == false )
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
/* if node is already in graceful recovery just ignore the event */
if ( node_ptr->graceful_recovery_counter != 0 )
if ( mnfa_active == false )
{
dlog ("%s %s loss event ; already in graceful recovery try %d",
hostname.c_str(),
get_iface_name_str(iface),
node_ptr->graceful_recovery_counter );
return ;
}
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
if ( iface == CLSTR_IFACE )
{
node_ptr->heartbeat_failed[CLSTR_IFACE] = true ;
}
else if ( iface == MGMNT_IFACE )
{
node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
}
if (mnfa_host_count[iface] < this->mnfa_threshold)
{
elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
/* if node is already in graceful recovery just ignore the event */
if ( node_ptr->graceful_recovery_counter != 0 )
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER )
dlog ("%s %s loss event ; already in graceful recovery try %d",
hostname.c_str(),
get_iface_name_str(iface),
node_ptr->graceful_recovery_counter );
return ;
}
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
if ( iface == CLSTR_IFACE )
{
node_ptr->heartbeat_failed[CLSTR_IFACE] = true ;
}
else if ( iface == MGMNT_IFACE )
{
node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
}
if (mnfa_host_count[iface] < this->mnfa_threshold)
{
elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
{
wlog ("%s restarting graceful recovery\n", hostname.c_str() );
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER )
{
wlog ("%s restarting graceful recovery", hostname.c_str() );
}
else
{
wlog ("%s starting graceful recovery", hostname.c_str() );
}
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
}
else
{
wlog ("%s starting graceful recovery\n", hostname.c_str() );
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
}
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
}
else
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
}
}
}
@ -5802,11 +5755,8 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
hbs_minor_clear ( node_ptr, iface );
}
else if ( this->mtcTimer_dor.tid )
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
}
else
/* - we don't care about locked hosts */
else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
if ( mnfa_active == false )
{
@ -5815,7 +5765,7 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
mnfa_add_host ( node_ptr, iface );
if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
if ( iface == MGMNT_IFACE )
{
@ -5852,16 +5802,11 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface,
alog ("%s %s Heartbeat Minor (clear)\n", hostname.c_str(), get_iface_name_str(iface));
hbs_minor_clear ( node_ptr, iface );
}
/* if not a clear then only set if the host is enabled
* - we don't care about disabled hosts */
else if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
/* - we don't care about locked hosts */
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
if ( this->mtcTimer_dor.tid )
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MINOR );
}
else if ( node_ptr->hbs_minor[iface] != true )
if ( node_ptr->hbs_minor[iface] != true )
{
mnfa_add_host ( node_ptr, iface );
}
@ -7077,8 +7022,7 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr,
* other action can take effect.
* If its not one of these action then just proceed with it
**/
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK )
{
clog ("%s Administrative Action '%s' -> '%s'\n",
node_ptr->hostname.c_str(),
@ -8511,30 +8455,76 @@ int nodeLinkClass::ar_handler ( struct nodeLinkClass::node * node_ptr,
* Description: Create a specifically formatted log for the specified
* hosts DOR recovery state and timing.
*
* Assumptions: Only logged if the active controller has an uptime
*              less than 20 minutes (default). Configurable in mtce.conf
*
* Parameters :
*
* @param node_ptr Pointer to the node in the inventoried node linked list.
* @param node_state_log_prefix Prefix for the node's state log messages,
*        stating the node state ; e.g.
*        is ENABLED
*        is DEGRADED
*        is DISABLED
*        is FAILED
*        is ENABLED-degraded
*        is OFFLINE
*        etc.
* @param extra string representing where this function was called from.
*
***************************************************************************/
void nodeLinkClass::report_dor_recovery ( struct nodeLinkClass::node * node_ptr,
                                          string node_state_log_prefix,
                                          string extra )
{
    struct timespec ts ;
    clock_gettime (CLOCK_MONOTONIC, &ts );
    if ( this->dor_mode_active )
    {
        /* record how long after DOR this host took to recover */
        node_ptr->dor_recovery_time = ts.tv_sec ;
        plog ("%-12s %-11s ; DOR Recovery %2d:%02d mins (%4d secs) (uptime:%2d:%02d mins) %s",
                  node_ptr->hostname.c_str(),
                  node_state_log_prefix.c_str(),
                  node_ptr->dor_recovery_time/60,
                  node_ptr->dor_recovery_time%60,
                  node_ptr->dor_recovery_time,
                  node_ptr->uptime/60,
                  node_ptr->uptime%60,
                  extra.c_str());

        /* Accounting : take one snapshot of the unlocked host count and
         * compare against it throughout ; avoids the redundant second
         * linked list walk the 'greater than' check used to perform.
         * The local is named 'unlocked_node_count' so it does not shadow
         * the unlocked_nodes() member function. */
        int unlocked_node_count = this->unlocked_nodes() ;
        if ( ++this->dor_recovered_nodes == unlocked_node_count )
        {
            /* all unlocked hosts have recovered ; exit DOR mode early */
            mtcTimer_reset (this->mtcTimer_dor);
            this->dor_mode_active = false ;
            this->dor_mode_active_log_throttle = 0 ;
            ilog ("%-13s %3d of %-3d ; DOR Recovery ; all nodes are recovered ; active controller uptime:%ld",
                      this->my_hostname.c_str(),
                      this->dor_recovered_nodes,
                      unlocked_node_count,
                      ts.tv_sec);
        }
        else if ( this->dor_recovered_nodes > unlocked_node_count )
        {
            /* should never happen ; indicates a duplicate recovery
             * report for some host */
            slog ("%s unexpected extra DOR recovery call ; unlocked:%d recovered:%d",
                      node_ptr->hostname.c_str(),
                      unlocked_node_count,
                      this->dor_recovered_nodes);
        }
        else
        {
            ilog ("%s %d of %d DOR nodes recovered",
                      node_ptr->hostname.c_str(),
                      this->dor_recovered_nodes,
                      unlocked_node_count);
        }
    }
    else
    {
        dlog ("%s DOR Recovery called with '%s %s' while dor mode disabled",
                  node_ptr->hostname.c_str(),
                  node_state_log_prefix.c_str(),
                  extra.c_str());
    }
}
void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
@ -8548,10 +8538,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
return ;
}
if ( node_ptr->was_dor_recovery_mode )
{
report_dor_recovery ( node_ptr , "is FAILED " );
}
report_dor_recovery ( node_ptr , "is FAILED", "full enable" );
plog ("%s Forcing Full Enable Sequence\n", node_ptr->hostname.c_str());
@ -8561,9 +8548,8 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
allStateChange ( node_ptr, node_ptr->adminState, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); /* reset the fsm */
// don't override the add action or lock actions /
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) &&
// don't override the lock actions /
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
{
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); // no action
@ -9784,10 +9770,8 @@ void nodeLinkClass::mem_log_general ( void )
void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s DOR - Active: %c Was: %c Time: %5d (00:%02d:%02d)\n",
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s DOR - Time: %5d (00:%02d:%02d)\n",
node_ptr->hostname.c_str(),
node_ptr->dor_recovery_mode ? 'Y' : 'N',
node_ptr->was_dor_recovery_mode ? 'Y' : 'N',
node_ptr->dor_recovery_time,
node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time/60 : 0,
node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time%60 : 0);
@ -9813,12 +9797,14 @@ void nodeLinkClass::mem_log_mnfa ( void )
void nodeLinkClass::mem_log_general_mtce_hosts ( void )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n",
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d Unlocked:%d DOR:Recovered:%d\n",
my_hostname.c_str(),
num_controllers_enabled(),
enabled_compute_nodes(),
enabled_storage_nodes(),
get_storage_backend());
get_storage_backend(),
unlocked_nodes(),
dor_recovered_nodes);
mem_log (str);
}

View File

@ -368,10 +368,8 @@ private:
/* the fault handling offline handler timer */
struct mtc_timer offline_timer ;
/* Host level DOR recovery mode time and bools */
/* Host level DOR recovery time */
int dor_recovery_time ;
bool dor_recovery_mode ;
bool was_dor_recovery_mode ;
/** Integer code representing the host health */
int health ;
@ -1275,7 +1273,7 @@ private:
/* Dead Office Recovery - system level controls */
void manage_dor_recovery ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT severity );
void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix );
void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix, string extra );
struct {
struct node * head_ptr ; /**< Pulse Linked List Head pointer */
@ -1398,6 +1396,7 @@ public:
bool dor_mode_active ;
unsigned int dor_start_time ;
int dor_mode_active_log_throttle ;
int dor_recovered_nodes = 0; /**< DOR node recovery count */
bool hbs_disabled ; /**< Control heartbeat service state */
bool hbs_state_change ; /**< Flag service state change */
@ -1702,6 +1701,9 @@ public:
/** Remove a host from Node list */
int rem_host ( string & hostname );
/** Get the number of unlocked nodes */
int unlocked_nodes ( void );
/** Get the number of worker hosts that are operationally 'enabled' */
int enabled_compute_nodes ( void );

View File

@ -710,6 +710,15 @@ int daemon_configure ( void )
else
mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ;
if ( mtc_config.dor_mode_detect <= 0 )
{
wlog ("DOR mode detect timeout is invalid (%d), setting to default (%d)",
mtc_config.dor_mode_detect,
DEFAULT_DOR_MODE_DETECT);
mtc_config.dor_mode_detect = DEFAULT_DOR_MODE_DETECT ;
}
if ( mtc_config.dor_mode_timeout <= 0 )
{
slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n",
@ -1653,7 +1662,7 @@ void daemon_service_run ( void )
}
#endif
if ( ts.tv_sec < MTC_MINS_15 )
if ( ts.tv_sec < mtc_config.dor_mode_detect )
{
/* The DOR window is much greater in AIO since heartbeat
* cannot start until the inactive AIO has run both manifests */
@ -1669,16 +1678,16 @@ void daemon_service_run ( void )
mtcInv.dor_mode_active = true ;
mtcInv.dor_start_time = ts.tv_sec ;
ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
ilog ("%-12s is ACTIVE ; DOR Recovery %2d:%02d mins (%4d secs) (duration %3d secs)\n",
ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
ilog ("%-12s is ACTIVE ; DOR Recovery %2d:%02d mins (%4d secs) (dor timeout in %3d secs)\n",
mtcInv.my_hostname.c_str(),
mtcInv.dor_start_time/60,
mtcInv.dor_start_time%60,
mtcInv.dor_start_time,
timeout );
ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
ilog ("%-12s host state ; DOR Recovery controller uptime host uptime \n", mtcInv.my_hostname.c_str());
ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------", mtcInv.my_hostname.c_str());
ilog ("%-12s host state ; DOR Recovery controller uptime host uptime ", mtcInv.my_hostname.c_str());
ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------", mtcInv.my_hostname.c_str());
mtcTimer_start ( mtcInv.mtcTimer_dor, mtcTimer_handler, timeout );
}
@ -1992,7 +2001,12 @@ void daemon_service_run ( void )
* then exit DOR mode. We do it here instead of */
if (( mtcInv.dor_mode_active == true ) && ( mtcInv.mtcTimer_dor.tid == NULL ))
{
ilog ("DOR mode disable\n");
wlog ("%s DOR mode disabled ; DOR Recovery Timeout ; %d of %d unlocked hosts ; active controller uptime:%d",
mtcInv.my_hostname.c_str(),
mtcInv.dor_recovered_nodes,
mtcInv.unlocked_nodes(),
mtcInv.get_uptime(mtcInv.my_hostname));
mtcInv.dor_mode_active_log_throttle = 0 ;
mtcInv.dor_mode_active = false ;
}
}

View File

@ -194,6 +194,7 @@ void nodeLinkClass::timer_handler ( int sig, siginfo_t *si, void *uc)
{
mtcTimer_stop_int_safe ( mtcTimer_dor );
mtcTimer_dor.ring = true ;
this->dor_mode_active_log_throttle = 0 ;
return ;
}
@ -488,7 +489,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
AR_LOG_THROTTLE_THRESHOLD,
"%s auto recovery disabled cause:%d",
node_ptr->hostname.c_str(), node_ptr->ar_cause );
return (RETRY); ;
return (RETRY);
}
if ( THIS_HOST )
@ -787,11 +788,10 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_ENABLE__START:
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
plog ("%s Main Enable FSM (from start)%s\n",
node_ptr->hostname.c_str(),
node_ptr->was_dor_recovery_mode ? " (from DOR)" : "" );
this->dor_mode_active ? " (DOR active)" : "" );
/* clear all the past enable failure bools */
clear_main_failed_bools ( node_ptr );
@ -1547,10 +1547,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
/* Inform the VIM that this host is enabled */
mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
plog ("%s is ENABLED%s\n", node_ptr->hostname.c_str(),
node_ptr->was_dor_recovery_mode ? " (from DOR)" : "");
node_ptr->dor_recovery_mode = false ;
node_ptr->was_dor_recovery_mode = false ;
plog ("%s is ENABLED", node_ptr->hostname.c_str());
node_ptr->http_retries_cur = 0 ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
@ -1718,13 +1715,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr->mtcAlive_online == true )
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
mtcTimer_stop ( node_ptr->mtcTimer );
ilog ("%s got requested mtcAlive%s\n",
node_ptr->hostname.c_str(),
node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
this->dor_mode_active ? " (DOR mode)" : "" );
stop_offline_handler ( node_ptr );
@ -1793,7 +1789,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* did not reboot case */
wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)",
node_ptr->hostname.c_str(),
node_ptr->was_dor_recovery_mode ? " (DOR)" : "",
this->dor_mode_active ? " (DOR mode)" : "",
node_ptr->uptime);
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
@ -1808,9 +1804,9 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
else
{
wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
ilog ("%s ... continuing%sgraceful recovery ; (OOB: %08x)\n",
ilog ("%s ... continuing graceful recovery%s ; (OOB: %08x)",
node_ptr->hostname.c_str(),
node_ptr->was_dor_recovery_mode ? " (DOR) " : " ",
this->dor_mode_active ? " (DOR mode)" : "",
node_ptr->mtce_flags);
ilog ("%s ... without additional reboot %s (uptime:%d)\n",
node_ptr->hostname.c_str(),
@ -1845,7 +1841,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
wlog ("%s Loss Of Communication for %d seconds ; disabling host%s\n",
node_ptr->hostname.c_str(),
loc_recovery_timeout,
node_ptr->dor_recovery_mode ? " (DOR)" : "" );
this->dor_mode_active ? " (DOR mode)" : "" );
wlog ("%s ... stopping host services\n", node_ptr->hostname.c_str());
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
@ -1898,7 +1894,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* Only try and issue in-line recovery reboot or reset if
* NOT in Dead Office Recovery (DOR) mode. */
if ( node_ptr->dor_recovery_mode == false )
if ( this->dor_mode_active )
{
ilog ("%s issuing one time graceful recovery reboot over management network\n",
node_ptr->hostname.c_str());
@ -1945,7 +1941,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
LOAD_NODETYPE_TIMERS ;
/* load the mtcAlive timeout to accommodate dor recovery */
timeout = node_ptr->mtcalive_timeout + daemon_get_cfg_ptr()->dor_recovery_timeout_ext ;
timeout = node_ptr->mtcalive_timeout ;
}
/* start the timer that waits for MTCALIVE */
@ -1955,7 +1951,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hostname.c_str(),
MTC_TASK_RECOVERY_WAIT,
timeout,
node_ptr->dor_recovery_mode ? " (DOR) " : " " ,
this->dor_mode_active ? " (DOR) " : " " ,
node_ptr->uptime_save );
clear_service_readies ( node_ptr );
@ -2024,7 +2020,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hostname.c_str(),
MTC_TASK_RECOVERY_WAIT,
timeout,
node_ptr->dor_recovery_mode ? " (DOR) " : " " ,
this->dor_mode_active ? " (DOR mode) " : " " ,
node_ptr->uptime_save );
clear_service_readies ( node_ptr );
@ -2075,7 +2071,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{
mtcTimer_stop ( node_ptr->mtcTimer );
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
/* If the host's uptime is bigger than the saved uptime then
* the host has not reset yet we have disabled services
@ -2084,7 +2079,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
if ((( node_ptr->uptime_save != 0 ) &&
( node_ptr->uptime >= node_ptr->uptime_save )) ||
(( node_ptr->uptime_save == 0 ) &&
( node_ptr->uptime > MTC_MINS_15 )))
( node_ptr->uptime > MTC_MINS_20 )))
{
ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n",
node_ptr->hostname.c_str(), node_ptr->uptime );
@ -2121,7 +2116,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->uptime_save );
ilog ("%s ... continuing with graceful recovery %s\n",
node_ptr->hostname.c_str(),
node_ptr->dor_recovery_mode ? "(DOR)" : " ");
this->dor_mode_active ? "(DOR mode)" : "");
ilog ("%s ... without additional reboot %s\n",
node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
@ -2138,7 +2133,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
}
else if ( node_ptr->mtcTimer.ring == true )
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
/* Set the FSM task state to init failed */
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Failed" );
@ -2523,15 +2517,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->http_retries_cur = 0 ;
doneQueue_purge ( node_ptr );
if ( node_ptr->was_dor_recovery_mode )
if ( this->dor_mode_active )
{
report_dor_recovery ( node_ptr , "is ENABLED" );
}
else
{
plog ("%s is ENABLED (Gracefully Recovered)\n",
node_ptr->hostname.c_str());
report_dor_recovery ( node_ptr , "is ENABLED", "recovery" );
}
plog ("%s is ENABLED (Gracefully Recovered%s)",
node_ptr->hostname.c_str(),
this->dor_mode_active ? " in DOR mode" : "");
alarm_enabled_clear ( node_ptr, false );
break ;
}
@ -6023,11 +6015,26 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_ADD__START:
{
bool timer_set = false ;
plog ("%s Host Add\n", node_ptr->hostname.c_str());
if ( THIS_HOST )
{
struct timespec ts ;
clock_gettime (CLOCK_MONOTONIC, &ts );
node_ptr->uptime = ts.tv_sec ;
}
else if ( ! node_ptr->mtcClient_ready )
{
/* If we have not received a mtcAlive event from the
* mtcClient already then lets request it since that
* is how we get its uptime.
* Don't trust what is in the database since it will
* be stale. Best to default to zero so the logs will
* show that there has been no mtcAlive received */
node_ptr->uptime = 0 ;
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE );
}
/* Request a mtcAlive message ; gives us uptime ; don't trust what is in the database */
node_ptr->uptime = 0 ;
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
plog ("%s Host Add (uptime:%d)", node_ptr->hostname.c_str(), node_ptr->uptime );
ilog ("%s %s %s-%s-%s (%s)\n",
node_ptr->hostname.c_str(),
@ -6075,14 +6082,31 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
EFmAlarmSeverityT mtcAlive_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE);
/* Clear generic enable alarm over process restart.
* Will get reasserted if the cause condition still exists */
/* Manage an existing enable alarm */
if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
ilog ("%s found enable alarm ; clearing %s",
/* Added the unlocked-disabled check to avoid clearing the
* enabled alarm when the node is found to be unlocked-disabled
* with the enable alarm already asserted.
* We don't want to clear it in that case. */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__DISABLED ))
{
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = enable_alarm_severity ;
wlog ("%s found enable alarm while unlocked-disabled ; loaded %s",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(enable_alarm_severity).c_str());
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
alarmUtil_getSev_str(enable_alarm_severity).c_str());
}
else
{
ilog ("%s found enable alarm while %s-%s ; clearing %s",
node_ptr->hostname.c_str(),
adminState_enum_to_str (node_ptr->adminState).c_str(),
operState_enum_to_str (node_ptr->operState_subf).c_str(),
alarmUtil_getSev_str(enable_alarm_severity).c_str());
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
}
/* The config alarm is maintained if it exists.
@ -6230,6 +6254,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
alarm_luks_failure ( node_ptr );
}
node_ptr->ar_disabled = true ;
this->report_dor_recovery ( node_ptr, "is DISABLED" , "auto recovery disabled");
if ( THIS_HOST )
mtcInvApi_update_states ( node_ptr, "unlocked", "enabled", "degraded" );
@ -6341,22 +6366,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
/* Stop the work queue wait timer */
mtcTimer_reset ( node_ptr->mtcTimer );
/* Only start it on this add operation if host is
* already unlocked and enabled and not the active controller */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
/* start the heartbeat service in all cases except for
* THIS host and AIO controller hosts */
if ( NOT_THIS_HOST )
{
if (( LARGE_SYSTEM ) ||
(( AIO_SYSTEM ) && ( this->dor_mode_active == false )))
{
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
}
}
/* Only run hardware monitor if the bm ip is provisioned */
if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
@ -6367,9 +6376,63 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
this->ctl_mtcAlive_gate(node_ptr, false) ;
node_ptr->addStage = MTC_ADD__DONE ;
if (( NOT_THIS_HOST ) &&
((( AIO_SYSTEM ) && ( is_controller(node_ptr) == false )) || ( LARGE_SYSTEM )) &&
( this->hbs_failure_action != HBS_FAILURE_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_5 );
if ( ! node_ptr->hbsClient_ready )
{
ilog ("%s waiting for hbsClient ready event (%d secs)", node_ptr->hostname.c_str(), MTC_MINS_5);
}
node_ptr->addStage = MTC_ADD__HEARTBEAT_WAIT ;
}
else
{
node_ptr->addStage = MTC_ADD__DONE ;
}
break;
}
case MTC_ADD__HEARTBEAT_WAIT:
{
/* Wait for hbsClient ready event */
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
wlog ("%s hbsClient ready event timeout\n", node_ptr->hostname.c_str());
}
else if ( node_ptr->hbsClient_ready == false )
{
break ;
}
else
{
mtcTimer_reset ( node_ptr->mtcTimer );
}
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_DURING_ADD,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* allow heartbeat to run for MTC_HEARTBEAT_SOAK_DURING_ADD
* seconds before we declare enable */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_DURING_ADD );
node_ptr->addStage = MTC_ADD__HEARTBEAT_SOAK ;
break ;
}
case MTC_ADD__HEARTBEAT_SOAK:
{
if ( node_ptr->mtcTimer.ring == true )
{
plog ("%s heartbeating", node_ptr->hostname.c_str());
/* if heartbeat is not working then we will
* never get here */
node_ptr->addStage = MTC_ADD__DONE ;
}
break ;
}
case MTC_ADD__DONE:
default:
{
@ -6396,16 +6459,55 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
/* In AIO if in DOR mode and the host is unlocked enabled
* we need to run the subfunction handler and request
* to start host services. */
/* Need to run the subfunction enable handler
* for AIO controllers while in DOR mode */
if ( this->dor_mode_active )
{
ilog ("%s running subfunction enable for unlocked-enabled AIO controller (DOR mode)", node_ptr->hostname.c_str());
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF );
break ;
}
}
}
else if ( this->dor_mode_active )
{
/* The Enable SUBF handler will do this so lets not do it twice */
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
string state_str = "" ;
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
state_str = "is ENABLED" ;
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
state_str = "is DEGRADED" ;
}
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED )
{
state_str = "is FAILED" ;
}
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
{
state_str = "is OFFLINE" ;
}
if ( ! state_str.empty() )
{
report_dor_recovery ( node_ptr , state_str, "" ) ;
}
else
{
ilog ("%-12s is waiting ; DOR Recovery ; %s-%s-%s ; mtcClient:%c hbsClient:%c uptime:%3d task:%s",
node_ptr->hostname.c_str(),
adminState_enum_to_str (node_ptr->adminState).c_str(),
operState_enum_to_str (node_ptr->operState).c_str(),
availStatus_enum_to_str(node_ptr->availStatus).c_str(),
node_ptr->mtcClient_ready ? 'Y':'N',
node_ptr->hbsClient_ready ? 'Y':'N',
node_ptr->uptime,
node_ptr->task.empty() ? "empty" : node_ptr->task.c_str());
}
}
}
node_ptr->addStage = MTC_ADD__START;
plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
@ -7597,35 +7699,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
{
insvTestStageChange ( node_ptr, MTC_INSV_TEST__RUN );
}
/* manage degrade state and alarms */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( node_ptr->ar_disabled == false ))
{
/************************************************************
* Manage In-Service Alarms *
***********************************************************/
/* Manage Inservice Enable Alarm */
if ( node_ptr->hostservices_failed )
{
alarm_insv_failure ( node_ptr );
}
else
{
alarm_insv_clear ( node_ptr, false );
}
/* Manage Compute Subfunction Failure Alarm */
if ( node_ptr->hostservices_failed_subf )
{
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
}
else
{
alarm_compute_clear ( node_ptr, false );
}
}
break ;
}
case MTC_INSV_TEST__RUN:
@ -7694,16 +7767,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
/************************************************************
* Prevent the start host services from running while in DOR
***********************************************************/
if ( node_ptr->dor_recovery_mode == true )
{
/* wait longer for the host to boot up */
wlog ("%s DOR recovery active ; waiting on host\n",
node_ptr->hostname.c_str());
}
else if ( this->dor_mode_active == true )
if ( this->dor_mode_active == true )
{
ilog_throttled ( this->dor_mode_active_log_throttle, 20,
"DOR mode active\n");
@ -8313,4 +8377,4 @@ int nodeLinkClass::self_fail_handler ( struct nodeLinkClass::node * node_ptr )
"force swact to unlocked-enabled standby controller");
}
return (PASS);
}
}

View File

@ -411,12 +411,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
if ( node_ptr->dor_recovery_mode || node_ptr->was_dor_recovery_mode )
{
node_ptr->dor_recovery_mode = false ;
node_ptr->was_dor_recovery_mode = true ;
}
if (( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] != FM_ALARM_SEVERITY_CLEAR ) ||
( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CLEAR ) ||
( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR ))
@ -454,9 +448,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->subf_enabled = true ;
node_ptr->inservice_failed_subf = false ;
if ( node_ptr->was_dor_recovery_mode )
if ( this->dor_mode_active )
{
report_dor_recovery ( node_ptr , "is ENABLED" );
report_dor_recovery ( node_ptr , "is ENABLED", "subf" );
}
else
{
@ -488,9 +482,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
if ( node_ptr->was_dor_recovery_mode )
if ( this->dor_mode_active )
{
report_dor_recovery ( node_ptr , "is ENABLED-degraded" );
report_dor_recovery ( node_ptr , "is DEGRADED", "subf" );
}
else
{
@ -511,16 +505,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
if ( node_ptr->was_dor_recovery_mode )
{
report_dor_recovery ( node_ptr , "is DISABLED-failed" );
}
else
{
elog ("%s is DISABLED-failed (subfunction failed)\n",
name.c_str() );
}
this->dor_mode_active = false ;
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ) ;
@ -552,9 +536,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->enabled_count++ ;
node_ptr->health_threshold_counter = 0 ;
node_ptr->was_dor_recovery_mode = false ;
node_ptr->dor_recovery_mode = false ;
this->dor_mode_active = false ;
ar_enable ( node_ptr );

View File

@ -125,12 +125,12 @@ loc_recovery_timeout = 5 ; Loss Of Communication Recovery Timeout
; the max number of seconds that a host can be in
; loss of communication state without failing the unit
dor_mode_timeout = 20 ; The default base time in seconds for how long
dor_mode_detect = 1200 ; If the controller's uptime is less than this value, the
; mtcAgent enters the DOR-mode-active state. Default: 20 minutes
dor_mode_timeout = 1000 ; The default base time in seconds for how long
; maintenance DOR mode is active. This number
; is extended by the number of enabled hosts.
dor_recovery_timeout_ext = 1800 ; DOR timeout extension. Extra time in seconds
; that is added to the host-specific recovery time,
; yielding the host's overall DOR recovery timeout.
swact_timeout = 120 ; Seconds Mtce waits for HA Service SWACT before failing
; the swact operation