diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index a3227b5b..096ed7ca 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -159,8 +159,8 @@ typedef struct int work_queue_timeout ; /**< end of action workq complete TO */ int loc_recovery_timeout ; /**< loss of comms recovery timeout */ int node_reinstall_timeout ; /**< node reinstall timeout */ + int dor_mode_detect ; /**< dead office recovery detect thld*/ int dor_mode_timeout ; /**< dead office recovery timeout */ - int dor_recovery_timeout_ext ; /**< dor recovery timeout extension */ int uptime_period ; /**< Uptime refresh timer period */ int online_period ; /**< locked availability refresh */ int insv_test_period ; /**< insv test period in secs */ diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index be259f6e..caa9731d 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -203,8 +203,9 @@ typedef enum #define DEFAULT_MTCALIVE_TIMEOUT (1200) #define DEFAULT_GOENABLE_TIMEOUT (300) -#define DEFAULT_DOR_MODE_TIMEOUT (20) -#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600) +#define DEFAULT_DOR_MODE_TIMEOUT (MTC_MINS_15) +#define DEFAULT_DOR_MODE_AIO_TIMEOUT (MTC_MINS_20) +#define DEFAULT_DOR_MODE_DETECT (MTC_MINS_20) #define DEFAULT_POWER_OFF_RETRY_WAIT (30) /** TODO: Convert names to omit JSON part */ @@ -962,6 +963,8 @@ typedef enum MTC_ADD__MTC_SERVICES, MTC_ADD__CLEAR_TASK, MTC_ADD__WORKQUEUE_WAIT, + MTC_ADD__HEARTBEAT_WAIT, + MTC_ADD__HEARTBEAT_SOAK, MTC_ADD__DONE, MTC_ADD__STAGES } mtc_addStages_enum ; diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index 8a33c713..b8d83e02 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -45,6 +45,7 @@ #define MTC_MINS_5 (300) #define MTC_MINS_8 (480) #define MTC_MINS_10 (600) +#define MTC_MINS_14 (840) #define MTC_MINS_15 (900) #define MTC_MINS_20 (1200) #define MTC_MINS_30 (1800) @@ -71,8 +72,7 @@ #define MTC_BM_POWERON_TIMEOUT (30) #define MTC_RESET_PROG_TIMEOUT (20) #define MTC_WORKQUEUE_TIMEOUT (60) -#define MTC_WORKER_CONFIG_TIMEOUT (900) -#define MTC_EXIT_DOR_MODE_TIMEOUT (60*15) +#define MTC_WORKER_CONFIG_TIMEOUT (MTC_MINS_14) #define MTC_RESET_PROG_OFFLINE_TIMEOUT (20) #define MTC_RESET_TO_OFFLINE_TIMEOUT (150) #define MTC_POWEROFF_TO_OFFLINE_TIMEOUT (200) @@ -80,6 +80,7 @@ #define MTC_POWERCYCLE_COOLDOWN_DELAY (MTC_MINS_5) #define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5) #define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11) +#define MTC_HEARTBEAT_SOAK_DURING_ADD (10) #define MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40) #define MTC_REINSTALL_TIMEOUT_BMC_ACC (MTC_MINS_10) #define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1) diff --git a/mtce-common/src/daemon/daemon_config.cpp b/mtce-common/src/daemon/daemon_config.cpp index 3bd49b52..edaa70a5 100644 --- a/mtce-common/src/daemon/daemon_config.cpp +++ b/mtce-common/src/daemon/daemon_config.cpp @@ -191,10 +191,10 @@ int timeout_config_handler ( void * user, config_ptr->dor_mode_timeout = atoi(value); ilog ("DOR Mode TO : %3d secs\n", config_ptr->dor_mode_timeout ); } - else if (MATCH("timeouts", "dor_recovery_timeout_ext")) + else if (MATCH("timeouts", "dor_mode_detect")) { - config_ptr->dor_recovery_timeout_ext = atoi(value); - ilog ("DOR Time Ext: %3d secs\n", config_ptr->dor_recovery_timeout_ext ); + config_ptr->dor_mode_detect = atoi(value); + ilog ("DOR Mode Det: %3d secs", config_ptr->dor_mode_detect ); } else if 
(MATCH("timeouts", "bmc_audit_period")) { diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 85b631f6..9e7a1f03 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -582,8 +582,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->offline_log_reported = true ; ptr->online_log_reported = false ; - ptr->dor_recovery_mode = false ; - ptr->was_dor_recovery_mode= false ; ptr->dor_recovery_time = 0 ; ptr->vim_notified = false ; @@ -2134,9 +2132,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv ) /* handle a lock request while unlocked */ if ( !inv.action.compare ( "lock" ) ) { - if ( node_ptr->dor_recovery_mode == true ) - node_ptr->dor_recovery_mode = false ; - /* Set action to LOCK and let the FSM run the disable handler */ adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK ); } @@ -2183,9 +2178,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv ) /* TODO: Create customer log of this action */ ilog ("%s Force Lock Action\n", node_ptr->hostname.c_str()); - if ( node_ptr->dor_recovery_mode == true ) - node_ptr->dor_recovery_mode = false ; - if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) { if ( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK ) @@ -2210,9 +2202,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv ) { if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) { - if ( node_ptr->dor_recovery_mode == true ) - node_ptr->dor_recovery_mode = false ; - /* Set action to LOCK and let the FSM run the disable handler */ adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK ); } @@ -3125,13 +3114,13 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) if ( delay > 0 ) { mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, delay ); + ilog ("Host add delay is %d seconds", delay ); node_ptr->addStage = MTC_ADD__START_DELAY ; } else { node_ptr->addStage = MTC_ADD__START ; } - ilog ("Host add delay is %d seconds", delay ); adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ADD ); } return (rc); @@ -5227,6 +5216,20 @@ int nodeLinkClass::manage_shadow_change ( string hostname ) return (rc); } +/** Returns the number of unlocked nodes */ +int nodeLinkClass::unlocked_nodes ( void ) +{ + int temp_count = 0 ; + for ( struct node * ptr = head ; ; ptr = ptr->next ) + { + if (ptr->adminState == MTC_ADMIN_STATE__UNLOCKED) + temp_count++ ; + if (( ptr->next == NULL ) || ( ptr == tail )) + break ; + } + return (temp_count); +} + /** Returns the number of worker hosts that are operationally 'enabled' */ int nodeLinkClass::enabled_compute_nodes ( void ) { @@ -5461,6 +5464,7 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa /* Nothing to do if this host is not in the hbs_minor state */ if ( node_ptr->hbs_minor[iface] == true ) { + dlog ("%s clearing heartbeat minor on %s network", node_ptr->hostname.c_str(), get_iface_name_str(iface)); /* clear it - possibly temporarily */ node_ptr->hbs_minor[iface] = false ; @@ -5526,7 +5530,7 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa { if ( ptr->operState != MTC_OPER_STATE__ENABLED ) { - slog ("%s found hbs_minor set for disabled host\n" , ptr->hostname.c_str() ); + slog ("%s found hbs_minor set for %s network for disabled host\n" , ptr->hostname.c_str(), get_iface_name_str(iface)); } temp_count++ ; } @@ -5552,56 +5556,6 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa } } -/**************************************************************************** - * - * 
Name : manage_dor_recovery - * - * Description: Enable DOR recovery mode for this host. - * Generate log - * - * The severity parm is used to enhance the logs to indicate what - * severity level this utility was called from ; - * minor, major, or critical - * - ***************************************************************************/ - -void nodeLinkClass::manage_dor_recovery ( struct nodeLinkClass::node * node_ptr, - EFmAlarmSeverityT severity ) -{ - if (( severity == FM_ALARM_SEVERITY_CLEAR ) && - ( node_ptr->dor_recovery_mode == true )) - { - node_ptr->dor_recovery_mode = false ; - node_ptr->was_dor_recovery_mode = true ; - } - - else if (( severity == FM_ALARM_SEVERITY_CRITICAL ) && - ( node_ptr->dor_recovery_mode == false )) - { - struct timespec ts ; - clock_gettime (CLOCK_MONOTONIC, &ts ); - wlog ("%-12s is waiting ; DOR recovery %2ld:%02ld mins (%4ld secs)\n", - node_ptr->hostname.c_str(), - ts.tv_sec/60, - ts.tv_sec%60, - ts.tv_sec); - - node_ptr->dor_recovery_time = 0 ; - node_ptr->dor_recovery_mode = true ; - node_ptr->hbsClient_ready = false ; - mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT ); - - /* don't restart graceful recovery for this host if its already in that FSM */ - if (( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) && - ( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK )) - { - recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); - } - } -} - - /** Manage heartbeat failure events */ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface, bool clear_event ) { @@ -5626,11 +5580,7 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface node_ptr->hbs_failure[iface] = false ; } } - else if ( this->mtcTimer_dor.tid ) - { - manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); - } - else + else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) { /* handle auto recovery for heartbeat failure during enable */ if ( node_ptr->ar_cause == MTC_AR_DISABLE_CAUSE__HEARTBEAT ) @@ -5662,51 +5612,54 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface mnfa_add_host ( node_ptr , iface ); - if ( mnfa_active == false ) + if ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) { - /* if node is already in graceful recovery just ignore the event */ - if ( node_ptr->graceful_recovery_counter != 0 ) + if ( mnfa_active == false ) { - dlog ("%s %s loss event ; already in graceful recovery try %d", - hostname.c_str(), - get_iface_name_str(iface), - node_ptr->graceful_recovery_counter ); - return ; - } - elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface)); - if ( iface == CLSTR_IFACE ) - { - node_ptr->heartbeat_failed[CLSTR_IFACE] = true ; - } - else if ( iface == MGMNT_IFACE ) - { - node_ptr->heartbeat_failed[MGMNT_IFACE] = true ; - } - if (mnfa_host_count[iface] < this->mnfa_threshold) - { - elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface)); - - nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED ); - - if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) && - ( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK )) + /* if node is already in graceful recovery just ignore the event */ + if ( node_ptr->graceful_recovery_counter != 0 ) { - if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) + dlog ("%s %s loss event ; already in graceful recovery try %d", + hostname.c_str(), + get_iface_name_str(iface), + 
node_ptr->graceful_recovery_counter ); + return ; + } + elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface)); + if ( iface == CLSTR_IFACE ) + { + node_ptr->heartbeat_failed[CLSTR_IFACE] = true ; + } + else if ( iface == MGMNT_IFACE ) + { + node_ptr->heartbeat_failed[MGMNT_IFACE] = true ; + } + if (mnfa_host_count[iface] < this->mnfa_threshold) + { + elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface)); + + nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED ); + + if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) && + ( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK )) { - wlog ("%s restarting graceful recovery\n", hostname.c_str() ); + if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) + { + wlog ("%s restarting graceful recovery", hostname.c_str() ); + } + else + { + wlog ("%s starting graceful recovery", hostname.c_str() ); + } + recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); } else { - wlog ("%s starting graceful recovery\n", hostname.c_str() ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB ); + enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); } - recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); - } - else - { - mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB ); - enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); } } } @@ -5801,11 +5754,8 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface hbs_minor_clear ( node_ptr, iface ); } - else if ( this->mtcTimer_dor.tid ) - { - manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MAJOR ); - } - else + /* - we don't care about locked hosts */ + else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) { if ( mnfa_active == false ) { @@ -5814,7 +5764,7 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface mnfa_add_host ( node_ptr, iface ); - if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) + if ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) { if ( iface == MGMNT_IFACE ) { @@ -5851,16 +5801,11 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface, alog ("%s %s Heartbeat Minor (clear)\n", hostname.c_str(), get_iface_name_str(iface)); hbs_minor_clear ( node_ptr, iface ); } - /* if not a clear then only set if the host is enabled - * - we don't care about disabled hosts */ - else if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) + /* - we don't care about locked hosts */ + else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) { - if ( this->mtcTimer_dor.tid ) - { - manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MINOR ); - } - - else if ( node_ptr->hbs_minor[iface] != true ) + if ( node_ptr->hbs_minor[iface] != true ) { mnfa_add_host ( node_ptr, iface ); } @@ -7076,8 +7021,7 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr, * other action can take effect. 
* If its not one of these action then just proceed with it **/ - if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) && - ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK )) + if ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ) { clog ("%s Administrative Action '%s' -> '%s'\n", node_ptr->hostname.c_str(), @@ -8510,30 +8454,76 @@ int nodeLinkClass::ar_handler ( struct nodeLinkClass::node * node_ptr, * Description: Create a specifically formatted log for the specified * hosts DOR recovery state and timing. * - * Parameters : The node and a caller prefix string that states if the node - * is ENABELD + * Assumptions: Only logged if the active controller has an uptime + * less than 20 minutes (default). Configurable in mtce.conf + * + * Parameters : + * + * @param node_ptr Pointer to the node in the inventoried node linked list. + * @param node_state_log_prefix Prefix for the node's state log messages. + * is ENABLED + * is DEGRADED + * is DISABLED * is FAILED - * is ENMABLED-degraded - * etc. + * is OFFLINE + * @param extra string representing where this function was called. * ***************************************************************************/ void nodeLinkClass::report_dor_recovery ( struct nodeLinkClass::node * node_ptr, - string node_state_log_prefix ) + string node_state_log_prefix, + string extra ) { struct timespec ts ; clock_gettime (CLOCK_MONOTONIC, &ts ); - node_ptr->dor_recovery_time = ts.tv_sec ; - plog ("%-12s %s ; DOR Recovery %2d:%02d mins (%4d secs) (uptime:%2d:%02d mins)\n", - node_ptr->hostname.c_str(), - node_state_log_prefix.c_str(), - node_ptr->dor_recovery_time/60, - node_ptr->dor_recovery_time%60, - node_ptr->dor_recovery_time, - node_ptr->uptime/60, - node_ptr->uptime%60 ); - node_ptr->dor_recovery_mode = false ; - node_ptr->was_dor_recovery_mode = false ; + if ( this->dor_mode_active ) + { + node_ptr->dor_recovery_time = ts.tv_sec ; + plog ("%-12s %-11s ; DOR Recovery %2d:%02d mins (%4d secs) (uptime:%2d:%02d mins) %s", + node_ptr->hostname.c_str(), + node_state_log_prefix.c_str(), + node_ptr->dor_recovery_time/60, + node_ptr->dor_recovery_time%60, + node_ptr->dor_recovery_time, + node_ptr->uptime/60, + node_ptr->uptime%60, + extra.c_str()); + + // Accounting + int unlocked_nodes = this->unlocked_nodes() ; + if ( ++this->dor_recovered_nodes == unlocked_nodes ) + { + mtcTimer_reset (this->mtcTimer_dor); + this->dor_mode_active = false ; + this->dor_mode_active_log_throttle = 0 ; + ilog ("%-13s %3d of %-3d ; DOR Recovery ; all nodes are recovered ; active controller uptime:%ld", + this->my_hostname.c_str(), + this->dor_recovered_nodes, + unlocked_nodes, + ts.tv_sec); + } + else if ( this->dor_recovered_nodes > this->unlocked_nodes() ) + { + slog ("%s unexpected extra DOR recovery call ; unlocked:%d recovered:%d", + node_ptr->hostname.c_str(), + unlocked_nodes, + this->dor_recovered_nodes); + } + else + { + ilog ("%s %d of %d DOR nodes recovered", + node_ptr->hostname.c_str(), + this->dor_recovered_nodes, + unlocked_nodes); + } + } + else + { + dlog ("%s DOR Recovery called with '%s %s' while dor mode disabled", + node_ptr->hostname.c_str(), + node_state_log_prefix.c_str(), + extra.c_str()); + } } void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr ) @@ -8547,10 +8537,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr ) return ; } - if ( node_ptr->was_dor_recovery_mode ) - { - report_dor_recovery ( node_ptr , "is FAILED " ); - } + report_dor_recovery ( node_ptr , "is FAILED", "full 
enable" ); plog ("%s Forcing Full Enable Sequence\n", node_ptr->hostname.c_str()); @@ -8560,9 +8547,8 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr ) allStateChange ( node_ptr, node_ptr->adminState, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED ); enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); /* reset the fsm */ - // don't override the add action or lock actions / - if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) && - ( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) && + // don't override the lock actions / + if (( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) && ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK )) { adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); // no action @@ -9783,10 +9769,8 @@ void nodeLinkClass::mem_log_general ( void ) void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s DOR - Active: %c Was: %c Time: %5d (00:%02d:%02d)\n", + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s DOR - Time: %5d (00:%02d:%02d)\n", node_ptr->hostname.c_str(), - node_ptr->dor_recovery_mode ? 'Y' : 'N', - node_ptr->was_dor_recovery_mode ? 'Y' : 'N', node_ptr->dor_recovery_time, node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time/60 : 0, node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time%60 : 0); @@ -9812,12 +9796,14 @@ void nodeLinkClass::mem_log_mnfa ( void ) void nodeLinkClass::mem_log_general_mtce_hosts ( void ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n", + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d Unlocked:%d DOR:Recovered:%d\n", my_hostname.c_str(), num_controllers_enabled(), enabled_compute_nodes(), enabled_storage_nodes(), - get_storage_backend()); + get_storage_backend(), + unlocked_nodes(), + dor_recovered_nodes); mem_log (str); } diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 11433263..b85c3820 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -368,10 +368,8 @@ private: /* the fault handling offline handler timer */ struct mtc_timer offline_timer ; - /* Host level DOR recovery mode time and bools */ + /* Host level DOR recovery time */ int dor_recovery_time ; - bool dor_recovery_mode ; - bool was_dor_recovery_mode ; /** Integer code representing the host health */ int health ; @@ -1275,7 +1273,7 @@ private: /* Dead Office Recovery - system level controls */ void manage_dor_recovery ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT severity ); - void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix ); + void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix, string extra ); struct { struct node * head_ptr ; /**< Pulse Linked List Head pointer */ @@ -1398,6 +1396,7 @@ public: bool dor_mode_active ; unsigned int dor_start_time ; int dor_mode_active_log_throttle ; + int dor_recovered_nodes = 0; /**< DOR node recovery count */ bool hbs_disabled ; /**< Control heartbeat service state */ bool hbs_state_change ; /**< Flag service state change */ @@ -1702,6 +1701,9 @@ public: /** Remove a host from Node list */ int rem_host ( string & hostname ); + /** Get the number of unlocked nodes */ + int unlocked_nodes ( void ); + /** Get the number of worker hosts that are operationally 'enabled' 
*/ int enabled_compute_nodes ( void ); diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index ab7874b2..0fbf1179 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -710,6 +710,15 @@ int daemon_configure ( void ) else mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ; + if ( mtc_config.dor_mode_detect <= 0 ) + { + wlog ("DOR mode detect timeout is invalid (%d), setting to default (%d)", + mtc_config.dor_mode_detect, + DEFAULT_DOR_MODE_DETECT); + + mtc_config.dor_mode_detect = DEFAULT_DOR_MODE_DETECT ; + } + if ( mtc_config.dor_mode_timeout <= 0 ) { slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n", @@ -1653,7 +1662,7 @@ void daemon_service_run ( void ) } #endif - if ( ts.tv_sec < MTC_MINS_15 ) + if ( ts.tv_sec < mtc_config.dor_mode_detect ) { /* AIO DOR window is much greater in AIO since heartbeat * cannot start until the inactive AIO has run both manifests */ @@ -1669,16 +1678,16 @@ void daemon_service_run ( void ) mtcInv.dor_mode_active = true ; mtcInv.dor_start_time = ts.tv_sec ; - ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str()); - ilog ("%-12s is ACTIVE ; DOR Recovery %2d:%02d mins (%4d secs) (duration %3d secs)\n", + ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str()); + ilog ("%-12s is ACTIVE ; DOR Recovery %2d:%02d mins (%4d secs) (dor timeout in %3d secs)\n", mtcInv.my_hostname.c_str(), mtcInv.dor_start_time/60, mtcInv.dor_start_time%60, mtcInv.dor_start_time, timeout ); - ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str()); - ilog ("%-12s host state ; DOR Recovery controller uptime host uptime \n", mtcInv.my_hostname.c_str()); - ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str()); + ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------", mtcInv.my_hostname.c_str()); + ilog ("%-12s host state ; DOR Recovery controller uptime host uptime ", mtcInv.my_hostname.c_str()); + ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------", mtcInv.my_hostname.c_str()); mtcTimer_start ( mtcInv.mtcTimer_dor, mtcTimer_handler, timeout ); } @@ -1992,7 +2001,12 @@ void daemon_service_run ( void ) * then exit DOR mode. 
We do it here instead of */ if (( mtcInv.dor_mode_active == true ) && ( mtcInv.mtcTimer_dor.tid == NULL )) { - ilog ("DOR mode disable\n"); + wlog ("%s DOR mode disabled ; DOR Recovery Timeout ; %d of %d unlocked hosts ; active controller uptime:%d", + mtcInv.my_hostname.c_str(), + mtcInv.dor_recovered_nodes, + mtcInv.unlocked_nodes(), + mtcInv.get_uptime(mtcInv.my_hostname)); + mtcInv.dor_mode_active_log_throttle = 0 ; mtcInv.dor_mode_active = false ; } } diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index fb17627f..cf832a8b 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -194,6 +194,7 @@ void nodeLinkClass::timer_handler ( int sig, siginfo_t *si, void *uc) { mtcTimer_stop_int_safe ( mtcTimer_dor ); mtcTimer_dor.ring = true ; + this->dor_mode_active_log_throttle = 0 ; return ; } @@ -488,7 +489,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) AR_LOG_THROTTLE_THRESHOLD, "%s auto recovery disabled cause:%d", node_ptr->hostname.c_str(), node_ptr->ar_cause ); - return (RETRY); ; + return (RETRY); } if ( THIS_HOST ) @@ -787,11 +788,10 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_ENABLE__START: { - manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR ); plog ("%s Main Enable FSM (from start)%s\n", node_ptr->hostname.c_str(), - node_ptr->was_dor_recovery_mode ? " (from DOR)" : "" ); + this->dor_mode_active ? " (DOR active)" : "" ); /* clear all the past enable failure bools */ clear_main_failed_bools ( node_ptr ); @@ -1547,10 +1547,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) /* Inform the VIM that this host is enabled */ mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 ); - plog ("%s is ENABLED%s\n", node_ptr->hostname.c_str(), - node_ptr->was_dor_recovery_mode ? " (from DOR)" : ""); - node_ptr->dor_recovery_mode = false ; - node_ptr->was_dor_recovery_mode = false ; + plog ("%s is ENABLED", node_ptr->hostname.c_str()); node_ptr->http_retries_cur = 0 ; adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); @@ -1718,13 +1715,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { if ( node_ptr->mtcAlive_online == true ) { - manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR ); mtcTimer_stop ( node_ptr->mtcTimer ); ilog ("%s got requested mtcAlive%s\n", node_ptr->hostname.c_str(), - node_ptr->was_dor_recovery_mode ? " (DOR)" : "" ); + this->dor_mode_active ? " (DOR mode)" : "" ); stop_offline_handler ( node_ptr ); @@ -1793,7 +1789,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* did not reboot case */ wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)", node_ptr->hostname.c_str(), - node_ptr->was_dor_recovery_mode ? " (DOR)" : "", + this->dor_mode_active ? " (DOR mode)" : "", node_ptr->uptime); wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str()); @@ -1808,9 +1804,9 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) else { wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str()); - ilog ("%s ... continuing%sgraceful recovery ; (OOB: %08x)\n", + ilog ("%s ... continuing graceful recovery%s ; (OOB: %08x)", node_ptr->hostname.c_str(), - node_ptr->was_dor_recovery_mode ? " (DOR) " : " ", + this->dor_mode_active ? " (DOR mode)" : "", node_ptr->mtce_flags); ilog ("%s ... 
without additional reboot %s (uptime:%d)\n", node_ptr->hostname.c_str(), @@ -1845,7 +1841,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) wlog ("%s Loss Of Communication for %d seconds ; disabling host%s\n", node_ptr->hostname.c_str(), loc_recovery_timeout, - node_ptr->dor_recovery_mode ? " (DOR)" : "" ); + this->dor_mode_active ? " (DOR mode)" : "" ); wlog ("%s ... stopping host services\n", node_ptr->hostname.c_str()); wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str()); @@ -1898,7 +1894,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Only try and issue in-line recovery reboot or reset if * NOT in Dead Office Recovery (DOR) mode. */ - if ( node_ptr->dor_recovery_mode == false ) + if ( this->dor_mode_active ) { ilog ("%s issuing one time graceful recovery reboot over management network\n", node_ptr->hostname.c_str()); @@ -1945,7 +1941,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) LOAD_NODETYPE_TIMERS ; /* load the mtcAlive timeout to accomodate for dor recovery */ - timeout = node_ptr->mtcalive_timeout + daemon_get_cfg_ptr()->dor_recovery_timeout_ext ; + timeout = node_ptr->mtcalive_timeout ; } /* start the timer that waits for MTCALIVE */ @@ -1955,7 +1951,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str(), MTC_TASK_RECOVERY_WAIT, timeout, - node_ptr->dor_recovery_mode ? " (DOR) " : " " , + this->dor_mode_active ? " (DOR) " : " " , node_ptr->uptime_save ); clear_service_readies ( node_ptr ); @@ -2024,7 +2020,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str(), MTC_TASK_RECOVERY_WAIT, timeout, - node_ptr->dor_recovery_mode ? " (DOR) " : " " , + this->dor_mode_active ? " (DOR mode) " : " " , node_ptr->uptime_save ); clear_service_readies ( node_ptr ); @@ -2075,7 +2071,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { mtcTimer_stop ( node_ptr->mtcTimer ); - manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR ); /* If the host's uptime is bigger than the saved uptime then * the host has not reset yet we have disabled services @@ -2084,7 +2079,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) if ((( node_ptr->uptime_save != 0 ) && ( node_ptr->uptime >= node_ptr->uptime_save )) || (( node_ptr->uptime_save == 0 ) && - ( node_ptr->uptime > MTC_MINS_15 ))) + ( node_ptr->uptime > MTC_MINS_20 ))) { ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime ); @@ -2121,7 +2116,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->uptime_save ); ilog ("%s ... continuing with graceful recovery %s\n", node_ptr->hostname.c_str(), - node_ptr->dor_recovery_mode ? "(DOR)" : " "); + this->dor_mode_active ? "(DOR mode)" : ""); ilog ("%s ... without additional reboot %s\n", node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? 
"or reset" : "" ); @@ -2138,7 +2133,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) } else if ( node_ptr->mtcTimer.ring == true ) { - manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR ); /* Set the FSM task state to init failed */ mtcInvApi_update_task ( node_ptr, "Graceful Recovery Failed" ); @@ -2523,15 +2517,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->http_retries_cur = 0 ; doneQueue_purge ( node_ptr ); - if ( node_ptr->was_dor_recovery_mode ) + if ( this->dor_mode_active ) { - report_dor_recovery ( node_ptr , "is ENABLED" ); - } - else - { - plog ("%s is ENABLED (Gracefully Recovered)\n", - node_ptr->hostname.c_str()); + report_dor_recovery ( node_ptr , "is ENABLED", "recovery" ); } + plog ("%s is ENABLED (Gracefully Recovered%s)", + node_ptr->hostname.c_str(), + this->dor_mode_active ? " in DOR mode" : ""); alarm_enabled_clear ( node_ptr, false ); break ; } @@ -6023,11 +6015,26 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_ADD__START: { bool timer_set = false ; - plog ("%s Host Add\n", node_ptr->hostname.c_str()); + if ( THIS_HOST ) + { + struct timespec ts ; + clock_gettime (CLOCK_MONOTONIC, &ts ); + node_ptr->uptime = ts.tv_sec ; + } + else if ( ! node_ptr->mtcClient_ready ) + { + /* If we have not received a mtcAlive event from the + * mtcClient already then lets request it since that + * is how we get its uptime. + * Don't trust what is in the database since it will + * be stale. Best to default to zero so the logs will + * show that there has been no mtcAlive received */ + node_ptr->uptime = 0 ; + send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE ); + send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE ); + } - /* Request a mtcAlive message ; gives us uptime ; don't trust what is in the database */ - node_ptr->uptime = 0 ; - send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE ); + plog ("%s Host Add (uptime:%d)", node_ptr->hostname.c_str(), node_ptr->uptime ); ilog ("%s %s %s-%s-%s (%s)\n", node_ptr->hostname.c_str(), @@ -6075,14 +6082,31 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) EFmAlarmSeverityT mtcAlive_alarm_severity = mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE); - /* Clear generic enable alarm over process restart. - * Will get reasserted if the cause condition still exists */ + /* Manage an existing enable alarm */ if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR ) { - ilog ("%s found enable alarm ; clearing %s", + /* Added the unlocked-disabled check to avoid clearing the + * enabled alarm when the node is found to be unlocked-disabled + * with the enable alarm already asserted. + * We don't want to clear it in that case. 
*/ + if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ( node_ptr->operState == MTC_OPER_STATE__DISABLED )) + { + node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ; + node_ptr->alarms[MTC_ALARM_ID__ENABLE] = enable_alarm_severity ; + wlog ("%s found enable alarm while unlocked-disabled ; loaded %s", node_ptr->hostname.c_str(), - alarmUtil_getSev_str(enable_alarm_severity).c_str()); - mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); + alarmUtil_getSev_str(enable_alarm_severity).c_str()); + } + else + { + ilog ("%s found enable alarm while %s-%s ; clearing %s", + node_ptr->hostname.c_str(), + adminState_enum_to_str (node_ptr->adminState).c_str(), + operState_enum_to_str (node_ptr->operState_subf).c_str(), + alarmUtil_getSev_str(enable_alarm_severity).c_str()); + mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); + } } /* The config alarm is maintained if it exists. @@ -6230,6 +6254,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) alarm_luks_failure ( node_ptr ); } node_ptr->ar_disabled = true ; + this->report_dor_recovery ( node_ptr, "is DISABLED" , "auto recovery disabled"); if ( THIS_HOST ) mtcInvApi_update_states ( node_ptr, "unlocked", "enabled", "degraded" ); @@ -6341,22 +6366,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) /* Stop the work queue wait timer */ mtcTimer_reset ( node_ptr->mtcTimer ); - /* Only start it on this add operation if host is - * already unlocked and enabled and not the active controller */ - if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) - { - /* start the heartbeat service in all cases except for - * THIS host and AIO controller hosts */ - if ( NOT_THIS_HOST ) - { - if (( LARGE_SYSTEM ) || - (( AIO_SYSTEM ) && ( this->dor_mode_active == false ))) - { - send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); - } - } - } /* Only run hardware monitor if the bm ip is provisioned */ if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) && @@ -6367,9 +6376,63 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) } this->ctl_mtcAlive_gate(node_ptr, false) ; - node_ptr->addStage = MTC_ADD__DONE ; + if (( NOT_THIS_HOST ) && + ((( AIO_SYSTEM ) && ( is_controller(node_ptr) == false )) || ( LARGE_SYSTEM )) && + ( this->hbs_failure_action != HBS_FAILURE_ACTION__NONE ) && + ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_5 ); + if ( ! node_ptr->hbsClient_ready ) + { + ilog ("%s waiting for hbsClient ready event (%d secs)", node_ptr->hostname.c_str(), MTC_MINS_5); + } + node_ptr->addStage = MTC_ADD__HEARTBEAT_WAIT ; + } + else + { + node_ptr->addStage = MTC_ADD__DONE ; + } break; } + case MTC_ADD__HEARTBEAT_WAIT: + { + /* Wait for hbsClient ready event */ + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + wlog ("%s hbsClient ready event timeout\n", node_ptr->hostname.c_str()); + } + else if ( node_ptr->hbsClient_ready == false ) + { + break ; + } + else + { + mtcTimer_reset ( node_ptr->mtcTimer ); + } + plog ("%s Starting %d sec Heartbeat Soak (with%s)\n", + node_ptr->hostname.c_str(), + MTC_HEARTBEAT_SOAK_DURING_ADD, + node_ptr->hbsClient_ready ? 
" ready event" : "out ready event" ); + + /* allow heartbeat to run for MTC_HEARTBEAT_SOAK_DURING_ADD + * seconds before we declare enable */ + send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_DURING_ADD ); + node_ptr->addStage = MTC_ADD__HEARTBEAT_SOAK ; + break ; + } + case MTC_ADD__HEARTBEAT_SOAK: + { + if ( node_ptr->mtcTimer.ring == true ) + { + plog ("%s heartbeating", node_ptr->hostname.c_str()); + /* if heartbeat is not working then we will + * never get here */ + node_ptr->addStage = MTC_ADD__DONE ; + } + break ; + } case MTC_ADD__DONE: default: { @@ -6396,16 +6459,55 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) { - /* In AIO if in DOR mode and the host is unlocked enabled - * we need to run the subfunction handler and request - * to start host services. */ + /* Need to run the subfunction enable handler + * for AIO controllers while in DOR mode */ if ( this->dor_mode_active ) { + ilog ("%s running subfunction enable for unlocked-enabled AIO controller (DOR mode)", node_ptr->hostname.c_str()); adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF ); + break ; } } } + else if ( this->dor_mode_active ) + { + /* The Enable SUBF handler will do this so lets not do it twice */ + if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) + { + string state_str = "" ; + if ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) + { + state_str = "is ENABLED" ; + if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) + state_str = "is DEGRADED" ; + } + else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED ) + { + state_str = "is FAILED" ; + } + else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) + { + state_str = "is OFFLINE" ; + } + if ( ! state_str.empty() ) + { + report_dor_recovery ( node_ptr , state_str, "" ) ; + } + else + { + ilog ("%-12s is waiting ; DOR Recovery ; %s-%s-%s ; mtcClient:%c hbsClient:%c uptime:%3d task:%s", + node_ptr->hostname.c_str(), + adminState_enum_to_str (node_ptr->adminState).c_str(), + operState_enum_to_str (node_ptr->operState).c_str(), + availStatus_enum_to_str(node_ptr->availStatus).c_str(), + node_ptr->mtcClient_ready ? 'Y':'N', + node_ptr->hbsClient_ready ? 'Y':'N', + node_ptr->uptime, + node_ptr->task.empty() ? 
"empty" : node_ptr->task.c_str()); + } + } + } node_ptr->addStage = MTC_ADD__START; plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime ); @@ -7597,35 +7699,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) { insvTestStageChange ( node_ptr, MTC_INSV_TEST__RUN ); } - /* manage degrade state and alarms */ - if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && - ( node_ptr->ar_disabled == false )) - { - /************************************************************ - * Manage In-Service Alarms * - ***********************************************************/ - - /* Manage Inservice Enable Alarm */ - if ( node_ptr->hostservices_failed ) - { - alarm_insv_failure ( node_ptr ); - } - else - { - alarm_insv_clear ( node_ptr, false ); - } - - /* Manage Compute Subfunction Failure Alarm */ - if ( node_ptr->hostservices_failed_subf ) - { - alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_MAJOR ); - } - else - { - alarm_compute_clear ( node_ptr, false ); - } - } break ; } case MTC_INSV_TEST__RUN: @@ -7694,16 +7767,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) { - /************************************************************ - * Prevent the start host services from running while in DOR - ***********************************************************/ - if ( node_ptr->dor_recovery_mode == true ) - { - /* wait longer for the host to boot up */ - wlog ("%s DOR recovery active ; waiting on host\n", - node_ptr->hostname.c_str()); - } - else if ( this->dor_mode_active == true ) + if ( this->dor_mode_active == true ) { ilog_throttled ( this->dor_mode_active_log_throttle, 20, "DOR mode active\n"); @@ -8313,4 +8377,4 @@ int nodeLinkClass::self_fail_handler ( struct nodeLinkClass::node * node_ptr ) "force swact to unlocked-enabled standby controller"); } return (PASS); -} \ No newline at end of file +} diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp index 0210b6f3..c669eaca 100644 --- a/mtce/src/maintenance/mtcSubfHdlrs.cpp +++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp @@ -411,12 +411,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) } else { - if ( node_ptr->dor_recovery_mode || node_ptr->was_dor_recovery_mode ) - { - node_ptr->dor_recovery_mode = false ; - node_ptr->was_dor_recovery_mode = true ; - } - if (( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] != FM_ALARM_SEVERITY_CLEAR ) || ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CLEAR ) || ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR )) @@ -454,9 +448,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->subf_enabled = true ; node_ptr->inservice_failed_subf = false ; - if ( node_ptr->was_dor_recovery_mode ) + if ( this->dor_mode_active ) { - report_dor_recovery ( node_ptr , "is ENABLED" ); + report_dor_recovery ( node_ptr , "is ENABLED", "subf" ); } else { @@ -488,9 +482,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) MTC_OPER_STATE__ENABLED, MTC_AVAIL_STATUS__DEGRADED ); - if ( node_ptr->was_dor_recovery_mode ) + if ( this->dor_mode_active ) { - report_dor_recovery ( node_ptr , "is ENABLED-degraded" ); + report_dor_recovery ( node_ptr , "is DEGRADED", "subf" ); } else { @@ -511,16 +505,6 @@ 
int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) MTC_OPER_STATE__ENABLED, MTC_AVAIL_STATUS__DEGRADED ); - if ( node_ptr->was_dor_recovery_mode ) - { - report_dor_recovery ( node_ptr , "is DISABLED-failed" ); - } - else - { - elog ("%s is DISABLED-failed (subfunction failed)\n", - name.c_str() ); - } - this->dor_mode_active = false ; alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ) ; @@ -552,9 +536,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->enabled_count++ ; node_ptr->health_threshold_counter = 0 ; - node_ptr->was_dor_recovery_mode = false ; - node_ptr->dor_recovery_mode = false ; - this->dor_mode_active = false ; ar_enable ( node_ptr ); diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf index f88d6331..c6b3b0ab 100644 --- a/mtce/src/scripts/mtc.conf +++ b/mtce/src/scripts/mtc.conf @@ -125,12 +125,12 @@ loc_recovery_timeout = 5 ; Loss Of Communication Recovery Timeout ; the max number of seconds that a host can be in ; loss of communication state without failing the unit -dor_mode_timeout = 20 ; The default base time in seconds for how long +dor_mode_detect = 1200 ; Controller uptime less than this value puts mtcAgent + ; into DOR mode active state. Default: 20 minutes + +dor_mode_timeout = 1000 ; The default base time in seconds for how long ; maintenance DOR mode is active. This number ; is extended by the number of enabled hosts. -dor_recovery_timeout_ext = 1800 ; Dor timeout extension. An extra time in seconds - ; that is added to the host specific recovery time - ; making the overall host's dor recovery timeout. swact_timeout = 120 ; Seconds Mtce waits for HA Service SWACT before failing ; the swact operation
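
Note on the DOR-mode flow this patch introduces: mtcAgent enters DOR (Dead Office Recovery) mode at startup when the active controller's uptime is below dor_mode_detect; report_dor_recovery() then counts each recovered unlocked node against unlocked_nodes(), and DOR mode ends either when every unlocked node is accounted for or when the dor_mode_timeout timer fires. Below is a minimal standalone sketch of that bookkeeping, not mtcAgent code; the member names mirror the patch (dor_mode_detect, dor_recovered_nodes, unlocked nodes), but the class, log calls, and the driver in main() are simplified placeholders.

// Minimal sketch only (not mtcAgent code): models the DOR-mode detection and
// recovery accounting added by this change.
#include <cstdio>

struct DorModeSketch
{
    int  dor_mode_detect     = 1200 ;  // secs ; enter DOR mode if controller uptime is below this
    int  unlocked_nodes      = 0 ;     // would come from inventory (unlocked_nodes())
    int  dor_recovered_nodes = 0 ;     // incremented as each node is reported recovered
    bool dor_mode_active     = false ;

    // startup check ; mirrors the 'ts.tv_sec < mtc_config.dor_mode_detect' test
    void detect ( long controller_uptime_secs )
    {
        if ( controller_uptime_secs < dor_mode_detect )
            dor_mode_active = true ;
    }

    // called when an unlocked node is reported ENABLED/DEGRADED/FAILED/OFFLINE
    // while DOR mode is active ; clears DOR mode once all nodes are accounted for
    void report_recovered ( const char * hostname )
    {
        if ( ! dor_mode_active )
            return ;
        if ( ++dor_recovered_nodes >= unlocked_nodes )
        {
            dor_mode_active = false ;
            printf ("%s DOR Recovery ; all nodes are recovered\n", hostname);
        }
        else
        {
            printf ("%s %d of %d DOR nodes recovered\n",
                    hostname, dor_recovered_nodes, unlocked_nodes);
        }
    }

    // called when the dor_mode_timeout timer rings before all nodes recover
    void timeout ( void ) { dor_mode_active = false ; }
};

int main ( void )
{
    DorModeSketch dor ;
    dor.unlocked_nodes = 2 ;
    dor.detect ( 120 );                 // controller uptime 2 mins -> DOR mode active
    dor.report_recovered ("worker-0");  // 1 of 2
    dor.report_recovered ("worker-1");  // 2 of 2 -> DOR mode cleared
    return 0 ;
}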