Merge "Improve DOR Recovery banner to include all hosts and their status"

2025-04-10 17:36:03 +00:00 · 2025-04-10 17:36:03 +00:00 · 73e3241b6d
commit 73e3241b6d
parent 34207b1895 1daa670126
10 changed files with 344 additions and 293 deletions
--- a/mtce-common/src/common/logMacros.h
+++ b/mtce-common/src/common/logMacros.h
@ -159,8 +159,8 @@ typedef struct
    int   work_queue_timeout           ; /**< end of action workq complete TO */
    int   loc_recovery_timeout         ; /**< loss of comms recovery timeout  */
    int   node_reinstall_timeout       ; /**< node reinstall timeout          */
+    int   dor_mode_detect              ; /**< dead office recovery detect thld*/
    int   dor_mode_timeout             ; /**< dead office recovery timeout    */
-    int   dor_recovery_timeout_ext     ; /**< dor recovery timeout extension  */
    int   uptime_period                ; /**< Uptime refresh timer period     */
    int   online_period                ; /**< locked availability refresh     */
    int   insv_test_period             ; /**< insv test period in secs        */
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@ -203,8 +203,9 @@ typedef enum

 #define DEFAULT_MTCALIVE_TIMEOUT    (1200)
 #define DEFAULT_GOENABLE_TIMEOUT     (300)
-#define DEFAULT_DOR_MODE_TIMEOUT      (20)
-#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
+#define DEFAULT_DOR_MODE_TIMEOUT      (MTC_MINS_15)
+#define DEFAULT_DOR_MODE_AIO_TIMEOUT  (MTC_MINS_20)
+#define DEFAULT_DOR_MODE_DETECT       (MTC_MINS_20)
 #define DEFAULT_POWER_OFF_RETRY_WAIT  (30)

 /** TODO: Convert names to omit JSON part */
@ -962,6 +963,8 @@ typedef enum
    MTC_ADD__MTC_SERVICES,
    MTC_ADD__CLEAR_TASK,
    MTC_ADD__WORKQUEUE_WAIT,
+    MTC_ADD__HEARTBEAT_WAIT,
+    MTC_ADD__HEARTBEAT_SOAK,
    MTC_ADD__DONE,
    MTC_ADD__STAGES
 } mtc_addStages_enum ;
--- a/mtce-common/src/common/nodeTimers.h
+++ b/mtce-common/src/common/nodeTimers.h
@ -45,6 +45,7 @@
 #define MTC_MINS_5  (300)
 #define MTC_MINS_8  (480)
 #define MTC_MINS_10 (600)
+#define MTC_MINS_14 (840)
 #define MTC_MINS_15 (900)
 #define MTC_MINS_20 (1200)
 #define MTC_MINS_30 (1800)
@ -71,8 +72,7 @@
 #define MTC_BM_POWERON_TIMEOUT       (30)
 #define MTC_RESET_PROG_TIMEOUT       (20)
 #define MTC_WORKQUEUE_TIMEOUT        (60)
-#define MTC_WORKER_CONFIG_TIMEOUT  (900)
-#define MTC_EXIT_DOR_MODE_TIMEOUT (60*15)
+#define MTC_WORKER_CONFIG_TIMEOUT    (MTC_MINS_14)
 #define MTC_RESET_PROG_OFFLINE_TIMEOUT   (20)
 #define MTC_RESET_TO_OFFLINE_TIMEOUT    (150)
 #define MTC_POWEROFF_TO_OFFLINE_TIMEOUT (200)
@ -80,6 +80,7 @@
 #define MTC_POWERCYCLE_COOLDOWN_DELAY  (MTC_MINS_5)
 #define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5)
 #define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11)
+#define MTC_HEARTBEAT_SOAK_DURING_ADD    (10)
 #define MTC_REINSTALL_TIMEOUT_DEFAULT  (MTC_MINS_40)
 #define MTC_REINSTALL_TIMEOUT_BMC_ACC  (MTC_MINS_10)
 #define MTC_REINSTALL_TIMEOUT_MIN      (MTC_MINS_1)
--- a/mtce-common/src/daemon/daemon_config.cpp
+++ b/mtce-common/src/daemon/daemon_config.cpp
@ -191,10 +191,10 @@ int timeout_config_handler (       void * user,
        config_ptr->dor_mode_timeout = atoi(value);
        ilog ("DOR Mode TO : %3d secs\n", config_ptr->dor_mode_timeout );
    }
-    else if (MATCH("timeouts", "dor_recovery_timeout_ext"))
+    else if (MATCH("timeouts", "dor_mode_detect"))
    {
-        config_ptr->dor_recovery_timeout_ext = atoi(value);
-        ilog ("DOR Time Ext: %3d secs\n", config_ptr->dor_recovery_timeout_ext );
+        config_ptr->dor_mode_detect = atoi(value);
+        ilog ("DOR Mode Det: %3d secs", config_ptr->dor_mode_detect );
    }
    else if (MATCH("timeouts", "bmc_audit_period"))
    {
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@ -582,8 +582,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
    ptr->offline_log_reported = true  ;
    ptr->online_log_reported  = false ;

-    ptr->dor_recovery_mode    = false ;
-    ptr->was_dor_recovery_mode= false ;
    ptr->dor_recovery_time    = 0     ;

    ptr->vim_notified = false ;
@ -2134,9 +2132,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
                /* handle a lock request while unlocked */
                if ( !inv.action.compare ( "lock" ) )
                {
-                    if ( node_ptr->dor_recovery_mode == true )
-                         node_ptr->dor_recovery_mode = false ;
-
                    /* Set action to LOCK and let the FSM run the disable handler */
                    adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK );
                }
@ -2183,9 +2178,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
                /* TODO: Create customer log of this action */
                ilog ("%s Force Lock Action\n", node_ptr->hostname.c_str());

-                if ( node_ptr->dor_recovery_mode == true )
-                     node_ptr->dor_recovery_mode = false ;
-
                if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
                {
                    if ( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK )
@ -2210,9 +2202,6 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
            {
                if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
                {
-                    if ( node_ptr->dor_recovery_mode == true )
-                         node_ptr->dor_recovery_mode = false ;
-
                    /* Set action to LOCK and let the FSM run the disable handler */
                    adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK );
                }
@ -3125,13 +3114,13 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
        if ( delay > 0 )
        {
            mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, delay );
+            ilog ("Host add delay is %d seconds", delay );
            node_ptr->addStage = MTC_ADD__START_DELAY ;
        }
        else
        {
            node_ptr->addStage = MTC_ADD__START ;
        }
-        ilog ("Host add delay is %d seconds", delay );
        adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ADD );
    }
    return (rc);
@ -5228,6 +5217,20 @@ int  nodeLinkClass::manage_shadow_change ( string hostname )
    return (rc);
 }

+/** Returns the number of unlocked nodes ; 0 when the inventory list is empty */
+int nodeLinkClass::unlocked_nodes ( void )
+{
+    int temp_count = 0 ;
+    for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next ) /* NULL head safe */
+    {
+        if (ptr->adminState == MTC_ADMIN_STATE__UNLOCKED)
+            temp_count++ ;
+        if ( ptr == tail ) /* tail bounds the walk ; a NULL next ends the loop */
+               break ;
+    }
+    return (temp_count);
+}
+
 /** Returns the number of worker hosts that are operationally 'enabled' */
 int nodeLinkClass::enabled_compute_nodes ( void )
 {
@ -5462,6 +5465,7 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
    /* Nothing to do if this host is not in the hbs_minor state */
    if ( node_ptr->hbs_minor[iface] == true )
    {
+        dlog ("%s clearing heartbeat minor on %s network", node_ptr->hostname.c_str(), get_iface_name_str(iface));
        /* clear it - possibly temporarily */
        node_ptr->hbs_minor[iface] = false ;

@ -5527,7 +5531,7 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
        {
            if ( ptr->operState != MTC_OPER_STATE__ENABLED )
            {
-                slog ("%s found hbs_minor set for disabled host\n" , ptr->hostname.c_str() );
+                slog ("%s found hbs_minor set for %s network for disabled host\n" , ptr->hostname.c_str(), get_iface_name_str(iface));
            }
            temp_count++ ;
        }
@ -5553,56 +5557,6 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
     }
 }

-/****************************************************************************
- *
- * Name       : manage_dor_recovery
- *
- * Description: Enable DOR recovery mode for this host.
- *              Generate log
- *
- *              The severity parm is used to enhance the logs to indicate what
- *              severity level this utility was called from ;
- *              minor, major, or critical
- *
- ***************************************************************************/
-
-void nodeLinkClass::manage_dor_recovery (  struct nodeLinkClass::node * node_ptr,
-                                                    EFmAlarmSeverityT   severity )
-{
-    if (( severity == FM_ALARM_SEVERITY_CLEAR ) &&
-        ( node_ptr->dor_recovery_mode == true ))
-    {
-        node_ptr->dor_recovery_mode = false ;
-        node_ptr->was_dor_recovery_mode = true ;
-    }
-
-    else if (( severity == FM_ALARM_SEVERITY_CRITICAL ) &&
-             ( node_ptr->dor_recovery_mode == false ))
-    {
-        struct timespec ts ;
-        clock_gettime (CLOCK_MONOTONIC, &ts );
-        wlog ("%-12s is waiting ; DOR recovery %2ld:%02ld mins (%4ld secs)\n",
-                     node_ptr->hostname.c_str(),
-                     ts.tv_sec/60,
-                     ts.tv_sec%60,
-                     ts.tv_sec);
-
-        node_ptr->dor_recovery_time = 0     ;
-        node_ptr->dor_recovery_mode = true  ;
-        node_ptr->hbsClient_ready   = false ;
-        mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
-
-        /* don't restart graceful recovery for this host if its already in that FSM */
-        if (( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) &&
-            ( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ))
-        {
-            recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
-            adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
-        }
-    }
-}
-
-
 /** Manage heartbeat failure events */
 void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface, bool clear_event )
 {
@ -5627,11 +5581,7 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
             node_ptr->hbs_failure[iface] = false ;
        }
    }
-    else if ( this->mtcTimer_dor.tid )
-    {
-        manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
-    }
-    else
+    else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
    {
        /* handle auto recovery for heartbeat failure during enable */
        if ( node_ptr->ar_cause == MTC_AR_DISABLE_CAUSE__HEARTBEAT )
@ -5663,51 +5613,54 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface

        mnfa_add_host ( node_ptr , iface );

-        if ( mnfa_active == false )
+        if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
        {
-            /* if node is already in graceful recovery just ignore the event */
-            if ( node_ptr->graceful_recovery_counter != 0 )
+            if ( mnfa_active == false )
            {
-                dlog ("%s %s loss event ; already in graceful recovery try %d",
-                          hostname.c_str(),
-                          get_iface_name_str(iface),
-                          node_ptr->graceful_recovery_counter );
-                return ;
-            }
-            elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
-            if ( iface == CLSTR_IFACE )
-            {
-                node_ptr->heartbeat_failed[CLSTR_IFACE] = true ;
-            }
-            else if ( iface == MGMNT_IFACE )
-            {
-                node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
-            }
-            if (mnfa_host_count[iface] < this->mnfa_threshold)
-            {
-                elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
-
-                nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
-
-                if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
-                    ( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
+                /* if node is already in graceful recovery just ignore the event */
+                if ( node_ptr->graceful_recovery_counter != 0 )
                {
-                    if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER )
+                    dlog ("%s %s loss event ; already in graceful recovery try %d",
+                            hostname.c_str(),
+                            get_iface_name_str(iface),
+                            node_ptr->graceful_recovery_counter );
+                    return ;
+                }
+                elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
+                if ( iface == CLSTR_IFACE )
+                {
+                    node_ptr->heartbeat_failed[CLSTR_IFACE] = true ;
+                }
+                else if ( iface == MGMNT_IFACE )
+                {
+                    node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
+                }
+                if (mnfa_host_count[iface] < this->mnfa_threshold)
+                {
+                    elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
+
+                    nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
+
+                    if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
+                        ( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
                    {
-                        wlog ("%s restarting graceful recovery\n", hostname.c_str() );
+                        if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER )
+                        {
+                            wlog ("%s restarting graceful recovery", hostname.c_str() );
+                        }
+                        else
+                        {
+                            wlog ("%s starting graceful recovery", hostname.c_str() );
+                        }
+                        recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
+                        adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
                    }
                    else
                    {
-                        wlog ("%s starting graceful recovery\n", hostname.c_str() );
+                        mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
+                        enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
+                        adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
                    }
-                    recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
-                    adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
-                }
-                else
-                {
-                    mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
-                    enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
-                    adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
                }
            }
        }
@ -5802,11 +5755,8 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface

        hbs_minor_clear ( node_ptr, iface );
    }
-    else if ( this->mtcTimer_dor.tid )
-    {
-        manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
-    }
-    else
+    /* - we don't care about locked hosts */
+    else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
    {
        if ( mnfa_active == false )
        {
@ -5815,7 +5765,7 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface

        mnfa_add_host ( node_ptr, iface );

-        if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
+        if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
        {
            if ( iface == MGMNT_IFACE )
            {
@ -5852,16 +5802,11 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface,
        alog ("%s %s Heartbeat Minor (clear)\n", hostname.c_str(), get_iface_name_str(iface));
        hbs_minor_clear ( node_ptr, iface );
    }
-    /* if not a clear then only set if the host is enabled
-     * - we don't care about disabled hosts */
-    else if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
+    /* - we don't care about locked hosts */
+    else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
+             ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
    {
-        if ( this->mtcTimer_dor.tid )
-        {
-            manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MINOR );
-        }
-
-        else if ( node_ptr->hbs_minor[iface] != true )
+        if ( node_ptr->hbs_minor[iface] != true )
        {
            mnfa_add_host ( node_ptr, iface );
        }
@ -7077,8 +7022,7 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr,
         *  other action can take effect.
         *  If its not one of these action then just proceed with it
         **/
-        if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) &&
-            ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
+        if ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK )
        {
            clog ("%s Administrative Action '%s' -> '%s'\n",
                      node_ptr->hostname.c_str(),
@ -8511,30 +8455,76 @@ int nodeLinkClass::ar_handler ( struct nodeLinkClass::node    * node_ptr,
 * Description: Create a specifically formatted log for the specified
 *              hosts DOR recovery state and timing.
 *
- * Parameters : The node and a caller prefix string that states if the node
- *              is ENABELD
+ * Assumptions: Only logged if the active controller has an uptime
+ *              less than 20 minutes (default). Configurable in mtce.conf
+ *
+ * Parameters :
+ *
+ * @param node_ptr Pointer to the node in the inventoried node linked list.
+ * @param node_state_log_prefix Prefix for the node's state log messages.
+ *              is ENABLED
+ *              is DEGRADED
+ *              is DISABLED
 *              is FAILED
- *              is ENMABLED-degraded
- *              etc.
+ *              is OFFLINE
+ * @param extra string representing where this function was called.
 *
 ***************************************************************************/
 void nodeLinkClass::report_dor_recovery ( struct nodeLinkClass::node * node_ptr,
-                                          string node_state_log_prefix )
+                                          string node_state_log_prefix,
+                                          string extra )
 {
    struct timespec ts ;
    clock_gettime (CLOCK_MONOTONIC, &ts );
-    node_ptr->dor_recovery_time = ts.tv_sec ;
-    plog ("%-12s %s ; DOR Recovery %2d:%02d mins (%4d secs) (uptime:%2d:%02d mins)\n",
-                 node_ptr->hostname.c_str(),
-                 node_state_log_prefix.c_str(),
-                 node_ptr->dor_recovery_time/60,
-                 node_ptr->dor_recovery_time%60,
-                 node_ptr->dor_recovery_time,
-                 node_ptr->uptime/60,
-                 node_ptr->uptime%60 );

-    node_ptr->dor_recovery_mode = false ;
-    node_ptr->was_dor_recovery_mode = false ;
+    if ( this->dor_mode_active )
+    {
+        node_ptr->dor_recovery_time = ts.tv_sec ; /* CLOCK_MONOTONIC seconds */
+        plog ("%-12s %-11s ; DOR Recovery %2d:%02d mins (%4d secs) (uptime:%2d:%02d mins) %s",
+                    node_ptr->hostname.c_str(),
+                    node_state_log_prefix.c_str(),
+                    node_ptr->dor_recovery_time/60,
+                    node_ptr->dor_recovery_time%60,
+                    node_ptr->dor_recovery_time,
+                    node_ptr->uptime/60,
+                    node_ptr->uptime%60,
+                    extra.c_str());
+
+        // Accounting ; snapshot the unlocked count once so the compares and logs agree
+        int unlocked_nodes = this->unlocked_nodes() ;
+        if ( ++this->dor_recovered_nodes == unlocked_nodes )
+        {
+            mtcTimer_reset (this->mtcTimer_dor);
+            this->dor_mode_active = false ;
+            this->dor_mode_active_log_throttle = 0 ;
+            ilog ("%-13s %3d of %-3d ; DOR Recovery ; all nodes are recovered ; active controller uptime:%ld",
+                this->my_hostname.c_str(),
+                this->dor_recovered_nodes,
+                unlocked_nodes,
+                ts.tv_sec);
+        }
+        else if ( this->dor_recovered_nodes > unlocked_nodes )
+        {
+            slog ("%s unexpected extra DOR recovery call ; unlocked:%d recovered:%d",
+                    node_ptr->hostname.c_str(),
+                    unlocked_nodes,
+                    this->dor_recovered_nodes);
+        }
+        else
+        {
+            ilog ("%s %d of %d DOR nodes recovered",
+                    node_ptr->hostname.c_str(),
+                    this->dor_recovered_nodes,
+                    unlocked_nodes);
+        }
+    }
+    else
+    {
+        dlog ("%s DOR Recovery called with '%s %s' while dor mode disabled",
+                  node_ptr->hostname.c_str(),
+                  node_state_log_prefix.c_str(),
+                  extra.c_str());
+    }
 }

 void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
@ -8548,10 +8538,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
        return ;
    }

-    if ( node_ptr->was_dor_recovery_mode )
-    {
-        report_dor_recovery ( node_ptr , "is FAILED " );
-    }
+    report_dor_recovery ( node_ptr , "is FAILED", "full enable" );

    plog ("%s Forcing Full Enable Sequence\n", node_ptr->hostname.c_str());

@ -8561,9 +8548,8 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
    allStateChange      ( node_ptr, node_ptr->adminState, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED );
    enableStageChange   ( node_ptr, MTC_ENABLE__FAILURE    );
    recoveryStageChange ( node_ptr, MTC_RECOVERY__START    ); /* reset the fsm */
-    // don't override the add action or lock actions /
-    if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) &&
-        ( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) &&
+    // don't override the lock actions /
+    if (( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) &&
        ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
    {
        adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); // no action
@ -9784,10 +9770,8 @@ void nodeLinkClass::mem_log_general ( void )
 void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr )
 {
    char str[MAX_MEM_LOG_DATA] ;
-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s  DOR - Active: %c  Was: %c  Time: %5d (00:%02d:%02d)\n",
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s  DOR - Time: %5d (00:%02d:%02d)\n",
                node_ptr->hostname.c_str(),
-                node_ptr->dor_recovery_mode ? 'Y' : 'N',
-                node_ptr->was_dor_recovery_mode ? 'Y' : 'N',
                node_ptr->dor_recovery_time,
                node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time/60 : 0,
                node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time%60 : 0);
@ -9813,12 +9797,14 @@ void nodeLinkClass::mem_log_mnfa ( void )
 void nodeLinkClass::mem_log_general_mtce_hosts ( void )
 {
    char str[MAX_MEM_LOG_DATA] ;
-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n",
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d Unlocked:%d DOR:Recovered:%d\n",
                my_hostname.c_str(),
                num_controllers_enabled(),
                enabled_compute_nodes(),
                enabled_storage_nodes(),
-                get_storage_backend());
+                get_storage_backend(),
+                unlocked_nodes(),       /* number of unlocked nodes */
+                dor_recovered_nodes);   /* DOR node recovery count  */
    mem_log (str);
 }

--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@ -368,10 +368,8 @@ private:
        /* the fault handling offline handler timer */
        struct mtc_timer offline_timer ;

-        /* Host level DOR recovery mode time and bools */
+        /* Host level DOR recovery time */
        int              dor_recovery_time  ;
-        bool             dor_recovery_mode  ;
-        bool         was_dor_recovery_mode  ;

        /** Integer code representing the host health */
        int  health ;
@ -1275,7 +1273,7 @@ private:

    /* Dead Office Recovery - system level controls */
    void manage_dor_recovery ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT severity );
-    void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix );
+    void report_dor_recovery ( struct nodeLinkClass::node * node_ptr, string node_state_log_prefix, string extra );

    struct {
        struct node * head_ptr ; /**< Pulse Linked List Head pointer */
@ -1398,6 +1396,7 @@ public:
    bool dor_mode_active ;
    unsigned int dor_start_time  ;
    int  dor_mode_active_log_throttle ;
+    int  dor_recovered_nodes = 0; /**< DOR node recovery count            */

    bool hbs_disabled          ; /**< Control heartbeat service state    */
    bool hbs_state_change      ; /**< Flag service state change          */
@ -1702,6 +1701,9 @@ public:
    /** Remove a host from Node list */
    int rem_host ( string & hostname );

+    /** Get the number of unlocked nodes */
+    int unlocked_nodes ( void );
+
    /** Get the number of worker hosts that are operationally 'enabled' */
    int enabled_compute_nodes ( void );

--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@ -710,6 +710,15 @@ int daemon_configure ( void )
    else
        mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ;

+    if ( mtc_config.dor_mode_detect <= 0 )
+    {
+        wlog ("DOR mode detect timeout is invalid (%d), setting to default (%d)",
+                mtc_config.dor_mode_detect,
+                DEFAULT_DOR_MODE_DETECT);
+
+        mtc_config.dor_mode_detect = DEFAULT_DOR_MODE_DETECT ;
+    }
+
    if ( mtc_config.dor_mode_timeout <= 0 )
    {
        slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n",
@ -1653,7 +1662,7 @@ void daemon_service_run ( void )
    }
 #endif

-    if ( ts.tv_sec < MTC_MINS_15 )
+    if ( ts.tv_sec < mtc_config.dor_mode_detect )
    {
        /* AIO DOR window is much greater in AIO since heartbeat
         * cannot start until the inactive AIO has run both manifests */
@ -1669,16 +1678,16 @@ void daemon_service_run ( void )
        mtcInv.dor_mode_active = true ;
        mtcInv.dor_start_time  = ts.tv_sec ;

-        ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
-        ilog ("%-12s is ACTIVE  ; DOR Recovery %2d:%02d mins (%4d secs) (duration %3d secs)\n",
+        ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
+        ilog ("%-12s  is ACTIVE  ; DOR Recovery %2d:%02d mins (%4d secs) (dor timeout in %3d secs)\n",
                mtcInv.my_hostname.c_str(),
                mtcInv.dor_start_time/60,
                mtcInv.dor_start_time%60,
                mtcInv.dor_start_time,
                timeout );
-        ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
-        ilog ("%-12s host state ; DOR Recovery    controller uptime         host uptime    \n", mtcInv.my_hostname.c_str());
-        ilog ("%-12s ---------- ; DOR Recovery ---------------------- -------------------\n", mtcInv.my_hostname.c_str());
+        ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------", mtcInv.my_hostname.c_str());
+        ilog ("%-12s host  state ; DOR Recovery    controller uptime         host uptime  ", mtcInv.my_hostname.c_str());
+        ilog ("%-12s ----------- ; DOR Recovery ---------------------- -------------------", mtcInv.my_hostname.c_str());
        mtcTimer_start ( mtcInv.mtcTimer_dor, mtcTimer_handler, timeout );
    }

@ -1992,7 +2001,12 @@ void daemon_service_run ( void )
         * then exit DOR mode. We do it here instead of  */
        if (( mtcInv.dor_mode_active == true ) && ( mtcInv.mtcTimer_dor.tid == NULL ))
        {
-            ilog ("DOR mode disable\n");
+            wlog ("%s DOR mode disabled ; DOR Recovery Timeout ; %d of %d unlocked hosts ; active controller uptime:%d",
+                      mtcInv.my_hostname.c_str(),
+                      mtcInv.dor_recovered_nodes,
+                      mtcInv.unlocked_nodes(),
+                      mtcInv.get_uptime(mtcInv.my_hostname));
+            mtcInv.dor_mode_active_log_throttle = 0 ;
            mtcInv.dor_mode_active = false ;
        }
    }
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@ -194,6 +194,7 @@ void nodeLinkClass::timer_handler ( int sig, siginfo_t *si, void *uc)
    {
        mtcTimer_stop_int_safe ( mtcTimer_dor );
        mtcTimer_dor.ring = true ;
+        this->dor_mode_active_log_throttle = 0 ;
        return ;
    }

@ -488,7 +489,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
                         AR_LOG_THROTTLE_THRESHOLD,
                         "%s auto recovery disabled cause:%d",
                         node_ptr->hostname.c_str(), node_ptr->ar_cause );
-         return (RETRY); ;
+         return (RETRY);
    }

    if ( THIS_HOST )
@ -787,11 +788,10 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
        }
        case MTC_ENABLE__START:
        {
-            manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );

            plog ("%s Main Enable FSM (from start)%s\n",
                      node_ptr->hostname.c_str(),
-                      node_ptr->was_dor_recovery_mode ? " (from DOR)" : "" );
+                      this->dor_mode_active ? " (DOR active)" : "" );

            /* clear all the past enable failure bools */
            clear_main_failed_bools ( node_ptr );
@ -1547,10 +1547,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
                /* Inform the VIM that this host is enabled */
                mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );

-                plog ("%s is ENABLED%s\n", node_ptr->hostname.c_str(),
-                          node_ptr->was_dor_recovery_mode ? " (from DOR)" : "");
-                node_ptr->dor_recovery_mode = false ;
-                node_ptr->was_dor_recovery_mode = false ;
+                plog ("%s is ENABLED", node_ptr->hostname.c_str());
                node_ptr->http_retries_cur = 0 ;

                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
@ -1718,13 +1715,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
        {
            if ( node_ptr->mtcAlive_online == true )
            {
-                manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );

                mtcTimer_stop ( node_ptr->mtcTimer );

                ilog ("%s got requested mtcAlive%s\n",
                          node_ptr->hostname.c_str(),
-                          node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
+                          this->dor_mode_active ? " (DOR mode)" : "" );

                stop_offline_handler ( node_ptr );

@ -1793,7 +1789,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                    /* did not reboot case */
                    wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)",
                              node_ptr->hostname.c_str(),
-                              node_ptr->was_dor_recovery_mode ? " (DOR)" : "",
+                              this->dor_mode_active ? " (DOR mode)" : "",
                              node_ptr->uptime);

                    wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
@ -1808,9 +1804,9 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                else
                {
                    wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
-                    ilog ("%s ... continuing%sgraceful recovery ; (OOB: %08x)\n",
+                    ilog ("%s ... continuing graceful recovery%s ; (OOB: %08x)",
                              node_ptr->hostname.c_str(),
-                              node_ptr->was_dor_recovery_mode ? " (DOR) " : " ",
+                              this->dor_mode_active ? " (DOR mode)" : "",
                              node_ptr->mtce_flags);
                    ilog ("%s ... without additional reboot %s (uptime:%d)\n",
                              node_ptr->hostname.c_str(),
@ -1845,7 +1841,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                wlog ("%s Loss Of Communication for %d seconds ; disabling host%s\n",
                          node_ptr->hostname.c_str(),
                          loc_recovery_timeout,
-                          node_ptr->dor_recovery_mode ? " (DOR)" : "" );
+                          this->dor_mode_active ? " (DOR mode)" : "" );
                wlog ("%s ... stopping host services\n", node_ptr->hostname.c_str());
                wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());

@ -1898,7 +1894,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )

            /* Only try and issue in-line recovery reboot or reset if
             * NOT in Dead Office Recovery (DOR) mode. */
-            if ( node_ptr->dor_recovery_mode == false )
+            if ( this->dor_mode_active == false )
            {
                ilog ("%s issuing one time graceful recovery reboot over management network\n",
                          node_ptr->hostname.c_str());
@ -1945,7 +1941,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                LOAD_NODETYPE_TIMERS ;

                /* load the mtcAlive timeout to accomodate for dor recovery */
-                timeout = node_ptr->mtcalive_timeout + daemon_get_cfg_ptr()->dor_recovery_timeout_ext ;
+                timeout = node_ptr->mtcalive_timeout ;
            }

            /* start the timer that waits for MTCALIVE */
@ -1955,7 +1951,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                      node_ptr->hostname.c_str(),
                      MTC_TASK_RECOVERY_WAIT,
                      timeout,
-                      node_ptr->dor_recovery_mode ? " (DOR) " : " " ,
+                      this->dor_mode_active ? " (DOR) " : " " ,
                      node_ptr->uptime_save );

            clear_service_readies ( node_ptr );
@ -2024,7 +2020,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                          node_ptr->hostname.c_str(),
                          MTC_TASK_RECOVERY_WAIT,
                          timeout,
-                          node_ptr->dor_recovery_mode ? " (DOR) " : " " ,
+                          this->dor_mode_active ? " (DOR mode) " : " " ,
                          node_ptr->uptime_save );

                clear_service_readies ( node_ptr );
@ -2075,7 +2071,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            {
                mtcTimer_stop ( node_ptr->mtcTimer );

-                manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );

                /* If the host's uptime is bigger than the saved uptime then
                 * the host has not reset yet we have disabled services
@ -2084,7 +2079,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                if ((( node_ptr->uptime_save != 0 ) &&
                     ( node_ptr->uptime >= node_ptr->uptime_save )) ||
                    (( node_ptr->uptime_save == 0 ) &&
-                     ( node_ptr->uptime > MTC_MINS_15 )))
+                     ( node_ptr->uptime > MTC_MINS_20 )))
                {
                    ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n",
                                  node_ptr->hostname.c_str(), node_ptr->uptime );
@ -2121,7 +2116,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                                  node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->uptime_save );
                    ilog ("%s ... continuing with graceful recovery %s\n",
                                  node_ptr->hostname.c_str(),
-                                  node_ptr->dor_recovery_mode ? "(DOR)" : " ");
+                                  this->dor_mode_active ? "(DOR mode)" : "");
                    ilog ("%s ... without additional reboot %s\n",
                                  node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );

@ -2138,7 +2133,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            }
            else if ( node_ptr->mtcTimer.ring == true )
            {
-                manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );

                /* Set the FSM task state to init failed */
                mtcInvApi_update_task ( node_ptr, "Graceful Recovery Failed" );
@ -2523,15 +2517,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            node_ptr->http_retries_cur = 0 ;

            doneQueue_purge ( node_ptr );
-            if ( node_ptr->was_dor_recovery_mode )
+            if ( this->dor_mode_active )
            {
-                report_dor_recovery (  node_ptr , "is ENABLED" );
-            }
-            else
-            {
-                plog ("%s is ENABLED (Gracefully Recovered)\n",
-                          node_ptr->hostname.c_str());
+                report_dor_recovery (  node_ptr , "is ENABLED", "recovery" );
            }
+            plog ("%s is ENABLED (Gracefully Recovered%s)",
+                      node_ptr->hostname.c_str(),
+                      this->dor_mode_active ? " in DOR mode" : "");
            alarm_enabled_clear ( node_ptr, false );
            break ;
        }
@ -6023,11 +6015,26 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
        case MTC_ADD__START:
        {
            bool timer_set = false ;
-            plog ("%s Host Add\n", node_ptr->hostname.c_str());
+            if ( THIS_HOST )
+            {
+                struct timespec ts ;
+                clock_gettime (CLOCK_MONOTONIC, &ts );
+                node_ptr->uptime = ts.tv_sec ;
+            }
+            else if ( ! node_ptr->mtcClient_ready )
+            {
+                /* If we have not received a mtcAlive event from the
+                 * mtcClient already then lets request it since that
+                 * is how we get its uptime.
+                 * Don't trust what is in the database since it will
+                 * be stale. Best to default to zero so the logs will
+                 * show that there has been no mtcAlive received */
+                node_ptr->uptime = 0 ;
+                send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
+                send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE );
+            }

-            /* Request a mtcAlive message ; gives us uptime ; don't trust what is in the database */
-            node_ptr->uptime = 0 ;
-            send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
+            plog ("%s Host Add (uptime:%d)", node_ptr->hostname.c_str(), node_ptr->uptime );

            ilog ("%s %s %s-%s-%s (%s)\n",
                node_ptr->hostname.c_str(),
@ -6075,14 +6082,31 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
            EFmAlarmSeverityT mtcAlive_alarm_severity =
                mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE);

-            /* Clear generic enable alarm over process restart.
-             * Will get reasserted if the cause condition still exists */
+            /* Manage an existing enable alarm */
            if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
            {
-                ilog ("%s found enable alarm ; clearing %s",
+                /* Added the unlocked-disabled check to avoid clearing the
+                 * enabled alarm when the node is found to be unlocked-disabled
+                 * with the enable alarm already asserted.
+                 * We don't want to clear it in that case. */
+                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
+                    ( node_ptr->operState == MTC_OPER_STATE__DISABLED ))
+                {
+                    node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
+                    node_ptr->alarms[MTC_ALARM_ID__ENABLE] = enable_alarm_severity ;
+                    wlog ("%s found enable alarm while unlocked-disabled ; loaded %s",
                          node_ptr->hostname.c_str(),
-                          alarmUtil_getSev_str(enable_alarm_severity).c_str());
-                mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
+                           alarmUtil_getSev_str(enable_alarm_severity).c_str());
+                }
+                else
+                {
+                    ilog ("%s found enable alarm while %s-%s ; clearing %s",
+                              node_ptr->hostname.c_str(),
+                              adminState_enum_to_str (node_ptr->adminState).c_str(),
+                              operState_enum_to_str  (node_ptr->operState).c_str(),
+                              alarmUtil_getSev_str(enable_alarm_severity).c_str());
+                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
+                }
            }

            /* The config alarm is maintained if it exists.
@ -6230,6 +6254,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                        alarm_luks_failure ( node_ptr );
                    }
                    node_ptr->ar_disabled = true ;
+                    this->report_dor_recovery ( node_ptr, "is DISABLED" , "auto recovery disabled");

                    if ( THIS_HOST )
                        mtcInvApi_update_states ( node_ptr, "unlocked", "enabled", "degraded" );
@ -6341,22 +6366,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
            /* Stop the work queue wait timer */
            mtcTimer_reset ( node_ptr->mtcTimer );

-            /* Only start it on this add operation if host is
-             * already unlocked and enabled and not the active controller */
-            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
-                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
-            {
-                /* start the heartbeat service in all cases except for
-                 * THIS host and AIO controller hosts */
-                if ( NOT_THIS_HOST )
-                {
-                    if (( LARGE_SYSTEM ) ||
-                        (( AIO_SYSTEM ) && ( this->dor_mode_active == false )))
-                    {
-                        send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
-                    }
-                }
-            }

            /* Only run hardware monitor if the bm ip is provisioned */
            if (( hostUtil_is_valid_bm_type  ( node_ptr->bm_type )) &&
@ -6367,9 +6376,63 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
            }

            this->ctl_mtcAlive_gate(node_ptr, false) ;
-            node_ptr->addStage = MTC_ADD__DONE ;
+            if (( NOT_THIS_HOST ) &&
+                ((( AIO_SYSTEM ) && ( is_controller(node_ptr) == false )) || ( LARGE_SYSTEM )) &&
+                ( this->hbs_failure_action != HBS_FAILURE_ACTION__NONE ) &&
+                ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
+                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
+            {
+                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_5 );
+                if ( ! node_ptr->hbsClient_ready )
+                {
+                    ilog ("%s waiting for hbsClient ready event (%d secs)", node_ptr->hostname.c_str(), MTC_MINS_5);
+                }
+                node_ptr->addStage = MTC_ADD__HEARTBEAT_WAIT ;
+            }
+            else
+            {
+                node_ptr->addStage = MTC_ADD__DONE ;
+            }
            break;
        }
+        case MTC_ADD__HEARTBEAT_WAIT:
+        {
+            /* Wait for hbsClient ready event */
+            if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
+            {
+                wlog ("%s hbsClient ready event timeout\n", node_ptr->hostname.c_str());
+            }
+            else if ( node_ptr->hbsClient_ready == false )
+            {
+                break ;
+            }
+            else
+            {
+                mtcTimer_reset ( node_ptr->mtcTimer );
+            }
+            plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
+                        node_ptr->hostname.c_str(),
+                        MTC_HEARTBEAT_SOAK_DURING_ADD,
+                        node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
+
+            /* allow heartbeat to run for MTC_HEARTBEAT_SOAK_DURING_ADD
+             * seconds before we declare enable */
+            send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
+            mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_DURING_ADD );
+            node_ptr->addStage = MTC_ADD__HEARTBEAT_SOAK ;
+            break ;
+        }
+        case MTC_ADD__HEARTBEAT_SOAK:
+        {
+            if ( node_ptr->mtcTimer.ring == true )
+            {
+                plog ("%s heartbeating", node_ptr->hostname.c_str());
+                /* if heartbeat is not working then we will
+                 * never get here */
+                node_ptr->addStage = MTC_ADD__DONE ;
+            }
+            break ;
+        }
        case MTC_ADD__DONE:
        default:
        {
@ -6396,16 +6459,55 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
                {
-                    /* In AIO if in DOR mode and the host is unlocked enabled
-                     * we need to run the subfunction handler and request
-                     * to start host services. */
+                    /* Need to run the subfunction enable handler
+                     * for AIO controllers while in DOR mode */
                    if ( this->dor_mode_active )
                    {
+                        ilog ("%s running subfunction enable for unlocked-enabled AIO controller (DOR mode)", node_ptr->hostname.c_str());
                        adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF );
+                        break ;
                    }
                }
            }

+            else if ( this->dor_mode_active )
+            {
+                /* The Enable SUBF handler will do this so lets not do it twice */
+                if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
+                {
+                    string state_str = "" ;
+                    if ( node_ptr->operState  == MTC_OPER_STATE__ENABLED )
+                    {
+                        state_str = "is ENABLED" ;
+                        if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
+                            state_str = "is DEGRADED" ;
+                    }
+                    else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED )
+                    {
+                        state_str = "is FAILED" ;
+                    }
+                    else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
+                    {
+                        state_str = "is OFFLINE" ;
+                    }
+                    if ( ! state_str.empty() )
+                    {
+                        report_dor_recovery ( node_ptr , state_str, "" ) ;
+                    }
+                    else
+                    {
+                        ilog ("%-12s is waiting ; DOR Recovery ; %s-%s-%s ; mtcClient:%c hbsClient:%c uptime:%3d task:%s",
+                                 node_ptr->hostname.c_str(),
+                                 adminState_enum_to_str (node_ptr->adminState).c_str(),
+                                 operState_enum_to_str  (node_ptr->operState).c_str(),
+                                 availStatus_enum_to_str(node_ptr->availStatus).c_str(),
+                                 node_ptr->mtcClient_ready ? 'Y':'N',
+                                 node_ptr->hbsClient_ready ? 'Y':'N',
+                                 node_ptr->uptime,
+                                 node_ptr->task.empty() ? "empty" : node_ptr->task.c_str());
+                    }
+                }
+            }
            node_ptr->addStage = MTC_ADD__START;

            plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
@ -7597,35 +7699,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
            {
                insvTestStageChange ( node_ptr, MTC_INSV_TEST__RUN );
            }
-            /* manage degrade state and alarms */
-            if ((  node_ptr->adminState  == MTC_ADMIN_STATE__UNLOCKED ) &&
-                (  node_ptr->operState   == MTC_OPER_STATE__ENABLED   ) &&
-                (  node_ptr->ar_disabled == false ))
-            {
-                /************************************************************
-                 *               Manage In-Service Alarms                   *
-                 ***********************************************************/
-
-                /* Manage Inservice Enable Alarm */
-                if ( node_ptr->hostservices_failed )
-                {
-                    alarm_insv_failure ( node_ptr );
-                }
-                else
-                {
-                    alarm_insv_clear ( node_ptr, false );
-                }
-
-                /* Manage Compute Subfunction Failure Alarm */
-                if ( node_ptr->hostservices_failed_subf )
-                {
-                    alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
-                }
-                else
-                {
-                    alarm_compute_clear ( node_ptr, false );
-                }
-            }
            break ;
        }
        case MTC_INSV_TEST__RUN:
@ -7694,16 +7767,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
            if ((  node_ptr->adminState  == MTC_ADMIN_STATE__UNLOCKED ) &&
                (  node_ptr->operState   == MTC_OPER_STATE__ENABLED   ))
            {
-                /************************************************************
-                 * Prevent the start host services from running while in DOR
-                 ***********************************************************/
-                if ( node_ptr->dor_recovery_mode == true )
-                {
-                    /* wait longer for the host to boot up */
-                    wlog ("%s DOR recovery active ; waiting on host\n",
-                              node_ptr->hostname.c_str());
-                }
-                else if ( this->dor_mode_active == true )
+                if ( this->dor_mode_active == true )
                {
                    ilog_throttled ( this->dor_mode_active_log_throttle, 20,
                                     "DOR mode active\n");
@ -8313,4 +8377,4 @@ int nodeLinkClass::self_fail_handler ( struct nodeLinkClass::node * node_ptr )
                                  "force swact to unlocked-enabled standby controller");
    }
    return (PASS);
-}
+}
--- a/mtce/src/maintenance/mtcSubfHdlrs.cpp
+++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp
@ -411,12 +411,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
            }
            else
            {
-                if ( node_ptr->dor_recovery_mode || node_ptr->was_dor_recovery_mode )
-                {
-                    node_ptr->dor_recovery_mode = false ;
-                    node_ptr->was_dor_recovery_mode = true ;
-                }
-
                if (( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] != FM_ALARM_SEVERITY_CLEAR ) ||
                    ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CLEAR ) ||
                    ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR ))
@ -454,9 +448,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )

            node_ptr->subf_enabled = true ;
            node_ptr->inservice_failed_subf    = false ;
-            if ( node_ptr->was_dor_recovery_mode )
+            if ( this->dor_mode_active )
            {
-                report_dor_recovery (  node_ptr , "is ENABLED" );
+                report_dor_recovery (  node_ptr , "is ENABLED", "subf" );
            }
            else
            {
@ -488,9 +482,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
                                       MTC_OPER_STATE__ENABLED,
                                       MTC_AVAIL_STATUS__DEGRADED );

-            if ( node_ptr->was_dor_recovery_mode )
+            if ( this->dor_mode_active )
            {
-                report_dor_recovery (  node_ptr , "is ENABLED-degraded" );
+                report_dor_recovery (  node_ptr , "is DEGRADED", "subf" );
            }
            else
            {
@ -511,16 +505,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
                                       MTC_OPER_STATE__ENABLED,
                                       MTC_AVAIL_STATUS__DEGRADED );

-            if ( node_ptr->was_dor_recovery_mode )
-            {
-                report_dor_recovery (  node_ptr , "is DISABLED-failed" );
-            }
-            else
-            {
-                elog ("%s is DISABLED-failed (subfunction failed)\n",
-                          name.c_str() );
-            }
-            this->dor_mode_active = false ;

            alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ) ;

@ -552,9 +536,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
            node_ptr->enabled_count++ ;
            node_ptr->health_threshold_counter = 0 ;

-            node_ptr->was_dor_recovery_mode = false ;
-            node_ptr->dor_recovery_mode = false ;
-            this->dor_mode_active = false ;

            ar_enable ( node_ptr );

--- a/mtce/src/scripts/mtc.conf
+++ b/mtce/src/scripts/mtc.conf
@ -125,12 +125,12 @@ loc_recovery_timeout = 5      ; Loss Of Communication Recovery Timeout
                              ;  the max number of seconds that a host can be in
                              ;  loss of communication state without failing the unit

-dor_mode_timeout = 20           ; The default base time in seconds for how long
+dor_mode_detect = 1200        ; Controller uptime less than this value puts mtcAgent
+                              ; into DOR mode active state. Default: 20 minutes
+
+dor_mode_timeout = 1000          ; The default base time in seconds for how long
                                ; maintenance DOR mode is active. This number
                                ; is extended by the number of enabled hosts.
-dor_recovery_timeout_ext = 1800 ; Dor timeout extension. An extra time in seconds
-                                ; that is added to the host specific recovery time
-                                ; making the overall host's dor recovery timeout.

 swact_timeout = 120         ; Seconds Mtce waits for HA Service SWACT before failing
                            ;  the swact operation