
This update adds Maintenance support for receiving host degrade assert and clear messages from collectd. It also disables platform memory, CPU and filesystem resource monitoring in the maintenance resource monitor process (rmon); those resources are now monitored by collectd and therefore should no longer be monitored by rmond.

Change-Id: I13fd033bb1d14f299dcb97fa80296641c958d0a9
Signed-off-by: Jack Ding <jack.ding@windriver.com>
6806 lines
281 KiB
C++
Executable File
/*
 * Copyright (c) 2013-2016 Wind River Systems, Inc.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 */

/****************************************************************************
 * @file
 * Wind River CGTS Platform Node "Handlers" Implementation
 *
 * Description: This file contains the handlers that implement the X.731 FSM.
 *
 * Interfaces:
 *
 *    nodeLinkClass::timer_handler
 *    nodeLinkClass::enable_handler
 *    nodeLinkClass::disable_handler
 *    nodeLinkClass::delete_handler
 *    nodeLinkClass::degrade_handler
 *    nodeLinkClass::reset_handler
 *    nodeLinkClass::reinstall_handler
 *    nodeLinkClass::event_handler
 *    nodeLinkClass::power_handler
 *    nodeLinkClass::recovery_handler
 *    nodeLinkClass::cfg_handler
 *
 ****************************************************************************/
using namespace std;

#define __AREA__ "hdl"

#include "nodeBase.h"      /* for ... basic definitions        */
#include "mtcAlarm.h"      /* for ... mtcAlarm_<severity>      */
#include "nodeTimers.h"    /* for ... mtcTimer_start/stop      */

#include "jsonUtil.h"      /* for ... jsonApi_array_value      */
#include "tokenUtil.h"
#include "regexUtil.h"     /* for ... regexUtil_pattern_match  */

#include "nodeClass.h"     /* All base stuff                   */
#include "ipmiUtil.h"      /* for ... power and reset support  */

#include "mtcNodeMsg.h"    /* for ... send_mtc_cmd             */
#include "mtcInvApi.h"     /* for ... SYSINV API               */
#include "mtcSmgrApi.h"    /* for ... SM API                   */
#include "mtcVimApi.h"     /* for ... VIm API                  */

#include "daemon_ini.h"    /* for ... ini_parse                */
#include "daemon_common.h"

#define LOAD_NODETYPE_TIMERS                                                                \
    if ( is_controller(node_ptr) )                                                          \
    {                                                                                       \
        node_ptr->mtcalive_timeout = daemon_get_cfg_ptr()->controller_mtcalive_timeout ;    \
    }                                                                                       \
    else                                                                                    \
    {                                                                                       \
        node_ptr->mtcalive_timeout = daemon_get_cfg_ptr()->compute_mtcalive_timeout ;       \
    }                                                                                       \
    this->goenabled_timeout = daemon_get_cfg_ptr()->goenabled_timeout + 3 ;                 \
    // Adding 3 seconds to the timeout so that the agent timeout is a
    // little longer than the client.
/*************************************************************
 *
 * Name    : calc_reset_prog_timeout
 *
 * Purpose : Calculate the overall reset progression timeout
 *
 *************************************************************/
int nodeLinkClass::calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr,
                                             int retries )
{
    /* for the management interface */
    int to = MTC_RESET_PROG_OFFLINE_TIMEOUT ;

    /* and add on for the bmc interface if it's provisioned */
    if ( node_ptr->bm_provisioned == true )
        to += MTC_RESET_PROG_OFFLINE_TIMEOUT ;

    /* add a small buffer */
    to += (MTC_ENABLED_TIMER*4) ;

    /* factor in the number of retries */
    to *= (retries+1) ;

    ilog ("%s Reboot/Reset progression has %d sec 'wait for offline' timeout\n",
              node_ptr->hostname.c_str(), to );
    ilog ("%s ... sources - mgmnt:Yes  infra:%s  bmc:%s\n",
              node_ptr->hostname.c_str(),
              infra_network_provisioned ? "Yes" : "No",
              node_ptr->bm_provisioned ? "Yes" : "No" );
    return (to);
}
void mtcTimer_handler ( int sig, siginfo_t *si, void *uc);
|
|
|
|
/* Looks up the timer ID and asserts the corresponding node's ringer */
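/* Each clause below matches the ringing timer id (tid) against one of the node's
 * timers, stops that timer in an interrupt-safe way and sets its 'ring' flag for
 * the owning FSM to act on in process context ; a tid that matches no known
 * timer is simply cancelled at the end of the handler. */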
|
|
void nodeLinkClass::timer_handler ( int sig, siginfo_t *si, void *uc)
|
|
{
|
|
struct nodeLinkClass::node * node_ptr ;
|
|
timer_t * tid_ptr = (void**)si->si_value.sival_ptr ;
|
|
|
|
/* Avoid compiler errors/warnings for parms we must
|
|
* have but currently do nothing with */
|
|
sig=sig ; uc = uc ;
|
|
|
|
if ( !(*tid_ptr) )
|
|
{
|
|
// tlog ("Called with a NULL Timer ID\n");
|
|
return ;
|
|
}
|
|
|
|
/* Is this an offline timer */
|
|
node_ptr = get_offline_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s offline timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->offline_timer );
|
|
node_ptr->offline_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID a mtcAlive timer TID ? */
|
|
node_ptr = get_mtcAlive_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s MtcAlive 'offline' timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcAlive_timer );
|
|
node_ptr->mtcAlive_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID for the command FSM ? */
|
|
node_ptr = get_mtcCmd_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Mtc Command FSM timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcCmd_timer );
|
|
node_ptr->mtcCmd_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID an in-service test timer TID ? */
|
|
node_ptr = get_insvTestTimer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Insv Test timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->insvTestTimer );
|
|
node_ptr->insvTestTimer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID an out-of-service test timer TID ? */
|
|
node_ptr = get_oosTestTimer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Oos Test timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->oosTestTimer );
|
|
node_ptr->oosTestTimer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID a swact timer TID ? */
|
|
node_ptr = get_mtcSwact_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Swact Timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcSwact_timer );
|
|
node_ptr->mtcSwact_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Dead Office Recovery (DOR) Mode Timer */
|
|
if ( *tid_ptr == mtcTimer_dor.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( mtcTimer_dor );
|
|
mtcTimer_dor.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Multi-Node Failure Avoidance Timer ? */
|
|
if ( *tid_ptr == mtcTimer_mnfa.tid )
|
|
{
|
|
// tlog ("%s Mnfa timer ring\n", mtcTimer_mnfa.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( mtcTimer_mnfa );
|
|
mtcTimer_mnfa.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is base mtc timer */
|
|
if ( *tid_ptr == mtcTimer.tid )
|
|
{
|
|
// tlog ("%s Mtc timer ring\n", mtcTimer.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( mtcTimer );
|
|
mtcTimer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is uptime refresh timer ? */
|
|
if ( *tid_ptr == mtcTimer_uptime.tid )
|
|
{
|
|
// tlog ("%s Uptime 'refresh' timer ring\n", mtcTimer_uptime.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( mtcTimer_uptime );
|
|
mtcTimer_uptime.ring = true ;
|
|
|
|
/* This timer also provides a self-corrective action handler as a secondary service.
 * Currently it looks for the following ...
 *
 * 1. A stuck libevent smgrEvent.mutex gate, which it frees after 5 uptime intervals.
 *
 **/
|
|
if ( smgrEvent.mutex )
|
|
{
|
|
daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
|
|
|
|
/* Clear this mutex flag if stuck for more than 5 minutes */
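/* Note: 'stuck' is incremented once per uptime refresh interval while the
 * mutex remains held, so this threshold scales with the configured
 * swact_timeout (converted to minutes). */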
|
|
if ( ++smgrEvent.stuck > ((cfg_ptr->swact_timeout/60)+1))
|
|
{
|
|
// wlog ("Swact Mutex found stuck and has been auto cleared\n");
|
|
smgrEvent.stuck = 0 ;
|
|
smgrEvent.mutex = false ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Clear the stuck count */
|
|
smgrEvent.stuck = 0 ;
|
|
}
|
|
return ;
|
|
}
|
|
/* is keystone token refresh timer ? */
|
|
if (( *tid_ptr == mtcTimer_token.tid ) )
|
|
{
|
|
// tlog ("%s Token 'refresh' timer ring\n", mtcTimer_token.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( mtcTimer_token );
|
|
mtcTimer_token.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is the http request timer ? */
|
|
node_ptr = get_http_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Http timer ring\n", node_ptr->http_timer.hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->http_timer );
|
|
node_ptr->http_timer.ring = true ;
|
|
|
|
if ( node_ptr->http_timer.mutex == true )
|
|
node_ptr->http_timer.error = true ;
|
|
|
|
return ;
|
|
}
|
|
|
|
/* get the node */
|
|
node_ptr = get_mtcTimer_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcTimer );
|
|
node_ptr->mtcTimer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* Is this TID a config timer TID ? */
|
|
node_ptr = get_mtcConfig_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
// tlog ("%s Config Timer ring\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop_int_safe ( node_ptr->mtcConfig_timer );
|
|
node_ptr->mtcConfig_timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is the thread timer ? */
|
|
node_ptr = get_thread_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->ipmitool_thread_ctrl.timer );
|
|
node_ptr->ipmitool_thread_ctrl.timer.ring = true ;
|
|
return ;
|
|
}
|
|
|
|
/* is the ping timer ? */
|
|
node_ptr = get_ping_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
/* is this the bm ping timer */
|
|
if ( *tid_ptr == node_ptr->bm_ping_info.timer.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->bm_ping_info.timer );
|
|
node_ptr->bm_ping_info.timer.ring = true ;
|
|
return ;
|
|
}
|
|
/* there may be other ping timers introduced later */
|
|
}
|
|
|
|
/* is the bm handler timer ? */
|
|
node_ptr = get_bm_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
/* is this the bm timer */
|
|
if ( *tid_ptr == node_ptr->bm_timer.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->bm_timer );
|
|
node_ptr->bm_timer.ring = true ;
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* is the bmc handler timer ? */
|
|
node_ptr = get_bmc_access_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
/* is this the bmc access timer */
|
|
if ( *tid_ptr == node_ptr->bmc_access_timer.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->bmc_access_timer );
|
|
node_ptr->bmc_access_timer.ring = true ;
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* is the host services handler timer ? */
|
|
node_ptr = get_host_services_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
/* is this the host services timer */
|
|
if ( *tid_ptr == node_ptr->host_services_timer.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( node_ptr->host_services_timer );
|
|
node_ptr->host_services_timer.ring = true ;
|
|
return ;
|
|
}
|
|
}
|
|
|
|
node_ptr = get_powercycle_recovery_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
if (( *tid_ptr == node_ptr->hwmon_powercycle.recovery_timer.tid ) )
|
|
{
|
|
if ( node_ptr->hwmon_powercycle.attempts )
|
|
{
|
|
tlog ("%s powercycle monitor completed successfully after attempt %d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.attempts);
|
|
}
|
|
|
|
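/* the powercycle recovery hold-off period has expired ; re-initialize the
 * powercycle recovery control context for any future events */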
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
|
|
|
|
if (( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
|
( node_ptr->availStatus != MTC_AVAIL_STATUS__POWERED_OFF ))
|
|
{
|
|
node_ptr->clear_task = true ;
|
|
}
|
|
|
|
/* cancel the timer */
|
|
mtcTimer_stop_int_safe ( node_ptr->hwmon_powercycle.recovery_timer );
|
|
|
|
node_ptr->hwmon_powercycle.recovery_timer.ring = true ;
|
|
|
|
return ;
|
|
}
|
|
}
|
|
|
|
node_ptr = get_powercycle_control_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
if (( *tid_ptr == node_ptr->hwmon_powercycle.control_timer.tid ) )
|
|
{
|
|
/* cancel the timer */
|
|
mtcTimer_stop_int_safe ( node_ptr->hwmon_powercycle.control_timer );
|
|
|
|
node_ptr->hwmon_powercycle.control_timer.ring = true ;
|
|
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* Is this TID a reset recovery timer TID ? */
|
|
node_ptr = get_reset_recovery_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
if (( *tid_ptr == node_ptr->hwmon_reset.recovery_timer.tid ) )
|
|
{
|
|
tlog ("%s clearing hwmon reset holdoff timer\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
|
|
mtcTimer_stop_int_safe ( node_ptr->hwmon_reset.recovery_timer );
|
|
|
|
node_ptr->hwmon_reset.recovery_timer.ring = true ;
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* Is this TID a reset control timer TID ? */
|
|
node_ptr = get_reset_control_timer ( *tid_ptr );
|
|
if ( node_ptr )
|
|
{
|
|
if (( *tid_ptr == node_ptr->hwmon_reset.control_timer.tid ) )
|
|
{
|
|
tlog ("%s ringing hwmon reset control timer\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_stop_int_safe ( node_ptr->hwmon_reset.control_timer );
|
|
|
|
node_ptr->hwmon_reset.control_timer.ring = true ;
|
|
|
|
return ;
|
|
}
|
|
}
|
|
|
|
/* cancel the timer by tid */
|
|
mtcTimer_stop_tid_int_safe ( tid_ptr );
|
|
}
|
|
|
|
/* Inventory Object wrapper - does a node lookup and calls the timer handler */
|
|
void mtcTimer_handler ( int sig, siginfo_t *si, void *uc)
|
|
{
|
|
nodeLinkClass * object_ptr = get_mtcInv_ptr() ;
|
|
object_ptr->timer_handler ( sig, si, uc );
|
|
}
|
|
|
|
/** Responsible for recovering a host into its enabled state
 *
 *  Steps: availability is either unavailable, failed, or intest if a previous enable failed
 *   1. enable Start
 *        operational = disabled
 *   2. Notify VM Manager (signal)
 *   3. send disabled message to heartbeat service (message)
 *   4. reboot host (message)
 *        availability = intest
 *   5. wait for mtc alive (timer)
 *   6. wait for go enabled (timer)
 *   7. send enabled message to heartbeat service (message)
 *   8. change state to enabled
 *        availability = available
 */
|
|
|
|
int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( THIS_HOST )
|
|
{
|
|
/******************************************************************
|
|
*
|
|
* Intercept the unlock action for self.
|
|
* 1. change the admin state to unlocked,
|
|
* 2. send a lazy reboot and
|
|
* 3. wait for the reboot
|
|
*
|
|
******************************************************************/
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
|
|
{
|
|
bool aio = false ;
|
|
if ( SIMPLEX_CPE_SYSTEM )
|
|
aio = true ;
|
|
else
|
|
aio = false ;
|
|
|
|
mtcInvApi_update_states_now ( node_ptr, "unlocked", "disabled" , "offline", "disabled", "offline" );
|
|
mtcInvApi_update_task_now ( node_ptr, aio ? MTC_TASK_CPE_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG );
|
|
|
|
wlog ("%s unlocking %s with reboot\n",
|
|
my_hostname.c_str(),
|
|
aio ? "Simplex System" : "Active Controller" );
|
|
|
|
/* should not return */
|
|
return ( lazy_graceful_fs_reboot ( node_ptr ));
|
|
}
|
|
}
|
|
|
|
switch ( (int)node_ptr->handlerStage.enable )
|
|
{
|
|
case MTC_ENABLE__FAILURE:
|
|
{
|
|
/**************************************************************
 * Failure of the active controller has special handling.
 *
 * Condition 1: There is no in-service backup controller
 *              to swact to. In this case the active controller
 *              - is only degraded to avoid a system outage.
 *              - the CPE subfunction is failed
 *              - compute SubFunction Alarm is raised
 *              - Enable alarm is raised
 *              - A process monitor alarm may also be raised if
 *                the failure was that of a critical process.
 *
 * Condition 2: There is another controller to Swact to.
 *              In this case the active controller is failed
 *              and maintenance will trigger SM to Swact and
 *              the failing active controller will get
 *              auto-recovered by the takeover controller.
 *
 * Condition 3: AIO Simplex failures can request thresholded
 *              auto-recovery. In doing so maintenance will
 *              increment the count in an auto recovery counter
 *              file and self reboot if that count does not exceed
 *              the auto recovery threshold. After 3 retries the
 *              threshold is exceeded and then maintenance stops
 *              self rebooting and enters the state specified by
 *              condition 1 above.
 *
 ***************************************************************/
|
|
bool degrade_only = false ;
|
|
|
|
elog ("%s Main Enable FSM (from failed)\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* Stop heartbeat */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
|
{
|
|
hbs_minor_clear ( node_ptr, (iface_enum)iface );
|
|
}
|
|
|
|
node_ptr->cmdReq = MTC_CMD_NONE ;
|
|
node_ptr->cmdRsp = MTC_CMD_NONE ;
|
|
node_ptr->cmdRsp_status = 0 ;
|
|
|
|
/* Raise Critical Enable Alarm */
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
/* Handle active controller failures */
|
|
if ( THIS_HOST )
|
|
{
|
|
/* Don't fail the only controller, degrade instead */
|
|
degrade_only = true ;
|
|
|
|
/* If the inactive controller is enabled then try to swact to it.
 * SM will reject till it's ready ; until then just run degraded */
|
|
if ( is_inactive_controller_main_insv() == true )
|
|
{
|
|
wlog ("%s has critical failure\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... requesting swact to in-service inactive controller\n", node_ptr->hostname.c_str());
|
|
|
|
mtcInvApi_update_task_now ( node_ptr, MTC_TASK_FAILED_SWACT_REQ );
|
|
|
|
/* Inform the VIM of the failure */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
|
|
|
/* ask SM to swact to the backup controller */
|
|
mtcSmgrApi_request ( node_ptr, CONTROLLER_SWACT, 0 );
|
|
|
|
for ( int i = 0 ; i < SMGR_MAX_RETRIES ; i++ )
|
|
{
|
|
daemon_signal_hdlr ();
|
|
sleep (1);
|
|
|
|
/* Try and receive the response */
|
|
if ( mtcHttpUtil_receive ( nodeLinkClass::smgrEvent ) != RETRY )
|
|
{
|
|
wlog ("%s SM Swact Request Response: %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
smgrEvent.response.c_str());
|
|
break ;
|
|
}
|
|
}
|
|
if ( nodeLinkClass::smgrEvent.active == true )
|
|
{
|
|
slog ("%s freeing smgrEvent activity state\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::smgrEvent.active = false ;
|
|
}
|
|
|
|
/* if we get here then proceed to delay for another swact attempt */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_SWACT_WAIT );
|
|
|
|
/* force ourselves into the enable handler */
|
|
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE) &&
|
|
( node_ptr->adminAction != MTC_ADMIN_ACTION__SWACT) &&
|
|
( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK) &&
|
|
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK))
|
|
{
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__ENABLE );
|
|
}
|
|
|
|
/* Wait 30 seconds before trying the Swact again */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_30 );
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
this->autorecovery_enabled = true ;
|
|
|
|
/* use thresholded auto recovery for the simplex failure case */
|
|
manage_autorecovery ( node_ptr );
|
|
|
|
if ( this->autorecovery_disabled == false )
|
|
{
|
|
wlog ("%s has critical failure.\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... downgrading to degrade with auto recovery disabled\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... to avoid disabling only enabled controller\n", node_ptr->hostname.c_str());
|
|
this->autorecovery_disabled = true ;
|
|
}
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
/* Raise Critical Compute Function Alarm */
|
|
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL );
|
|
}
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
}
|
|
}
|
|
|
|
/* Start fresh the next time we enter graceful recovery handler */
|
|
node_ptr->graceful_recovery_counter = 0 ;
|
|
node_ptr->health_threshold_counter = 0 ;
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
node_ptr->inservice_failed_subf = true ;
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
|
|
if ( degrade_only == true )
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
else
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
|
|
/* Inform the VIM of the failure */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
|
|
|
/* if we get here in controller simplex mode then go degraded
|
|
* if we are not already degraded. Otherwise, fail. */
|
|
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
|
|
{
|
|
/* autorecovery must be disabled */
|
|
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
|
|
( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
|
|
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED))
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
/* adminAction state is already changed to NONE. */
|
|
}
|
|
else
|
|
{
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT );
|
|
}
|
|
|
|
break;
|
|
}
|
|
case MTC_ENABLE__FAILURE_SWACT_WAIT:
|
|
{
|
|
if (( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
|
|
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED ))
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
|
|
/* wait for the swact, or re-enter MTC_ENABLE__FAILURE and likely
 * try the swact request again */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__FAILURE_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) == false )
|
|
{
|
|
break ;
|
|
}
|
|
/* Stop the enable sequence if the host is now locked ;
 * this might occur if the unlock failed from inventory */
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
|
{
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
mtcInvApi_update_task ( node_ptr, "" );
|
|
}
|
|
enableStageChange ( node_ptr, MTC_ENABLE__START );
|
|
node_ptr->mtcTimer.ring = false ;
|
|
break ;
|
|
/* Fall through */
|
|
}
|
|
case MTC_ENABLE__START:
|
|
{
|
|
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
|
|
|
plog ("%s Main Enable FSM (from start)%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (from DOR)" : "" );
|
|
|
|
/* clear all the past enable failure bools */
|
|
clear_main_failed_bools ( node_ptr );
|
|
clear_subf_failed_bools ( node_ptr );
|
|
clear_hostservices_ctls ( node_ptr );
|
|
|
|
/* Clear all degrade flags except for the HWMON one */
|
|
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
|
node_ptr->degraded_resources_list.clear();
|
|
|
|
/* Purge this hosts work and done queues */
|
|
workQueue_purge ( node_ptr );
|
|
doneQueue_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
/* Assert the mtc alive gate */
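/* While gated (true), incoming mtcAlive indications for this node are
 * ignored ; the gate is reopened later in the enable sequence. */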
|
|
node_ptr->mtcAlive_gate = true ;
|
|
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_offline = true ;
|
|
node_ptr->health_threshold_counter = 0 ;
|
|
node_ptr->graceful_recovery_counter = 0 ;
|
|
node_ptr->http_retries_cur = 0 ;
|
|
node_ptr->insv_test_count = 0 ;
|
|
node_ptr->mnfa_graceful_recovery = false ;
|
|
|
|
node_ptr->goEnabled = false ;
|
|
node_ptr->goEnabled_subf = false ;
|
|
|
|
mtc_nodeAvailStatus_enum availStatus_temp = node_ptr->availStatus ;
|
|
switch ( node_ptr->availStatus )
|
|
{
|
|
case MTC_AVAIL_STATUS__INTEST:
|
|
case MTC_AVAIL_STATUS__FAILED:
|
|
|
|
/* enable auto recovery if the inactive controller
|
|
* is out of service */
|
|
if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST ))
|
|
this->autorecovery_enabled = true ;
|
|
|
|
/* fall through */
|
|
|
|
case MTC_AVAIL_STATUS__DEGRADED:
|
|
case MTC_AVAIL_STATUS__AVAILABLE:
|
|
{
|
|
if (( is_active_controller ( node_ptr->hostname )) &&
|
|
( is_inactive_controller_main_insv() == false ))
|
|
{
|
|
wlog ("%s recovering active controller from %s-%s-%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
get_adminState_str(node_ptr->adminState).c_str(),
|
|
get_operState_str(node_ptr->operState).c_str(),
|
|
get_availStatus_str(node_ptr->availStatus).c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr, "" );
|
|
|
|
/* Special case */
|
|
// alarm_enabled_clear ( node_ptr, false );
|
|
|
|
//mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
|
|
//node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
|
|
|
|
//allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
// MTC_OPER_STATE__ENABLED,
|
|
// MTC_AVAIL_STATUS__DEGRADED );
|
|
|
|
// adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
|
|
// return (PASS);
|
|
}
|
|
else
|
|
{
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )
|
|
{
|
|
if ( node_ptr->operState != MTC_OPER_STATE__DISABLED )
|
|
{
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_FAILED );
|
|
}
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
/* Lets make any availability state corrections */
|
|
|
|
case MTC_AVAIL_STATUS__OFFDUTY:
|
|
case MTC_AVAIL_STATUS__ONLINE:
|
|
availStatus_temp = MTC_AVAIL_STATUS__ONLINE;
|
|
break ;
|
|
case MTC_AVAIL_STATUS__OFFLINE:
|
|
case MTC_AVAIL_STATUS__NOT_INSTALLED:
|
|
availStatus_temp = MTC_AVAIL_STATUS__OFFLINE;
|
|
break ;
|
|
|
|
default:
|
|
slog ("Unknown availability state (%d)\n", availStatus_temp);
|
|
break ;
|
|
}
|
|
|
|
/* Never send a disable request to SM for this controller
|
|
* or SM will shut us down. */
|
|
if ( is_controller ( node_ptr ) && NOT_THIS_HOST )
|
|
{
|
|
mtcSmgrApi_request ( node_ptr,
|
|
CONTROLLER_DISABLED,
|
|
SMGR_MAX_RETRIES );
|
|
}
|
|
rc = allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
availStatus_temp );
|
|
|
|
if (( rc != PASS ) && ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ))
|
|
{
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__LOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
availStatus_temp );
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_UNLOCK_FAILED );
|
|
|
|
elog ("%s 'unlock' failed by System Inventory (rc:%d)\n",
|
|
node_ptr->hostname.c_str(), rc ) ;
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, 15 );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( NOT_THIS_HOST )
|
|
{
|
|
/* lets stop heartbeat */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_STOP_CMD );
|
|
}
|
|
else
|
|
{
|
|
/* skip over the reset part as that was taken care of and we are
 * in the reboot recovery phase now. Look for the mtcAlive */

/* In self-enable we don't need to purge mtcAlive ; we just need
 * to wait for one more. Assume offline, not online, and open
 * the mtcAlive gate. */
|
|
node_ptr->mtcAlive_gate = false ;
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_offline = true ;
|
|
/* set mtcAlive timeout */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_30 ) ;
|
|
|
|
/* timer is started ok so we can do the stage transition */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__MTCALIVE_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__HEARTBEAT_STOP_CMD:
|
|
{
|
|
/* Stop heartbeat */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
|
|
/* Clear the minor and failure flags if they are set for this host */
|
|
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
|
{
|
|
hbs_minor_clear ( node_ptr, (iface_enum)iface );
|
|
node_ptr->heartbeat_failed[iface] = false ;
|
|
}
|
|
|
|
/* now reset/reboot the node by running reset progression */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION );
|
|
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__RECOVERY_TIMER:
|
|
{
|
|
/* start the recovery wait timer */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECOVERY_TIMEOUT );
|
|
ilog ("%s Delaying Recovery for %d seconds\n",
|
|
node_ptr->hostname.c_str(),MTC_RECOVERY_TIMEOUT);
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RECOVERY_WAIT );
|
|
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__RECOVERY_WAIT:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION );
|
|
|
|
node_ptr->mtcTimer.ring = false ;
|
|
}
|
|
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )
|
|
{
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
break;
|
|
}
|
|
case MTC_ENABLE__RESET_PROGRESSION:
|
|
{
|
|
int overall_timeout = 0 ;
|
|
|
|
plog ("%s reboot\n", node_ptr->hostname.c_str() );
|
|
|
|
/* Health will get updated in the first
|
|
* mtcAlive message after reset */
|
|
node_ptr->health = NODE_HEALTH_UNKNOWN ;
|
|
|
|
node_ptr->mtcCmd_work_fifo.clear();
|
|
mtcCmd_init ( node_ptr->cmd );
|
|
node_ptr->cmd.stage = MTC_CMD_STAGE__START ;
|
|
node_ptr->cmd.cmd = MTC_OPER__RESET_PROGRESSION ;
|
|
node_ptr->cmd.parm1 = 0 ; /* retries */
|
|
node_ptr->cmd.task = true ; /* send task updates */
|
|
node_ptr->mtcCmd_work_fifo.push_front(node_ptr->cmd);
|
|
|
|
/* calculate the overall timeout period taking into account
|
|
* all the reboot/reset sources that will be tried */
|
|
overall_timeout = calc_reset_prog_timeout ( node_ptr , node_ptr->cmd.parm1 ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, overall_timeout ) ;
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RESET_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__RESET_WAIT:
|
|
{
|
|
/* Wait for the reset progression FSM to complete */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
wlog ("%s Reset Progression Timeout\n", node_ptr->hostname.c_str());
|
|
|
|
/* trigger some delay before another attempt */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RECOVERY_TIMER );
|
|
|
|
/* if we timeout then remove the reset progression command
|
|
* and cleanup the done queue ; just in case */
|
|
if ( node_ptr->mtcCmd_done_fifo.size() )
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
if ( node_ptr->mtcCmd_work_fifo.size() )
|
|
node_ptr->mtcCmd_work_fifo.pop_front();
|
|
}
|
|
else if ( node_ptr->mtcCmd_done_fifo.size() )
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
node_ptr->mtcCmd_done_fifo_ptr =
|
|
node_ptr->mtcCmd_done_fifo.begin();
|
|
if ( node_ptr->mtcCmd_done_fifo_ptr->status != PASS )
|
|
{
|
|
wlog ("%s Reset Unsuccessful (retries:%d) (rc:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->cmd.parm1,
|
|
node_ptr->mtcCmd_done_fifo_ptr->status );
|
|
|
|
/* trigger some delay before another attempt */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RECOVERY_TIMER );
|
|
}
|
|
else /* ... we got the reset or reboot */
|
|
{
|
|
/* Set the FSM task state to booting */
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_BOOTING );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__INTEST_START );
|
|
}
|
|
/* Remove the reset progression command now that it is done */
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__INTEST_START:
|
|
{
|
|
plog ("%s Booting (timeout: %d secs) (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcalive_timeout,
|
|
node_ptr->node_unlocked_counter);
|
|
|
|
node_ptr->cmdReq = MTC_CMD_NONE ;
|
|
node_ptr->cmdRsp = MTC_CMD_NONE ;
|
|
node_ptr->unknown_health_reported = false ;
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_offline = true ;
|
|
node_ptr->goEnabled = false ;
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
/* Set uptime to zero in mtce and in the database */
|
|
node_ptr->uptime_save = 0 ;
|
|
set_uptime ( node_ptr, 0 , false );
|
|
|
|
/* start the timer that waits for MTC READY */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, node_ptr->mtcalive_timeout );
|
|
|
|
node_ptr->mtcAlive_purge = 0 ;
|
|
|
|
/* timer is started ok so we can do the stage transition */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__MTCALIVE_PURGE );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__MTCALIVE_PURGE:
|
|
{
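/* Purge stage: spend a number of FSM passes discarding stale mtcAlive
 * indications received around the reboot request before opening the
 * mtcAlive gate and waiting for a fresh mtcAlive message. */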
|
|
node_ptr->mtcAlive_purge += 1 ;
|
|
|
|
if ( node_ptr->mtcAlive_purge >= 20 )
|
|
{
|
|
/* open gate */
|
|
node_ptr->mtcAlive_gate = false ;
|
|
|
|
node_ptr->mtcAlive_purge = 0 ;
|
|
/* timer is started ok so we can do the stage transition */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__MTCALIVE_WAIT );
|
|
}
|
|
#ifdef WANT_PURGE_LOG
|
|
else
|
|
{
|
|
dlog2 ("%s purging (%d) ...\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcAlive_purge );
|
|
}
|
|
#endif
|
|
/* Clear out any mtcAlive messages that may
|
|
* have come in while we were purging */
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_offline = true ;
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__MTCALIVE_WAIT:
|
|
{
|
|
/* search for the mtc alive message */
|
|
if ( node_ptr->mtcAlive_online == true )
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* Check to see if the host is/got configured correctly */
|
|
if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 )
|
|
{
|
|
elog ("%s configuration incomplete or failed (oob:%x:%x)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtce_flags,
|
|
MTC_FLAG__I_AM_CONFIGURED);
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
else
|
|
{
|
|
plog ("%s is MTCALIVE (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
|
|
|
|
/* Set the node mtcAlive timer to configured value.
|
|
* This will revert back to the normal timeout after any first
|
|
* unlock value that may be in effect. */
|
|
LOAD_NODETYPE_TIMERS ;
|
|
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_ONLINE );
|
|
node_ptr->offline_log_reported = false ;
|
|
node_ptr->online_log_reported = true ;
|
|
|
|
/* Request Out-Of-Service test execution */
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE );
|
|
|
|
/* now officially in the In-Test state */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__INTEST );
|
|
|
|
/* O.K. Clear the alive */
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
/* Go to the goEnabled stage */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER );
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_TESTING );
|
|
}
|
|
break ;
|
|
}
|
|
else if ( mtcTimer_expired ( node_ptr->mtcTimer) )
|
|
{
|
|
elog ("%s Timeout waiting for MTCALIVE\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the enable failure */
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
/* go back and issue reboot again */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION );
|
|
|
|
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )
|
|
{
|
|
/* no longer In-Test ; we are 'Failed' again */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
|
|
/* Set the FSM task state to init failed */
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_BOOT_FAIL );
|
|
|
|
break ;
|
|
}
|
|
else if ( node_ptr->mtcAlive_gate == true )
|
|
{
|
|
slog ("%s mtcAlive gate unexpectedly set, correcting ...\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
node_ptr->mtcAlive_gate = false ;
|
|
}
|
|
|
|
/* wait some more */
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__GOENABLED_TIMER:
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer,
|
|
mtcTimer_handler, this->goenabled_timeout);
|
|
|
|
ilog ("%s waiting for GOENABLED (timeout: %d secs)\n",
|
|
node_ptr->hostname.c_str(), this->goenabled_timeout );
|
|
|
|
node_ptr->goEnabled = false ;
|
|
|
|
/* start waiting for the ENABLE READY message */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__GOENABLED_WAIT:
|
|
{
|
|
/* The healthy code comes from the host in the mtcAlive message.
|
|
* This 'if' clause was introduced to detect failure of the host
|
|
* without having to wait for the GOENABLED phase to timeout.
|
|
*
|
|
* This case is particularly important in the DOR case where
|
|
* computes may have come up and fail to run their manifests
|
|
* and sit there in an unconfigured state. We don't want them to
|
|
* be gracefully recovered to enabled in that case. Instead
|
|
* we want to recover the card through a reset as quickly as
|
|
* possible. */
|
|
if ( node_ptr->health == NODE_UNHEALTHY )
|
|
{
|
|
elog ("%s is UNHEALTHY\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled_failed == true )
|
|
{
|
|
elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled == true )
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
plog ("%s got GOENABLED\n", node_ptr->hostname.c_str());
|
|
// plog ("%s main configured OK\n", node_ptr->hostname.c_str());
|
|
|
|
/* O.K. clearing the state now that we got it */
|
|
node_ptr->goEnabled = false ;
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_INITIALIZING );
|
|
|
|
/* ok. great, got the go-enabled message, lets move on */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START );
|
|
}
|
|
else if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
|
{
|
|
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... the out-of-service tests took too long to complete\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL_TO_ );
|
|
node_ptr->mtcTimer.ring = false ;
|
|
|
|
/* raise an alarm for the enable failure */
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
/* go back and issue reboot again */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
|
|
/* no longer In-Test ; we are 'Failed' again */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
else
|
|
{
|
|
; /* wait some more */
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__HOST_SERVICES_START:
|
|
{
|
|
bool start = true ;
|
|
|
|
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
|
|
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
|
{
|
|
node_ptr->hostservices_failed = true ;
|
|
|
|
elog ("%s %s failed ; launch\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_START_SERVICE_FAIL );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
else
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING );
|
|
|
|
/* Only run hardware monitor if board management is provisioned */
|
|
if ( node_ptr->bm_provisioned == true )
|
|
{
|
|
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
}
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__HOST_SERVICES_WAIT:
|
|
{
|
|
/* Wait for host services to complete - pass or fail.
|
|
* The host_services_handler manages timeout. */
|
|
rc = this->host_services_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait for the mtcClient's response ... */
|
|
break ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
node_ptr->hostservices_failed = true ;
|
|
/* distinguish 'timeout' from other 'execution' failures */
|
|
if ( rc == FAIL_TIMEOUT )
|
|
{
|
|
elog ("%s %s failed ; timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_TO );
|
|
}
|
|
else
|
|
{
|
|
elog ("%s %s failed ; rc:%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str(),
|
|
rc);
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_FAIL );
|
|
}
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
else /* success path */
|
|
{
|
|
/* Don't start the self heartbeat for the active controller.
|
|
* Also, in AIO , hosts that have a controller function also
|
|
* have a compute function and the heartbeat for those hosts
|
|
* are started at the end of the subfunction handler. */
|
|
if (( THIS_HOST ) ||
|
|
(( CPE_SYSTEM ) && ( is_controller(node_ptr)) ))
|
|
{
|
|
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
|
|
}
|
|
else
|
|
{
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_ENABLE__HEARTBEAT_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
wlog ("%s hbsClient ready event timeout\n", node_ptr->hostname.c_str());
|
|
}
|
|
else if ( node_ptr->hbsClient_ready == false )
|
|
{
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
}
|
|
|
|
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
|
|
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
|
|
|
|
/* Start Monitoring Services - heartbeat, process and hardware */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
|
|
/* allow heartbeat to run for 10 seconds before we declare enable */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__HEARTBEAT_SOAK:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
plog ("%s heartbeating\n", node_ptr->hostname.c_str() );
|
|
/* if heartbeat is not working then we will
|
|
* never get here and enable the host */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__STATE_CHANGE:
|
|
{
|
|
/* Check the work queue complete and done status's */
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
|
|
if ( node_ptr->degrade_mask )
|
|
{
|
|
/* Allow host to enable in the degraded state */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
else
|
|
{
|
|
/* Set node as unlocked-enabled */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__AVAILABLE );
|
|
}
|
|
|
|
/* Now that we have posted the unlocked-enabled-available state we need
|
|
* to force the final part of the enable sequence through */
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE )
|
|
{
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__ENABLE );
|
|
}
|
|
|
|
/* Start a timer that fails the enable if the work queue
|
|
* does not empty or if commands in the done queue have failed */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, work_queue_timeout );
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__WORKQUEUE_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__WORKQUEUE_WAIT:
|
|
{
|
|
bool fail = false ;
|
|
rc = workQueue_done ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait longer */
|
|
break ;
|
|
}
|
|
else if ( rc == FAIL_WORKQ_TIMEOUT )
|
|
{
|
|
elog ("%s enable failed ; Enable workQueue timeout, purging ...\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_WORK_TO );
|
|
fail = true ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
elog ("%s Enable failed ; Enable doneQueue has failed commands\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_WORK_FAIL );
|
|
fail = true ;
|
|
}
|
|
else if ( NOT_THIS_HOST )
|
|
{
|
|
/* Loop over the heartbeat interfaces and fail the Enable if any of them are failing */
|
|
for ( int i = 0 ; i < MAX_IFACES ; i++ )
|
|
{
|
|
if ( node_ptr->heartbeat_failed[i] == true )
|
|
{
|
|
elog ("%s Enable failure due to %s Network *** Heartbeat Loss ***\n",
|
|
node_ptr->hostname.c_str(),
|
|
get_iface_name_str ((iface_enum)i));
|
|
|
|
fail = true ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( fail == false )
|
|
{
|
|
/* Go enabled */
|
|
enableStageChange ( node_ptr, MTC_ENABLE__ENABLED );
|
|
}
|
|
else
|
|
{
|
|
workQueue_purge ( node_ptr );
|
|
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
|
}
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
break ;
|
|
}
|
|
case MTC_ENABLE__ENABLED:
|
|
{
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
/* Defer telling SM the controller state if
|
|
* this is a CPE and this is the only controller */
|
|
if ( CPE_SYSTEM && ( num_controllers_enabled() > 0 ))
|
|
{
|
|
wlog ("%s deferring SM enable notification till subfunction-enable complete\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
mtc_cmd_enum cmd = CONTROLLER_ENABLED ;
|
|
|
|
/* Override cmd of ENABLED if action is UNLOCK */
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
|
|
{
|
|
cmd = CONTROLLER_UNLOCKED ;
|
|
}
|
|
|
|
if ( mtcSmgrApi_request ( node_ptr, cmd, SMGR_MAX_RETRIES ) != PASS )
|
|
{
|
|
wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager (%d) ; enabling anyway\n",
|
|
node_ptr->hostname.c_str(), cmd );
|
|
}
|
|
}
|
|
}
|
|
|
|
alarm_enabled_clear ( node_ptr, false );
|
|
|
|
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
|
|
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
|
|
node_ptr->degrade_mask &= ~DEGRADE_MASK_CONFIG ;
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__START );
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr)))
|
|
{
|
|
ilog ("%s running compute sub-function enable handler\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF );
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__ENABLE_SUBF );
|
|
}
|
|
else
|
|
{
|
|
|
|
node_ptr->enabled_count++ ;
|
|
|
|
/* Inform the VIM that this host is enabled */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
|
|
|
|
plog ("%s is ENABLED%s\n", node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (from DOR)" : "");
|
|
node_ptr->dor_recovery_mode = false ;
|
|
node_ptr->was_dor_recovery_mode = false ;
|
|
node_ptr->http_retries_cur = 0 ;
|
|
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
|
|
node_ptr->health_threshold_counter = 0 ;
|
|
}
|
|
|
|
break ;
|
|
}
|
|
|
|
default:
|
|
rc = FAIL_BAD_CASE ;
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
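/* Tracks the last graceful recovery stage observed by recovery_handler below
 * so that stage transitions can be detected ; -1 means no stage seen yet. */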
int recovery_state_gate = -1 ;
|
|
|
|
/* Graceful Recovery handler
|
|
* -------------------------
|
|
* Tries to recover a failed host back in service
|
|
* - auto recovery if it only disappeared for 5 seconds
|
|
* - avoiding a double reset if it was gone for longer or was known to reset */
|
|
int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( node_ptr->recoveryStage != recovery_state_gate )
|
|
{
|
|
recovery_state_gate = node_ptr->recoveryStage ;
|
|
}
|
|
switch ( (int)node_ptr->recoveryStage )
|
|
{
|
|
case MTC_RECOVERY__FAILURE:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == false )
|
|
{
|
|
break ;
|
|
}
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
|
node_ptr->mtcTimer.ring = false ;
|
|
|
|
break ;
|
|
}
|
|
|
|
case MTC_RECOVERY__START:
|
|
{
|
|
/* Purge this hosts work queues */
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
node_ptr->http_retries_cur = 0 ;
|
|
node_ptr->unknown_health_reported = false ;
|
|
|
|
plog ("%s %sGraceful Recovery (uptime was %d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mnfa_graceful_recovery ? "MNFA " : "",
|
|
node_ptr->uptime );
|
|
|
|
/* Cancel any outstanding timers */
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* clear all the past enable failure bools */
|
|
clear_main_failed_bools ( node_ptr );
|
|
clear_subf_failed_bools ( node_ptr );
|
|
clear_hostservices_ctls ( node_ptr );
|
|
|
|
/* Disable the heartbeat service for Graceful Recovery */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
|
|
/* Clear the minor and failure flags if it is set for this host */
|
|
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
|
{
|
|
hbs_minor_clear ( node_ptr, (iface_enum)iface );
|
|
node_ptr->heartbeat_failed[iface] = false ;
|
|
}
|
|
|
|
/* Have we reached the maximum allowed fast recovery attempts.
|
|
*
|
|
* If we have then force the full enable by
|
|
* 1. clearing the recovery action
|
|
* 2. Setting the node operational state to Disabled
|
|
* 3. Setting the Enable action
|
|
*/
|
|
if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
|
|
{
|
|
/* gate off further mtcAlive messaging until the offline
 * handler runs. This prevents stale messages from making it
 * in and prolonging the offline detection time */
|
|
node_ptr->mtcAlive_gate = true ;
|
|
|
|
elog ("%s Graceful Recovery Failed (retries=%d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->graceful_recovery_counter );
|
|
|
|
/* This forces exit from the recovery handler and entry into the
 * enable_handler via the FAILED availability state and no action. */
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
/* TODO: Consider taking this log out as writing to the database
|
|
* during a fast graceful recovery might not be the best idea */
|
|
if ( node_ptr->graceful_recovery_counter > 1 )
|
|
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
|
|
else
|
|
mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
|
|
}
|
|
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__REQ_MTCALIVE:
|
|
{
|
|
/* Clear any recent mtcAlive notification ; start a new :) */
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
/* Clear any recent goEnable notification ; start a new :) */
|
|
node_ptr->goEnabled = false ;
|
|
|
|
/* Save the node's last recorded uptime and request mtcAlive from
|
|
* seemingly failed host. Uptime is saved because when the next
|
|
* mtcAlive comes in, the uptime will be overwritten and we need
* it to compare as a decision point later on in recovery handling */
|
|
node_ptr->uptime_save = node_ptr->uptime ;
|
|
|
|
/* A host is considered failed if it goes away for more
|
|
* than a Loss Of Communication Recovery Timeout specified as mtc.ini
|
|
* configuration option 'loc_recovery_timeout' time in seconds. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, loc_recovery_timeout );
|
|
|
|
ilog ("%s requesting mtcAlive with %d sec timeout\n",
|
|
node_ptr->hostname.c_str(), loc_recovery_timeout);
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE_WAIT ) ;
|
|
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__REQ_MTCALIVE_WAIT:
|
|
{
|
|
if ( node_ptr->mtcAlive_online == true )
|
|
{
|
|
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
|
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
ilog ("%s got requested mtcAlive%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
|
|
|
|
/* Check to see if the host is/got configured correctly */
|
|
if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 )
|
|
{
|
|
elog ("%s Not Configured (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
|
|
/* Check to see if the host is/got configured correctly */
|
|
else if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) )
|
|
{
|
|
elog ("%s Configuration Failure (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
|
|
else if ( node_ptr->mnfa_graceful_recovery == true )
|
|
{
|
|
if ( node_ptr->uptime > MTC_MINS_10 )
|
|
{
|
|
/* did not reboot case */
|
|
wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
|
|
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START ) ;
|
|
}
|
|
else
|
|
{
|
|
/* did reboot case */
|
|
wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... without additional reboot %s\n",
|
|
node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
|
|
|
|
/* now officially in the In-Test state */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__INTEST );
|
|
|
|
/* O.K. Clear the alive */
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
/* Go to the goEnabled stage */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
|
|
|
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
|
{
|
|
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
}
|
|
break ;
|
|
}
|
|
}
|
|
else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
|
|
{
|
|
/* did not reboot case */
|
|
wlog ("%s Connectivity Recovered ; host did not reset%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
|
|
|
|
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
|
|
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START ) ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... continuing%sgraceful recovery ; (OOB: %08x)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->was_dor_recovery_mode ? " (DOR) " : " ",
|
|
node_ptr->mtce_flags);
|
|
ilog ("%s ... without additional reboot %s (uptime:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_ip.empty() ? "or reset" : "",
|
|
node_ptr->uptime );
|
|
|
|
/* now officially in the In-Test state */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__INTEST );
|
|
|
|
/* Go to the goEnabled stage */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
|
|
|
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
|
{
|
|
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
}
|
|
}
|
|
}
|
|
/* A timer ring indicates that the host is not up */
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
/* So now this means the node is failed
|
|
* we need to stop services and transition into
|
|
* a longer 'waiting' for the asynchronous mtcAlive
|
|
* that should come as part of the automatic reboot
|
|
* Steps are
|
|
* 1. Stop Services
|
|
* 2. Create mtcAlive timer
|
|
* 2a. MtcAlive indicating reset ; run start services and recover
|
|
* 2b. MtcAlive indicating no reset ; force full enable
|
|
* 2c MtcAlive Timeout: force full enable
|
|
*/
|
|
wlog ("%s Loss Of Communication for %d seconds ; disabling host%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
loc_recovery_timeout,
|
|
node_ptr->dor_recovery_mode ? " (DOR)" : "" );
|
|
wlog ("%s ... stopping host services\n", node_ptr->hostname.c_str());
|
|
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
|
|
|
|
/* clear all mtc flags. Will be updated on the next/first
|
|
* mtcAlive message upon recovery */
|
|
node_ptr->mtce_flags = 0 ;
|
|
|
|
/* Set node as unlocked-disabled-failed */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
|
|
/* Inform the VIM that this host has failed */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
|
|
|
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
|
{
|
|
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
}
|
|
/* Clear all degrade flags except for the HWMON one */
|
|
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
|
node_ptr->degraded_resources_list.clear();
|
|
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
if ( mtcSmgrApi_request ( node_ptr, CONTROLLER_DISABLED , SMGR_MAX_RETRIES ) != PASS )
|
|
{
|
|
wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager\n",
|
|
node_ptr->hostname.c_str() );
|
|
}
|
|
}
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_TIMER );
|
|
}
|
|
break ;
|
|
}
|
|
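/* MTC_RECOVERY__MTCALIVE_TIMER
 * ----------------------------
 * Declare the host as booting, start the offline handler and, when not
 * in Dead Office Recovery (DOR) mode, try to expedite recovery with a
 * one time reboot over the infra network and/or a board management
 * reset before arming the mtcAlive wait timer. */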
case MTC_RECOVERY__MTCALIVE_TIMER:
|
|
{
|
|
int timeout = 0 ;
|
|
|
|
/* Set the FSM task state to booting */
|
|
node_ptr->uptime = 0 ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
|
|
|
|
start_offline_handler ( node_ptr );
|
|
|
|
timeout = node_ptr->mtcalive_timeout ;
|
|
|
|
/* Only try and issue in-line recovery reboot or reset if
|
|
* NOT in Dead Office Recovery (DOR) mode. */
|
|
if ( node_ptr->dor_recovery_mode == false )
|
|
{
|
|
/* If the infrastructure network is provisioned then try to
 * issue a reboot over it to expedite recovery for the case
 * where the management heartbeat has failed but the infra
 * heartbeat has not.
 * Keep it simple ; just issue the command once without looping on it */
|
|
if (( node_ptr->infra_ip.length () > 5 ) &&
|
|
( node_ptr->heartbeat_failed[MGMNT_IFACE] == true ) &&
|
|
( node_ptr->heartbeat_failed[INFRA_IFACE] == false ))
|
|
{
|
|
ilog ("%s issuing one time graceful recovery reboot over infra network\n", node_ptr->hostname.c_str());
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_REBOOT, INFRA_INTERFACE ) ;
|
|
}
|
|
|
|
if ((node_ptr->bm_provisioned) && (node_ptr->bm_accessible))
|
|
{
|
|
ilog ("%s issuing one time board management graceful recovery reset\n", node_ptr->hostname.c_str());
|
|
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_RESET );
|
|
if ( rc )
|
|
{
|
|
wlog ("%s board management reset failed\n", node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__RESET_RECV_WAIT );
|
|
break ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s cannot issue Reset\n", node_ptr->hostname.c_str() );
|
|
wlog ("%s ... board management not provisioned or accessible\n", node_ptr->hostname.c_str() );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Just allow Graceful Recovery to take its course. */
|
|
/* Load configured mtcAlive and goEnabled timers */
|
|
LOAD_NODETYPE_TIMERS ;
|
|
|
|
/* extend the mtcAlive timeout to accommodate DOR recovery */
|
|
timeout = node_ptr->mtcalive_timeout + daemon_get_cfg_ptr()->dor_recovery_timeout_ext ;
|
|
}
|
|
|
|
/* start the timer that waits for MTCALIVE */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, timeout );
|
|
|
|
plog ("%s %s (%d secs)%s(uptime was %d) \n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_TASK_RECOVERY_WAIT,
|
|
timeout,
|
|
node_ptr->dor_recovery_mode ? " (DOR) " : " " ,
|
|
node_ptr->uptime_save );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
|
|
break ;
|
|
}
|
|
|
|
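/* MTC_RECOVERY__RESET_RECV_WAIT
 * -----------------------------
 * Collect the result of the board management reset issued above and
 * then arm the mtcAlive wait timer regardless of the outcome. */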
case MTC_RECOVERY__RESET_RECV_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s Reset command failed\n", node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Resetting\n", node_ptr->hostname.c_str());
|
|
}
|
|
|
|
/* start the timer that waits for MTCALIVE */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, node_ptr->mtcalive_timeout );
|
|
|
|
plog ("%s %s (%d secs) (uptime was %d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_TASK_RECOVERY_WAIT,
|
|
node_ptr->mtcalive_timeout,
|
|
node_ptr->uptime_save );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
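/* MTC_RECOVERY__MTCALIVE_WAIT
 * ---------------------------
 * Wait for the mtcAlive message. A host that did not actually reset,
 * or that reports itself unconfigured or unhealthy, is forced through
 * a full enable ; otherwise recovery proceeds to the out-of-service
 * (goEnabled) tests. A mtcAlive timeout also forces a full enable. */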
case MTC_RECOVERY__MTCALIVE_WAIT:
|
|
{
|
|
/* search for the mtc alive message */
|
|
if ( node_ptr->mtcAlive_online == true )
|
|
{
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
|
|
|
/* If the host's uptime is bigger than the saved uptime then the
 * host has not reset. Since services have already been disabled,
 * the host must be reset to prevent VM duplication ; do so by
 * forcing a full enable */
|
|
if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
|
|
{
|
|
ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->uptime );
|
|
ilog ("%s ... uptimes before:%d after:%d\n", node_ptr->hostname.c_str(), node_ptr->uptime_save, node_ptr->uptime );
|
|
ilog ("%s ... exiting graceful recovery\n", node_ptr->hostname.c_str());
|
|
ilog ("%s ... forcing full enable with reset\n", node_ptr->hostname.c_str());
|
|
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
}
|
|
/* Check to see if the host is/got configured */
|
|
else if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 )
|
|
{
|
|
elog ("%s Not Configured (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
|
|
/* Check to see if the host is/got configured correctly */
|
|
else if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) )
|
|
{
|
|
elog ("%s Configuration Failure (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
|
|
/* raise an alarm for the failure of the config */
|
|
alarm_config_failure ( node_ptr );
|
|
force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s regained MTCALIVE from host that has rebooted (uptime curr:%d save:%d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->uptime_save );
|
|
ilog ("%s ... continuing with graceful recovery %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->dor_recovery_mode ? "(DOR)" : " ");
|
|
ilog ("%s ... without additional reboot %s\n",
|
|
node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
|
|
|
|
/* now officially in the In-Test state */
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__INTEST );
|
|
|
|
/* O.K. Clear the alive */
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
/* Go to the goEnabled stage */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
|
}
|
|
break ;
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CLEAR );
|
|
|
|
/* Set the FSM task state to init failed */
|
|
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Failed" );
|
|
|
|
node_ptr->mtcTimer.ring = false ;
|
|
|
|
elog ("%s has MTCALIVE Timeout\n", node_ptr->hostname.c_str());
|
|
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
|
|
break ;
|
|
}
|
|
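/* While waiting, an unlocked host that is powered off with provisioned
 * and accessible board management is powered back on ; the first clause
 * below sends the power-on command and the second collects its result. */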
else if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->bm_provisioned == true ) &&
|
|
( node_ptr->bm_accessible == true ) &&
|
|
( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__INIT ) &&
|
|
( thread_idle ( node_ptr->ipmitool_thread_ctrl )) &&
|
|
( node_ptr->ipmitool_thread_info.command != IPMITOOL_THREAD_CMD__POWER_ON ))
|
|
{
|
|
ilog ("%s powering on unlocked powered off host\n", node_ptr->hostname.c_str());
|
|
if ( ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON ) != PASS )
|
|
{
|
|
node_ptr->ipmitool_thread_ctrl.done = true ;
|
|
thread_kill ( node_ptr->ipmitool_thread_ctrl , node_ptr->ipmitool_thread_info ) ;
|
|
}
|
|
}
|
|
else if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) &&
|
|
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->bm_provisioned == true ) &&
|
|
( node_ptr->bm_accessible == true ) &&
|
|
( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__INIT ) &&
|
|
( thread_done ( node_ptr->ipmitool_thread_ctrl )) &&
|
|
( node_ptr->ipmitool_thread_info.command == IPMITOOL_THREAD_CMD__POWER_ON ))
|
|
{
|
|
if ( ipmi_command_recv ( node_ptr ) == PASS )
|
|
{
|
|
ilog ("%s powered on\n", node_ptr->hostname.c_str());
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
|
}
|
|
}
|
|
else if ( node_ptr->mtcAlive_gate == true )
|
|
{
|
|
slog ("%s mtcAlive gate unexpectedly set, auto-correcting ...\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
node_ptr->mtcAlive_gate = false ;
|
|
}
|
|
|
|
/* wait some more */
|
|
break ;
|
|
}
|
|
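/* MTC_RECOVERY__GOENABLED_TIMER
 * -----------------------------
 * Request the main-function out-of-service (goEnabled) tests and start
 * the goenabled timeout timer before waiting for the result. */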
case MTC_RECOVERY__GOENABLED_TIMER:
|
|
{
|
|
node_ptr->goEnabled = false ;
|
|
|
|
/* See if the host is there and already in the go enabled state */
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE );
|
|
|
|
/* start the goenabled timer ; expiry is caught in the GOENABLED_WAIT stage */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, this->goenabled_timeout );
|
|
|
|
/* ok, timer started */
|
|
ilog ("%s waiting for GOENABLED ; with %d sec timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
this->goenabled_timeout );
|
|
|
|
|
|
/* Default to unknown health */
|
|
node_ptr->health = NODE_HEALTH_UNKNOWN ;
|
|
|
|
/* start waiting for the GOENABLED message */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_WAIT );
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__GOENABLED_WAIT:
|
|
{
|
|
/* The health code comes from the host in the mtcAlive message.
 * This 'if' clause was introduced to detect failure of a host
 * without having to wait for the GOENABLED phase to timeout.
 *
 * This case is particularly important in the DOR case where
 * computes may have come up but failed to run their manifests
 * and sit there in an unconfigured state. We don't want them to
 * be gracefully recovered to enabled in that case. Instead
 * we want to recover the card through a reset as quickly as
 * possible. */
|
|
if ( node_ptr->health == NODE_UNHEALTHY )
|
|
{
|
|
elog ("%s is UNHEALTHY\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled_failed == true )
|
|
{
|
|
elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled == true )
|
|
{
|
|
plog ("%s got GOENABLED (Graceful Recovery)\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* O.K. clearing the state now that we got it */
|
|
node_ptr->goEnabled = false ;
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_START );
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
|
|
|
|
node_ptr->mtcTimer.ring = false ;
|
|
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
break;
|
|
}
|
|
|
|
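/* MTC_RECOVERY__HOST_SERVICES_START / _WAIT
 * -----------------------------------------
 * Launch the 'start host services' command and then poll
 * host_services_handler for the result ; a launch or execution
 * failure forces a full enable. */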
case MTC_RECOVERY__HOST_SERVICES_START:
|
|
{
|
|
bool start = true ;
|
|
|
|
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
|
|
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
|
{
|
|
elog ("%s %s failed ; launch\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
node_ptr->hostservices_failed = true ;
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else
|
|
{
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__HOST_SERVICES_WAIT:
|
|
{
|
|
/* Wait for host services to complete - pass or fail.
|
|
* The host_services_handler manages timeout. */
|
|
rc = this->host_services_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait for the mtcClient's response ... */
|
|
break ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
node_ptr->hostservices_failed = true ;
|
|
if ( rc == FAIL_TIMEOUT )
|
|
{
|
|
elog ("%s %s failed ; timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_TO );
|
|
}
|
|
else
|
|
{
|
|
elog ("%s %s failed ; rc=%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str(),
|
|
rc);
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_FAIL );
|
|
}
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else /* success path */
|
|
{
|
|
/* The active controller would never get/be here but
|
|
* if it did then just fall through to change state. */
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
/* Here we need to run the sub-function goenabled tests and start
 * host services if this is the other controller in an AIO
 * system. */
|
|
if ( NOT_THIS_HOST )
|
|
{
|
|
/* start a timer that waits for the /var/run/.compute_config_complete flag */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_COMPUTE_CONFIG_TIMEOUT );
|
|
|
|
/* We will come back to MTC_RECOVERY__HEARTBEAT_START
|
|
* after we enable the compute subfunction */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__CONFIG_COMPLETE_WAIT );
|
|
}
|
|
else
|
|
{
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
|
|
}
|
|
}
|
|
/* Otherwise in a normal system and not the active controller,
|
|
* just start the heartbeat soak */
|
|
else if ( NOT_THIS_HOST )
|
|
{
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
|
|
}
|
|
else
|
|
{
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
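/* MTC_RECOVERY__CONFIG_COMPLETE_WAIT
 * ----------------------------------
 * AIO only ; wait (up to MTC_COMPUTE_CONFIG_TIMEOUT) for the mtce flag
 * indicating the compute subfunction configuration is complete before
 * running its out-of-service tests. */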
case MTC_RECOVERY__CONFIG_COMPLETE_WAIT:
|
|
{
|
|
/* look for file */
|
|
if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )
|
|
{
|
|
plog ("%s-compute configured\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_GOENABLED_TIMER );
|
|
}
|
|
|
|
/* timeout handling */
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s-compute configuration timeout\n", node_ptr->hostname.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
}
|
|
else
|
|
{
|
|
; /* wait longer */
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__SUBF_GOENABLED_TIMER:
|
|
{
|
|
ilog ("%s-compute running out-of-service tests\n", node_ptr->hostname.c_str());
|
|
|
|
/* See if the host is there and already in the go enabled state */
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_SUBF_GOENABLED, MGMNT_INTERFACE );
|
|
|
|
/* start the goenabled timer ; expiry is caught in the SUBF_GOENABLED_WAIT stage */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, this->goenabled_timeout );
|
|
|
|
node_ptr->goEnabled_subf = false ;
|
|
|
|
/* start waiting for the GOENABLED message */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_GOENABLED_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__SUBF_GOENABLED_WAIT:
|
|
{
|
|
/* search for the Go Enable message */
|
|
if ( node_ptr->goEnabled_failed_subf == true )
|
|
{
|
|
elog ("%s-compute one or more out-of-service tests failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
|
|
/* search for the Go Enable message */
|
|
else if ( node_ptr->goEnabled_subf == true )
|
|
{
|
|
/* stop the timer */
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
plog ("%s-compute passed out-of-service tests\n", node_ptr->hostname.c_str());
|
|
|
|
/* O.K. clearing the state now that we got it */
|
|
node_ptr->goEnabled_subf = false ;
|
|
|
|
/* ok. great, got the go-enabled message, lets move on */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_START );
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s-compute out-of-service test execution timeout\n", node_ptr->hostname.c_str());
|
|
node_ptr->mtcTimer.ring = false ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else
|
|
{
|
|
; /* wait some more */
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RECOVERY__SUBF_SERVICES_START:
|
|
{
|
|
bool start = true ;
|
|
bool subf = true ;
|
|
|
|
plog ("%s-compute Starting Host Services\n", node_ptr->hostname.c_str());
|
|
|
|
if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
|
|
{
|
|
elog ("%s-compute %s failed ; launch\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
node_ptr->hostservices_failed_subf = true ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else
|
|
{
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__SUBF_SERVICES_WAIT:
|
|
{
|
|
/* Wait for host services to complete - pass or fail.
|
|
* The host_services_handler manages timeout. */
|
|
rc = this->host_services_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait for the mtcClient's response ... */
|
|
break ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
node_ptr->hostservices_failed_subf = true ;
|
|
if ( rc == FAIL_TIMEOUT )
|
|
{
|
|
elog ("%s-compute %s failed ; timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_TO );
|
|
}
|
|
else
|
|
{
|
|
elog ("%s-compute %s failed ; rc=%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str(),
|
|
rc);
|
|
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_START_SERVICE_FAIL );
|
|
}
|
|
this->force_full_enable ( node_ptr );
|
|
}
|
|
else /* success path */
|
|
{
|
|
/* allow the fsm to wait for up to 1 minute for the
|
|
* hbsClient's ready event before starting heartbeat
|
|
* test. */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
|
|
}
|
|
break ;
|
|
}
|
|
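/* MTC_RECOVERY__HEARTBEAT_START / _SOAK
 * -------------------------------------
 * Wait up to one minute for the hbsClient ready event, enable
 * heartbeating of the host and then soak it for
 * MTC_HEARTBEAT_SOAK_BEFORE_ENABLE seconds before declaring the
 * host enabled. */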
case MTC_RECOVERY__HEARTBEAT_START:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
wlog ("%s hbsClient ready event timeout\n", node_ptr->hostname.c_str());
|
|
}
|
|
else if ( node_ptr->hbsClient_ready == false )
|
|
{
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
}
|
|
|
|
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
|
|
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
|
|
|
|
/* Enable the heartbeat service for Graceful Recovery */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
|
|
/* allow heartbeat to run for 10 seconds before we declare enable */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
|
|
|
|
/* if heartbeat is not working then we will
|
|
* never get here and enable the host */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_SOAK );
|
|
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__HEARTBEAT_SOAK:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
/* if heartbeat is not working then we will
|
|
* never get here and enable the host */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__STATE_CHANGE:
|
|
{
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
/* Set node as unlocked-enabled */
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__AVAILABLE );
|
|
}
|
|
|
|
if ( node_ptr->degrade_mask )
|
|
{
|
|
/* Allow host to enable in the degraded state */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__DEGRADED );
|
|
}
|
|
else
|
|
{
|
|
/* Set node as unlocked-enabled */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__ENABLED,
|
|
MTC_AVAIL_STATUS__AVAILABLE );
|
|
}
|
|
|
|
/* Only run the hardware monitor if board management is provisioned */
|
|
if ( node_ptr->bm_provisioned == true )
|
|
{
|
|
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
}
|
|
|
|
/* Inform the VIM that this host is enabled */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
|
|
|
|
/* Start a timer that fails the enable if the work queue
 * does not empty or if commands in the done queue have failed */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, work_queue_timeout );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__WORKQUEUE_WAIT ) ;
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__WORKQUEUE_WAIT:
|
|
{
|
|
rc = workQueue_done ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait longer */
|
|
break ;
|
|
}
|
|
else if ( rc == PASS )
|
|
{
|
|
/* Start Graceful Recovery */
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ;
|
|
break ;
|
|
}
|
|
else if ( rc == FAIL_WORKQ_TIMEOUT )
|
|
{
|
|
wlog ("%s Graceful Recovery failed ; workQueue empty timeout, purging ...\n", node_ptr->hostname.c_str());
|
|
workQueue_purge ( node_ptr );
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
wlog ("%s Graceful Recovery failed ; doneQueue contains failed commands\n", node_ptr->hostname.c_str());
|
|
}
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
|
nodeLinkClass::force_full_enable ( node_ptr );
|
|
break ;
|
|
}
|
|
case MTC_RECOVERY__ENABLE_START:
|
|
{
|
|
/* Create the recovery enable timer. This timer is short.
 * A node needs to stay enabled with the heartbeat service
 * running for a period of time before it is declared enabled */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
|
|
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ;
|
|
break;
|
|
}
|
|
case MTC_RECOVERY__ENABLE_WAIT:
|
|
{
|
|
/* When this timer fires the host has been up for enough time */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
if ( mtcSmgrApi_request ( node_ptr,
|
|
CONTROLLER_ENABLED,
|
|
SMGR_MAX_RETRIES ) != PASS )
|
|
{
|
|
wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
/* Node Has Recovered */
|
|
node_ptr->graceful_recovery_counter = 0 ;
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
node_ptr->health_threshold_counter = 0 ;
|
|
node_ptr->enabled_count++ ;
|
|
node_ptr->http_retries_cur = 0 ;
|
|
|
|
doneQueue_purge ( node_ptr );
|
|
if ( node_ptr->was_dor_recovery_mode )
|
|
{
|
|
report_dor_recovery ( node_ptr , "is ENABLED" );
|
|
}
|
|
else
|
|
{
|
|
plog ("%s is ENABLED (Gracefully Recovered)\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
alarm_enabled_clear ( node_ptr, false );
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
rc = FAIL_BAD_CASE ;
|
|
break ;
|
|
}
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/*
 * Start Stop Host Services Handler
 * --------------------------------
 * Waits for the specified host services command to complete.
 *
 * Returns   PASS      - command completed successfully
 *           RETRY     - command still running
 *           FAIL_xxxx - command failure for reason
 *
 */
|
|
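/*
 * Illustrative caller pattern only ; not part of the build. The recovery
 * and disable FSM stages above poll this handler once per FSM pass until
 * it stops returning RETRY :
 *
 *     rc = this->host_services_handler ( node_ptr );
 *     if ( rc == RETRY )
 *     {
 *         break ;                  // still running ; check again next pass
 *     }
 *     else if ( rc != PASS )
 *     {
 *         ... handle FAIL_TIMEOUT or other failure ...
 *     }
 *     else
 *     {
 *         ... proceed to the next stage ...
 *     }
 */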
int nodeLinkClass::host_services_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = FAIL ;
|
|
|
|
if ( node_ptr && ( is_host_services_cmd ( node_ptr->host_services_req.cmd ) == true ))
|
|
{
|
|
/* Handle command overall umbrella timeout */
|
|
if ( mtcTimer_expired ( node_ptr->host_services_timer ) )
|
|
{
|
|
elog ("%s %s timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
/* treat as command failure */
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
rc = FAIL_TIMEOUT ;
|
|
}
|
|
|
|
/* Handle the case where both the done and work fifos are empty
 * even though we are still waiting for a command result */
|
|
else if (( node_ptr->mtcCmd_done_fifo.size() == 0 ) &&
|
|
( node_ptr->mtcCmd_work_fifo.size() == 0 ))
|
|
{
|
|
mtcTimer_reset ( node_ptr->host_services_timer );
|
|
slog ("%s %s command missing\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
rc = FAIL_BAD_STATE ;
|
|
}
|
|
|
|
/* look for 'done' case - pass and failed */
|
|
else if (( node_ptr->mtcCmd_done_fifo.size() != 0 ) &&
|
|
( node_ptr->mtcCmd_work_fifo.size() == 0 ))
|
|
{
|
|
mtcTimer_reset ( node_ptr->host_services_timer );
|
|
if ( node_ptr->host_services_req.status == PASS )
|
|
{
|
|
ilog ("%s %s completed\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
rc = PASS ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s %s ; rc:%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.status_string.c_str(),
|
|
node_ptr->host_services_req.status);
|
|
|
|
rc = FAIL_OPERATION ;
|
|
}
|
|
/* Purge the done command fifo now that we have consumed the result.
|
|
* The work fifo is already empty or we would not be in this case */
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
}
|
|
/* still working ... */
|
|
else
|
|
{
|
|
/* wait longer */
|
|
rc = RETRY ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
slog ("%s invalid host services command (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->cmd.parm1 );
|
|
|
|
rc = FAIL_BAD_PARM ;
|
|
}
|
|
|
|
return (rc);
|
|
}
|
|
|
|
|
|
/* Disable handler
 * ---------------
 * Algorithm that puts a node into the operationally disabled state */
|
|
int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
switch ( (int)node_ptr->handlerStage.disable )
|
|
{
|
|
case MTC_DISABLE__START:
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* Purge this hosts work and done queues */
|
|
workQueue_purge ( node_ptr );
|
|
doneQueue_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
/* clear all the enable failure bools */
|
|
clear_main_failed_bools ( node_ptr );
|
|
clear_subf_failed_bools ( node_ptr );
|
|
clear_hostservices_ctls ( node_ptr );
|
|
|
|
disableStageChange ( node_ptr, MTC_DISABLE__DIS_SERVICES_WAIT) ;
|
|
|
|
stop_offline_handler ( node_ptr );
|
|
|
|
if (( node_ptr->bm_provisioned == true ) &&
|
|
( node_ptr->bm_accessible == true ) &&
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ))
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
|
|
if ( rc )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_POWERON_SEND) ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_POWERON_RECV) ;
|
|
}
|
|
|
|
if ( rc == PASS )
|
|
{
|
|
ilog ("%s Power On request sent\n", node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
|
|
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK )
|
|
{
|
|
mtc_nodeAvailStatus_enum locked_status = MTC_AVAIL_STATUS__OFFLINE ;
|
|
plog ("%s Administrative 'force-lock' Operation\n", node_ptr->hostname.c_str());
|
|
|
|
/* If the host was in-service then set its locked state as ONLINE for now.
 * Otherwise it is defaulted to offline */
|
|
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__INTEST ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED ))
|
|
{
|
|
locked_status = MTC_AVAIL_STATUS__ONLINE ;
|
|
}
|
|
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
locked_status );
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
|
locked_status );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
plog ("%s Administrative 'lock' Operation\n", node_ptr->hostname.c_str());
|
|
}
|
|
|
|
/* reset retries counter in prep for next stage */
|
|
node_ptr->retries = 0 ;
|
|
node_ptr->http_retries_cur = 0 ;
|
|
node_ptr->pmond_ready = false ;
|
|
|
|
/* Clear all degrade flags except for the HWMON one */
|
|
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
|
node_ptr->degraded_resources_list.clear();
|
|
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_DISABLE_CONTROL );
|
|
}
|
|
// else
|
|
// {
|
|
// consider putting in the host type
|
|
// }
|
|
|
|
if ( NOT_THIS_HOST )
|
|
{
|
|
/* Disable path for Controllers */
|
|
if ( is_controller(node_ptr) )
|
|
{
|
|
if ( mtcSmgrApi_request ( node_ptr,
|
|
CONTROLLER_LOCKED,
|
|
SMGR_MAX_RETRIES ) != PASS )
|
|
{
|
|
wlog ("%s Failed to send 'locked-disabled' to HA Service Manager\n",
|
|
node_ptr->hostname.c_str() );
|
|
}
|
|
}
|
|
|
|
/* Clear the minor flag if it is set for this host */
|
|
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
|
{
|
|
hbs_minor_clear ( node_ptr, (iface_enum)iface );
|
|
}
|
|
|
|
/* Turn off Heartbeat to that host */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
}
|
|
|
|
/* If the stage is still MTC_DISABLE__DIS_SERVICES_WAIT then the
|
|
* host should already be powered on so lets send the stop
|
|
* services command */
|
|
if ( node_ptr->handlerStage.disable == MTC_DISABLE__DIS_SERVICES_WAIT )
|
|
{
|
|
bool start = false ;
|
|
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
|
{
|
|
wlog ("%s %s failed ; launch\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
|
|
/* proceed to handle force lock if the launch fails */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__DIS_SERVICES_WAIT:
|
|
{
|
|
/* manage host services stop command to this target */
|
|
rc = this->host_services_handler ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
break ;
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
if ( rc == FAIL_TIMEOUT )
|
|
{
|
|
wlog ("%s %s failed ; timeout\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str());
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s %s failed ; rc:%d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->host_services_req.name.c_str(),
|
|
rc);
|
|
}
|
|
}
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK) ;
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__HANDLE_POWERON_SEND:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
|
|
if ( rc )
|
|
{
|
|
elog ("%s failed to send Power On request\n", node_ptr->hostname.c_str());
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK) ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_POWERON_RECV) ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__HANDLE_POWERON_RECV:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
if ( rc )
|
|
{
|
|
elog ("%s auto power-on failed\n", node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Powering On\n", node_ptr->hostname.c_str());
|
|
}
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK) ;
|
|
}
|
|
break ;
|
|
}
|
|
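/* MTC_DISABLE__HANDLE_FORCE_LOCK
 * ------------------------------
 * A force-lock requires the host to be reset ; queue a reset
 * progression command (with retries) and wait for it, or for the
 * host to go offline, in MTC_DISABLE__RESET_HOST_WAIT. */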
case MTC_DISABLE__HANDLE_FORCE_LOCK:
|
|
{
|
|
/* If this is a force lock against a compute then we have to reset it */
|
|
if (( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK ))
|
|
{
|
|
/* Stop the timer if it is active coming into this case */
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* purge in support of retries */
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
|
|
ilog ("%s Issuing Force-Lock Reset\n", node_ptr->hostname.c_str());
|
|
mtcCmd_init ( node_ptr->cmd );
|
|
node_ptr->cmd.stage = MTC_CMD_STAGE__START ;
|
|
node_ptr->cmd.cmd = MTC_OPER__RESET_PROGRESSION ;
|
|
node_ptr->cmd.parm1 = 2 ; /* 2 retries */
|
|
node_ptr->mtcCmd_work_fifo.push_back(node_ptr->cmd);
|
|
|
|
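/* Overall reset progression wait :
 * MTC_RESET_PROG_TIMEOUT x (retries + 1), doubled for margin ;
 * e.g. with the 2 retries requested above this is 6 x MTC_RESET_PROG_TIMEOUT */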
int timeout = ((MTC_RESET_PROG_TIMEOUT*(node_ptr->cmd.parm1+1))*2) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, timeout ) ;
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_DISABLE_FORCE );
|
|
|
|
/* Force instance evacuation */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__RESET_HOST_WAIT );
|
|
}
|
|
else
|
|
{
|
|
disableStageChange ( node_ptr, MTC_DISABLE__TASK_STATE_UPDATE ) ;
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__RESET_HOST_WAIT:
|
|
{
|
|
/* Check for the operation timeout - should not occur */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
wlog ("%s Reset Progression Timeout ; aborting ...\n", node_ptr->hostname.c_str());
|
|
|
|
/* Purge this hosts work and done queues */
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
|
|
/* aborting after timeout ; need to avoid a stuck FSM
|
|
* reset progression already did retries */
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_TIMEOUT );
|
|
|
|
disableStageChange ( node_ptr, MTC_DISABLE__TASK_STATE_UPDATE );
|
|
}
|
|
|
|
/* Handle the case where the done fifo is empty ; avoid the segfault */
|
|
else if ( node_ptr->mtcCmd_done_fifo.size() == 0 )
|
|
{
|
|
/* Should never get here but .....
|
|
* Handle the case where the work queue is also empty.
|
|
* Avoid stuck FSM */
|
|
if ( node_ptr->mtcCmd_work_fifo.size() == 0 )
|
|
{
|
|
slog ("%s unexpected empty work queue ; trying reboot/reset again\n",
|
|
node_ptr->hostname.c_str() );
|
|
|
|
/* reset progression failed so try again */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
else
|
|
{
|
|
; /* typical wait path - wait some more */
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* TODO: Future: get the specific command rather than just the head */
|
|
node_ptr->mtcCmd_done_fifo_ptr = node_ptr->mtcCmd_done_fifo.begin();
|
|
|
|
/* defensive programming */
|
|
if ( node_ptr->mtcCmd_done_fifo_ptr != node_ptr->mtcCmd_done_fifo.end())
|
|
{
|
|
/* exit reset progression and any retries once the host is offline */
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
|
{
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
stop_offline_handler ( node_ptr );
|
|
disableStageChange ( node_ptr, MTC_DISABLE__TASK_STATE_UPDATE ) ;
|
|
}
|
|
else if ( node_ptr->mtcCmd_done_fifo_ptr->cmd != MTC_OPER__RESET_PROGRESSION )
|
|
{
|
|
slog ("%s purging front entry of done cmdQueue\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
/* reset progression failed so try again */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s host still not offline ; trying reboot/reset again ....\n", node_ptr->hostname.c_str() );
|
|
|
|
/* reset progression failed so try again */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
slog ("%s unexpected empty work queue ; trying force lock\n", node_ptr->hostname.c_str() );
|
|
|
|
/* reset progression failed so try again */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_DISABLE__TASK_STATE_UPDATE:
|
|
{
|
|
mtc_nodeAvailStatus_enum avail ;
|
|
|
|
/* Tell the host that it is locked */
|
|
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE );
|
|
if ( infra_network_provisioned )
|
|
{
|
|
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, INFRA_INTERFACE );
|
|
}
|
|
|
|
/* Change the oper and avail states in the database */
|
|
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ))
|
|
{
|
|
avail = MTC_AVAIL_STATUS__OFFLINE ;
|
|
}
|
|
else
|
|
{
|
|
avail = MTC_AVAIL_STATUS__ONLINE ;
|
|
}
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__LOCKED, MTC_OPER_STATE__DISABLED, avail );
|
|
mtcInvApi_subf_states (node_ptr,"disabled",get_availStatus_str(avail));
|
|
|
|
/* Inform the VIM that this host is disabled */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_DISABLED, 3 );
|
|
|
|
/* Inform the VIM that the dataports are offline */
|
|
update_dport_states (node_ptr, MTC_EVENT_AVS_OFFLINE );
|
|
mtcVimApi_state_change ( node_ptr, VIM_DPORT_OFFLINE, 3 );
|
|
|
|
/* Start a timer that waits for the work queue to complete */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, work_queue_timeout );
|
|
disableStageChange( node_ptr, MTC_DISABLE__WORKQUEUE_WAIT );
|
|
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__WORKQUEUE_WAIT:
|
|
{
|
|
rc = workQueue_done ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
/* wait longer */
|
|
break ;
|
|
}
|
|
else if ( rc == FAIL_WORKQ_TIMEOUT )
|
|
{
|
|
wlog ("%s Disable warning ; workQueue empty timeout, purging ...\n", node_ptr->hostname.c_str());
|
|
workQueue_purge ( node_ptr );
|
|
}
|
|
else if ( rc != PASS )
|
|
{
|
|
wlog ("%s Disable warning ; doneQueue contained failed commands\n", node_ptr->hostname.c_str());
|
|
}
|
|
disableStageChange( node_ptr, MTC_DISABLE__DISABLED );
|
|
break ;
|
|
}
|
|
case MTC_DISABLE__DISABLED:
|
|
{
|
|
/* Stop the timer if it is active coming into this case */
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* This will get updated during the next
 * mtcAlive message from this blade */
|
|
node_ptr->health = NODE_HEALTH_UNKNOWN ;
|
|
|
|
/* Set the lock alarm */
|
|
if (( node_ptr->adminAction == MTC_ADMIN_ACTION__LOCK ) ||
|
|
( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK ))
|
|
{
|
|
mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
|
|
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
|
|
}
|
|
|
|
/* open the mtcAlive gate while we are disabled */
|
|
node_ptr->mtcAlive_gate = false ;
|
|
|
|
disableStageChange( node_ptr, MTC_DISABLE__START );
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
node_ptr->mtcCmd_work_fifo.clear();
|
|
node_ptr->mtcCmd_done_fifo.clear();
|
|
node_ptr->http_retries_cur = 0 ;
|
|
|
|
/***** Powercycle FSM Stuff *****/
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
|
|
|
|
/* Load configured mtcAlive and goEnabled timers */
|
|
LOAD_NODETYPE_TIMERS ;
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
|
|
plog ("%s Disable Complete\n", node_ptr->hostname.c_str());
|
|
|
|
break ;
|
|
}
|
|
|
|
default:
|
|
{
|
|
elog ("%s Bad Case (%d)\n", node_ptr->hostname.c_str(),
|
|
node_ptr->handlerStage.disable );
|
|
rc = FAIL_BAD_CASE ;
|
|
}
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/* Uptime handler
 * -------------- */
|
|
int nodeLinkClass::uptime_handler ( void )
|
|
{
|
|
/* Service uptime refresh timer */
|
|
if ( this->mtcTimer_uptime.ring == true )
|
|
{
|
|
int rc = PASS ;
|
|
unsigned int uptime = 0;
|
|
|
|
/* Send uptime values to inventory */
|
|
for ( this->host = this->hostname_inventory.begin () ;
|
|
this->host != this->hostname_inventory.end () ;
|
|
this->host++ )
|
|
{
|
|
bool do_uptime_update = false ;
|
|
string hostname = "" ;
|
|
|
|
hostname.append( this->host->c_str()) ;
|
|
|
|
/* only update every 5 minutes after being up for an hour */
|
|
uptime = this->get_uptime ( hostname ) ;
|
|
if ( uptime < 3600 )
|
|
{
|
|
do_uptime_update = true ;
|
|
}
|
|
else
|
|
{
|
|
int ctr = this->get_uptime_refresh_ctr ( hostname );
|
|
|
|
/* Update uptime only every 5 minutes after the
|
|
* host has been up for more than one hour */
|
|
if (( uptime > 3600 ) && ( (ctr*(this->uptime_period)) >= MTC_MINS_5 ))
|
|
{
|
|
do_uptime_update = true ;
|
|
}
|
|
else
|
|
{
|
|
this->set_uptime_refresh_ctr ( hostname , (ctr+1) ) ;
|
|
}
|
|
}
|
|
/* Handle update if required */
|
|
if (( rc != PASS ) && ( do_uptime_update == true ))
|
|
{
|
|
wlog ("%s Uptime refresh bypassed due to previous error\n", hostname.c_str());
|
|
}
|
|
else if (( do_uptime_update == true ) || ( uptime == 0 ))
|
|
{
|
|
/* Send the uptime update request.
 * But exit this iteration if we get an error as we
 * don't want to stall mtce for all hosts on such a
 * simple operation */
|
|
|
|
// ilog ("%s - %d\n", hostname.c_str(), uptime );
|
|
if ( uptime == 0 )
|
|
{
|
|
this->set_uptime ( hostname, uptime , false ) ;
|
|
}
|
|
else
|
|
{
|
|
this->set_uptime ( hostname, uptime , true ) ;
|
|
}
|
|
}
|
|
}
|
|
/* Re-Start the uptime timer */
|
|
mtcTimer_start ( this->mtcTimer_uptime, mtcTimer_handler,
|
|
(this->uptime_period+(rand()%10)));
|
|
}
|
|
return PASS ;
|
|
}
|
|
|
|
/* Offline handler
 * ---------------
 * Algorithm that manages the offline/online state for a locked host */
|
|
int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
switch ( (int)node_ptr->offlineStage )
|
|
{
|
|
case MTC_OFFLINE__IDLE:
|
|
{
|
|
return (PASS) ; /* typical path */
|
|
}
|
|
case MTC_OFFLINE__START:
|
|
{
|
|
node_ptr->mtcAlive_mgmnt = false ;
|
|
node_ptr->mtcAlive_infra = false ;
|
|
|
|
node_ptr->offline_search_count = 0 ;
|
|
|
|
mtcTimer_reset ( node_ptr->offline_timer );
|
|
ilog ("%s starting %d msec offline audit (%s-%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
offline_period,
|
|
operState_enum_to_str(node_ptr->operState).c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus).c_str());
|
|
|
|
node_ptr->offlineStage = MTC_OFFLINE__SEND_MTCALIVE ;
|
|
/* fall through on start */
|
|
}
|
|
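/* MTC_OFFLINE__SEND_MTCALIVE / __WAIT
 * -----------------------------------
 * Request mtcAlive over the management (and infra, if provisioned)
 * network and then wait 'offline_period' msecs ; after
 * 'offline_threshold' consecutive misses the host is declared
 * offline and the VIM is notified. */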
case MTC_OFFLINE__SEND_MTCALIVE:
|
|
{
|
|
alog2 ("%s searching for offline (%s-%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
operState_enum_to_str(node_ptr->operState).c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus).c_str());
|
|
|
|
node_ptr->mtcAlive_gate = false ;
|
|
node_ptr->mtcAlive_mgmnt = false ;
|
|
node_ptr->mtcAlive_infra = false ;
|
|
|
|
/* Request a mtcAlive from host from Mgmnt and Infra (if provisioned) */
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
|
|
if ( infra_network_provisioned )
|
|
{
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, INFRA_INTERFACE );
|
|
}
|
|
|
|
/* reload the timer */
|
|
mtcTimer_start_msec ( node_ptr->offline_timer, mtcTimer_handler, offline_period );
|
|
|
|
node_ptr->offlineStage = MTC_OFFLINE__WAIT ;
|
|
|
|
break ;
|
|
}
|
|
case MTC_OFFLINE__WAIT:
|
|
{
|
|
/* be sure the mtcAlive gate is open */
|
|
node_ptr->mtcAlive_gate = false ;
|
|
if ( mtcTimer_expired ( node_ptr->offline_timer ) == true )
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
|
{
|
|
plog ("%s offline (external)\n", node_ptr->hostname.c_str());
|
|
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
|
}
|
|
else if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
|
|
{
|
|
slog ("%s offline search while 'enabled' ; invalid\n", node_ptr->hostname.c_str());
|
|
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
|
}
|
|
else if ( !node_ptr->mtcAlive_mgmnt && !node_ptr->mtcAlive_infra )
|
|
{
|
|
if ( ++node_ptr->offline_search_count > offline_threshold )
|
|
{
|
|
node_ptr->mtcAlive_online = false ;
|
|
|
|
plog ("%s going offline ; (threshold (%d msec * %d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
offline_period,
|
|
offline_threshold );
|
|
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
|
|
|
/* Inform the VIM that this host is offline */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_OFFLINE, 1 );
|
|
|
|
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
|
}
|
|
else
|
|
{
|
|
alog ("%s missed mtcAlive %d of %d times\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->offline_search_count,
|
|
offline_threshold );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
node_ptr->mtcAlive_online = true ;
|
|
if ( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_infra )
|
|
{
|
|
ilog ("%s still seeing mtcAlive (%c:%c)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
|
|
node_ptr->mtcAlive_infra ? 'Y' : 'n');
|
|
}
|
|
else
|
|
{
|
|
alog ("%s still seeing mtcAlive (%c:%c)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
|
|
node_ptr->mtcAlive_infra ? 'Y' : 'n');
|
|
}
|
|
}
|
|
|
|
if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE )
|
|
{
|
|
ilog ("%s exiting offline handling\n", node_ptr->hostname.c_str());
|
|
}
|
|
else
|
|
{
|
|
node_ptr->offlineStage = MTC_OFFLINE__SEND_MTCALIVE ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
slog ("%s unexpected stage ; correcting to idle\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
/* Online handler
 * --------------
 * Algorithm that manages the offline/online state for a locked host */
|
|
int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
/* don't need to manage the offline or online state
|
|
* for the following availability states */
|
|
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFDUTY ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__INTEST ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__NOT_INSTALLED ))
|
|
{
|
|
return (PASS);
|
|
}
|
|
|
|
switch ( (int)node_ptr->onlineStage )
|
|
{
|
|
case MTC_ONLINE__START:
|
|
{
|
|
alog3 ("%s Offline Handler (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->onlineStage );
|
|
|
|
if ( node_ptr->mtcAlive_gate == true )
|
|
{
|
|
alog ("%s mtcAlive gate unexpectedly set, correcting ...\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
node_ptr->mtcAlive_gate = false ;
|
|
}
|
|
|
|
/* Start with a zero count. This counter is incremented every
|
|
* time we get a mtc alive message from that host */
|
|
node_ptr->mtcAlive_online = false ;
|
|
node_ptr->mtcAlive_misses = 0 ;
|
|
|
|
/* Start mtcAlive message timer */
|
|
mtcTimer_start ( node_ptr->mtcAlive_timer, mtcTimer_handler, online_period );
|
|
node_ptr->onlineStage = MTC_ONLINE__WAITING ;
|
|
break ;
|
|
}
|
|
case MTC_ONLINE__RETRYING:
|
|
{
|
|
/* Start mtcAlive message timer */
|
|
mtcTimer_start ( node_ptr->mtcAlive_timer, mtcTimer_handler, online_period );
|
|
node_ptr->onlineStage = MTC_ONLINE__WAITING ;
|
|
break ;
|
|
}
|
|
case MTC_ONLINE__WAITING:
|
|
{
|
|
if ( node_ptr->mtcAlive_timer.ring == false )
|
|
break ;
|
|
|
|
alog ("%s mtcAlive [%s] [ misses:%d]\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->mtcAlive_online ? "Yes" : "No",
|
|
node_ptr->mtcAlive_misses );
|
|
|
|
if ( node_ptr->mtcAlive_online == false )
|
|
{
|
|
node_ptr->mtcAlive_hits = 0 ;
|
|
if ( node_ptr->mtcAlive_misses++ > MTC_OFFLINE_MISSES )
|
|
{
|
|
/* If not already offline or powered-off and there are no counts then the node is not up - go offline */
|
|
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__OFFLINE ) &&
|
|
( node_ptr->availStatus != MTC_AVAIL_STATUS__POWERED_OFF ))
|
|
{
|
|
ilog ("%s mtcAlive lost ; going 'offline'\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
/* otherwise change state */
|
|
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL,"offline" );
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL_SUBF,"offline" );
|
|
}
|
|
|
|
/* Inform the VIM that this host is offline */
|
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_OFFLINE, 1 );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* handle retries < MTC_OFFLINE_MISSES */
|
|
node_ptr->mtcAlive_timer.ring = false ;
|
|
node_ptr->onlineStage = MTC_ONLINE__RETRYING ;
|
|
break ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
bool gate_online = false ;
|
|
|
|
/* if we are getting counts then the node is up so change status */
|
|
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__ONLINE )
|
|
{
|
|
node_ptr->mtcAlive_hits++ ;
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF )
|
|
{
|
|
/* need 5 mtcAlive messages before we allow a powered-off host to go online */
|
|
if ( node_ptr->mtcAlive_hits < MTC_MTCALIVE_HITS_TO_GO_ONLINE )
|
|
{
|
|
gate_online = true ;
|
|
dlog ("%s ... %d\n", node_ptr->hostname.c_str(), node_ptr->mtcAlive_hits );
|
|
}
|
|
}
|
|
|
|
if ( gate_online == false )
|
|
{
|
|
ilog ("%s mtcAlive ; going 'online'\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL, "online" );
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL_SUBF, "online" );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* While the host is locked ... */
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
|
{
|
|
/* ... keep the 'host locked' file on this host refreshed while in the locked state
|
|
* ... send it on both interfaces just in case */
|
|
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE );
|
|
// send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, INFRA_INTERFACE );
|
|
}
|
|
|
|
/* Start over */
|
|
node_ptr->mtcAlive_timer.ring = false ;
|
|
node_ptr->onlineStage = MTC_ONLINE__START ;
|
|
break ;
|
|
}
|
|
default:
|
|
node_ptr->onlineStage = MTC_ONLINE__START ;
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
|
|
/* Controller Swact Handler
 * ------------------------
 * Using a REST API into the HA Service Manager through Inventory, this
 * handler is responsible for querying for active services on the specified
 * controller and then, if services are found to be running, requesting
 * migration of those active services away from that controller */
|
|
|
|
#define SWACT_DONE                                                                            \
{                                                                                             \
    if ( node_ptr->mtcSwact_timer.tid )                                                       \
    {                                                                                         \
        mtcTimer_stop ( node_ptr->mtcSwact_timer );                                           \
    }                                                                                         \
    mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );     \
    node_ptr->swactStage = MTC_SWACT__DONE ;                                                  \
}
|
|
|
|
#define SWACT_FAIL_THRESHOLD  (3)
#define SWACT_RETRY_THRESHOLD (10)
#define SWACT_FAIL_MSEC_DELAY (250)
#define SWACT_RECV_MSEC_DELAY (50)
#define SWACT_POLL_DELAY      (10)
#define SWACT_TIMEOUT_DELAY   (50)
|
|
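/* swact_handler FSM summary :
 *   START         - post the 'Swact: Request' task and reset counters
 *   QUERY / _RECV - ask SM whether this controller is running active services
 *   SWACT / _RECV - request migration of those services away from it
 *   SWACT_POLL    - re-query until the services report inactive or timeout
 *   DONE          - clear the task and the admin action */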
|
|
int nodeLinkClass::swact_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == true )
|
|
{
|
|
slog ("%s rejecting Swact request in simplex mode\n", node_ptr->hostname.c_str());
|
|
node_ptr->swactStage = MTC_SWACT__START ;
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
return (PASS);
|
|
}
|
|
switch ( (int)node_ptr->swactStage )
|
|
{
|
|
/* Start / Init Stage */
|
|
case MTC_SWACT__START:
|
|
{
|
|
plog ("%s Administrative SWACT Requested\n", node_ptr->hostname.c_str() );
|
|
|
|
/* Cleanup and init the swact timer - start fresh */
|
|
if ( node_ptr->mtcSwact_timer.tid )
|
|
{
|
|
wlog ("%s Cancelling outstanding Swact timer\n", node_ptr->hostname.c_str());
|
|
mtcTimer_stop ( node_ptr->mtcSwact_timer );
|
|
}
|
|
mtcTimer_init ( node_ptr->mtcSwact_timer );
|
|
|
|
/* reset error / control Counters to zero */
|
|
nodeLinkClass::smgrEvent.count = 0 ;
|
|
nodeLinkClass::smgrEvent.fails = 0 ;
|
|
nodeLinkClass::smgrEvent.cur_retries = 0 ;
|
|
|
|
/* Empty the event message strings */
|
|
nodeLinkClass::smgrEvent.payload = "" ;
|
|
nodeLinkClass::smgrEvent.response = "" ;
|
|
|
|
/* Post a user message 'Swact: Request' and
|
|
* then delay to allow it to be displayed */
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_SWACT_REQUEST );
|
|
mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, (MTC_TASK_UPDATE_DELAY/2) );
|
|
node_ptr->swactStage = MTC_SWACT__QUERY ;
|
|
break ;
|
|
}
|
|
|
|
/* Handle and threshold all Query Failures */
|
|
case MTC_SWACT__QUERY_FAIL:
|
|
{
|
|
if ( ++nodeLinkClass::smgrEvent.fails >= SWACT_FAIL_THRESHOLD )
|
|
{
|
|
wlog ("%s Query Services Failed: Max Retries (max:%d)\n",
|
|
node_ptr->hostname.c_str(), nodeLinkClass::smgrEvent.fails);
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAIL_QUERY);
|
|
SWACT_DONE ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s Query Services: Retrying (cnt:%d)\n",
|
|
node_ptr->hostname.c_str(), nodeLinkClass::smgrEvent.fails);
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_FAIL_MSEC_DELAY );
|
|
node_ptr->swactStage = MTC_SWACT__QUERY ;
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* Query Services on this host */
|
|
case MTC_SWACT__QUERY:
|
|
{
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
|
|
rc = mtcSmgrApi_request ( node_ptr, CONTROLLER_QUERY, 0 );
|
|
if ( rc )
|
|
{
|
|
nodeLinkClass::smgrEvent.status = rc ;
|
|
node_ptr->swactStage = MTC_SWACT__QUERY_FAIL ;
|
|
}
|
|
else
|
|
{
|
|
/* Ok, we got a successful send request ;
|
|
* delay a bit and check for the response */
|
|
nodeLinkClass::smgrEvent.cur_retries = 0 ;
|
|
nodeLinkClass::smgrEvent.fails = 0 ;
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY );
|
|
node_ptr->swactStage = MTC_SWACT__QUERY_RECV ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_SWACT__QUERY_RECV:
|
|
{
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
|
|
/* Try and receive the response */
|
|
rc = mtcHttpUtil_receive ( nodeLinkClass::smgrEvent );
|
|
if ( rc == RETRY )
|
|
{
|
|
if ( ++nodeLinkClass::smgrEvent.cur_retries > SWACT_RETRY_THRESHOLD )
|
|
{
|
|
wlog ("%s Too many receive retries (cnt:%d)\n",
|
|
node_ptr->hostname.c_str(), nodeLinkClass::smgrEvent.cur_retries );
|
|
rc = FAIL ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY );
|
|
break ;
|
|
}
|
|
}
|
|
if (( rc != PASS ) && ( rc != RETRY ))
|
|
{
|
|
elog ("%s Service Query Failed: Receive Error (rc:%d)\n",
|
|
node_ptr->hostname.c_str(), rc );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAILED);
|
|
SWACT_DONE ;
|
|
}
|
|
else
|
|
{
|
|
/* Parse through the response - no retries on response string errors */
|
|
bool active = false ;
|
|
rc = mtcSmgrApi_service_state ( nodeLinkClass::smgrEvent, active );
|
|
if ( rc )
|
|
{
|
|
/* Setup common error message for the user*/
|
|
ilog ("%s Swact: Service Query Failed\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAILED);
|
|
SWACT_DONE ;
|
|
}
|
|
else if ( active == true )
|
|
{
|
|
/* O.K. We need to Swact */
|
|
nodeLinkClass::smgrEvent.fails = 0 ;
|
|
nodeLinkClass::smgrEvent.cur_retries = 0 ;
|
|
node_ptr->swactStage = MTC_SWACT__SWACT ;
|
|
|
|
/* Tell the user what we are doing */
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_SWACT_INPROGRESS );
|
|
}
|
|
else
|
|
{
|
|
/* If not true then somehow we are being asked to
|
|
* Swact a controller that is not running any services */
|
|
ilog ("%s %s\n", node_ptr->hostname.c_str(), MTC_TASK_SWACT_NOSERVICE);
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_NOSERVICE);
|
|
SWACT_DONE ;
|
|
}
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* Phase 2: Perform Swact */
|
|
case MTC_SWACT__SWACT:
|
|
{
|
|
rc = mtcSmgrApi_request ( node_ptr, CONTROLLER_SWACT, 0 );
|
|
if ( rc )
|
|
{
|
|
/* Abort after SWACT_FAIL_THRESHOLD retries - verified */
|
|
if ( ++nodeLinkClass::smgrEvent.fails >= SWACT_FAIL_THRESHOLD )
|
|
{
|
|
elog ( "%s Swact: Failed Request (rc:%d) (max:%d)\n",
|
|
node_ptr->hostname.c_str(), rc,
|
|
nodeLinkClass::smgrEvent.fails);
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAILED );
|
|
SWACT_DONE ;
|
|
}
|
|
else
|
|
{
|
|
elog ( "%s Swact: Retrying Request (rc:%d) (cnt:%d)\n",
|
|
node_ptr->hostname.c_str(), rc,
|
|
nodeLinkClass::smgrEvent.fails);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
plog ("%s Swact: In Progress\n", node_ptr->hostname.c_str());
|
|
nodeLinkClass::smgrEvent.status = PASS ;
|
|
nodeLinkClass::smgrEvent.fails = 0 ;
|
|
nodeLinkClass::smgrEvent.cur_retries = 0 ;
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY );
|
|
node_ptr->swactStage = MTC_SWACT__SWACT_RECV ;
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_SWACT__SWACT_RECV:
|
|
{
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
|
|
/* Try and receive the response */
|
|
rc = mtcHttpUtil_receive ( nodeLinkClass::smgrEvent );
|
|
if ( rc == RETRY )
|
|
{
|
|
if ( ++nodeLinkClass::smgrEvent.cur_retries > SWACT_RETRY_THRESHOLD )
|
|
{
|
|
wlog ("%s Too many receive retries (cnt:%d)\n",
|
|
node_ptr->hostname.c_str(), nodeLinkClass::smgrEvent.cur_retries );
|
|
rc = FAIL ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY );
|
|
break ;
|
|
}
|
|
}
|
|
if (( rc != PASS ) && ( rc != RETRY ))
|
|
{
|
|
elog ("%s Swact Failed: Receive Error (rc:%d)\n",
|
|
node_ptr->hostname.c_str(), rc );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_FAILED);
|
|
SWACT_DONE ;
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, MTC_SWACT_POLL_TIMER );
|
|
mtcSmgrApi_request ( node_ptr, CONTROLLER_QUERY, 0 );
|
|
node_ptr->swactStage = MTC_SWACT__SWACT_POLL ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_SWACT__SWACT_POLL:
|
|
{
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
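                /* Each expiry of the poll timer counts as one query attempt ;
                 * the overall swact timeout is expressed as the number of
                 * MTC_SWACT_POLL_TIMER polls that fit into swact_timeout. */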
|
|
if (++nodeLinkClass::smgrEvent.count >=
|
|
(nodeLinkClass::swact_timeout/MTC_SWACT_POLL_TIMER))
|
|
{
|
|
elog ("%s Swact Failed: Timeout\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_TIMEOUT);
|
|
SWACT_DONE ;
|
|
}
|
|
rc = mtcHttpUtil_receive ( smgrEvent );
|
|
if ( rc != RETRY )
|
|
{
|
|
bool active = true ;
|
|
mtcSmgrApi_service_state ( smgrEvent, active );
|
|
if ( active == false )
|
|
{
|
|
dlog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
|
|
SWACT_DONE ;
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
mtcSmgrApi_request ( node_ptr, CONTROLLER_QUERY, 0 );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
plog ("%s Swact: In-Progress\n", node_ptr->hostname.c_str());
|
|
}
|
|
mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, MTC_SWACT_POLL_TIMER );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_SWACT__DONE:
|
|
{
|
|
/* Wait for the done timer to expire.
|
|
* When it does ; exit the SWACT FSM after clearing
|
|
* the task and setting it back to the start. */
|
|
if ( node_ptr->mtcSwact_timer.ring == true )
|
|
{
|
|
mtcInvApi_force_task ( node_ptr, "");
|
|
nodeLinkClass::smgrEvent.active = false ;
|
|
nodeLinkClass::smgrEvent.mutex = false ;
|
|
node_ptr->mtcSwact_timer.ring = false ;
|
|
node_ptr->swactStage = MTC_SWACT__START ;
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
if ( smgrEvent.status )
|
|
{
|
|
wlog ("%s Swact: Failed\n", node_ptr->hostname.c_str());
|
|
|
|
}
|
|
else
|
|
{
|
|
plog ("%s Swact: Completed\n", node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
default:
|
|
node_ptr->swactStage = MTC_SWACT__START ;
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/* Reset Handler
|
|
* ------------
|
|
* Issue a reset to a host */
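/*
 * Stage flow (as implemented in the cases below):
 *
 *   START -> REQ_SEND -> RESP_WAIT -> OFFLINE_WAIT -> DONE
 *
 * Failed sends or receives pass through MTC_RESET__QUEUE, which waits
 * MTC_POWER_ACTION_RETRY_DELAY seconds between attempts and gives up
 * through MTC_RESET__FAIL once the MTC_RESET_ACTION_RETRY_COUNT retry
 * budget is exhausted. */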
|
|
int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
switch ( node_ptr->resetStage )
|
|
{
|
|
case MTC_RESET__FAIL:
|
|
{
|
|
elog ("%s Reset failed ; aborting after max retries\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_FAIL);
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL_WAIT );
|
|
break ;
|
|
}
|
|
case MTC_RESET__FAIL_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
resetStageChange ( node_ptr , MTC_RESET__DONE );
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
mtcTimer_reset ( node_ptr->hwmon_reset.recovery_timer );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RESET__START:
|
|
{
|
|
plog ("%s Administrative 'Reset' Action\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, "Reset Requested" );
|
|
node_ptr->retries = 0 ;
|
|
|
|
start_offline_handler ( node_ptr );
|
|
|
|
if ( hostUtil_is_valid_ip_addr (node_ptr->bm_ip ) == false )
|
|
{
|
|
/**
|
|
* New working provisioning is learned from the
|
|
* dnsmasq.bmc_hosts file changes through inotify watch so
|
|
* it is entirely possible that the retries in this fsm
|
|
* eventually succeed.
|
|
**/
|
|
wlog ("%s bm_ip (%s) is invalid (%d) \n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_ip.c_str(),
|
|
rc );
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL );
|
|
break ;
|
|
}
|
|
node_ptr->power_action_retries = MTC_RESET_ACTION_RETRY_COUNT ;
|
|
/* the fall through is intentional */
|
|
}
|
|
case MTC_RESET__REQ_SEND:
|
|
{
|
|
node_ptr->power_action_retries--;
|
|
|
|
/* Handle loss of connectivity over retries */
|
|
if ( node_ptr->bm_provisioned == false )
|
|
{
|
|
elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str() );
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV );
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL );
|
|
break ;
|
|
}
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds \n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWER_ACTION_RETRY_DELAY);
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
resetStageChange ( node_ptr , MTC_RESET__QUEUE );
|
|
break ;
|
|
}
|
|
|
|
else
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_RESET );
|
|
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
|
resetStageChange ( node_ptr , MTC_RESET__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
blog ("%s Reset requested\n", node_ptr->hostname.c_str());
|
|
resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT );
|
|
}
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RESET__RESP_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
resetStageChange ( node_ptr, MTC_RESET__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Resetting\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, "Resetting: waiting for offline" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
|
|
resetStageChange ( node_ptr, MTC_RESET__OFFLINE_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RESET__QUEUE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
if ( node_ptr->power_action_retries > 0 )
|
|
{
|
|
char buffer[64] ;
|
|
int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
|
snprintf ( buffer, 64, MTC_TASK_RESET_QUEUE, attempts, MTC_RESET_ACTION_RETRY_COUNT);
|
|
mtcInvApi_update_task ( node_ptr, buffer);
|
|
|
|
/* check the thread error status if there is one */
|
|
if ( node_ptr->ipmitool_thread_info.status )
|
|
{
|
|
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.status_string.c_str(),
|
|
node_ptr->ipmitool_thread_info.status );
|
|
}
|
|
|
|
resetStageChange ( node_ptr , MTC_RESET__REQ_SEND );
|
|
}
|
|
else
|
|
{
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RESET__OFFLINE_WAIT:
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
|
{
|
|
if (node_ptr->mtcTimer.tid)
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
plog ("%s Reset Successful\n", node_ptr->hostname.c_str());
|
|
resetStageChange ( node_ptr , MTC_RESET__DONE );
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s Reset operation timeout - host did not go offline\n", node_ptr->hostname.c_str());
|
|
resetStageChange ( node_ptr , MTC_RESET__FAIL );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_RESET__DONE:
|
|
default:
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
hwmon_recovery_monitor ( node_ptr, MTC_EVENT_HWMON_RESET );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
plog ("%s Reset Completed\n", node_ptr->hostname.c_str());
|
|
break ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
/* Reinstall handler
|
|
* --------------
|
|
* Manage reinstall operations for a locked-disabled host */
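/*
 * Stage flow (as implemented in the cases below):
 *
 *   START (send MTC_CMD_WIPEDISK) -> RESP_WAIT (ack) ->
 *   OFFLINE_WAIT -> ONLINE_WAIT -> MSG_DISPLAY -> DONE
 *
 * An ack, offline or online timeout takes the MTC_REINSTALL__FAIL path,
 * which reports the failure before cleaning up through MSG_DISPLAY/DONE. */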
|
|
int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
switch ( node_ptr->reinstallStage )
|
|
{
|
|
case MTC_REINSTALL__START:
|
|
{
|
|
int host_reinstall_wait_timer = node_ptr->mtcalive_timeout + node_reinstall_timeout ;
|
|
node_ptr->retries = host_reinstall_wait_timer / MTC_REINSTALL_WAIT_TIMER ;
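            /* retries is the number of MTC_REINSTALL_WAIT_TIMER polls that
             * cover the mtcalive plus node reinstall timeout ; it is counted
             * down in the ONLINE_WAIT stage below. */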
|
|
|
|
start_offline_handler ( node_ptr );
|
|
|
|
node_ptr->cmdReq = MTC_CMD_WIPEDISK ;
|
|
|
|
plog ("%s Administrative Reinstall Requested\n", node_ptr->hostname.c_str());
|
|
if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) != PASS )
|
|
{
|
|
elog ("Failed to send 'reinstall' request to %s\n", node_ptr->hostname.c_str());
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
|
}
|
|
else
|
|
{
|
|
node_ptr->cmdRsp = MTC_CMD_NONE ;
|
|
|
|
if ( node_ptr->mtcTimer.tid )
|
|
{
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
}
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );
|
|
|
|
ilog ("%s waiting for REINSTALL ACK \n", node_ptr->hostname.c_str() );
|
|
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__RESP_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__RESP_WAIT:
|
|
{
|
|
if ( node_ptr->cmdRsp != MTC_CMD_WIPEDISK )
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s REINSTALL ACK Timeout\n",
|
|
node_ptr->hostname.c_str());
|
|
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* declare successful reinstall request */
|
|
plog ("%s REINSTALL Request Succeeded\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
/* We need to wait for the host to go offline */
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
|
|
|
|
/* Wait for the host to go offline */
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__OFFLINE_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__OFFLINE_WAIT:
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
|
|
{
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
ilog ("%s Reinstall Progress: host is offline ; waiting for host to come back\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__ONLINE_WAIT );
|
|
}
|
|
else if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
elog ("%s offline timeout - reinstall failed\n", node_ptr->hostname.c_str());
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__ONLINE_WAIT:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS);
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE );
|
|
}
|
|
else
|
|
{
|
|
if ( --node_ptr->retries < 0 )
|
|
{
|
|
elog ("%s online timeout - reinstall failed\n", node_ptr->hostname.c_str());
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case MTC_REINSTALL__FAIL:
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL);
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED );
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__MSG_DISPLAY:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
reinstallStageChange ( node_ptr , MTC_REINSTALL__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_REINSTALL__DONE:
|
|
default:
|
|
{
|
|
plog ("%s Reinstall Completed\n", node_ptr->hostname.c_str());
|
|
|
|
/* Default timeout values */
|
|
LOAD_NODETYPE_TIMERS ;
|
|
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
break ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
/* Reboot handler
|
|
* --------------
|
|
* Manage reboot operations for a disabled host */
|
|
int nodeLinkClass::reboot_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
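    /* For the active controller (THIS_HOST) a lazy reboot is issued with a
     * forked sysreq reboot as a failsafe backstop ; for all other hosts a
     * reset progression command is queued on the mtcCmd work fifo and this
     * fsm waits for it to complete or for the overall timeout calculated
     * by calc_reset_prog_timeout(). */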
|
|
// ilog ("%s Administrative 'reboot' Action (%d)\n", node_ptr->hostname.c_str(), node_ptr->resetProgStage );
|
|
|
|
switch ( node_ptr->resetProgStage )
|
|
{
|
|
case MTC_RESETPROG__START:
|
|
{
|
|
plog ("%s Administrative Reboot Requested\n", node_ptr->hostname.c_str() );
|
|
|
|
/* start with a clean command slate */
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_PROG );
|
|
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REBOOT )
|
|
{
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_REBOOT );
|
|
}
|
|
node_ptr->retries = 0 ;
|
|
|
|
/* If this is a simplex all-in-one system then issue the lazy reboot and just wait */
|
|
if ( THIS_HOST )
|
|
{
|
|
mtcInvApi_update_task_now ( node_ptr, "Please stand-by while the active controller gracefully reboots" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_2 ) ;
|
|
node_ptr->resetProgStage = MTC_RESETPROG__WAIT ;
|
|
|
|
/* Launch a backup sysreq thread */
|
|
fork_sysreq_reboot ( daemon_get_cfg_ptr()->failsafe_shutdown_delay );
|
|
|
|
/* Tell SM we are unhealthy so that it shuts down all its services */
|
|
daemon_log ( SMGMT_UNHEALTHY_FILE, "Active Controller Reboot request" );
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, MGMNT_INTERFACE ) ;
|
|
}
|
|
else
|
|
{
|
|
node_ptr->resetProgStage = MTC_RESETPROG__REBOOT ;
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RESETPROG__REBOOT:
|
|
{
|
|
#define REBOOT_RETRIES (0)
|
|
node_ptr->mtcCmd_work_fifo.clear();
|
|
mtcCmd_init ( node_ptr->cmd );
|
|
node_ptr->cmd.stage = MTC_CMD_STAGE__START ;
|
|
node_ptr->cmd.cmd = MTC_OPER__RESET_PROGRESSION ;
|
|
node_ptr->cmd.parm1 = REBOOT_RETRIES ; /* retries */
|
|
node_ptr->cmd.task = false ; /* send task updates */
|
|
node_ptr->mtcCmd_work_fifo.push_front(node_ptr->cmd);
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
/* calculate the overall timeout period taking into account
|
|
* all the reboot/reset sources that will be tried */
|
|
int overall_timeout = calc_reset_prog_timeout ( node_ptr, REBOOT_RETRIES ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, overall_timeout ) ;
|
|
node_ptr->resetProgStage = MTC_RESETPROG__WAIT ;
|
|
|
|
break ;
|
|
}
|
|
case MTC_RESETPROG__WAIT:
|
|
{
|
|
/* Look for the command handler FSM timeout and abort in that case */
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
ilog ("%s reboot (progression) timeout\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ) ;
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_REBOOT_ABORT );
|
|
node_ptr->resetProgStage = MTC_RESETPROG__FAIL ;
|
|
}
|
|
else if ( THIS_HOST )
|
|
{
|
|
; /* wait for the reboot or FSM timeout */
|
|
}
|
|
else if ( node_ptr->mtcCmd_work_fifo.empty())
|
|
{
|
|
slog ("%s unexpected empty cmd queue\n", node_ptr->hostname.c_str());
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ) ;
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_REBOOT_ABORT );
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
node_ptr->resetProgStage = MTC_RESETPROG__FAIL ;
|
|
}
|
|
else
|
|
{
|
|
node_ptr->mtcCmd_work_fifo_ptr = node_ptr->mtcCmd_work_fifo.begin() ;
|
|
if ( node_ptr->mtcCmd_work_fifo_ptr->stage == MTC_CMD_STAGE__DONE )
|
|
{
|
|
if ( node_ptr->mtcTimer.tid )
|
|
mtcTimer_stop ( node_ptr->mtcTimer );
|
|
|
|
if ( node_ptr->mtcCmd_work_fifo_ptr->status == PASS )
|
|
{
|
|
plog ("%s Reboot Completed\n", node_ptr->hostname.c_str() );
|
|
node_ptr->mtcTimer.ring = true ;
|
|
node_ptr->resetProgStage = MTC_RESETPROG__FAIL ; /* not really a failure ; reuse its cleanup path */
|
|
}
|
|
else if ( ++node_ptr->retries <= 5 )
|
|
{
|
|
char buffer[255] ;
|
|
snprintf ( buffer, 255, MTC_TASK_REBOOT_FAIL_RETRY, node_ptr->retries, 5 );
|
|
wlog ("%s %s\n", node_ptr->hostname.c_str(), buffer );
|
|
mtcInvApi_update_task ( node_ptr, buffer );
|
|
if ( node_ptr->mtcCmd_done_fifo.size() )
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
node_ptr->resetProgStage = MTC_RESETPROG__REBOOT ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s %s\n", node_ptr->hostname.c_str(), MTC_TASK_REBOOT_ABORT );
|
|
if ( node_ptr->mtcCmd_done_fifo.size() )
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_REBOOT_ABORT );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ) ;
|
|
node_ptr->resetProgStage = MTC_RESETPROG__FAIL ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_RESETPROG__FAIL:
|
|
{
|
|
if ( node_ptr->mtcTimer.ring == true )
|
|
{
|
|
if ( !node_ptr->mtcCmd_work_fifo.empty() )
|
|
node_ptr->mtcCmd_work_fifo.pop_front();
|
|
if ( !node_ptr->mtcCmd_work_fifo.empty() )
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
|
|
if ( !node_ptr->mtcCmd_done_fifo.empty() )
|
|
node_ptr->mtcCmd_done_fifo.pop_front();
|
|
if ( !node_ptr->mtcCmd_done_fifo.empty() )
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
node_ptr->resetProgStage = MTC_RESETPROG__START ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
slog ("%s unsupported reboot stage (%d) ; clearing action\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->resetProgStage );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
}
|
|
}
|
|
|
|
return (PASS);
|
|
}
|
|
|
|
/* Power Handler
|
|
* ----------------- */
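/*
 * Implements the administrative Power-Off and Power-On actions over the
 * BMC using ipmitool send/recv command pairs. Both directions share the
 * same pattern : REQ_SEND -> RESP_WAIT with a QUEUE stage spacing out up
 * to MTC_POWER_ACTION_RETRY_COUNT retries before declaring failure.
 * Power-On additionally queries the current power state first and skips
 * the power-on request if the host is already powered on. */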
|
|
int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
switch ( node_ptr->powerStage )
|
|
{
|
|
case MTC_POWEROFF__FAIL:
|
|
{
|
|
elog ("%s Power-Off failed ; aborting after max retries\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWEROFF_FAIL);
|
|
mtcTimer_reset ( node_ptr->mtcTimer ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL_WAIT );
|
|
break ;
|
|
}
|
|
case MTC_POWEROFF__FAIL_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWEROFF__START:
|
|
{
|
|
plog ("%s Administrative 'Power-Off' Action\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_force_task ( node_ptr, "Power-Off Requested" );
|
|
|
|
start_offline_handler ( node_ptr );
|
|
|
|
if ( hostUtil_is_valid_ip_addr (node_ptr->bm_ip ) == false )
|
|
{
|
|
/**
|
|
* New working provisioning is learned from the
|
|
* dnsmasq.bmc_hosts file changes through inotify watch so
|
|
* it is entirely possible that the retries in this fsm
|
|
* eventually succeed.
|
|
**/
|
|
wlog ("%s bm_ip (%s) is invalid (%d) \n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_ip.c_str(),
|
|
rc );
|
|
}
|
|
else
|
|
{
|
|
; // send_hwmon_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
}
|
|
|
|
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
|
//the fall through to MTC_POWEROFF__REQ_SEND is intentional
|
|
}
|
|
case MTC_POWEROFF__REQ_SEND:
|
|
{
|
|
node_ptr->power_action_retries--;
|
|
|
|
/* Handle loss of connectivity over retries */
|
|
if ( node_ptr->bm_provisioned == false )
|
|
{
|
|
elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
|
break ;
|
|
}
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWER_ACTION_RETRY_DELAY);
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
|
break ;
|
|
}
|
|
|
|
else
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_OFF );
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
blog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
|
|
}
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWEROFF__RESP_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Powering Off\n", node_ptr->hostname.c_str() );
|
|
mtcInvApi_update_task ( node_ptr, "Powering Off" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__DONE );
|
|
node_ptr->power_on = false ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWEROFF__QUEUE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
if ( node_ptr->power_action_retries > 0 )
|
|
{
|
|
char buffer[255] ;
|
|
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
|
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
|
|
mtcInvApi_update_task ( node_ptr, buffer);
|
|
|
|
/* check the thread error status if there is one */
|
|
if ( node_ptr->ipmitool_thread_info.status )
|
|
{
|
|
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.status_string.c_str(),
|
|
node_ptr->ipmitool_thread_info.status );
|
|
}
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
|
|
}
|
|
else
|
|
{
|
|
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWEROFF__DONE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
plog ("%s Power-Off Completed\n", node_ptr->hostname.c_str());
|
|
|
|
stop_offline_handler ( node_ptr );
|
|
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
|
|
|
|
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* ----------------------- */
|
|
/* POWER ON Group of Cases */
|
|
/* ----------------------- */
|
|
|
|
case MTC_POWERON__FAIL:
|
|
{
|
|
elog ("%s Power-On failed ; aborting after max retries\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERON_FAIL);
|
|
mtcTimer_reset ( node_ptr->mtcTimer ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__FAIL_WAIT );
|
|
break ;
|
|
}
|
|
case MTC_POWERON__FAIL_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__START:
|
|
{
|
|
plog ("%s Administrative 'Power-On' Action\n", node_ptr->hostname.c_str());
|
|
mtcInvApi_update_task ( node_ptr, "Power-On Requested" );
|
|
|
|
if ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip ) == false )
|
|
{
|
|
/**
|
|
* New working provisioning is learned from the
|
|
* dnsmasq.bmc_hosts file changes through inotify watch so
|
|
* it is entirely possible that the retries in this fsm
|
|
* eventually succeed.
|
|
**/
|
|
wlog ("%s bm_ip (%s) is invalid (%d) \n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_ip.c_str(),
|
|
rc );
|
|
}
|
|
|
|
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
|
|
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS );
|
|
//the fall through to MTC_POWERON__POWER_STATUS is intentional
|
|
}
|
|
case MTC_POWERON__POWER_STATUS:
|
|
{
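            /* Query the current power state before requesting power-on ;
             * if the response already reports 'on' then the DONE stage is
             * entered directly and no power-on command is sent. */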
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s Power On request rejected ; BMC not accessible ; retry in %d seconds\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWER_ACTION_RETRY_DELAY);
|
|
|
|
node_ptr->power_action_retries-- ;
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
break ;
|
|
}
|
|
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_STATUS ) ;
|
|
if ( rc )
|
|
{
|
|
node_ptr->power_action_retries-- ;
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS_WAIT );
|
|
}
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
break ;
|
|
}
|
|
case MTC_POWERON__POWER_STATUS_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
}
|
|
else if ( rc == PASS )
|
|
{
|
|
if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos )
|
|
{
|
|
ilog ("%s power is already on ; no action required\n", node_ptr->hostname.c_str());
|
|
node_ptr->power_on = true ;
|
|
mtcInvApi_update_task ( node_ptr, "Power Already On" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__DONE );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s power is off ; powering on ...\n", node_ptr->hostname.c_str() );
|
|
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__REQ_SEND:
|
|
{
|
|
node_ptr->power_action_retries--;
|
|
|
|
/* Ensure that mtce is updated with the latest board
|
|
* management ip address for this host */
|
|
if ( node_ptr->bm_provisioned == false )
|
|
{
|
|
elog ("%s BMC not provisioned or accessible (%d:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->bm_provisioned,
|
|
node_ptr->bm_accessible );
|
|
|
|
powerStageChange ( node_ptr , MTC_POWERON__FAIL );
|
|
break ;
|
|
}
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n",
|
|
node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY);
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Power-On request failed (%d)\n",
|
|
node_ptr->hostname.c_str(), rc );
|
|
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
blog ("%s Power-On requested\n", node_ptr->hostname.c_str());
|
|
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
|
|
powerStageChange ( node_ptr , MTC_POWERON__RESP_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__RESP_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s Power-On command failed\n", node_ptr->hostname.c_str());
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s is Powering On\n", node_ptr->hostname.c_str() );
|
|
mtcInvApi_update_task ( node_ptr, "Powering On" );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
|
|
powerStageChange ( node_ptr , MTC_POWERON__DONE );
|
|
node_ptr->power_on = true ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__QUEUE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
node_ptr->mtcTimer.ring = false ;
|
|
if ( node_ptr->power_action_retries > 0 )
|
|
{
|
|
char buffer[64] ;
|
|
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
|
|
snprintf ( buffer, 64, MTC_TASK_POWERON_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
|
|
mtcInvApi_update_task ( node_ptr, buffer);
|
|
|
|
/* check the thread error status if there is one */
|
|
if ( node_ptr->ipmitool_thread_info.status )
|
|
{
|
|
wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.status_string.c_str(),
|
|
node_ptr->ipmitool_thread_info.status );
|
|
}
|
|
|
|
powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS );
|
|
}
|
|
else
|
|
{
|
|
powerStageChange ( node_ptr , MTC_POWERON__FAIL );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERON__DONE:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
plog ("%s Power-On Completed\n", node_ptr->hostname.c_str());
|
|
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
|
|
|
// send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
|
|
powerStageChange ( node_ptr , MTC_POWER__DONE );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWER__DONE:
|
|
default:
|
|
{
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
|
|
recovery_ctrl_init ( node_ptr->hwmon_reset );
|
|
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
|
|
|
|
mtcInvApi_force_task ( node_ptr, "" );
|
|
break ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
|
|
/* Power Cycle Handler
|
|
* ------------------- */
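/*
 * Handles the hwmon driven 'powercycle' recovery action. Each attempt
 * fails the host (if unlocked), powers it off, waits out a cool-off
 * period, powers it back on, verifies the power state and then holds
 * off before declaring the attempt complete. If the attempt count
 * exceeds MAX_POWERCYCLE_ATTEMPT_RETRIES the host is left powered down
 * and recovery requires manual action. */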
|
|
int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
wlog ("%s 'powercycle' abort ; not accessible to BMC\n", node_ptr->hostname.c_str() );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
|
|
/* Manage max retries */
|
|
if ( node_ptr->hwmon_powercycle.retries >= MAX_POWERCYCLE_STAGE_RETRIES )
|
|
{
|
|
wlog ("%s 'powercycle' abort ; max retries reached\n", node_ptr->hostname.c_str() );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
|
|
/* Manage max retries */
|
|
if ( node_ptr->hwmon_powercycle.queries >= MAX_POWERCYCLE_QUERY_RETRIES )
|
|
{
|
|
wlog ("%s power state query retries exceeded ; failing current iteration\n", node_ptr->hostname.c_str());
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
|
|
switch ( node_ptr->powercycleStage )
|
|
{
|
|
case MTC_POWERCYCLE__FAIL:
|
|
{
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
|
|
wlog ("%s entering 'powercycle' failed stage ATTEMPT: %d\n",
|
|
node_ptr->hostname.c_str() ,
|
|
node_ptr->hwmon_powercycle.attempts );
|
|
|
|
/* Note: hwmon will continue to send powercycle requests to restart once it is accessible */
|
|
|
|
/* TODO: RELEASE NOTE: Node may be left in the disabled state
|
|
* - need to track power state and raise logs or alarms if host is stuck in power off state.
|
|
* - The ipmitool update does add tracking of the power state but does not introduce the alarm */
|
|
|
|
// send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
|
|
|
/* Let the next event perform another power-cycle retry */
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__DONE );
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_FAIL , node_ptr->hwmon_powercycle.attempts );
|
|
|
|
hwmon_recovery_monitor ( node_ptr, MTC_EVENT_HWMON_POWERCYCLE );
|
|
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__START:
|
|
{
|
|
switch ( node_ptr->subStage )
|
|
{
|
|
case MTC_SUBSTAGE__START:
|
|
{
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
|
|
{
|
|
ilog ("%s failing host for powercycle\n", node_ptr->hostname.c_str() );
|
|
alarm_enabled_failure ( node_ptr );
|
|
|
|
/* Set node as unlocked-disabled-failed */
|
|
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
|
MTC_OPER_STATE__DISABLED,
|
|
MTC_AVAIL_STATUS__FAILED );
|
|
}
|
|
ilog ("%s is %s-%s-%s\n", node_ptr->hostname.c_str(),
|
|
get_adminState_str (node_ptr->adminState).c_str(),
|
|
get_operState_str (node_ptr->operState).c_str(),
|
|
get_availStatus_str(node_ptr->availStatus).c_str());
|
|
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__ACTION ;
|
|
|
|
node_ptr->hwmon_powercycle.attempts++ ;
|
|
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
|
|
/***********************************************************************************
|
|
*
|
|
* Permanent Power-Down Case
|
|
* -------------------------
|
|
* If we exceed the maximum power cycle attempt retries then we
|
|
* give up and power the unit down and leave it that way.
|
|
*
|
|
***********************************************************************************/
|
|
if ( node_ptr->hwmon_powercycle.attempts > MAX_POWERCYCLE_ATTEMPT_RETRIES )
|
|
{
|
|
ilog ("%s -------------------------------------------------------------\n", node_ptr->hostname.c_str());
|
|
wlog ("%s critical event is persistent ; too many failed attempts (%d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->hwmon_powercycle.attempts );
|
|
ilog ("%s -------------------------------------------------------------\n", node_ptr->hostname.c_str());
|
|
|
|
/* terminate any in progress work, likely auto recovery if unlocked, for this host */
|
|
mtcTimer_reset ( node_ptr->mtcCmd_timer );
|
|
mtcCmd_workQ_purge ( node_ptr );
|
|
mtcCmd_doneQ_purge ( node_ptr );
|
|
|
|
// node_ptr->powercycle_completed = true ;
|
|
node_ptr->hwmon_powercycle.retries = 0 ;
|
|
node_ptr->hwmon_powercycle.queries = 0 ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_DOWN, node_ptr->hwmon_powercycle.attempts );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s starting 'powercycle' recovery ATTEMPT: %d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.attempts );
|
|
|
|
// send_hwmon_command ( node_ptr->hostname, MTC_CMD_STOP_HOST);
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_HOST, node_ptr->hwmon_powercycle.attempts );
|
|
|
|
node_ptr->hwmon_powercycle.retries = 0 ; /* remove for back to back power cycles */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, 1 );
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__SEND );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* Query current power state */
|
|
case MTC_SUBSTAGE__SEND:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
int delay = MTC_IPMITOOL_REQUEST_DELAY ;
|
|
ilog ("%s querying current power state\n", node_ptr->hostname.c_str());
|
|
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_STATUS );
|
|
if ( rc )
|
|
{
|
|
node_ptr->hwmon_powercycle.retries++ ;
|
|
wlog ("%s failed to send 'power state query' ; retrying %d of %d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.retries,
|
|
MAX_POWERCYCLE_STAGE_RETRIES );
|
|
|
|
node_ptr->hwmon_powercycle.queries++ ;
|
|
|
|
/* Retry the send */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
}
|
|
else
|
|
{
|
|
node_ptr->hwmon_powercycle.queries = 0 ;
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__RECV );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, delay );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
/* Interpret current power state query */
|
|
case MTC_SUBSTAGE__RECV:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
node_ptr->hwmon_powercycle.retries++ ;
|
|
elog ("%s 'power query' command failed ; retrying %d or %d\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.retries,
|
|
MAX_POWERCYCLE_STAGE_RETRIES );
|
|
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__SEND );
|
|
}
|
|
else
|
|
{
|
|
bool on = false ;
|
|
|
|
ilog ("%s Power Status: %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.data.c_str());
|
|
|
|
if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos )
|
|
{
|
|
on = true ;
|
|
}
|
|
if ( rc == PASS )
|
|
{
|
|
/* maintain current power state */
|
|
node_ptr->power_on = on ;
|
|
|
|
if ( on == true )
|
|
{
|
|
ilog ("%s invoking 'powerdown' phase\n", node_ptr->hostname.c_str());
|
|
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__DONE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s is already powered-off ; starting powercycle with power-on\n", node_ptr->hostname.c_str() );
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__DONE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* TODO: use FAIL handler */
|
|
node_ptr->hwmon_powercycle.retries = MAX_POWERCYCLE_STAGE_RETRIES+1 ;
|
|
// powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
elog ("%s failed to query power status ; aborting powercycle action\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
slog ("%s %s.%s stage\n", node_ptr->hostname.c_str(),
|
|
get_powercycleStages_str(node_ptr->powercycleStage).c_str(),
|
|
get_subStages_str(node_ptr->subStage).c_str());
|
|
|
|
subStageChange ( node_ptr, MTC_SUBSTAGE__DONE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__START );
|
|
break ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWERCYCLE__POWEROFF:
|
|
{
|
|
int delay = MTC_IPMITOOL_REQUEST_DELAY ;
|
|
|
|
/* Stop heartbeat if we are powering off the host */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
|
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_OFF );
|
|
if ( rc )
|
|
{
|
|
elog ("%s failed to send power-off command to BMC (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
rc );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, delay );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF_CMND_WAIT );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWEROFF_CMND_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
elog ("%s power-off command failed (rc:%d:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
rc , node_ptr->ipmitool_thread_info.status);
|
|
|
|
if ( node_ptr->ipmitool_thread_info.status )
|
|
{
|
|
wlog ("%s ... %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.status_string.c_str());
|
|
}
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s waiting up to %d seconds for 'offline'\n", node_ptr->hostname.c_str(), MTC_POWEROFF_TO_OFFLINE_TIMEOUT );
|
|
|
|
/* Set the power-off timeout */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer,
|
|
mtcTimer_handler,
|
|
MTC_POWEROFF_TO_OFFLINE_TIMEOUT );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWEROFF_WAIT:
|
|
{
|
|
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) ||
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ))
|
|
{
|
|
/* since the host is powered down lets reflect that in the database */
|
|
node_ptr->uptime = 0 ;
|
|
mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );
|
|
|
|
clear_service_readies ( node_ptr );
|
|
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
|
|
if ( node_ptr->hwmon_powercycle.attempts > MAX_POWERCYCLE_ATTEMPT_RETRIES )
|
|
{
|
|
wlog ("%s -------------------------------------------------------------------\n",
|
|
node_ptr->hostname.c_str() );
|
|
wlog ("%s ... Leaving server POWERED DOWN to protect hardware from damage ...\n",
|
|
node_ptr->hostname.c_str() );
|
|
wlog ("%s -------------------------------------------------------------------\n",
|
|
node_ptr->hostname.c_str() );
|
|
|
|
/* Cancelling the recovery timer prevents auto-recovery.
|
|
* Recovery must be through manual actions. */
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.recovery_timer );
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__BLOCKED ;
|
|
|
|
/* Block Auto-Recovery Path
|
|
* ------------------------
|
|
* If we have reached the max retries and are unlocked then
|
|
* leave the powercycle action active so that the enable
|
|
* and graceful recovery handlers don't recover this host.
|
|
* -------------------------
|
|
* Manual action is required to recover a host that has
|
|
* exceeded the maximum powercycle retries */
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
|
{
|
|
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
|
}
|
|
|
|
/* While the node_ptr->hwmon_powercycle.control_timer is
|
|
* inactive the MTC_POWERCYCLE__DONE stage is a NOOP
|
|
* thereby keeping us doing nothing till the next manual
|
|
* action */
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__DONE );
|
|
}
|
|
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF )
|
|
{
|
|
ilog ("%s already powered-off, skipping cool-off\n", node_ptr->hostname.c_str());
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, 10 );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s waiting %d seconds before power-on ; cool down time\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWERCYCLE_COOLDOWN_DELAY );
|
|
|
|
node_ptr->hwmon_powercycle.holdoff = MTC_POWERCYCLE_COOLDOWN_DELAY/60 ;
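                /* holdoff is tracked in minutes ; the COOLOFF stage counts
                 * it down one MTC_MINS_1 tick at a time before moving on
                 * to power-on. */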
|
|
|
|
/* Set the power-off timeout */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, COMMAND_DELAY );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__COOLOFF );
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__COOLOFF ;
|
|
}
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
|
|
}
|
|
|
|
/* handle timeout case */
|
|
else if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
/* TODO: manage the retry count */
|
|
elog ("%s timeout waiting for 'offline' state ; retrying ...\n", node_ptr->hostname.c_str() );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWEROFF );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWERCYCLE__COOLOFF:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_POWERCYCLE_COOL,
|
|
node_ptr->hwmon_powercycle.attempts,
|
|
node_ptr->hwmon_powercycle.holdoff);
|
|
ilog ("%s Power-Cycle cool-off (%d minutes remaining)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.holdoff );
|
|
|
|
if ( node_ptr->hwmon_powercycle.holdoff > 1 )
|
|
{
|
|
node_ptr->hwmon_powercycle.holdoff-- ;
|
|
}
|
|
else
|
|
{
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON );
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__ACTION ;
|
|
}
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_MINS_1 );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
case MTC_POWERCYCLE__POWERON:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
int delay = MTC_IPMITOOL_REQUEST_DELAY ;
|
|
clog ("%s %s stage\n", node_ptr->hostname.c_str(),
|
|
get_powercycleStages_str(node_ptr->powercycleStage).c_str());
|
|
|
|
if ( node_ptr->bm_accessible == false )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer,
|
|
mtcTimer_handler,
|
|
MTC_POWERCYCLE_COOLDOWN_DELAY );
|
|
|
|
wlog ("%s not accessible ; waiting another %d seconds before power-on\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWERCYCLE_COOLDOWN_DELAY );
|
|
}
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
|
|
if ( rc )
|
|
{
|
|
elog ("%s failed to send power-on command to BMC (%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
rc );
|
|
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s Power-On requested\n", node_ptr->hostname.c_str() );
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_ON, node_ptr->hwmon_powercycle.attempts );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, delay );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON_CMND_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWERON_CMND_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Power-On request failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s Power-On response: %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.data.c_str() );
|
|
|
|
/* Give the power on request time to execute */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );
|
|
|
|
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON_VERIFY );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWERON_VERIFY:
|
|
{
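            /* Re-query the BMC power state to confirm the host actually
             * powered on before waiting for it to come online. */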
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_STATUS );
|
|
if ( rc )
|
|
{
|
|
wlog ("%s Power-On command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s power status query requested\n", node_ptr->hostname.c_str() );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_IPMITOOL_REQUEST_DELAY );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON_VERIFY_WAIT );
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWERON_VERIFY_WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->hwmon_powercycle.control_timer ) )
|
|
{
|
|
bool on = false ;
|
|
|
|
rc = ipmi_command_recv ( node_ptr );
|
|
if ( rc == RETRY )
|
|
{
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_RETRY_WAIT );
|
|
break ;
|
|
}
|
|
if ( rc == PASS )
|
|
{
|
|
if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos )
|
|
{
|
|
on = true ;
|
|
}
|
|
}
|
|
|
|
ilog ("%s power state query result: %s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ipmitool_thread_info.data.c_str() );
|
|
|
|
if (( rc == PASS ) && ( on == true ))
|
|
{
|
|
node_ptr->power_on = true ;
|
|
ilog ("%s is Powered On - waiting for 'online' (%d sec timeout)\n",
|
|
node_ptr->hostname.c_str(),
|
|
MTC_POWERON_TO_ONLINE_TIMEOUT);
|
|
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_BOOT, node_ptr->hwmon_powercycle.attempts );
|
|
|
|
/* Set the online timeout */
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_POWERON_TO_ONLINE_TIMEOUT );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON_WAIT );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s Power-On failed or did not occur ; retrying (rc:%d:%d)\n", node_ptr->hostname.c_str(), rc, on );
|
|
node_ptr->power_on = false ;
|
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_POWERCYCLE_RETRY, node_ptr->hwmon_powercycle.attempts );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_BM_POWERON_TIMEOUT );
|
|
node_ptr->hwmon_powercycle.queries++ ;
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__POWERON );
|
|
break ;
|
|
}
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__POWERON_WAIT:
|
|
{
|
|
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )
|
|
{
|
|
ilog ("%s online (after powercycle)\n", node_ptr->hostname.c_str());
|
|
|
|
node_ptr->hwmon_powercycle.holdoff = MTC_POWERCYCLE_BACK2BACK_DELAY/60 ;
|
|
|
|
mtcTimer_reset ( node_ptr->hwmon_powercycle.control_timer );
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, 1 );
|
|
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__HOLDOFF ;
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__HOLDOFF );
|
|
}
|
|
else if ( node_ptr->hwmon_powercycle.control_timer.ring == true )
|
|
{
|
|
elog ("%s timeout waiting for 'online' state\n", node_ptr->hostname.c_str() );
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__FAIL );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__HOLDOFF:
|
|
{
|
|
if ( node_ptr->hwmon_powercycle.control_timer.ring == true )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr,
|
|
MTC_TASK_POWERCYCLE_HOLD,
|
|
node_ptr->hwmon_powercycle.attempts,
|
|
node_ptr->hwmon_powercycle.holdoff);
|
|
ilog ("%s Power-Cycle hold-off (%d minutes remaining) (uptime:%d)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->hwmon_powercycle.holdoff,
|
|
node_ptr->uptime );
|
|
|
|
if ( node_ptr->hwmon_powercycle.holdoff > 1 )
|
|
{
|
|
node_ptr->hwmon_powercycle.holdoff--;
|
|
}
|
|
else
|
|
{
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__DONE );
|
|
}
|
|
mtcTimer_start ( node_ptr->hwmon_powercycle.control_timer, mtcTimer_handler, MTC_MINS_1 );
|
|
}
|
|
break ;
|
|
}
|
|
case MTC_POWERCYCLE__DONE:
|
|
{
|
|
if ( node_ptr->hwmon_powercycle.control_timer.ring == true )
|
|
{
|
|
mtcInvApi_update_task ( node_ptr, "" );
|
|
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
node_ptr->addStage = MTC_ADD__START ;
|
|
|
|
hwmon_recovery_monitor ( node_ptr, MTC_EVENT_HWMON_POWERCYCLE );
|
|
|
|
enableStageChange ( node_ptr, MTC_ENABLE__START );
|
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__START); /* reset the fsm */
|
|
disableStageChange ( node_ptr, MTC_DISABLE__START); /* reset the fsm */
|
|
|
|
plog ("%s Power-Cycle Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
|
|
}
|
|
break ;
|
|
}
|
|
|
|
default:
|
|
{
|
|
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__DONE );
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
break ;
|
|
}
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/* Delete Handler
|
|
* ----------------- */
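/*
 * Tears down a host : requests a disk wipe, deprovisions board management,
 * kills any in-flight ipmitool thread (waiting briefly for it to exit),
 * tells the heartbeat, hwmon and guest services to drop the host, clears
 * its alarms and finally removes it from inventory in MTC_DEL__DONE. */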
|
|
int nodeLinkClass::delete_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
switch ( node_ptr->delStage )
|
|
{
|
|
case MTC_DEL__START:
|
|
{
|
|
ilog ("%s Delete Operation Started (%s)\n", node_ptr->hostname.c_str(), node_ptr->uuid.c_str());
|
|
node_ptr->retries = 0 ;
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) ;
|
|
|
|
if ( node_ptr->bm_provisioned == true )
|
|
{
|
|
set_bm_prov ( node_ptr, false);
|
|
}
|
|
|
|
if ( node_ptr->ipmitool_thread_ctrl.stage != THREAD_STAGE__IDLE )
|
|
{
|
|
int delay = THREAD_POST_KILL_WAIT ;
|
|
thread_kill ( node_ptr->ipmitool_thread_ctrl , node_ptr->ipmitool_thread_info ) ;
|
|
|
|
ilog ("%s thread active ; sending kill ; waiting %d seconds\n",
|
|
node_ptr->hostname.c_str(), delay );
|
|
mtcTimer_reset ( node_ptr->mtcTimer );
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, delay );
|
|
node_ptr->delStage = MTC_DEL__WAIT ;
|
|
}
|
|
else
|
|
{
|
|
node_ptr->delStage = MTC_DEL__DONE ;
|
|
}
|
|
|
|
|
|
/* Send delete commands to monitor services */
|
|
send_hbs_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
|
|
send_hwmon_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
|
|
send_guest_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
|
|
|
|
/* Clear all the alarms for this host and generate a customer delete log */
|
|
alarmUtil_clear_all ( node_ptr->hostname );
|
|
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_DELETE );
|
|
|
|
break ;
|
|
}
|
|
case MTC_DEL__WAIT:
|
|
{
|
|
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
|
{
|
|
if ( node_ptr->ipmitool_thread_ctrl.stage != THREAD_STAGE__IDLE )
|
|
{
|
|
if ( node_ptr->retries++ < 3 )
|
|
{
|
|
wlog ("%s still waiting on active thread ; sending another kill signal (try %d or %d)\n",
|
|
node_ptr->hostname.c_str(), node_ptr->retries, 3 );
|
|
|
|
thread_kill ( node_ptr->ipmitool_thread_ctrl, node_ptr->ipmitool_thread_info ) ;
|
|
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, THREAD_POST_KILL_WAIT );
|
|
break ;
|
|
}
|
|
else
|
|
{
|
|
elog ("%s thread refuses to stop ; giving up ...\n",
|
|
node_ptr->hostname.c_str());
|
|
}
|
|
}
|
|
node_ptr->delStage = MTC_DEL__DONE ;
|
|
}
|
|
break ;
|
|
}
|
|
default:
|
|
case MTC_DEL__DONE:
|
|
{
|
|
dlog ("%s delete almost done !!\n", node_ptr->hostname.c_str());
|
|
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
|
|
del_host ( node_ptr->hostname );
|
|
this->host_deleted = true ;
|
|
break ;
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
|
|
/* Add Handler
|
|
* ----------------- */
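/*
 * Runs when a host is added, or re-added over a process restart. It begins
 * by requesting a fresh mtcAlive for uptime, reconciles the stored
 * admin/oper/avail states with the enable and lock alarms, and for
 * controllers reports the appropriate state to SM. */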
|
|
int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|
{
|
|
int rc = PASS ;
|
|
|
|
switch ( node_ptr->addStage )
|
|
{
|
|
case MTC_ADD__START:
|
|
case MTC_ADD__START_DELAY:
|
|
{
|
|
bool timer_set = false ;
|
|
plog ("%s Host Add\n", node_ptr->hostname.c_str());
|
|
|
|
/* Request a mtcAlive message ; gives us uptime ; don't trust what is in the database */
|
|
node_ptr->uptime = 0 ;
|
|
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, MGMNT_INTERFACE );
|
|
|
|
ilog ("%s %s %s-%s-%s (%s)\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->ip.c_str(),
|
|
adminState_enum_to_str (node_ptr->adminState).c_str(),
|
|
operState_enum_to_str (node_ptr->operState).c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus).c_str(),
|
|
|
|
node_ptr->uuid.length() ? node_ptr->uuid.c_str() : "" );
|
|
|
|
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
|
{
|
|
if ( daemon_is_file_present ( CONFIG_COMPLETE_COMPUTE ) == false )
|
|
{
|
|
if ( node_ptr->operState_subf != MTC_OPER_STATE__DISABLED )
|
|
{
|
|
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__OFFLINE );
|
|
}
|
|
}
|
|
ilog ("%s-%s %s-%s-%s\n",
|
|
node_ptr->hostname.c_str(),
|
|
node_ptr->subfunction_str.c_str(),
|
|
adminState_enum_to_str (node_ptr->adminState).c_str(),
|
|
operState_enum_to_str (node_ptr->operState_subf).c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus_subf).c_str());
|
|
}
|
|
|
|
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
|
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
|
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
|
|
{
|
|
wlog ("%s Add with availability status 'unlocked-enabled-%s' ; overriding to 'available'\n",
|
|
node_ptr->hostname.c_str(),
|
|
availStatus_enum_to_str(node_ptr->availStatus).c_str());
|
|
mtcInvApi_update_state ( node_ptr, "availability", "available" );
|
|
}
|
|
|
|
/* handle other cases */
|
|
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
|
|
MTC_ALARM_ID__ENABLE);
|
|
|
|
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
|
{
|
|
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
|
|
|
|
/* If the node is locked then the Enable alarm
|
|
* should not be present */
|
|
if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
|
{
|
|
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
sev = FM_ALARM_SEVERITY_CLEAR ;
|
|
}
|
|
}
|
|
|
|
/* Manage enable alarm over process restart.
|
|
*
|
|
* - clear the alarm in the active controller case
|
|
* - maintain the alarm, set degrade state in MAJOR and CRIT cases
|
|
* - clear alarm for all other severities.
|
|
*/
|
|
if ( THIS_HOST )
|
|
{
|
|
if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
|
{
|
|
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
|
|
( sev == FM_ALARM_SEVERITY_MAJOR ))
|
|
{
|
|
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
|
|
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
|
|
}
|
|
else if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
|
{
|
|
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
}
|
|
}
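
            /* Controller hosts also need SM to be told their current state.
             * The mapping implemented below: unlocked-enabled-(available or
             * degraded) is reported as CONTROLLER_UNLOCKED, locked is reported
             * as CONTROLLER_LOCKED and everything else defaults to
             * CONTROLLER_DISABLED. */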
            if ( is_controller(node_ptr) )
            {
                mtc_cmd_enum state = CONTROLLER_DISABLED ;

                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                    (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                     ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
                {
                    state = CONTROLLER_UNLOCKED ;
                }
                else if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
                {
                    state = CONTROLLER_LOCKED ;
                }

                if ( THIS_HOST )
                {
                    nodeLinkClass::set_active_controller_hostname(node_ptr->hostname);
                    if ( !node_ptr->task.compare(MTC_TASK_SWACT_INPROGRESS) )
                    {
                        ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_NO_COMPLETE);
                        mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_NO_COMPLETE);
                        mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, 20 );
                        timer_set = true ;
                    }
                }
                else
                {
                    nodeLinkClass::set_inactive_controller_hostname(node_ptr->hostname);

                    if ( !node_ptr->task.compare(MTC_TASK_SWACT_INPROGRESS) )
                    {
                        ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );

                        /* Work Around for issue: */
                        mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );

                        mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
                        mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, 10 );
                        timer_set = true ;
                    }
                }

                /*************************************************************
                 * Don't send a disable to SM if we are in simplex and locked ;
                 * doing so would cause SM to shut down all services.
                 *
                 * Including a hostname check just in case simplex mode
                 * is ever or still true with a second controller provisioned
                 * but not unlocked. Defensive code.
                 *
                 * TODO: This should exist in AIO. Without it services will
                 *       not be running if you lock controller and then
                 *       reboot while this controller is disabled.
                 */
                if (( THIS_HOST ) &&
                    ( is_inactive_controller_main_insv() == false ) &&
                    ( node_ptr->operState == MTC_OPER_STATE__DISABLED ))
                {
                    ilog ("%s recovering from %s-disabled\n",
                              node_ptr->hostname.c_str(),
                              get_adminState_str (node_ptr->adminState).c_str());
                }
                else
                {
                    mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
                }
            }

            if ( daemon_get_cfg_ptr()->debug_level & 1 )
                nodeLinkClass::host_print (node_ptr);

            if ( timer_set == false )
            {
                node_ptr->mtcTimer.ring = true ;
            }
            node_ptr->addStage = MTC_ADD__CLEAR_TASK ;
            break ;
        }
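
        /* MTC_ADD__CLEAR_TASK: clear any stale task banner left over from a
         * prior process run. For controllers the clear is held off until the
         * swact notification timer started above rings ; all other host
         * types have their task cleared immediately. */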
        case MTC_ADD__CLEAR_TASK:
        {
            if ( is_controller(node_ptr) )
            {
                if ( node_ptr->mtcTimer.ring == true )
                {
                    if ( !node_ptr->task.empty () )
                    {
                        mtcInvApi_force_task ( node_ptr, "" );
                    }
                }
                else
                {
                    break ;
                }
            }
            else
            {
                /* do it immediately for all other server types */
                mtcInvApi_force_task ( node_ptr, "" );
            }
            /* default retries counter to zero before START_SERVICES */
            node_ptr->retries = 0 ;
            node_ptr->addStage = MTC_ADD__START_SERVICES ;
            break ;
        }
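
        /* MTC_ADD__START_SERVICES: start host services is not launched from
         * here directly ; the start_services_needed flag is posted and the
         * in-service test handler later picks it up and issues the actual
         * 'start host services' command in the background. */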
        case MTC_ADD__START_SERVICES:
        {
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                 ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
            {
                ilog ("%s scheduling start host services\n",
                          node_ptr->hostname.c_str());

                node_ptr->start_services_needed  = true ;
                node_ptr->start_services_retries = 0 ;
            }

            node_ptr->addStage = MTC_ADD__MTC_SERVICES ;
            break ;
        }
        case MTC_ADD__MTC_SERVICES:
        {
            if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
            {
                /* Inform the VIM that this host is enabled */
                mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
            }
            else
            {
                if ( node_ptr->availStatus == MTC_AVAIL_STATUS__FAILED )
                {
                    mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
                }
                else
                {
                    mtcVimApi_state_change ( node_ptr, VIM_HOST_DISABLED, 3 );
                }
            }

            send_hbs_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );

            /* Add this host to other maintenance services */
            if (( ! SIMPLEX_CPE_SYSTEM ) && ( node_ptr->bm_provisioned ))
            {
                send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
            }
            if ( ( CPE_SYSTEM ) || ( is_compute (node_ptr) == true ))
            {
                send_guest_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
            }

            /* Start a timer that fails the enable if the work queue
             * does not empty or if commands in the done queue have failed */
            mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, work_queue_timeout );

            node_ptr->addStage = MTC_ADD__WORKQUEUE_WAIT ;
            break ;
        }
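
        /* MTC_ADD__WORKQUEUE_WAIT: drain the work queue posted above.
         * workQueue_done returns RETRY while commands are still in flight,
         * FAIL_WORKQ_TIMEOUT if the queue never empties (in which case the
         * queue is purged) and a failure code if any done-queue command
         * failed. */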
        case MTC_ADD__WORKQUEUE_WAIT:
        {
            rc = workQueue_done ( node_ptr );
            if ( rc == RETRY )
            {
                /* wait longer */
                break ;
            }
            else if ( rc == FAIL_WORKQ_TIMEOUT )
            {
                wlog ("%s Add failed ; workQueue empty timeout, purging ...\n", node_ptr->hostname.c_str());
                workQueue_purge ( node_ptr );
            }
            else if ( rc != PASS )
            {
                wlog ("%s Add failed ; doneQueue contains failed commands\n", node_ptr->hostname.c_str());
            }

            /* Stop the work queue wait timer */
            mtcTimer_reset ( node_ptr->mtcTimer );

            /* Only start the heartbeat service on this add operation if the
             * host is already unlocked and enabled and not the active controller */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /* start the heartbeat service in all cases except for
                 * THIS host and CPE controller hosts */
                if ( NOT_THIS_HOST )
                {
                    if (( LARGE_SYSTEM ) ||
                        (( CPE_SYSTEM ) && ( is_controller(node_ptr) == false )))
                    {
                        send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
                    }
                }
            }
            /* Only run hardware monitor if the bm ip is provisioned */
            if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
                ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )))
            {
                set_bm_prov ( node_ptr, true ) ;
                send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
            }

            node_ptr->mtcAlive_gate = false ;
            node_ptr->addStage = MTC_ADD__DONE ;
            break;
        }
        case MTC_ADD__DONE:
        default:
        {
            adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );

            /* Send sysinv the wrsroot password hash
             * and aging data as an install command */
            if ( SIMPLEX && THIS_HOST &&
               ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ))
            {
                node_ptr->configStage  = MTC_CONFIG__START ;
                node_ptr->configAction = MTC_CONFIG_ACTION__INSTALL_PASSWD ;
            }

            if (( ! SIMPLEX_CPE_SYSTEM ) &&
                ( node_ptr->bm_provisioned == true ))
            {
                mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
                node_ptr->alarms[MTC_ALARM_ID__BM] = FM_ALARM_SEVERITY_CLEAR ;
            }

            /* Special Add handling for the AIO system */
            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
                {
                    /* In AIO if in DOR mode and the host is unlocked enabled
                     * we need to run the subfunction handler and request
                     * to start host services. */
                    if ( this->dor_mode_active )
                    {
                        node_ptr->start_services_needed_subf = true ;
                        adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF );
                    }
                }
            }

            node_ptr->addStage = MTC_ADD__START;
            plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
            node_ptr->add_completed = true ;
            break ;
        }
    }
    return (rc);
}
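
/* Board Management (bm) Handler
 * -----------------------------
 * Runs for every host whose board management controller is provisioned.
 * In brief: it monitors BMC connectivity with the ping monitor, runs the
 * mc info / restart cause / power status query sequence once connectivity
 * is (re)established, and raises or clears the BM Access alarm based on
 * how long the BMC has been unreachable. */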
int nodeLinkClass::bm_handler ( struct nodeLinkClass::node * node_ptr )
{
    /* Call the bmc ssh connection monitor if this node's bm is provisioned */
    if ( node_ptr->bm_provisioned == true )
    {
        if (( node_ptr->bm_accessible == true ) && ( node_ptr->bm_ping_info.ok == false ))
        {
            wlog ("%s bmc access lost\n", node_ptr->hostname.c_str());

            /* remove the mc info file in case there is a firmware
             * upgrade in progress. hwmond reads it and gets
             * the bmc fw version from it */
            string mc_info_filename = IPMITOOL_OUTPUT_DIR ;
            mc_info_filename.append(node_ptr->hostname);
            mc_info_filename.append(IPMITOOL_MC_INFO_FILE_SUFFIX);
            daemon_remove_file ( mc_info_filename.data() );

            thread_kill ( node_ptr->ipmitool_thread_ctrl, node_ptr->ipmitool_thread_info );

            bmc_access_data_init ( node_ptr );

            ipmiUtil_mc_info_init ( node_ptr->mc_info );

            node_ptr->bm_ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ;

            /* start a timer that will raise the BM Access alarm
             * if we are not accessible by the time it expires */
            plog ("%s bmc access timer started (%d secs)\n", node_ptr->hostname.c_str(), MTC_MINS_2);
            mtcTimer_reset ( node_ptr->bmc_access_timer );
            mtcTimer_start ( node_ptr->bmc_access_timer, mtcTimer_handler, MTC_MINS_2 );
        }
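
        /* The queries below follow a common pattern : a non-blocking
         * ipmi_command_send followed by ipmi_command_recv polling, with
         * bm_timer pacing the retries (MTC_RETRY_WAIT on the success path,
         * MTC_POWER_ACTION_RETRY_DELAY after a failure). They run in order ;
         * mc info, then restart cause, then power status, and only after all
         * three complete is the BMC declared accessible. */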
        /* This block queries and logs BMC Info, last Reset Cause and Power Status */
        if (( node_ptr->bm_accessible == false ) &&
            ( node_ptr->bm_ping_info.ok == true ) &&
            (( node_ptr->mc_info_query_done      == false ) ||
             ( node_ptr->reset_cause_query_done  == false ) ||
             ( node_ptr->power_status_query_done == false )) &&
            ( mtcTimer_expired (node_ptr->bm_timer ) == true ))
        {
            int rc = PASS ;
            if (( node_ptr->mc_info_query_active == false ) &&
                ( node_ptr->mc_info_query_done   == false ))
            {
                if ( ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__MC_INFO ) != PASS )
                {
                    elog ("%s %s send failed\n",
                              node_ptr->hostname.c_str(),
                              getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                    mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                }
                else
                {
                    dlog ("%s %s\n", node_ptr->hostname.c_str(),
                              getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                    mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                    node_ptr->mc_info_query_active = true ;
                }
            }
            else if (( node_ptr->mc_info_query_active == true ) &&
                     ( node_ptr->mc_info_query_done   == false))
            {
                if ( ( rc = ipmi_command_recv ( node_ptr ) ) == RETRY )
                {
                    mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                }
                else if ( rc != PASS )
                {
                    /* this error is reported by the ipmi receive driver ...
                     * blog ("%s %s command failed\n", node_ptr->hostname.c_str(),
                     *           getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                     */
                    node_ptr->mc_info_query_active = false ;
                    node_ptr->ipmitool_thread_ctrl.done = true ;
                    mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                }
                else
                {
                    node_ptr->mc_info_query_active = false ;
                    node_ptr->mc_info_query_done   = true ;
                    node_ptr->ipmitool_thread_ctrl.done = true ;
                    ipmiUtil_mc_info_load ( node_ptr->hostname, node_ptr->ipmitool_thread_info.data.data(), node_ptr->mc_info );
                }
            }
            else if (( node_ptr->mc_info_query_active == false ) &&
                     ( node_ptr->mc_info_query_done   == true ))
            {
                if (( node_ptr->reset_cause_query_active == false ) &&
                    ( node_ptr->reset_cause_query_done   == false ))
                {
                    if ( ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__RESTART_CAUSE ) != PASS )
                    {
                        elog ("%s %s send failed\n", node_ptr->hostname.c_str(),
                                  getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                        mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                    }
                    else
                    {
                        dlog ("%s %s\n", node_ptr->hostname.c_str(),
                                  getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                        mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                        node_ptr->reset_cause_query_active = true ;
                    }
                }
                else if (( node_ptr->reset_cause_query_active == true ) &&
                         ( node_ptr->reset_cause_query_done   == false ))
                {
                    if ( ( rc = ipmi_command_recv ( node_ptr ) ) == RETRY )
                    {
                        mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                    }
                    else if ( rc != PASS )
                    {
                        elog ("%s %s command failed\n", node_ptr->hostname.c_str(),
                                  getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                        node_ptr->reset_cause_query_active = false ;
                        mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                        node_ptr->ipmitool_thread_ctrl.done = true ;
                    }
                    else
                    {
                        node_ptr->reset_cause_query_active = false ;
                        node_ptr->reset_cause_query_done   = true ;
                        node_ptr->ipmitool_thread_ctrl.done = true ;
                        ilog ("%s %s\n", node_ptr->hostname.c_str(),
                                  node_ptr->ipmitool_thread_info.data.c_str());
                    }
                    node_ptr->ipmitool_thread_ctrl.done = true ;
                }
                else if (( node_ptr->mc_info_query_done      == true ) &&
                         ( node_ptr->reset_cause_query_done  == true ) &&
                         ( node_ptr->power_status_query_done == false ))
                {
                    if ( node_ptr->power_status_query_active == false )
                    {
                        if ( ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_STATUS ) != PASS )
                        {
                            elog ("%s %s send failed\n", node_ptr->hostname.c_str(),
                                      getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                            mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                        }
                        else
                        {
                            dlog ("%s %s\n", node_ptr->hostname.c_str(),
                                      getIpmiCmd_str(node_ptr->ipmitool_thread_info.command));
                            mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                            node_ptr->power_status_query_active = true ;
                        }
                    }
                    else if ( node_ptr->power_status_query_done == false )
                    {
                        if ( ( rc = ipmi_command_recv ( node_ptr ) ) == RETRY )
                        {
                            mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_RETRY_WAIT );
                        }
                        else if ( rc )
                        {
                            node_ptr->power_status_query_active = false ;
                            mtcTimer_start ( node_ptr->bm_timer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
                            node_ptr->ipmitool_thread_ctrl.done = true ;
                        }
                        else
                        {
                            node_ptr->power_status_query_active = false ;
                            node_ptr->power_status_query_done   = true ;
                            node_ptr->ipmitool_thread_ctrl.done = true ;
                            node_ptr->ipmitool_thread_info.command = 0 ;
                            node_ptr->bm_accessible = true ;
                            mtcTimer_reset ( node_ptr->bmc_access_timer );

                            ilog ("%s %s\n", node_ptr->hostname.c_str(),
                                      node_ptr->ipmitool_thread_info.data.c_str());
                            plog ("%s bmc is accessible\n", node_ptr->hostname.c_str());

                            if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_OFF_STATUS) != std::string::npos )
                            {
                                if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
                                {
                                    availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
                                }
                                else
                                {
                                    wlog ("%s is powered off while in the unlocked state\n", node_ptr->hostname.c_str());
                                    availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
                                }
                            }
                        }
                        node_ptr->ipmitool_thread_ctrl.done = true ;
                    }
                }
            }
        }

        if ( node_ptr->bm_ping_info.ok == false )
        {
            /* Auto correct key ping information ; should never occur but if it does ... */
            if (( node_ptr->bm_ping_info.hostname.empty()) || ( node_ptr->bm_ping_info.ip.empty()))
            {
                /* if the bm ip is not yet learned then this log will flood */
                //slog ("%s host ping info missing ; (%d:%d)\n",
                //          node_ptr->hostname.c_str(),
                //          node_ptr->bm_ping_info.hostname.empty(),
                //          node_ptr->bm_ping_info.ip.empty());
                node_ptr->bm_ping_info.hostname = node_ptr->hostname ;
                node_ptr->bm_ping_info.ip       = node_ptr->bm_ip ;
            }
        }

        /* don't run the ping monitor if the ip address is invalid */
        if ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ping_info.ip ) == true )
        {
            pingUtil_acc_monitor ( node_ptr->bm_ping_info );
        }
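
        /* BM Access alarm model, summarized from the code below: while the
         * BMC is inaccessible a 2 minute grace timer runs ; when it expires
         * a warning alarm is raised (only if not already raised) and the
         * timer is restarted. The alarm is cleared once all three BMC
         * queries above have completed. */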
        /* Manage the Board Management Access Alarm */
        if (( node_ptr->bm_accessible == false ) &&
            ( mtcTimer_expired ( node_ptr->bmc_access_timer ) == true ))
        {
            node_ptr->bm_ping_info.ok = false ;

            node_ptr->bm_ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ;

            /* start a timer that will raise the BM Access alarm
             * if we are not accessible by the time it expires */
            plog ("%s bmc access timer started (%d secs)\n", node_ptr->hostname.c_str(), MTC_MINS_2);
            mtcTimer_reset ( node_ptr->bmc_access_timer );
            mtcTimer_start ( node_ptr->bmc_access_timer, mtcTimer_handler, MTC_MINS_2 );

            if ( node_ptr->alarms[MTC_ALARM_ID__BM] == FM_ALARM_SEVERITY_CLEAR )
            {
                mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__BM );
                node_ptr->alarms[MTC_ALARM_ID__BM] = FM_ALARM_SEVERITY_WARNING ;
            }
        }

        /* if the BM is accessible then see if we need to clear the BM Access alarm. */
        else if (( node_ptr->alarms[MTC_ALARM_ID__BM] != FM_ALARM_SEVERITY_CLEAR ) &&
                 ( node_ptr->mc_info_query_done      == true ) &&
                 ( node_ptr->reset_cause_query_done  == true ) &&
                 ( node_ptr->power_status_query_done == true ))
        {
            mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
            node_ptr->alarms[MTC_ALARM_ID__BM] = FM_ALARM_SEVERITY_CLEAR ;
        }
    }
    else
    {
        if ( node_ptr->alarms[MTC_ALARM_ID__BM] != FM_ALARM_SEVERITY_CLEAR )
        {
            mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
            node_ptr->alarms[MTC_ALARM_ID__BM] = FM_ALARM_SEVERITY_CLEAR ;
        }
    }

    return (PASS);
}
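
/* Out-Of-Service Test Handler
 * ---------------------------
 * Periodic audit FSM that runs for every host. In brief, each cycle it
 * re-asserts the admin/oper/avail states to the database when no FSM
 * action is in progress, re-sends the 'locked' notification to hosts that
 * have not acknowledged it, and drives the optional FIT and goenabled
 * audit hooks. */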
int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
{
    switch (node_ptr->oosTestStage)
    {
        case MTC_OOS_TEST__LOAD_NEXT_TEST:
        {
            oosTestStageChange ( node_ptr, MTC_OOS_TEST__START_WAIT );
            break ;
        }
        case MTC_OOS_TEST__START_WAIT:
        {
            /* Monitor timer errors */
            mtcTimer_dump_data ();

            // blog ("%s Inservice Test Period %d secs\n", node_ptr->hostname.c_str(), oos_test_period);
            mtcTimer_start ( node_ptr->oosTestTimer, mtcTimer_handler, oos_test_period );
            oosTestStageChange ( node_ptr, MTC_OOS_TEST__WAIT );

#ifdef WANT_FIT_TESTING
            if ( daemon_want_fit ( FIT_CODE__CORRUPT_TOKEN, node_ptr->hostname ))
                tokenUtil_fail_token ();

            else if ( daemon_want_fit ( FIT_CODE__STUCK_TASK, node_ptr->hostname ))
                mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_INPROGRESS);

            else if ( daemon_want_fit ( FIT_CODE__STOP_HOST_SERVICES, node_ptr->hostname ))
            {
                bool start = false ;
                this->launch_host_services_cmd ( node_ptr, start );
            }
            else if ( daemon_want_fit ( FIT_CODE__START_HOST_SERVICES, node_ptr->hostname ))
            {
                if (( node_ptr->start_services_needed == false ) &&
                    ( node_ptr->start_services_running_main == false ))
                {
                    node_ptr->start_services_needed  = true ;
                    node_ptr->start_services_retries = 0 ;
                }
                else
                {
                    ilog ("%s start host services (FIT) rejected (%d:%d)\n",
                              node_ptr->hostname.c_str(),
                              node_ptr->start_services_needed,
                              node_ptr->start_services_running_main);
                }
            }
#endif

            /* Avoid forcing the states to the database on the first and second pass.
             * This is because it is likely we just read all the states and
             * if coming out of a DOR or a SWACT we don't need to unnecessarily
             * produce that extra sysinv traffic.
             * Also, there is no point forcing the states while there is an admin
             * action, an enable or a graceful recovery going on because state
             * changes are being done in the FSM already */
            if (( node_ptr->oos_test_count > 1 ) &&
                ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
                ( !node_ptr->handlerStage.raw ) &&
                ( !node_ptr->recoveryStage ))
            {
                /* Change the oper and avail states in the database */
                allStateChange ( node_ptr, node_ptr->adminState,
                                           node_ptr->operState,
                                           node_ptr->availStatus );
            }

#ifdef WANT_CLEAR_ALARM_AUDIT

            /* TODO: Obsolete with new Alarm Strategy */
            /* Self Correct Stuck Failure Alarms */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                 ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
            {
                if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR )
                {
                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
                    node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
                }
                alarm_enabled_clear ( node_ptr , false);
            }
#endif
            /* Make sure the locked status on the host itself is set */
            if (( node_ptr->adminState  == MTC_ADMIN_STATE__LOCKED ) &&
                ( node_ptr->operState   == MTC_OPER_STATE__DISABLED ) &&
                ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) &&
                ( !(node_ptr->mtce_flags & MTC_FLAG__I_AM_LOCKED) ))
            {
                ilog ("%s setting 'locked' status\n", node_ptr->hostname.c_str());

                /* Tell the host that it is locked */
                send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE);
            }

            if (( daemon_is_file_present ( MTC_CMD_FIT__GOENABLE_AUDIT )) &&
                ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /* Request Out-Of-Service test execution */
                send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE );
                if ( node_ptr->operState_subf == MTC_OPER_STATE__ENABLED)
                {
                    send_mtc_cmd ( node_ptr->hostname, MTC_REQ_SUBF_GOENABLED, MGMNT_INTERFACE );
                }
            }

            break ;
        }
        case MTC_OOS_TEST__WAIT:
        {
            if ( node_ptr->oosTestTimer.ring == true )
            {
                oosTestStageChange ( node_ptr, MTC_OOS_TEST__DONE );
            }
            break ;
        }
        case MTC_OOS_TEST__DONE:
        default:
        {
            node_ptr->oos_test_count++ ;
            oosTestStageChange ( node_ptr, MTC_OOS_TEST__LOAD_NEXT_TEST );

            /* clear out the retry counter periodically */
            node_ptr->http_retries_cur = 0 ;

            break ;
        }
    }
    return (PASS);
}


int local_counter = 0 ;
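
/* In-Service Test Handler
 * -----------------------
 * Periodic audit FSM for unlocked-enabled hosts. In brief, it manages the
 * in-service and compute subfunction alarms, launches posted 'start host
 * services' work (main function first, then the subfunction on AIO
 * controllers), tracks the SM degrade flag and acts on the
 * 'I am not healthy' mtce flag. */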
int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
{
    switch (node_ptr->insvTestStage)
    {
        case MTC_INSV_TEST__START:
        {
            mtcTimer_reset ( node_ptr->insvTestTimer );

            /* Run the inservice test more frequently while
             * start_services_needed is true and we are not
             * in failure retry mode */
            if (( node_ptr->start_services_needed == true ) &&
                ( node_ptr->hostservices_failed == false ) &&
                ( node_ptr->hostservices_failed_subf == false ))
            {
                mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, MTC_SECS_2 );
            }
            else
            {
                mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, insv_test_period );
            }
            insvTestStageChange ( node_ptr, MTC_INSV_TEST__WAIT );
            break ;
        }
        case MTC_INSV_TEST__WAIT:
        {
            if ( node_ptr->insvTestTimer.ring == true )
            {
                insvTestStageChange ( node_ptr, MTC_INSV_TEST__RUN );
            }
            /* manage degrade state and alarms */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /************************************************************
                 *               Manage In-Service Alarms                   *
                 ***********************************************************/

                /* Manage Inservice Enable Alarm */
                if ( node_ptr->hostservices_failed )
                {
                    alarm_insv_failure ( node_ptr );
                }
                else
                {
                    alarm_insv_clear ( node_ptr, false );
                }

                /* Manage Compute Subfunction Failure Alarm */
                if ( node_ptr->hostservices_failed_subf )
                {
                    alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
                }
                else
                {
                    alarm_compute_clear ( node_ptr, false );
                }
            }
            break ;
        }
        case MTC_INSV_TEST__RUN:
        {

#ifdef WANT_FIT_TESTING

            daemon_load_fit ();

            if ( daemon_want_fit ( FIT_CODE__UNLOCK_HOST, node_ptr->hostname ) )
            {
                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__UNLOCK );
            }

            if ( daemon_want_fit ( FIT_CODE__LOCK_HOST, node_ptr->hostname ) )
            {
                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__LOCK );
            }

            if ( daemon_want_fit ( FIT_CODE__FORCE_LOCK_HOST, node_ptr->hostname ) )
            {
                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__FORCE_LOCK );
            }

            if (( daemon_want_fit ( FIT_CODE__DO_NOTHING_THREAD, node_ptr->hostname )) ||
                ( daemon_want_fit ( FIT_CODE__STRESS_THREAD    , node_ptr->hostname )))
            {
                node_ptr->ipmitool_thread_ctrl.stage = THREAD_STAGE__IGNORE ;
                node_ptr->ipmitool_thread_ctrl.id = true ;
                node_ptr->ipmitool_thread_info.id = true ;
                node_ptr->ipmitool_thread_info.command = IPMITOOL_THREAD_CMD__POWER_STATUS ;

                /* Update / Setup the BMC access credentials */
                node_ptr->thread_extra_info.bm_ip   = node_ptr->bm_ip ;
                node_ptr->thread_extra_info.bm_un   = node_ptr->bm_un ;
                node_ptr->thread_extra_info.bm_pw   = node_ptr->bm_pw ;
                node_ptr->thread_extra_info.bm_type = node_ptr->bm_type ;
                node_ptr->ipmitool_thread_info.extra_info_ptr = &node_ptr->thread_extra_info ;
                if ( thread_launch_thread ( mtcThread_ipmitool, &node_ptr->ipmitool_thread_info ) == 0 )
                {
                    slog ("%s FIT launching mtcThread_ipmitool power query thread ; launch failed\n", node_ptr->hostname.c_str());
                }
                else
                {
                    slog ("%s FIT launching mtcThread_ipmitool power query thread\n", node_ptr->hostname.c_str());
                }
                node_ptr->ipmitool_thread_ctrl.done = true ;
            }

#endif

            /* Manage active controller auto recovery bool.
             * If the inactive controller is inservice then disable
             * controller autorecovery. Otherwise enable it but in this case
             * don't change the disable bool as that is used to gate auto
             * recovery once the threshold is reached */
            if ( is_controller ( node_ptr ) && NOT_THIS_HOST )
            {
                if (( this->autorecovery_enabled == true ) &&
                    ( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
                {
                    autorecovery_clear ( CONTROLLER_0 );
                    autorecovery_clear ( CONTROLLER_1 );
                    this->autorecovery_enabled  = false ;
                    this->autorecovery_disabled = false ;
                }
                else if (( this->autorecovery_enabled == false ) &&
                         ( node_ptr->operState != MTC_OPER_STATE__ENABLED ))
                {
                    this->autorecovery_enabled = true ;
                }
            }

            /* Monitor the health of the host - no pass file */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /************************************************************
                 * Prevent the start host services from running while in DOR
                 ***********************************************************/
                if ( node_ptr->dor_recovery_mode == true )
                {
                    /* wait longer for the host to boot up */
                    wlog ("%s DOR recovery active ; waiting on host\n",
                              node_ptr->hostname.c_str());
                }
                else if ( this->dor_mode_active == true )
                {
                    ilog_throttled ( this->dor_mode_active_log_throttle, 20,
                                     "DOR mode active\n");
                }

                /*************************************************************
                 * Handle Start Host Services if it's posted for execution
                 ************************************************************/
                else if ( node_ptr->start_services_needed == true )
                {
                    /* If Main Start Host Services is not already running then launch it */
                    if (( node_ptr->start_services_running_main == false ) &&
                        ( node_ptr->start_services_running_subf == false ))
                    {
                        bool start = true ;
                        if ( this->launch_host_services_cmd ( node_ptr , start ) != PASS )
                        {
                            node_ptr->hostservices_failed = true ;
                            node_ptr->start_services_retries++ ;
                        }
                        else
                        {
                            node_ptr->start_services_running_main = true ;
                        }
                    }
                    /* Handle start host services response for both main and
                     * subfunction levels */
                    else
                    {
                        /* Wait for host services to complete - pass or fail.
                         * The host_services_handler manages timeout. */
                        int rc = this->host_services_handler ( node_ptr );
                        if ( rc == RETRY )
                        {
                            /* wait for the mtcClient's response ... */
                            break ;
                        }

                        node_ptr->start_services_running_main = false ;

                        if ( rc != PASS )
                        {
                            /* set the correct failed flag */
                            if ( node_ptr->start_services_needed_subf == true )
                            {
                                node_ptr->start_services_running_subf = false ;
                                node_ptr->hostservices_failed_subf = true ;
                            }
                            else
                            {
                                node_ptr->hostservices_failed = true ;
                            }

                            node_ptr->start_services_retries++ ;

                            wlog ("%s %s request failed ; (retry %d)\n",
                                      node_ptr->hostname.c_str(),
                                      node_ptr->host_services_req.name.c_str(),
                                      node_ptr->start_services_retries);
                        }
                        else /* success path */
                        {
                            /* clear the correct fail flag */
                            if (( node_ptr->start_services_needed_subf == true ) &&
                                ( node_ptr->start_services_running_subf == true ))
                            {
                                node_ptr->start_services_needed_subf  = false ;
                                node_ptr->start_services_running_subf = false ;
                                node_ptr->hostservices_failed_subf    = false ;
                            }
                            else
                            {
                                node_ptr->hostservices_failed = false ;
                            }

                            /*************************************************
                             * Handle running the subfunction start compute
                             * host services command as a background operation
                             * after the controller start result has come in
                             * as a PASS.
                             ************************************************/
                            if ( node_ptr->start_services_needed_subf == true )
                            {
                                bool start = true ;
                                bool subf  = node_ptr->start_services_needed_subf ;
                                if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
                                {
                                    node_ptr->hostservices_failed_subf = true ;

                                    /* try again on next audit */
                                    node_ptr->start_services_retries++ ;
                                }
                                else
                                {
                                    node_ptr->start_services_running_subf = true ;
                                }
                            }
                            else
                            {
                                /* All host service scripts pass ; done */
                                clear_hostservices_ctls ( node_ptr );
                                node_ptr->hostservices_failed_subf = false ;
                                node_ptr->hostservices_failed      = false ;
                            }
                        }
                    }
                }

                if ( NOT_THIS_HOST )
                {
                    if ((( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                         ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )) &&
                        (!(node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY) &&
                         !(node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)))
                    {
                        if ( node_ptr->unknown_health_reported == false )
                        {
                            wlog ( "%s has UNKNOWN HEALTH\n", node_ptr->hostname.c_str());
                            node_ptr->unknown_health_reported = true ;
                        }
                    }
                }
            }
            /** Manage the subfunction goenabled alarm over a mtcAgent restart
             *  In the restart case the subfunction fsm enable handler is not run so
             *  we try to detect the missing goenabled_subf flag as an inservice test.
             *
             *  Only in CPE type
             *  - clear the alarm if the issue goes away -
             *    i.e. the goenabled tests eventually pass. Today
             *    they are not re-run in the background but someday they may be
             *  - raise the alarm and go degraded if the goEnabled_subf flag is not set
             *    and we have only a single enabled controller (which must be this one)
             *    and the alarm is not already raised.
             **/
            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                    ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )) /* handle initial install case */
                {
                    if (( node_ptr->goEnabled_subf           == true  ) &&
                        ( node_ptr->inservice_failed_subf    == false ) &&
                        ( node_ptr->goEnabled_failed_subf    == false ) &&
                        ( node_ptr->hostservices_failed_subf == false ))
                    {
                        if ( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] != FM_ALARM_SEVERITY_CLEAR )
                        {
                            alarm_compute_clear ( node_ptr, false );
                            ilog ("%s cleared alarm %s due to failure recovery (degrade:%x)\n",
                                      node_ptr->hostname.c_str(),
                                      mtcAlarm_getId_str(MTC_ALARM_ID__CH_COMP).c_str(),
                                      node_ptr->degrade_mask);

                            if ( node_ptr->degrade_mask == 0 )
                            {
                                allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
                                                           MTC_OPER_STATE__ENABLED,
                                                           MTC_AVAIL_STATUS__AVAILABLE );

                                subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED,
                                                            MTC_AVAIL_STATUS__AVAILABLE );

                                /* Inform the VIM that this host is enabled */
                                mtcVimApi_state_change ( node_ptr, VIM_HOST_ENABLED, 3 );
                            }
                        }
                    }
                    /*
                     * Send out-of-service test command and wait for the
                     * next audit interval to see the result.
                     *
                     *   node_ptr->goEnabled_subf == true is pass
                     *   node_ptr->goEnabled_subf_failed == true is fail
                     *
                     **/
                    if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) &&
                        ( this->autorecovery_disabled == false ) &&
                        ( node_ptr->start_services_needed == false ))
                    {
                        if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) &&
                            ( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ))
                        {
                            if (( node_ptr->inservice_failed_subf == false ) &&
                                ( node_ptr->hostservices_failed_subf == false ))
                            {
                                ilog ("%s-compute ... running recovery enable\n", node_ptr->hostname.c_str());

                                alarm_compute_clear ( node_ptr, true );

                                enableStageChange ( node_ptr, MTC_ENABLE__START );
                                adminActionChange ( node_ptr, MTC_ADMIN_ACTION__ENABLE_SUBF );
                            }
                            else
                            {
                                ilog ("%s-compute subfunction is unlocked-disabled (non-operational)\n",
                                          node_ptr->hostname.c_str());
                            }
                        }
                        else
                        {
                            ilog ("%s-compute ... waiting on current goEnable completion\n", node_ptr->hostname.c_str() );
                        }
                    }
                }
                /* Only raise this alarm while in simplex */
                if (( num_controllers_enabled() < 2 ) &&
                    (( node_ptr->goEnabled_failed_subf    == true ) ||
                     ( node_ptr->inservice_failed_subf    == true ) ||
                     ( node_ptr->hostservices_failed_subf == true )))
                {
                    if ( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] == FM_ALARM_SEVERITY_CLEAR )
                    {
                        wlog ("%s insv test detected subfunction failure ; degrading host\n",
                                  node_ptr->hostname.c_str());

                        alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_MAJOR );

                        allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
                                                   MTC_OPER_STATE__ENABLED,
                                                   MTC_AVAIL_STATUS__DEGRADED );

                        subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
                                                    MTC_AVAIL_STATUS__FAILED );
                    }
                }
            }

            /* Monitor the health of the host - no pass file */
            if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ) &&
                (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
                 ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
            {
                /* Manage asserting degrade due to Software Management */
                if (( node_ptr->mtce_flags & MTC_FLAG__SM_DEGRADED ) &&
                    ( !(node_ptr->degrade_mask & DEGRADE_MASK_SM)))
                {
                    /* set the SM degrade flag in the mask */
                    node_ptr->degrade_mask |= DEGRADE_MASK_SM ;

                    ilog ("%s sm degrade\n", node_ptr->hostname.c_str());
                }

                /* Manage de-asserting degrade due to Software Management */
                if ((!(node_ptr->mtce_flags & MTC_FLAG__SM_DEGRADED)) &&
                      (node_ptr->degrade_mask & DEGRADE_MASK_SM))
                {
                    /* clear the SM degrade flag */
                    node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ;

                    ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
                }

                if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
                {
                    /* not healthy .... */
                    if ( THIS_HOST )
                    {
                        /* initial config is complete and last manifest apply failed ... */
                        if (( daemon_is_file_present ( CONFIG_COMPLETE_FILE )) &&
                            ( daemon_is_file_present ( CONFIG_FAIL_FILE )))
                        {
                            wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*3), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
                            if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
                            {
                                node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;

                                /* threshold is reached so raise the config alarm if it is not already raised */
                                if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
                                {
                                    mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
                                    node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
                                }
                            }
                        }
                    }
                    else
                    {
                        if ( ++node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
                        {
                            elog ( "%s is UNHEALTHY ; forcing re-enable\n",
                                       node_ptr->hostname.c_str());

                            force_full_enable ( node_ptr ) ;
                        }
                        else
                        {
                            wlog ( "%s is UNHEALTHY (cnt:%d)\n",
                                       node_ptr->hostname.c_str(),
                                       node_ptr->health_threshold_counter );
                        }
                    }
                }
                else
                {
                    node_ptr->health_threshold_counter = 0 ;
                }
            }

            node_ptr->insv_test_count++ ;
            insvTestStageChange ( node_ptr, MTC_INSV_TEST__START );

            break ;
        }
        default:
        {
            node_ptr->insv_test_count++ ;
            insvTestStageChange ( node_ptr, MTC_INSV_TEST__START );
            break ;
        }
    }
    return (PASS);
}

/************************************************************
 * Manage host degrade state based on degrade mask          *
 * The availability state of degrade only applies when the  *
 * host is unlocked-enabled.                                 *
 ***********************************************************/
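/* Illustrative example (values assumed for clarity): a host carrying only
 * the SM degrade reason has degrade_mask == DEGRADE_MASK_SM and is held in
 * 'unlocked-enabled-degraded' ; once that bit clears the mask returns to
 * DEGRADE_MASK_NONE and the handler below restores 'available'. */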
int nodeLinkClass::degrade_handler ( struct nodeLinkClass::node * node_ptr )
{
    if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
        ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
    {
        if (( node_ptr->degrade_mask == DEGRADE_MASK_NONE ) &&
            ( node_ptr->availStatus  == MTC_AVAIL_STATUS__DEGRADED ))
        {
            availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
        }

        else if (( node_ptr->degrade_mask ) &&
                 ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ))
        {
            availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
        }
    }
    return (PASS);
}
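
/* Configuration Change Handler
 * ----------------------------
 * Tracks root account shadow entry changes. In brief: query sysinv for the
 * stored 'root_sig' (SHOW), compare it against an md5 signature computed
 * from the local shadow file entry, and if they differ push the new
 * signature with a MODIFY whose response is then read back and verified. */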
int nodeLinkClass::cfg_handler ( struct nodeLinkClass::node * node_ptr )
{
    int rc = PASS ;
    switch (node_ptr->configStage )
    {
        case MTC_CONFIG__START:
        {
            ilog ("%s Starting a %s:%s shadow entry change check\n",
                      node_ptr->hostname.c_str(),
                      SHADOW_FILE,
                      USERNAME_ROOT );

            /* Post the show command with a catch-all timeout timer */
            rc = mtcInvApi_cfg_show ( node_ptr->hostname ) ;
            if ( rc )
            {
                elog ("%s Config SHOW command failed\n", node_ptr->hostname.c_str() );
                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
            }
            else
            {
                mtcTimer_start ( node_ptr->mtcConfig_timer, mtcTimer_handler, (sysinv_timeout+1) );
                configStageChange ( node_ptr, MTC_CONFIG__SHOW );
            }
            break ;
        }
        case MTC_CONFIG__SHOW:
        {
            /* timeout yet ? */
            if ( node_ptr->mtcConfig_timer.ring == true )
            {
                elog ("%s timeout\n", node_ptr->cfgEvent.log_prefix.c_str());
                configStageChange ( node_ptr, MTC_CONFIG__TIMEOUT );
                break ;
            }

            /* done yet ? */
            rc = doneQueue_dequeue ( node_ptr->cfgEvent ) ;
            if ( rc == RETRY )
            {
                /* Still waiting */
                break ;
            }
            else if ( rc == PASS )
            {
                string temp_value = "" ;
                mtcTimer_stop ( node_ptr->mtcConfig_timer );
                node_ptr->cfgEvent.value = "" ;
                node_ptr->cfgEvent.uuid  = "" ;
                if (( rc = jsonUtil_get_array_idx ( (char*)node_ptr->cfgEvent.response.data(), "iusers", 0 , temp_value )) == PASS )
                {
                    jsonUtil_get_key_val ( (char*)temp_value.data(), "root_sig", node_ptr->cfgEvent.value);
                }

                if ( node_ptr->cfgEvent.value.empty() ||
                    !node_ptr->cfgEvent.value.compare("null") || rc )
                {
                    elog ("%s null or missing 'root_sig' value (%d:%s)\n",
                              node_ptr->cfgEvent.service.c_str(), rc,
                              node_ptr->cfgEvent.value.empty() ? "empty" : node_ptr->cfgEvent.value.c_str());

                    node_ptr->cfgEvent.status = FAIL_INVALID_DATA ;
                    configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                    break;
                }

                ilog ("%s root_sig:%s\n", node_ptr->cfgEvent.log_prefix.c_str(),
                          node_ptr->cfgEvent.value.c_str());

                dlog ("Database Signature: %s\n", node_ptr->cfgEvent.value.c_str());

                /*
                 * generate an md5 signature for this user's Shadow entry.
                 * We will do so for the entire entry as either the password
                 * or the password age may change and we need to track and notify
                 * for both.
                 */
                char cfgInfo[1024] = {0};
                node_ptr->cfgEvent.key = get_shadow_signature ( (char*)SHADOW_FILE , (char*)USERNAME_ROOT,
                                                                &cfgInfo[0], sizeof(cfgInfo));
                node_ptr->cfgEvent.information = cfgInfo;

                if ( node_ptr->cfgEvent.key.empty() )
                {
                    elog ("failed to get md5sum of username '%s' from '%s'\n", USERNAME_ROOT, SHADOW_FILE );
                    node_ptr->cfgEvent.status = FAIL_INVALID_DATA ;
                    configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                    break ;
                }

                dlog ("File Signature : %s\n", node_ptr->cfgEvent.key.c_str());
                if ( node_ptr->cfgEvent.key.compare(node_ptr->cfgEvent.value))
                {
                    bool install = false ;
                    if ( node_ptr->configAction == MTC_CONFIG_ACTION__INSTALL_PASSWD )
                    {
                        install = true ;
                        ilog ("%s shadow file hash and aging ... install config\n", USERNAME_ROOT );
                    }
                    else
                    {
                        ilog ("%s shadow entry has changed ... updating config\n", USERNAME_ROOT );
                        ilog ("... old signature - %s\n", node_ptr->cfgEvent.value.c_str());
                        ilog ("... new signature - %s\n", node_ptr->cfgEvent.key.c_str());
                    }

                    if ((rc = jsonUtil_get_array_idx ( (char*)node_ptr->cfgEvent.response.data(), "iusers", 0 , temp_value )) == PASS )
                    {
                        jsonUtil_get_key_val ( (char*)temp_value.data(), "uuid", node_ptr->cfgEvent.uuid);
                    }

                    if ( rc || node_ptr->cfgEvent.uuid.empty() || !node_ptr->cfgEvent.uuid.compare("null"))
                    {
                        elog ("%s null or missing reconfig 'uuid' (%d:%s)\n",
                                  node_ptr->cfgEvent.service.c_str(), rc,
                                  node_ptr->cfgEvent.uuid.empty() ? "empty" : node_ptr->cfgEvent.uuid.c_str());
                        return ( FAIL_INVALID_DATA );
                    }
                    ilog ("%s uuid:%s\n",
                              node_ptr->cfgEvent.log_prefix.c_str(),
                              node_ptr->cfgEvent.uuid.c_str());

                    /* Post the modify command */
                    rc = mtcInvApi_cfg_modify ( node_ptr->hostname, install ) ;
                    if ( rc )
                    {
                        elog ("%s Config MODIFY command failed\n", node_ptr->hostname.c_str() );
                        configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                    }
                    else
                    {
                        mtcTimer_start ( node_ptr->mtcConfig_timer, mtcTimer_handler, (sysinv_timeout+1) );
                        configStageChange ( node_ptr, MTC_CONFIG__MODIFY );
                    }
                }
                else
                {
                    ilog ("%s shadow entry has not changed (%s)\n",
                              USERNAME_ROOT, node_ptr->cfgEvent.key.c_str());
                    configStageChange ( node_ptr, MTC_CONFIG__DONE );
                }
            }
            else
            {
                elog ("%s failed (%d:%d)\n", node_ptr->cfgEvent.log_prefix.c_str(), rc,
                          node_ptr->cfgEvent.status );
                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
            }
            break ;
        }

        case MTC_CONFIG__MODIFY:
        {
            /* timeout yet ? */
            if ( node_ptr->mtcConfig_timer.ring == true )
            {
                elog ("%s timeout\n", node_ptr->cfgEvent.log_prefix.c_str());
                configStageChange ( node_ptr, MTC_CONFIG__TIMEOUT );
                break ;
            }

            /* done yet ? */
            rc = doneQueue_dequeue ( node_ptr->cfgEvent ) ;
            if ( rc == RETRY )
            {
                /* Still waiting */
                break ;
            }
            else if ( rc == PASS )
            {
                mtcTimer_stop ( node_ptr->mtcConfig_timer );
                if ( node_ptr->cfgEvent.response_len )
                {
                    configStageChange ( node_ptr, MTC_CONFIG__VERIFY );
                }
                else
                {
                    elog ("%s modify without response (%d:%d)\n",
                              node_ptr->cfgEvent.log_prefix.c_str(), rc,
                              node_ptr->cfgEvent.status );
                    configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                }
            }
            else
            {
                elog ("%s modify failed (%d:%d)\n",
                          node_ptr->cfgEvent.log_prefix.c_str(), rc,
                          node_ptr->cfgEvent.status );
                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
            }
            break ;
        }
        case MTC_CONFIG__VERIFY:
        {
            node_ptr->cfgEvent.value = "" ;
            rc = jsonUtil_get_key_val ( (char*)node_ptr->cfgEvent.response.data(),
                                        "root_sig", node_ptr->cfgEvent.value);
            if ( node_ptr->cfgEvent.value.empty() ||
                !node_ptr->cfgEvent.value.compare("null") || rc )
            {
                elog ("%s null or missing 'root_sig' value (%d:%s)\n",
                          node_ptr->cfgEvent.service.c_str(), rc,
                          node_ptr->cfgEvent.value.empty() ? "empty" : node_ptr->cfgEvent.value.c_str());

                node_ptr->cfgEvent.status = FAIL_INVALID_DATA ;
                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
                break;
            }

            if ( node_ptr->cfgEvent.key.compare(node_ptr->cfgEvent.value))
            {
                elog ("%s root_sig modify compare failed\n",
                          node_ptr->cfgEvent.log_prefix.c_str());
                wlog ("... database signature - %s\n", node_ptr->cfgEvent.value.c_str());
                wlog ("... file signature - %s\n", node_ptr->cfgEvent.key.c_str());

                configStageChange ( node_ptr, MTC_CONFIG__FAILURE );
            }
            else
            {
                ilog ("%s modify succeeded\n", node_ptr->cfgEvent.log_prefix.c_str());
                configStageChange ( node_ptr, MTC_CONFIG__DONE );
            }
            break ;
        }
        case MTC_CONFIG__FAILURE:
        {
            elog ("%s Command Failure\n", node_ptr->cfgEvent.log_prefix.c_str());

            /* Call to remove this command from the work queue ; if it exists */
            workQueue_del_cmd ( node_ptr, node_ptr->cfgEvent.sequence );

            configStageChange ( node_ptr, MTC_CONFIG__DONE );
            break ;
        }
        case MTC_CONFIG__TIMEOUT:
        {
            elog ("%s Command Timeout\n", node_ptr->cfgEvent.log_prefix.c_str());

            /* Call to remove this command from the work queue ; if it exists */
            workQueue_del_cmd ( node_ptr, node_ptr->cfgEvent.sequence );

            node_ptr->oper_failures++ ;
            mtcHttpUtil_free_conn ( node_ptr->cfgEvent );
            mtcHttpUtil_free_base ( node_ptr->cfgEvent );

            configStageChange ( node_ptr, MTC_CONFIG__DONE );
            break ;
        }
        case MTC_CONFIG__DONE:
        default:
        {
            if (( node_ptr->configAction == MTC_CONFIG_ACTION__INSTALL_PASSWD ) ||
                ( node_ptr->configAction == MTC_CONFIG_ACTION__CHANGE_PASSWD ))
            {
                /* We are done */
                node_ptr->configAction = MTC_CONFIG_ACTION__NONE ;
            }
            if ( node_ptr->configAction == MTC_CONFIG_ACTION__CHANGE_PASSWD_AGAIN )
            {
                /* Run the FSM again */
                node_ptr->configAction = MTC_CONFIG_ACTION__CHANGE_PASSWD ;
            }
            node_ptr->configStage = MTC_CONFIG__START ;
            break ;
        }
    }
    return (PASS);
}