Merge "Remove Start Host Service Launch in mtcAgent & enhance fault detection"
This commit is contained in:
commit
34207b1895
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013, 2016, 2023-2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013, 2016, 2023-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -348,8 +348,6 @@ void mtc_stages_init ( void )
|
||||
enableStages_str [MTC_ENABLE__GOENABLED_TIMER ] = "GoEnable-Start";
|
||||
enableStages_str [MTC_ENABLE__GOENABLED_WAIT ] = "GoEnable-Wait";
|
||||
enableStages_str [MTC_ENABLE__PMOND_READY_WAIT ] = "PmondReady-Wait";
|
||||
enableStages_str [MTC_ENABLE__HOST_SERVICES_START ] = "HostServices-Start";
|
||||
enableStages_str [MTC_ENABLE__HOST_SERVICES_WAIT ] = "HostServices-Wait";
|
||||
enableStages_str [MTC_ENABLE__SERVICES_START_WAIT ] = "Services-Start";
|
||||
enableStages_str [MTC_ENABLE__HEARTBEAT_WAIT ] = "Heartbeat-Wait";
|
||||
enableStages_str [MTC_ENABLE__HEARTBEAT_SOAK ] = "Heartbeat-Soak";
|
||||
@ -375,8 +373,6 @@ void mtc_stages_init ( void )
|
||||
recoveryStages_str[MTC_RECOVERY__MTCALIVE_WAIT ] = "MtcAlive-Wait";
|
||||
recoveryStages_str[MTC_RECOVERY__GOENABLED_TIMER ] = "GoEnable-Timer";
|
||||
recoveryStages_str[MTC_RECOVERY__GOENABLED_WAIT ] = "GoEnable-Wait";
|
||||
recoveryStages_str[MTC_RECOVERY__HOST_SERVICES_START] = "HostServices-Start";
|
||||
recoveryStages_str[MTC_RECOVERY__HOST_SERVICES_WAIT ] = "HostServices-Wait";
|
||||
recoveryStages_str[MTC_RECOVERY__CONFIG_COMPLETE_WAIT]= "Compute-Config-Wait";
|
||||
recoveryStages_str[MTC_RECOVERY__SUBF_GOENABLED_TIMER]= "Subf-GoEnable-Timer";
|
||||
recoveryStages_str[MTC_RECOVERY__SUBF_GOENABLED_WAIT] = "Subf-GoEnable-Wait";
|
||||
|
@ -1,7 +1,7 @@
|
||||
#ifndef __INCLUDE_NODEBASE_HH__
|
||||
#define __INCLUDE_NODEBASE_HH__
|
||||
/*
|
||||
* Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -77,18 +77,25 @@ void daemon_exit ( void );
|
||||
*
|
||||
* These flags are shipped in the parm[2] of the
|
||||
* mtcAlive message from each host. */
|
||||
#define MTC_FLAG__I_AM_CONFIGURED (0x00000001)
|
||||
#define MTC_FLAG__I_AM_NOT_HEALTHY (0x00000002)
|
||||
#define MTC_FLAG__I_AM_HEALTHY (0x00000004)
|
||||
#define MTC_FLAG__I_AM_LOCKED (0x00000008)
|
||||
#define MTC_FLAG__SUBF_CONFIGURED (0x00000010)
|
||||
#define MTC_FLAG__MAIN_GOENABLED (0x00000020)
|
||||
#define MTC_FLAG__SUBF_GOENABLED (0x00000040)
|
||||
#define MTC_FLAG__SM_DEGRADED (0x00000080)
|
||||
#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
|
||||
#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
|
||||
#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400)
|
||||
#define MTC_FLAG__SM_UNHEALTHY (0x00001000)
|
||||
#define MTC_FLAG__I_AM_CONFIGURED (0x00000001)
|
||||
#define MTC_FLAG__I_AM_NOT_HEALTHY (0x00000002)
|
||||
#define MTC_FLAG__I_AM_HEALTHY (0x00000004)
|
||||
#define MTC_FLAG__I_AM_LOCKED (0x00000008)
|
||||
#define MTC_FLAG__SUBF_CONFIGURED (0x00000010)
|
||||
#define MTC_FLAG__MAIN_GOENABLED (0x00000020)
|
||||
#define MTC_FLAG__SUBF_GOENABLED (0x00000040)
|
||||
#define MTC_FLAG__SM_DEGRADED (0x00000080)
|
||||
#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
|
||||
#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
|
||||
#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400)
|
||||
#define MTC_FLAG__SM_UNHEALTHY (0x00001000)
|
||||
#define MTC_FLAG__RESERVED_2000 (0x00002000)
|
||||
#define MTC_FLAG__RESERVED_4000 (0x00004000)
|
||||
#define MTC_FLAG__RESERVED_8000 (0x00008000)
|
||||
#define MTC_FLAG__MAIN_GOENABLE_FAIL (0x00010000)
|
||||
#define MTC_FLAG__SUBF_GOENABLE_FAIL (0x00020000)
|
||||
#define MTC_FLAG__MAIN_SERVICES_FAIL (0x00040000)
|
||||
#define MTC_FLAG__SUBF_SERVICES_FAIL (0x00080000)
|
||||
|
||||
#define MTC_UNHEALTHY_THRESHOLD (3)
|
||||
|
||||
@ -98,7 +105,7 @@ void daemon_exit ( void );
|
||||
#define NODE_UNHEALTHY (2)
|
||||
|
||||
#define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count")
|
||||
#define TMP_DIR_PATH ((const char *)"/etc/mtc/tmp/")
|
||||
#define MTC_PERSIST_PATH ((const char *)"/var/persist/mtc/")
|
||||
|
||||
#define HOST_IS_VIRTUAL ((const char *)"/var/run/virtual.host")
|
||||
|
||||
@ -159,6 +166,10 @@ void daemon_exit ( void );
|
||||
#define OPT_PLATFORM_CONFIG_DIR ((const char *)"/opt/platform/config")
|
||||
#define DNSMASQ_HOSTS_FILE ((const char *)"dnsmasq.hosts")
|
||||
|
||||
/* maintenance log files */
|
||||
#define MTCAGENT_LOG_FILE ((const char *)"/var/log/mtcAgent.log")
|
||||
#define MTCCLIENT_LOG_FILE ((const char *)"/var/log/mtcClient.log")
|
||||
|
||||
/* supported BMC communication protocols ; access method */
|
||||
typedef enum
|
||||
{
|
||||
@ -294,8 +305,7 @@ typedef enum
|
||||
#define MTC_TASK_SUBF_CONFIG_TO "Worker Configuration Timeout, re-enabling"
|
||||
#define MTC_TASK_SUBF_INTEST_FAIL "Worker In-Test Failed, re-enabling"
|
||||
#define MTC_TASK_SUBF_INTEST_TO "Worker In-Test Timeout, re-enabling"
|
||||
#define MTC_TASK_SUBF_SERVICE_FAIL "Worker Start Services Failed, re-enabling"
|
||||
#define MTC_TASK_SUBF_SERVICE_TO "Worker Start Services Timeout, re-enabling"
|
||||
#define MTC_TASK_SUBF_SERVICE_FAIL "Start Worker Services Failed, re-enabling"
|
||||
|
||||
#define MTC_TASK_AR_DISABLED_CONFIG "Configuration failure, threshold reached, Lock/Unlock to retry"
|
||||
#define MTC_TASK_AR_DISABLED_GOENABLE "In-Test Failure, threshold reached, Lock/Unlock to retry"
|
||||
@ -904,8 +914,6 @@ typedef enum
|
||||
MTC_ENABLE__GOENABLED_TIMER = 12,
|
||||
MTC_ENABLE__GOENABLED_WAIT = 13,
|
||||
MTC_ENABLE__PMOND_READY_WAIT = 14,
|
||||
MTC_ENABLE__HOST_SERVICES_START = 15,
|
||||
MTC_ENABLE__HOST_SERVICES_WAIT = 16,
|
||||
MTC_ENABLE__SERVICES_START_WAIT = 17,
|
||||
MTC_ENABLE__HEARTBEAT_WAIT = 18,
|
||||
MTC_ENABLE__HEARTBEAT_SOAK = 19,
|
||||
@ -987,8 +995,6 @@ typedef enum
|
||||
MTC_RECOVERY__MTCALIVE_WAIT,
|
||||
MTC_RECOVERY__GOENABLED_TIMER,
|
||||
MTC_RECOVERY__GOENABLED_WAIT,
|
||||
MTC_RECOVERY__HOST_SERVICES_START,
|
||||
MTC_RECOVERY__HOST_SERVICES_WAIT,
|
||||
|
||||
/* Subfunction stages */
|
||||
MTC_RECOVERY__CONFIG_COMPLETE_WAIT,
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2019 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2019, 2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -1149,9 +1149,12 @@ int daemon_wait_for_file ( const char * filename, int timeout )
|
||||
|
||||
int daemon_files_init ( void )
|
||||
{
|
||||
struct timespec ts ;
|
||||
clock_gettime (CLOCK_MONOTONIC, &ts );
|
||||
|
||||
/* Create PID file */
|
||||
pid_t mypid = getpid();
|
||||
ilog ("--- Daemon Start-Up --- pid:%d\n", mypid);
|
||||
ilog ("--- Daemon Start-Up --- pid:%d uptime:%ld", mypid, ts.tv_sec);
|
||||
daemon_init_fit ();
|
||||
return ( PASS );
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -353,8 +353,6 @@ nodeLinkClass::nodeLinkClass()
|
||||
smgrEvent.buf = NULL ;
|
||||
tokenEvent.buf = NULL ;
|
||||
|
||||
unknown_host_throttle = 0 ;
|
||||
|
||||
testmode = 0 ;
|
||||
module_init( );
|
||||
}
|
||||
@ -365,19 +363,6 @@ nodeLinkClass::~nodeLinkClass()
|
||||
;
|
||||
}
|
||||
|
||||
/* Clear start host service controls */
|
||||
void nodeLinkClass::clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if ( node_ptr )
|
||||
{
|
||||
node_ptr->start_services_needed = false ;
|
||||
node_ptr->start_services_needed_subf = false ;
|
||||
node_ptr->start_services_running_main = false ;
|
||||
node_ptr->start_services_running_subf = false ;
|
||||
node_ptr->start_services_retries = 0 ;
|
||||
}
|
||||
}
|
||||
|
||||
/* Clear all the main function enable failure bools */
|
||||
void nodeLinkClass::clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
@ -516,7 +501,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
||||
ptr->goEnabled = false ;
|
||||
ptr->goEnabled_subf = false ;
|
||||
|
||||
clear_hostservices_ctls ( ptr );
|
||||
|
||||
/* clear all the enable failure bools */
|
||||
clear_main_failed_bools ( ptr );
|
||||
@ -4103,6 +4087,11 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg, in
|
||||
{
|
||||
if ( is_host_services_cmd ( msg.cmd ) )
|
||||
{
|
||||
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
||||
{
|
||||
dlog3 ("%s ignoring host services result for locked node", hostname.c_str());
|
||||
return ;
|
||||
}
|
||||
/*****************************************************
|
||||
* Host Services Request's Response Handling
|
||||
*****************************************************/
|
||||
@ -4116,9 +4105,33 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg, in
|
||||
{
|
||||
if ( !node_ptr->host_services_req.ack )
|
||||
{
|
||||
slog ("%s %s without initial command ACK\n",
|
||||
hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
// parm[0] contains the return code
|
||||
if ( msg.parm[0] == PASS )
|
||||
{
|
||||
ilog ("%s mtcClient %s ran and passed", hostname.c_str(), msg.buf);
|
||||
}
|
||||
else if ( node_ptr->hostservices_failed || node_ptr->hostservices_failed_subf )
|
||||
{
|
||||
ilog ("%s already handling 'host services' failure", hostname.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s mtcClient %s ran and failed", hostname.c_str(), msg.buf);
|
||||
if (( msg.cmd != MTC_CMD_STOP_CONTROL_SVCS ) &&
|
||||
( msg.cmd != MTC_CMD_STOP_WORKER_SVCS ) &&
|
||||
( msg.cmd != MTC_CMD_STOP_STORAGE_SVCS ))
|
||||
{
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_TASK_AR_DISABLED_SERVICES) == PASS )
|
||||
{
|
||||
node_ptr->hostservices_failed = true ;
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
}
|
||||
}
|
||||
return ;
|
||||
}
|
||||
node_ptr->host_services_req.rsp = msg.cmd ;
|
||||
if ( msg.buf[0] != '\0' )
|
||||
@ -4233,6 +4246,7 @@ unsigned int nodeLinkClass::get_cmd_resp ( string & hostname )
|
||||
* 1. manage the online/offline state bools
|
||||
* 2. increment the mtcAlive count
|
||||
* 3. set the mtcAlive received bool for the specified interface
|
||||
* 4. handle start host services failures
|
||||
*
|
||||
*****************************************************************************/
|
||||
void nodeLinkClass::set_mtcAlive ( string & hostname, unsigned int sequence, int iface )
|
||||
@ -4456,7 +4470,13 @@ void nodeLinkClass::set_goEnabled_failed ( string & hostname )
|
||||
node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
if ( node_ptr != NULL )
|
||||
{
|
||||
node_ptr->goEnabled_failed = true ;
|
||||
if ( node_ptr->goEnabled_failed == false )
|
||||
{
|
||||
node_ptr->goEnabled_failed = true ;
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL );
|
||||
ar_handler ( node_ptr, MTC_AR_DISABLE_CAUSE__GOENABLE, MTC_TASK_AR_DISABLED_GOENABLE );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -4489,6 +4509,13 @@ void nodeLinkClass::set_goEnabled_failed_subf ( string & hostname )
|
||||
node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
if ( node_ptr != NULL )
|
||||
{
|
||||
if ( node_ptr->goEnabled_failed_subf == false )
|
||||
{
|
||||
node_ptr->goEnabled_failed_subf = true ;
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL );
|
||||
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
|
||||
ar_handler ( node_ptr, MTC_AR_DISABLE_CAUSE__GOENABLE, MTC_TASK_AR_DISABLED_GOENABLE );
|
||||
}
|
||||
node_ptr->goEnabled_failed_subf = true ;
|
||||
}
|
||||
}
|
||||
@ -4580,6 +4607,85 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
|
||||
else
|
||||
node_ptr->goEnabled = false ;
|
||||
|
||||
// Detect and handle 'Go Enable' Failures that come in by
|
||||
// Out-Of-Band signaling from periodic mtcAlive messaging.
|
||||
//
|
||||
// Only take action on the first event while node is
|
||||
// unlocked-enabled and while 'goEnabled_failed' AND
|
||||
// 'goEnabled_failed_subf' are false.
|
||||
//
|
||||
// These failure bool's are cleared by calls to
|
||||
// clear_main_failed_bools and clear_subf_failed_bools
|
||||
// in the enable_handler.
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
(( flags & MTC_FLAG__MAIN_GOENABLE_FAIL ) || ( flags & MTC_FLAG__SUBF_GOENABLE_FAIL )) &&
|
||||
(!( node_ptr->goEnabled_failed || node_ptr->goEnabled_failed_subf )))
|
||||
{
|
||||
if ( flags & MTC_FLAG__MAIN_GOENABLE_FAIL )
|
||||
{
|
||||
elog ("%s goEnabled failed (oob:%08X) ; see %s:%s for details",
|
||||
hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE);
|
||||
node_ptr->goEnabled_failed = true ;
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL );
|
||||
}
|
||||
|
||||
if ( flags & MTC_FLAG__SUBF_GOENABLE_FAIL )
|
||||
{
|
||||
ilog ("%s goEnabled subfunction failed (oob:%08X) ; see %s:%s for details",
|
||||
hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE);
|
||||
node_ptr->goEnabled_failed_subf = true ;
|
||||
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL );
|
||||
}
|
||||
if ( ar_handler ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__GOENABLE,
|
||||
MTC_TASK_AR_DISABLED_GOENABLE ) == PASS )
|
||||
{
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
}
|
||||
|
||||
// Detect and handle 'Host Services' failures that come in by
|
||||
// Out-Of-Band signaling from periodic mtcAlive messaging.
|
||||
//
|
||||
// Only take action on the first event while node is
|
||||
// unlocked-enabled and while 'hostservices_failed' AND
|
||||
// 'hostservices_failed_subf' are false.
|
||||
//
|
||||
// These failure bool's are cleared by calls to
|
||||
// clear_main_failed_bools and clear_subf_failed_bools
|
||||
// in the enable_handler.
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
(( flags & MTC_FLAG__MAIN_SERVICES_FAIL ) || ( flags & MTC_FLAG__SUBF_SERVICES_FAIL )) &&
|
||||
(!( node_ptr->hostservices_failed || node_ptr->hostservices_failed_subf )))
|
||||
{
|
||||
if ( flags & MTC_FLAG__MAIN_SERVICES_FAIL )
|
||||
{
|
||||
elog ("%s start host services failed (oob:%08X) ; see %s:%s for details",
|
||||
hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE);
|
||||
node_ptr->hostservices_failed = true ;
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
// mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL );
|
||||
}
|
||||
|
||||
if ( flags & MTC_FLAG__SUBF_SERVICES_FAIL )
|
||||
{
|
||||
ilog ("%s start host subfunction services failed (oob:%08X) ; see %s:%s for details",
|
||||
hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE);
|
||||
node_ptr->hostservices_failed_subf = true ;
|
||||
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
|
||||
// mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL );
|
||||
}
|
||||
if ( ar_handler ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_TASK_AR_DISABLED_SERVICES) == PASS )
|
||||
{
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Fail the inactive controller if the sm unhealthy flag is set.
|
||||
* Degrade for the active controller.
|
||||
@ -8091,7 +8197,7 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid )
|
||||
|
||||
void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
string ar_file = MTC_PERSIST_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
if ( daemon_is_file_present (ar_file.data()))
|
||||
{
|
||||
wlog ("%s clearing autorecovery file counter\n", node_ptr->hostname.c_str());
|
||||
@ -8126,7 +8232,7 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
*
|
||||
* Manage Auto Recovery:
|
||||
*
|
||||
* Case 1: Failed active controller with no enabled inactive controller.
|
||||
* Case 1: Failed active controller in DX system
|
||||
*
|
||||
* Requires persistent count file and self reboot until threshold
|
||||
* is reached.
|
||||
@ -8136,9 +8242,13 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
* so we don't get a rolling boot loop.
|
||||
*
|
||||
* Auto recovery count is tracked/preserved in a host named auto
|
||||
* recovery counter file /etc/mtc/tmp/hostname_ar_count.
|
||||
* recovery counter file /var/persist/mtc/<hostname>_ar_count.
|
||||
*
|
||||
* Case 2: All other cases
|
||||
* Note: This auto recovery count file only applies to SX systems.
|
||||
* Otherwise, in DX systems a node's auto recovery count
|
||||
* is tracked in that node's nodeClass data structure.
|
||||
*
|
||||
* Case 2: All other cases ; remote hosts and SX systems
|
||||
*
|
||||
* Case 2a: No auto recovery thresholding of active controller in non AIO SX
|
||||
* where the user can't lock and unlock the active controller.
|
||||
@ -8166,6 +8276,7 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#define FORCE_SWACT_DELAY_SECS (5)
|
||||
int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
|
||||
autorecovery_disable_cause_enum cause,
|
||||
string ar_disable_banner )
|
||||
@ -8180,6 +8291,12 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
|
||||
return (rc);
|
||||
}
|
||||
|
||||
if ( node_ptr->forcing_full_enable == true )
|
||||
{
|
||||
wlog ("%s already handling full enable", node_ptr->hostname.c_str());
|
||||
return (rc) ;
|
||||
}
|
||||
|
||||
/* check for invalid call case */
|
||||
if ( cause >= MTC_AR_DISABLE_CAUSE__LAST )
|
||||
{
|
||||
@ -8192,98 +8309,201 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
|
||||
if ( node_ptr->ar_cause != cause )
|
||||
node_ptr->ar_cause = cause ;
|
||||
|
||||
string ar_file = MTC_PERSIST_PATH +
|
||||
node_ptr->hostname +
|
||||
AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
|
||||
/* Case 1 check */
|
||||
if ( ( THIS_HOST ) && ( is_inactive_controller_main_insv() == false ))
|
||||
if ( daemon_is_file_present (ar_file.data()))
|
||||
{
|
||||
/* manage the auto recovery threshold count file */
|
||||
unsigned int value = 0 ;
|
||||
|
||||
string ar_file = TMP_DIR_PATH +
|
||||
node_ptr->hostname +
|
||||
AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
|
||||
if ( daemon_is_file_present (ar_file.data()))
|
||||
{
|
||||
/* if the file is there then read the count and increment it */
|
||||
value = daemon_get_file_int ( ar_file.data() );
|
||||
}
|
||||
value++ ;
|
||||
|
||||
/* Save the new value in the file */
|
||||
daemon_log_value ( ar_file.data(), value );
|
||||
|
||||
value = daemon_get_file_int ( ar_file.data() );
|
||||
|
||||
/* set rc to reflect what the caller should do */
|
||||
if ( value > this->ar_threshold[node_ptr->ar_cause] )
|
||||
{
|
||||
elog ("%s auto recovery threshold exceeded (%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
this->ar_threshold[node_ptr->ar_cause] );
|
||||
|
||||
node_ptr->ar_disabled = true ;
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
|
||||
allStateChange ( node_ptr, node_ptr->adminState,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
|
||||
|
||||
return (rc);
|
||||
}
|
||||
|
||||
wlog ("%s auto recovery (try %d of %d) (%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
value,
|
||||
this->ar_threshold[node_ptr->ar_cause],
|
||||
node_ptr->ar_cause);
|
||||
|
||||
mtcInvApi_update_states_now ( node_ptr, "unlocked",
|
||||
"disabled", "failed",
|
||||
"disabled", "failed" );
|
||||
|
||||
lazy_graceful_fs_reboot ( node_ptr );
|
||||
/* If the file is there then read the count and increment it */
|
||||
node_ptr->ar_count[node_ptr->ar_cause] = daemon_get_file_int ( ar_file.data() );
|
||||
}
|
||||
else /* Case 2 */
|
||||
{
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
||||
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" );
|
||||
node_ptr->ar_count[node_ptr->ar_cause]++ ;
|
||||
|
||||
if (( NOT_THIS_HOST ) &&
|
||||
( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX ))
|
||||
/* Only save the value to a file for SIMPLEX systems.
|
||||
* Preserving the auto recovery file in DX systems is problematic over
|
||||
* Swact unless its stored in the active controller mounted filesystem
|
||||
* which it is not. */
|
||||
if ( SIMPLEX )
|
||||
daemon_log_value ( ar_file.data(), node_ptr->ar_count[node_ptr->ar_cause] );
|
||||
|
||||
/* If not simplex then ensure there is no lingering
|
||||
* file after a simplex to duplex migration */
|
||||
else if ( daemon_is_file_present ( ar_file.data() ) )
|
||||
daemon_remove_file ( ar_file.data() );
|
||||
|
||||
/* set rc to reflect what the caller should do */
|
||||
if ( node_ptr->ar_count[node_ptr->ar_cause] > this->ar_threshold[node_ptr->ar_cause] )
|
||||
{
|
||||
elog ("%s auto recovery threshold of %d reached - going auto recovery disabled.",
|
||||
node_ptr->hostname.c_str(),
|
||||
this->ar_threshold[node_ptr->ar_cause] );
|
||||
|
||||
node_ptr->ar_disabled = true ;
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
|
||||
allStateChange ( node_ptr, node_ptr->adminState,
|
||||
MTC_OPER_STATE__DISABLED,
|
||||
MTC_AVAIL_STATUS__FAILED );
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
|
||||
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/* Case 1: This Host and not simplex system */
|
||||
if (( THIS_HOST ) && ( NOT_SIMPLEX ))
|
||||
{
|
||||
/* Case 1a - This DX controller with no enabled standby controller - go degraded and no reboot */
|
||||
if ( is_inactive_controller_main_insv() == false )
|
||||
{
|
||||
if ( ++node_ptr->ar_count[node_ptr->ar_cause] >=
|
||||
this->ar_threshold [node_ptr->ar_cause] )
|
||||
{
|
||||
elog ("%s auto recovery threshold exceeded (%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
this->ar_threshold[node_ptr->ar_cause] );
|
||||
node_ptr->ar_disabled = true ;
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
|
||||
rc = FAIL ;
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s auto recovery (try %d of %d) (%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->ar_count[node_ptr->ar_cause],
|
||||
this->ar_threshold[node_ptr->ar_cause],
|
||||
node_ptr->ar_cause);
|
||||
rc = PASS ;
|
||||
}
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
allStateChange ( node_ptr,
|
||||
node_ptr->adminState,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
|
||||
wlog ("%s refusing to self reboot with no enabled standby controller.", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... critical enable alarm raised, running enabled but degraded.", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str());
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP);
|
||||
rc = FAIL ;
|
||||
}
|
||||
|
||||
/* Case 1b - This DX controller host with an enabled standby controller - force swact and reboot */
|
||||
else
|
||||
{
|
||||
wlog ("%s auto recovery\n", node_ptr->hostname.c_str());
|
||||
rc = PASS ;
|
||||
wlog ("%s auto recovery of self (try %d of %d) (%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->ar_count[node_ptr->ar_cause],
|
||||
this->ar_threshold[node_ptr->ar_cause],
|
||||
node_ptr->ar_cause);
|
||||
|
||||
mtcInvApi_update_states_now ( node_ptr, "", "disabled", "failed", "", "" );
|
||||
|
||||
/* Turn off Heartbeat to that host */
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
||||
|
||||
/* Post critical failure message */
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_SWACT_REQ );
|
||||
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, FORCE_SWACT_DELAY_SECS );
|
||||
|
||||
wlog ("%s force swact in %d seconds ; waiting for database 'disabled-failed' state update",
|
||||
node_ptr->hostname.c_str(), FORCE_SWACT_DELAY_SECS );
|
||||
this->delayed_swact_required = true ;
|
||||
node_ptr->ar_log_throttle = 0 ;
|
||||
rc = FAIL ;
|
||||
}
|
||||
}
|
||||
else /* Case 2 - Not this host , let the caller decide what to do */
|
||||
{
|
||||
wlog ("%s auto recovery (try %d of %d) (%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->ar_count[node_ptr->ar_cause],
|
||||
this->ar_threshold[node_ptr->ar_cause],
|
||||
node_ptr->ar_cause);
|
||||
rc = PASS ;
|
||||
}
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Name : ar_handler
|
||||
*
|
||||
* Purpose : Handle node failure from ar_manage return code
|
||||
*
|
||||
* Description: The following cases apply when the failed node is ...
|
||||
*
|
||||
* Case 1: Not the active controller
|
||||
* - Auto recovery disable thresholding applies for applicable
|
||||
* causes.
|
||||
*
|
||||
* Case 2: Active Controller in SIMPLEX or With Enabled Standby
|
||||
* - Auto Recovery disable applies on Simplex system or DX System
|
||||
* with enabled standby controller.
|
||||
*
|
||||
* Case 3: Active Controller in DX System
|
||||
* - Auto Recovery disable does not apply to a active controller
|
||||
* in a DX system that does not have an unlocked-enabled standby
|
||||
* controller to switch activity to.
|
||||
* - Logs are produced, host is degraded, alarm is raised and
|
||||
* node task field is updated.
|
||||
* - Locking the active controller to recover from an auto
|
||||
* recovery disabled host is not supported in a DX system.
|
||||
*
|
||||
* Parameters :
|
||||
*
|
||||
* @param node_ptr: pointer to the nodeLinkClass struct for the failing node
|
||||
* @param cause : autorecovery_disable_cause_enum enumerated type of the
|
||||
* failure cause
|
||||
* @param ar_disable_banner : the auto recover disable cause string
|
||||
*
|
||||
* Returns : PASS if the auto recovery threshold is not reached.
|
||||
* FAIL if the auto recovery threshold is reached and
|
||||
* ar_disable is true
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
// Dispatches a node failure to one of three handling cases based on whether
// the failed node is this host, and if so whether the system is SIMPLEX or
// has more than one enabled controller. Returns the ar_manage result for
// cases 1 and 2 ; returns FAIL when auto recovery is already disabled or
// when case 3 applies.
int nodeLinkClass::ar_handler ( struct nodeLinkClass::node * node_ptr,
|
||||
autorecovery_disable_cause_enum cause,
|
||||
string ar_disable_banner )
|
||||
{
|
||||
// Default result ; only ever replaced by the ar_manage return code below.
int ar_status = FAIL;
|
||||
|
||||
// Auto recovery is already disabled for this node ; take no action and report FAIL.
if ( node_ptr->ar_disabled )
|
||||
return ar_status ;
|
||||
|
||||
wlog ("%s handling node failure ; cause:%d", node_ptr->hostname.c_str(), cause );
|
||||
|
||||
// Case 1: Not the active controller
|
||||
if ( NOT_THIS_HOST )
|
||||
{
|
||||
// Only request the forced full enable when ar_manage returns PASS.
if ( ( ar_status = this->ar_manage ( node_ptr, cause, ar_disable_banner ) ) == PASS )
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
|
||||
// Case 2: Active Controller failed on
|
||||
// - SX system or
|
||||
// - DX system with enabled standby controller
|
||||
else if (( SIMPLEX ) || ( this->num_controllers_enabled() > 1 ))
|
||||
{
|
||||
// Nothing below runs unless ar_manage returns PASS.
if ( ( ar_status = this->ar_manage ( node_ptr, cause, ar_disable_banner ) ) == PASS )
|
||||
{
|
||||
// Push the 'disabled' / 'failed' state update immediately ; the
// empty strings leave the remaining fields unspecified in this call.
mtcInvApi_update_states_now ( node_ptr, "", "disabled", "failed", "", "" );
|
||||
|
||||
// Heartbeat stop and swact task text are skipped on SIMPLEX.
if ( NOT_SIMPLEX )
|
||||
{
|
||||
/* Turn off Heartbeat to that host */
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
||||
|
||||
/* Update task stating that a Swact is in progress */
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_SWACT_REQ );
|
||||
}
|
||||
// Restart the node's mtcTimer for FORCE_SWACT_DELAY_SECS ; the log
// below describes this delay as waiting for the database state update.
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, FORCE_SWACT_DELAY_SECS );
|
||||
node_ptr->ar_log_throttle = 0 ;
|
||||
// NOTE(review): this flag is set for SIMPLEX too, where the log below
// reports 'lazy reboot' rather than 'force swact' ; presumably the
// delayed handler tells the two apart - confirm.
this->delayed_swact_required = true ;
|
||||
wlog ("%s %s in %d seconds ; waiting for database 'disabled-failed' state update",
|
||||
node_ptr->hostname.c_str(),
|
||||
SIMPLEX ? "lazy reboot" : "force swact",
|
||||
FORCE_SWACT_DELAY_SECS);
|
||||
}
|
||||
}
|
||||
|
||||
// Case 3: Active Controller in DX System without enabled standby controller.
|
||||
// ar_manage is not called in this case, so ar_status stays FAIL.
// NOTE(review): the log below reports an alarm as raised but none is raised
// in this branch ; presumably the caller raises it - confirm.
else
|
||||
{
|
||||
wlog ("%s refusing to self reboot with no enabled standby controller", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str());
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP);
|
||||
}
|
||||
return (ar_status);
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : report_dor_recovery
|
||||
@ -8322,6 +8542,12 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
if ( node_ptr->ar_disabled == true )
|
||||
return ;
|
||||
|
||||
if ( node_ptr->forcing_full_enable == true )
|
||||
{
|
||||
wlog ("%s already handling force full enable", node_ptr->hostname.c_str());
|
||||
return ;
|
||||
}
|
||||
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , "is FAILED " );
|
||||
@ -8341,6 +8567,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
|
||||
{
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); // no action
|
||||
node_ptr->forcing_full_enable = true ;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -8372,9 +8599,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
*
|
||||
* start = True
|
||||
*
|
||||
* MTC_CMD_START_CONTROL_SVCS
|
||||
* MTC_CMD_START_WORKER_SVCS
|
||||
* MTC_CMD_START_STORAGE_SVCS
|
||||
* No Longer Supported
|
||||
*
|
||||
* Returns : PASS = launch success
|
||||
* !PASS = launch failure
|
||||
@ -8386,32 +8611,22 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_
|
||||
if ( !node_ptr )
|
||||
return (FAIL_NULL_POINTER);
|
||||
|
||||
/* Initialize the host's command request control structure */
|
||||
mtcCmd_init ( node_ptr->host_services_req );
|
||||
if ( start == true )
|
||||
{
|
||||
slog ("%s Start Host Services Command Not Supported", node_ptr->hostname.c_str());
|
||||
return ( FAIL_INVALID_OPERATION ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Initialize the host's command request control structure */
|
||||
mtcCmd_init ( node_ptr->host_services_req );
|
||||
}
|
||||
|
||||
/* Service subfunction override first, efficiency. */
|
||||
if ( subf == true )
|
||||
{
|
||||
/* only supported subfunction (right now) is COMPUTE */
|
||||
if ( start == true )
|
||||
node_ptr->host_services_req.cmd = MTC_CMD_START_WORKER_SVCS ;
|
||||
else
|
||||
node_ptr->host_services_req.cmd = MTC_CMD_STOP_WORKER_SVCS ;
|
||||
}
|
||||
else if ( start == true )
|
||||
{
|
||||
if ( is_controller (node_ptr) )
|
||||
node_ptr->host_services_req.cmd = MTC_CMD_START_CONTROL_SVCS ;
|
||||
else if ( is_worker (node_ptr) )
|
||||
node_ptr->host_services_req.cmd = MTC_CMD_START_WORKER_SVCS ;
|
||||
else if ( is_storage (node_ptr) )
|
||||
node_ptr->host_services_req.cmd = MTC_CMD_START_STORAGE_SVCS ;
|
||||
else
|
||||
{
|
||||
slog ("%s start host services is not supported for this host type\n",
|
||||
node_ptr->hostname.c_str());
|
||||
return (FAIL_BAD_CASE) ;
|
||||
}
|
||||
node_ptr->host_services_req.cmd = MTC_CMD_STOP_WORKER_SVCS ;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -9879,6 +10094,14 @@ void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
|
||||
get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
|
||||
node_ptr->insv_test_count);
|
||||
mem_log (str);
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tGoEnabled Main:%s Subf:%s - Services Main:%s Subf:%s - Force Full Enable Bypass:%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->goEnabled_failed ? "Fail" : "Ok",
|
||||
node_ptr->goEnabled_failed_subf ? "Fail" : "Ok",
|
||||
node_ptr->hostservices_failed ? "Fail" : "Ok",
|
||||
node_ptr->hostservices_failed_subf ? "Fail" : "Ok",
|
||||
node_ptr->forcing_full_enable ? "Yes" : "No");
|
||||
mem_log (str);
|
||||
}
|
||||
|
||||
void nodeLinkClass::mem_log_thread_info ( struct nodeLinkClass::node * node_ptr )
|
||||
|
@ -1,7 +1,7 @@
|
||||
#ifndef __INCLUDE_NODECLASS_H__
|
||||
#define __INCLUDE_NODECLASS_H__
|
||||
/*
|
||||
* Copyright (c) 2013-2016, 2023-2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2016, 2023-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -67,6 +67,9 @@ using namespace std;
|
||||
#define SIMPLEX \
|
||||
( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == true )
|
||||
|
||||
#define NOT_SIMPLEX \
|
||||
( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == false )
|
||||
|
||||
#define THIS_HOST \
|
||||
( node_ptr->hostname == this->my_hostname )
|
||||
|
||||
@ -209,19 +212,6 @@ private:
|
||||
|
||||
int mtcalive_timeout ;
|
||||
|
||||
/* start host service retry controls */
|
||||
int start_services_retries ;
|
||||
|
||||
bool start_services_running_main ;
|
||||
bool start_services_running_subf ;
|
||||
|
||||
bool start_services_needed ;
|
||||
bool start_services_needed_subf ; /* for the add handler that defers
|
||||
start to the inservice test handler.
|
||||
this provides a means of telling
|
||||
maintenance that the subfunction
|
||||
start needs to also be run. */
|
||||
|
||||
/** Pointer to the previous node in the list */
|
||||
struct node *prev;
|
||||
|
||||
@ -404,8 +394,8 @@ private:
|
||||
|
||||
/* Boolean indicating the main or subfunction has start host services
|
||||
* failure. */
|
||||
bool hostservices_failed ;
|
||||
bool hostservices_failed_subf ;
|
||||
bool hostservices_failed = false ;
|
||||
bool hostservices_failed_subf = false ;
|
||||
|
||||
/* Boolean indicating the main or subfunction has inservice failure */
|
||||
bool inservice_failed ;
|
||||
@ -442,8 +432,12 @@ private:
|
||||
/* throttles the ar_disabled log to periodically indicate auto
|
||||
* recovery disabled state but avoid flooding that same message. */
|
||||
#define AR_LOG_THROTTLE_THRESHOLD (100000)
|
||||
#define AR_HANDLER_LOG_THROTTLE_THRESHOLD (1000)
|
||||
unsigned int ar_log_throttle ;
|
||||
|
||||
/** Bool to prevent nested force_full_enable and auto recovery management handling */
|
||||
bool forcing_full_enable = false ;
|
||||
|
||||
/** Host's mtc timer struct. Use to time handler stages.
|
||||
*
|
||||
* reset -> reset command response
|
||||
@ -876,6 +870,7 @@ private:
|
||||
int stress_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int degrade_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int self_fail_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
int uptime_handler ( void );
|
||||
|
||||
@ -985,6 +980,12 @@ private:
|
||||
autorecovery_disable_cause_enum cause,
|
||||
string ar_disable_banner );
|
||||
|
||||
/* handle auto recovery
|
||||
* - adds common handling functionality on top of ar_manage */
|
||||
int ar_handler ( struct nodeLinkClass::node * node_ptr,
|
||||
autorecovery_disable_cause_enum cause,
|
||||
string ar_disable_banner);
|
||||
|
||||
/** ***********************************************************************
|
||||
*
|
||||
* Name : nodeLinkClass::workQueue_process
|
||||
@ -1160,7 +1161,6 @@ private:
|
||||
|
||||
void clear_subf_failed_bools ( struct nodeLinkClass::node * node_ptr );
|
||||
void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr );
|
||||
void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
/* Enables/Clears dynamic auto recovery state. start fresh !
|
||||
* called in disabled_handler (lock) and in the DONE stages
|
||||
@ -1254,28 +1254,6 @@ private:
|
||||
*/
|
||||
int memory_used ;
|
||||
|
||||
/** Inservice memory management audit.
|
||||
*
|
||||
* Verifies that the node_ptr list and memory_allocs jive as well
|
||||
* as all the node pointers point to a node in the linked list.
|
||||
*
|
||||
* @return
|
||||
* an integer representing a PASS or TODO: list other error codes.
|
||||
*/
|
||||
int memory_audit ( void );
|
||||
|
||||
|
||||
/* Simplex mode auto recovery bools
|
||||
*
|
||||
* Set to true when the autorecovery threshold is reached
|
||||
* and we want to avoid taking further autorecovery action
|
||||
* even though it may be requested. */
|
||||
bool autorecovery_disabled = false ;
|
||||
|
||||
/* Set to true by fault detection methods that are
|
||||
* autorecoverable when in simplex mode. */
|
||||
bool autorecovery_enabled = false ;
|
||||
|
||||
/** Tracks the number of hosts that 'are currently' in service trouble
|
||||
* wrt heartbeat (above minor threshold).
|
||||
* This is used in multi-host failure avoidance.
|
||||
@ -2191,7 +2169,14 @@ public:
|
||||
*/
|
||||
unsigned int ar_interval[MTC_AR_DISABLE_CAUSE__LAST] ;
|
||||
|
||||
int unknown_host_throttle ;
|
||||
/* Used by the auto recovery algorithm for self-reboot.
|
||||
* This is a flag indicating a delayed self-reboot is required.
|
||||
* This ensures the FSM enters the self_reboot_handler, allowing sufficient time
|
||||
* for operational and availability state changes to be committed to the database
|
||||
* before initiating the reboot. */
|
||||
bool delayed_swact_required = false ;
|
||||
bool self_reboot_wait = false ;
|
||||
bool force_swact_wait = false ;
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2018, 2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2018, 2024-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -458,7 +458,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/* inform mtcAgent of enhanced ost services support */
|
||||
/* inform mtcAgent of enhanced host services support */
|
||||
msg.parm[1] = MTC_ENHANCED_HOST_SERVICES ;
|
||||
msg.parm[0] = rc ;
|
||||
msg.num = 2 ;
|
||||
@ -810,14 +810,20 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char
|
||||
|
||||
if ( mtce_name_ptr )
|
||||
{
|
||||
/* add the error message to the message buffer */
|
||||
/* add the message to the message buffer */
|
||||
size_t len = strnlen ( mtce_name_ptr, MAX_MTCE_EVENT_NAME_LEN );
|
||||
|
||||
/* We don't use the buffer for mtce events to remove it from the size */
|
||||
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len));
|
||||
|
||||
snprintf ( &event.buf[0], MAX_MTCE_EVENT_NAME_LEN , "%s", mtce_name_ptr );
|
||||
rc = FAIL_OPERATION ;
|
||||
|
||||
// If the supplied mtce_name_ptr string contains 'failed'
|
||||
// then set the rc to FAIL_OPERATION
|
||||
if ( strcasestr (mtce_name_ptr, "failed" ) )
|
||||
rc = FAIL_OPERATION ;
|
||||
if ( strcasestr (mtce_name_ptr, "timeout" ) )
|
||||
rc = FAIL_TIMEOUT ;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -983,6 +989,38 @@ int create_mtcAlive_msg ( ctrl_type * ctrl_ptr, mtc_message_type & msg, int cmd,
|
||||
}
|
||||
}
|
||||
|
||||
/* Set Out-Of-Band goEnable failure flag for goEnable failure. */
|
||||
if ( ctrl_ptr->goEnable_result )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_GOENABLE_FAIL ;
|
||||
if ( ctrl_ptr->goEnable_result_subf )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_GOENABLE_FAIL ;
|
||||
|
||||
/* Set the Out-Of-Band Host Services failure
|
||||
* flag for any start host services that failed */
|
||||
if ( ctrl_ptr->storage_hostservices_result )
|
||||
{
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ;
|
||||
dlog3 ("storage start host services failed ; rc:%d", ctrl_ptr->storage_hostservices_result );
|
||||
}
|
||||
else if ( ctrl_ptr->controller_hostservices_result )
|
||||
{
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ;
|
||||
dlog3 ("controller start host services failed ; rc:%d", ctrl_ptr->controller_hostservices_result );
|
||||
}
|
||||
else if ( is_subfunction_worker () )
|
||||
{
|
||||
if ( ctrl_ptr->worker_hostservices_result )
|
||||
{
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_SERVICES_FAIL ;
|
||||
dlog3 ("worker subfunction start host services failed ; rc:%d", ctrl_ptr->worker_hostservices_result );
|
||||
}
|
||||
}
|
||||
else if ( ctrl_ptr->worker_hostservices_result )
|
||||
{
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ;
|
||||
dlog3 ("worker start host services failed ; rc:%d", ctrl_ptr->worker_hostservices_result );
|
||||
}
|
||||
|
||||
if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ;
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2018, 2023-2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2018, 2023-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -295,15 +295,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
|
||||
if ( msg.num > 0 )
|
||||
{
|
||||
/* log if not locked message */
|
||||
if ( msg.cmd != MTC_MSG_LOCKED )
|
||||
{
|
||||
ilog ("%s %s request ACK (rc:%d) (%s)",
|
||||
hostname.c_str(),
|
||||
get_mtcNodeCommand_str(msg.cmd),
|
||||
msg.parm[0],
|
||||
iface_name_ptr);
|
||||
}
|
||||
else
|
||||
if ( msg.cmd == MTC_MSG_LOCKED )
|
||||
{
|
||||
mlog ("%s %s request ACK (rc:%d) (%s)",
|
||||
hostname.c_str(),
|
||||
@ -311,6 +303,38 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
|
||||
msg.parm[0],
|
||||
iface_name_ptr);
|
||||
}
|
||||
else if ( msg.cmd == MTC_CMD_HOST_SVCS_RESULT )
|
||||
{
|
||||
ilog ("%s %s (rc:%d) (%s)",
|
||||
hostname.c_str(),
|
||||
msg.buf,
|
||||
msg.parm[0],
|
||||
iface_name_ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s %s request ACK (rc:%d) (%s)",
|
||||
hostname.c_str(),
|
||||
get_mtcNodeCommand_str(msg.cmd),
|
||||
msg.parm[0],
|
||||
iface_name_ptr);
|
||||
}
|
||||
}
|
||||
else if ( msg.cmd == MTC_MSG_LOCKED )
|
||||
{
|
||||
mlog ("%s %s request ACK (%s)",
|
||||
hostname.c_str(),
|
||||
get_mtcNodeCommand_str(msg.cmd),
|
||||
iface_name_ptr);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
/* log other command request ACKs that don't have any return parameters */
|
||||
ilog ("%s %s request ACK (%s)",
|
||||
hostname.c_str(),
|
||||
get_mtcNodeCommand_str(msg.cmd),
|
||||
iface_name_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
@ -731,10 +755,8 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict
|
||||
case MTC_CMD_STOP_CONTROL_SVCS:
|
||||
case MTC_CMD_STOP_WORKER_SVCS:
|
||||
case MTC_CMD_STOP_STORAGE_SVCS:
|
||||
case MTC_CMD_START_CONTROL_SVCS:
|
||||
case MTC_CMD_START_WORKER_SVCS:
|
||||
case MTC_CMD_START_STORAGE_SVCS:
|
||||
{
|
||||
ilog ("%s %s command sent", hostname.c_str(), get_mtcNodeCommand_str(cmd));
|
||||
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header() );
|
||||
mtc_cmd.cmd = cmd ;
|
||||
rc = PASS ;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2016, 2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2016, 2024-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -832,7 +832,7 @@ void _scripts_cleanup ( script_set_enum script_set )
|
||||
***************************************************************************/
|
||||
void _manage_services_scripts ( void )
|
||||
{
|
||||
bool failed = false ;
|
||||
int status = PASS ;
|
||||
char str [BUF_SIZE] ;
|
||||
|
||||
if ( ! ctrl.hostservices.scripts )
|
||||
@ -842,6 +842,8 @@ void _manage_services_scripts ( void )
|
||||
return ;
|
||||
}
|
||||
|
||||
string current_cmd = get_mtcNodeCommand_str(ctrl.current_hostservices_command) ;
|
||||
|
||||
memset (str,0,BUF_SIZE);
|
||||
|
||||
/* do if all the scripts are done ? */
|
||||
@ -852,31 +854,32 @@ void _manage_services_scripts ( void )
|
||||
{
|
||||
if ( ctrl.hostservices.script[i].status )
|
||||
{
|
||||
if ( failed == false )
|
||||
if ( status == PASS )
|
||||
{
|
||||
/* only report the first failure */
|
||||
snprintf(str, BUF_SIZE, "%s failed ; rc:%d",
|
||||
ctrl.hostservices.script[i].name.data(),
|
||||
ctrl.hostservices.script[i].status );
|
||||
failed = true ;
|
||||
status = ctrl.hostservices.script[i].status ;
|
||||
break ;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* handle the aggregate status */
|
||||
if ( failed == true )
|
||||
if ( status )
|
||||
{
|
||||
elog ("Host Services: %s\n", str );
|
||||
ilog ("%s result: %s", current_cmd.c_str(), str );
|
||||
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, str );
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("Host Services Complete ; all passed ; %s", get_mtcNodeCommand_str(ctrl.current_hostservices_command));
|
||||
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, NULL );
|
||||
ilog ("%s complete ; all passed", current_cmd.c_str());
|
||||
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, current_cmd.data());
|
||||
}
|
||||
ctrl.active_script_set = NO_SCRIPTS ;
|
||||
}
|
||||
|
||||
/* do if have we timed out ? */
|
||||
/* check for 5 minute timeout */
|
||||
else if ( ctrl.hostservices.timer.ring == true )
|
||||
{
|
||||
bool found = false ;
|
||||
@ -887,9 +890,13 @@ void _manage_services_scripts ( void )
|
||||
{
|
||||
if ( ctrl.hostservices.script[i].done == false )
|
||||
{
|
||||
snprintf(str, BUF_SIZE, "%s (timeout)", ctrl.hostservices.script[i].name.data() );
|
||||
status = FAIL_TIMEOUT ;
|
||||
snprintf(str, BUF_SIZE, "%s timeout", ctrl.hostservices.script[i].name.data() );
|
||||
found = true ;
|
||||
wlog ("host services timeout on %s\n", ctrl.hostservices.script[i].name.c_str());
|
||||
elog ("%s timeout on %s\n",
|
||||
current_cmd.c_str(),
|
||||
ctrl.hostservices.script[i].name.c_str());
|
||||
|
||||
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, str );
|
||||
break ;
|
||||
}
|
||||
@ -906,6 +913,23 @@ void _manage_services_scripts ( void )
|
||||
return ;
|
||||
}
|
||||
|
||||
/* Specify which start host services command failed */
|
||||
if ( status )
|
||||
{
|
||||
if ( ctrl.current_hostservices_command == MTC_CMD_START_CONTROL_SVCS )
|
||||
ctrl.controller_hostservices_result = status ;
|
||||
else if ( ctrl.current_hostservices_command == MTC_CMD_START_WORKER_SVCS )
|
||||
ctrl.worker_hostservices_result = status ;
|
||||
else if ( ctrl.current_hostservices_command == MTC_CMD_START_STORAGE_SVCS )
|
||||
ctrl.storage_hostservices_result = status ;
|
||||
else
|
||||
{
|
||||
slog ("unexpected current hostservices command=%d status=%d",
|
||||
ctrl.current_hostservices_command, status );
|
||||
}
|
||||
}
|
||||
|
||||
mtcTimer_reset (ctrl.hostservices.timer );
|
||||
_scripts_cleanup (ctrl.active_script_set) ;
|
||||
}
|
||||
|
||||
@ -992,6 +1016,7 @@ void _manage_goenabled_tests ( void )
|
||||
|
||||
ilog ("GoEnabled Subfunction Testing Failed ; at least one test failed\n");
|
||||
daemon_log ( GOENABLED_SUBF_FAIL , str );
|
||||
ctrl.goEnable_result_subf = FAIL ;
|
||||
send_mtc_msg ( sock_ptr, MTC_MSG_SUBF_GOENABLED_FAILED, str );
|
||||
break ;
|
||||
}
|
||||
@ -1002,6 +1027,7 @@ void _manage_goenabled_tests ( void )
|
||||
|
||||
ilog ("GoEnabled Testing Failed ; at least one test failed\n");
|
||||
daemon_log ( GOENABLED_MAIN_FAIL , str );
|
||||
ctrl.goEnable_result = FAIL ;
|
||||
send_mtc_msg ( sock_ptr, MTC_MSG_MAIN_GOENABLED_FAILED, str );
|
||||
break ;
|
||||
}
|
||||
@ -1067,6 +1093,7 @@ void _manage_goenabled_tests ( void )
|
||||
daemon_remove_file ( GOENABLED_SUBF_PASS );
|
||||
send_mtc_msg ( sock_ptr, MTC_MSG_SUBF_GOENABLED_FAILED, str );
|
||||
daemon_log ( GOENABLED_SUBF_FAIL , str );
|
||||
ctrl.goEnable_result_subf = FAIL ;
|
||||
break ;
|
||||
}
|
||||
case GOENABLED_MAIN_SCRIPTS:
|
||||
@ -1074,6 +1101,7 @@ void _manage_goenabled_tests ( void )
|
||||
daemon_remove_file ( GOENABLED_SUBF_PASS );
|
||||
send_mtc_msg ( sock_ptr, MTC_MSG_MAIN_GOENABLED_FAILED, str );
|
||||
daemon_log ( GOENABLED_MAIN_FAIL , str );
|
||||
ctrl.goEnable_result = FAIL ;
|
||||
break ;
|
||||
}
|
||||
default:
|
||||
@ -1420,10 +1448,15 @@ void daemon_service_run ( void )
|
||||
int rc = PASS ;
|
||||
int file_not_present_count = 0 ;
|
||||
|
||||
/* Bool to track whether the start host services scripts run has
|
||||
* been attempted at least once since last process startup. */
|
||||
/* Bool to track whether the start host services scripts needs to be run. */
|
||||
bool start_host_services_needs_to_be_run = true ;
|
||||
|
||||
/* Don't start host services if the node is locked */
|
||||
if ( daemon_is_file_present ( NODE_LOCKED_FILE ) == true )
|
||||
{
|
||||
ilog ("locked node");
|
||||
start_host_services_needs_to_be_run = false ;
|
||||
}
|
||||
if ( daemon_is_file_present ( NODE_RESET_FILE ) )
|
||||
{
|
||||
wlog ("mtce reboot required");
|
||||
@ -1948,7 +1981,7 @@ void daemon_service_run ( void )
|
||||
* Need to ensure that the appropriate host
|
||||
* services are started for the system/node
|
||||
* type. */
|
||||
if ( start_host_services_needs_to_be_run == true )
|
||||
if ( start_host_services_needs_to_be_run == true )
|
||||
{
|
||||
if ( ctrl.system_type == SYSTEM_TYPE__NORMAL )
|
||||
{
|
||||
@ -1981,7 +2014,7 @@ void daemon_service_run ( void )
|
||||
ctrl.start_controller_hostservices = true ;
|
||||
if ( ctrl.nodetype & WORKER_TYPE )
|
||||
ctrl.start_worker_hostservices = true ;
|
||||
start_host_services_needs_to_be_run = false ;
|
||||
start_host_services_needs_to_be_run = false ;
|
||||
}
|
||||
else if (( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) ||
|
||||
( daemon_is_file_present ( GOENABLED_SUBF_FAIL ))))
|
||||
@ -2001,7 +2034,7 @@ void daemon_service_run ( void )
|
||||
ctrl.start_worker_hostservices = true ;
|
||||
else if ( ctrl.nodetype & STORAGE_TYPE )
|
||||
ctrl.start_storage_hostservices = true ;
|
||||
start_host_services_needs_to_be_run = false ;
|
||||
start_host_services_needs_to_be_run = false ;
|
||||
}
|
||||
else if ( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) )
|
||||
{
|
||||
@ -2012,7 +2045,6 @@ void daemon_service_run ( void )
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Handle auto start of node personality services.
|
||||
// - prioritize controller first
|
||||
// - prevent more than one being posted at once
|
||||
|
@ -1,7 +1,7 @@
|
||||
#ifndef __INCLUDE_MTCNODECOMP_HH__
|
||||
#define __INCLUDE_MTCNODECOMP_HH__
|
||||
/*
|
||||
* Copyright (c) 2015-2016, 2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2015-2016, 2024-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -139,6 +139,17 @@ typedef struct
|
||||
bool start_worker_hostservices = false ;
|
||||
bool start_storage_hostservices = false ;
|
||||
|
||||
/* Store the result of the last Start Host Services
|
||||
* completion status for each personality. */
|
||||
int controller_hostservices_result = PASS ;
|
||||
int worker_hostservices_result = PASS ;
|
||||
int storage_hostservices_result = PASS ;
|
||||
|
||||
/* Store the result of the last goEnabled completion
|
||||
* status */
|
||||
int goEnable_result = PASS ;
|
||||
int goEnable_result_subf = PASS ;
|
||||
|
||||
/* The script set that is executing */
|
||||
script_set_enum active_script_set ;
|
||||
|
||||
|
@ -69,6 +69,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
||||
return RETRY ;
|
||||
}
|
||||
|
||||
/* Check for a 'delayed self reboot required' condition */
|
||||
if ( this->delayed_swact_required )
|
||||
if ( node_ptr->hostname == this->my_hostname )
|
||||
return ( this->self_fail_handler ( node_ptr ));
|
||||
|
||||
/* manage the host connected state and board management alarms */
|
||||
nodeLinkClass::bmc_handler ( node_ptr );
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -796,7 +796,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* clear all the past enable failure bools */
|
||||
clear_main_failed_bools ( node_ptr );
|
||||
clear_subf_failed_bools ( node_ptr );
|
||||
clear_hostservices_ctls ( node_ptr );
|
||||
|
||||
/* Clear all degrade flags except for the HWMON one */
|
||||
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
||||
@ -829,18 +828,12 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
case MTC_AVAIL_STATUS__INTEST:
|
||||
case MTC_AVAIL_STATUS__FAILED:
|
||||
|
||||
/* enable auto recovery if the inactive controller
|
||||
* is out of service */
|
||||
//if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST ))
|
||||
// node_ptr->ar_disabled = false ;
|
||||
// this->autorecovery_enabled = true ;
|
||||
|
||||
/* fall through */
|
||||
|
||||
case MTC_AVAIL_STATUS__DEGRADED:
|
||||
case MTC_AVAIL_STATUS__AVAILABLE:
|
||||
{
|
||||
if (( is_active_controller ( node_ptr->hostname )) &&
|
||||
if ( ( NOT_SIMPLEX ) && ( is_active_controller ( node_ptr->hostname )) &&
|
||||
( is_inactive_controller_main_insv() == false ))
|
||||
{
|
||||
wlog ("%s recovering active controller from %s-%s-%s\n",
|
||||
@ -1068,6 +1061,12 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->goEnabled = false ;
|
||||
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
|
||||
|
||||
if ( node_ptr->forcing_full_enable == true )
|
||||
{
|
||||
ilog ("%s clearing force full enable recursion prevention flag", node_ptr->hostname.c_str());
|
||||
node_ptr->forcing_full_enable = false ;
|
||||
}
|
||||
|
||||
/* Set uptime to zero in mtce and in the database */
|
||||
node_ptr->uptime_save = 0 ;
|
||||
set_uptime ( node_ptr, 0 , false );
|
||||
@ -1159,8 +1158,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
else
|
||||
{
|
||||
plog ("%s is MTCALIVE (uptime:%d secs)\n",
|
||||
node_ptr->hostname.c_str(), node_ptr->uptime );
|
||||
plog ("%s is MTCALIVE (uptime:%d secs) (oob:%08X)",
|
||||
node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->mtce_flags );
|
||||
if ((NOT_THIS_HOST) &&
|
||||
( node_ptr->uptime > ((unsigned int)(node_ptr->mtcalive_timeout*2))))
|
||||
{
|
||||
@ -1198,7 +1197,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->unlock_cmd_ack = false ;
|
||||
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_UNLOCKED, MGMNT_INTERFACE );
|
||||
|
||||
/* Request Out-Of--Service test execution */
|
||||
/* Request Out-Of-Service test execution */
|
||||
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE );
|
||||
|
||||
/* now officially in the In-Test state */
|
||||
@ -1257,7 +1256,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
node_ptr->goEnabled = false ;
|
||||
|
||||
/* start waiting fhr the ENABLE READY message */
|
||||
/* start waiting for the ENABLE READY message */
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_WAIT );
|
||||
|
||||
break ;
|
||||
@ -1298,7 +1297,24 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_INITIALIZING );
|
||||
|
||||
/* ok. great, got the go-enabled message, lets move on */
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START );
|
||||
|
||||
/* Don't start the self heartbeat for the active controller.
|
||||
* Also, in AIO , hosts that have a controller function also
|
||||
* have a worker function and the heartbeat for those hosts
|
||||
* are started at the end of the subfunction handler. */
|
||||
if (( THIS_HOST ) ||
|
||||
(( AIO_SYSTEM ) && ( is_controller(node_ptr)) ))
|
||||
{
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
|
||||
}
|
||||
else
|
||||
{
|
||||
/* allow the fsm to wait for up to 1 minute for the
|
||||
* hbsClient's ready event before starting heartbeat
|
||||
* test. */
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT );
|
||||
}
|
||||
}
|
||||
else if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
||||
{
|
||||
@ -1327,102 +1343,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
break ;
|
||||
}
|
||||
|
||||
case MTC_ENABLE__HOST_SERVICES_START:
|
||||
{
|
||||
bool start = true ;
|
||||
|
||||
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
|
||||
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
||||
{
|
||||
elog ("%s %s failed ; launch\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
|
||||
node_ptr->hostservices_failed = true ;
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL );
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
|
||||
break ;
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_WAIT );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
|
||||
case MTC_ENABLE__HOST_SERVICES_WAIT:
|
||||
{
|
||||
/* Wait for host services to complete - pass or fail.
|
||||
* The host_services_handler manages timeout. */
|
||||
rc = this->host_services_handler ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
/* wait for the mtcClient's response ... */
|
||||
break ;
|
||||
}
|
||||
else if ( rc != PASS )
|
||||
{
|
||||
node_ptr->hostservices_failed = true ;
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
|
||||
|
||||
/* distinguish 'timeout' from other 'execution' failures */
|
||||
if ( rc == FAIL_TIMEOUT )
|
||||
{
|
||||
elog ("%s %s failed ; timeout\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_MAIN_SERVICE_TO );
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s %s failed ; rc:%d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str(),
|
||||
rc);
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_MAIN_SERVICE_FAIL );
|
||||
}
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
|
||||
break ;
|
||||
}
|
||||
else /* success path */
|
||||
{
|
||||
/* Don't start the self heartbeat for the active controller.
|
||||
* Also, in AIO , hosts that have a controller function also
|
||||
* have a worker function and the heartbeat for those hosts
|
||||
* are started at the end of the subfunction handler. */
|
||||
if (( THIS_HOST ) ||
|
||||
(( AIO_SYSTEM ) && ( is_controller(node_ptr)) ))
|
||||
{
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
|
||||
}
|
||||
else
|
||||
{
|
||||
/* allow the fsm to wait for up to 1 minute for the
|
||||
* hbsClient's ready event before starting heartbeat
|
||||
* test. */
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT );
|
||||
}
|
||||
}
|
||||
break ;
|
||||
}
|
||||
|
||||
case MTC_ENABLE__HEARTBEAT_WAIT:
|
||||
{
|
||||
@ -1708,7 +1628,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* clear all the past enable failure bools */
|
||||
clear_main_failed_bools ( node_ptr );
|
||||
clear_subf_failed_bools ( node_ptr );
|
||||
clear_hostservices_ctls ( node_ptr );
|
||||
|
||||
/* Disable the heartbeat service for Graceful Recovery */
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
||||
@ -2331,77 +2250,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* O.K. clearing the state now that we got it */
|
||||
node_ptr->goEnabled = false ;
|
||||
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_START );
|
||||
}
|
||||
else if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO );
|
||||
|
||||
node_ptr->mtcTimer.ring = false ;
|
||||
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case MTC_RECOVERY__HOST_SERVICES_START:
|
||||
{
|
||||
bool start = true ;
|
||||
|
||||
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
|
||||
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
||||
{
|
||||
elog ("%s %s failed ; launch\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
node_ptr->hostservices_failed = true ;
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_WAIT );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_RECOVERY__HOST_SERVICES_WAIT:
|
||||
{
|
||||
/* Wait for host services to complete - pass or fail.
|
||||
* The host_services_handler manages timeout. */
|
||||
rc = this->host_services_handler ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
/* wait for the mtcClient's response ... */
|
||||
break ;
|
||||
}
|
||||
else if ( rc != PASS )
|
||||
{
|
||||
node_ptr->hostservices_failed = true ;
|
||||
if ( rc == FAIL_TIMEOUT )
|
||||
{
|
||||
elog ("%s %s failed ; timeout\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_START_SERVICE_TO );
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s %s failed ; rc=%d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str(),
|
||||
rc);
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_START_SERVICE_FAIL );
|
||||
}
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
else /* success path */
|
||||
{
|
||||
/* The active controller would never get/be here but
|
||||
* if it did then just fall through to change state. */
|
||||
/* Manage state change */
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
/* Here we need to run the sub-function goenable and start
|
||||
@ -2436,7 +2285,16 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
|
||||
}
|
||||
}
|
||||
break ;
|
||||
else if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO );
|
||||
|
||||
node_ptr->mtcTimer.ring = false ;
|
||||
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
break;
|
||||
}
|
||||
case MTC_RECOVERY__CONFIG_COMPLETE_WAIT:
|
||||
{
|
||||
@ -2504,7 +2362,9 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->goEnabled_subf = false ;
|
||||
|
||||
/* ok. great, got the go-enabled message, lets move on */
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_START );
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
|
||||
}
|
||||
else if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
@ -2520,72 +2380,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
break ;
|
||||
}
|
||||
|
||||
case MTC_RECOVERY__SUBF_SERVICES_START:
|
||||
{
|
||||
bool start = true ;
|
||||
bool subf = true ;
|
||||
|
||||
plog ("%s-worker Starting Host Services\n", node_ptr->hostname.c_str());
|
||||
|
||||
if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
|
||||
{
|
||||
elog ("%s-worker %s failed ; launch\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
node_ptr->hostservices_failed_subf = true ;
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_WAIT );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_RECOVERY__SUBF_SERVICES_WAIT:
|
||||
{
|
||||
/* Wait for host services to complete - pass or fail.
|
||||
* The host_services_handler manages timeout. */
|
||||
rc = this->host_services_handler ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
/* wait for the mtcClient's response ... */
|
||||
break ;
|
||||
}
|
||||
else if ( rc != PASS )
|
||||
{
|
||||
node_ptr->hostservices_failed_subf = true ;
|
||||
if ( rc == FAIL_TIMEOUT )
|
||||
{
|
||||
elog ("%s-worker %s failed ; timeout\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_START_SERVICE_TO );
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s-worker %s failed ; rc=%d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str(),
|
||||
rc);
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_START_SERVICE_FAIL );
|
||||
}
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
else /* success path */
|
||||
{
|
||||
/* allow the fsm to wait for up to 'worker config timeout'
|
||||
* for the hbsClient's ready event before starting heartbeat
|
||||
* test. */
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_RECOVERY__HEARTBEAT_START:
|
||||
{
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
|
||||
@ -2858,7 +2652,6 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* clear all the enable failure bools */
|
||||
clear_main_failed_bools ( node_ptr );
|
||||
clear_subf_failed_bools ( node_ptr );
|
||||
clear_hostservices_ctls ( node_ptr );
|
||||
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__START ) ;
|
||||
disableStageChange ( node_ptr, MTC_DISABLE__DIS_SERVICES_WAIT) ;
|
||||
@ -2973,6 +2766,12 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* proceed to handle force lock if the launch fails */
|
||||
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s %s launched",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str())
|
||||
}
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -6499,26 +6298,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
}
|
||||
}
|
||||
/* default retries counter to zero before START_SERVICES */
|
||||
/* default retries counter to zero before MTC_SERVICES */
|
||||
node_ptr->retries = 0 ;
|
||||
node_ptr->addStage = MTC_ADD__START_SERVICES ;
|
||||
break ;
|
||||
}
|
||||
|
||||
case MTC_ADD__START_SERVICES:
|
||||
{
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
||||
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
|
||||
{
|
||||
ilog ("%s scheduling start host services\n",
|
||||
node_ptr->hostname.c_str());
|
||||
|
||||
node_ptr->start_services_needed = true ;
|
||||
node_ptr->start_services_retries = 0 ;
|
||||
}
|
||||
|
||||
node_ptr->addStage = MTC_ADD__MTC_SERVICES ;
|
||||
break ;
|
||||
}
|
||||
@ -6620,7 +6401,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
* to start host services. */
|
||||
if ( this->dor_mode_active )
|
||||
{
|
||||
node_ptr->start_services_needed_subf = true ;
|
||||
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF );
|
||||
}
|
||||
}
|
||||
@ -7486,22 +7266,6 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
bool start = false ;
|
||||
this->launch_host_services_cmd ( node_ptr, start );
|
||||
}
|
||||
else if ( daemon_want_fit ( FIT_CODE__START_HOST_SERVICES, node_ptr->hostname ))
|
||||
{
|
||||
if (( node_ptr->start_services_needed == false ) &&
|
||||
( node_ptr->start_services_running_main == false ))
|
||||
{
|
||||
node_ptr->start_services_needed = true ;
|
||||
node_ptr->start_services_retries = 0 ;
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s start host services (FIT) rejected (%d:%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->start_services_needed,
|
||||
node_ptr->start_services_running_main);
|
||||
}
|
||||
}
|
||||
|
||||
if (( daemon_is_file_present ( MTC_CMD_FIT__GOENABLE_AUDIT )) &&
|
||||
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
@ -7608,8 +7372,9 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_
|
||||
|
||||
|
||||
// Don't monitor pxeboot mtcAlive messaging while the node is
|
||||
// locked or in the following administrative action states.
|
||||
// locked, disabled or in the following administrative action states.
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) ||
|
||||
( node_ptr->operState == MTC_OPER_STATE__DISABLED ) ||
|
||||
( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ) ||
|
||||
( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE ) ||
|
||||
( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) ||
|
||||
@ -7822,20 +7587,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
case MTC_INSV_TEST__START:
|
||||
{
|
||||
mtcTimer_reset ( node_ptr->insvTestTimer );
|
||||
|
||||
/* Run the inservice test more frequently while
|
||||
* start_services_needed is true and we are not
|
||||
* in failure retry mode */
|
||||
if (( node_ptr->start_services_needed == true ) &&
|
||||
( node_ptr->hostservices_failed == false ) &&
|
||||
( node_ptr->hostservices_failed_subf == false ))
|
||||
{
|
||||
mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, MTC_SECS_2 );
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, insv_test_period );
|
||||
}
|
||||
mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, insv_test_period );
|
||||
insvTestStageChange ( node_ptr, MTC_INSV_TEST__WAIT );
|
||||
break ;
|
||||
}
|
||||
@ -7957,147 +7709,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
"DOR mode active\n");
|
||||
}
|
||||
|
||||
/*************************************************************
|
||||
* Handle Main Function Start Host Services if it's 'needed'
|
||||
************************************************************/
|
||||
else if ( node_ptr->start_services_needed == true )
|
||||
{
|
||||
/* If Main Start Host Services is not already running
|
||||
* then launch it */
|
||||
if ( node_ptr->start_services_running_main == false )
|
||||
{
|
||||
/* Only launch if the node is successfully configured
|
||||
* and tested */
|
||||
if (( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) &&
|
||||
( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
|
||||
( node_ptr->mtce_flags & MTC_FLAG__MAIN_GOENABLED ))
|
||||
{
|
||||
/* Launch 'start' for this node type */
|
||||
bool start = true ;
|
||||
if ( this->launch_host_services_cmd ( node_ptr , start ) != PASS )
|
||||
{
|
||||
/* failed -> retry */
|
||||
node_ptr->hostservices_failed = true ;
|
||||
node_ptr->start_services_running_main = false ;
|
||||
node_ptr->start_services_retries++ ;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* launched successfully */
|
||||
node_ptr->start_services_running_main = true ;
|
||||
node_ptr->hostservices_failed = false ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog("%s start host services ; waiting to launch (%x)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtce_flags);
|
||||
}
|
||||
}
|
||||
/* Handle Main start host services response */
|
||||
else
|
||||
{
|
||||
/* Wait for host services to complete - pass or fail.
|
||||
* The host_services_handler manages timeout. */
|
||||
int rc = this->host_services_handler ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
/* wait for the mtcClient's response ... */
|
||||
break ;
|
||||
}
|
||||
else if ( rc != PASS )
|
||||
{
|
||||
node_ptr->hostservices_failed = true ;
|
||||
node_ptr->start_services_retries++ ;
|
||||
wlog ("%s %s request failed ; (retry %d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str(),
|
||||
node_ptr->start_services_retries);
|
||||
}
|
||||
else /* success path */
|
||||
{
|
||||
node_ptr->start_services_needed = false ;
|
||||
node_ptr->hostservices_failed = false ;
|
||||
node_ptr->start_services_retries = 0 ;
|
||||
}
|
||||
node_ptr->start_services_running_main = false ;
|
||||
}
|
||||
}
|
||||
/*************************************************************
|
||||
* Handle Sub Function Start Host Services if it's 'needed'
|
||||
************************************************************/
|
||||
else if ( node_ptr->start_services_needed_subf == true )
|
||||
{
|
||||
/* If Subf Start Host Services is not already running
|
||||
* then launch it */
|
||||
if ( node_ptr->start_services_running_subf == false )
|
||||
{
|
||||
/* Only launch if the node and subfunction are
|
||||
* successfully configured and tested */
|
||||
if (( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) &&
|
||||
( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
|
||||
( node_ptr->mtce_flags & MTC_FLAG__MAIN_GOENABLED ) &&
|
||||
( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) &&
|
||||
( node_ptr->mtce_flags & MTC_FLAG__SUBF_GOENABLED ))
|
||||
{
|
||||
/* Launch 'start' for this subfunction type */
|
||||
bool start = true ;
|
||||
bool subf = true ;
|
||||
if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
|
||||
{
|
||||
/* failed -> retry */
|
||||
node_ptr->hostservices_failed_subf = true ;
|
||||
node_ptr->start_services_running_subf = false ;
|
||||
node_ptr->start_services_retries++ ;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* launched successfully */
|
||||
node_ptr->hostservices_failed_subf = false ;
|
||||
node_ptr->start_services_running_subf = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog("%s subf start host services ; waiting to launch (%x)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtce_flags);
|
||||
}
|
||||
}
|
||||
/* Handle Subf start host services response */
|
||||
else
|
||||
{
|
||||
/* Wait for host services to complete - pass or fail.
|
||||
* The host_services_handler manages timeout. */
|
||||
int rc = this->host_services_handler ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
/* wait for the mtcClient's response ... */
|
||||
break ;
|
||||
}
|
||||
node_ptr->start_services_running_subf = false ;
|
||||
if ( rc != PASS )
|
||||
{
|
||||
node_ptr->start_services_running_subf = false ;
|
||||
node_ptr->hostservices_failed_subf = true ;
|
||||
node_ptr->start_services_retries++ ;
|
||||
|
||||
wlog ("%s %s request failed ; (retry %d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str(),
|
||||
node_ptr->start_services_retries);
|
||||
}
|
||||
else /* success path */
|
||||
{
|
||||
node_ptr->start_services_needed_subf = false ;
|
||||
node_ptr->hostservices_failed_subf = false ;
|
||||
node_ptr->start_services_running_subf = false ;
|
||||
node_ptr->start_services_retries = 0 ;
|
||||
}
|
||||
node_ptr->start_services_running_subf = false ;
|
||||
}
|
||||
}
|
||||
if ( NOT_THIS_HOST )
|
||||
{
|
||||
if ((( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
||||
@ -8169,8 +7780,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
*
|
||||
**/
|
||||
if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) &&
|
||||
( node_ptr->ar_disabled == false ) &&
|
||||
( node_ptr->start_services_needed == false ))
|
||||
( node_ptr->ar_disabled == false ))
|
||||
{
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ))
|
||||
@ -8197,28 +7807,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Only raise this alarm while in simplex */
|
||||
if (( num_controllers_enabled() < 2 ) &&
|
||||
(( node_ptr->goEnabled_failed_subf == true ) ||
|
||||
( node_ptr->inservice_failed_subf == true ) ||
|
||||
( node_ptr->hostservices_failed_subf == true )))
|
||||
{
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] == FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
wlog ("%s insv test detected subfunction failure ; degrading host\n",
|
||||
node_ptr->hostname.c_str());
|
||||
|
||||
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_MAJOR );
|
||||
|
||||
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
|
||||
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
||||
MTC_AVAIL_STATUS__FAILED );
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Monitor the health of the host */
|
||||
@ -8634,3 +8222,95 @@ int nodeLinkClass::cfg_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/*****************************************************************************
|
||||
* Name : self_fail_handler
|
||||
*
|
||||
* Purpose : Handle force failure of self for Fully DX enabled or SX systems
|
||||
*
|
||||
* Description: Wait for mtcTimer to expire, giving the active controller
|
||||
* time to flush any outstanding state change updates to the
|
||||
* database. Then trigger a force shutdown of SM services.
|
||||
*
|
||||
* Simplex System behavior: issue a lazy reboot
|
||||
* Duplex System behavior : wait for swact to the enabled standby controller.
|
||||
*
|
||||
* Assumptions: Only called in a DX system if the standby controller is enabled.
|
||||
* Do a last second check for the enabled standby controller.
|
||||
* Otherwise, abort and revert back to enabled-degraded.
|
||||
*
|
||||
* Parameters :
|
||||
* @param node_ptr - pointer to this host's nodeLinkClass control structure
|
||||
* @return PASS in every case
|
||||
*****************************************************************************/
|
||||
int nodeLinkClass::self_fail_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
/* Lazy reboot was already commanded ; keep waiting for this Simplex node to reboot */
|
||||
if (this->self_reboot_wait)
|
||||
{
|
||||
ilog_throttled ( node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD,
|
||||
"%s ... waiting on lazy reboot", node_ptr->hostname.c_str());
|
||||
return (PASS);
|
||||
}
|
||||
/* Force swact was already triggered ; wait for SM to shut down the mtcAgent */
|
||||
else if (this->force_swact_wait)
|
||||
{
|
||||
ilog_throttled ( node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD,
|
||||
"%s ... waiting on force swact", node_ptr->hostname.c_str());
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/* mtcTimer has rung ; the wait for the database update is over */
|
||||
else if ( node_ptr->mtcTimer.ring )
|
||||
{
|
||||
// Last second check for an in-service standby (inactive) controller in a non-simplex system
|
||||
if (( NOT_SIMPLEX ) && ( is_inactive_controller_main_insv () == false ))
|
||||
{
|
||||
// TODO: force this test case to verify the abort path
|
||||
wlog ("%s refusing to self reboot with no enabled standby controller", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str());
|
||||
allStateChange ( node_ptr,
|
||||
node_ptr->adminState,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP);
|
||||
this->delayed_swact_required = false ;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Force an uncontrolled SWACT to enabled standby controller */
|
||||
/* Tell SM we are unhealthy so that it shuts down all its services */
|
||||
wlog ("%s forcing SM to shut down services by %s", node_ptr->hostname.c_str(), SMGMT_UNHEALTHY_FILE);
|
||||
daemon_log ( SMGMT_UNHEALTHY_FILE, "Maintenance force swact due to self failure");
|
||||
node_ptr->ar_log_throttle = 0 ;
|
||||
if ( SIMPLEX )
|
||||
{
|
||||
wlog ("%s commanding lazy reboot", node_ptr->hostname.c_str());
|
||||
|
||||
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, MGMNT_INTERFACE) ;
|
||||
|
||||
/* pxeboot network is not currently provisioned in SX ;
|
||||
* send the command on it too if that changes in the future */
|
||||
if ( this->pxeboot_network_provisioned == true )
|
||||
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, PXEBOOT_INTERFACE) ;
|
||||
|
||||
this->self_reboot_wait = true ;
|
||||
}
|
||||
else
|
||||
{
|
||||
this->force_swact_wait = true ;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog_throttled (node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD,
|
||||
"%s ... waiting on database update before %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
SIMPLEX ? "lazy reboot of this simplex system" :
|
||||
"force swact to unlocked-enabled standby controller");
|
||||
}
|
||||
return (PASS);
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2016 Wind River Systems, Inc.
|
||||
* Copyright (c) 2013-2016, 2025 Wind River Systems, Inc.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
@ -192,7 +192,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
alarm_compute_clear ( node_ptr, true );
|
||||
|
||||
/* ok. great, got the go-enabled message, lets move on */
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
|
||||
break ;
|
||||
}
|
||||
ilog ("%s running out-of-service tests\n", name.c_str());
|
||||
@ -214,19 +214,20 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
case MTC_ENABLE__GOENABLED_WAIT:
|
||||
{
|
||||
bool goenable_failed = false ;
|
||||
bool goenable_failed_subf = false ;
|
||||
|
||||
/* search for the Go Enable message */
|
||||
if (( node_ptr->health == NODE_UNHEALTHY ) ||
|
||||
( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) ||
|
||||
( node_ptr->mtce_flags & MTC_FLAG__SUBF_GOENABLE_FAIL) ||
|
||||
( node_ptr->goEnabled_failed_subf == true ))
|
||||
{
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
elog ("%s one or more out-of-service tests failed\n", name.c_str());
|
||||
elog ("%s one or more out-of-service subfunction tests failed\n", name.c_str());
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
|
||||
goenable_failed = true ;
|
||||
goenable_failed_subf = true ;
|
||||
}
|
||||
|
||||
/* search for the Go Enable message */
|
||||
@ -245,17 +246,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
/* ok. great, got the go-enabled message, lets move on */
|
||||
|
||||
if ( node_ptr->start_services_needed_subf == true )
|
||||
{
|
||||
/* If the add_handler set start_services_needed_subf to
|
||||
* true then we bypass inline execution and allow it to
|
||||
* be serviced as a scheduled background operation. */
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
|
||||
}
|
||||
else
|
||||
{
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START );
|
||||
}
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
|
||||
break ;
|
||||
}
|
||||
|
||||
@ -265,14 +256,14 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_TO );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
|
||||
goenable_failed = true ;
|
||||
goenable_failed_subf = true ;
|
||||
}
|
||||
else
|
||||
{
|
||||
; /* wait some more */
|
||||
}
|
||||
|
||||
if ( goenable_failed == true )
|
||||
if ( goenable_failed_subf == true )
|
||||
{
|
||||
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
|
||||
|
||||
@ -284,103 +275,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_ENABLE__HOST_SERVICES_START:
|
||||
{
|
||||
bool start = true ;
|
||||
bool subf = true ;
|
||||
|
||||
plog ("%s %s host services\n",
|
||||
name.c_str(),
|
||||
node_ptr->start_services_needed_subf ? "scheduling start compute" :
|
||||
"starting compute");
|
||||
|
||||
if ( node_ptr->start_services_needed_subf == true )
|
||||
{
|
||||
bool force = true ;
|
||||
|
||||
/* If the add_handler set start_services_needed_subf to
|
||||
* true then we bypass inline execution and allow it to
|
||||
* be serviced as a scheduled background operation. */
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
|
||||
alarm_compute_clear ( node_ptr, force );
|
||||
}
|
||||
|
||||
else if ( launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
|
||||
{
|
||||
wlog ("%s %s failed ; launch\n",
|
||||
name.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
|
||||
node_ptr->hostservices_failed_subf = true ;
|
||||
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL );
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
|
||||
break ;
|
||||
}
|
||||
else
|
||||
{
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_WAIT );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
|
||||
case MTC_ENABLE__HOST_SERVICES_WAIT:
|
||||
{
|
||||
/* Wait for host services to complete - pass or fail.
|
||||
* The host_services_handler manages timeout. */
|
||||
rc = host_services_handler ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
/* wait for the mtcClient's response ... */
|
||||
break ;
|
||||
}
|
||||
else if ( rc != PASS )
|
||||
{
|
||||
node_ptr->hostservices_failed_subf = true ;
|
||||
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
|
||||
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
|
||||
|
||||
|
||||
if ( rc == FAIL_TIMEOUT )
|
||||
{
|
||||
elog ("%s %s failed ; timeout\n",
|
||||
name.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
|
||||
/* Report "Enabling Compute Service Timeout" to sysinv/horizon */
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_TO );
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("%s %s failed ; rc:%d\n",
|
||||
name.c_str(),
|
||||
node_ptr->host_services_req.name.c_str(),
|
||||
rc);
|
||||
|
||||
/* Report "Enabling Compute Service Failed" to sysinv/horizon */
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL );
|
||||
}
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
|
||||
break ;
|
||||
}
|
||||
else /* success path */
|
||||
{
|
||||
alarm_compute_clear ( node_ptr, true );
|
||||
node_ptr->hostservices_failed_subf = false ;
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_ENABLE__HEARTBEAT_CHECK:
|
||||
{
|
||||
if ( THIS_HOST )
|
||||
@ -569,11 +463,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
plog ("%s is ENABLED\n", name.c_str());
|
||||
}
|
||||
|
||||
/* already cleared if true so no need to do it again */
|
||||
if ( node_ptr->start_services_needed_subf != true )
|
||||
{
|
||||
alarm_compute_clear ( node_ptr, force );
|
||||
}
|
||||
alarm_compute_clear ( node_ptr, force );
|
||||
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__DONE );
|
||||
|
||||
|
@ -80,7 +80,7 @@ bmc_reset_delay = 300 ; seconds to wait before issuing a bmc
|
||||
|
||||
http_retry_wait = 10 ; secs to wait between http request retries
|
||||
|
||||
host_add_delay = 20 ; seconds to wait before adding hosts
|
||||
host_add_delay = 0 ; seconds to wait before adding hosts
|
||||
|
||||
[client] ; Client Configuration
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user