diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index 2204a387..61c182d8 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2016, 2023-2024 Wind River Systems, Inc. + * Copyright (c) 2013, 2016, 2023-2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -348,8 +348,6 @@ void mtc_stages_init ( void ) enableStages_str [MTC_ENABLE__GOENABLED_TIMER ] = "GoEnable-Start"; enableStages_str [MTC_ENABLE__GOENABLED_WAIT ] = "GoEnable-Wait"; enableStages_str [MTC_ENABLE__PMOND_READY_WAIT ] = "PmondReady-Wait"; - enableStages_str [MTC_ENABLE__HOST_SERVICES_START ] = "HostServices-Start"; - enableStages_str [MTC_ENABLE__HOST_SERVICES_WAIT ] = "HostServices-Wait"; enableStages_str [MTC_ENABLE__SERVICES_START_WAIT ] = "Services-Start"; enableStages_str [MTC_ENABLE__HEARTBEAT_WAIT ] = "Heartbeat-Wait"; enableStages_str [MTC_ENABLE__HEARTBEAT_SOAK ] = "Heartbeat-Soak"; @@ -375,8 +373,6 @@ void mtc_stages_init ( void ) recoveryStages_str[MTC_RECOVERY__MTCALIVE_WAIT ] = "MtcAlive-Wait"; recoveryStages_str[MTC_RECOVERY__GOENABLED_TIMER ] = "GoEnable-Timer"; recoveryStages_str[MTC_RECOVERY__GOENABLED_WAIT ] = "GoEnable-Wait"; - recoveryStages_str[MTC_RECOVERY__HOST_SERVICES_START] = "HostServices-Start"; - recoveryStages_str[MTC_RECOVERY__HOST_SERVICES_WAIT ] = "HostServices-Wait"; recoveryStages_str[MTC_RECOVERY__CONFIG_COMPLETE_WAIT]= "Compute-Config-Wait"; recoveryStages_str[MTC_RECOVERY__SUBF_GOENABLED_TIMER]= "Subf-GoEnable-Timer"; recoveryStages_str[MTC_RECOVERY__SUBF_GOENABLED_WAIT] = "Subf-GoEnable-Wait"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 77af3c84..be259f6e 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -1,7 +1,7 @@ #ifndef __INCLUDE_NODEBASE_HH__ #define __INCLUDE_NODEBASE_HH__ /* - * Copyright (c) 2013-2020, 2023-2024 Wind River Systems, 
Inc. + * Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -77,18 +77,25 @@ void daemon_exit ( void ); * * These flags are shipped in the parm[2] if the * mtcAlive message from each host. */ -#define MTC_FLAG__I_AM_CONFIGURED (0x00000001) -#define MTC_FLAG__I_AM_NOT_HEALTHY (0x00000002) -#define MTC_FLAG__I_AM_HEALTHY (0x00000004) -#define MTC_FLAG__I_AM_LOCKED (0x00000008) -#define MTC_FLAG__SUBF_CONFIGURED (0x00000010) -#define MTC_FLAG__MAIN_GOENABLED (0x00000020) -#define MTC_FLAG__SUBF_GOENABLED (0x00000040) -#define MTC_FLAG__SM_DEGRADED (0x00000080) -#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */ -#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */ -#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400) -#define MTC_FLAG__SM_UNHEALTHY (0x00001000) +#define MTC_FLAG__I_AM_CONFIGURED (0x00000001) +#define MTC_FLAG__I_AM_NOT_HEALTHY (0x00000002) +#define MTC_FLAG__I_AM_HEALTHY (0x00000004) +#define MTC_FLAG__I_AM_LOCKED (0x00000008) +#define MTC_FLAG__SUBF_CONFIGURED (0x00000010) +#define MTC_FLAG__MAIN_GOENABLED (0x00000020) +#define MTC_FLAG__SUBF_GOENABLED (0x00000040) +#define MTC_FLAG__SM_DEGRADED (0x00000080) +#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */ +#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */ +#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400) +#define MTC_FLAG__SM_UNHEALTHY (0x00001000) +#define MTC_FLAG__RESERVED_2000 (0x00002000) +#define MTC_FLAG__RESERVED_4000 (0x00004000) +#define MTC_FLAG__RESERVED_8000 (0x00008000) +#define MTC_FLAG__MAIN_GOENABLE_FAIL (0x00010000) +#define MTC_FLAG__SUBF_GOENABLE_FAIL (0x00020000) +#define MTC_FLAG__MAIN_SERVICES_FAIL (0x00040000) +#define MTC_FLAG__SUBF_SERVICES_FAIL (0x00080000) #define MTC_UNHEALTHY_THRESHOLD (3) @@ -98,7 +105,7 @@ void daemon_exit ( void ); #define NODE_UNHEALTHY (2) #define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count") -#define TMP_DIR_PATH ((const char 
*)"/etc/mtc/tmp/") +#define MTC_PERSIST_PATH ((const char *)"/var/persist/mtc/") #define HOST_IS_VIRTUAL ((const char *)"/var/run/virtual.host") @@ -159,6 +166,10 @@ void daemon_exit ( void ); #define OPT_PLATFORM_CONFIG_DIR ((const char *)"/opt/platform/config") #define DNSMASQ_HOSTS_FILE ((const char *)"dnsmasq.hosts") +/* maintenance log files */ +#define MTCAGENT_LOG_FILE ((const char *)"/var/log/mtcAgent.log") +#define MTCCLIENT_LOG_FILE ((const char *)"/var/log/mtcClient.log") + /* supported BMC communication protocols ; access method */ typedef enum { @@ -294,8 +305,7 @@ typedef enum #define MTC_TASK_SUBF_CONFIG_TO "Worker Configuration Timeout, re-enabling" #define MTC_TASK_SUBF_INTEST_FAIL "Worker In-Test Failed, re-enabling" #define MTC_TASK_SUBF_INTEST_TO "Worker In-Test Timeout, re-enabling" -#define MTC_TASK_SUBF_SERVICE_FAIL "Worker Start Services Failed, re-enabling" -#define MTC_TASK_SUBF_SERVICE_TO "Worker Start Services Timeout, re-enabling" +#define MTC_TASK_SUBF_SERVICE_FAIL "Start Worker Services Failed, re-enabling" #define MTC_TASK_AR_DISABLED_CONFIG "Configuration failure, threshold reached, Lock/Unlock to retry" #define MTC_TASK_AR_DISABLED_GOENABLE "In-Test Failure, threshold reached, Lock/Unlock to retry" @@ -904,8 +914,6 @@ typedef enum MTC_ENABLE__GOENABLED_TIMER = 12, MTC_ENABLE__GOENABLED_WAIT = 13, MTC_ENABLE__PMOND_READY_WAIT = 14, - MTC_ENABLE__HOST_SERVICES_START = 15, - MTC_ENABLE__HOST_SERVICES_WAIT = 16, MTC_ENABLE__SERVICES_START_WAIT = 17, MTC_ENABLE__HEARTBEAT_WAIT = 18, MTC_ENABLE__HEARTBEAT_SOAK = 19, @@ -987,8 +995,6 @@ typedef enum MTC_RECOVERY__MTCALIVE_WAIT, MTC_RECOVERY__GOENABLED_TIMER, MTC_RECOVERY__GOENABLED_WAIT, - MTC_RECOVERY__HOST_SERVICES_START, - MTC_RECOVERY__HOST_SERVICES_WAIT, /* Subfunction stages */ MTC_RECOVERY__CONFIG_COMPLETE_WAIT, diff --git a/mtce-common/src/daemon/daemon_files.cpp b/mtce-common/src/daemon/daemon_files.cpp index 6799eae3..1a922258 100755 --- a/mtce-common/src/daemon/daemon_files.cpp 
+++ b/mtce-common/src/daemon/daemon_files.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2019 Wind River Systems, Inc. + * Copyright (c) 2013-2019, 2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -1149,9 +1149,12 @@ int daemon_wait_for_file ( const char * filename, int timeout ) int daemon_files_init ( void ) { + struct timespec ts ; + clock_gettime (CLOCK_MONOTONIC, &ts ); + /* Create PID file */ pid_t mypid = getpid(); - ilog ("--- Daemon Start-Up --- pid:%d\n", mypid); + ilog ("--- Daemon Start-Up --- pid:%d uptime:%ld", mypid, ts.tv_sec); daemon_init_fit (); return ( PASS ); } diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index ca86312b..27b66618 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc. + * Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -353,8 +353,6 @@ nodeLinkClass::nodeLinkClass() smgrEvent.buf = NULL ; tokenEvent.buf = NULL ; - unknown_host_throttle = 0 ; - testmode = 0 ; module_init( ); } @@ -365,19 +363,6 @@ nodeLinkClass::~nodeLinkClass() ; } -/* Clear start host service controls */ -void nodeLinkClass::clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr ) -{ - if ( node_ptr ) - { - node_ptr->start_services_needed = false ; - node_ptr->start_services_needed_subf = false ; - node_ptr->start_services_running_main = false ; - node_ptr->start_services_running_subf = false ; - node_ptr->start_services_retries = 0 ; - } -} - /* Clear all the main function enable failure bools */ void nodeLinkClass::clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr ) { @@ -516,7 +501,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->goEnabled = false ; ptr->goEnabled_subf = false ; - clear_hostservices_ctls ( ptr ); /* clear all the enable failure bools */ clear_main_failed_bools ( ptr ); @@ -4103,6 
+4087,11 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg, in { if ( is_host_services_cmd ( msg.cmd ) ) { + if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) + { + dlog3 ("%s ignoring host services result for locked node", hostname.c_str()); + return ; + } /***************************************************** * Host Services Request's Response Handling *****************************************************/ @@ -4116,9 +4105,33 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg, in { if ( !node_ptr->host_services_req.ack ) { - slog ("%s %s without initial command ACK\n", - hostname.c_str(), - node_ptr->host_services_req.name.c_str()); + // parm[0] contains the return code + if ( msg.parm[0] == PASS ) + { + ilog ("%s mtcClient %s ran and passed", hostname.c_str(), msg.buf); + } + else if ( node_ptr->hostservices_failed || node_ptr->hostservices_failed_subf ) + { + ilog ("%s already handling 'host services' failure", hostname.c_str()); + } + else + { + elog ("%s mtcClient %s ran and failed", hostname.c_str(), msg.buf); + if (( msg.cmd != MTC_CMD_STOP_CONTROL_SVCS ) && + ( msg.cmd != MTC_CMD_STOP_WORKER_SVCS ) && + ( msg.cmd != MTC_CMD_STOP_STORAGE_SVCS )) + { + alarm_enabled_failure ( node_ptr, true ); + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__HOST_SERVICES, + MTC_TASK_AR_DISABLED_SERVICES) == PASS ) + { + node_ptr->hostservices_failed = true ; + this->force_full_enable ( node_ptr ); + } + } + } + return ; } node_ptr->host_services_req.rsp = msg.cmd ; if ( msg.buf[0] != '\0' ) @@ -4233,6 +4246,7 @@ unsigned int nodeLinkClass::get_cmd_resp ( string & hostname ) * 1. manage the online/offline state bools * 2. increment the mtcAlive count * 3. set the mtcAlive received bool for the specified interface + * 4. 
handle start host services failures * *****************************************************************************/ void nodeLinkClass::set_mtcAlive ( string & hostname, unsigned int sequence, int iface ) @@ -4456,7 +4470,13 @@ void nodeLinkClass::set_goEnabled_failed ( string & hostname ) node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) { - node_ptr->goEnabled_failed = true ; + if ( node_ptr->goEnabled_failed == false ) + { + node_ptr->goEnabled_failed = true ; + alarm_enabled_failure ( node_ptr, true ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL ); + ar_handler ( node_ptr, MTC_AR_DISABLE_CAUSE__GOENABLE, MTC_TASK_AR_DISABLED_GOENABLE ); + } } } @@ -4489,6 +4509,13 @@ void nodeLinkClass::set_goEnabled_failed_subf ( string & hostname ) node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) { + if ( node_ptr->goEnabled_failed_subf == false ) + { + node_ptr->goEnabled_failed_subf = true ; + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL ); + alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); + ar_handler ( node_ptr, MTC_AR_DISABLE_CAUSE__GOENABLE, MTC_TASK_AR_DISABLED_GOENABLE ); + } node_ptr->goEnabled_failed_subf = true ; } } @@ -4580,6 +4607,85 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface ) else node_ptr->goEnabled = false ; + // Detect and handle 'Go Enable' Failures that come in by + // Out-Of-Band signaling from periodic mtcAlive messaging. + // + // Only take action on the first event while node is + // unlocked-enabled and while 'goEnabled_failed' AND + // 'goEnabled_failed_subf' are false. + // + // These failure bool's are cleared by calls to + // clear_main_failed_bools and clear_subf_failed_bools + // in the enable_handler. 
+ if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && + (( flags & MTC_FLAG__MAIN_GOENABLE_FAIL ) || ( flags & MTC_FLAG__SUBF_GOENABLE_FAIL )) && + (!( node_ptr->goEnabled_failed || node_ptr->goEnabled_failed_subf ))) + { + if ( flags & MTC_FLAG__MAIN_GOENABLE_FAIL ) + { + elog ("%s goEnabled failed (oob:%08X) ; see %s:%s for details", + hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE); + node_ptr->goEnabled_failed = true ; + alarm_enabled_failure ( node_ptr, true ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL ); + } + + if ( flags & MTC_FLAG__SUBF_GOENABLE_FAIL ) + { + ilog ("%s goEnabled subfunction failed (oob:%08X) ; see %s:%s for details", + hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE); + node_ptr->goEnabled_failed_subf = true ; + alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL ); + } + if ( ar_handler ( node_ptr, + MTC_AR_DISABLE_CAUSE__GOENABLE, + MTC_TASK_AR_DISABLED_GOENABLE ) == PASS ) + { + this->force_full_enable ( node_ptr ); + } + } + + // Detect and handle 'Host Services' failures that come in by + // Out-Of-Band signaling from periodic mtcAlive messaging. + // + // Only take action on the first event while node is + // unlocked-enabled and while 'hostservices_failed' AND + // 'hostservices_failed_subf' are false. + // + // These failure bool's are cleared by calls to + // clear_main_failed_bools and clear_subf_failed_bools + // in the enable_handler. 
+ if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && + (( flags & MTC_FLAG__MAIN_SERVICES_FAIL ) || ( flags & MTC_FLAG__SUBF_SERVICES_FAIL )) && + (!( node_ptr->hostservices_failed || node_ptr->hostservices_failed_subf ))) + { + if ( flags & MTC_FLAG__MAIN_SERVICES_FAIL ) + { + elog ("%s start host services failed (oob:%08X) ; see %s:%s for details", + hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE); + node_ptr->hostservices_failed = true ; + alarm_enabled_failure ( node_ptr, true ); + // mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL ); + } + + if ( flags & MTC_FLAG__SUBF_SERVICES_FAIL ) + { + ilog ("%s start host subfunction services failed (oob:%08X) ; see %s:%s for details", + hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE); + node_ptr->hostservices_failed_subf = true ; + alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); + // mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL ); + } + if ( ar_handler ( node_ptr, + MTC_AR_DISABLE_CAUSE__HOST_SERVICES, + MTC_TASK_AR_DISABLED_SERVICES) == PASS ) + { + this->force_full_enable ( node_ptr ); + } + } /* * Fail the inactive controller if the sm unhealthy flag is set. * Degrade for the active controller. @@ -8091,7 +8197,7 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid ) void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr ) { - string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ; + string ar_file = MTC_PERSIST_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ; if ( daemon_is_file_present (ar_file.data())) { wlog ("%s clearing autorecovery file counter\n", node_ptr->hostname.c_str()); @@ -8126,7 +8232,7 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr ) * * Manage Auto Recovery: * - * Case 1: Failed active controller with no enabled inactive controller. 
+ * Case 1: Failed active controller in DX system * * Requires persistent count file and self reboot until threshold * is reached. @@ -8136,9 +8242,13 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr ) * so we don't get a rolling boot loop. * * Auto recovery count is tracked/preserved in a host named auto - * recovery counter file /etc/mtc/tmp/hostname_ar_count. + * recovery counter file /var/persist/mtc/hostname_ar_count. * - * Case 2: All other cases + * Note: This auto recovery count file only applies to SX systems. + * Otherwise, in DX systems a node's auto recovery count + * is tracked in that node's nodeClass data structure. + * + * Case 2: All other cases ; remote hosts and SX systems * * Case 2a: No auto recovery thresholding of active controller in non AIO SX * where the user can't lock and unlock the active controller. @@ -8166,6 +8276,7 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr ) * ******************************************************************************/ +#define FORCE_SWACT_DELAY_SECS (5) int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr, autorecovery_disable_cause_enum cause, string ar_disable_banner ) @@ -8180,6 +8291,12 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr, return (rc); } + if ( node_ptr->forcing_full_enable == true ) + { + wlog ("%s already handling full enable", node_ptr->hostname.c_str()); + return (rc) ; + } + /* check for invalid call case */ if ( cause >= MTC_AR_DISABLE_CAUSE__LAST ) { @@ -8192,98 +8309,201 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr, if ( node_ptr->ar_cause != cause ) node_ptr->ar_cause = cause ; + string ar_file = MTC_PERSIST_PATH + + node_ptr->hostname + + AUTO_RECOVERY_FILE_SUFFIX ; - /* Case 1 check */ - if ( ( THIS_HOST ) && ( is_inactive_controller_main_insv() == false )) + if ( daemon_is_file_present (ar_file.data())) { - /* manage the auto recovery threshold count file */ - unsigned int 
value = 0 ; - - string ar_file = TMP_DIR_PATH + - node_ptr->hostname + - AUTO_RECOVERY_FILE_SUFFIX ; - - if ( daemon_is_file_present (ar_file.data())) - { - /* if the file is there then read the count and increment it */ - value = daemon_get_file_int ( ar_file.data() ); - } - value++ ; - - /* Save the new value in the file */ - daemon_log_value ( ar_file.data(), value ); - - value = daemon_get_file_int ( ar_file.data() ); - - /* set rc to reflect what the caller should do */ - if ( value > this->ar_threshold[node_ptr->ar_cause] ) - { - elog ("%s auto recovery threshold exceeded (%d)\n", - node_ptr->hostname.c_str(), - this->ar_threshold[node_ptr->ar_cause] ); - - node_ptr->ar_disabled = true ; - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); - - allStateChange ( node_ptr, node_ptr->adminState, - MTC_OPER_STATE__ENABLED, - MTC_AVAIL_STATUS__DEGRADED ); - - mtcInvApi_update_task ( node_ptr, ar_disable_banner ); - - return (rc); - } - - wlog ("%s auto recovery (try %d of %d) (%d)", - node_ptr->hostname.c_str(), - value, - this->ar_threshold[node_ptr->ar_cause], - node_ptr->ar_cause); - - mtcInvApi_update_states_now ( node_ptr, "unlocked", - "disabled", "failed", - "disabled", "failed" ); - - lazy_graceful_fs_reboot ( node_ptr ); + /* If the file is there then read the count and increment it */ + node_ptr->ar_count[node_ptr->ar_cause] = daemon_get_file_int ( ar_file.data() ); } - else /* Case 2 */ - { - send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST ); - mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" ); + node_ptr->ar_count[node_ptr->ar_cause]++ ; - if (( NOT_THIS_HOST ) && - ( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX )) + /* Only save the value to a file for SIMPLEX systems. + * Preserving the auto recovery file in DX systems is problematic over + * Swact unless its stored in the active controller mounted filesystem + * which it is not. 
*/ + if ( SIMPLEX ) + daemon_log_value ( ar_file.data(), node_ptr->ar_count[node_ptr->ar_cause] ); + + /* If not simplex then ensure there is no lingering + * file after a simplex to duplex migration */ + else if ( daemon_is_file_present ( ar_file.data() ) ) + daemon_remove_file ( ar_file.data() ); + + /* set rc to reflect what the caller should do */ + if ( node_ptr->ar_count[node_ptr->ar_cause] > this->ar_threshold[node_ptr->ar_cause] ) + { + elog ("%s auto recovery threshold of %d reached - going auto recovery disabled.", + node_ptr->hostname.c_str(), + this->ar_threshold[node_ptr->ar_cause] ); + + node_ptr->ar_disabled = true ; + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + + allStateChange ( node_ptr, node_ptr->adminState, + MTC_OPER_STATE__DISABLED, + MTC_AVAIL_STATUS__FAILED ); + + mtcInvApi_update_task ( node_ptr, ar_disable_banner ); + + return (rc); + } + + /* Case 1: This Host and not simplex system */ + if (( THIS_HOST ) && ( NOT_SIMPLEX )) + { + /* Case 1a - This DX controller with no enabled standby controller - go degraded and no reboot */ + if ( is_inactive_controller_main_insv() == false ) { - if ( ++node_ptr->ar_count[node_ptr->ar_cause] >= - this->ar_threshold [node_ptr->ar_cause] ) - { - elog ("%s auto recovery threshold exceeded (%d)\n", - node_ptr->hostname.c_str(), - this->ar_threshold[node_ptr->ar_cause] ); - node_ptr->ar_disabled = true ; - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); - mtcInvApi_update_task ( node_ptr, ar_disable_banner ); - rc = FAIL ; - } - else - { - wlog ("%s auto recovery (try %d of %d) (%d)", - node_ptr->hostname.c_str(), - node_ptr->ar_count[node_ptr->ar_cause], - this->ar_threshold[node_ptr->ar_cause], - node_ptr->ar_cause); - rc = PASS ; - } + alarm_enabled_failure ( node_ptr, true ); + allStateChange ( node_ptr, + node_ptr->adminState, + MTC_OPER_STATE__ENABLED, + MTC_AVAIL_STATUS__DEGRADED ); + + wlog ("%s refusing to self reboot with no enabled standby controller.", 
node_ptr->hostname.c_str()); + wlog ("%s ... critical enable alarm raised, running enabled but degraded.", node_ptr->hostname.c_str()); + wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str()); + mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP); + rc = FAIL ; } + + /* Case 1b - This DX controller host with an enabled standby controller - force swact and reboot */ else { - wlog ("%s auto recovery\n", node_ptr->hostname.c_str()); - rc = PASS ; + wlog ("%s auto recovery of self (try %d of %d) (%d)", + node_ptr->hostname.c_str(), + node_ptr->ar_count[node_ptr->ar_cause], + this->ar_threshold[node_ptr->ar_cause], + node_ptr->ar_cause); + + mtcInvApi_update_states_now ( node_ptr, "", "disabled", "failed", "", "" ); + + /* Turn off Heartbeat to that host */ + send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST ); + + /* Post critical failure message */ + mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_SWACT_REQ ); + + mtcTimer_reset ( node_ptr->mtcTimer ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, FORCE_SWACT_DELAY_SECS ); + + wlog ("%s force swact in %d seconds ; waiting for database 'disabled-failed' state update", + node_ptr->hostname.c_str(), FORCE_SWACT_DELAY_SECS ); + this->delayed_swact_required = true ; + node_ptr->ar_log_throttle = 0 ; + rc = FAIL ; } } + else /* Case 2 - Not this host , let the caller decide what to do */ + { + wlog ("%s auto recovery (try %d of %d) (%d)", + node_ptr->hostname.c_str(), + node_ptr->ar_count[node_ptr->ar_cause], + this->ar_threshold[node_ptr->ar_cause], + node_ptr->ar_cause); + rc = PASS ; + } return (rc); } +/***************************************************************************** + * + * Name : ar_handler + * + * Purpose : Handle node failure from ar_manage return code + * + * Description: The following cases apply when the failed node is ... + * + * Case 1: Not the active controller + * - Auto recovery disable thresholding applies for applicable + * causes. 
+ * + * Case 2: Active Controller in SIMPLEX or With Enabled Standby + * - Auto Recovery disable applies on Simplex system or DX System + * with enabled standby controller. + * + * Case 3: Active Controller in DX System + * - Auto Recovery disable does not apply to an active controller + * in a DX system that does not have an unlocked-enabled standby + * controller to switch activity to. + * - Logs are produced, host is degraded, alarm is raised and + * node task field is updated. + * - Locking the active controller to recover from an auto + * recovery disabled host is not supported in a DX system. + * + * Parameters : + * + * @param node_ptr: pointer to the nodeLinkClass struct for the failing node + * @param cause : autorecovery_disable_cause_enum enumerated type of the + * failure cause + * @param ar_disable_banner : the auto recovery disable cause string + * + * Returns : PASS only when ar_manage is called and returns PASS. + * Otherwise FAIL (or ar_manage's non-PASS result) ; this includes + * ar_disabled already being set and the Case 3 path + * + *****************************************************************************/ + +int nodeLinkClass::ar_handler ( struct nodeLinkClass::node * node_ptr, + autorecovery_disable_cause_enum cause, + string ar_disable_banner ) +{ + int ar_status = FAIL; + + if ( node_ptr->ar_disabled ) + return ar_status ; + + wlog ("%s handling node failure ; cause:%d", node_ptr->hostname.c_str(), cause ); + + // Case 1: Not the active controller + if ( NOT_THIS_HOST ) + { + if ( ( ar_status = this->ar_manage ( node_ptr, cause, ar_disable_banner ) ) == PASS ) + this->force_full_enable ( node_ptr ); + } + + // Case 2: Active Controller failed on + // - SX system or + // - DX system with enabled standby controller + else if (( SIMPLEX ) || ( this->num_controllers_enabled() > 1 )) + { + if ( ( ar_status = this->ar_manage ( node_ptr, cause, ar_disable_banner ) ) == PASS ) + { + mtcInvApi_update_states_now ( node_ptr, "", "disabled", "failed", "", "" ); + + if ( NOT_SIMPLEX ) + { + /* 
Turn off Heartbeat to that host */ + send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST ); + + /* Update task stating that a Swact is in progress */ + mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_SWACT_REQ ); + } + mtcTimer_reset ( node_ptr->mtcTimer ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, FORCE_SWACT_DELAY_SECS ); + node_ptr->ar_log_throttle = 0 ; + this->delayed_swact_required = true ; + wlog ("%s %s in %d seconds ; waiting for database 'disabled-failed' state update", + node_ptr->hostname.c_str(), + SIMPLEX ? "lazy reboot" : "force swact", + FORCE_SWACT_DELAY_SECS); + } + } + + // Case 3: Active Controller in DX System without enabled standby controller. + else + { + wlog ("%s refusing to self reboot with no enabled standby controller", node_ptr->hostname.c_str()); + wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str()); + wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str()); + mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP); + } + return (ar_status); +} + /**************************************************************************** * * Name : report_dor_recovery @@ -8322,6 +8542,12 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->ar_disabled == true ) return ; + if ( node_ptr->forcing_full_enable == true ) + { + wlog ("%s already handling force full enable", node_ptr->hostname.c_str()); + return ; + } + if ( node_ptr->was_dor_recovery_mode ) { report_dor_recovery ( node_ptr , "is FAILED " ); @@ -8341,6 +8567,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr ) ( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK )) { adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); // no action + node_ptr->forcing_full_enable = true ; } else { @@ -8372,9 +8599,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr ) * * start = True * - * MTC_CMD_START_CONTROL_SVCS - 
* MTC_CMD_START_WORKER_SVCS - * MTC_CMD_START_STORAGE_SVCS + * No Longer Supported * * Returns : PASS = launch success * !PASS = launch failure @@ -8386,32 +8611,22 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_ if ( !node_ptr ) return (FAIL_NULL_POINTER); - /* Initialize the host's command request control structure */ - mtcCmd_init ( node_ptr->host_services_req ); + if ( start == true ) + { + slog ("%s Start Host Services Command Not Supported", node_ptr->hostname.c_str()); + return ( FAIL_INVALID_OPERATION ) ; + } + else + { + /* Initialize the host's command request control structure */ + mtcCmd_init ( node_ptr->host_services_req ); + } /* Service subfunction override first, efficiency. */ if ( subf == true ) { /* only supported subfunction (right now) is COMPUTE */ - if ( start == true ) - node_ptr->host_services_req.cmd = MTC_CMD_START_WORKER_SVCS ; - else - node_ptr->host_services_req.cmd = MTC_CMD_STOP_WORKER_SVCS ; - } - else if ( start == true ) - { - if ( is_controller (node_ptr) ) - node_ptr->host_services_req.cmd = MTC_CMD_START_CONTROL_SVCS ; - else if ( is_worker (node_ptr) ) - node_ptr->host_services_req.cmd = MTC_CMD_START_WORKER_SVCS ; - else if ( is_storage (node_ptr) ) - node_ptr->host_services_req.cmd = MTC_CMD_START_STORAGE_SVCS ; - else - { - slog ("%s start host services is not supported for this host type\n", - node_ptr->hostname.c_str()); - return (FAIL_BAD_CASE) ; - } + node_ptr->host_services_req.cmd = MTC_CMD_STOP_WORKER_SVCS ; } else { @@ -9879,6 +10094,14 @@ void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr ) get_insvTestStages_str(node_ptr->insvTestStage).c_str(), node_ptr->insv_test_count); mem_log (str); + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tGoEnabled Main:%s Subf:%s - Services Main:%s Subf:%s - Force Full Enable Bypass:%s\n", + node_ptr->hostname.c_str(), + node_ptr->goEnabled_failed ? "Fail" : "Ok", + node_ptr->goEnabled_failed_subf ? 
"Fail" : "Ok", + node_ptr->hostservices_failed ? "Fail" : "Ok", + node_ptr->hostservices_failed_subf ? "Fail" : "Ok", + node_ptr->forcing_full_enable ? "Yes" : "No"); + mem_log (str); } void nodeLinkClass::mem_log_thread_info ( struct nodeLinkClass::node * node_ptr ) diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 3f5a7825..11433263 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1,7 +1,7 @@ #ifndef __INCLUDE_NODECLASS_H__ #define __INCLUDE_NODECLASS_H__ /* - * Copyright (c) 2013-2016, 2023-2024 Wind River Systems, Inc. + * Copyright (c) 2013-2016, 2023-2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -67,6 +67,9 @@ using namespace std; #define SIMPLEX \ ( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == true ) +#define NOT_SIMPLEX \ + ( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == false ) + #define THIS_HOST \ ( node_ptr->hostname == this->my_hostname ) @@ -209,19 +212,6 @@ private: int mtcalive_timeout ; - /* start host service retry controls */ - int start_services_retries ; - - bool start_services_running_main ; - bool start_services_running_subf ; - - bool start_services_needed ; - bool start_services_needed_subf ; /* for the add handler that defers - start to the inservice test handler. - this provides a means of telling - maintenance that the subfunction - start needs to also be run. */ - /** Pointer to the previous node in the list */ struct node *prev; @@ -404,8 +394,8 @@ private: /* Boolean indicating the main or subfunction has start host services * failure. */ - bool hostservices_failed ; - bool hostservices_failed_subf ; + bool hostservices_failed = false ; + bool hostservices_failed_subf = false ; /* Boolean indicating the main or subfunction has inservice failure */ bool inservice_failed ; @@ -442,8 +432,12 @@ private: /* throttles the ar_disabled log to periodically indicate auto * recovery disabled state but avoid flooding that same message. 
*/ #define AR_LOG_THROTTLE_THRESHOLD (100000) + #define AR_HANDLER_LOG_THROTTLE_THRESHOLD (1000) unsigned int ar_log_throttle ; + /** Bool to prevent nested force_full_enable and auto recovery management handling */ + bool forcing_full_enable = false ; + /** Host's mtc timer struct. Use to time handler stages. * * reset -> reset command response @@ -876,6 +870,7 @@ private: int stress_handler ( struct nodeLinkClass::node * node_ptr ); int bmc_handler ( struct nodeLinkClass::node * node_ptr ); int degrade_handler ( struct nodeLinkClass::node * node_ptr ); + int self_fail_handler ( struct nodeLinkClass::node * node_ptr ); int uptime_handler ( void ); @@ -985,6 +980,12 @@ private: autorecovery_disable_cause_enum cause, string ar_disable_banner ); + /* handle auto recovery + * - adds common handling functionality on top of ar_manage */ + int ar_handler ( struct nodeLinkClass::node * node_ptr, + autorecovery_disable_cause_enum cause, + string ar_disable_banner); + /** *********************************************************************** * * Name : nodeLinkClass::workQueue_process @@ -1160,7 +1161,6 @@ private: void clear_subf_failed_bools ( struct nodeLinkClass::node * node_ptr ); void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr ); - void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr ); /* Enables/Clears dynamic auto recovery state. start fresh ! * called in disabled_handler (lock) and in the DONE stages @@ -1254,28 +1254,6 @@ private: */ int memory_used ; - /** Inservice memory management audit. - * - * Verifies that the node_ptr list and memory_allocs jive as well - * as all the node pointers point to a node in the linked list. - * - * @return - * an integer representing a PASS or TODO: list other error codes. 
- */ - int memory_audit ( void ); - - - /* Simplex mode auto recovery bools - * - * Set to true when the autorecovery threshold is reached - * and we want to avoid taking further autorecovery action - * even though it may be requested. */ - bool autorecovery_disabled = false ; - - /* Set to true by fault detection methods that are - * autorecoverable when in simplex mode. */ - bool autorecovery_enabled = false ; - /** Tracks the number of hosts that 'are currently' in service trouble * wrt heartbeat (above minor threshold). * This is used in multi-host failure avoidance. @@ -2191,7 +2169,14 @@ public: */ unsigned int ar_interval[MTC_AR_DISABLE_CAUSE__LAST] ; - int unknown_host_throttle ; + /* Used by the auto recovery algorithm for self-reboot. + * This is a flag indicating a delayed self-reboot is required. + * This ensures the FSM enters the self_fail_handler, allowing sufficient time + * for operational and availability state changes to be committed to the database + * before initiating the reboot. */ + bool delayed_swact_required = false ; + bool self_reboot_wait = false ; + bool force_swact_wait = false ; }; /** diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index 44269a76..2c498bb7 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018, 2024 Wind River Systems, Inc. + * Copyright (c) 2013-2018, 2024-2025 Wind River Systems, Inc.
* * SPDX-License-Identifier: Apache-2.0 * @@ -458,7 +458,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) return (PASS); } - /* inform mtcAgent of enhanced ost services support */ + /* inform mtcAgent of enhanced host services support */ msg.parm[1] = MTC_ENHANCED_HOST_SERVICES ; msg.parm[0] = rc ; msg.num = 2 ; @@ -810,14 +810,20 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char if ( mtce_name_ptr ) { - /* add the error message to the message buffer */ + /* add the message to the message buffer */ size_t len = strnlen ( mtce_name_ptr, MAX_MTCE_EVENT_NAME_LEN ); /* We don't use the buffer for mtce events to remove it from the size */ bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len)); snprintf ( &event.buf[0], MAX_MTCE_EVENT_NAME_LEN , "%s", mtce_name_ptr ); - rc = FAIL_OPERATION ; + + // If the supplied mtce_name_ptr string contains 'failed' then set the + // rc to FAIL_OPERATION ; if it contains 'timeout' set rc to FAIL_TIMEOUT + if ( strcasestr (mtce_name_ptr, "failed" ) ) + rc = FAIL_OPERATION ; + if ( strcasestr (mtce_name_ptr, "timeout" ) ) + rc = FAIL_TIMEOUT ; } else { @@ -983,6 +989,38 @@ int create_mtcAlive_msg ( ctrl_type * ctrl_ptr, mtc_message_type & msg, int cmd, } } + /* Set Out-Of-Band goEnable failure flag for goEnable failure.
*/ + if ( ctrl_ptr->goEnable_result ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_GOENABLE_FAIL ; + if ( ctrl_ptr->goEnable_result_subf ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_GOENABLE_FAIL ; + + /* Set the Out-Of-Band Host Services failure + * flag for any start host services that failed */ + if ( ctrl_ptr->storage_hostservices_result ) + { + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ; + dlog3 ("storage start host services failed ; rc:%d", ctrl_ptr->storage_hostservices_result ); + } + else if ( ctrl_ptr->controller_hostservices_result ) + { + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ; + dlog3 ("controller start host services failed ; rc:%d", ctrl_ptr->controller_hostservices_result ); + } + else if ( is_subfunction_worker () ) + { + if ( ctrl_ptr->worker_hostservices_result ) + { + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_SERVICES_FAIL ; + dlog3 ("worker subfunction start host services failed ; rc:%d", ctrl_ptr->worker_hostservices_result ); + } + } + else if ( ctrl_ptr->worker_hostservices_result ) + { + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ; + dlog3 ("worker start host services failed ; rc:%d", ctrl_ptr->worker_hostservices_result ); + } + if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) ) msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ; diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index d9d8033d..d7a29d39 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018, 2023-2024 Wind River Systems, Inc. + * Copyright (c) 2013-2018, 2023-2025 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -295,15 +295,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, if ( msg.num > 0 ) { /* log if not locked message */ - if ( msg.cmd != MTC_MSG_LOCKED ) - { - ilog ("%s %s request ACK (rc:%d) (%s)", - hostname.c_str(), - get_mtcNodeCommand_str(msg.cmd), - msg.parm[0], - iface_name_ptr); - } - else + if ( msg.cmd == MTC_MSG_LOCKED ) { mlog ("%s %s request ACK (rc:%d) (%s)", hostname.c_str(), @@ -311,6 +303,38 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, msg.parm[0], iface_name_ptr); } + else if ( msg.cmd == MTC_CMD_HOST_SVCS_RESULT ) + { + ilog ("%s %s (rc:%d) (%s)", + hostname.c_str(), + msg.buf, + msg.parm[0], + iface_name_ptr); + } + else + { + ilog ("%s %s request ACK (rc:%d) (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + msg.parm[0], + iface_name_ptr); + } + } + else if ( msg.cmd == MTC_MSG_LOCKED ) + { + mlog ("%s %s request ACK (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + iface_name_ptr); + + } + else + { + /* log other command request ACKs that don't have any return parameters */ + ilog ("%s %s request ACK (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + iface_name_ptr); } } @@ -731,10 +755,8 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict case MTC_CMD_STOP_CONTROL_SVCS: case MTC_CMD_STOP_WORKER_SVCS: case MTC_CMD_STOP_STORAGE_SVCS: - case MTC_CMD_START_CONTROL_SVCS: - case MTC_CMD_START_WORKER_SVCS: - case MTC_CMD_START_STORAGE_SVCS: { + ilog ("%s %s command sent", hostname.c_str(), get_mtcNodeCommand_str(cmd)); snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header() ); mtc_cmd.cmd = cmd ; rc = PASS ; diff --git a/mtce/src/maintenance/mtcNodeComp.cpp b/mtce/src/maintenance/mtcNodeComp.cpp index 5a9a5367..6e261e99 100644 --- a/mtce/src/maintenance/mtcNodeComp.cpp +++ b/mtce/src/maintenance/mtcNodeComp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, 2024 Wind River Systems, Inc. 
+ * Copyright (c) 2013-2016, 2024-2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -832,7 +832,7 @@ void _scripts_cleanup ( script_set_enum script_set ) ***************************************************************************/ void _manage_services_scripts ( void ) { - bool failed = false ; + int status = PASS ; char str [BUF_SIZE] ; if ( ! ctrl.hostservices.scripts ) @@ -842,6 +842,8 @@ void _manage_services_scripts ( void ) return ; } + string current_cmd = get_mtcNodeCommand_str(ctrl.current_hostservices_command) ; + memset (str,0,BUF_SIZE); /* do if all the scripts are done ? */ @@ -852,31 +854,32 @@ void _manage_services_scripts ( void ) { if ( ctrl.hostservices.script[i].status ) { - if ( failed == false ) + if ( status == PASS ) { /* only report of the first failure */ snprintf(str, BUF_SIZE, "%s failed ; rc:%d", ctrl.hostservices.script[i].name.data(), ctrl.hostservices.script[i].status ); - failed = true ; + status = ctrl.hostservices.script[i].status ; + break ; } } } /* handle the aggrigate status */ - if ( failed == true ) + if ( status ) { - elog ("Host Services: %s\n", str ); + ilog ("%s result: %s", current_cmd.c_str(), str ); mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, str ); } else { - ilog ("Host Services Complete ; all passed ; %s", get_mtcNodeCommand_str(ctrl.current_hostservices_command)); - mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, NULL ); + ilog ("%s complete ; all passed", current_cmd.c_str()); + mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, current_cmd.data()); } ctrl.active_script_set = NO_SCRIPTS ; } - /* do if have we timed out ? 
*/ + /* check for 5 minute timeout */ else if ( ctrl.hostservices.timer.ring == true ) { bool found = false ; @@ -887,9 +890,13 @@ void _manage_services_scripts ( void ) { if ( ctrl.hostservices.script[i].done == false ) { - snprintf(str, BUF_SIZE, "%s (timeout)", ctrl.hostservices.script[i].name.data() ); + status = FAIL_TIMEOUT ; + snprintf(str, BUF_SIZE, "%s timeout", ctrl.hostservices.script[i].name.data() ); found = true ; - wlog ("host services timeout on %s\n", ctrl.hostservices.script[i].name.c_str()); + elog ("%s timeout on %s\n", + current_cmd.c_str(), + ctrl.hostservices.script[i].name.c_str()); + mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, str ); break ; } @@ -906,6 +913,23 @@ void _manage_services_scripts ( void ) return ; } + /* Specify which start host services command failed */ + if ( status ) + { + if ( ctrl.current_hostservices_command == MTC_CMD_START_CONTROL_SVCS ) + ctrl.controller_hostservices_result = status ; + else if ( ctrl.current_hostservices_command == MTC_CMD_START_WORKER_SVCS ) + ctrl.worker_hostservices_result = status ; + else if ( ctrl.current_hostservices_command == MTC_CMD_START_STORAGE_SVCS ) + ctrl.storage_hostservices_result = status ; + else + { + slog ("unexpected current hostservices command=%d status=%d", + ctrl.current_hostservices_command, status ); + } + } + + mtcTimer_reset (ctrl.hostservices.timer ); _scripts_cleanup (ctrl.active_script_set) ; } @@ -992,6 +1016,7 @@ void _manage_goenabled_tests ( void ) ilog ("GoEnabled Subfunction Testing Failed ; at least one test failed\n"); daemon_log ( GOENABLED_SUBF_FAIL , str ); + ctrl.goEnable_result_subf = FAIL ; send_mtc_msg ( sock_ptr, MTC_MSG_SUBF_GOENABLED_FAILED, str ); break ; } @@ -1002,6 +1027,7 @@ void _manage_goenabled_tests ( void ) ilog ("GoEnabled Testing Failed ; at least one test failed\n"); daemon_log ( GOENABLED_MAIN_FAIL , str ); + ctrl.goEnable_result = FAIL ; send_mtc_msg ( sock_ptr, MTC_MSG_MAIN_GOENABLED_FAILED, str ); break ; } @@ -1067,6 
+1093,7 @@ void _manage_goenabled_tests ( void ) daemon_remove_file ( GOENABLED_SUBF_PASS ); send_mtc_msg ( sock_ptr, MTC_MSG_SUBF_GOENABLED_FAILED, str ); daemon_log ( GOENABLED_SUBF_FAIL , str ); + ctrl.goEnable_result_subf = FAIL ; break ; } case GOENABLED_MAIN_SCRIPTS: @@ -1074,6 +1101,7 @@ void _manage_goenabled_tests ( void ) daemon_remove_file ( GOENABLED_SUBF_PASS ); send_mtc_msg ( sock_ptr, MTC_MSG_MAIN_GOENABLED_FAILED, str ); daemon_log ( GOENABLED_MAIN_FAIL , str ); + ctrl.goEnable_result = FAIL ; break ; } default: @@ -1420,10 +1448,15 @@ void daemon_service_run ( void ) int rc = PASS ; int file_not_present_count = 0 ; - /* Bool to track whether the start host services scripts run has - * been attempted at least once since last process startup. */ + /* Bool to track whether the start host services scripts needs to be run. */ bool start_host_services_needs_to_be_run = true ; + /* Don't start host services if the node is locked */ + if ( daemon_is_file_present ( NODE_LOCKED_FILE ) == true ) + { + ilog ("locked node"); + start_host_services_needs_to_be_run = false ; + } if ( daemon_is_file_present ( NODE_RESET_FILE ) ) { wlog ("mtce reboot required"); @@ -1948,7 +1981,7 @@ void daemon_service_run ( void ) * Need to ensure that the appropriate host * services are started for the system/node * type. 
*/ - if ( start_host_services_needs_to_be_run == true ) + if ( start_host_services_needs_to_be_run == true ) { if ( ctrl.system_type == SYSTEM_TYPE__NORMAL ) { @@ -1981,7 +2014,7 @@ void daemon_service_run ( void ) ctrl.start_controller_hostservices = true ; if ( ctrl.nodetype & WORKER_TYPE ) ctrl.start_worker_hostservices = true ; - start_host_services_needs_to_be_run = false ; + start_host_services_needs_to_be_run = false ; } else if (( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) || ( daemon_is_file_present ( GOENABLED_SUBF_FAIL )))) @@ -2001,7 +2034,7 @@ void daemon_service_run ( void ) ctrl.start_worker_hostservices = true ; else if ( ctrl.nodetype & STORAGE_TYPE ) ctrl.start_storage_hostservices = true ; - start_host_services_needs_to_be_run = false ; + start_host_services_needs_to_be_run = false ; } else if ( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) ) { @@ -2012,7 +2045,6 @@ void daemon_service_run ( void ) } } - // Handle auto start of node personality services. // - prioritize controller first // - prevent more than one being posted at once diff --git a/mtce/src/maintenance/mtcNodeComp.h b/mtce/src/maintenance/mtcNodeComp.h index ed9abb93..6242f250 100644 --- a/mtce/src/maintenance/mtcNodeComp.h +++ b/mtce/src/maintenance/mtcNodeComp.h @@ -1,7 +1,7 @@ #ifndef __INCLUDE_MTCNODECOMP_HH__ #define __INCLUDE_MTCNODECOMP_HH__ /* - * Copyright (c) 2015-2016, 2024 Wind River Systems, Inc. + * Copyright (c) 2015-2016, 2024-2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -139,6 +139,17 @@ typedef struct bool start_worker_hostservices = false ; bool start_storage_hostservices = false ; + /* Store the result of the last Start Host Services + * completion status for each personality. 
*/ + int controller_hostservices_result = PASS ; + int worker_hostservices_result = PASS ; + int storage_hostservices_result = PASS ; + + /* Store the result of the last goEnabled completion + * status */ + int goEnable_result = PASS ; + int goEnable_result_subf = PASS ; + /* The script set that is executing */ script_set_enum active_script_set ; diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index 74fc1a0d..ee11a312 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -69,6 +69,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) return RETRY ; } + /* Check for a 'delayed self reboot required' condition */ + if ( this->delayed_swact_required ) + if ( node_ptr->hostname == this->my_hostname ) + return ( this->self_fail_handler ( node_ptr )); + /* manage the host connected state and board management alarms */ nodeLinkClass::bmc_handler ( node_ptr ); diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index a00152d1..fb17627f 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc. + * Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -796,7 +796,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) /* clear all the past enable failure bools */ clear_main_failed_bools ( node_ptr ); clear_subf_failed_bools ( node_ptr ); - clear_hostservices_ctls ( node_ptr ); /* Clear all degrade flags except for the HWMON one */ clear_host_degrade_causes ( node_ptr->degrade_mask ); @@ -829,18 +828,12 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_AVAIL_STATUS__INTEST: case MTC_AVAIL_STATUS__FAILED: - /* enable auto recovery if the inactive controller - * is out of service */ - //if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST )) - // node_ptr->ar_disabled = false ; - // this->autorecovery_enabled = true ; - /* fall through */ case MTC_AVAIL_STATUS__DEGRADED: case MTC_AVAIL_STATUS__AVAILABLE: { - if (( is_active_controller ( node_ptr->hostname )) && + if ( ( NOT_SIMPLEX ) && ( is_active_controller ( node_ptr->hostname )) && ( is_inactive_controller_main_insv() == false )) { wlog ("%s recovering active controller from %s-%s-%s\n", @@ -1068,6 +1061,12 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->goEnabled = false ; node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ; + if ( node_ptr->forcing_full_enable == true ) + { + ilog ("%s clearing force full enable recursion prevention flag", node_ptr->hostname.c_str()); + node_ptr->forcing_full_enable = false ; + } + /* Set uptime to zero in mtce and in the database */ node_ptr->uptime_save = 0 ; set_uptime ( node_ptr, 0 , false ); @@ -1159,8 +1158,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } else { - plog ("%s is MTCALIVE (uptime:%d secs)\n", - node_ptr->hostname.c_str(), node_ptr->uptime ); + plog ("%s is MTCALIVE (uptime:%d secs) (oob:%08X)", + node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->mtce_flags ); if ((NOT_THIS_HOST) && ( node_ptr->uptime > ((unsigned 
int)(node_ptr->mtcalive_timeout*2)))) { @@ -1198,7 +1197,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->unlock_cmd_ack = false ; send_mtc_cmd ( node_ptr->hostname , MTC_MSG_UNLOCKED, MGMNT_INTERFACE ); - /* Request Out-Of--Service test execution */ + /* Request Out-Of-Service test execution */ send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE ); /* now officially in the In-Test state */ @@ -1257,7 +1256,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->goEnabled = false ; - /* start waiting fhr the ENABLE READY message */ + /* start waiting for the ENABLE READY message */ enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_WAIT ); break ; @@ -1298,7 +1297,24 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) mtcInvApi_update_task ( node_ptr, MTC_TASK_INITIALIZING ); /* ok. great, got the go-enabled message, lets move on */ - enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START ); + + /* Don't start the self heartbeat for the active controller. + * Also, in AIO , hosts that have a controller function also + * have a worker function and the heartbeat for those hosts + * are started at the end of the subfunction handler. */ + if (( THIS_HOST ) || + (( AIO_SYSTEM ) && ( is_controller(node_ptr)) )) + { + enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE ); + } + else + { + /* allow the fsm to wait for up to 1 minute for the + * hbsClient's ready event before starting heartbeat + * test. 
*/ + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 ); + enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT ); + } } else if ( mtcTimer_expired ( node_ptr->mtcTimer )) { @@ -1327,102 +1343,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) break ; } - case MTC_ENABLE__HOST_SERVICES_START: - { - bool start = true ; - - plog ("%s Starting Host Services\n", node_ptr->hostname.c_str()); - if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS ) - { - elog ("%s %s failed ; launch\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str()); - - node_ptr->hostservices_failed = true ; - alarm_enabled_failure ( node_ptr, true ); - enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); - mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL ); - - /* handle auto recovery for this failure */ - if ( ar_manage ( node_ptr, - MTC_AR_DISABLE_CAUSE__HOST_SERVICES, - MTC_TASK_AR_DISABLED_SERVICES ) != PASS ) - break ; - } - else - { - mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING ); - enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_WAIT ); - } - break ; - } - - case MTC_ENABLE__HOST_SERVICES_WAIT: - { - /* Wait for host services to complete - pass or fail. - * The host_services_handler manages timeout. */ - rc = this->host_services_handler ( node_ptr ); - if ( rc == RETRY ) - { - /* wait for the mtcClient's response ... 
*/ - break ; - } - else if ( rc != PASS ) - { - node_ptr->hostservices_failed = true ; - alarm_enabled_failure ( node_ptr, true ); - enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); - - - /* distinguish 'timeout' from other 'execution' failures */ - if ( rc == FAIL_TIMEOUT ) - { - elog ("%s %s failed ; timeout\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str()); - - mtcInvApi_update_task ( node_ptr, - MTC_TASK_MAIN_SERVICE_TO ); - } - else - { - elog ("%s %s failed ; rc:%d\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str(), - rc); - - mtcInvApi_update_task ( node_ptr, - MTC_TASK_MAIN_SERVICE_FAIL ); - } - - /* handle auto recovery for this failure */ - if ( ar_manage ( node_ptr, - MTC_AR_DISABLE_CAUSE__HOST_SERVICES, - MTC_TASK_AR_DISABLED_SERVICES ) != PASS ) - break ; - } - else /* success path */ - { - /* Don't start the self heartbeat for the active controller. - * Also, in AIO , hosts that have a controller function also - * have a worker function and the heartbeat for those hosts - * are started at the end of the subfunction handler. */ - if (( THIS_HOST ) || - (( AIO_SYSTEM ) && ( is_controller(node_ptr)) )) - { - enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE ); - } - else - { - /* allow the fsm to wait for up to 1 minute for the - * hbsClient's ready event before starting heartbeat - * test. 
*/ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 ); - enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT ); - } - } - break ; - } case MTC_ENABLE__HEARTBEAT_WAIT: { @@ -1708,7 +1628,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* clear all the past enable failure bools */ clear_main_failed_bools ( node_ptr ); clear_subf_failed_bools ( node_ptr ); - clear_hostservices_ctls ( node_ptr ); /* Disable the heartbeat service for Graceful Recovery */ send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST ); @@ -2331,77 +2250,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* O.K. clearing the state now that we got it */ node_ptr->goEnabled = false ; - recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_START ); - } - else if ( node_ptr->mtcTimer.ring == true ) - { - elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str()); - mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO ); - - node_ptr->mtcTimer.ring = false ; - - this->force_full_enable ( node_ptr ); - } - break; - } - - case MTC_RECOVERY__HOST_SERVICES_START: - { - bool start = true ; - - plog ("%s Starting Host Services\n", node_ptr->hostname.c_str()); - if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS ) - { - elog ("%s %s failed ; launch\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str()); - node_ptr->hostservices_failed = true ; - this->force_full_enable ( node_ptr ); - } - else - { - recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_WAIT ); - } - break ; - } - case MTC_RECOVERY__HOST_SERVICES_WAIT: - { - /* Wait for host services to complete - pass or fail. - * The host_services_handler manages timeout. */ - rc = this->host_services_handler ( node_ptr ); - if ( rc == RETRY ) - { - /* wait for the mtcClient's response ... 
*/ - break ; - } - else if ( rc != PASS ) - { - node_ptr->hostservices_failed = true ; - if ( rc == FAIL_TIMEOUT ) - { - elog ("%s %s failed ; timeout\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str()); - - mtcInvApi_update_task ( node_ptr, - MTC_TASK_START_SERVICE_TO ); - } - else - { - elog ("%s %s failed ; rc=%d\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str(), - rc); - - mtcInvApi_update_task ( node_ptr, - MTC_TASK_START_SERVICE_FAIL ); - } - this->force_full_enable ( node_ptr ); - } - else /* success path */ - { - /* The active controller would never get/be here but - * if it did then just fall through to change state. */ + /* Manage state change */ if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { /* Here we need to run the sub-fnction goenable and start @@ -2436,7 +2285,16 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE ); } } - break ; + else if ( node_ptr->mtcTimer.ring == true ) + { + elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str()); + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO ); + + node_ptr->mtcTimer.ring = false ; + + this->force_full_enable ( node_ptr ); + } + break; } case MTC_RECOVERY__CONFIG_COMPLETE_WAIT: { @@ -2504,7 +2362,9 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->goEnabled_subf = false ; /* ok. 
great, got the go-enabled message, lets move on */ - recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_START ); + mtcTimer_reset ( node_ptr->mtcTimer ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT ); + recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START ); } else if ( node_ptr->mtcTimer.ring == true ) { @@ -2520,72 +2380,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) break ; } - case MTC_RECOVERY__SUBF_SERVICES_START: - { - bool start = true ; - bool subf = true ; - - plog ("%s-worker Starting Host Services\n", node_ptr->hostname.c_str()); - - if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS ) - { - elog ("%s-worker %s failed ; launch\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str()); - node_ptr->hostservices_failed_subf = true ; - mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL ); - this->force_full_enable ( node_ptr ); - } - else - { - recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_WAIT ); - } - break ; - } - case MTC_RECOVERY__SUBF_SERVICES_WAIT: - { - /* Wait for host services to complete - pass or fail. - * The host_services_handler manages timeout. */ - rc = this->host_services_handler ( node_ptr ); - if ( rc == RETRY ) - { - /* wait for the mtcClient's response ... 
*/ - break ; - } - else if ( rc != PASS ) - { - node_ptr->hostservices_failed_subf = true ; - if ( rc == FAIL_TIMEOUT ) - { - elog ("%s-worker %s failed ; timeout\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str()); - - mtcInvApi_update_task ( node_ptr, - MTC_TASK_START_SERVICE_TO ); - } - else - { - elog ("%s-worker %s failed ; rc=%d\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str(), - rc); - - mtcInvApi_update_task ( node_ptr, - MTC_TASK_START_SERVICE_FAIL ); - } - this->force_full_enable ( node_ptr ); - } - else /* success path */ - { - /* allow the fsm to wait for up to 'worker config timeout' - * for the hbsClient's ready event before starting heartbeat - * test. */ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT ); - recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START ); - } - break ; - } case MTC_RECOVERY__HEARTBEAT_START: { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) @@ -2858,7 +2652,6 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) /* clear all the enable failure bools */ clear_main_failed_bools ( node_ptr ); clear_subf_failed_bools ( node_ptr ); - clear_hostservices_ctls ( node_ptr ); enableStageChange ( node_ptr, MTC_ENABLE__START ) ; disableStageChange ( node_ptr, MTC_DISABLE__DIS_SERVICES_WAIT) ; @@ -2973,6 +2766,12 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) /* proceed to handle force lock if the launch fails */ disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK ); } + else + { + ilog ("%s %s launched", + node_ptr->hostname.c_str(), + node_ptr->host_services_req.name.c_str()) + } } break ; } @@ -6499,26 +6298,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) } } } - /* default retries counter to zero before START_SERVICES */ + /* default retries counter to zero before MTC_SERVICES */ node_ptr->retries = 0 ; - node_ptr->addStage = 
MTC_ADD__START_SERVICES ; - break ; - } - - case MTC_ADD__START_SERVICES: - { - if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && - (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) || - ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))) - { - ilog ("%s scheduling start host services\n", - node_ptr->hostname.c_str()); - - node_ptr->start_services_needed = true ; - node_ptr->start_services_retries = 0 ; - } - node_ptr->addStage = MTC_ADD__MTC_SERVICES ; break ; } @@ -6620,7 +6401,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) * to start host services. */ if ( this->dor_mode_active ) { - node_ptr->start_services_needed_subf = true ; adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF ); } } @@ -7486,22 +7266,6 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr ) bool start = false ; this->launch_host_services_cmd ( node_ptr, start ); } - else if ( daemon_want_fit ( FIT_CODE__START_HOST_SERVICES, node_ptr->hostname )) - { - if (( node_ptr->start_services_needed == false ) && - ( node_ptr->start_services_running_main == false )) - { - node_ptr->start_services_needed = true ; - node_ptr->start_services_retries = 0 ; - } - else - { - ilog ("%s start host services (FIT) rejected (%d:%d)\n", - node_ptr->hostname.c_str(), - node_ptr->start_services_needed, - node_ptr->start_services_running_main); - } - } if (( daemon_is_file_present ( MTC_CMD_FIT__GOENABLE_AUDIT )) && ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && @@ -7608,8 +7372,9 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_ // Don't monitor pxeboot mtcAlive messaging while the node is - // locked or in the following administrative action states. + // locked, disabled or in the following administrative action states. 
if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) || + ( node_ptr->operState == MTC_OPER_STATE__DISABLED ) || ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ) || ( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE ) || ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) || @@ -7822,20 +7587,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_INSV_TEST__START: { mtcTimer_reset ( node_ptr->insvTestTimer ); - - /* Run the inservice test more frequently while - * start_services_needed is true and we are not - * in failure retry mode */ - if (( node_ptr->start_services_needed == true ) && - ( node_ptr->hostservices_failed == false ) && - ( node_ptr->hostservices_failed_subf == false )) - { - mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, MTC_SECS_2 ); - } - else - { - mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, insv_test_period ); - } + mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, insv_test_period ); insvTestStageChange ( node_ptr, MTC_INSV_TEST__WAIT ); break ; } @@ -7957,147 +7709,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) "DOR mode active\n"); } - /************************************************************* - * Handle Main Function Start Host Services if it's 'needed' - ************************************************************/ - else if ( node_ptr->start_services_needed == true ) - { - /* If Main Start Host Services is not already running - * then launch it */ - if ( node_ptr->start_services_running_main == false ) - { - /* Only launch if the node is successfully configured - * and tested */ - if (( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) && - ( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) && - ( node_ptr->mtce_flags & MTC_FLAG__MAIN_GOENABLED )) - { - /* Launch 'start' for this node type */ - bool start = true ; - if ( this->launch_host_services_cmd ( node_ptr , start ) != PASS ) - { - /* failed -> retry */ - 
node_ptr->hostservices_failed = true ; - node_ptr->start_services_running_main = false ; - node_ptr->start_services_retries++ ; - } - else - { - /* launched successfully */ - node_ptr->start_services_running_main = true ; - node_ptr->hostservices_failed = false ; - } - } - else - { - ilog("%s start host services ; waiting to launch (%x)", - node_ptr->hostname.c_str(), - node_ptr->mtce_flags); - } - } - /* Handle Main start host services response */ - else - { - /* Wait for host services to complete - pass or fail. - * The host_services_handler manages timeout. */ - int rc = this->host_services_handler ( node_ptr ); - if ( rc == RETRY ) - { - /* wait for the mtcClient's response ... */ - break ; - } - else if ( rc != PASS ) - { - node_ptr->hostservices_failed = true ; - node_ptr->start_services_retries++ ; - wlog ("%s %s request failed ; (retry %d)\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str(), - node_ptr->start_services_retries); - } - else /* success path */ - { - node_ptr->start_services_needed = false ; - node_ptr->hostservices_failed = false ; - node_ptr->start_services_retries = 0 ; - } - node_ptr->start_services_running_main = false ; - } - } - /************************************************************* - * Handle Sub Function Start Host Services if it's 'needed' - ************************************************************/ - else if ( node_ptr->start_services_needed_subf == true ) - { - /* If Subf Start Host Services is not already running - * then launch it */ - if ( node_ptr->start_services_running_subf == false ) - { - /* Only launch if the node and subfunction are - * successfully configured and tested */ - if (( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) && - ( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) && - ( node_ptr->mtce_flags & MTC_FLAG__MAIN_GOENABLED ) && - ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) && - ( node_ptr->mtce_flags & MTC_FLAG__SUBF_GOENABLED )) - { - /* Launch 'start' for this 
subfunction type */ - bool start = true ; - bool subf = true ; - if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS ) - { - /* failed -> retry */ - node_ptr->hostservices_failed_subf = true ; - node_ptr->start_services_running_subf = false ; - node_ptr->start_services_retries++ ; - } - else - { - /* launched successfully */ - node_ptr->hostservices_failed_subf = false ; - node_ptr->start_services_running_subf = true ; - } - } - else - { - ilog("%s subf start host services ; waiting to launch (%x)", - node_ptr->hostname.c_str(), - node_ptr->mtce_flags); - } - } - /* Handle Subf start host services response */ - else - { - /* Wait for host services to complete - pass or fail. - * The host_services_handler manages timeout. */ - int rc = this->host_services_handler ( node_ptr ); - if ( rc == RETRY ) - { - /* wait for the mtcClient's response ... */ - break ; - } - node_ptr->start_services_running_subf = false ; - if ( rc != PASS ) - { - node_ptr->start_services_running_subf = false ; - node_ptr->hostservices_failed_subf = true ; - node_ptr->start_services_retries++ ; - - wlog ("%s %s request failed ; (retry %d)\n", - node_ptr->hostname.c_str(), - node_ptr->host_services_req.name.c_str(), - node_ptr->start_services_retries); - } - else /* success path */ - { - node_ptr->start_services_needed_subf = false ; - node_ptr->hostservices_failed_subf = false ; - node_ptr->start_services_running_subf = false ; - node_ptr->start_services_retries = 0 ; - } - node_ptr->start_services_running_subf = false ; - } - } if ( NOT_THIS_HOST ) { if ((( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) || @@ -8169,8 +7780,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) * **/ if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) && - ( node_ptr->ar_disabled == false ) && - ( node_ptr->start_services_needed == false )) + ( node_ptr->ar_disabled == false )) { if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) && ( 
node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE )) @@ -8197,28 +7807,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } } } - /* Only raise this alarm while in simplex */ - if (( num_controllers_enabled() < 2 ) && - (( node_ptr->goEnabled_failed_subf == true ) || - ( node_ptr->inservice_failed_subf == true ) || - ( node_ptr->hostservices_failed_subf == true ))) - { - if ( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] == FM_ALARM_SEVERITY_CLEAR ) - { - wlog ("%s insv test detected subfunction failure ; degrading host\n", - node_ptr->hostname.c_str()); - - alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_MAJOR ); - - allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, - MTC_OPER_STATE__ENABLED, - MTC_AVAIL_STATUS__DEGRADED ); - - subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED, - MTC_AVAIL_STATUS__FAILED ); - - } - } } /* Monitor the health of the host */ @@ -8634,3 +8222,95 @@ int nodeLinkClass::cfg_handler ( struct nodeLinkClass::node * node_ptr ) } return (PASS); } + +/***************************************************************************** + * Name : self_fail_handler + * + * Purpose : Handle force failure of self for Fully DX enabled or SX systems + * + * Description: Wait for mtcTimer to expire giving the active controller + * time to flush any outstanding state change updates to the + * database. Then trigger a force shutdown of SM services. + * + * Simplex System behavior: issue a lazy reboot + * Duplex System behavior : wait for swact to the enabled standby controller. + * + * Assumptions: Only called in a DX system if the standby controller is enabled. + * Do a last second check for the enabled standby controller. + * Otherwise, abort and revert back to enabled-degraded.
+ * + * Parameters : + * @param node_ptr - pointer to this host's nodeLinkClass control structure + * + *****************************************************************************/ +int nodeLinkClass::self_fail_handler ( struct nodeLinkClass::node * node_ptr ) +{ + /* Wait for this Simplex node to lazy reboot */ + if (this->self_reboot_wait) + { + ilog_throttled ( node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD, + "%s ... waiting on lazy reboot", node_ptr->hostname.c_str()); + return (PASS); + } + /* Wait for SM to shut down the mtcAgent */ + else if (this->force_swact_wait) + { + ilog_throttled ( node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD, + "%s ... waiting on force swact", node_ptr->hostname.c_str()); + return (PASS); + } + + /* Wait for the database update */ + else if ( node_ptr->mtcTimer.ring ) + { + // Last second check for an active standby controller in a DX system + if (( NOT_SIMPLEX ) && ( is_inactive_controller_main_insv () == false )) + { + // ERIK: TEST ME: Force this test case + wlog ("%s refusing to self reboot with no enabled standby controller", node_ptr->hostname.c_str()); + wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str()); + wlog ("%s ...
recommend enabling a standby controller.", node_ptr->hostname.c_str()); + allStateChange ( node_ptr, + node_ptr->adminState, + MTC_OPER_STATE__ENABLED, + MTC_AVAIL_STATUS__DEGRADED ); + alarm_enabled_failure ( node_ptr, true ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP); + this->delayed_swact_required = false ; + } + else + { + /* Force an uncontrolled SWACT to enabled standby controller */ + /* Tell SM we are unhealthy so that it shuts down all its services */ + wlog ("%s forcing SM to shut down services by %s", node_ptr->hostname.c_str(), SMGMT_UNHEALTHY_FILE); + daemon_log ( SMGMT_UNHEALTHY_FILE, "Maintenance force swact due to self failure"); + node_ptr->ar_log_throttle = 0 ; + if ( SIMPLEX ) + { + wlog ("%s commanding lazy reboot", node_ptr->hostname.c_str()); + + send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, MGMNT_INTERFACE) ; + + /* pxeboot network is not currently provisioned in SX + * auto handle if that changes in the future */ + if ( this->pxeboot_network_provisioned == true ) + send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, PXEBOOT_INTERFACE) ; + + this->self_reboot_wait = true ; + } + else + { + this->force_swact_wait = true ; + } + } + } + else + { + ilog_throttled (node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD, + "%s ... waiting on database update before %s", + node_ptr->hostname.c_str(), + SIMPLEX ? "lazy reboot of this simplex system" : + "force swact to unlocked-enabled standby controller"); + } + return (PASS); +} \ No newline at end of file diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp index a488e922..0210b6f3 100644 --- a/mtce/src/maintenance/mtcSubfHdlrs.cpp +++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016 Wind River Systems, Inc. + * Copyright (c) 2013-2016, 2025 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -192,7 +192,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) alarm_compute_clear ( node_ptr, true ); /* ok. great, got the go-enabled message, lets move on */ - enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START ); + enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK ); break ; } ilog ("%s running out-of-service tests\n", name.c_str()); @@ -214,19 +214,20 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_ENABLE__GOENABLED_WAIT: { - bool goenable_failed = false ; + bool goenable_failed_subf = false ; /* search for the Go Enable message */ if (( node_ptr->health == NODE_UNHEALTHY ) || ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) || + ( node_ptr->mtce_flags & MTC_FLAG__SUBF_GOENABLE_FAIL) || ( node_ptr->goEnabled_failed_subf == true )) { mtcTimer_reset ( node_ptr->mtcTimer ); - elog ("%s one or more out-of-service tests failed\n", name.c_str()); + elog ("%s one or more out-of-service subfunction tests failed\n", name.c_str()); mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL ); enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); - goenable_failed = true ; + goenable_failed_subf = true ; } /* search for the Go Enable message */ @@ -245,17 +246,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) /* ok. great, got the go-enabled message, lets move on */ - if ( node_ptr->start_services_needed_subf == true ) - { - /* If the add_handler set start_services_needed_subf to - * true then we bypass inline execution and allow it to - * be serviced as a scheduled background operation. 
*/ - enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK ); - } - else - { - enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START ); - } + enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK ); break ; } @@ -265,14 +256,14 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_TO ); enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); - goenable_failed = true ; + goenable_failed_subf = true ; } else { ; /* wait some more */ } - if ( goenable_failed == true ) + if ( goenable_failed_subf == true ) { alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); @@ -284,103 +275,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) } break ; } - case MTC_ENABLE__HOST_SERVICES_START: - { - bool start = true ; - bool subf = true ; - - plog ("%s %s host services\n", - name.c_str(), - node_ptr->start_services_needed_subf ? "scheduling start compute" : - "starting compute"); - - if ( node_ptr->start_services_needed_subf == true ) - { - bool force = true ; - - /* If the add_handler set start_services_needed_subf to - * true then we bypass inline execution and allow it to - * be serviced as a scheduled background operation. 
*/ - enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK ); - alarm_compute_clear ( node_ptr, force ); - } - - else if ( launch_host_services_cmd ( node_ptr, start, subf ) != PASS ) - { - wlog ("%s %s failed ; launch\n", - name.c_str(), - node_ptr->host_services_req.name.c_str()); - - node_ptr->hostservices_failed_subf = true ; - alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); - enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); - mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL ); - - /* handle auto recovery for this failure */ - if ( ar_manage ( node_ptr, - MTC_AR_DISABLE_CAUSE__HOST_SERVICES, - MTC_TASK_AR_DISABLED_SERVICES ) != PASS ) - break ; - } - else - { - enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_WAIT ); - } - break ; - } - - case MTC_ENABLE__HOST_SERVICES_WAIT: - { - /* Wait for host services to complete - pass or fail. - * The host_services_handler manages timeout. */ - rc = host_services_handler ( node_ptr ); - if ( rc == RETRY ) - { - /* wait for the mtcClient's response ... 
*/ - break ; - } - else if ( rc != PASS ) - { - node_ptr->hostservices_failed_subf = true ; - alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); - - enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); - - - if ( rc == FAIL_TIMEOUT ) - { - elog ("%s %s failed ; timeout\n", - name.c_str(), - node_ptr->host_services_req.name.c_str()); - - /* Report "Enabling Compute Service Timeout" to sysinv/horizon */ - mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_TO ); - } - else - { - elog ("%s %s failed ; rc:%d\n", - name.c_str(), - node_ptr->host_services_req.name.c_str(), - rc); - - /* Report "Enabling Compute Service Failed" to sysinv/horizon */ - mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL ); - } - - /* handle auto recovery for this failure */ - if ( ar_manage ( node_ptr, - MTC_AR_DISABLE_CAUSE__HOST_SERVICES, - MTC_TASK_AR_DISABLED_SERVICES ) != PASS ) - break ; - } - else /* success path */ - { - alarm_compute_clear ( node_ptr, true ); - node_ptr->hostservices_failed_subf = false ; - enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK ); - } - break ; - } case MTC_ENABLE__HEARTBEAT_CHECK: { if ( THIS_HOST ) @@ -569,11 +463,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) plog ("%s is ENABLED\n", name.c_str()); } - /* already cleared if true so no need to do it again */ - if ( node_ptr->start_services_needed_subf != true ) - { - alarm_compute_clear ( node_ptr, force ); - } + alarm_compute_clear ( node_ptr, force ); enableStageChange ( node_ptr, MTC_ENABLE__DONE ); diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf index 84d6b3f6..f88d6331 100644 --- a/mtce/src/scripts/mtc.conf +++ b/mtce/src/scripts/mtc.conf @@ -80,7 +80,7 @@ bmc_reset_delay = 300 ; seconds to wait before issuing a bmc http_retry_wait = 10 ; secs to wait between http request retries -host_add_delay = 20 ; seconds to wait before adding hosts +host_add_delay = 0 ; seconds to wait before adding hosts 
[client] ; Client Configuration