Merge "Remove Start Host Service Launch in mtcAgent & enhance fault detection"

This commit is contained in:
Zuul 2025-04-10 17:36:02 +00:00 committed by Gerrit Code Review
commit 34207b1895
13 changed files with 709 additions and 818 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2016, 2023-2024 Wind River Systems, Inc.
* Copyright (c) 2013, 2016, 2023-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -348,8 +348,6 @@ void mtc_stages_init ( void )
enableStages_str [MTC_ENABLE__GOENABLED_TIMER ] = "GoEnable-Start";
enableStages_str [MTC_ENABLE__GOENABLED_WAIT ] = "GoEnable-Wait";
enableStages_str [MTC_ENABLE__PMOND_READY_WAIT ] = "PmondReady-Wait";
enableStages_str [MTC_ENABLE__HOST_SERVICES_START ] = "HostServices-Start";
enableStages_str [MTC_ENABLE__HOST_SERVICES_WAIT ] = "HostServices-Wait";
enableStages_str [MTC_ENABLE__SERVICES_START_WAIT ] = "Services-Start";
enableStages_str [MTC_ENABLE__HEARTBEAT_WAIT ] = "Heartbeat-Wait";
enableStages_str [MTC_ENABLE__HEARTBEAT_SOAK ] = "Heartbeat-Soak";
@ -375,8 +373,6 @@ void mtc_stages_init ( void )
recoveryStages_str[MTC_RECOVERY__MTCALIVE_WAIT ] = "MtcAlive-Wait";
recoveryStages_str[MTC_RECOVERY__GOENABLED_TIMER ] = "GoEnable-Timer";
recoveryStages_str[MTC_RECOVERY__GOENABLED_WAIT ] = "GoEnable-Wait";
recoveryStages_str[MTC_RECOVERY__HOST_SERVICES_START] = "HostServices-Start";
recoveryStages_str[MTC_RECOVERY__HOST_SERVICES_WAIT ] = "HostServices-Wait";
recoveryStages_str[MTC_RECOVERY__CONFIG_COMPLETE_WAIT]= "Compute-Config-Wait";
recoveryStages_str[MTC_RECOVERY__SUBF_GOENABLED_TIMER]= "Subf-GoEnable-Timer";
recoveryStages_str[MTC_RECOVERY__SUBF_GOENABLED_WAIT] = "Subf-GoEnable-Wait";

View File

@ -1,7 +1,7 @@
#ifndef __INCLUDE_NODEBASE_HH__
#define __INCLUDE_NODEBASE_HH__
/*
* Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc.
* Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -77,18 +77,25 @@ void daemon_exit ( void );
*
* These flags are shipped in parm[2] of the
* mtcAlive message from each host. */
#define MTC_FLAG__I_AM_CONFIGURED (0x00000001)
#define MTC_FLAG__I_AM_NOT_HEALTHY (0x00000002)
#define MTC_FLAG__I_AM_HEALTHY (0x00000004)
#define MTC_FLAG__I_AM_LOCKED (0x00000008)
#define MTC_FLAG__SUBF_CONFIGURED (0x00000010)
#define MTC_FLAG__MAIN_GOENABLED (0x00000020)
#define MTC_FLAG__SUBF_GOENABLED (0x00000040)
#define MTC_FLAG__SM_DEGRADED (0x00000080)
#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400)
#define MTC_FLAG__SM_UNHEALTHY (0x00001000)
#define MTC_FLAG__I_AM_CONFIGURED (0x00000001)
#define MTC_FLAG__I_AM_NOT_HEALTHY (0x00000002)
#define MTC_FLAG__I_AM_HEALTHY (0x00000004)
#define MTC_FLAG__I_AM_LOCKED (0x00000008)
#define MTC_FLAG__SUBF_CONFIGURED (0x00000010)
#define MTC_FLAG__MAIN_GOENABLED (0x00000020)
#define MTC_FLAG__SUBF_GOENABLED (0x00000040)
#define MTC_FLAG__SM_DEGRADED (0x00000080)
#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400)
#define MTC_FLAG__SM_UNHEALTHY (0x00001000)
#define MTC_FLAG__RESERVED_2000 (0x00002000)
#define MTC_FLAG__RESERVED_4000 (0x00004000)
#define MTC_FLAG__RESERVED_8000 (0x00008000)
#define MTC_FLAG__MAIN_GOENABLE_FAIL (0x00010000)
#define MTC_FLAG__SUBF_GOENABLE_FAIL (0x00020000)
#define MTC_FLAG__MAIN_SERVICES_FAIL (0x00040000)
#define MTC_FLAG__SUBF_SERVICES_FAIL (0x00080000)
#define MTC_UNHEALTHY_THRESHOLD (3)
@ -98,7 +105,7 @@ void daemon_exit ( void );
#define NODE_UNHEALTHY (2)
#define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count")
#define TMP_DIR_PATH ((const char *)"/etc/mtc/tmp/")
#define MTC_PERSIST_PATH ((const char *)"/var/persist/mtc/")
#define HOST_IS_VIRTUAL ((const char *)"/var/run/virtual.host")
@ -159,6 +166,10 @@ void daemon_exit ( void );
#define OPT_PLATFORM_CONFIG_DIR ((const char *)"/opt/platform/config")
#define DNSMASQ_HOSTS_FILE ((const char *)"dnsmasq.hosts")
/* maintenance log files */
#define MTCAGENT_LOG_FILE ((const char *)"/var/log/mtcAgent.log")
#define MTCCLIENT_LOG_FILE ((const char *)"/var/log/mtcClient.log")
/* supported BMC communication protocols ; access method */
typedef enum
{
@ -294,8 +305,7 @@ typedef enum
#define MTC_TASK_SUBF_CONFIG_TO "Worker Configuration Timeout, re-enabling"
#define MTC_TASK_SUBF_INTEST_FAIL "Worker In-Test Failed, re-enabling"
#define MTC_TASK_SUBF_INTEST_TO "Worker In-Test Timeout, re-enabling"
#define MTC_TASK_SUBF_SERVICE_FAIL "Worker Start Services Failed, re-enabling"
#define MTC_TASK_SUBF_SERVICE_TO "Worker Start Services Timeout, re-enabling"
#define MTC_TASK_SUBF_SERVICE_FAIL "Start Worker Services Failed, re-enabling"
#define MTC_TASK_AR_DISABLED_CONFIG "Configuration failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_AR_DISABLED_GOENABLE "In-Test Failure, threshold reached, Lock/Unlock to retry"
@ -904,8 +914,6 @@ typedef enum
MTC_ENABLE__GOENABLED_TIMER = 12,
MTC_ENABLE__GOENABLED_WAIT = 13,
MTC_ENABLE__PMOND_READY_WAIT = 14,
MTC_ENABLE__HOST_SERVICES_START = 15,
MTC_ENABLE__HOST_SERVICES_WAIT = 16,
MTC_ENABLE__SERVICES_START_WAIT = 17,
MTC_ENABLE__HEARTBEAT_WAIT = 18,
MTC_ENABLE__HEARTBEAT_SOAK = 19,
@ -987,8 +995,6 @@ typedef enum
MTC_RECOVERY__MTCALIVE_WAIT,
MTC_RECOVERY__GOENABLED_TIMER,
MTC_RECOVERY__GOENABLED_WAIT,
MTC_RECOVERY__HOST_SERVICES_START,
MTC_RECOVERY__HOST_SERVICES_WAIT,
/* Subfunction stages */
MTC_RECOVERY__CONFIG_COMPLETE_WAIT,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2019 Wind River Systems, Inc.
* Copyright (c) 2013-2019, 2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -1149,9 +1149,12 @@ int daemon_wait_for_file ( const char * filename, int timeout )
/* Daemon start-up initialization.
 *
 * Samples CLOCK_MONOTONIC, logs the start-up banner with this process's
 * pid and then calls daemon_init_fit().
 *
 * Returns: PASS unconditionally. */
int daemon_files_init ( void )
{
/* Monotonic clock sample ; its seconds field is what the start-up banner
 * reports as 'uptime'. The clock_gettime return code is not checked. */
struct timespec ts ;
clock_gettime (CLOCK_MONOTONIC, &ts );
/* Create PID file */
/* NOTE(review): no PID file creation is visible in this function ; only
 * the pid is fetched and logged here - confirm the comment above. */
pid_t mypid = getpid();
/* NOTE(review): this listing is a flattened diff ; the two ilog lines below
 * look like the pre-change and post-change versions of the same start-up
 * banner - confirm that only the second one remains in the real file. */
ilog ("--- Daemon Start-Up --- pid:%d\n", mypid);
ilog ("--- Daemon Start-Up --- pid:%d uptime:%ld", mypid, ts.tv_sec);
daemon_init_fit ();
return ( PASS );
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc.
* Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -353,8 +353,6 @@ nodeLinkClass::nodeLinkClass()
smgrEvent.buf = NULL ;
tokenEvent.buf = NULL ;
unknown_host_throttle = 0 ;
testmode = 0 ;
module_init( );
}
@ -365,19 +363,6 @@ nodeLinkClass::~nodeLinkClass()
;
}
/* Reset every 'start host services' control field of the specified node
 * back to its idle value. A null node pointer is silently ignored. */
void nodeLinkClass::clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr )
{
    /* nothing to reset without a node */
    if ( !node_ptr )
        return ;

    /* no main or subfunction start is pending */
    node_ptr->start_services_needed       = false ;
    node_ptr->start_services_needed_subf  = false ;

    /* no main or subfunction start is in progress */
    node_ptr->start_services_running_main = false ;
    node_ptr->start_services_running_subf = false ;

    /* retry counter starts over */
    node_ptr->start_services_retries      = 0 ;
}
/* Clear all the main function enable failure bools */
void nodeLinkClass::clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr )
{
@ -516,7 +501,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->goEnabled = false ;
ptr->goEnabled_subf = false ;
clear_hostservices_ctls ( ptr );
/* clear all the enable failure bools */
clear_main_failed_bools ( ptr );
@ -4103,6 +4087,11 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg, in
{
if ( is_host_services_cmd ( msg.cmd ) )
{
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
{
dlog3 ("%s ignoring host services result for locked node", hostname.c_str());
return ;
}
/*****************************************************
* Host Services Request's Response Handling
*****************************************************/
@ -4116,9 +4105,33 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg, in
{
if ( !node_ptr->host_services_req.ack )
{
slog ("%s %s without initial command ACK\n",
hostname.c_str(),
node_ptr->host_services_req.name.c_str());
// parm[0] contains the return code
if ( msg.parm[0] == PASS )
{
ilog ("%s mtcClient %s ran and passed", hostname.c_str(), msg.buf);
}
else if ( node_ptr->hostservices_failed || node_ptr->hostservices_failed_subf )
{
ilog ("%s already handling 'host services' failure", hostname.c_str());
}
else
{
elog ("%s mtcClient %s ran and failed", hostname.c_str(), msg.buf);
if (( msg.cmd != MTC_CMD_STOP_CONTROL_SVCS ) &&
( msg.cmd != MTC_CMD_STOP_WORKER_SVCS ) &&
( msg.cmd != MTC_CMD_STOP_STORAGE_SVCS ))
{
alarm_enabled_failure ( node_ptr, true );
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES) == PASS )
{
node_ptr->hostservices_failed = true ;
this->force_full_enable ( node_ptr );
}
}
}
return ;
}
node_ptr->host_services_req.rsp = msg.cmd ;
if ( msg.buf[0] != '\0' )
@ -4233,6 +4246,7 @@ unsigned int nodeLinkClass::get_cmd_resp ( string & hostname )
* 1. manage the online/offline state bools
* 2. increment the mtcAlive count
* 3. set the mtcAlive received bool for the specified interface
* 4. handle start host services failures
*
*****************************************************************************/
void nodeLinkClass::set_mtcAlive ( string & hostname, unsigned int sequence, int iface )
@ -4456,7 +4470,13 @@ void nodeLinkClass::set_goEnabled_failed ( string & hostname )
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->goEnabled_failed = true ;
if ( node_ptr->goEnabled_failed == false )
{
node_ptr->goEnabled_failed = true ;
alarm_enabled_failure ( node_ptr, true );
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL );
ar_handler ( node_ptr, MTC_AR_DISABLE_CAUSE__GOENABLE, MTC_TASK_AR_DISABLED_GOENABLE );
}
}
}
@ -4489,6 +4509,13 @@ void nodeLinkClass::set_goEnabled_failed_subf ( string & hostname )
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
if ( node_ptr->goEnabled_failed_subf == false )
{
node_ptr->goEnabled_failed_subf = true ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL );
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
ar_handler ( node_ptr, MTC_AR_DISABLE_CAUSE__GOENABLE, MTC_TASK_AR_DISABLED_GOENABLE );
}
node_ptr->goEnabled_failed_subf = true ;
}
}
@ -4580,6 +4607,85 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
else
node_ptr->goEnabled = false ;
// Detect and handle 'Go Enable' Failures that come in by
// Out-Of-Band signaling from periodic mtcAlive messaging.
//
// Only take action on the first event while node is
// unlocked-enabled and while 'goEnabled_failed' AND
// 'goEnabled_failed_subf' are false.
//
// These failure bool's are cleared by calls to
// clear_main_failed_bools and clear_subf_failed_bools
// in the enable_handler.
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
(( flags & MTC_FLAG__MAIN_GOENABLE_FAIL ) || ( flags & MTC_FLAG__SUBF_GOENABLE_FAIL )) &&
(!( node_ptr->goEnabled_failed || node_ptr->goEnabled_failed_subf )))
{
if ( flags & MTC_FLAG__MAIN_GOENABLE_FAIL )
{
elog ("%s goEnabled failed (oob:%08X) ; see %s:%s for details",
hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE);
node_ptr->goEnabled_failed = true ;
alarm_enabled_failure ( node_ptr, true );
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL );
}
if ( flags & MTC_FLAG__SUBF_GOENABLE_FAIL )
{
ilog ("%s goEnabled subfunction failed (oob:%08X) ; see %s:%s for details",
hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE);
node_ptr->goEnabled_failed_subf = true ;
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL );
}
if ( ar_handler ( node_ptr,
MTC_AR_DISABLE_CAUSE__GOENABLE,
MTC_TASK_AR_DISABLED_GOENABLE ) == PASS )
{
this->force_full_enable ( node_ptr );
}
}
// Detect and handle 'Host Services' failures that come in by
// Out-Of-Band signaling from periodic mtcAlive messaging.
//
// Only take action on the first event while node is
// unlocked-enabled and while 'hostservices_failed' AND
// 'hostservices_failed_subf' are false.
//
// These failure bool's are cleared by calls to
// clear_main_failed_bools and clear_subf_failed_bools
// in the enable_handler.
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
(( flags & MTC_FLAG__MAIN_SERVICES_FAIL ) || ( flags & MTC_FLAG__SUBF_SERVICES_FAIL )) &&
(!( node_ptr->hostservices_failed || node_ptr->hostservices_failed_subf )))
{
if ( flags & MTC_FLAG__MAIN_SERVICES_FAIL )
{
elog ("%s start host services failed (oob:%08X) ; see %s:%s for details",
hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE);
node_ptr->hostservices_failed = true ;
alarm_enabled_failure ( node_ptr, true );
// mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL );
}
if ( flags & MTC_FLAG__SUBF_SERVICES_FAIL )
{
ilog ("%s start host subfunction services failed (oob:%08X) ; see %s:%s for details",
hostname.c_str(), flags, hostname.c_str(), MTCCLIENT_LOG_FILE);
node_ptr->hostservices_failed_subf = true ;
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
// mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL );
}
if ( ar_handler ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES) == PASS )
{
this->force_full_enable ( node_ptr );
}
}
/*
* Fail the inactive controller if the sm unhealthy flag is set.
* Degrade for the active controller.
@ -8091,7 +8197,7 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid )
void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
{
string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ;
string ar_file = MTC_PERSIST_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ;
if ( daemon_is_file_present (ar_file.data()))
{
wlog ("%s clearing autorecovery file counter\n", node_ptr->hostname.c_str());
@ -8126,7 +8232,7 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
*
* Manage Auto Recovery:
*
* Case 1: Failed active controller with no enabled inactive controller.
* Case 1: Failed active controller in DX system
*
* Requires persistent count file and self reboot until threshold
* is reached.
@ -8136,9 +8242,13 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
* so we don't get a rolling boot loop.
*
* Auto recovery count is tracked/preserved in a host named auto
* recovery counter file /etc/mtc/tmp/hostname_ar_count.
* recovery counter file /var/persist/mtc/<hostname>_ar_count.
*
* Case 2: All other cases
* Note: This auto recovery count file only applies to SX systems.
* Otherwise, in DX systems a node's auto recovery count
* is tracked in that node's nodeClass data structure.
*
* Case 2: All other cases ; remote hosts and SX systems
*
* Case 2a: No auto recovery thresholding of active controller in non AIO SX
* where the user can't lock and unlock the active controller.
@ -8166,6 +8276,7 @@ void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
*
******************************************************************************/
#define FORCE_SWACT_DELAY_SECS (5)
int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
autorecovery_disable_cause_enum cause,
string ar_disable_banner )
@ -8180,6 +8291,12 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
return (rc);
}
if ( node_ptr->forcing_full_enable == true )
{
wlog ("%s already handling full enable", node_ptr->hostname.c_str());
return (rc) ;
}
/* check for invalid call case */
if ( cause >= MTC_AR_DISABLE_CAUSE__LAST )
{
@ -8192,98 +8309,201 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
if ( node_ptr->ar_cause != cause )
node_ptr->ar_cause = cause ;
string ar_file = MTC_PERSIST_PATH +
node_ptr->hostname +
AUTO_RECOVERY_FILE_SUFFIX ;
/* Case 1 check */
if ( ( THIS_HOST ) && ( is_inactive_controller_main_insv() == false ))
if ( daemon_is_file_present (ar_file.data()))
{
/* manage the auto recovery threshold count file */
unsigned int value = 0 ;
string ar_file = TMP_DIR_PATH +
node_ptr->hostname +
AUTO_RECOVERY_FILE_SUFFIX ;
if ( daemon_is_file_present (ar_file.data()))
{
/* if the file is there then read the count and increment it */
value = daemon_get_file_int ( ar_file.data() );
}
value++ ;
/* Save the new value in the file */
daemon_log_value ( ar_file.data(), value );
value = daemon_get_file_int ( ar_file.data() );
/* set rc to reflect what the caller should do */
if ( value > this->ar_threshold[node_ptr->ar_cause] )
{
elog ("%s auto recovery threshold exceeded (%d)\n",
node_ptr->hostname.c_str(),
this->ar_threshold[node_ptr->ar_cause] );
node_ptr->ar_disabled = true ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
allStateChange ( node_ptr, node_ptr->adminState,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
return (rc);
}
wlog ("%s auto recovery (try %d of %d) (%d)",
node_ptr->hostname.c_str(),
value,
this->ar_threshold[node_ptr->ar_cause],
node_ptr->ar_cause);
mtcInvApi_update_states_now ( node_ptr, "unlocked",
"disabled", "failed",
"disabled", "failed" );
lazy_graceful_fs_reboot ( node_ptr );
/* If the file is there then read the count and increment it */
node_ptr->ar_count[node_ptr->ar_cause] = daemon_get_file_int ( ar_file.data() );
}
else /* Case 2 */
{
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" );
node_ptr->ar_count[node_ptr->ar_cause]++ ;
if (( NOT_THIS_HOST ) &&
( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX ))
/* Only save the value to a file for SIMPLEX systems.
* Preserving the auto recovery file in DX systems is problematic over
* Swact unless it is stored in the active controller's mounted filesystem
* which it is not. */
if ( SIMPLEX )
daemon_log_value ( ar_file.data(), node_ptr->ar_count[node_ptr->ar_cause] );
/* If not simplex then ensure there is no lingering
* file after a simplex to duplex migration */
else if ( daemon_is_file_present ( ar_file.data() ) )
daemon_remove_file ( ar_file.data() );
/* set rc to reflect what the caller should do */
if ( node_ptr->ar_count[node_ptr->ar_cause] > this->ar_threshold[node_ptr->ar_cause] )
{
elog ("%s auto recovery threshold of %d reached - going auto recovery disabled.",
node_ptr->hostname.c_str(),
this->ar_threshold[node_ptr->ar_cause] );
node_ptr->ar_disabled = true ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
allStateChange ( node_ptr, node_ptr->adminState,
MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
return (rc);
}
/* Case 1: This Host and not simplex system */
if (( THIS_HOST ) && ( NOT_SIMPLEX ))
{
/* Case 1a - This DX controller with no enabled standby controller - go degraded and no reboot */
if ( is_inactive_controller_main_insv() == false )
{
if ( ++node_ptr->ar_count[node_ptr->ar_cause] >=
this->ar_threshold [node_ptr->ar_cause] )
{
elog ("%s auto recovery threshold exceeded (%d)\n",
node_ptr->hostname.c_str(),
this->ar_threshold[node_ptr->ar_cause] );
node_ptr->ar_disabled = true ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
rc = FAIL ;
}
else
{
wlog ("%s auto recovery (try %d of %d) (%d)",
node_ptr->hostname.c_str(),
node_ptr->ar_count[node_ptr->ar_cause],
this->ar_threshold[node_ptr->ar_cause],
node_ptr->ar_cause);
rc = PASS ;
}
alarm_enabled_failure ( node_ptr, true );
allStateChange ( node_ptr,
node_ptr->adminState,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
wlog ("%s refusing to self reboot with no enabled standby controller.", node_ptr->hostname.c_str());
wlog ("%s ... critical enable alarm raised, running enabled but degraded.", node_ptr->hostname.c_str());
wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP);
rc = FAIL ;
}
/* Case 1b - This DX controller host with an enabled standby controller - force swact and reboot */
else
{
wlog ("%s auto recovery\n", node_ptr->hostname.c_str());
rc = PASS ;
wlog ("%s auto recovery of self (try %d of %d) (%d)",
node_ptr->hostname.c_str(),
node_ptr->ar_count[node_ptr->ar_cause],
this->ar_threshold[node_ptr->ar_cause],
node_ptr->ar_cause);
mtcInvApi_update_states_now ( node_ptr, "", "disabled", "failed", "", "" );
/* Turn off Heartbeat to that host */
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
/* Post critical failure message */
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_SWACT_REQ );
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, FORCE_SWACT_DELAY_SECS );
wlog ("%s force swact in %d seconds ; waiting for database 'disabled-failed' state update",
node_ptr->hostname.c_str(), FORCE_SWACT_DELAY_SECS );
this->delayed_swact_required = true ;
node_ptr->ar_log_throttle = 0 ;
rc = FAIL ;
}
}
else /* Case 2 - Not this host , let the caller decide what to do */
{
wlog ("%s auto recovery (try %d of %d) (%d)",
node_ptr->hostname.c_str(),
node_ptr->ar_count[node_ptr->ar_cause],
this->ar_threshold[node_ptr->ar_cause],
node_ptr->ar_cause);
rc = PASS ;
}
return (rc);
}
/*****************************************************************************
*
* Name : ar_handler
*
* Purpose : Handle node failure from ar_manage return code
*
* Description: The following cases apply when the failed node is ...
*
* Case 1: Not the active controller
* - Auto recovery disable thresholding applies for applicable
* causes.
* - A full enable is forced when ar_manage returns PASS.
*
* Case 2: Active Controller in SIMPLEX or With Enabled Standby
* - Auto Recovery disable applies on Simplex system or DX System
* with enabled standby controller.
* - When ar_manage returns PASS the host is set 'disabled-failed',
* the node's mtcTimer is started for FORCE_SWACT_DELAY_SECS and
* 'delayed_swact_required' is set.
*
* Case 3: Active Controller in DX System
* - Auto Recovery disable does not apply to an active controller
* in a DX system that does not have an unlocked-enabled standby
* controller to switch activity to.
* - Logs are produced, host is degraded, alarm is raised and
* node task field is updated.
* NOTE(review): this function itself only logs and updates the
* task field for this case ; the degrade and alarm are presumably
* handled by the caller - confirm.
* - Locking the active controller to recover from an auto
* recovery disabled host is not supported in a DX system.
*
* Parameters :
*
* @param node_ptr: pointer to the nodeLinkClass struct for the failing node
* @param cause : autorecovery_disable_cause_enum enumerated type of the
* failure cause
* @param ar_disable_banner : the auto recovery disable cause string
*
* Returns : The ar_manage return code for Case 1 and Case 2.
* FAIL without calling ar_manage if the node's ar_disabled
* is already true or in Case 3.
*
*****************************************************************************/
int nodeLinkClass::ar_handler ( struct nodeLinkClass::node * node_ptr,
autorecovery_disable_cause_enum cause,
string ar_disable_banner )
{
/* Default to FAIL ; only an ar_manage result can change it */
int ar_status = FAIL;
/* Nothing to handle if auto recovery is already disabled for this node */
if ( node_ptr->ar_disabled )
return ar_status ;
wlog ("%s handling node failure ; cause:%d", node_ptr->hostname.c_str(), cause );
// Case 1: Not the active controller
if ( NOT_THIS_HOST )
{
/* Force the full enable only if ar_manage allows another recovery attempt */
if ( ( ar_status = this->ar_manage ( node_ptr, cause, ar_disable_banner ) ) == PASS )
this->force_full_enable ( node_ptr );
}
// Case 2: Active Controller failed on
// - SX system or
// - DX system with enabled standby controller
else if (( SIMPLEX ) || ( this->num_controllers_enabled() > 1 ))
{
if ( ( ar_status = this->ar_manage ( node_ptr, cause, ar_disable_banner ) ) == PASS )
{
/* Push the 'disabled-failed' state update before the delayed action */
mtcInvApi_update_states_now ( node_ptr, "", "disabled", "failed", "", "" );
if ( NOT_SIMPLEX )
{
/* Turn off Heartbeat to that host */
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
/* Update task stating that a Swact is in progress */
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_SWACT_REQ );
}
/* Restart the node's mtcTimer for the delay period and flag that a
 * delayed action is required once it expires. */
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, FORCE_SWACT_DELAY_SECS );
node_ptr->ar_log_throttle = 0 ;
this->delayed_swact_required = true ;
wlog ("%s %s in %d seconds ; waiting for database 'disabled-failed' state update",
node_ptr->hostname.c_str(),
SIMPLEX ? "lazy reboot" : "force swact",
FORCE_SWACT_DELAY_SECS);
}
}
// Case 3: Active Controller in DX System without enabled standby controller.
else
{
/* ar_manage is not called here so ar_status remains FAIL */
wlog ("%s refusing to self reboot with no enabled standby controller", node_ptr->hostname.c_str());
wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str());
wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP);
}
return (ar_status);
}
/****************************************************************************
*
* Name : report_dor_recovery
@ -8322,6 +8542,12 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->ar_disabled == true )
return ;
if ( node_ptr->forcing_full_enable == true )
{
wlog ("%s already handling force full enable", node_ptr->hostname.c_str());
return ;
}
if ( node_ptr->was_dor_recovery_mode )
{
report_dor_recovery ( node_ptr , "is FAILED " );
@ -8341,6 +8567,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
{
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); // no action
node_ptr->forcing_full_enable = true ;
}
else
{
@ -8372,9 +8599,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
*
* start = True
*
* MTC_CMD_START_CONTROL_SVCS
* MTC_CMD_START_WORKER_SVCS
* MTC_CMD_START_STORAGE_SVCS
* No Longer Supported
*
* Returns : PASS = launch success
* !PASS = launch failure
@ -8386,32 +8611,22 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_
if ( !node_ptr )
return (FAIL_NULL_POINTER);
/* Initialize the host's command request control structure */
mtcCmd_init ( node_ptr->host_services_req );
if ( start == true )
{
slog ("%s Start Host Services Command Not Supported", node_ptr->hostname.c_str());
return ( FAIL_INVALID_OPERATION ) ;
}
else
{
/* Initialize the host's command request control structure */
mtcCmd_init ( node_ptr->host_services_req );
}
/* Service subfunction override first, efficiency. */
if ( subf == true )
{
/* only supported subfunction (right now) is COMPUTE */
if ( start == true )
node_ptr->host_services_req.cmd = MTC_CMD_START_WORKER_SVCS ;
else
node_ptr->host_services_req.cmd = MTC_CMD_STOP_WORKER_SVCS ;
}
else if ( start == true )
{
if ( is_controller (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_START_CONTROL_SVCS ;
else if ( is_worker (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_START_WORKER_SVCS ;
else if ( is_storage (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_START_STORAGE_SVCS ;
else
{
slog ("%s start host services is not supported for this host type\n",
node_ptr->hostname.c_str());
return (FAIL_BAD_CASE) ;
}
node_ptr->host_services_req.cmd = MTC_CMD_STOP_WORKER_SVCS ;
}
else
{
@ -9879,6 +10094,14 @@ void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
node_ptr->insv_test_count);
mem_log (str);
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tGoEnabled Main:%s Subf:%s - Services Main:%s Subf:%s - Force Full Enable Bypass:%s\n",
node_ptr->hostname.c_str(),
node_ptr->goEnabled_failed ? "Fail" : "Ok",
node_ptr->goEnabled_failed_subf ? "Fail" : "Ok",
node_ptr->hostservices_failed ? "Fail" : "Ok",
node_ptr->hostservices_failed_subf ? "Fail" : "Ok",
node_ptr->forcing_full_enable ? "Yes" : "No");
mem_log (str);
}
void nodeLinkClass::mem_log_thread_info ( struct nodeLinkClass::node * node_ptr )

View File

@ -1,7 +1,7 @@
#ifndef __INCLUDE_NODECLASS_H__
#define __INCLUDE_NODECLASS_H__
/*
* Copyright (c) 2013-2016, 2023-2024 Wind River Systems, Inc.
* Copyright (c) 2013-2016, 2023-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -67,6 +67,9 @@ using namespace std;
#define SIMPLEX \
( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == true )
#define NOT_SIMPLEX \
( daemon_is_file_present ( PLATFORM_SIMPLEX_MODE ) == false )
#define THIS_HOST \
( node_ptr->hostname == this->my_hostname )
@ -209,19 +212,6 @@ private:
int mtcalive_timeout ;
/* start host service retry controls */
int start_services_retries ;
bool start_services_running_main ;
bool start_services_running_subf ;
bool start_services_needed ;
bool start_services_needed_subf ; /* for the add handler that defers
start to the inservice test handler.
this provides a means of telling
maintenance that the subfunction
start needs to also be run. */
/** Pointer to the previous node in the list */
struct node *prev;
@ -404,8 +394,8 @@ private:
/* Boolean indicating the main or subfunction has start host services
* failure. */
bool hostservices_failed ;
bool hostservices_failed_subf ;
bool hostservices_failed = false ;
bool hostservices_failed_subf = false ;
/* Boolean indicating the main or subfunction has inservice failure */
bool inservice_failed ;
@ -442,8 +432,12 @@ private:
/* throttles the ar_disabled log to periodically indicate auto
* recovery disabled state but avoid flooding that same message. */
#define AR_LOG_THROTTLE_THRESHOLD (100000)
#define AR_HANDLER_LOG_THROTTLE_THRESHOLD (1000)
unsigned int ar_log_throttle ;
/** Bool to prevent nested force_full_enable and auto recovery management handling */
bool forcing_full_enable = false ;
/** Host's mtc timer struct. Use to time handler stages.
*
* reset -> reset command response
@ -876,6 +870,7 @@ private:
int stress_handler ( struct nodeLinkClass::node * node_ptr );
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
int degrade_handler ( struct nodeLinkClass::node * node_ptr );
int self_fail_handler ( struct nodeLinkClass::node * node_ptr );
int uptime_handler ( void );
@ -985,6 +980,12 @@ private:
autorecovery_disable_cause_enum cause,
string ar_disable_banner );
/* handle auto recovery
* - adds common handling functionality on top of ar_manage */
int ar_handler ( struct nodeLinkClass::node * node_ptr,
autorecovery_disable_cause_enum cause,
string ar_disable_banner);
/** ***********************************************************************
*
* Name : nodeLinkClass::workQueue_process
@ -1160,7 +1161,6 @@ private:
void clear_subf_failed_bools ( struct nodeLinkClass::node * node_ptr );
void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr );
void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr );
/* Enables/Clears dynamic auto recovery state. start fresh !
* called in disabled_handler (lock) and in the DONE stages
@ -1254,28 +1254,6 @@ private:
*/
int memory_used ;
/** Inservice memory management audit.
*
* Verifies that the node_ptr list and memory_allocs jive as well
* as all the node pointers point to a node in the linked list.
*
* @return
* an integer representing a PASS or TODO: list other error codes.
*/
int memory_audit ( void );
/* Simplex mode auto recovery bools
*
* Set to true when the autorecovery threshold is reached
* and we want to avoid taking further autorecovery action
* even though it may be requested. */
bool autorecovery_disabled = false ;
/* Set to true by fault detection methods that are
* autorecoverable when in simplex mode. */
bool autorecovery_enabled = false ;
/** Tracks the number of hosts that 'are currently' in service trouble
* wrt heartbeat (above minor threshold).
* This is used in multi-host failure avoidance.
@ -2191,7 +2169,14 @@ public:
*/
unsigned int ar_interval[MTC_AR_DISABLE_CAUSE__LAST] ;
int unknown_host_throttle ;
/* Used by the auto recovery algorithm for self-reboot.
* This is a flag indicating a delayed self-reboot is required.
* This ensures the FSM enters the self_reboot_handler, allowing sufficient time
* for operational and availability state changes to be committed to the database
* before initiating the reboot. */
bool delayed_swact_required = false ;
bool self_reboot_wait = false ;
bool force_swact_wait = false ;
};
/**

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2018, 2024 Wind River Systems, Inc.
* Copyright (c) 2013-2018, 2024-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -458,7 +458,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
return (PASS);
}
/* inform mtcAgent of enhanced ost services support */
/* inform mtcAgent of enhanced host services support */
msg.parm[1] = MTC_ENHANCED_HOST_SERVICES ;
msg.parm[0] = rc ;
msg.num = 2 ;
@ -810,14 +810,20 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char
if ( mtce_name_ptr )
{
/* add the error message to the message buffer */
/* add the message to the message buffer */
size_t len = strnlen ( mtce_name_ptr, MAX_MTCE_EVENT_NAME_LEN );
/* We don't use the buffer for mtce events to remove it from the size */
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len));
snprintf ( &event.buf[0], MAX_MTCE_EVENT_NAME_LEN , "%s", mtce_name_ptr );
rc = FAIL_OPERATION ;
// If the supplied mtce_name_str string contains 'failed'
// then set the rc to FAIL_OPERATION
if ( strcasestr (mtce_name_ptr, "failed" ) )
rc = FAIL_OPERATION ;
if ( strcasestr (mtce_name_ptr, "timeout" ) )
rc = FAIL_TIMEOUT ;
}
else
{
@ -983,6 +989,38 @@ int create_mtcAlive_msg ( ctrl_type * ctrl_ptr, mtc_message_type & msg, int cmd,
}
}
/* Set Out-Of-Band goEnable failure flag for goEnable failure. */
if ( ctrl_ptr->goEnable_result )
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_GOENABLE_FAIL ;
if ( ctrl_ptr->goEnable_result_subf )
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_GOENABLE_FAIL ;
/* Set the Out-Of-Band Host Services failure
* flag for any start host services that failed */
if ( ctrl_ptr->storage_hostservices_result )
{
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ;
dlog3 ("storage start host services failed ; rc:%d", ctrl_ptr->storage_hostservices_result );
}
else if ( ctrl_ptr->controller_hostservices_result )
{
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ;
dlog3 ("controller start host services failed ; rc:%d", ctrl_ptr->controller_hostservices_result );
}
else if ( is_subfunction_worker () )
{
if ( ctrl_ptr->worker_hostservices_result )
{
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SUBF_SERVICES_FAIL ;
dlog3 ("worker subfunction start host services failed ; rc:%d", ctrl_ptr->worker_hostservices_result );
}
}
else if ( ctrl_ptr->worker_hostservices_result )
{
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__MAIN_SERVICES_FAIL ;
dlog3 ("worker start host services failed ; rc:%d", ctrl_ptr->worker_hostservices_result );
}
if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) )
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2018, 2023-2024 Wind River Systems, Inc.
* Copyright (c) 2013-2018, 2023-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -295,15 +295,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
if ( msg.num > 0 )
{
/* log if not locked message */
if ( msg.cmd != MTC_MSG_LOCKED )
{
ilog ("%s %s request ACK (rc:%d) (%s)",
hostname.c_str(),
get_mtcNodeCommand_str(msg.cmd),
msg.parm[0],
iface_name_ptr);
}
else
if ( msg.cmd == MTC_MSG_LOCKED )
{
mlog ("%s %s request ACK (rc:%d) (%s)",
hostname.c_str(),
@ -311,6 +303,38 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
msg.parm[0],
iface_name_ptr);
}
else if ( msg.cmd == MTC_CMD_HOST_SVCS_RESULT )
{
ilog ("%s %s (rc:%d) (%s)",
hostname.c_str(),
msg.buf,
msg.parm[0],
iface_name_ptr);
}
else
{
ilog ("%s %s request ACK (rc:%d) (%s)",
hostname.c_str(),
get_mtcNodeCommand_str(msg.cmd),
msg.parm[0],
iface_name_ptr);
}
}
else if ( msg.cmd == MTC_MSG_LOCKED )
{
mlog ("%s %s request ACK (%s)",
hostname.c_str(),
get_mtcNodeCommand_str(msg.cmd),
iface_name_ptr);
}
else
{
/* log other command request ACKs that don't have any return parameters */
ilog ("%s %s request ACK (%s)",
hostname.c_str(),
get_mtcNodeCommand_str(msg.cmd),
iface_name_ptr);
}
}
@ -731,10 +755,8 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict
case MTC_CMD_STOP_CONTROL_SVCS:
case MTC_CMD_STOP_WORKER_SVCS:
case MTC_CMD_STOP_STORAGE_SVCS:
case MTC_CMD_START_CONTROL_SVCS:
case MTC_CMD_START_WORKER_SVCS:
case MTC_CMD_START_STORAGE_SVCS:
{
ilog ("%s %s command sent", hostname.c_str(), get_mtcNodeCommand_str(cmd));
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header() );
mtc_cmd.cmd = cmd ;
rc = PASS ;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2016, 2024 Wind River Systems, Inc.
* Copyright (c) 2013-2016, 2024-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -832,7 +832,7 @@ void _scripts_cleanup ( script_set_enum script_set )
***************************************************************************/
void _manage_services_scripts ( void )
{
bool failed = false ;
int status = PASS ;
char str [BUF_SIZE] ;
if ( ! ctrl.hostservices.scripts )
@ -842,6 +842,8 @@ void _manage_services_scripts ( void )
return ;
}
string current_cmd = get_mtcNodeCommand_str(ctrl.current_hostservices_command) ;
memset (str,0,BUF_SIZE);
/* do if all the scripts are done ? */
@ -852,31 +854,32 @@ void _manage_services_scripts ( void )
{
if ( ctrl.hostservices.script[i].status )
{
if ( failed == false )
if ( status == PASS )
{
/* only report the first failure */
snprintf(str, BUF_SIZE, "%s failed ; rc:%d",
ctrl.hostservices.script[i].name.data(),
ctrl.hostservices.script[i].status );
failed = true ;
status = ctrl.hostservices.script[i].status ;
break ;
}
}
}
/* handle the aggregate status */
if ( failed == true )
if ( status )
{
elog ("Host Services: %s\n", str );
ilog ("%s result: %s", current_cmd.c_str(), str );
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, str );
}
else
{
ilog ("Host Services Complete ; all passed ; %s", get_mtcNodeCommand_str(ctrl.current_hostservices_command));
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, NULL );
ilog ("%s complete ; all passed", current_cmd.c_str());
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, current_cmd.data());
}
ctrl.active_script_set = NO_SCRIPTS ;
}
/* do if have we timed out ? */
/* check for 5 minute timeout */
else if ( ctrl.hostservices.timer.ring == true )
{
bool found = false ;
@ -887,9 +890,13 @@ void _manage_services_scripts ( void )
{
if ( ctrl.hostservices.script[i].done == false )
{
snprintf(str, BUF_SIZE, "%s (timeout)", ctrl.hostservices.script[i].name.data() );
status = FAIL_TIMEOUT ;
snprintf(str, BUF_SIZE, "%s timeout", ctrl.hostservices.script[i].name.data() );
found = true ;
wlog ("host services timeout on %s\n", ctrl.hostservices.script[i].name.c_str());
elog ("%s timeout on %s\n",
current_cmd.c_str(),
ctrl.hostservices.script[i].name.c_str());
mtce_send_event ( sock_ptr, MTC_CMD_HOST_SVCS_RESULT, str );
break ;
}
@ -906,6 +913,23 @@ void _manage_services_scripts ( void )
return ;
}
/* Specify which start host services command failed */
if ( status )
{
if ( ctrl.current_hostservices_command == MTC_CMD_START_CONTROL_SVCS )
ctrl.controller_hostservices_result = status ;
else if ( ctrl.current_hostservices_command == MTC_CMD_START_WORKER_SVCS )
ctrl.worker_hostservices_result = status ;
else if ( ctrl.current_hostservices_command == MTC_CMD_START_STORAGE_SVCS )
ctrl.storage_hostservices_result = status ;
else
{
slog ("unexpected current hostservices command=%d status=%d",
ctrl.current_hostservices_command, status );
}
}
mtcTimer_reset (ctrl.hostservices.timer );
_scripts_cleanup (ctrl.active_script_set) ;
}
@ -992,6 +1016,7 @@ void _manage_goenabled_tests ( void )
ilog ("GoEnabled Subfunction Testing Failed ; at least one test failed\n");
daemon_log ( GOENABLED_SUBF_FAIL , str );
ctrl.goEnable_result_subf = FAIL ;
send_mtc_msg ( sock_ptr, MTC_MSG_SUBF_GOENABLED_FAILED, str );
break ;
}
@ -1002,6 +1027,7 @@ void _manage_goenabled_tests ( void )
ilog ("GoEnabled Testing Failed ; at least one test failed\n");
daemon_log ( GOENABLED_MAIN_FAIL , str );
ctrl.goEnable_result = FAIL ;
send_mtc_msg ( sock_ptr, MTC_MSG_MAIN_GOENABLED_FAILED, str );
break ;
}
@ -1067,6 +1093,7 @@ void _manage_goenabled_tests ( void )
daemon_remove_file ( GOENABLED_SUBF_PASS );
send_mtc_msg ( sock_ptr, MTC_MSG_SUBF_GOENABLED_FAILED, str );
daemon_log ( GOENABLED_SUBF_FAIL , str );
ctrl.goEnable_result_subf = FAIL ;
break ;
}
case GOENABLED_MAIN_SCRIPTS:
@ -1074,6 +1101,7 @@ void _manage_goenabled_tests ( void )
daemon_remove_file ( GOENABLED_SUBF_PASS );
send_mtc_msg ( sock_ptr, MTC_MSG_MAIN_GOENABLED_FAILED, str );
daemon_log ( GOENABLED_MAIN_FAIL , str );
ctrl.goEnable_result = FAIL ;
break ;
}
default:
@ -1420,10 +1448,15 @@ void daemon_service_run ( void )
int rc = PASS ;
int file_not_present_count = 0 ;
/* Bool to track whether the start host services scripts run has
* been attempted at least once since last process startup. */
/* Bool to track whether the start host services scripts needs to be run. */
bool start_host_services_needs_to_be_run = true ;
/* Don't start host services if the node is locked */
if ( daemon_is_file_present ( NODE_LOCKED_FILE ) == true )
{
ilog ("locked node");
start_host_services_needs_to_be_run = false ;
}
if ( daemon_is_file_present ( NODE_RESET_FILE ) )
{
wlog ("mtce reboot required");
@ -1948,7 +1981,7 @@ void daemon_service_run ( void )
* Need to ensure that the appropriate host
* services are started for the system/node
* type. */
if ( start_host_services_needs_to_be_run == true )
if ( start_host_services_needs_to_be_run == true )
{
if ( ctrl.system_type == SYSTEM_TYPE__NORMAL )
{
@ -1981,7 +2014,7 @@ void daemon_service_run ( void )
ctrl.start_controller_hostservices = true ;
if ( ctrl.nodetype & WORKER_TYPE )
ctrl.start_worker_hostservices = true ;
start_host_services_needs_to_be_run = false ;
start_host_services_needs_to_be_run = false ;
}
else if (( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) ||
( daemon_is_file_present ( GOENABLED_SUBF_FAIL ))))
@ -2001,7 +2034,7 @@ void daemon_service_run ( void )
ctrl.start_worker_hostservices = true ;
else if ( ctrl.nodetype & STORAGE_TYPE )
ctrl.start_storage_hostservices = true ;
start_host_services_needs_to_be_run = false ;
start_host_services_needs_to_be_run = false ;
}
else if ( daemon_is_file_present ( GOENABLED_MAIN_FAIL ) )
{
@ -2012,7 +2045,6 @@ void daemon_service_run ( void )
}
}
// Handle auto start of node personality services.
// - prioritize controller first
// - prevent more than one being posted at once

View File

@ -1,7 +1,7 @@
#ifndef __INCLUDE_MTCNODECOMP_HH__
#define __INCLUDE_MTCNODECOMP_HH__
/*
* Copyright (c) 2015-2016, 2024 Wind River Systems, Inc.
* Copyright (c) 2015-2016, 2024-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -139,6 +139,17 @@ typedef struct
bool start_worker_hostservices = false ;
bool start_storage_hostservices = false ;
/* Store the result of the last Start Host Services
* completion status for each personality. */
int controller_hostservices_result = PASS ;
int worker_hostservices_result = PASS ;
int storage_hostservices_result = PASS ;
/* Store the result of the last goEnabled completion
* status */
int goEnable_result = PASS ;
int goEnable_result_subf = PASS ;
/* The script set that is executing */
script_set_enum active_script_set ;

View File

@ -69,6 +69,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
return RETRY ;
}
/* Check for a 'delayed self reboot required' condition */
if ( this->delayed_swact_required )
if ( node_ptr->hostname == this->my_hostname )
return ( this->self_fail_handler ( node_ptr ));
/* manage the host connected state and board management alarms */
nodeLinkClass::bmc_handler ( node_ptr );

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc.
* Copyright (c) 2013-2020, 2023-2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -796,7 +796,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
/* clear all the past enable failure bools */
clear_main_failed_bools ( node_ptr );
clear_subf_failed_bools ( node_ptr );
clear_hostservices_ctls ( node_ptr );
/* Clear all degrade flags except for the HWMON one */
clear_host_degrade_causes ( node_ptr->degrade_mask );
@ -829,18 +828,12 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_AVAIL_STATUS__INTEST:
case MTC_AVAIL_STATUS__FAILED:
/* enable auto recovery if the inactive controller
* is out of service */
//if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST ))
// node_ptr->ar_disabled = false ;
// this->autorecovery_enabled = true ;
/* fall through */
case MTC_AVAIL_STATUS__DEGRADED:
case MTC_AVAIL_STATUS__AVAILABLE:
{
if (( is_active_controller ( node_ptr->hostname )) &&
if ( ( NOT_SIMPLEX ) && ( is_active_controller ( node_ptr->hostname )) &&
( is_inactive_controller_main_insv() == false ))
{
wlog ("%s recovering active controller from %s-%s-%s\n",
@ -1068,6 +1061,12 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->goEnabled = false ;
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
if ( node_ptr->forcing_full_enable == true )
{
ilog ("%s clearing force full enable recursion prevention flag", node_ptr->hostname.c_str());
node_ptr->forcing_full_enable = false ;
}
/* Set uptime to zero in mtce and in the database */
node_ptr->uptime_save = 0 ;
set_uptime ( node_ptr, 0 , false );
@ -1159,8 +1158,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
plog ("%s is MTCALIVE (uptime:%d secs)\n",
node_ptr->hostname.c_str(), node_ptr->uptime );
plog ("%s is MTCALIVE (uptime:%d secs) (oob:%08X)",
node_ptr->hostname.c_str(), node_ptr->uptime, node_ptr->mtce_flags );
if ((NOT_THIS_HOST) &&
( node_ptr->uptime > ((unsigned int)(node_ptr->mtcalive_timeout*2))))
{
@ -1198,7 +1197,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->unlock_cmd_ack = false ;
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_UNLOCKED, MGMNT_INTERFACE );
/* Request Out-Of--Service test execution */
/* Request Out-Of-Service test execution */
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE );
/* now officially in the In-Test state */
@ -1257,7 +1256,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->goEnabled = false ;
/* start waiting fhr the ENABLE READY message */
/* start waiting for the ENABLE READY message */
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_WAIT );
break ;
@ -1298,7 +1297,24 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
mtcInvApi_update_task ( node_ptr, MTC_TASK_INITIALIZING );
/* ok. great, got the go-enabled message, lets move on */
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START );
/* Don't start the self heartbeat for the active controller.
* Also, in AIO , hosts that have a controller function also
* have a worker function and the heartbeat for those hosts
* are started at the end of the subfunction handler. */
if (( THIS_HOST ) ||
(( AIO_SYSTEM ) && ( is_controller(node_ptr)) ))
{
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
else
{
/* allow the fsm to wait for up to 1 minute for the
* hbsClient's ready event before starting heartbeat
* test. */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT );
}
}
else if ( mtcTimer_expired ( node_ptr->mtcTimer ))
{
@ -1327,102 +1343,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
break ;
}
case MTC_ENABLE__HOST_SERVICES_START:
{
bool start = true ;
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
{
elog ("%s %s failed ; launch\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str());
node_ptr->hostservices_failed = true ;
alarm_enabled_failure ( node_ptr, true );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
break ;
}
else
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING );
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_WAIT );
}
break ;
}
case MTC_ENABLE__HOST_SERVICES_WAIT:
{
/* Wait for host services to complete - pass or fail.
* The host_services_handler manages timeout. */
rc = this->host_services_handler ( node_ptr );
if ( rc == RETRY )
{
/* wait for the mtcClient's response ... */
break ;
}
else if ( rc != PASS )
{
node_ptr->hostservices_failed = true ;
alarm_enabled_failure ( node_ptr, true );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
/* distinguish 'timeout' from other 'execution' failures */
if ( rc == FAIL_TIMEOUT )
{
elog ("%s %s failed ; timeout\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str());
mtcInvApi_update_task ( node_ptr,
MTC_TASK_MAIN_SERVICE_TO );
}
else
{
elog ("%s %s failed ; rc:%d\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str(),
rc);
mtcInvApi_update_task ( node_ptr,
MTC_TASK_MAIN_SERVICE_FAIL );
}
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
break ;
}
else /* success path */
{
/* Don't start the self heartbeat for the active controller.
* Also, in AIO , hosts that have a controller function also
* have a worker function and the heartbeat for those hosts
* are started at the end of the subfunction handler. */
if (( THIS_HOST ) ||
(( AIO_SYSTEM ) && ( is_controller(node_ptr)) ))
{
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
else
{
/* allow the fsm to wait for up to 1 minute for the
* hbsClient's ready event before starting heartbeat
* test. */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_WAIT );
}
}
break ;
}
case MTC_ENABLE__HEARTBEAT_WAIT:
{
@ -1708,7 +1628,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* clear all the past enable failure bools */
clear_main_failed_bools ( node_ptr );
clear_subf_failed_bools ( node_ptr );
clear_hostservices_ctls ( node_ptr );
/* Disable the heartbeat service for Graceful Recovery */
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
@ -2331,77 +2250,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* O.K. clearing the state now that we got it */
node_ptr->goEnabled = false ;
recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_START );
}
else if ( node_ptr->mtcTimer.ring == true )
{
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO );
node_ptr->mtcTimer.ring = false ;
this->force_full_enable ( node_ptr );
}
break;
}
case MTC_RECOVERY__HOST_SERVICES_START:
{
bool start = true ;
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
{
elog ("%s %s failed ; launch\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str());
node_ptr->hostservices_failed = true ;
this->force_full_enable ( node_ptr );
}
else
{
recoveryStageChange ( node_ptr, MTC_RECOVERY__HOST_SERVICES_WAIT );
}
break ;
}
case MTC_RECOVERY__HOST_SERVICES_WAIT:
{
/* Wait for host services to complete - pass or fail.
* The host_services_handler manages timeout. */
rc = this->host_services_handler ( node_ptr );
if ( rc == RETRY )
{
/* wait for the mtcClient's response ... */
break ;
}
else if ( rc != PASS )
{
node_ptr->hostservices_failed = true ;
if ( rc == FAIL_TIMEOUT )
{
elog ("%s %s failed ; timeout\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str());
mtcInvApi_update_task ( node_ptr,
MTC_TASK_START_SERVICE_TO );
}
else
{
elog ("%s %s failed ; rc=%d\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str(),
rc);
mtcInvApi_update_task ( node_ptr,
MTC_TASK_START_SERVICE_FAIL );
}
this->force_full_enable ( node_ptr );
}
else /* success path */
{
/* The active controller would never get/be here but
* if it did then just fall through to change state. */
/* Manage state change */
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
/* Here we need to run the sub-function goenable and start
@ -2436,7 +2285,16 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
}
}
break ;
else if ( node_ptr->mtcTimer.ring == true )
{
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO );
node_ptr->mtcTimer.ring = false ;
this->force_full_enable ( node_ptr );
}
break;
}
case MTC_RECOVERY__CONFIG_COMPLETE_WAIT:
{
@ -2504,7 +2362,9 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->goEnabled_subf = false ;
/* ok. great, got the go-enabled message, lets move on */
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_START );
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
}
else if ( node_ptr->mtcTimer.ring == true )
{
@ -2520,72 +2380,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
break ;
}
case MTC_RECOVERY__SUBF_SERVICES_START:
{
bool start = true ;
bool subf = true ;
plog ("%s-worker Starting Host Services\n", node_ptr->hostname.c_str());
if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
{
elog ("%s-worker %s failed ; launch\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str());
node_ptr->hostservices_failed_subf = true ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_FAIL );
this->force_full_enable ( node_ptr );
}
else
{
recoveryStageChange ( node_ptr, MTC_RECOVERY__SUBF_SERVICES_WAIT );
}
break ;
}
case MTC_RECOVERY__SUBF_SERVICES_WAIT:
{
/* Wait for host services to complete - pass or fail.
* The host_services_handler manages timeout. */
rc = this->host_services_handler ( node_ptr );
if ( rc == RETRY )
{
/* wait for the mtcClient's response ... */
break ;
}
else if ( rc != PASS )
{
node_ptr->hostservices_failed_subf = true ;
if ( rc == FAIL_TIMEOUT )
{
elog ("%s-worker %s failed ; timeout\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str());
mtcInvApi_update_task ( node_ptr,
MTC_TASK_START_SERVICE_TO );
}
else
{
elog ("%s-worker %s failed ; rc=%d\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str(),
rc);
mtcInvApi_update_task ( node_ptr,
MTC_TASK_START_SERVICE_FAIL );
}
this->force_full_enable ( node_ptr );
}
else /* success path */
{
/* allow the fsm to wait for up to 'worker config timeout'
* for the hbsClient's ready event before starting heartbeat
* test. */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
}
break ;
}
case MTC_RECOVERY__HEARTBEAT_START:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
@ -2858,7 +2652,6 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
/* clear all the enable failure bools */
clear_main_failed_bools ( node_ptr );
clear_subf_failed_bools ( node_ptr );
clear_hostservices_ctls ( node_ptr );
enableStageChange ( node_ptr, MTC_ENABLE__START ) ;
disableStageChange ( node_ptr, MTC_DISABLE__DIS_SERVICES_WAIT) ;
@ -2973,6 +2766,12 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
/* proceed to handle force lock if the launch fails */
disableStageChange ( node_ptr, MTC_DISABLE__HANDLE_FORCE_LOCK );
}
else
{
ilog ("%s %s launched",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str())
}
}
break ;
}
@ -6499,26 +6298,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
}
}
/* default retries counter to zero before START_SERVICES */
/* default retries counter to zero before MTC_SERVICES */
node_ptr->retries = 0 ;
node_ptr->addStage = MTC_ADD__START_SERVICES ;
break ;
}
case MTC_ADD__START_SERVICES:
{
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
{
ilog ("%s scheduling start host services\n",
node_ptr->hostname.c_str());
node_ptr->start_services_needed = true ;
node_ptr->start_services_retries = 0 ;
}
node_ptr->addStage = MTC_ADD__MTC_SERVICES ;
break ;
}
@ -6620,7 +6401,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
* to start host services. */
if ( this->dor_mode_active )
{
node_ptr->start_services_needed_subf = true ;
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ENABLE_SUBF );
}
}
@ -7486,22 +7266,6 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
bool start = false ;
this->launch_host_services_cmd ( node_ptr, start );
}
else if ( daemon_want_fit ( FIT_CODE__START_HOST_SERVICES, node_ptr->hostname ))
{
if (( node_ptr->start_services_needed == false ) &&
( node_ptr->start_services_running_main == false ))
{
node_ptr->start_services_needed = true ;
node_ptr->start_services_retries = 0 ;
}
else
{
ilog ("%s start host services (FIT) rejected (%d:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->start_services_needed,
node_ptr->start_services_running_main);
}
}
if (( daemon_is_file_present ( MTC_CMD_FIT__GOENABLE_AUDIT )) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
@ -7608,8 +7372,9 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_
// Don't monitor pxeboot mtcAlive messaging while the node is
// locked or in the following administrative action states.
// locked, disabled or in the following administrative action states.
if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) ||
( node_ptr->operState == MTC_OPER_STATE__DISABLED ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) ||
@ -7822,20 +7587,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_INSV_TEST__START:
{
mtcTimer_reset ( node_ptr->insvTestTimer );
/* Run the inservice test more frequently while
* start_services_needed is true and we are not
* in failure retry mode */
if (( node_ptr->start_services_needed == true ) &&
( node_ptr->hostservices_failed == false ) &&
( node_ptr->hostservices_failed_subf == false ))
{
mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, MTC_SECS_2 );
}
else
{
mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, insv_test_period );
}
mtcTimer_start ( node_ptr->insvTestTimer, mtcTimer_handler, insv_test_period );
insvTestStageChange ( node_ptr, MTC_INSV_TEST__WAIT );
break ;
}
@ -7957,147 +7709,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
"DOR mode active\n");
}
/*************************************************************
* Handle Main Function Start Host Services if it's 'needed'
************************************************************/
else if ( node_ptr->start_services_needed == true )
{
/* If Main Start Host Services is not already running
* then launch it */
if ( node_ptr->start_services_running_main == false )
{
/* Only launch if the node is successfully configured
* and tested */
if (( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) &&
( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
( node_ptr->mtce_flags & MTC_FLAG__MAIN_GOENABLED ))
{
/* Launch 'start' for this node type */
bool start = true ;
if ( this->launch_host_services_cmd ( node_ptr , start ) != PASS )
{
/* failed -> retry */
node_ptr->hostservices_failed = true ;
node_ptr->start_services_running_main = false ;
node_ptr->start_services_retries++ ;
}
else
{
/* launched successfully */
node_ptr->start_services_running_main = true ;
node_ptr->hostservices_failed = false ;
}
}
else
{
ilog("%s start host services ; waiting to launch (%x)",
node_ptr->hostname.c_str(),
node_ptr->mtce_flags);
}
}
/* Handle Main start host services response */
else
{
/* Wait for host services to complete - pass or fail.
* The host_services_handler manages timeout. */
int rc = this->host_services_handler ( node_ptr );
if ( rc == RETRY )
{
/* wait for the mtcClient's response ... */
break ;
}
else if ( rc != PASS )
{
node_ptr->hostservices_failed = true ;
node_ptr->start_services_retries++ ;
wlog ("%s %s request failed ; (retry %d)\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str(),
node_ptr->start_services_retries);
}
else /* success path */
{
node_ptr->start_services_needed = false ;
node_ptr->hostservices_failed = false ;
node_ptr->start_services_retries = 0 ;
}
node_ptr->start_services_running_main = false ;
}
}
/*************************************************************
* Handle Sub Function Start Host Services if it's 'needed'
************************************************************/
else if ( node_ptr->start_services_needed_subf == true )
{
/* If Subf Start Host Services is not already running
* then launch it */
if ( node_ptr->start_services_running_subf == false )
{
/* Only launch if the node and subfunction are
* successfully configured and tested */
if (( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) &&
( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
( node_ptr->mtce_flags & MTC_FLAG__MAIN_GOENABLED ) &&
( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) &&
( node_ptr->mtce_flags & MTC_FLAG__SUBF_GOENABLED ))
{
/* Launch 'start' for this subfunction type */
bool start = true ;
bool subf = true ;
if ( this->launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
{
/* failed -> retry */
node_ptr->hostservices_failed_subf = true ;
node_ptr->start_services_running_subf = false ;
node_ptr->start_services_retries++ ;
}
else
{
/* launched successfully */
node_ptr->hostservices_failed_subf = false ;
node_ptr->start_services_running_subf = true ;
}
}
else
{
ilog("%s subf start host services ; waiting to launch (%x)",
node_ptr->hostname.c_str(),
node_ptr->mtce_flags);
}
}
/* Handle Subf start host services response */
else
{
/* Wait for host services to complete - pass or fail.
* The host_services_handler manages timeout. */
int rc = this->host_services_handler ( node_ptr );
if ( rc == RETRY )
{
/* wait for the mtcClient's response ... */
break ;
}
node_ptr->start_services_running_subf = false ;
if ( rc != PASS )
{
node_ptr->start_services_running_subf = false ;
node_ptr->hostservices_failed_subf = true ;
node_ptr->start_services_retries++ ;
wlog ("%s %s request failed ; (retry %d)\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str(),
node_ptr->start_services_retries);
}
else /* success path */
{
node_ptr->start_services_needed_subf = false ;
node_ptr->hostservices_failed_subf = false ;
node_ptr->start_services_running_subf = false ;
node_ptr->start_services_retries = 0 ;
}
node_ptr->start_services_running_subf = false ;
}
}
if ( NOT_THIS_HOST )
{
if ((( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
@ -8169,8 +7780,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
*
**/
if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) &&
( node_ptr->ar_disabled == false ) &&
( node_ptr->start_services_needed == false ))
( node_ptr->ar_disabled == false ))
{
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ))
@ -8197,28 +7807,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
}
/* Only raise this alarm while in simplex */
if (( num_controllers_enabled() < 2 ) &&
(( node_ptr->goEnabled_failed_subf == true ) ||
( node_ptr->inservice_failed_subf == true ) ||
( node_ptr->hostservices_failed_subf == true )))
{
if ( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] == FM_ALARM_SEVERITY_CLEAR )
{
wlog ("%s insv test detected subfunction failure ; degrading host\n",
node_ptr->hostname.c_str());
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_MAJOR );
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
}
}
}
/* Monitor the health of the host */
@ -8634,3 +8222,95 @@ int nodeLinkClass::cfg_handler ( struct nodeLinkClass::node * node_ptr )
}
return (PASS);
}
/*****************************************************************************
 *
 * Name       : self_fail_handler
 *
 * Purpose    : Handle the forced failure of this host (self) on simplex (SX)
 *              systems and on duplex (DX) systems that have an enabled
 *              standby controller.
 *
 * Description: Waits for the host's mtcTimer to ring, which gives the active
 *              controller time to flush any outstanding state change updates
 *              to the database. Once the timer has rung, SM is told this
 *              controller is unhealthy (via SMGMT_UNHEALTHY_FILE) so that it
 *              shuts down its services.
 *
 *              Simplex System behavior: issue a lazy reboot and then wait.
 *              Duplex System behavior : wait for the swact to the enabled
 *                                       standby controller.
 *
 * Assumptions: Only called in a DX system if the standby controller is
 *              enabled. A last second check for the enabled standby
 *              controller is still performed ; if it is no longer in
 *              service the operation is aborted and this host is reverted
 *              to enabled-degraded with the enable alarm raised.
 *
 * Parameters :
 * @param node_ptr - pointer to this host's nodeLinkClass control structure
 *
 * @return PASS in all cases
 *
 *****************************************************************************/
int nodeLinkClass::self_fail_handler ( struct nodeLinkClass::node * node_ptr )
{
    /* Lazy reboot was already commanded ; just wait for it to take effect */
    if ( this->self_reboot_wait )
    {
        ilog_throttled ( node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD,
                         "%s ... waiting on lazy reboot", node_ptr->hostname.c_str());
        return (PASS);
    }

    /* SM was already told we are unhealthy ; wait for it to shut down the mtcAgent */
    if ( this->force_swact_wait )
    {
        ilog_throttled ( node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD,
                         "%s ... waiting on force swact", node_ptr->hostname.c_str());
        return (PASS);
    }

    /* Timer has not rung yet ; keep waiting on the database update */
    if ( !node_ptr->mtcTimer.ring )
    {
        ilog_throttled (node_ptr->ar_log_throttle, AR_HANDLER_LOG_THROTTLE_THRESHOLD,
                        "%s ... waiting on database update before %s",
                        node_ptr->hostname.c_str(),
                        SIMPLEX ? "lazy reboot of this simplex system" :
                                  "force swact to unlocked-enabled standby controller");
        return (PASS);
    }

    /* Timer rung.
     *
     * Last second check for an in-service standby controller in a DX system.
     * Without one there is nothing to swact to, so abort the self failure.
     *
     * TODO: this abort path was flagged by the author as still needing a
     *       forced test case. */
    if (( NOT_SIMPLEX ) && ( !is_inactive_controller_main_insv () ))
    {
        wlog ("%s refusing to self reboot with no enabled standby controller", node_ptr->hostname.c_str());
        wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str());
        wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str());

        /* Revert to enabled-degraded, preserving the current admin state */
        allStateChange ( node_ptr,
                         node_ptr->adminState,
                         MTC_OPER_STATE__ENABLED,
                         MTC_AVAIL_STATUS__DEGRADED );
        alarm_enabled_failure ( node_ptr, true );
        mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP);

        /* Cancel the pending delayed swact request */
        this->delayed_swact_required = false ;
        return (PASS);
    }

    /* Force an uncontrolled SWACT to the enabled standby controller (DX)
     * or a lazy reboot (SX).
     *
     * Tell SM we are unhealthy so that it shuts down all its services */
    wlog ("%s forcing SM to shut down services by %s", node_ptr->hostname.c_str(), SMGMT_UNHEALTHY_FILE);
    daemon_log ( SMGMT_UNHEALTHY_FILE, "Maintenance force swact due to self failure");

    /* Restart log throttling for the wait state entered below */
    node_ptr->ar_log_throttle = 0 ;

    if ( SIMPLEX )
    {
        wlog ("%s commanding lazy reboot", node_ptr->hostname.c_str());
        send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, MGMNT_INTERFACE) ;

        /* The pxeboot network is not currently provisioned in SX ;
         * this automatically handles the case if that ever changes. */
        if ( this->pxeboot_network_provisioned == true )
        {
            send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, PXEBOOT_INTERFACE) ;
        }
        this->self_reboot_wait = true ;
    }
    else
    {
        this->force_swact_wait = true ;
    }
    return (PASS);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2016 Wind River Systems, Inc.
* Copyright (c) 2013-2016, 2025 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -192,7 +192,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
alarm_compute_clear ( node_ptr, true );
/* ok. great, got the go-enabled message, lets move on */
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
break ;
}
ilog ("%s running out-of-service tests\n", name.c_str());
@ -214,19 +214,20 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_ENABLE__GOENABLED_WAIT:
{
bool goenable_failed = false ;
bool goenable_failed_subf = false ;
/* search for the Go Enable message */
if (( node_ptr->health == NODE_UNHEALTHY ) ||
( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) ||
( node_ptr->mtce_flags & MTC_FLAG__SUBF_GOENABLE_FAIL) ||
( node_ptr->goEnabled_failed_subf == true ))
{
mtcTimer_reset ( node_ptr->mtcTimer );
elog ("%s one or more out-of-service tests failed\n", name.c_str());
elog ("%s one or more out-of-service subfunction tests failed\n", name.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
goenable_failed = true ;
goenable_failed_subf = true ;
}
/* search for the Go Enable message */
@ -245,17 +246,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
/* ok. great, got the go-enabled message, lets move on */
if ( node_ptr->start_services_needed_subf == true )
{
/* If the add_handler set start_services_needed_subf to
* true then we bypass inline execution and allow it to
* be serviced as a scheduled background operation. */
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
}
else
{
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_START );
}
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
break ;
}
@ -265,14 +256,14 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_TO );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
goenable_failed = true ;
goenable_failed_subf = true ;
}
else
{
; /* wait some more */
}
if ( goenable_failed == true )
if ( goenable_failed_subf == true )
{
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
@ -284,103 +275,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
}
break ;
}
case MTC_ENABLE__HOST_SERVICES_START:
{
bool start = true ;
bool subf = true ;
plog ("%s %s host services\n",
name.c_str(),
node_ptr->start_services_needed_subf ? "scheduling start compute" :
"starting compute");
if ( node_ptr->start_services_needed_subf == true )
{
bool force = true ;
/* If the add_handler set start_services_needed_subf to
* true then we bypass inline execution and allow it to
* be serviced as a scheduled background operation. */
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
alarm_compute_clear ( node_ptr, force );
}
else if ( launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
{
wlog ("%s %s failed ; launch\n",
name.c_str(),
node_ptr->host_services_req.name.c_str());
node_ptr->hostservices_failed_subf = true ;
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
break ;
}
else
{
enableStageChange ( node_ptr, MTC_ENABLE__HOST_SERVICES_WAIT );
}
break ;
}
case MTC_ENABLE__HOST_SERVICES_WAIT:
{
/* Wait for host services to complete - pass or fail.
* The host_services_handler manages timeout. */
rc = host_services_handler ( node_ptr );
if ( rc == RETRY )
{
/* wait for the mtcClient's response ... */
break ;
}
else if ( rc != PASS )
{
node_ptr->hostservices_failed_subf = true ;
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
if ( rc == FAIL_TIMEOUT )
{
elog ("%s %s failed ; timeout\n",
name.c_str(),
node_ptr->host_services_req.name.c_str());
/* Report "Enabling Compute Service Timeout" to sysinv/horizon */
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_TO );
}
else
{
elog ("%s %s failed ; rc:%d\n",
name.c_str(),
node_ptr->host_services_req.name.c_str(),
rc);
/* Report "Enabling Compute Service Failed" to sysinv/horizon */
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL );
}
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
break ;
}
else /* success path */
{
alarm_compute_clear ( node_ptr, true );
node_ptr->hostservices_failed_subf = false ;
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_CHECK );
}
break ;
}
case MTC_ENABLE__HEARTBEAT_CHECK:
{
if ( THIS_HOST )
@ -569,11 +463,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
plog ("%s is ENABLED\n", name.c_str());
}
/* already cleared if true so no need to do it again */
if ( node_ptr->start_services_needed_subf != true )
{
alarm_compute_clear ( node_ptr, force );
}
alarm_compute_clear ( node_ptr, force );
enableStageChange ( node_ptr, MTC_ENABLE__DONE );

View File

@ -80,7 +80,7 @@ bmc_reset_delay = 300 ; seconds to wait before issuing a bmc
http_retry_wait = 10 ; secs to wait between http request retries
host_add_delay = 20 ; seconds to wait before adding hosts
host_add_delay = 0 ; seconds to wait before adding hosts
[client] ; Client Configuration