Merge "Debian: Make Mtce offline handler more resilient to slow shutdowns"

This commit is contained in:
Zuul 2022-10-24 19:41:06 +00:00 committed by Gerrit Code Review
commit 03e953d8df
5 changed files with 66 additions and 31 deletions

View File

@ -151,7 +151,7 @@ EFmAlarmSeverityT alarmUtil_query ( string & hostname,
ENTITY_PREFIX, hostname.data(), instance.data());
}
alog ("entity_instance:%s\n", alarm_filter.entity_instance_id );
alog1 ("entity_instance:%s\n", alarm_filter.entity_instance_id );
if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
{
dlog ("Found with Severity: %d\n", alarm_query.severity );
@ -183,7 +183,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr,
memset(&alarm_filter, 0, sizeof(alarm_filter));
snprintf ( alarm_filter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", identity.data());
rc = fm_get_faults_by_id ( &alarm_filter.alarm_id, alarm_list_ptr, &max_alarms );
alog ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc );
alog1 ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc );
if ( rc == FM_ERR_OK )
{
return (PASS);
@ -242,7 +242,7 @@ int alarmUtil ( string & hostname,
if (( alarm.alarm_state != FM_ALARM_STATE_MSG ) &&
( curr_sev == alarm.severity ))
{
alog ("%s %s %s already at desired (%s) severity level\n",
alog1 ("%s %s %s already at desired (%s) severity level\n",
hostname.c_str(),
identity.c_str(),
instance.c_str(),
@ -259,18 +259,18 @@ int alarmUtil ( string & hostname,
{
if ( alarm.alarm_state == FM_ALARM_STATE_SET )
{
alog ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
alog1 ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
}
else
{
alog ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
alog1 ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
}
/* Debug Logs */
alog ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text );
alog ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action );
alog ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id );
alog ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n",
alog1 ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text );
alog1 ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action );
alog1 ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id );
alog1 ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n",
hostname.c_str(),
alarm.alarm_state,
alarm.severity,
@ -322,7 +322,7 @@ int alarmUtil ( string & hostname,
// send_log_message ( mtclogd_ptr, hostname.data(), &__alarmObject.varlog_filename[0],
// &__alarmObject.temp_str[0] );
alog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
alog1 ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
nodeUtil_latency_log ( hostname, NODEUTIL_LATENCY_MON_START , 0 );
if ( ( rc = fm_clear_fault_async ( &filter )) != FM_ERR_OK )
{

View File

@ -3902,7 +3902,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in
{
if ( node_ptr->mtcAlive_clstr == false )
{
alog2 ("%s %s mtcAlive received",
alog ("%s %s mtcAlive received",
node_ptr->hostname.c_str(),
get_iface_name_str(interface));
node_ptr->mtcAlive_clstr = true ;
@ -3912,7 +3912,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in
{
if ( node_ptr->mtcAlive_mgmnt == false )
{
alog2 ("%s %s mtcAlive received",
alog ("%s %s mtcAlive received",
node_ptr->hostname.c_str(),
get_iface_name_str(interface));
node_ptr->mtcAlive_mgmnt = true ;

View File

@ -454,7 +454,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
ilog ("%s searching for offline ; next reboot attempt in %d seconds\n",
node_ptr->hostname.c_str(), delay);
@ -507,7 +507,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
wlog ("%s Board Management Interface not accessible\n", node_ptr->hostname.c_str());
}
}
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay );
node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
break ;
@ -516,7 +516,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr->mtcCmd_timer.ring == true )
{
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
/* bmc power control reset by bmc */
rc = bmc_command_recv ( node_ptr );

View File

@ -3221,6 +3221,7 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_OFFLINE__START:
{
node_ptr->mtcAlive_count = 0 ;
node_ptr->mtcAlive_mgmnt = false ;
node_ptr->mtcAlive_clstr = false ;
node_ptr->offline_log_throttle = 0 ;
@ -3245,6 +3246,27 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
availStatus_enum_to_str(node_ptr->availStatus).c_str());
this->ctl_mtcAlive_gate ( node_ptr, false ) ;
/**
* Handle the race condition where a mtcAlive is
* received after the last check in MTC_OFFLINE__WAIT
* below but before this point, where the
* node_ptr->mtcAlive_<iface> state variables are
* cleared. The offline_search_count must be reset
* here as well.
**/
if (( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) && node_ptr->offline_search_count )
{
node_ptr->mtcAlive_online = true ;
ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
node_ptr->offline_search_count,
offline_threshold );
node_ptr->offline_search_count = 0 ; /* reset the count */
}
node_ptr->mtcAlive_mgmnt = false ;
node_ptr->mtcAlive_clstr = false ;
@ -3299,24 +3321,36 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
offline_threshold );
}
}
else
else if ( node_ptr->offline_search_count )
{
/**
* This algorithm was assuming the node is offline after
* offline_search_count reached offline_threshold count.
*
* Note: The mtcClient sends periodic mtcAlive messages
* until it is shutdown.
* This algorithm also explicitly 'requests' the message.
* The algorithm depends on not receiving the message, even
* when requested for offline_threshold counts 'in a row'.
*
* When shutdown is slowed or delayed, a late mtcAlive
* can trick this FSM into seeing the node as recovered
* when in fact it's still shutting down.
*
* To maintain the intended absence of mtcAlive messages
* count 'in a row', this check resets the search count
* if a mtcAlive is seen during the search.
**/
node_ptr->mtcAlive_online = true ;
if ( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr )
{
ilog_throttled ( node_ptr->offline_log_throttle, 10,
"%s still seeing mtcAlive (%c:%c)\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n');
}
else
{
alog ("%s still seeing mtcAlive (%c:%c)\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n');
}
ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
node_ptr->offline_search_count,
offline_threshold );
node_ptr->offline_search_count = 0 ; /* reset the search count */
}
if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE )

View File

@ -84,6 +84,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->unknown_health_reported = false ;
node_ptr->goEnabled_failed_subf = false ;
node_ptr->hbsClient_ready = false ;
/* load worker subfunction alarm state */
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,