Merge "Debian: Make Mtce offline handler more resilient to slow shutdowns"
This commit is contained in:
commit
03e953d8df
@ -151,7 +151,7 @@ EFmAlarmSeverityT alarmUtil_query ( string & hostname,
|
||||
ENTITY_PREFIX, hostname.data(), instance.data());
|
||||
}
|
||||
|
||||
alog ("entity_instance:%s\n", alarm_filter.entity_instance_id );
|
||||
alog1 ("entity_instance:%s\n", alarm_filter.entity_instance_id );
|
||||
if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
|
||||
{
|
||||
dlog ("Found with Severity: %d\n", alarm_query.severity );
|
||||
@ -183,7 +183,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr,
|
||||
memset(&alarm_filter, 0, sizeof(alarm_filter));
|
||||
snprintf ( alarm_filter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", identity.data());
|
||||
rc = fm_get_faults_by_id ( &alarm_filter.alarm_id, alarm_list_ptr, &max_alarms );
|
||||
alog ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc );
|
||||
alog1 ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc );
|
||||
if ( rc == FM_ERR_OK )
|
||||
{
|
||||
return (PASS);
|
||||
@ -242,7 +242,7 @@ int alarmUtil ( string & hostname,
|
||||
if (( alarm.alarm_state != FM_ALARM_STATE_MSG ) &&
|
||||
( curr_sev == alarm.severity ))
|
||||
{
|
||||
alog ("%s %s %s already at desired (%s) severity level\n",
|
||||
alog1 ("%s %s %s already at desired (%s) severity level\n",
|
||||
hostname.c_str(),
|
||||
identity.c_str(),
|
||||
instance.c_str(),
|
||||
@ -259,18 +259,18 @@ int alarmUtil ( string & hostname,
|
||||
{
|
||||
if ( alarm.alarm_state == FM_ALARM_STATE_SET )
|
||||
{
|
||||
alog ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
|
||||
alog1 ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
|
||||
}
|
||||
else
|
||||
{
|
||||
alog ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
|
||||
alog1 ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
|
||||
}
|
||||
|
||||
/* Debug Logs */
|
||||
alog ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text );
|
||||
alog ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action );
|
||||
alog ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id );
|
||||
alog ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n",
|
||||
alog1 ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text );
|
||||
alog1 ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action );
|
||||
alog1 ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id );
|
||||
alog1 ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n",
|
||||
hostname.c_str(),
|
||||
alarm.alarm_state,
|
||||
alarm.severity,
|
||||
@ -322,7 +322,7 @@ int alarmUtil ( string & hostname,
|
||||
// send_log_message ( mtclogd_ptr, hostname.data(), &__alarmObject.varlog_filename[0],
|
||||
// &__alarmObject.temp_str[0] );
|
||||
|
||||
alog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
|
||||
alog1 ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
|
||||
nodeUtil_latency_log ( hostname, NODEUTIL_LATENCY_MON_START , 0 );
|
||||
if ( ( rc = fm_clear_fault_async ( &filter )) != FM_ERR_OK )
|
||||
{
|
||||
|
@ -3902,7 +3902,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in
|
||||
{
|
||||
if ( node_ptr->mtcAlive_clstr == false )
|
||||
{
|
||||
alog2 ("%s %s mtcAlive received",
|
||||
alog ("%s %s mtcAlive received",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_iface_name_str(interface));
|
||||
node_ptr->mtcAlive_clstr = true ;
|
||||
@ -3912,7 +3912,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in
|
||||
{
|
||||
if ( node_ptr->mtcAlive_mgmnt == false )
|
||||
{
|
||||
alog2 ("%s %s mtcAlive received",
|
||||
alog ("%s %s mtcAlive received",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_iface_name_str(interface));
|
||||
node_ptr->mtcAlive_mgmnt = true ;
|
||||
|
@ -454,7 +454,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
else
|
||||
{
|
||||
int delay = (((offline_period*offline_threshold)/1000)+3);
|
||||
int delay = (((offline_period*offline_threshold)/1000)*3);
|
||||
ilog ("%s searching for offline ; next reboot attempt in %d seconds\n",
|
||||
node_ptr->hostname.c_str(), delay);
|
||||
|
||||
@ -507,7 +507,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
wlog ("%s Board Management Interface not accessible\n", node_ptr->hostname.c_str());
|
||||
}
|
||||
}
|
||||
int delay = (((offline_period*offline_threshold)/1000)+3);
|
||||
int delay = (((offline_period*offline_threshold)/1000)*3);
|
||||
mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay );
|
||||
node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
|
||||
break ;
|
||||
@ -516,7 +516,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if ( node_ptr->mtcCmd_timer.ring == true )
|
||||
{
|
||||
int delay = (((offline_period*offline_threshold)/1000)+3);
|
||||
int delay = (((offline_period*offline_threshold)/1000)*3);
|
||||
|
||||
/* bmc power control reset by bmc */
|
||||
rc = bmc_command_recv ( node_ptr );
|
||||
|
@ -3221,6 +3221,7 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
case MTC_OFFLINE__START:
|
||||
{
|
||||
node_ptr->mtcAlive_count = 0 ;
|
||||
node_ptr->mtcAlive_mgmnt = false ;
|
||||
node_ptr->mtcAlive_clstr = false ;
|
||||
node_ptr->offline_log_throttle = 0 ;
|
||||
@ -3245,6 +3246,27 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
availStatus_enum_to_str(node_ptr->availStatus).c_str());
|
||||
|
||||
this->ctl_mtcAlive_gate ( node_ptr, false ) ;
|
||||
|
||||
/**
|
||||
* Handle the race condition case where the
|
||||
* mtcAlive was received after the last check
|
||||
* while in MTC_OFFLINE__WAIT below and here when
|
||||
* the node_ptr->mtcAlive_<iface> state variables
|
||||
* are cleared. Need to also clear the
|
||||
* offline_search_count here as well.
|
||||
**/
|
||||
if (( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) && node_ptr->offline_search_count )
|
||||
{
|
||||
node_ptr->mtcAlive_online = true ;
|
||||
ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtcAlive_count,
|
||||
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
|
||||
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
|
||||
node_ptr->offline_search_count,
|
||||
offline_threshold );
|
||||
node_ptr->offline_search_count = 0 ; /* reset the count */
|
||||
}
|
||||
node_ptr->mtcAlive_mgmnt = false ;
|
||||
node_ptr->mtcAlive_clstr = false ;
|
||||
|
||||
@ -3299,24 +3321,36 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
offline_threshold );
|
||||
}
|
||||
}
|
||||
else
|
||||
else if ( node_ptr->offline_search_count )
|
||||
{
|
||||
/**
|
||||
* This algorithm was assuming the node is offline after
|
||||
* offline_search_count reached offline_threshold count.
|
||||
*
|
||||
* Note: The mtcClient sends periodic mtcAlive messages
|
||||
* until it is shutdown.
|
||||
* This algorithm also explicitely 'requests' the message.
|
||||
* The algorithm depends on not receving the message, even
|
||||
* when requested for offline_threshold counts 'in a row'.
|
||||
*
|
||||
* When shutdown is slowed or delayed, a late mtcAlive
|
||||
* can trick this FSM into seeing the node as recovered
|
||||
* when in fact its still shuttingdown.
|
||||
*
|
||||
* To maintain the intended absence of mtcAlive messages
|
||||
* count 'in a row', this check resets the search count
|
||||
* if a mtcAlive is seen during the search.
|
||||
**/
|
||||
|
||||
node_ptr->mtcAlive_online = true ;
|
||||
if ( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr )
|
||||
{
|
||||
ilog_throttled ( node_ptr->offline_log_throttle, 10,
|
||||
"%s still seeing mtcAlive (%c:%c)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
|
||||
node_ptr->mtcAlive_clstr ? 'Y' : 'n');
|
||||
}
|
||||
else
|
||||
{
|
||||
alog ("%s still seeing mtcAlive (%c:%c)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
|
||||
node_ptr->mtcAlive_clstr ? 'Y' : 'n');
|
||||
}
|
||||
ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtcAlive_count,
|
||||
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
|
||||
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
|
||||
node_ptr->offline_search_count,
|
||||
offline_threshold );
|
||||
node_ptr->offline_search_count = 0 ; /* reset the search count */
|
||||
}
|
||||
|
||||
if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE )
|
||||
|
@ -84,6 +84,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
node_ptr->unknown_health_reported = false ;
|
||||
node_ptr->goEnabled_failed_subf = false ;
|
||||
node_ptr->hbsClient_ready = false ;
|
||||
|
||||
/* load worker subfunciton alarm state */
|
||||
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
|
||||
|
Loading…
x
Reference in New Issue
Block a user