Eric MacDonald c038b1a9a7 Collectd+InfluxDb-RMON Replacement(ALL METRICS) P1
This update adds Maintenance support for receiving host degrade assert
and clear messages from collectd.
This update also disables platform memory, cpu and file system resource
monitoring in the maintenance resource monitor process rmon.
These disabled resources are now monitored by collectd and therefore
should not be monitored by rmond any longer.

Change-Id: I13fd033bb1d14f299dcb97fa80296641c958d0a9
Signed-off-by: Jack Ding <jack.ding@windriver.com>
2018-07-03 11:04:27 -04:00

8579 lines
292 KiB
C++
Executable File

/*
* Copyright (c) 2013-2016 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
* @file
* Wind River CGTS Platform Nodal Health Check Service Node Implementation
*/
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <errno.h> /* for ENODEV, EFAULT and ENXIO */
#include <unistd.h> /* for close and usleep */
using namespace std;
#ifdef __AREA__
#undef __AREA__
#define __AREA__ "---"
#endif
#include "nodeBase.h"
#include "threadUtil.h"
#include "nodeClass.h"
#include "nodeUtil.h"
#include "mtcNodeMsg.h" /* for ... send_mtc_cmd */
#include "nlEvent.h" /* for ... get_netlink_events */
#include "daemon_common.h"
#include "alarmUtil.h"
#include "mtcAlarm.h"
#include "alarm.h"
#include "hbsAlarm.h"
extern void mtcTimer_handler ( int sig, siginfo_t *si, void *uc);
const char mtc_nodeAdminAction_str[MTC_ADMIN_ACTIONS][20] =
{
"none",
"lock",
"unlock",
"reset",
"reboot",
"reinstall",
"power-off",
"power-on",
"recovery",
"delete",
"powercycle",
"add",
"swact",
"force-lock",
"force-swact",
"enable",
"enable-subf",
};
const char * get_adminAction_str ( mtc_nodeAdminAction_enum action )
{
if ( action > MTC_ADMIN_ACTIONS )
{
slog ("Invalid admin action (%d)\n", action);
action = MTC_ADMIN_ACTION__NONE ;
}
return ( &mtc_nodeAdminAction_str[action][0] );
}
const char mtc_nodeAdminState_str[MTC_ADMIN_STATES][15] =
{
"locked",
"unlocked",
};
string get_adminState_str ( mtc_nodeAdminState_enum adminState )
{
if ( adminState > MTC_ADMIN_STATES )
{
slog ("Invalid admin state (%d)\n", adminState );
adminState = MTC_ADMIN_STATE__LOCKED ;
}
return ( mtc_nodeAdminState_str[adminState] );
}
bool adminStateOk ( string admin )
{
if (( admin.compare(mtc_nodeAdminState_str[0])) &&
( admin.compare(mtc_nodeAdminState_str[1])))
{
wlog ("Invalid 'admin' state (%s)\n", admin.c_str());
return ( false );
}
return (true);
}
const char mtc_nodeOperState_str[MTC_OPER_STATES][15] =
{
"disabled",
"enabled"
};
string get_operState_str ( mtc_nodeOperState_enum operState )
{
if ( operState > MTC_OPER_STATES )
{
slog ("Invalid oper state (%d)\n", operState );
operState = MTC_OPER_STATE__DISABLED ;
}
return ( mtc_nodeOperState_str[operState] );
}
bool operStateOk ( string oper )
{
if (( oper.compare(mtc_nodeOperState_str[0])) &&
( oper.compare(mtc_nodeOperState_str[1])))
{
wlog ("Invalid 'oper' state (%s)\n", oper.c_str());
return ( false );
}
return (true);
}
const char mtc_nodeAvailStatus_str[MTC_AVAIL_STATUS][15] =
{
"not-installed",
"available",
"degraded",
"failed",
"intest",
"power-off",
"offline",
"online",
"offduty"
};
bool availStatusOk ( string avail )
{
if (( avail.compare(mtc_nodeAvailStatus_str[0])) &&
( avail.compare(mtc_nodeAvailStatus_str[1])) &&
( avail.compare(mtc_nodeAvailStatus_str[2])) &&
( avail.compare(mtc_nodeAvailStatus_str[3])) &&
( avail.compare(mtc_nodeAvailStatus_str[4])) &&
( avail.compare(mtc_nodeAvailStatus_str[5])) &&
( avail.compare(mtc_nodeAvailStatus_str[6])) &&
( avail.compare(mtc_nodeAvailStatus_str[7])) &&
( avail.compare(mtc_nodeAvailStatus_str[8])))
{
wlog ("Invalid 'avail' status (%s)\n", avail.c_str());
return ( false );
}
return (true);
}
string get_availStatus_str ( mtc_nodeAvailStatus_enum availStatus )
{
if ( availStatus > MTC_AVAIL_STATUS )
{
slog ("Invalid avail status (%d)\n", availStatus );
availStatus = MTC_AVAIL_STATUS__FAILED ;
}
return ( mtc_nodeAvailStatus_str[availStatus] );
}
#ifdef WANT_nodeClass_latency_log /* Needs to be tied to a node */
#define NODECLASS_LATENCY_MON_START ((const char *)"start")
#define MAX_DELAY_B4_LATENCY_LOG (1700)
void nodeClass_latency_log ( const char * label_ptr, int msecs )
{
static unsigned long long prev__time = 0 ;
static unsigned long long this__time = 0 ;
this__time = gettime_monotonic_nsec () ;
/* If label_ptr is != NULL and != start then take the measurement */
if ( label_ptr && strncmp ( label_ptr, NODECLASS_LATENCY_MON_START, strlen(NODECLASS_LATENCY_MON_START)))
{
if ( this__time > (prev__time + (NSEC_TO_MSEC*(msecs))))
{
llog ("%4llu.%-4llu msec - %s\n",
((this__time-prev__time) > NSEC_TO_MSEC) ? ((this__time-prev__time)/NSEC_TO_MSEC) : 0,
((this__time-prev__time) > NSEC_TO_MSEC) ? ((this__time-prev__time)%NSEC_TO_MSEC) : 0,
label_ptr);
}
}
/* reset to be equal for next round */
prev__time = this__time ;
}
#endif
/* nodeLinkClass constructor */
nodeLinkClass::nodeLinkClass()
{
this->is_poweron_handler = NULL;
for(unsigned int i=0; i<MAX_NODES; ++i)
{
this->node_ptrs[i] = NULL;
}
this->offline_threshold = 0;
this->offline_period = 0;
/* this->mtcTimer = mtc_timer();
* this->mtcTimer_mnfa = mtc_timer();
* this->mtcTimer_token = mtc_timer();
* this->mtcTimer_uptime = mtc_timer();
*/
this->api_retries = 0;
for(unsigned int i =0; i<MAX_IFACES; ++i)
{
this->pulse_requests[i] = 0;
this->hbs_expected_pulses[i] = 0;
this->hbs_detected_pulses[i] = 0;
}
this->compute_mtcalive_timeout = 0;
this->controller_mtcalive_timeout = 0;
this->goenabled_timeout = 0;
this->loc_recovery_timeout = 0;
this->mnfa_recovery_timeout = 0;
this->node_reinstall_timeout = 0;
this->token_refresh_rate = 0;
this->autorecovery_enabled = false ;
this->autorecovery_disabled = false ;
head = tail = NULL;
memory_allocs = 0 ;
memory_used = 0 ;
hosts = 0 ;
host_deleted = false ;
/* Init the base level pulse info and pointers for all interfaces */
pulse_ptr = NULL ;
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
pulse_list[i].head_ptr = NULL ;
pulse_list[i].tail_ptr = NULL ;
pulse_list[i].last_ptr = NULL ;
pulses[i] = 0 ;
}
/* init the resource reference index to null */
rrri = 0 ;
/* Entry of RRA is reserved (not used) and set to NULL */
hbs_rra[0] = static_cast<struct node *>(NULL) ;
/* Make no assumption on the service */
maintenance = false ;
heartbeat = false ;
active = false ;
/* Set some defaults for the hearbeat service */
hbs_ready = false ;
hbs_state_change = false ;
hbs_disabled = true ;
hbs_pulse_period = hbs_pulse_period_save = 200 ;
hbs_minor_threshold = HBS_MINOR_THRESHOLD ;
hbs_degrade_threshold = HBS_DEGRADE_THRESHOLD ;
hbs_failure_threshold = HBS_FAILURE_THRESHOLD ;
hbs_silent_fault_detector = 0 ;
hbs_silent_fault_logged = false ;
/* Start with null identity */
my_hostname.clear() ;
my_local_ip.clear() ;
my_float_ip.clear() ;
active_controller_hostname.clear() ;
inactive_controller_hostname.clear() ;
/* Start with no failures */
mnfa_awol_list.clear();
mnfa_host_count[MGMNT_IFACE] = 0 ;
mnfa_host_count[INFRA_IFACE] = 0 ;
mnfa_occurances = 0 ;
mnfa_active = false ;
mnfa_threshold_type = MNFA_NUMBER ;
mnfa_threshold_percent = 5 ;
mnfa_threshold_number = 3 ;
mnfa_threshold = mnfa_threshold_number ;
mgmnt_link_up_and_running = false ;
infra_link_up_and_running = false ;
infra_network_provisioned = false ;
infra_degrade_only = false ;
dor_mode_active = false ;
dor_start_time = 0 ;
dor_mode_active_log_throttle = 0 ;
swact_timeout = MTC_MINS_2 ;
uptime_period = MTC_UPTIME_REFRESH_TIMER ;
online_period = MTC_OFFLINE_TIMER ;
sysinv_timeout = HTTP_SYSINV_CRIT_TIMEOUT ;
sysinv_noncrit_timeout = HTTP_SYSINV_NONC_TIMEOUT ;
work_queue_timeout = MTC_WORKQUEUE_TIMEOUT ;
/* Inservice test periods in seconds - 0 = disabled */
insv_test_period = 0 ;
oos_test_period = 0 ;
/* Init the inotify shadow password file descriptors to zero */
inotify_shadow_file_fd = 0 ;
inotify_shadow_file_wd = 0 ;
/* Ensure that HA Swact gate is open on init.
* This true gates maintenance commands */
smgrEvent.mutex = false ;
/* Init the event bases to null as they have not been allocated yet */
sysinvEvent.base = NULL ;
smgrEvent.base = NULL ;
tokenEvent.base = NULL ;
sysinvEvent.conn = NULL ;
smgrEvent.conn = NULL ;
tokenEvent.conn = NULL ;
sysinvEvent.req = NULL ;
smgrEvent.req = NULL ;
tokenEvent.req = NULL ;
sysinvEvent.buf = NULL ;
smgrEvent.buf = NULL ;
tokenEvent.buf = NULL ;
unknown_host_throttle = 0 ;
invalid_arg_throttle = 0 ;
testmode = 0 ;
module_init( );
}
/* nodeLinkClass destructor */
nodeLinkClass::~nodeLinkClass()
{
/* Free any allocated host memory */
for ( int i = 0 ; i < MAX_HOSTS ; i++ )
{
if ( node_ptrs[i] )
{
delete node_ptrs[i] ;
}
}
}
/* Clear start host service controls */
void nodeLinkClass::clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr )
{
node_ptr->start_services_needed = false ;
node_ptr->start_services_needed_subf = false ;
node_ptr->start_services_running_main = false ;
node_ptr->start_services_running_subf = false ;
node_ptr->start_services_retries = 0 ;
}
}
/* Clear all the main function enable failure bools */
void nodeLinkClass::clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr )
{
node_ptr->config_failed = false ;
node_ptr->goEnabled_failed = false ;
node_ptr->inservice_failed = false ;
node_ptr->hostservices_failed = false ;
return;
}
slog ("null pointer\n");
}
/* Clear all the sub function enable failure bools */
void nodeLinkClass::clear_subf_failed_bools ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr )
{
node_ptr->config_failed_subf = false ;
node_ptr->goEnabled_failed_subf = false ;
node_ptr->inservice_failed_subf = false ;
node_ptr->hostservices_failed_subf = false ;
return;
}
slog ("null pointer\n");
}
/*
* Allocates memory for a new node and stores its the address in node_ptrs
* @param void
* @return node pointer to the newly allocted node
*/
struct nodeLinkClass::node * nodeLinkClass::newNode ( void )
{
struct nodeLinkClass::node * temp_node_ptr = NULL ;
if ( memory_allocs == 0 )
{
memset ( node_ptrs, 0 , sizeof(struct node *)*MAX_NODES);
}
// find an empty spot
for ( int i = 0 ; i < MAX_NODES ; i++ )
{
if ( node_ptrs[i] == NULL )
{
node_ptrs[i] = temp_node_ptr = new node ;
memory_allocs++ ;
memory_used += sizeof (struct nodeLinkClass::node);
return temp_node_ptr ;
}
}
elog ( "Failed to save new node pointer address\n" );
return temp_node_ptr ;
}
/* Frees the memory of a pre-allocated node and removes
* it from the node_ptrs list
* @param node * pointer to the node memory address to be freed
* @return int return code { PASS or -EINVAL }
*/
int nodeLinkClass::delNode ( struct nodeLinkClass::node * node_ptr )
{
if ( memory_allocs > 0 )
{
for ( int i = 0 ; i < MAX_NODES ; i++ )
{
if ( node_ptrs[i] == node_ptr )
{
delete node_ptr ;
node_ptrs[i] = NULL ;
memory_allocs-- ;
memory_used -= sizeof (struct nodeLinkClass::node);
return PASS ;
}
}
elog ( "Error: Unable to validate memory address being freed\n" );
}
else
elog ( "Error: Free memory called when there is no memory to free\n" );
return -EINVAL ;
}
/*
* Allocate new node and tack it on the end of the node_list
*/
struct
nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
{
/* verify node is not already provisioned */
struct node * ptr = getNode ( hostname );
if ( ptr )
{
/* if it is then clean it up and fall through */
if ( !testmode )
{
wlog ("Warning: Node already provisioned\n");
}
if ( remNode ( hostname ) )
{
/* Should never get here but if we do then */
/* something is seriously wrong */
elog ("Error: Unable to remove node during reprovision\n");
return static_cast<struct node *>(NULL);
}
}
/* allocate memory for new node */
ptr = newNode ();
if( ptr == NULL )
{
elog ( "Error: Failed to allocate memory for new node\n" );
return static_cast<struct node *>(NULL);
}
/* init the new node */
ptr->hostname = hostname ;
ptr->ip = "" ;
ptr->mac = "" ;
ptr->infra_ip = "" ;
ptr->infra_mac = "" ;
ptr->patching = false ;
ptr->patched = false ;
/* the goenabled state bool */
ptr->goEnabled = false ;
ptr->goEnabled_subf = false ;
clear_hostservices_ctls ( ptr );
/* clear all the enable failure bools */
clear_main_failed_bools ( ptr );
clear_subf_failed_bools ( ptr );
/* Set the subfunction to disabled */
ptr->operState_subf = MTC_OPER_STATE__DISABLED ;
ptr->availStatus_subf = MTC_AVAIL_STATUS__NOT_INSTALLED ;
ptr->operState_dport = MTC_OPER_STATE__DISABLED ;
ptr->availStatus_dport= MTC_AVAIL_STATUS__OFFDUTY ;
ptr->enabled_count = 0 ;
ptr->cmdName = "";
ptr->cmdReq = 0 ;
ptr->cmdRsp = 0 ;
ptr->cmdRsp_status= 0 ;
ptr->cmdRsp_status_string = "" ;
ptr->add_completed = false ;
/* init the hwmon reset and powercycle recovery control structures */
recovery_ctrl_init ( ptr->hwmon_reset );
recovery_ctrl_init ( ptr->hwmon_powercycle );
/* Default timeout values */
ptr->mtcalive_timeout = HOST_MTCALIVE_TIMEOUT ;
/* no ned to send a reboot response back to any client */
ptr->activeClient = CLIENT_NONE ;
ptr->task = "none" ;
ptr->action = "none" ;
ptr->clear_task = false ;
ptr->mtcAlive_gate = true ;
ptr->mtcAlive_online = false ;
ptr->mtcAlive_offline = true ;
ptr->mtcAlive_misses = 0 ;
ptr->mtcAlive_hits = 0 ;
ptr->mtcAlive_count = 0 ;
ptr->mtcAlive_purge = 0 ;
ptr->offline_search_count = 0 ;
ptr->mtcAlive_mgmnt = false ;
ptr->mtcAlive_infra = false ;
ptr->reboot_cmd_ack_mgmnt = false ;
ptr->reboot_cmd_ack_infra = false ;
ptr->offline_log_reported = true ;
ptr->online_log_reported = false ;
ptr->dor_recovery_mode = false ;
ptr->was_dor_recovery_mode= false ;
ptr->dor_recovery_time = 0 ;
mtcTimer_init ( ptr->mtcTimer, hostname, "mtc timer"); /* Init node's general mtc timer */
mtcTimer_init ( ptr->insvTestTimer, hostname, "insv test timer");
mtcTimer_init ( ptr->oosTestTimer, hostname, "oos test timer"); /* Init node's oos test timer */
mtcTimer_init ( ptr->mtcSwact_timer, hostname, "mtcSwact timer"); /* Init node's mtcSwact timer */
mtcTimer_init ( ptr->mtcCmd_timer, hostname, "mtcCmd timer"); /* Init node's mtcCmd timer */
mtcTimer_init ( ptr->mtcConfig_timer, hostname, "mtcConfig timer"); /* Init node's mtcConfig timer */
mtcTimer_init ( ptr->mtcAlive_timer , hostname, "mtcAlive timer"); /* Init node's mtcAlive timer */
mtcTimer_init ( ptr->offline_timer, hostname, "offline timer"); /* Init node's FH offline timer */
mtcTimer_init ( ptr->http_timer, hostname, "http timer" ); /* Init node's http timer */
mtcTimer_init ( ptr->bm_timer, hostname, "bm timer" ); /* Init node's bm timer */
mtcTimer_init ( ptr->bm_ping_info.timer,hostname,"ping timer" ); /* Init node's ping timer */
mtcTimer_init ( ptr->bmc_access_timer, hostname, "bmc acc timer" ); /* Init node's bm access timer */
mtcTimer_init ( ptr->host_services_timer, hostname, "host services timer" ); /* host services timer */
mtcTimer_init ( ptr->hwmon_powercycle.control_timer, hostname, "powercycle control timer");
mtcTimer_init ( ptr->hwmon_powercycle.recovery_timer, hostname, "powercycle recovery timer");
mtcTimer_init ( ptr->hwmon_reset.control_timer, hostname, "reset control timer");
mtcTimer_init ( ptr->hwmon_reset.recovery_timer, hostname, "reset recovery timer");
mtcCmd_init ( ptr->host_services_req );
mtcCmd_init ( ptr->mtcAlive_req );
mtcCmd_init ( ptr->reboot_req );
mtcCmd_init ( ptr->general_req );
ptr->configStage = MTC_CONFIG__START ;
ptr->swactStage = MTC_SWACT__START ;
ptr->offlineStage = MTC_OFFLINE__IDLE ;
ptr->onlineStage = MTC_ONLINE__START ;
ptr->addStage = MTC_ADD__START ;
ptr->delStage = MTC_DEL__START ;
ptr->recoveryStage = MTC_RECOVERY__START ;
ptr->insvTestStage = MTC_INSV_TEST__RUN ; /* Start wo initial delay */
ptr->oosTestStage = MTC_OOS_TEST__LOAD_NEXT_TEST ;
ptr->resetProgStage = MTC_RESETPROG__START;
ptr->powerStage = MTC_POWER__DONE ;
ptr->powercycleStage = MTC_POWERCYCLE__DONE;
ptr->subStage = MTC_SUBSTAGE__DONE ;
ptr->reinstallStage = MTC_REINSTALL__DONE ;
ptr->resetStage = MTC_RESET__START ;
ptr->handlerStage.enable = MTC_ENABLE__START ; /* Enable and Disable */
ptr->oos_test_count = 0 ;
ptr->insv_test_count = 0 ;
ptr->insv_recovery_counter = 0 ;
ptr->uptime = 0 ;
ptr->uptime_refresh_counter = 0 ;
ptr->node_unlocked_counter = 0 ;
/* Default to a healthy config until mtcAlive messages prove otherwise */
ptr->mtce_flags = ( MTC_FLAG__I_AM_CONFIGURED |
MTC_FLAG__I_AM_HEALTHY ) ;
ptr->graceful_recovery_counter = 0 ;
ptr->health_threshold_counter = 0 ;
ptr->unknown_health_reported = false ;
ptr->mnfa_graceful_recovery = false ;
/* initialize all board management variables for this host */
ptr->bm_ip = NONE ;
ptr->bm_type = NONE ;
ptr->bm_un = NONE ;
ptr->bm_pw = NONE ;
ptr->bm_provisioned = false ; /* assume not provisioned until learned */
ptr->power_on = false ; /* learned on first BMC connection */
bmc_access_data_init ( ptr ); /* init all the BMC access vars all modes */
/* init the alarm array only to have it updated later
* with current alarm severities */
for ( int id = 0 ; id < MAX_ALARMS ; id++ )
{
ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
}
ptr->alarms_loaded = false ;
ptr->cfgEvent.base = NULL ;
ptr->sysinvEvent.base= NULL ;
ptr->vimEvent.base = NULL ;
ptr->httpReq.base = NULL ;
ptr->libEvent_done_fifo.clear();
ptr->libEvent_work_fifo.clear();
ptr->oper_sequence = 0 ;
ptr->oper_failures = 0 ;
ptr->mtcCmd_work_fifo.clear();
ptr->mtcCmd_done_fifo.clear();
ptr->cfgEvent.conn = NULL ;
ptr->sysinvEvent.conn= NULL ;
ptr->vimEvent.conn = NULL ;
ptr->httpReq.conn = NULL ;
ptr->cfgEvent.req = NULL ;
ptr->sysinvEvent.req = NULL ;
ptr->vimEvent.req = NULL ;
ptr->httpReq.req = NULL ;
ptr->cfgEvent.buf = NULL ;
ptr->sysinvEvent.buf = NULL ;
ptr->vimEvent.buf = NULL ;
ptr->httpReq.buf = NULL ;
ptr->stall_recovery_log_throttle = 0 ;
ptr->stall_monitor_log_throttle = 0 ;
ptr->unexpected_pulse_log_throttle = 0 ;
ptr->lookup_mismatch_log_throttle = 0 ;
ptr->log_throttle = 0 ;
ptr->no_work_log_throttle = 0 ;
ptr->degrade_mask = ptr->degrade_mask_save = DEGRADE_MASK_NONE ;
ptr->degraded_resources_list.clear () ;
ptr->pmond_ready = false ;
ptr->rmond_ready = false ;
ptr->hwmond_ready = false ;
ptr->hbsClient_ready = false ;
ptr->toggle = false ;
ptr->retries = 0 ;
ptr->http_retries_cur = 0 ;
ptr->cmd_retries = 0 ;
ptr->power_action_retries = 0 ;
ptr->subf_enabled = false ;
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
ptr->pulse_link[i].next_ptr = NULL ;
ptr->pulse_link[i].prev_ptr = NULL ;
ptr->monitor[i] = false ;
ptr->hbs_minor[i] = false ;
ptr->hbs_degrade[i] = false ;
ptr->hbs_failure[i] = false ;
ptr->max_count[i] = 0 ;
ptr->hbs_count[i] = 0 ;
ptr->hbs_minor_count[i] = 0 ;
ptr->b2b_misses_count[i] = 0 ;
ptr->hbs_degrade_count[i] = 0 ;
ptr->hbs_failure_count[i] = 0 ;
ptr->heartbeat_failed[i] = false;
}
ptr->health = NODE_HEALTH_UNKNOWN ;
ptr->pmon_missing_count = 0;
ptr->pmon_degraded = false ;
/* now add it to the node list ; dealing with all conditions */
/* if the node list is empty add it to the head */
if( head == NULL )
{
head = ptr ;
tail = ptr ;
ptr->prev = NULL ;
ptr->next = NULL ;
}
else
{
/* link in the new_node to the tail of the node_list
* then mark the next field as the end of the node_list
* adjust tail to point to the last node
*/
tail->next = ptr ;
ptr->prev = tail ;
ptr->next = NULL ;
tail = ptr ;
}
/* start with no action and an empty todo list */
ptr->adminAction = MTC_ADMIN_ACTION__NONE ;
ptr->adminAction_todo_list.clear();
ptr->handlerStage.enable = MTC_ENABLE__START;
hosts++ ;
/* (re)build the Resource Reference Array */
if ( heartbeat )
build_rra ();
return ptr ;
}
struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname )
{
/* check for empty list condition */
if ( head == NULL )
return NULL ;
if ( hostname.empty() )
return static_cast<struct node *>(NULL);
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( !hostname.compare ( ptr->hostname ))
{
return ptr ;
}
/* Node can be looked up by ip addr too */
if ( !hostname.compare ( ptr->ip ))
{
return ptr ;
}
/* Node can be looked up by infra_ip addr too */
if ( !hostname.compare ( ptr->infra_ip ))
{
return ptr ;
}
/* Node can be looked up by uuid too */
if ( !hostname.compare ( ptr->uuid ))
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node* nodeLinkClass::getEventBaseNode ( libEvent_enum request,
struct event_base * base_ptr)
{
struct node * ptr = static_cast<struct node *>(NULL) ;
/* check for empty list condition */
if ( head == NULL )
return NULL ;
if ( base_ptr == NULL )
return NULL ;
for ( ptr = head ; ; ptr = ptr->next )
{
switch ( request )
{
case SYSINV_HOST_QUERY:
{
if ( ptr->sysinvEvent.base == base_ptr )
{
hlog1 ("%s Found Sysinv Event Base Pointer (%p)\n",
ptr->hostname.c_str(), ptr->sysinvEvent.base);
return ptr ;
}
}
case VIM_HOST_DISABLED:
case VIM_HOST_ENABLED:
case VIM_HOST_OFFLINE:
case VIM_HOST_FAILED:
{
if ( ptr->vimEvent.base == base_ptr )
{
hlog1 ("%s Found vimEvent Base Pointer (%p) \n",
ptr->hostname.c_str(), ptr->vimEvent.base);
return ptr ;
}
}
default:
;
} /* End Switch */
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
wlog ("%s Event Base Pointer (%p) - Not Found\n",
ptr->hostname.c_str(), base_ptr);
return static_cast<struct node *>(NULL);
}
/* Find the node in the list of nodes being heartbeated and splice it out */
int nodeLinkClass::remNode( string hostname )
{
int rc = -ENODEV ;
if ( hostname.c_str() == NULL )
return -EFAULT ;
if ( head == NULL )
return -ENXIO ;
struct node * ptr = getNode ( hostname );
if ( ptr == NULL )
return -EFAULT ;
mtcTimer_fini ( ptr->mtcTimer );
mtcTimer_fini ( ptr->mtcSwact_timer );
mtcTimer_fini ( ptr->mtcAlive_timer );
mtcTimer_fini ( ptr->offline_timer );
mtcTimer_fini ( ptr->mtcCmd_timer );
mtcTimer_fini ( ptr->http_timer );
mtcTimer_fini ( ptr->insvTestTimer );
mtcTimer_fini ( ptr->oosTestTimer );
mtcTimer_fini ( ptr->mtcConfig_timer );
mtcTimer_fini ( ptr->host_services_timer );
mtcTimer_fini ( ptr->hwmon_powercycle.control_timer );
mtcTimer_fini ( ptr->hwmon_powercycle.recovery_timer );
mtcTimer_fini ( ptr->hwmon_reset.control_timer );
mtcTimer_fini ( ptr->hwmon_reset.recovery_timer );
mtcTimer_fini ( ptr->bm_timer );
mtcTimer_fini ( ptr->bmc_access_timer );
mtcTimer_fini ( ptr->bm_ping_info.timer );
#ifdef WANT_PULSE_LIST_SEARCH_ON_DELETE
/* Splice the node out of the pulse monitor list */
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
/* Does the pulse monitor list exist ? */
if ( pulse_list[i].head_ptr != NULL )
{
pulse_ptr = ptr ;
if ( pulse_list[i].head_ptr == pulse_ptr )
{
if ( pulse_list[i].head_ptr == pulse_list[i].tail_ptr )
{
pulse_list[i].head_ptr = NULL ;
pulse_list[i].tail_ptr = NULL ;
dlog ("Pulse: Single Node -> Head Case\n");
}
else
{
dlog ("Pulse: Multiple Nodes -> Head Case\n");
pulse_list[i].head_ptr = pulse_list[i].head_ptr->pulse_link[i].next_ptr ;
pulse_list[i].head_ptr->pulse_link[i].prev_ptr = NULL ;
}
}
else if ( pulse_list[i].tail_ptr == pulse_ptr )
{
dlog ("Pulse: Multiple Node -> Tail Case\n");
pulse_list[i].tail_ptr = pulse_list[i].tail_ptr->pulse_link[i].prev_ptr ;
pulse_list[i].tail_ptr->pulse_link[i].next_ptr = NULL ;
}
else
{
dlog ("Pulse: Multiple Node -> Full Splice Out\n");
pulse_ptr->pulse_link[i].prev_ptr->pulse_link[i].next_ptr = pulse_ptr->pulse_link[i].next_ptr ;
pulse_ptr->pulse_link[i].next_ptr->pulse_link[i].prev_ptr = pulse_ptr->pulse_link[i].prev_ptr ;
}
}
}
#endif
/* If the node is the head node */
if ( ptr == head )
{
/* only one node in the list case */
if ( head == tail )
{
dlog ("Single Node -> Head Case\n");
head = NULL ;
tail = NULL ;
delNode ( ptr );
rc = PASS ;
}
else
{
dlog ("Multiple Nodes -> Head Case\n");
head = head->next ;
head->prev = NULL ;
delNode ( ptr );
rc = PASS ;
}
}
/* if not head but tail then there must be more than one
* node in the list so go ahead and chop the tail.
*/
else if ( ptr == tail )
{
dlog ("Multiple Node -> Tail Case\n");
tail = tail->prev ;
tail->next = NULL ;
delNode ( ptr );
rc = PASS ;
}
else
{
dlog ("Multiple Node -> Full Splice Out\n");
ptr->prev->next = ptr->next ;
ptr->next->prev = ptr->prev ;
delNode( ptr );
rc = PASS ;
}
hosts-- ;
/* (re)build the Resource Reference Array */
if ( heartbeat )
build_rra ();
return rc ;
}
/**
* Node state set'ers and get'ers
*/
mtc_nodeAdminAction_enum nodeLinkClass::get_adminAction ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
return ( node_ptr->adminAction );
elog ("Failed getting 'admin action' for '%s'\n", hostname.c_str());
return (MTC_ADMIN_ACTION__NONE);
}
int nodeLinkClass::set_adminAction ( string & hostname, mtc_nodeAdminAction_enum adminAction )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
{
adminActionChange ( node_ptr, adminAction ) ;
return (PASS) ;
}
elog ("Failed setting 'admin action' for '%s'\n", hostname.c_str());
return (FAIL) ;
}
mtc_nodeAdminState_enum nodeLinkClass::get_adminState ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
return ( node_ptr->adminState );
elog ("Failed getting 'admin state' state for '%s'\n", hostname.c_str());
return (MTC_ADMIN_STATE__LOCKED);
}
int nodeLinkClass::set_adminState ( string & hostname, mtc_nodeAdminState_enum adminState )
{
int rc = FAIL ;
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
{
rc = nodeLinkClass::adminStateChange ( node_ptr , adminState );
}
if ( rc )
{
elog ("Failed setting 'admin state' for '%s'\n", hostname.c_str());
}
return (rc) ;
}
mtc_nodeOperState_enum nodeLinkClass::get_operState ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
return ( node_ptr->operState );
elog ("Failed getting 'operational state' for '%s'\n", hostname.c_str());
return (MTC_OPER_STATE__DISABLED);
}
int nodeLinkClass::set_operState ( string & hostname, mtc_nodeOperState_enum operState )
{
int rc = FAIL ;
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
{
rc = nodeLinkClass::operStateChange ( node_ptr , operState );
}
if ( rc )
{
elog ("Failed setting 'operational state' for '%s'\n", hostname.c_str());
}
return (rc) ;
}
mtc_nodeAvailStatus_enum nodeLinkClass::get_availStatus ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
return ( node_ptr->availStatus );
elog ("Failed getting 'availability status' for '%s'\n", hostname.c_str());
return (MTC_AVAIL_STATUS__OFFDUTY);
}
int nodeLinkClass::set_availStatus ( string & hostname, mtc_nodeAvailStatus_enum availStatus )
{
int rc = FAIL ;
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
rc = nodeLinkClass::availStatusChange ( node_ptr , availStatus );
}
if ( rc )
{
elog ("Failed setting 'availability status' for '%s'\n", hostname.c_str());
}
return (FAIL) ;
}
/** Return a string representing the data port operational state
* according to the X.731 standard */
string nodeLinkClass::get_operState_dport ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
{
return ( operState_enum_to_str(node_ptr->operState_dport));
}
elog ("%s failed getting 'operState_dport'\n", hostname.c_str());
return ("");
}
/** Return a string representing the data port availability status
* according to the X.731 standard */
string nodeLinkClass::get_availStatus_dport ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr )
{
return ( availStatus_enum_to_str(node_ptr->availStatus_dport));
}
elog ("%s failed getting 'availStatus_dport'\n", hostname.c_str());
return ("");
}
void nodeLinkClass::print_node_info ( nodeLinkClass::node * ptr )
{
ilog ("%17s (%15s) %8s %8s-%-9s | %s-%s-%s | %s | %0X",
ptr->hostname.c_str(),
ptr->ip.c_str(),
mtc_nodeAdminState_str[ptr->adminState],
mtc_nodeOperState_str[ptr->operState],
mtc_nodeAvailStatus_str[ptr->availStatus],
ptr->subfunction_str.c_str(),
mtc_nodeOperState_str[ptr->operState_subf],
mtc_nodeAvailStatus_str[ptr->availStatus_subf],
mtc_nodeAdminAction_str [ptr->adminAction],
ptr->degrade_mask);
}
void nodeLinkClass::print_node_info ( void )
{
if ( maintenance )
{
syslog ( LOG_INFO,"+--------------------------------------+-------------------+-----------------+\n");
for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
{
if ( LARGE_SYSTEM )
{
syslog ( LOG_INFO, "| %36s | %17s | %15s | %8s %8s-%s",
ptr->uuid.length() ? ptr->uuid.c_str() : "",
ptr->hostname.c_str(),
ptr->ip.c_str(),
mtc_nodeAdminState_str[ptr->adminState],
mtc_nodeOperState_str[ptr->operState],
mtc_nodeAvailStatus_str[ptr->availStatus]);
}
else
{
syslog ( LOG_INFO, "| %36s | %17s | %15s | %8s %8s-%-9s | %s-%s-%s",
ptr->uuid.length() ? ptr->uuid.c_str() : "",
ptr->hostname.c_str(),
ptr->ip.c_str(),
mtc_nodeAdminState_str[ptr->adminState],
mtc_nodeOperState_str[ptr->operState],
mtc_nodeAvailStatus_str[ptr->availStatus],
ptr->subfunction_str.c_str(),
mtc_nodeOperState_str[ptr->operState_subf],
mtc_nodeAvailStatus_str[ptr->availStatus_subf]);
}
// syslog ( LOG_INFO, "\n");
}
syslog ( LOG_INFO, "+--------------------------------------+-------------------+-----------------+\n\n");
}
if ( heartbeat )
{
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
if (( i == INFRA_IFACE ) && ( infra_network_provisioned == false ))
continue ;
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n");
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses | %s (%4d) |\n" ,
get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? "DISABLED" : "Enabled ", hbs_pulse_period );
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n");
for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
{
syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %d msec\n",
ptr->hostname.c_str(),
ptr->monitor[i] ? 'Y' : 'n',
ptr->b2b_misses_count[i],
ptr->max_count[i],
ptr->hbs_degrade_count[i],
ptr->hbs_failure_count[i],
ptr->hbs_count[i],
hbs_pulse_period );
}
}
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n");
}
}
/** Convert the supplied string to a valid maintenance Admin State enum */
mtc_nodeAdminState_enum nodeLinkClass::adminState_str_to_enum ( const char * admin_ptr )
{
/* Default state */
mtc_nodeAdminState_enum temp = MTC_ADMIN_STATE__LOCKED;
if ( admin_ptr == NULL )
{
wlog ("Administrative state is Null\n");
}
else if ( !strcmp ( &mtc_nodeAdminState_str[MTC_ADMIN_STATE__UNLOCKED][0], admin_ptr ))
temp = MTC_ADMIN_STATE__UNLOCKED ;
return (temp) ;
}
/** Convert the supplied string to a valid maintenance Oper State enum */
mtc_nodeOperState_enum nodeLinkClass::operState_str_to_enum ( const char * oper_ptr )
{
/* Default state */
mtc_nodeOperState_enum temp = MTC_OPER_STATE__DISABLED;
if ( oper_ptr == NULL )
{
wlog ("Operation state is Null\n");
}
else if ( !strcmp ( &mtc_nodeOperState_str[MTC_ADMIN_STATE__UNLOCKED][0], oper_ptr ))
temp = MTC_OPER_STATE__ENABLED ;
return (temp) ;
}
/** Convert the supplied string to a valid maintenance Avail Status enum */
mtc_nodeAvailStatus_enum nodeLinkClass::availStatus_str_to_enum ( const char * avail_ptr )
{
/* Default state */
mtc_nodeAvailStatus_enum temp = MTC_AVAIL_STATUS__OFFDUTY;
/* Could do this as a loop but this is more resiliant to enum changes */
/* TODO: consider using a paired list */
if ( avail_ptr == NULL )
{
wlog ("Availability status is Null\n");
}
else if ( !strcmp ( &mtc_nodeAvailStatus_str[MTC_AVAIL_STATUS__AVAILABLE][0], avail_ptr ))
temp = MTC_AVAIL_STATUS__AVAILABLE ;
else if ( !strcmp ( &mtc_nodeAvailStatus_str[MTC_AVAIL_STATUS__FAILED][0], avail_ptr ))
temp = MTC_AVAIL_STATUS__FAILED ;
else if ( !strcmp ( &mtc_nodeAvailStatus_str[MTC_AVAIL_STATUS__INTEST][0], avail_ptr ))
temp = MTC_AVAIL_STATUS__INTEST ;
else if ( !strcmp ( &mtc_nodeAvailStatus_str[MTC_AVAIL_STATUS__DEGRADED][0], avail_ptr ))
temp = MTC_AVAIL_STATUS__DEGRADED ;
else if ( !strcmp ( &mtc_nodeAvailStatus_str[MTC_AVAIL_STATUS__OFFLINE][0], avail_ptr ))
temp = MTC_AVAIL_STATUS__OFFLINE ;
else if ( !strcmp ( &mtc_nodeAvailStatus_str[MTC_AVAIL_STATUS__ONLINE][0], avail_ptr ))
temp = MTC_AVAIL_STATUS__ONLINE ;
else if ( !strcmp ( &mtc_nodeAvailStatus_str[MTC_AVAIL_STATUS__POWERED_OFF][0], avail_ptr ))
temp = MTC_AVAIL_STATUS__POWERED_OFF ;
return (temp) ;
}
/** Convert the supplied enum to the corresponding Admin State string */
string nodeLinkClass::adminAction_enum_to_str ( mtc_nodeAdminAction_enum val )
{
if ( val < MTC_ADMIN_ACTIONS )
{
string adminAction_string = &mtc_nodeAdminAction_str[val][0] ;
return ( adminAction_string );
}
return ( NULL );
}
/** Convert the supplied enum to the corresponding Admin State string */
string nodeLinkClass::adminState_enum_to_str ( mtc_nodeAdminState_enum val )
{
if ( val < MTC_ADMIN_STATES )
{
string adminState_string = &mtc_nodeAdminState_str[val][0] ;
return ( adminState_string );
}
return ( NULL );
}
/** Convert the supplied enum to the corresponding Oper State string */
string nodeLinkClass::operState_enum_to_str ( mtc_nodeOperState_enum val )
{
if ( val < MTC_OPER_STATES )
{
string operState_string = &mtc_nodeOperState_str[val][0] ;
return ( operState_string );
}
return ( NULL );
}
/** Convert the supplied enum to the corresponding Avail Status string */
string nodeLinkClass::availStatus_enum_to_str ( mtc_nodeAvailStatus_enum val )
{
if ( val < MTC_AVAIL_STATUS )
{
string availStatus_string = &mtc_nodeAvailStatus_str[val][0] ;
return ( availStatus_string );
}
return ( NULL );
}
void nodeLinkClass::host_print ( struct nodeLinkClass::node * node_ptr )
{
string uuid ;
if ( daemon_get_cfg_ptr()->debug_level == 1 )
{
const char bar [] = { "+-------------+--------------------------------------+" };
const char uar [] = { "+- Add Host -+--------------------------------------+" };
syslog ( LOG_INFO, "%s\n", &uar[0]);
syslog ( LOG_INFO, "| uuid : %s\n", node_ptr->uuid.c_str());
syslog ( LOG_INFO, "| main : %s\n", node_ptr->function_str.c_str());
syslog ( LOG_INFO, "| subf : %s\n", node_ptr->subfunction_str.c_str());
syslog ( LOG_INFO, "| name : %s\n", node_ptr->hostname.c_str());
syslog ( LOG_INFO, "| ip : %s\n", node_ptr->ip.c_str());
syslog ( LOG_INFO, "| admin : %s\n", adminState_enum_to_str (node_ptr->adminState).c_str());
syslog ( LOG_INFO, "| oper : %s\n", operState_enum_to_str (node_ptr->operState).c_str());
syslog ( LOG_INFO, "| avail_subf: %s\n", availStatus_enum_to_str (node_ptr->availStatus_subf).c_str());
syslog ( LOG_INFO, "| oper_subf: %s\n", operState_enum_to_str (node_ptr->operState_subf).c_str());
syslog ( LOG_INFO, "%s\n", &bar[0]);
}
/* ec3624cb-d80f-4e8b-a6d7-0c11f6937f6a */
/* Just print the last 4 chars of the uuid */
if ( node_ptr->uuid.empty() )
uuid = "---" ;
else
uuid = node_ptr->uuid.substr(32) ;
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
{
ilog ("%s '%s' %s %s-%s (%s)\n", node_ptr->hostname.c_str(),
functions.c_str(),
node_ptr->ip.c_str(),
adminState_enum_to_str (node_ptr->adminState).c_str(),
operState_enum_to_str (node_ptr->operState).c_str(),
uuid.c_str());
}
else
{
ilog ("%s '%s' %s %s-%s-%s (%s)\n",node_ptr->hostname.c_str(),
functions.c_str(),
node_ptr->ip.c_str(),
adminState_enum_to_str (node_ptr->adminState).c_str(),
operState_enum_to_str (node_ptr->operState).c_str(),
availStatus_enum_to_str(node_ptr->availStatus).c_str(),
uuid.c_str());
}
}
/** Host Administrative State Change public member function */
int nodeLinkClass::admin_state_change ( string hostname,
string newAdminState )
{
int rc = FAIL ;
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( hostname );
if ( node_ptr )
{
if ( newAdminState.empty() )
{
rc = FAIL_STRING_EMPTY ;
}
else if ( adminState_str_to_enum ( newAdminState.data() ) == node_ptr->adminState )
{
rc = PASS ;
}
else if ( adminStateOk ( newAdminState ) )
{
clog ("%s %s (from %s)\n", hostname.c_str(), newAdminState.c_str(), adminState_enum_to_str (node_ptr->adminState).c_str());
node_ptr->adminState = adminState_str_to_enum ( newAdminState.data() );
rc = PASS ;
}
else
{
elog ("%s Invalid 'admin' state (%s)\n",
hostname.c_str(), newAdminState.c_str() );
}
}
else
{
wlog ("Cannot change 'admin' state for unknown hostname (%s)\n",
hostname.c_str());
}
return (rc);
}
/** Host Operational State Change public member function */
int nodeLinkClass::oper_state_change ( string hostname, string newOperState )
{
int rc = FAIL ;
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( hostname );
if ( node_ptr )
{
if ( newOperState.empty() )
{
rc = FAIL_STRING_EMPTY ;
}
else if ( operState_str_to_enum ( newOperState.data() ) == node_ptr->operState )
{
rc = PASS ;
}
else if ( operStateOk ( newOperState ) )
{
mtc_nodeOperState_enum oper = operState_str_to_enum ( newOperState.data() );
if (( node_ptr->operState == MTC_OPER_STATE__DISABLED ) &&
( oper == MTC_OPER_STATE__ENABLED ))
{
mtcAlarm_log ( hostname, MTC_LOG_ID__STATUSCHANGE_ENABLED );
}
if (( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( oper == MTC_OPER_STATE__DISABLED ))
{
mtcAlarm_log ( hostname, MTC_LOG_ID__STATUSCHANGE_DISABLED );
}
clog ("%s %s (from %s)\n", hostname.c_str(), newOperState.c_str(), operState_enum_to_str (node_ptr->operState).c_str());
node_ptr->operState = oper ;
rc = PASS ;
}
else
{
elog ("%s Invalid 'oper' state (%s)\n",
hostname.c_str(), newOperState.c_str() );
}
}
else
{
wlog ("Cannot change 'oper' state for unknown hostname (%s)\n",
hostname.c_str() );
}
return (rc);
}
/** Host Availability Status Change public member function */
int nodeLinkClass::avail_status_change ( string hostname,
string newAvailStatus )
{
int rc = FAIL ;
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( hostname );
if ( node_ptr )
{
if ( newAvailStatus.empty() )
{
rc = FAIL_STRING_EMPTY ;
}
else if ( availStatus_str_to_enum ( newAvailStatus.data() ) == node_ptr->availStatus )
{
rc = PASS ;
}
else if ( availStatusOk ( newAvailStatus ))
{
mtc_nodeAvailStatus_enum avail = availStatus_str_to_enum ( newAvailStatus.data() );
/* if we go to the failed state then clear all mtcAlive counts
* so that the last ones don't look like we are online when we
* might not be - we should relearn the on/off line state */
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED ) &&
( avail == MTC_AVAIL_STATUS__FAILED ))
{
node_ptr->mtcAlive_misses = 0 ;
node_ptr->mtcAlive_hits = 0 ;
node_ptr->mtcAlive_gate = false ;
}
/* check for need to generate power on log */
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) &&
( avail != MTC_AVAIL_STATUS__POWERED_OFF ))
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERON )
{
mtcAlarm_log ( hostname, MTC_LOG_ID__COMMAND_MANUAL_POWER_ON );
}
else
{
mtcAlarm_log ( hostname, MTC_LOG_ID__COMMAND_AUTO_POWER_ON );
}
}
/* check for need to generate power off log */
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__POWERED_OFF ) &&
( avail == MTC_AVAIL_STATUS__POWERED_OFF ))
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWEROFF )
{
mtcAlarm_log ( hostname, MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF );
}
else
{
mtcAlarm_log ( hostname, MTC_LOG_ID__COMMAND_AUTO_POWER_OFF );
}
}
/* check for need to generate online log */
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__ONLINE ) &&
( avail == MTC_AVAIL_STATUS__ONLINE ))
{
if ( node_ptr->offline_log_reported == true )
{
mtcAlarm_log ( hostname, MTC_LOG_ID__STATUSCHANGE_ONLINE );
node_ptr->offline_log_reported = false ;
node_ptr->online_log_reported = true ;
}
}
/* check for need to generate offline log */
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__OFFLINE ) &&
( avail == MTC_AVAIL_STATUS__OFFLINE ))
{
if ( node_ptr->online_log_reported == true )
{
mtcAlarm_log ( hostname, MTC_LOG_ID__STATUSCHANGE_OFFLINE );
node_ptr->offline_log_reported = true ;
node_ptr->online_log_reported = false ;
}
}
/* If the availability status is moving away from off or online then
* be sure we cancel the mtcAlive timer */
if ((( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )) &&
(( avail != MTC_AVAIL_STATUS__OFFLINE ) &&
( avail != MTC_AVAIL_STATUS__ONLINE )))
{
/* Free the mtc timer if in use */
if ( node_ptr->mtcAlive_timer.tid )
{
tlog ("%s Stopping mtcAlive timer\n", node_ptr->hostname.c_str());
mtcTimer_stop ( node_ptr->mtcAlive_timer );
node_ptr->mtcAlive_timer.ring = false ;
node_ptr->mtcAlive_timer.tid = NULL ;
}
node_ptr->onlineStage = MTC_ONLINE__START ;
}
clog ("%s %s (from %s)\n", hostname.c_str(), newAvailStatus.c_str(), availStatus_enum_to_str (node_ptr->availStatus).c_str());
node_ptr->availStatus = avail ;
rc = PASS ;
}
else
{
elog ("%s Invalid 'avail' status (%s)\n",
hostname.c_str(), newAvailStatus.c_str() );
}
}
else
{
wlog ("Cannot change 'avail' status for unknown hostname (%s)\n",
hostname.c_str());
}
return (rc);
}
/** Set host to the disabled failed state and generate the disabled-failed customer log
* This interface allows the disabled-failed state change to be combined so as to avoid
* setting a 'disabled' AND 'disabled-failed' customer log for the failure case */
int nodeLinkClass::failed_state_change ( struct nodeLinkClass::node * node_ptr )
{
int rc = FAIL_NULL_POINTER ;
if ( node_ptr )
{
node_ptr->availStatus = MTC_AVAIL_STATUS__FAILED ;
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_FAILED );
rc = PASS ;
}
else
{
slog ("Cannot change to disabled-failed state for null node pointer\n");
}
return (rc);
}
/*****************************************************************************
*
* Name : lazy_graceful_fs_reboot
*
* Description: Issue a lazy reboot and signal SM to shutdown services
*
* Assumptions: No return
*
*****************************************************************************/
int nodeLinkClass::lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_ptr )
{
/* issue a lazy reboot to the mtcClient and as a backup launch a sysreq reset thresd */
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_LAZY_REBOOT, MGMNT_INTERFACE ) ;
fork_sysreq_reboot ( daemon_get_cfg_ptr()->failsafe_shutdown_delay );
/* loop until reboot */
for ( ; ; )
{
for ( int i = 0 ; i < LAZY_REBOOT_RETRY_DELAY_SECS ; i++ )
{
daemon_signal_hdlr ();
sleep (MTC_SECS_1);
/* give sysinv time to handle the response and get its state in order */
if ( i == SM_NOTIFY_UNHEALTHY_DELAY_SECS )
{
daemon_log ( SMGMT_UNHEALTHY_FILE, "AIO shutdown request" );
}
}
/* Should never get there but if we do resend the reboot request
* but this time not Lazy */
send_mtc_cmd ( node_ptr->hostname, MTC_CMD_REBOOT, MGMNT_INTERFACE ) ;
}
return (FAIL);
}
/* Generate a log and a critical alarm if the node config failed */
int nodeLinkClass::alarm_config_failure ( struct nodeLinkClass::node * node_ptr )
{
if ( (node_ptr->degrade_mask & DEGRADE_MASK_CONFIG) == 0 )
{
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
}
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
{
elog ("%s critical config failure\n", node_ptr->hostname.c_str());
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
}
return (PASS);
}
/* Clear the config alarm and degrade flag */
int nodeLinkClass::alarm_config_clear ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_CONFIG ;
}
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR )
{
ilog ("%s config alarm clear\n", node_ptr->hostname.c_str());
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
}
return (PASS);
}
/* Generate a log and a critical alarm if the node enable failed */
int nodeLinkClass::alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr )
{
if ( (node_ptr->degrade_mask & DEGRADE_MASK_ENABLE) == 0 )
{
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
}
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
{
elog ("%s critical enable failure\n", node_ptr->hostname.c_str());
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
}
return (PASS);
}
/*
* Generate a major (in-service) enable alarm
* - don't downgrade the alarm from critical
* - do nothing if the alarm is already at major level
*
**/
int nodeLinkClass::alarm_insv_failure ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr )
{
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_MAJOR )
{
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
{
elog ("%s major inservice enable failure\n", node_ptr->hostname.c_str());
node_ptr->degrade_mask |= DEGRADE_MASK_INSV_TEST ;
mtcAlarm_major ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_MAJOR ;
}
}
}
return (PASS);
}
/* Clear the enable alarm and degrade flag */
int nodeLinkClass::alarm_enabled_clear ( struct nodeLinkClass::node * node_ptr, bool force )
{
if ( node_ptr->degrade_mask & DEGRADE_MASK_ENABLE )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_ENABLE ;
}
/* The inservice test degrade flag needs to be cleared too. */
if ( node_ptr->degrade_mask & DEGRADE_MASK_INSV_TEST )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_INSV_TEST ;
}
if (( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CLEAR ) ||
( force == true ))
{
ilog ("%s enable alarm clear\n", node_ptr->hostname.c_str());
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CLEAR ;
}
return (PASS);
}
/* Generate compute subfunction failure alarm */
int nodeLinkClass::alarm_compute_failure ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT sev )
{
if ( (node_ptr->degrade_mask & DEGRADE_MASK_SUBF) == 0 )
{
node_ptr->degrade_mask |= DEGRADE_MASK_SUBF ;
}
if ( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] != sev )
{
if ( sev == FM_ALARM_SEVERITY_CRITICAL )
{
elog ("%s critical compute subf failure\n", node_ptr->hostname.c_str());
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CH_COMP );
}
else
{
elog ("%s major compute subf failure\n", node_ptr->hostname.c_str());
mtcAlarm_major ( node_ptr->hostname, MTC_ALARM_ID__CH_COMP );
}
node_ptr->alarms[MTC_ALARM_ID__CH_COMP] = sev ;
}
return (PASS);
}
/* Clear the enable alarm if is at the Major severity level */
int nodeLinkClass::alarm_insv_clear ( struct nodeLinkClass::node * node_ptr, bool force )
{
if ( node_ptr->degrade_mask & DEGRADE_MASK_INSV_TEST )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_INSV_TEST ;
}
if (( node_ptr->alarms[MTC_ALARM_ID__ENABLE] == FM_ALARM_SEVERITY_MAJOR ) ||
( force == true ))
{
ilog ("%s %s enable alarm clear\n", node_ptr->hostname.c_str(), force ? "force" : "major" );
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CLEAR ;
}
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] == FM_ALARM_SEVERITY_CLEAR )
{
if ( node_ptr->degrade_mask & DEGRADE_MASK_ENABLE )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_ENABLE ;
}
}
return (PASS);
}
/* Clear the compute subfunction alarm and degrade flag */
int nodeLinkClass::alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force )
{
if ( node_ptr->degrade_mask & DEGRADE_MASK_SUBF )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_SUBF ;
}
if (( node_ptr->alarms[MTC_ALARM_ID__CH_COMP] != FM_ALARM_SEVERITY_CLEAR ) ||
( force == true ))
{
ilog ("%s major enable alarm clear\n", node_ptr->hostname.c_str());
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CH_COMP );
node_ptr->alarms[MTC_ALARM_ID__CH_COMP] = FM_ALARM_SEVERITY_CLEAR ;
}
return (PASS);
}
/** Host Operational State Change public member function */
int nodeLinkClass::oper_subf_state_change ( string hostname, string newOperState )
{
int rc = FAIL ;
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( hostname );
if ( node_ptr )
{
if ( newOperState.empty() )
{
rc = FAIL_STRING_EMPTY ;
}
else if ( operStateOk ( newOperState ) )
{
node_ptr->operState_subf = operState_str_to_enum ( newOperState.data() );
rc = PASS ;
}
else
{
elog ("%s Invalid subfunction 'oper' state (%s)\n",
hostname.c_str(), newOperState.c_str() );
}
}
else
{
wlog ("Cannot change subfuction 'oper' state for unknown hostname (%s)\n",
hostname.c_str() );
}
return (rc);
}
/** Host Subfunction Availability Status Change public member function */
int nodeLinkClass::avail_subf_status_change ( string hostname, string newAvailStatus )
{
int rc = FAIL ;
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( hostname );
if ( node_ptr )
{
if ( newAvailStatus.empty() )
{
rc = FAIL_STRING_EMPTY ;
}
else if ( availStatusOk ( newAvailStatus ) )
{
node_ptr->availStatus_subf = availStatus_str_to_enum ( newAvailStatus.data() );
rc = PASS ;
}
else
{
elog ("%s Invalid subfunction 'avail' status (%s)\n",
hostname.c_str(), newAvailStatus.c_str() );
}
}
else
{
wlog ("Cannot change subfunction 'avail' status for unknown hostname (%s)\n",
hostname.c_str());
}
return (rc);
}
/** Update the mtce key with value */
int nodeLinkClass::update_key_value ( string hostname, string key , string value )
{
int rc = PASS ;
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( hostname );
if ( node_ptr )
{
/* TODO: Add all database members to this utility */
if ( !key.compare(MTC_JSON_INV_BMIP) )
node_ptr->bm_ip = value ;
else if ( !key.compare(MTC_JSON_INV_TASK) )
node_ptr->task = value ;
else
{
wlog ("%s Unsupported key '%s' update with value '%s'\n",
hostname.c_str(), key.c_str(), value.c_str());
rc = FAIL_BAD_PARM ;
}
}
else
{
wlog ("Cannot change 'admin' state for unknown hostname (%s)\n",
hostname.c_str());
}
return (rc);
}
int nodeLinkClass::del_host ( const string uuid )
{
string hostname = "unknown" ;
int rc = FAIL_DEL_UNKNOWN ;
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( uuid );
if ( node_ptr )
{
hostname = node_ptr->hostname ;
if ( node_ptr->mtcTimer.tid )
mtcTimer_stop ( node_ptr->mtcTimer );
if ( nodeLinkClass::maintenance == true )
{
if ( node_ptr->bm_provisioned == true )
{
set_bm_prov ( node_ptr, false);
}
doneQueue_purge ( node_ptr );
workQueue_purge ( node_ptr );
mtcCmd_doneQ_purge ( node_ptr );
mtcCmd_workQ_purge ( node_ptr );
/* Cleanup if this is the inactive controller */
if ( !node_ptr->hostname.compare(inactive_controller_hostname))
{
inactive_controller_hostname = "" ;
}
}
rc = rem_host ( hostname );
if ( rc == PASS )
{
plog ("%s Deleted\n", hostname.c_str());
print_node_info();
}
else
{
elog ("%s Delete Failed (rc:%d)\n", hostname.c_str(), rc );
}
this->host_deleted = true ;
}
else
{
wlog ("Unknown uuid: %s\n", uuid.c_str());
}
return (rc);
}
int nodeLinkClass::set_host_failed ( node_inv_type & inv )
{
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( inv.name );
if(NULL == node_ptr)
{
return FAIL_UNKNOWN_HOSTNAME;
}
if( (node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED) &&
(node_ptr->operState == MTC_OPER_STATE__ENABLED) )
{
elog( "%s is being force failed by SM", inv.name.c_str() );
this->force_full_enable (node_ptr);
}
return PASS;
}
int nodeLinkClass::mod_host ( node_inv_type & inv )
{
int rc = PASS ;
bool modify = false ;
bool modify_bm = false ;
print_inv (inv);
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode( inv.uuid );
if ( node_ptr )
{
dlog ("%s Modify\n", node_ptr->hostname.c_str());
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__TRANSLATE_LOCK_TO_FORCELOCK , node_ptr->hostname ) )
{
if ( inv.action.compare("lock") == 0 )
{
slog ("%s FIT action from 'lock' to 'force-lock'\n", node_ptr->hostname.c_str());
inv.action = "force-lock";
}
}
#endif
/* Handle Administrative state mismatch between SYSINV and Maintenance */
if ( strcmp ( mtc_nodeAdminState_str[node_ptr->adminState], inv.admin.data()))
{
plog ("%s Modify 'Administrative' state %s -> %sed\n", node_ptr->hostname.c_str(),
mtc_nodeAdminState_str[node_ptr->adminState], inv.action.c_str());
modify = true ; /* we have a delta */
/* Local admin state takes precedence */
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
/* handle a lock request while unlocked */
if ( !inv.action.compare ( "lock" ) )
{
if ( node_ptr->dor_recovery_mode == true )
node_ptr->dor_recovery_mode = false ;
/* Set action to LOCK and let the FSM run the disable handler */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK );
}
else
{
ilog ("%s Already Unlocked ; no action required\n", node_ptr->hostname.c_str() );
}
}
else
{
if ( node_ptr->patching == true )
{
wlog ("%s cannot unlock host while patching is in progress\n", node_ptr->hostname.c_str());
rc = FAIL_PATCH_INPROGRESS ;
}
else
{
/* generate command=unlock log */
mtcAlarm_log ( inv.name, MTC_LOG_ID__COMMAND_UNLOCK );
/* Set action to UNLOCK and let the FSM run the enable handler */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__UNLOCK );
}
}
}
else if ( (!inv.action.empty()) && (inv.action.compare ( "none" )))
{
dlog ("%s Modify Action is '%s'\n", node_ptr->hostname.c_str(), inv.action.c_str() );
node_ptr->action = inv.action ;
modify = true ; /* we have a delta */
/* Do not permit administrative actions while Swact is in progress */
/* Note: There is a self corrective clause in the mtcTimer_handler
* that will auto clear this flag if it gets stuck for 5 minutes */
if ( smgrEvent.mutex )
{
elog ("%s Rejecting '%s' - Swact Operation in-progress\n",
node_ptr->hostname.c_str(), inv.action.c_str());
rc = FAIL_SWACT_INPROGRESS ;
}
else if (!inv.action.compare ( "force-lock" ))
{
/* TODO: Create customer log of this action */
ilog ("%s Force Lock Action\n", node_ptr->hostname.c_str());
if ( node_ptr->dor_recovery_mode == true )
node_ptr->dor_recovery_mode = false ;
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__FORCE_LOCK )
{
ilog ("%s Force Lock Action - already in progress ...\n", node_ptr->hostname.c_str());
}
else
{
/* generate command=forcelock log */
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_FORCE_LOCK );
/* Set action to LOCK and let the FSM run the disable handler */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__FORCE_LOCK );
}
}
else
{
wlog ("%s Already Locked\n", node_ptr->hostname.c_str() );
}
}
else if (!inv.action.compare ( "lock" ))
{
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
if ( node_ptr->dor_recovery_mode == true )
node_ptr->dor_recovery_mode = false ;
/* Set action to LOCK and let the FSM run the disable handler */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__LOCK );
}
else
{
wlog ("%s Already Locked\n", node_ptr->hostname.c_str() );
}
}
else if (!inv.action.compare ( "unlock" ))
{
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
{
if ( node_ptr->patching == true )
{
wlog ("%s cannot unlock host while patching is in progress\n", node_ptr->hostname.c_str());
rc = FAIL_PATCH_INPROGRESS ;
}
else
{
recovery_ctrl_init ( node_ptr->hwmon_reset );
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
/* Set action to UNLOCK and let the FSM run the enable handler */
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__UNLOCK );
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR ;
/* generate command=unlock log */
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_UNLOCK );
}
}
else
{
wlog ("%s Already UnLocked\n", node_ptr->hostname.c_str() );
}
}
else if ( !inv.action.compare ( "swact" ) ||
!inv.action.compare ( "force-swact" ) )
{
if ( !((get_host_function_mask ( inv.type ) & CONTROLLER_TYPE) == CONTROLLER_TYPE) )
{
elog ("%s Rejecting '%s' - Swact only supported for Controllers\n",
node_ptr->hostname.c_str(),
inv.action.c_str());
rc = FAIL_NODETYPE ;
}
else if ( nodeLinkClass::is_inactive_controller_main_insv() != true )
{
elog ("%s Rejecting '%s' - No In-Service Mate\n",
node_ptr->hostname.c_str(),
inv.action.c_str());
rc = FAIL_SWACT_NOINSVMATE ;
}
else if ( node_ptr->adminAction != MTC_ADMIN_ACTION__NONE )
{
elog ("%s Rejecting '%s' - '%s' In-Progress\n",
node_ptr->hostname.c_str(),
inv.action.c_str(),
get_adminAction_str( node_ptr->adminAction ));
rc = FAIL_OPER_INPROGRESS ;
}
else if ( smgrEvent.mutex )
{
elog ("%s Rejecting '%s' - Operation in-progress\n",
node_ptr->hostname.c_str(),
inv.action.c_str());
rc = FAIL_SWACT_INPROGRESS ;
}
// don't run the patching tests during a force-swact action
else if ( node_ptr->patching == true )
{
wlog ("%s cannot swact active controller while patching is in progress\n", node_ptr->hostname.c_str());
rc = FAIL_PATCH_INPROGRESS ;
}
else if ( inactive_controller_is_patching() == true )
{
wlog ("%s cannot swact to inactive controller while patching is in progress\n", node_ptr->hostname.c_str());
rc = FAIL_PATCH_INPROGRESS ;
}
// if this is a force-swact action then allow swact to a
// patched node that has not been rebooted yet, since
// this is a recoverable operation. The other two patching tests
// (above) need to be done on all swact actions since it may
// render the system non-recoverable.
else if ( !inv.action.compare ( "swact" ) &&
inactive_controller_is_patched() == true )
{
wlog ("%s cannot swact to a 'patched' but not 'rebooted' host\n", node_ptr->hostname.c_str());
rc = FAIL_PATCHED_NOREBOOT ;
}
else
{
plog ("%s Action=%s\n", node_ptr->hostname.c_str(),
inv.action.c_str());
if ( !inv.action.compare ( "force-swact" ) )
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__FORCE_SWACT );
else
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__SWACT );
/* generate command=swact log */
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_SWACT );
smgrEvent.mutex = true ;
}
}
else if ( !inv.action.compare ( "reboot" ) )
{
plog ("%s Reboot Action\n", node_ptr->hostname.c_str());
node_ptr->resetProgStage = MTC_RESETPROG__START ;
node_ptr->retries = 0 ;
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_MANUAL_REBOOT );
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__REBOOT );
}
else if ( !inv.action.compare ( "reinstall" ) )
{
plog ("%s Reinstall Action\n", node_ptr->hostname.c_str());
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__REINSTALL );
/* generate command=reinstall log */
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_REINSTALL );
}
else if ( !inv.action.compare ( "reset" ) )
{
string bm_ip = NONE ;
rc = FAIL_RESET_CONTROL ;
node_ptr->retries = 0 ;
plog ("%s Reset Action\n", node_ptr->hostname.c_str());
if ( hostUtil_is_valid_bm_type ( node_ptr->bm_type ) == false )
{
wlog ("%s reset rejected due to unprovisioned bmc\n",
node_ptr->hostname.c_str());
rc = FAIL_BM_PROVISION_ERR ;
}
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF )
{
wlog ("%s reset rejected for powered off host\n",
node_ptr->hostname.c_str());
rc = FAIL_RESET_POWEROFF;
}
else if ( node_ptr->bm_un.empty() )
{
wlog ("%s reset rejected due to unconfigured 'bm_username'\n",
node_ptr->bm_un.c_str());
rc = FAIL_BM_PROVISION_ERR ;
}
else
{
rc = PASS ;
node_ptr->resetStage = MTC_RESET__START ;
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__RESET );
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_MANUAL_RESET );
}
}
else if ( !inv.action.compare ( "power-on" ) )
{
string bm_ip = NONE ;
rc = FAIL_POWER_CONTROL ;
plog ("%s Power-On Action\n", node_ptr->hostname.c_str());
if ( hostUtil_is_valid_bm_type ( node_ptr->bm_type ) == false )
{
wlog ("%s power-on rejected due to unprovisioned bmc\n",
node_ptr->hostname.c_str());
rc = FAIL_BM_PROVISION_ERR ;
}
else if ( node_ptr->bm_un.empty() )
{
wlog ("%s power-on rejected due to unconfigured 'bm_username'\n",
node_ptr->hostname.c_str());
rc = FAIL_BM_PROVISION_ERR ;
}
else
{
rc = PASS ;
node_ptr->powerStage = MTC_POWERON__START ;
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__POWERON );
}
mtcInvApi_update_task ( node_ptr, "" );
}
else if ( !inv.action.compare ( "power-off" ) )
{
string bm_ip = NONE ;
rc = FAIL_POWER_CONTROL ;
plog ("%s Power-Off Action\n", node_ptr->hostname.c_str());
if ( hostUtil_is_valid_bm_type ( node_ptr->bm_type ) == false )
{
wlog ("%s power-off rejected due to unprovisioned bmc\n",
node_ptr->hostname.c_str());
rc = FAIL_BM_PROVISION_ERR ;
}
else if ( node_ptr->bm_un.empty() )
{
wlog ("%s power-off rejected due to unconfigured 'bm_username'\n",
node_ptr->hostname.c_str());
rc = FAIL_BM_PROVISION_ERR ;
}
else
{
if (( !hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )) &&
( !hostUtil_is_valid_ip_addr ( bm_ip )))
{
wlog ("%s power-off may fail ; 'bm_ip' is undiscovered\n",
node_ptr->hostname.c_str());
}
rc = PASS ;
node_ptr->powerStage = MTC_POWEROFF__START ;
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__POWEROFF );
}
mtcInvApi_update_task ( node_ptr, "" );
}
else
{
wlog ("%s Unsupported action '%s'\n",
node_ptr->hostname.c_str(),
inv.action.c_str());
rc = FAIL_ADMIN_ACTION ;
mtcInvApi_update_task ( node_ptr, "" );
}
}
if ( node_ptr->uuid.compare ( inv.uuid ) )
{
send_hwmon_command ( node_ptr->hostname, MTC_CMD_DEL_HOST );
plog ("%s Modify 'uuid' from %s -> %s\n",
node_ptr->hostname.c_str(),
node_ptr->uuid.c_str(), inv.uuid.c_str() );
node_ptr->uuid = inv.uuid ;
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
modify = true ; /* we have a delta */
}
if ( node_ptr->type.compare ( inv.type ) )
{
plog ("%s Modify 'personality' from %s -> %s\n",
node_ptr->hostname.c_str(),
node_ptr->type.c_str(), inv.type.c_str() );
node_ptr->type = inv.type ;
node_ptr->nodetype = get_host_function_mask ( inv.type );
modify = true ; /* we have a delta */
}
if ( node_ptr->ip.compare ( inv.ip ) )
{
plog ("%s Modify 'mgmt_ip' from %s -> %s\n",
node_ptr->hostname.c_str(),
node_ptr->ip.c_str(), inv.ip.c_str());
node_ptr->ip = inv.ip ;
/* Tell the guestAgent the new IP */
rc = send_guest_command(node_ptr->hostname,MTC_CMD_MOD_HOST);
modify = true ; /* we have a delta */
}
if ( node_ptr->mac.compare ( inv.mac ) )
{
plog ("%s Modify 'mgmt_mac' from %s -> %s\n",
node_ptr->hostname.c_str(),
node_ptr->mac.c_str(), inv.mac.c_str() );
node_ptr->mac = inv.mac ;
modify = true ; /* we have a delta */
}
if ( node_ptr->infra_ip.compare ( inv.infra_ip ) )
{
if (( hostUtil_is_valid_ip_addr ( inv.infra_ip )) || ( hostUtil_is_valid_ip_addr ( node_ptr->infra_ip )))
{
plog ("%s Modify 'infra_ip' from %s -> %s\n",
node_ptr->hostname.c_str(),
node_ptr->infra_ip.c_str(), inv.infra_ip.c_str() );
modify = true ; /* we have a delta */
}
node_ptr->infra_ip = inv.infra_ip ;
}
if ( (!inv.name.empty()) && (node_ptr->hostname.compare ( inv.name)) )
{
mtcCmd cmd ;
mtcCmd_init ( cmd );
cmd.stage = MTC_CMD_STAGE__START ;
cmd.cmd = MTC_OPER__MODIFY_HOSTNAME ;
cmd.name = inv.name ;
node_ptr->mtcCmd_work_fifo.push_back(cmd);
plog ("%s Modify 'hostname' to %s (mtcCmd_queue:%ld)\n",
node_ptr->hostname.c_str(),
cmd.name.c_str() ,
node_ptr->mtcCmd_work_fifo.size());
modify_bm = true ; /* board mgmnt change */
modify = true ; /* we have some delta */
}
if ( node_ptr->bm_un.compare ( inv.bm_un ) )
{
if ( inv.bm_un.empty () )
inv.bm_un = "none" ;
plog ("%s Modify 'bm_username' from %s -> %s\n",
node_ptr->hostname.c_str(),
node_ptr->bm_un.c_str(), inv.bm_un.c_str());
node_ptr->bm_un = inv.bm_un ;
modify_bm = true ; /* board mgmnt change */
modify = true ; /* we have some delta */
}
/* PATCHBACK - issue found during BMC refactoring user story
* where there was a race condition found where the bmc dnsmasq file
* was updated with a new bm_ip close to when there was an
* administrative operation (unlock in this case). The newly learned
* bm_ip was overwritten by the now stale bm_ip that came in from
* inventory. The bm_ip should never come from sysinv while in
* internal mode. */
if (( node_ptr->bm_ip.compare ( inv.bm_ip )))
{
if ( inv.bm_ip.empty () )
inv.bm_ip = NONE ;
/* if not empty and not none and already used then reject */
if ( is_bm_ip_already_used ( inv.bm_ip ) == true )
{
wlog ("%s cannot use already provisioned bm ip %s\n",
node_ptr->hostname.c_str(),
inv.bm_ip.c_str());
return (FAIL_DUP_IPADDR);
}
plog ("%s Modify 'bm_ip' from %s -> %s\n",
node_ptr->hostname.c_str(),
node_ptr->bm_ip.c_str(), inv.bm_ip.c_str());
node_ptr->bm_ip = inv.bm_ip ;
modify_bm = true ; /* board mgmnt change */
modify = true ; /* we have some delta */
}
if ( node_ptr->bm_type.compare ( inv.bm_type ) )
{
if ( inv.bm_type.empty() )
inv.bm_type = "none" ;
else
inv.bm_type = tolowercase(inv.bm_type) ;
plog ("%s Modify 'bm_type' from %s -> %s\n",
node_ptr->hostname.c_str(),
node_ptr->bm_type.c_str(), inv.bm_type.c_str());
modify_bm = true ; /* board mgmnt change */
modify = true ; /* we have some delta */
}
/* print a log if we find that there was nothing to modify */
if ( modify == false )
{
wlog ("%s Modify request without anything to modify\n", node_ptr->hostname.c_str());
}
if ( modify_bm == true )
{
wlog ("%s Board Management provisioning has changed\n", node_ptr->hostname.c_str());
bool bm_type_was_valid = hostUtil_is_valid_bm_type (node_ptr->bm_type) ;
bool bm_type_now_valid = hostUtil_is_valid_bm_type (inv.bm_type) ;
/* update bm_type now */
node_ptr->bm_type = inv.bm_type ;
/* BM is provisioned */
if ( bm_type_now_valid == true )
{
/* force (re)provision */
manage_bmc_provisioning ( node_ptr );
}
/* BM is already provisioned but is now deprovisioned */
else if (( bm_type_was_valid == true ) && ( bm_type_now_valid == false ))
{
node_ptr->bm_type = NONE ;
node_ptr->bm_ip = NONE ;
node_ptr->bm_un = NONE ;
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_BM_DEPROVISIONED );
set_bm_prov ( node_ptr, false );
}
/* BM was not provisioned and is still not provisioned */
else
{
/* Handle all other provisioning changes ; username, ip address */
manage_bmc_provisioning ( node_ptr );
}
}
}
else
{
elog ("getNode failed to find uuid: %s\n", inv.uuid.c_str());
}
return ( rc );
}
void nodeLinkClass::start_offline_handler ( struct nodeLinkClass::node * node_ptr )
{
bool already_active = false ;
mtc_offlineStages_enum offlineStage_saved = node_ptr->offlineStage ;
if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE )
{
node_ptr->offlineStage = MTC_OFFLINE__START ;
}
else
{
already_active = true ;
}
plog ("%s%soffline handler (%s-%s-%s) (stage:%d)\n",
node_ptr->hostname.c_str(),
already_active ? " " : " starting ",
adminState_enum_to_str(node_ptr->adminState).c_str(),
operState_enum_to_str(node_ptr->operState).c_str(),
availStatus_enum_to_str(node_ptr->availStatus).c_str(),
offlineStage_saved);
}
void nodeLinkClass::stop_offline_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr->offlineStage != MTC_OFFLINE__IDLE )
{
plog ("%s stopping offline handler (%s-%s-%s) (stage:%d)\n",
node_ptr->hostname.c_str(),
adminState_enum_to_str(node_ptr->adminState).c_str(),
operState_enum_to_str(node_ptr->operState).c_str(),
availStatus_enum_to_str(node_ptr->availStatus).c_str(),
node_ptr->offlineStage);
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
}
}
string nodeLinkClass::get_host ( string uuid )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( uuid );
if ( node_ptr != NULL )
{
return (node_ptr->hostname) ;
}
return ( "" );
}
/** Check to see if the node list already contains any of the following
* information and reject the add or modify if it does
*
* uuid
* hostname
* ip address
* mac address
*
**/
int nodeLinkClass::add_host_precheck ( node_inv_type & inv )
{
struct node * ptr = static_cast<struct node *>(NULL) ;
struct node * node_ptr = static_cast<struct node *>(NULL) ;
int rc = PASS ;
if ( head == NULL )
return (PASS);
for ( node_ptr = head ; ; node_ptr = node_ptr->next )
{
/* look or the UUID */
if ( !node_ptr->uuid.compare(inv.uuid))
{
rc = RETRY ;
dlog ("%s found in mtce\n", node_ptr->uuid.c_str());
break ;
}
else if (( node_ptr->next == NULL ) || ( node_ptr == tail ))
break ;
}
/** If that uuid is not found then make sure there
* are no other entries in the list that already
* has the same info that we want to create a
* new host with.
* If so then reject by returning a failure.
*/
for ( ptr = head ; ; ptr = ptr->next )
{
/* if this uuid is found then see if we are being
* asked to modify and make sure that we are not being
* asked to modify other members to values that are
* used by other hosts
* If so then reject by returning a failure.
* otherwise then allow the modification by returning a retry.
*/
if (( rc == RETRY ) && ( ptr == node_ptr ))
{
dlog ("%s skip\n", ptr->hostname.c_str());
/* skip the node we found the UUID on
* but make sure that none of the other nodes
* have the same data */
}
else
{
dlog ("%s check\n", ptr->hostname.c_str());
if ( !ptr->hostname.compare(inv.name))
{
wlog ("hostname (%s) already used ; rejecting add / modify\n", inv.name.c_str());
return(FAIL_DUP_HOSTNAME);
}
if ( ptr->ip.compare("none") != 0 && !ptr->ip.compare(inv.ip))
{
wlog ("ip address (%s) already used ; rejecting add / modify\n", inv.ip.c_str());
return(FAIL_DUP_IPADDR);
}
if ( !ptr->mac.compare(inv.mac))
{
wlog ("mac address (%s) already used ; rejecting add / modify\n", inv.mac.c_str());
return(FAIL_DUP_MACADDR);
}
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
return (rc);
}
int nodeLinkClass::add_host ( node_inv_type & inv )
{
int rc = FAIL ;
struct nodeLinkClass::node * node_ptr = static_cast<struct node *>(NULL);
if ((!inv.name.compare("controller-0")) ||
(!inv.name.compare("controller-1")))
{
dlog ("Adding %s\n", inv.name.c_str());
node_ptr = nodeLinkClass::getNode(inv.name);
}
else if (( inv.name.empty()) ||
( !inv.name.compare ("none") ) ||
( !inv.name.compare ("None") ))
{
wlog ("Refusing to add host with 'null' or 'invalid' hostname (%s)\n",
inv.uuid.c_str());
return (FAIL_INVALID_HOSTNAME) ;
}
else if (( inv.uuid.empty()) ||
( !inv.uuid.compare ("none") ) ||
( !inv.uuid.compare ("None") ))
{
wlog ("Refusing to add host with 'null' or 'invalid' uuid (%s)\n",
inv.uuid.c_str());
return (FAIL_INVALID_UUID) ;
}
/* Ensure we don't add a host with critical info that is
* already used by other members of inventory like ;
* hostname, uuid, ip, mac, bm_ip */
else if ( ( rc = add_host_precheck ( inv )) > RETRY )
{
return (rc);
}
else
{
if ( rc == RETRY )
{
dlog ("%s modify operation\n", inv.uuid.c_str());
}
else
{
dlog ("%s add operation\n", inv.uuid.c_str());
}
node_ptr = nodeLinkClass::getNode(inv.uuid);
}
if ( node_ptr )
{
dlog ("%s Already provisioned\n", node_ptr->hostname.c_str());
/* update some of the info */
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
/* Send back a retry so that this add is converted to a modify */
return (RETRY);
}
/* Otherwise add it as a new node */
else
{
if ( daemon_get_cfg_ptr()->debug_level != 0 )
print_inv ( inv );
unsigned int nodetype_temp = get_host_function_mask ( inv.type );
if ( (nodetype_temp & CONTROLLER_TYPE) == CONTROLLER_TYPE )
{
if ( inactive_controller_hostname.empty () == false )
{
wlog ("Cannot provision more than 2 controllers\n");
wlog ("%s is already provisioned as inactive\n",
inactive_controller_hostname.c_str());
return (FAIL);
}
}
/* Prevent allowing add of a reserved hostname
* with and incorrect node type. Reserved names are
*
* controller-0 and controller-1 must be a controller type
* storage-0 must be a storage type
*
* */
if ((( !inv.name.compare ("controller-0")) && (( nodetype_temp & CONTROLLER_TYPE) != CONTROLLER_TYPE )) ||
(( !inv.name.compare ("controller-1")) && (( nodetype_temp & CONTROLLER_TYPE ) != CONTROLLER_TYPE)) ||
(( !inv.name.compare ("storage-0" )) && (( nodetype_temp & STORAGE_TYPE) != STORAGE_TYPE)))
{
wlog ("Cannot provision '%s' as a '%s' host\n", inv.name.c_str(), inv.type.c_str());
return (FAIL_RESERVED_NAME);
}
node_ptr = nodeLinkClass::addNode(inv.name);
if ( node_ptr )
{
bool validStates = false ;
node_ptr->hostname = inv.name ;
/* set the node type ; string and define code */
node_ptr->type = inv.type ;
node_ptr->nodetype = get_host_function_mask ( inv.type ) ;
update_host_functions ( inv.name, inv.func );
node_ptr->ip = inv.ip ;
node_ptr->mac = inv.mac ;
node_ptr->uuid = inv.uuid ;
node_ptr->infra_ip = inv.infra_ip ;
if ( inv.uptime.length() )
{
sscanf ( inv.uptime.data(), "%u", &node_ptr->uptime );
dlog2 ("%s Uptime (%s:%u)\n", inv.name.c_str(), inv.uptime.c_str(), node_ptr->uptime );
}
else
{
node_ptr->uptime = 0 ;
}
node_ptr->thread_extra_info.bm_ip = node_ptr->bm_ip = inv.bm_ip ;
node_ptr->thread_extra_info.bm_un = node_ptr->bm_un = inv.bm_un ;
node_ptr->thread_extra_info.bm_type= node_ptr->bm_type = inv.bm_type ;
node_ptr->bm_ping_info.sock = 0 ;
/* initialize the host power and reset control thread */
thread_init ( node_ptr->ipmitool_thread_ctrl,
node_ptr->ipmitool_thread_info,
&node_ptr->thread_extra_info,
mtcThread_ipmitool,
DEFAULT_THREAD_TIMEOUT_SECS,
node_ptr->hostname,
THREAD_NAME__IPMITOOL);
if ( adminStateOk (inv.admin) &&
operStateOk (inv.oper ) &&
availStatusOk (inv.avail))
{
validStates = true ;
}
clog ("%s subf state %s-%s\n", node_ptr->hostname.c_str(), inv.oper_subf.c_str(), inv.avail_subf.c_str() );
node_ptr->task = inv.task ;
/* Add based on 'action' */
if ((!inv.action.empty()) && (inv.action.compare ("none")))
{
/* Save current action */
node_ptr->action = inv.action ;
if ( !inv.action.compare ("unlock") && validStates )
{
ilog ("%s Added in 'unlocked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__UNLOCK );
}
else if ( !inv.action.compare ("lock") && validStates )
{
ilog ("%s Added in 'locked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__LOCK );
}
else if ( !inv.action.compare ("force-lock") && validStates )
{
ilog ("%s Added in 'force-locked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__FORCE_LOCK );
}
else if ( !inv.action.compare ("reboot") && validStates )
{
ilog ("%s Added with 'reboot' in 'locked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__REBOOT );
}
else if ( !inv.action.compare ("reset") && validStates )
{
ilog ("%s Added with 'reset' in 'locked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RESET );
}
else if ( !inv.action.compare ("power-off") && validStates )
{
ilog ("%s Added in a 'locked' and 'power-off' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = MTC_ADMIN_STATE__LOCKED ;
node_ptr->operState = MTC_OPER_STATE__DISABLED;
node_ptr->availStatus = MTC_AVAIL_STATUS__POWERED_OFF ;
node_ptr->operState_subf = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus_subf = MTC_AVAIL_STATUS__POWERED_OFF ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__POWEROFF );
}
else if ( !inv.action.compare ("power-on") && validStates )
{
ilog ("%s Added with 'power-on' in 'locked' state\n", node_ptr->hostname.c_str());
print_inv ( inv );
node_ptr->adminState = MTC_ADMIN_STATE__LOCKED ;
node_ptr->operState = MTC_OPER_STATE__DISABLED;
node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ;
node_ptr->operState_subf = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus_subf = MTC_AVAIL_STATUS__OFFLINE ;
node_ptr->onlineStage = MTC_ONLINE__START ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__POWERON );
}
else
{
wlog ("%s Need add Action support for '%s' action\n", node_ptr->hostname.c_str(),
inv.action.c_str());
print_inv ( inv );
/* Load in maintenance states */
node_ptr->adminState = MTC_ADMIN_STATE__LOCKED ;
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ;
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus_subf = MTC_AVAIL_STATUS__OFFLINE ;
}
node_ptr->onlineStage = MTC_ONLINE__START ;
wlog ("%s Need '%s' action enabled here\n", node_ptr->hostname.c_str(),
inv.action.c_str());
}
}
else
{
node_ptr->adminState = adminState_str_to_enum (inv.admin.data());
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
}
/* Clear the heartbeat failure conts for this host */
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
node_ptr->hbs_degrade_count[iface] = 0;
node_ptr->hbs_failure_count[iface] = 0;
}
/* Add to the end of inventory */
hostname_inventory.push_back ( node_ptr->hostname );
rc = PASS ;
}
}
if (( rc == PASS ) && ( node_ptr ))
{
node_ptr->addStage = MTC_ADD__START ;
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ADD );
}
return (rc);
}
void nodeLinkClass::clear_service_readies ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr )
{
if ( node_ptr->hbsClient_ready || node_ptr->pmond_ready )
{
ilog ("%s clearing service ready events\n", node_ptr->hostname.c_str());
node_ptr->hbsClient_ready = false ;
node_ptr->pmond_ready = false ;
}
}
}
/* Used by the heartbeat service to add a host to its list */
int nodeLinkClass::add_heartbeat_host ( const node_inv_type & inv )
{
int rc = FAIL ;
struct nodeLinkClass::node * node_ptr = static_cast<struct node *>(NULL);
dlog ("%s with nodetype %u\n", inv.name.c_str(), inv.nodetype );
/* no hostname - no add ! */
if ( inv.name.length() )
{
/* Handle the case where we are adding a node that is already */
/* present if so just update the inventory data not the mtc state */
node_ptr = nodeLinkClass::getNode(inv.name);
if ( node_ptr )
{
dlog ("%s already provisioned\n", node_ptr->hostname.c_str());
rc = RETRY ;
}
/* Otherwise add it as a new node */
else
{
node_ptr = nodeLinkClass::addNode(inv.name);
if ( node_ptr != NULL )
{
node_ptr->hostname = inv.name ;
node_ptr->nodetype = inv.nodetype ;
dlog ("%s added to linked list\n", inv.name.c_str());
rc = PASS ;
}
else
{
elog ("Failed to addNode %s to heartbeat service\n", inv.name.c_str());
}
}
}
return (rc);
}
string nodeLinkClass::get_uuid ( string hostname )
{
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode(hostname);
if ( node_ptr )
{
return (node_ptr->uuid);
}
else
{
return ("");
}
}
void nodeLinkClass::set_uuid ( string hostname, string uuid )
{
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode(hostname);
if ( node_ptr )
{
node_ptr->uuid = uuid ;
}
}
/* Set the task field in the maintenance class object for the specified host */
void nodeLinkClass::set_task ( string hostname, string task )
{
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode(hostname);
if ( node_ptr )
{
node_ptr->task = task ;
}
}
/* Lock Rules
*
* 1. Cannot lock this controller
* 2. Cannot lock inactive controller if storage-0 is locked
* 3. Cannot lock storage node with monitor if inactive conroller is locked or not present
* 4. Cannot lock last storage host.
*/
bool nodeLinkClass::can_uuid_be_locked ( string uuid , int & reason )
{
struct nodeLinkClass::node * node_ptr = nodeLinkClass::getNode(uuid);
if ( node_ptr )
{
dlog1 ("%s Lock permission query\n", node_ptr->hostname.c_str());
/* Allow lock of already locked 'any' host */
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
{
ilog ("%s is already 'locked'\n", node_ptr->hostname.c_str());
return (true);
}
else if ( node_ptr->operState == MTC_OPER_STATE__DISABLED )
{
ilog ("%s allowing lock of 'disabled' host\n", node_ptr->hostname.c_str() );
return (true);
}
else if (is_controller(node_ptr))
{
/* Rule 1 - Cannot lock active controller */
if ( THIS_HOST )
{
elog ("%s Cannot be 'locked' - controller is 'active'\n", node_ptr->hostname.c_str());
reason = FAIL_UNIT_ACTIVE ;
return (false);
}
/* Rule 2 - Cannot lock inactive controller if the floating storage
* ceph monitor is locked */
if (( get_storage_backend() == CGCS_STORAGE_CEPH ) &&
( is_storage_mon_enabled () == false ))
{
wlog ("%s cannot be 'locked' - failed storage redundancy check\n", node_ptr->hostname.c_str());
reason = FAIL_NEED_STORAGE_MON ;
return (false);
}
ilog ("%s can be locked\n", node_ptr->hostname.c_str());
return (true);
}
else if ( is_compute(node_ptr) )
{
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
dlog ("%s is 'unlocked' and can be 'locked'\n", node_ptr->hostname.c_str());
}
return (true);
}
/* Deal with lock of storage cases - Rules 3 and 4 */
else if ( is_storage(node_ptr) )
{
/* Only need to semantic check if this host is unlocked-enabled */
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
/* Both active controllers path ... */
if ( num_controllers_enabled () >= 2 )
{
/* If we are locking storage-0 make sure that there
* is another enabled storage node */
if ( !node_ptr->hostname.compare("storage-0") )
{
/* We already know that this storage node is enabled so
* we need to see a count greated than 1 */
if ( enabled_storage_nodes () > 1 )
{
/* We have 2 enabled controllers and 2 enabled
* storage nodes so we can allow lock of storage-0 */
ilog ("%s can be locked - there is storage redundancy\n",
node_ptr->hostname.c_str());
return (true);
}
/* Rule 4 - Cannot lock last storage node */
else
{
wlog ("%s cannot be locked - no storage redundancy\n",
node_ptr->hostname.c_str());
reason = FAIL_LOW_STORAGE ;
return (false);
}
}
/* O.K. we are trying to lock a storage host tha is not
* the floating storage monitor */
else if (( is_storage_mon_enabled () == true ) &&
( enabled_storage_nodes() > 1 ))
{
/* We have - 2 enabled controllers
* - the storage mon is enabled and
* - is not this one. */
ilog ("%s can be locked - there is storage redundancy\n",
node_ptr->hostname.c_str());
return (true);
}
/* Rule 4 - Cannot lock last storage node */
else if (enabled_storage_nodes() <= 1)
{
wlog ("%s cannot be locked - no storage redundancy\n",
node_ptr->hostname.c_str());
reason = FAIL_LOW_STORAGE ;
return (false);
}
else
{
/* Other redundancy checks here and in SysInv have passed. */
ilog ("%s can be locked - storage redundancy filters passed.\n",
node_ptr->hostname.c_str());
return (true);
}
}
/* Rule 3 - Cannot lock storage node with monitor if inactive
* controller is locked or not present and there is
* not another storage node enabled */
else
{
/* Cannot lock storage-0 if there is only a single enabled controller */
if ( !node_ptr->hostname.compare("storage-0") )
{
wlog ("%s cannot be locked - simplex system\n",
node_ptr->hostname.c_str());
reason = FAIL_NEED_STORAGE_MON ;
return (false);
}
/* Only allow locking of a storage node if there is another in service */
else if (( is_storage_mon_enabled () == true ) &&
( enabled_storage_nodes() > 1 ))
{
ilog ("%s can be locked - there is storage redundancy\n",
node_ptr->hostname.c_str());
return (true);
}
/* Rule 4 - Cannot lock last storage node */
else
{
wlog ("%s cannot be locked - no redundancy\n",
node_ptr->hostname.c_str());
reason = FAIL_LOW_STORAGE ;
return (false);
}
}
}
else
{
ilog ("%s allowing lock of disabled storage host\n",
node_ptr->hostname.c_str());
return (true);
}
}
else
{
elog ("%s unsupported nodetype (%u)\n",
node_ptr->hostname.c_str(),
node_ptr->nodetype);
return (false);
}
}
else
{
dlog ("Unknown uuid: %s\n", uuid.c_str());
/* allowing lock as a means to clear up error */
return (true);
}
}
int nodeLinkClass::rem_host ( string & hostname )
{
int rc = FAIL ;
if ( ! hostname.empty() )
{
hostname_inventory.remove ( hostname );
rc = nodeLinkClass::remNode ( hostname );
}
return ( rc );
}
void nodeLinkClass::set_my_hostname ( string hostname )
{
struct nodeLinkClass::node * node_ptr ;
nodeLinkClass::my_hostname = hostname ;
/* set it in the local inventory as well */
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->hostname = hostname ;
}
}
string nodeLinkClass::get_my_hostname ( void )
{
return( nodeLinkClass::my_hostname );
}
void nodeLinkClass::set_my_local_ip ( string & ip )
{
nodeLinkClass::node* node_ptr ;
nodeLinkClass::my_local_ip = ip ;
/* set it in the local inventory as well */
node_ptr = nodeLinkClass::getNode ( my_hostname );
if ( node_ptr != NULL )
{
node_ptr->ip = ip ;
}
}
string nodeLinkClass::get_my_local_ip ( void )
{
return( nodeLinkClass::my_local_ip );
}
void nodeLinkClass::set_my_float_ip ( string & ip )
{
nodeLinkClass::my_float_ip = ip ;
}
string nodeLinkClass::get_my_float_ip ( void )
{
return( nodeLinkClass::my_float_ip );
}
static string null_str = "" ;
string nodeLinkClass::get_hostaddr ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->ip );
}
return ( null_str );
}
string nodeLinkClass::get_infra_hostaddr ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->infra_ip );
}
return ( null_str );
}
string nodeLinkClass::get_hostIfaceMac ( string & hostname, int iface )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
if ( iface == MGMNT_IFACE )
return ( node_ptr->mac );
if ( iface == INFRA_IFACE )
return ( node_ptr->infra_mac );
}
ilog ("%s has unknown mac address for %s interface\n", hostname.c_str(), get_iface_name_str(iface));
return ( null_str );
}
int nodeLinkClass::set_hostaddr ( string & hostname, string & ip )
{
int rc = FAIL ;
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->ip = ip ;
rc = PASS ;
}
return ( rc );
}
int nodeLinkClass::set_infra_hostaddr ( string & hostname, string & ip )
{
int rc = FAIL ;
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->infra_ip = ip ;
rc = PASS ;
}
return ( rc );
}
string nodeLinkClass::get_hostname ( string & hostaddr )
{
if (( hostaddr == LOOPBACK_IPV6 ) ||
( hostaddr == LOOPBACK_IP ) ||
( hostaddr == LOCALHOST ))
{
return(my_hostname);
}
else
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostaddr );
if ( node_ptr != NULL )
{
return ( node_ptr->hostname );
}
return ( null_str );
}
}
string nodeLinkClass::get_hostname_from_bm_ip ( string bm_ip )
{
if ( head )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ! ptr->bm_ip.compare(bm_ip) )
{
return ( ptr->hostname );
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return ("") ;
}
int nodeLinkClass::num_hosts ( void )
{
return ( nodeLinkClass::hosts ) ;
}
void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
if ( is_host_services_cmd ( msg.cmd ) )
{
/*****************************************************
* Host Services Request's Response Handling
*****************************************************/
node_ptr->host_services_req.status = msg.parm[0] ;
if ( msg.cmd == node_ptr->host_services_req.cmd )
{
// print_mtc_message ( &msg, true );
/* if num > 1 then expect a host services result message */
if ( msg.cmd == MTC_CMD_HOST_SVCS_RESULT )
{
if ( !node_ptr->host_services_req.ack )
{
slog ("%s %s without initial command ACK\n",
hostname.c_str(),
node_ptr->host_services_req.name.c_str());
}
node_ptr->host_services_req.rsp = msg.cmd ;
if ( msg.buf[0] != '\0' )
{
node_ptr->host_services_req.status_string = msg.buf ;
}
}
/* Check to see if the start/stop host services command
* response demonstrates support for the enhanced host
* services extension. */
else if (( msg.num > 1 ) && ( msg.parm[1] == MTC_ENHANCED_HOST_SERVICES ))
{
dlog ("%s %s request ack\n",
hostname.c_str(),
node_ptr->host_services_req.name.c_str());
node_ptr->host_services_req.ack = true ;
}
else
{
ilog ("%s %s request ack (legacy mode)\n",
hostname.c_str(),
node_ptr->host_services_req.name.c_str());
/* support legacy client by copying the cmd to cmdRsp */
node_ptr->host_services_req.status = PASS ;
node_ptr->host_services_req.rsp = msg.cmd ;
node_ptr->host_services_req.ack = MTC_CMD_NONE ;
}
}
if ( msg.num && ( node_ptr->host_services_req.status != PASS ))
{
dlog ("%s %s command failed (rc:%d) [%s]\n",
hostname.c_str(),
get_mtcNodeCommand_str(msg.cmd),
node_ptr->host_services_req.status,
node_ptr->host_services_req.status_string.empty() ?
"no error string" : node_ptr->host_services_req.status_string.c_str());
}
}
else
{
node_ptr->cmdRsp = msg.cmd ;
if ( msg.num > 0 )
node_ptr->cmdRsp_status = msg.parm[0] ;
else
node_ptr->cmdRsp_status = -1 ;
dlog ("%s '%s' command response status [%u:%s]\n",
hostname.c_str(),
node_ptr->cmdName.c_str(),
msg.num ? node_ptr->cmdRsp_status : PASS,
node_ptr->cmdRsp_status_string.empty() ? "empty" : node_ptr->cmdRsp_status_string.c_str());
}
}
}
unsigned int nodeLinkClass::get_cmd_resp ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->cmdRsp ) ;
}
return (-1);
}
mtc_client_enum nodeLinkClass::get_activeClient ( string hostname )
{
nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->activeClient ) ;
}
else
{
slog ("Host lookup failed for '%s'\n", hostname.c_str());
}
return (CLIENT_NONE);
}
int nodeLinkClass::set_activeClient ( string hostname, mtc_client_enum client )
{
nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->activeClient = client ;
return (PASS);
}
else
{
slog ("Host lookup failed for '%s'\n", hostname.c_str());
}
return (FAIL_HOSTNAME_LOOKUP);
}
/*****************************************************************************
*
* Name : set_mtcAlive
*
* Description:
*
* If mtcAlive is ungated then
*
* 1. manage the online/offline state bools
* 2. increment the mtcAlive count and
* 3. set the mtcAlive received bool for the specified interface
*
*****************************************************************************/
void nodeLinkClass::set_mtcAlive ( string & hostname, int interface )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
if ( node_ptr->mtcAlive_gate == false )
{
node_ptr->mtcAlive_online = true ;
node_ptr->mtcAlive_offline = false ;
node_ptr->mtcAlive_count++ ;
if ( interface == INFRA_INTERFACE )
{
node_ptr->mtcAlive_infra = true ;
}
else
{
node_ptr->mtcAlive_mgmnt = true ;
}
}
}
}
bool nodeLinkClass::get_mtcAlive_gate ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->mtcAlive_gate ) ;
}
/* If we can't find the node then gate off the alive messages */
return (true);
}
void nodeLinkClass::ctl_mtcAlive_gate ( string & hostname, bool gated )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->mtcAlive_gate = gated ;
if ( gated == true )
{
alog ("%s mtcAlive gated\n", node_ptr->hostname.c_str());
}
else
{
alog ("%s mtcAlive ungated\n", node_ptr->hostname.c_str());
}
}
}
/* Main-Function Go Enabled member Functions */
void nodeLinkClass::set_goEnabled ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->goEnabled = true ;
}
}
bool nodeLinkClass::get_goEnabled ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->goEnabled ) ;
}
return (false);
}
void nodeLinkClass::set_goEnabled_failed ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->goEnabled_failed = true ;
}
}
/* Sub-Function Go Enabled Member Functions */
void nodeLinkClass::set_goEnabled_subf ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->goEnabled_subf = true ;
}
}
bool nodeLinkClass::get_goEnabled_subf ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->goEnabled_subf ) ;
}
return (false);
}
void nodeLinkClass::set_goEnabled_failed_subf ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->goEnabled_failed_subf = true ;
}
}
/* Set and Get Uptime Member Function */
void nodeLinkClass::set_uptime ( struct nodeLinkClass::node * node_ptr, unsigned int uptime, bool force )
{
if ( node_ptr != NULL )
{
/* Force the uptime into the database if
* - passed in value is 0 and current value is !0
* - passed in value is !0 and current value is 0
* - if ther force option is used
* Otherwise allow the audit to push time to the database
*/
if ((force == true ) ||
(( uptime != 0 ) && ( node_ptr->uptime == 0 )) ||
(( node_ptr->uptime != 0 ) && ( uptime == 0 )))
{
mtcInvApi_update_uptime ( node_ptr, uptime );
}
node_ptr->uptime = uptime ;
}
}
void nodeLinkClass::set_uptime ( string & hostname, unsigned int uptime, bool force )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
set_uptime ( node_ptr, uptime, force );
}
unsigned int nodeLinkClass::get_uptime ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->uptime ) ;
}
return (0);
}
void nodeLinkClass::set_uptime_refresh_ctr ( string & hostname, int value )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->uptime_refresh_counter = value ;
}
}
int nodeLinkClass::get_uptime_refresh_ctr ( string & hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->uptime_refresh_counter ) ;
}
return (0);
}
void nodeLinkClass::set_mtce_flags ( string hostname, int flags )
{
nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
/* Deal with host level */
node_ptr->mtce_flags = flags ;
if ( flags & MTC_FLAG__MAIN_GOENABLED )
node_ptr->goEnabled = true ;
else
node_ptr->goEnabled = false ;
/* Track host patching state by Out-Of-Band flag */
if ( flags & MTC_FLAG__PATCHING )
{
if ( node_ptr->patching == false )
{
plog ("%s software patching has begun\n", node_ptr->hostname.c_str());
}
node_ptr->patching = true ;
}
else
{
if ( node_ptr->patching == true )
{
plog ("%s software patching done\n", node_ptr->hostname.c_str());
}
node_ptr->patching = false ;
}
/* Track host patched state by Out-Of-Band flag.
* This flag is set when the host is patched but not reset */
if ( flags & MTC_FLAG__PATCHED )
{
if ( node_ptr->patched == false )
{
plog ("%s software patched\n", node_ptr->hostname.c_str());
}
node_ptr->patched = true ;
}
else
{
if ( node_ptr->patched == true )
{
plog ("%s software patch is applied\n", node_ptr->hostname.c_str());
}
node_ptr->patched = false ;
}
/* Deal with sub-function if AIO controller host */
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if ( flags & MTC_FLAG__SUBF_GOENABLED )
{
if ( node_ptr->operState_subf == MTC_OPER_STATE__ENABLED )
{
node_ptr->goEnabled_subf = true ;
}
}
else
{
node_ptr->goEnabled_subf = false ;
}
}
}
}
void nodeLinkClass::set_health ( string & hostname, int health )
{
switch ( health )
{
case NODE_HEALTH_UNKNOWN:
case NODE_HEALTHY:
case NODE_UNHEALTHY:
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
if ( health == NODE_UNHEALTHY )
{
if ( node_ptr->health != NODE_UNHEALTHY )
{
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
wlog ("%s Health State Change -> UNHEALTHY\n", hostname.c_str());
}
}
}
node_ptr->health = health ;
}
break ;
}
default:
{
wlog ("%s Unexpected health code (%d), defaulting to (unknown)\n", hostname.c_str(), health );
break ;
}
}
}
/*************************************************************************************
*
* Name : manage_bmc_provisioning
*
* Description: This utility manages a change in bmc provisioning for
* bm region EXTERNAL mode. Creates provisioning logs and
* sends START and STOP monitoring commands to the hardware monitor.
*
* Warning : Should only be called when there is a change to BM provisioning.
* as it will first always first disable provisioning and then
* decides whether it needs to be re-enabled or not.
*
*************************************************************************************/
int nodeLinkClass::manage_bmc_provisioning ( struct node * node_ptr )
{
int rc = PASS ;
bool was_provisioned = node_ptr->bm_provisioned ;
set_bm_prov ( node_ptr, false);
if ((hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )) &&
(!node_ptr->bm_un.empty()))
{
if ( was_provisioned == true )
{
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_BM_REPROVISIONED );
}
else
{
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_BM_PROVISIONED );
}
set_bm_prov ( node_ptr, true );
}
else if ( was_provisioned == true )
{
send_hwmon_command(node_ptr->hostname,MTC_CMD_STOP_HOST);
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_BM_DEPROVISIONED );
}
/* Send hmond updated bm info */
ilog ("%s sending board management info update to hwmond\n", node_ptr->hostname.c_str() );
if ( ( rc = send_hwmon_command(node_ptr->hostname,MTC_CMD_MOD_HOST) ) == PASS )
{
if ( node_ptr->bm_provisioned == true )
{
rc = send_hwmon_command(node_ptr->hostname,MTC_CMD_START_HOST);
}
else
{
rc = send_hwmon_command(node_ptr->hostname,MTC_CMD_STOP_HOST);
}
if ( rc )
{
wlog ("%s failed to send START or STOP command to hwmond\n", node_ptr->hostname.c_str());
}
}
else
{
wlog ("%s failed to send MODIFY command to hwmond\n", node_ptr->hostname.c_str());
}
return (rc);
}
bool nodeLinkClass::is_bm_ip_already_used ( string bm_ip )
{
if ( hostUtil_is_valid_ip_addr ( bm_ip ) == true )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( !bm_ip.compare(ptr->bm_ip) )
{
return (true);
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return (false);
}
int nodeLinkClass::set_bm_type ( string hostname , string bm_type )
{
int rc = FAIL_HOSTNAME_LOOKUP ;
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->bm_type = bm_type ;
dlog ("%s '%s' updated to '%s'\n",
hostname.c_str(),
MTC_JSON_INV_BMTYPE,
node_ptr->bm_type.c_str());
rc = PASS ;
}
return (rc);
}
int nodeLinkClass::set_bm_un ( string hostname , string bm_un )
{
int rc = FAIL_HOSTNAME_LOOKUP ;
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
if ( bm_un.length() )
{
node_ptr->bm_un = bm_un ;
}
else
{
node_ptr->bm_un = NONE ;
}
dlog ("%s '%s' updated to '%s'\n",
hostname.c_str(),
MTC_JSON_INV_BMUN,
node_ptr->bm_un.c_str());
rc = PASS ;
}
return (rc);
}
int nodeLinkClass::set_bm_ip ( string hostname , string bm_ip )
{
int rc = FAIL_HOSTNAME_LOOKUP ;
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->bm_ip = bm_ip ;
dlog ("%s '%s' updated to '%s'\n",
hostname.c_str(),
MTC_JSON_INV_BMIP,
node_ptr->bm_ip.c_str());
rc = PASS ;
}
return (rc);
}
void nodeLinkClass::bmc_access_data_init ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr )
{
node_ptr->bm_accessible = false;
node_ptr->mc_info_query_active = false ;
node_ptr->mc_info_query_done = false ;
node_ptr->reset_cause_query_active = false ;
node_ptr->reset_cause_query_done = false ;
node_ptr->power_status_query_active = false;
node_ptr->power_status_query_done = false ;
}
}
/*****************************************************************************
*
* Name : set_bm_prov
*
* Description: Manage the local provisioning state of the
* board management connection.
*
* Assumptions: Does not set HTTP requests to sysinv so it is
* safe to call from thje modify handler
*
* Does not clear alarms.
*
******************************************************************************/
int nodeLinkClass::set_bm_prov ( struct nodeLinkClass::node * node_ptr, bool state )
{
int rc = FAIL_HOSTNAME_LOOKUP ;
if ( node_ptr != NULL )
{
ilog ("%s bmc %sprovision request (provisioned:%s)\n", // ERIC blog
node_ptr->hostname.c_str(),
state ? "" : "de",
node_ptr->bm_provisioned ? "Yes" : "No" );
/* Clear the alarm if we are starting fresh from an unprovisioned state */
if (( node_ptr->bm_provisioned == false ) && ( state == true ))
{
/* BMC is managed by IPMI/IPMITOOL */
ilog ("%s starting BM ping monitor to address '%s'\n",
node_ptr->hostname.c_str(),
node_ptr->bm_ip.c_str());
// mtcTimer_reset ( node_ptr->bm_ping_info.timer );
node_ptr->bm_ping_info.ip = node_ptr->bm_ip ;
node_ptr->bm_ping_info.stage = PINGUTIL_MONITOR_STAGE__OPEN ;
bmc_access_data_init ( node_ptr );
node_ptr->bm_ping_info.timer_handler = &mtcTimer_handler ;
node_ptr->thread_extra_info.bm_pw =
node_ptr->bm_pw =
get_bm_password (node_ptr->uuid.data());
node_ptr->thread_extra_info.bm_ip = node_ptr->bm_ip ;
node_ptr->thread_extra_info.bm_un = node_ptr->bm_un ;
send_hwmon_command(node_ptr->hostname, MTC_CMD_ADD_HOST);
send_hwmon_command(node_ptr->hostname, MTC_CMD_START_HOST);
}
/* handle the case going from provisioned to not provisioned */
else if (( node_ptr->bm_provisioned == true ) && ( state == false ))
{
/* BMC is managed by IPMI/IPMITOOL */
ilog ("%s deprovisioning bmc ; accessible:%s\n",
node_ptr->hostname.c_str(),
node_ptr->bm_accessible ? "Yes" : "No" );
pingUtil_fini ( node_ptr->bm_ping_info );
bmc_access_data_init ( node_ptr );
node_ptr->bm_accessible = false;
if ( !thread_idle( node_ptr->ipmitool_thread_ctrl ) )
{
thread_kill ( node_ptr->ipmitool_thread_ctrl , node_ptr->ipmitool_thread_info);
}
node_ptr->mc_info_query_active = false ;
node_ptr->mc_info_query_done = false ;
node_ptr->reset_cause_query_active = false ;
node_ptr->reset_cause_query_done = false ;
node_ptr->power_status_query_active = false;
node_ptr->power_status_query_done = false ;
/* send a delete to hwmon if the provisioning data is NONE */
if ( hostUtil_is_valid_bm_type ( node_ptr->bm_type ) == false )
{
send_hwmon_command(node_ptr->hostname, MTC_CMD_DEL_HOST);
}
}
if (( node_ptr->bm_provisioned == false ) && ( state == true ))
{
/* start the connection timer - if it expires before we
* are 'accessible' then the BM Alarm is raised.
* Timer is further managed in mtcNodeHdlrs.cpp */
plog ("%s bmc access timer started (%d secs)\n", node_ptr->hostname.c_str(), MTC_MINS_2);
mtcTimer_reset ( node_ptr->bmc_access_timer );
mtcTimer_start ( node_ptr->bmc_access_timer, mtcTimer_handler, MTC_MINS_2 );
}
node_ptr->bm_provisioned = state ;
}
return (rc);
}
string nodeLinkClass::get_bm_ip ( string hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return (node_ptr->bm_ip);
}
elog ("%s bm ip lookup failed\n", hostname.c_str() );
return ("");
}
string nodeLinkClass::get_bm_un ( string hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return (node_ptr->bm_un);
}
elog ("%s bm username lookup failed\n", hostname.c_str() );
return ("");
}
string nodeLinkClass::get_bm_type ( string hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return (node_ptr->bm_type);
}
elog ("%s bm type lookup failed\n", hostname.c_str() );
return ("");
}
string nodeLinkClass::get_hwmon_info ( string hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
string hwmon_info = "" ;
hwmon_info.append( "{ \"personality\":\"" ) ;
hwmon_info.append( node_ptr->type );
hwmon_info.append( "\"");
hwmon_info.append( ",\"hostname\":\"" ) ;
hwmon_info.append( node_ptr->hostname );
hwmon_info.append( "\"");
hwmon_info.append( ",\"bm_ip\":\"" ) ;
hwmon_info.append( node_ptr->bm_ip );
hwmon_info.append( "\"");
hwmon_info.append( ",\"bm_type\":\"");
hwmon_info.append( node_ptr->bm_type );
hwmon_info.append( "\"");
hwmon_info.append( ",\"bm_username\":\"");
hwmon_info.append( node_ptr->bm_un );
hwmon_info.append( "\"");
hwmon_info.append( ",\"uuid\":\"" ) ;
hwmon_info.append( node_ptr->uuid );
hwmon_info.append( "\" }");
return (hwmon_info);
}
elog ("%s hwmon info lookup failed\n", hostname.c_str() );
return ("");
}
int nodeLinkClass::manage_shadow_change ( string hostname )
{
int rc = FAIL ;
if ( ! hostname.empty() )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
rc = PASS ;
if ( node_ptr->configAction == MTC_CONFIG_ACTION__NONE )
{
node_ptr->configStage = MTC_CONFIG__START ;
node_ptr->configAction = MTC_CONFIG_ACTION__CHANGE_PASSWD ;
}
else
{
node_ptr->configAction = MTC_CONFIG_ACTION__CHANGE_PASSWD_AGAIN ;
}
}
}
return (rc);
}
/** Returns the number of compute hosts that are operationally 'enabled' */
int nodeLinkClass::enabled_compute_nodes ( void )
{
int temp_count = 0 ;
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if (( is_compute( ptr )) &&
( ptr->operState == MTC_OPER_STATE__ENABLED ))
{
temp_count++ ;
}
else if (( is_compute_subfunction ( ptr )) &&
( ptr->operState_subf == MTC_OPER_STATE__ENABLED ))
{
temp_count++ ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
return (temp_count);
}
/** Returns the number of storage hosts that are operationally 'enabled' */
int nodeLinkClass::enabled_storage_nodes ( void )
{
int temp_count = 0 ;
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if (( is_storage( ptr ) ) &&
( ptr->operState == MTC_OPER_STATE__ENABLED ))
{
temp_count++ ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
return (temp_count);
}
int nodeLinkClass::enabled_nodes ( void )
{
int temp_count = 0 ;
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->operState == MTC_OPER_STATE__ENABLED )
{
temp_count++ ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
/* Remove the active controller from the count */
if (temp_count)
temp_count-- ;
return (temp_count);
}
/** Returns the system's storage back end type ceph or nfs */
int nodeLinkClass::get_storage_backend ( void )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( is_storage(ptr) )
return ( CGCS_STORAGE_CEPH ) ;
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
return (CGCS_STORAGE_NFS);
}
/** Returns true if the storage pool has a monitor running on
* an unlocked-enabled storage host */
bool nodeLinkClass::is_storage_mon_enabled ( void )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if (( is_storage(ptr) ) &&
( ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( !ptr->hostname.compare("storage-0")))
{
return ( true ) ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
return (false);
}
/** Returns number of enabled controllers */
int nodeLinkClass::num_controllers_enabled ( void )
{
int cnt = 0 ;
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if (( is_controller(ptr) ) &&
( ptr->operState == MTC_OPER_STATE__ENABLED ))
{
++cnt ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
return (cnt);
}
int nodeLinkClass::service_netlink_events ( int nl_socket , int ioctl_socket )
{
std::list<string> links_gone_down ;
std::list<string> links_gone_up ;
std::list<string>::iterator iter_curr_ptr ;
if ( get_netlink_events ( nl_socket, links_gone_down, links_gone_up ))
{
const char * mgmnt_iface_ptr = daemon_get_cfg_ptr()->mgmnt_iface ;
const char * infra_iface_ptr = daemon_get_cfg_ptr()->infra_iface ;
bool running = false ;
if ( !links_gone_down.empty() )
{
//wlog ("one or more links have dropped\n");
/* Look at the down list */
for ( iter_curr_ptr = links_gone_down.begin();
iter_curr_ptr != links_gone_down.end() ;
iter_curr_ptr++ )
{
bool care = false ;
if ( iter_curr_ptr->size() == 0 )
continue ;
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
mgmnt_link_up_and_running = false ;
wlog ("Management link %s is down\n", mgmnt_iface_ptr );
}
if ( !strcmp (infra_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
infra_link_up_and_running = false ;
wlog ("Infrastructure link %s is down\n", infra_iface_ptr );
}
if ( care == true )
{
if ( get_link_state ( ioctl_socket, iter_curr_ptr->data(), &running ) == PASS )
{
wlog ("%s is down (oper:%s)\n", iter_curr_ptr->c_str(), running ? "up" : "down" );
}
else
{
wlog ("%s is down (driver query failed)\n", iter_curr_ptr->c_str() );
}
}
}
}
if ( !links_gone_up.empty() )
{
// wlog ("one or more links have recovered\n");
/* Look at the up list */
for ( iter_curr_ptr = links_gone_up.begin();
iter_curr_ptr != links_gone_up.end() ;
iter_curr_ptr++ )
{
bool care = false ;
if ( iter_curr_ptr->size() == 0 )
continue ;
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
{
mgmnt_link_up_and_running = true ;
wlog ("Management link %s is up\n", mgmnt_iface_ptr );
}
if ( !strcmp (infra_iface_ptr, iter_curr_ptr->data()))
{
infra_link_up_and_running = true ;
wlog ("Infrastructure link %s is up\n", infra_iface_ptr );
}
if ( care == true )
{
if ( get_link_state ( ioctl_socket, iter_curr_ptr->data(), &running ) == PASS )
{
wlog ("%s is up (oper:%s)\n", iter_curr_ptr->c_str(), running ? "up" : "down" );
}
else
{
wlog ("%s is up (driver query failed)\n", iter_curr_ptr->c_str() );
}
}
}
}
}
return (PASS);
}
/* ***************************************************************************
*
* Name : hbs_minor_clear
*
* Description: Clear the heartbeat minor state from the specified host.
*
* Manage overall mnfa counts and call mnfa_exit when the number crosses
* the recovery threwshold.
*
******************************************************************************/
void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, iface_enum iface )
{
if ( mnfa_host_count[iface] == 0 )
return ;
/* Nothing to do if this host is not in the hbs_minor state */
if ( node_ptr->hbs_minor[iface] == true )
{
/* clear it - possibly temporarily */
node_ptr->hbs_minor[iface] = false ;
/* manage counts over heartbeat failure */
if ( mnfa_host_count[iface] )
{
/* If we are mnfa_active AND now below the threshold
* then trigger mnfa_exit */
if (( --mnfa_host_count[iface] < mnfa_calculate_threshold( node_ptr->hostname ) ) &&
( mnfa_active == true ))
{
wlog ("%s MNFA exit with graceful recovery (%s:%d)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
mnfa_host_count[iface] );
/* re-activate this to true so that it is part
* of the recovery group in mnfa_exit */
node_ptr->hbs_minor[iface] = true ;
mnfa_exit ( false );
}
/* Otherwise this is a single host that has recovered
* possibly as part of a mnfa group or simply a lone wolf */
else
{
if ( node_ptr->mnfa_graceful_recovery == true )
{
ilog ("%s MNFA removed from pool\n", node_ptr->hostname.c_str() );
mnfa_awol_list.remove(node_ptr->hostname);
}
mnfa_recover_host ( node_ptr );
if ( mnfa_active == true )
{
/* Restart the heartbeat for this recovered host */
send_hbs_command ( node_ptr->hostname, MTC_RESTART_HBS );
/* don't restart graceful recovery for this host if its already in that FSM */
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
{
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
}
}
}
}
}
/* lets clean-up - walk the inventory and make sure the
* avoidance count meets the number of hosts in the minor
* degrade state */
int temp_count = 0 ;
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->hbs_minor[iface] == true )
{
if ( ptr->operState != MTC_OPER_STATE__ENABLED )
{
slog ("%s found hbs_minor set for disabled host\n" , ptr->hostname.c_str() );
}
temp_count++ ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
if ( temp_count != mnfa_host_count[iface] )
{
slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
mnfa_host_count[iface], temp_count );
mnfa_host_count[iface] = temp_count ;
mnfa_host_count[iface] = temp_count ;
}
else
{
wlog ("%s MNFA host tally (%s:%d)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
mnfa_host_count[iface] );
}
}
/****************************************************************************
*
* Name : manage_dor_recovery
*
* Description: Enable DOR recovery mode for this host.
* Generate log
*
* The severity parm is used to enhance the logs to indicate what
* severity level this utility was called from ;
* minor, major, or critical
*
***************************************************************************/
void nodeLinkClass::manage_dor_recovery ( struct nodeLinkClass::node * node_ptr,
EFmAlarmSeverityT severity )
{
if (( severity == FM_ALARM_SEVERITY_CLEAR ) &&
( node_ptr->dor_recovery_mode == true ))
{
node_ptr->dor_recovery_mode = false ;
node_ptr->was_dor_recovery_mode = true ;
}
else if (( severity == FM_ALARM_SEVERITY_CRITICAL ) &&
( node_ptr->dor_recovery_mode == false ))
{
struct timespec ts ;
clock_gettime (CLOCK_MONOTONIC, &ts );
wlog ("%-12s is waiting ; DOR recovery %2ld:%02ld mins (%4ld secs)\n",
node_ptr->hostname.c_str(),
ts.tv_sec/60,
ts.tv_sec%60,
ts.tv_sec);
node_ptr->dor_recovery_time = 0 ;
node_ptr->dor_recovery_mode = true ;
node_ptr->hbsClient_ready = false ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
/* don't restart graceful recovery for this host if its already in that FSM */
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ))
{
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
}
}
}
/** Manage heartbeat failure events */
void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface, bool clear_event )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown host\n", hostname.c_str());
return ;
}
/* Handle clear */
if ( clear_event == true )
{
hbs_minor_clear ( node_ptr, iface );
plog ("%s %s Heartbeat failure clear\n", hostname.c_str(), get_iface_name_str(iface));
// if (( mnfa_host_count == 0 ) || ( iface == INFRA_IFACE ))
if ( mnfa_host_count[iface] == 0 ) // || ( iface == INFRA_IFACE ))
{
slog ("%s %s Heartbeat failure clear\n", hostname.c_str(), get_iface_name_str(iface));
node_ptr->hbs_failure[iface] = false ;
}
}
else if ( this->mtcTimer_dor.tid )
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
}
else
{
mnfa_add_host ( node_ptr , iface );
if ( mnfa_active == false )
{
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
if ( iface == INFRA_IFACE )
{
node_ptr->heartbeat_failed[INFRA_IFACE] = true ;
}
else if ( iface == MGMNT_IFACE )
{
node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
}
if ( mnfa_host_count[iface] < mnfa_calculate_threshold( hostname ))
{
elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
{
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL;
}
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER )
{
wlog ("%s restarting graceful recovery\n", hostname.c_str() );
}
else
{
wlog ("%s starting graceful recovery\n", hostname.c_str() );
}
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
}
else
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
}
}
}
}
}
void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown host\n", hostname.c_str());
return ;
}
if ( iface == MAX_IFACES )
{
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
node_ptr->heartbeat_failed[i] = false ;
}
}
else
{
node_ptr->heartbeat_failed[iface] = false ;
}
}
/** Manage compute host maintenance based on this heartbeat
* degrade event and others that may be present at this moment */
void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface, bool clear_event )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown host\n", hostname.c_str());
return ;
}
if ( clear_event == true )
{
alog ("%s %s Heartbeat Degrade (clear)\n", hostname.c_str(), get_iface_name_str(iface));
manage_heartbeat_clear ( hostname, iface );
if ( iface == MGMNT_IFACE )
{
node_ptr->no_work_log_throttle = 0 ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
else if ( iface == INFRA_IFACE )
{
node_ptr->no_work_log_throttle = 0 ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ;
}
hbs_minor_clear ( node_ptr, iface );
}
else if ( this->mtcTimer_dor.tid )
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MAJOR );
}
else
{
if ( mnfa_active == false )
{
wlog ("%s %s *** Heartbeat Miss ***\n", hostname.c_str(), get_iface_name_str(iface) );
}
mnfa_add_host ( node_ptr, iface );
if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
{
if ( iface == MGMNT_IFACE )
{
/* Don't raise the alarm again if this host is already degraded */
if ( !(node_ptr->degrade_mask & DEGRADE_MASK_HEARTBEAT_MGMNT) )
{
node_ptr->degrade_mask |= DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
}
if ( iface == INFRA_IFACE )
{
if ( !(node_ptr->degrade_mask & DEGRADE_MASK_HEARTBEAT_INFRA) )
{
node_ptr->degrade_mask |= DEGRADE_MASK_HEARTBEAT_INFRA ;
}
}
}
}
}
/** Manage heartbeat minor events */
void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface, bool clear_event )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown host\n", hostname.c_str());
return ;
}
/* is this a clear event ? */
if ( clear_event == true )
{
alog ("%s %s Heartbeat Minor (clear)\n", hostname.c_str(), get_iface_name_str(iface));
hbs_minor_clear ( node_ptr, iface );
}
/* if not a clear then only set if the host is enabled
* - we don't care about disabled hosts */
else if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
{
if ( this->mtcTimer_dor.tid )
{
manage_dor_recovery ( node_ptr, FM_ALARM_SEVERITY_MINOR );
}
else if ( node_ptr->hbs_minor[iface] != true )
{
mnfa_add_host ( node_ptr, iface );
}
}
}
/** Interface to declare that a key service on the
* specified host is up, running and ready */
int nodeLinkClass::declare_service_ready ( string & hostname,
unsigned int service )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host\n", hostname.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
else if ( service == MTC_SERVICE_PMOND )
{
node_ptr->pmond_ready = true ;
plog ("%s got pmond ready event\n", hostname.c_str());
/* A ready event means that pmond pocess has started.
* Any previous history is gone. Cleanup mtce.
* If there are still process issues on this host then
* they will be reported again.*/
node_ptr->degrade_mask &= ~DEGRADE_MASK_PMON ;
return (PASS);
}
else if ( service == MTC_SERVICE_HWMOND )
{
node_ptr->hwmond_ready = true ;
plog ("%s got hwmond ready event\n", hostname.c_str());
if ( node_ptr->bm_provisioned == true )
{
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
return (PASS);
}
else if ( service == MTC_SERVICE_RMOND )
{
node_ptr->rmond_ready = true ;
plog ("%s got rmond ready event\n", hostname.c_str());
return (PASS);
}
else if ( service == MTC_SERVICE_HEARTBEAT )
{
if ( node_ptr->hbsClient_ready == false )
{
node_ptr->hbsClient_ready = true ;
plog ("%s got hbsClient ready event\n", hostname.c_str());
}
return (PASS);
}
else
{
return (FAIL_BAD_CASE);
}
}
/** Clear pmond degrade flag */
int nodeLinkClass::degrade_pmond_clear ( string & hostname )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host\n", hostname.c_str());
return (FAIL_UNKNOWN_HOSTNAME) ;
}
if ( node_ptr->degrade_mask )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_PMON ;
}
/* The only detectable inservice failures are process failures */
node_ptr->inservice_failed_subf = false ;
node_ptr->inservice_failed = false ;
return (PASS);
}
/* This private API handles event messages from collectd */
int nodeLinkClass::collectd_notify_handler ( string & hostname,
string & resource,
string & state )
{
int rc = PASS ;
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host\n", hostname.c_str());
return (FAIL_UNKNOWN_HOSTNAME) ;
}
if ( state == "clear" )
{
if ( node_ptr->degrade_mask & DEGRADE_MASK_COLLECTD )
{
ilog("%s collectd degrade state change ; assert -> clear (%s)",
hostname.c_str(), resource.c_str());
node_ptr->degrade_mask &= ~DEGRADE_MASK_COLLECTD ;
}
else
{
mlog3("%s collectd degrade 'clear' request (%s)",
hostname.c_str(), resource.c_str());
}
}
else if ( state == "assert" )
{
if ( (node_ptr->degrade_mask & DEGRADE_MASK_COLLECTD) == 0 )
{
ilog("%s collectd degrade state change ; clear -> assert (due to %s)",
hostname.c_str(), resource.c_str());
node_ptr->degrade_mask |= DEGRADE_MASK_COLLECTD ;
}
else
{
mlog3("%s collectd degrade 'assert' request (%s)",
hostname.c_str(), resource.c_str());
}
}
else
{
wlog ("%s collectd degrade state unknown (%s)\n",
hostname.c_str(),
state.c_str());
rc = FAIL_OPERATION ;
}
return (rc);
}
/** Resource Monitor 'Clear' Event handler.
*
* The resource specified will be removed from the
* 'degraded_resources_list' for specified host.
* if there are no other degraded resources or other
* degraded services/reasons against that host then
* this handler will clear the degrade state for the
* specified host all together. */
int nodeLinkClass::degrade_resource_clear ( string & hostname,
string & resource )
{
/* lr - Log Prefix Rmon */
string lr = hostname ;
lr.append (" rmond:");
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host\n", lr.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
/* Clear all resource degrade conditions if there is no resource specified */
/* this is used as a cleanup audit just in case things get stuck */
if ( resource.empty() )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_RESMON ;
node_ptr->degraded_resources_list.clear () ;
}
else if (( node_ptr->degraded_resources_list.empty()) ||
( node_ptr->degrade_mask == DEGRADE_MASK_NONE ))
{
dlog ("%s '%s' Non-Degraded Clear\n",
lr.c_str(), resource.c_str());
}
else
{
if (is_string_in_string_list (node_ptr->degraded_resources_list, resource))
{
node_ptr->degraded_resources_list.remove(resource);
ilog ("%s '%s' Degrade Clear\n",
lr.c_str(), resource.c_str());
}
else
{
wlog ("%s '%s' Unexpected Degrade Clear\n",
lr.c_str(), resource.c_str());
}
if ( node_ptr->degraded_resources_list.empty() )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_RESMON ; ;
}
else
{
string degraded_resources =
get_strings_in_string_list ( node_ptr->degraded_resources_list );
wlog ("%s Degraded Resource List: %s\n",
lr.c_str(), degraded_resources.c_str());
}
}
}
return (PASS);
}
/*********************************************************************************
*
* Name : node_degrade_control
*
* Purpose : Accept and handle degrade raise and clear requests from
* external services.
*
* Description: Maintenance maintains a degrade mask with a bit representing
* various services. The assertion of any one bit causes the host
* to be degraded. All bits need to be cleared in orde to exit
* the degrade state.
*
* Supported 'services' include
*
* "hwmon" - The Hardware Monitor process
*
*
* Future services might be rmon and pmon
*
**********************************************************************************/
int nodeLinkClass::node_degrade_control ( string & hostname, int state, string service )
{
int rc = FAIL_UNKNOWN_HOSTNAME ;
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr )
{
unsigned int service_flag = 0 ;
/* convert service string to degrade mask flag
* - handle empty string and unsupported service */
if ( service.empty() )
{
slog ("%s service not specified", hostname.c_str());
return (FAIL_STRING_EMPTY);
}
else if ( !service.compare("hwmon") )
{
service_flag = DEGRADE_MASK_HWMON ;
}
else
{
slog ("%s service '%s' not supported\n",
hostname.c_str(),
service.c_str());
return (FAIL_INVALID_DATA);
}
switch ( state )
{
/* Handle clear case */
case MTC_DEGRADE_CLEAR:
{
if ( node_ptr->degrade_mask & service_flag )
{
ilog ("%s degrade 'clear' from '%s'\n", hostname.c_str(), service.c_str() );
}
/* clear the mask regardless of host state */
node_ptr->degrade_mask &= ~service_flag ;
rc = PASS ;
break ;
}
/* Handle assertion case */
case MTC_DEGRADE_RAISE:
{
if (( node_ptr->degrade_mask & service_flag ) == 0 )
{
wlog ("%s degrade 'assert' from '%s'\n", hostname.c_str(), service.c_str() );
node_ptr->degrade_mask |= service_flag ;
}
rc = PASS ;
break ;
}
default:
{
wlog ("%s invalid degrade control request '%d'\n", hostname.c_str(), state);
rc = FAIL_BAD_CASE ;
break ;
}
} /* end switch */
}
else
{
dlog ("%s Unknown Host\n", hostname.c_str());
}
return (rc);
}
int nodeLinkClass::hwmon_recovery_monitor ( struct nodeLinkClass::node * node_ptr, int hwmon_event )
{
int delay = MTC_MINS_15 ;
if ( hwmon_event == MTC_EVENT_HWMON_POWERCYCLE )
{
node_ptr->hwmon_powercycle.retries = 0 ;
node_ptr->hwmon_powercycle.queries = 0 ;
node_ptr->hwmon_powercycle.state = RECOVERY_STATE__MONITOR ;
mtcTimer_reset ( node_ptr->hwmon_powercycle.recovery_timer );
mtcTimer_start ( node_ptr->hwmon_powercycle.recovery_timer, mtcTimer_handler, delay );
ilog ("%s starting hwmon 'powercycle' recovery monitor", node_ptr->hostname.c_str());
ilog ("%s ... uninterrupted completion time: %s", node_ptr->hostname.c_str(), future_time(delay));
}
else if ( hwmon_event == MTC_EVENT_HWMON_RESET )
{
node_ptr->hwmon_reset.retries = 0 ;
node_ptr->hwmon_reset.queries = 0 ;
node_ptr->hwmon_reset.state = RECOVERY_STATE__MONITOR ;
mtcTimer_reset ( node_ptr->hwmon_reset.recovery_timer );
mtcTimer_start ( node_ptr->hwmon_reset.recovery_timer, mtcTimer_handler, delay );
ilog ("%s starting hwmon 'reset' recovery monitor", node_ptr->hostname.c_str());
ilog ("%s ... uninterrupted completion time: %s", node_ptr->hostname.c_str(), future_time(delay));
}
return (PASS);
}
/* Hardware Monitor 'Action' Event method
*
* The hardware monitor daemon is calling out a sensor that
* is operating out of spec. The command is the accompanying
* action that hwmond requested as a recovery action to this failure.
* The sensor is the sensor name that triggersed the event. */
int nodeLinkClass::invoke_hwmon_action ( string & hostname, int action, string & sensor )
{
int rc = PASS ;
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
dlog ("%s request to '%s' due to critical sensor '%s' reading\n",
hostname.c_str(),
get_event_str(action).c_str(),
sensor.c_str());
if ( node_ptr )
{
if ( node_ptr->bm_accessible == false )
{
wlog ("%s rejecting %s hwmon action request for '%s' sensor ; BMC not accessible\n",
hostname.c_str(),
get_event_str(action).c_str(),
sensor.c_str());
return (PASS);
}
if ( action == MTC_EVENT_HWMON_RESET )
{
if ( is_active_controller (hostname) == true )
{
wlog ("%s refusing to 'reset' self due to critical '%s' sensor event\n",
hostname.c_str(), sensor.c_str());
recovery_ctrl_init ( node_ptr->hwmon_reset );
return(rc);
}
/* Avoid interrupting higher priority powercycle action */
else if (( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERCYCLE ) ||
( node_ptr->hwmon_powercycle.state != RECOVERY_STATE__INIT ))
{
wlog ("%s bypassing 'reset' request while 'powercycle' already in progress (%s)\n",
hostname.c_str(), sensor.c_str());
}
else if ( node_ptr->adminAction != MTC_ADMIN_ACTION__NONE )
{
wlog ("%s bypassing 'reset' request while '%s' action in progress (%s)\n",
hostname.c_str(), get_adminAction_str(node_ptr->adminAction), sensor.c_str());
}
else if ( node_ptr->hwmon_reset.state )
{
wlog ("%s rejecting 'reset' request while already in progress (%s)\n",
hostname.c_str(), sensor.c_str());
}
else
{
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
mtcTimer_reset ( node_ptr->hwmon_reset.recovery_timer );
mtcTimer_start ( node_ptr->hwmon_reset.recovery_timer, mtcTimer_handler, MTC_MINS_15 );
force_full_enable ( node_ptr );
}
else
{
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RESET )
{
elog ("%s starting 'reset' FSM\n", hostname.c_str());
mtcTimer_reset ( node_ptr->hwmon_reset.recovery_timer );
mtcTimer_start ( node_ptr->hwmon_reset.recovery_timer, mtcTimer_handler, MTC_MINS_15 );
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__RESET );
}
else
{
wlog ("%s mtce 'reset' action already in progress\n", hostname.c_str());
}
}
node_ptr->hwmon_reset.state = RECOVERY_STATE__HOLDOFF ;
}
}
else if ( action == MTC_EVENT_HWMON_POWERCYCLE )
{
if ( node_ptr->hwmon_powercycle.attempts > MAX_POWERCYCLE_ATTEMPT_RETRIES )
{
wlog ("%s ignoring 'powercycle' request ; too many failed attempts (%d)\n",
node_ptr->hostname.c_str(), node_ptr->hwmon_powercycle.attempts );
}
else if ( is_active_controller (hostname) == true )
{
wlog ("%s refusing to 'powercycle' self due to critical '%s' sensor event\n",
hostname.c_str(), sensor.c_str());
recovery_ctrl_init ( node_ptr->hwmon_powercycle ) ;
}
else
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERCYCLE )
{
wlog ("%s bypassing 'powercycle' request while already in progress (%s)\n",
hostname.c_str(), sensor.c_str());
}
else if ( node_ptr->adminAction != MTC_ADMIN_ACTION__NONE )
{
wlog ("%s bypassing 'powercycle' request while '%s' action in progress (%s)\n",
hostname.c_str(),
get_adminAction_str(node_ptr->adminAction),
sensor.c_str());
}
else if ( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__COOLOFF )
{
wlog ("%s avoiding 'powercycle' request while in powercycle recovery cooloff (%s)\n",
hostname.c_str(), sensor.c_str());
}
else if ( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__HOLDOFF )
{
wlog ("%s avoiding 'powercycle' request while in powercycle recovery holdoff (%s)\n",
hostname.c_str(), sensor.c_str());
}
else if ( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__ACTION )
{
wlog ("%s avoiding 'powercycle' request while already handling powercycle (%s)\n",
hostname.c_str(), sensor.c_str());
}
else if ( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__BLOCKED )
{
wlog ("%s avoiding 'powercycle' request ; host is powered off due to protect hardware from damage due to critical '%s' sensor\n",
hostname.c_str(), sensor.c_str());
}
else
{
if ( node_ptr->hwmon_powercycle.state == RECOVERY_STATE__MONITOR )
{
wlog ("%s 'powercycle' request while in monitor phase (%s)\n",
hostname.c_str(), sensor.c_str());
}
/* Cancel the recovery timer only to have it started once the
* next power cycle phase is complete */
mtcTimer_reset ( node_ptr->hwmon_powercycle.recovery_timer );
wlog ("%s invoking 'powercycle' due to critical '%s' sensor assertion\n", hostname.c_str(), sensor.c_str());
powercycleStageChange ( node_ptr, MTC_POWERCYCLE__START );
subStageChange ( node_ptr, MTC_SUBSTAGE__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__POWERCYCLE );
}
}
}
else
{
slog ("%s '%s' action not supported as request from hwmond\n",
hostname.c_str(),
get_event_str(action).c_str());
rc = FAIL_BAD_PARM ;
}
}
else
{
slog ("%s cannot '%s' due to unknown host\n", hostname.c_str(), get_event_str(action).c_str());
rc = FAIL_UNKNOWN_HOSTNAME ;
}
return (rc);
}
/* Generate a log for the reported failed process if that host is
* unlocked */
int nodeLinkClass::log_process_failure ( string & hostname, string & process )
{
/* lp - Log Prefix */
string lp = hostname ;
lp.append (" pmon:");
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host ; '%s' failed (minor)\n",
lp.c_str(), process.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
else if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
if ( process.compare("ntpd") )
{
wlog ("%s '%s' process failed and is being auto recovered\n",
lp.c_str(),
process.c_str());
}
else
{
wlog ("%s '%s' process has failed ; manual recovery action required\n",
lp.c_str(),
process.c_str());
}
}
return (PASS);
}
/* if unlocked-enabled generate an alarm for the reported failed process */
int nodeLinkClass::alarm_process_failure ( string & hostname, string & process )
{
/* lp - Log Prefix */
string lp = hostname ;
lp.append (" pmon:");
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host ; '%s' failed (minor)\n",
lp.c_str(), process.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
else if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
/* TODO: Generate Alarm here */
wlog ("%s '%s' failed (minor)\n", lp.c_str(), process.c_str());
}
return (PASS);
}
/* Generate a log for the reported failed resource if that host is
* unlocked */
int nodeLinkClass::log_resource_failure ( string & hostname, string & resource )
{
/* lr - Log Prefix Rmond */
string lr = hostname ;
lr.append (" rmond:");
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host ; '%s' failed (minor)\n",
lr.c_str(), resource.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
else if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
ilog ("%s '%s' failed (minor)\n",
lr.c_str(), resource.c_str());
}
return (PASS);
}
/** Process Monitor Degrade Event handler.
*
* The host will enter degrade state due to the specified process
* not running properly. The process name is recorded in the
* 'degraded_processes_list' for specified host.
* Clearing degrade against this process requires that host to
* send a clear event against that process or for that host to
* fully re-enable */
int nodeLinkClass::degrade_process_raise ( string & hostname,
string & process )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host\n", hostname.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
if ( (node_ptr->degrade_mask & DEGRADE_MASK_PMON) == 0 )
{
node_ptr->degrade_mask |= DEGRADE_MASK_PMON ;
wlog ("%s is degraded due to '%s' process failure\n", hostname.c_str(), process.c_str());
}
}
return (PASS);
}
/*
* Name : update_dport_states
*
* Purpose: Update data port states based on the event severity
*
* CLEAR = enabled-available
* MAJOR = enabled-degraded
* CRITICAL = disabled-failed
*
*/
int update_dport_states_throttle = 0 ;
int nodeLinkClass::update_dport_states ( struct nodeLinkClass::node * node_ptr, int event )
{
int rc = PASS ;
/* if the host is locked then report the data ports as offline */
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
{
event = MTC_EVENT_AVS_OFFLINE ;
}
switch (event)
{
case MTC_EVENT_AVS_OFFLINE:
{
if ( node_ptr->operState_dport != MTC_OPER_STATE__DISABLED )
{
ilog ("%s data port 'operState' change from '%s' -> 'disabled'",
node_ptr->hostname.c_str(),
operState_enum_to_str(node_ptr->operState_dport).c_str());
node_ptr->operState_dport = MTC_OPER_STATE__DISABLED ;
}
if ( node_ptr->availStatus_dport != MTC_AVAIL_STATUS__OFFLINE )
{
ilog ("%s data port 'availStat' change from '%s' -> 'offline'",
node_ptr->hostname.c_str(),
availStatus_enum_to_str(node_ptr->availStatus_dport).c_str());
node_ptr->availStatus_dport = MTC_AVAIL_STATUS__OFFLINE ;
}
break ;
}
case MTC_EVENT_AVS_CLEAR:
{
bool state_change = false ;
if ( node_ptr->operState_dport != MTC_OPER_STATE__ENABLED )
{
ilog ("%s data port 'operState' change from '%s' -> 'enabled'",
node_ptr->hostname.c_str(),
operState_enum_to_str(node_ptr->operState_dport).c_str());
node_ptr->operState_dport = MTC_OPER_STATE__ENABLED ;
state_change = true ;
}
if ( node_ptr->availStatus_dport != MTC_AVAIL_STATUS__AVAILABLE )
{
ilog ("%s data port 'availStat' change from '%s' -> 'available'",
node_ptr->hostname.c_str(),
availStatus_enum_to_str(node_ptr->availStatus_dport).c_str());
node_ptr->availStatus_dport = MTC_AVAIL_STATUS__AVAILABLE ;
state_change = true ;
}
/** If there has been s state change as a result of a
* clear then send that to the VIM immediately
**/
if ( state_change == true )
{
/* Inform the VIM of the data port state change */
mtcVimApi_state_change ( node_ptr, VIM_DPORT_CLEARED, 3 );
}
break ;
}
case MTC_EVENT_AVS_MAJOR:
{
if ( node_ptr->operState_dport != MTC_OPER_STATE__ENABLED )
{
ilog ("%s data port 'operState' change from '%s' -> 'enabled'",
node_ptr->hostname.c_str(),
operState_enum_to_str(node_ptr->operState_dport).c_str());
node_ptr->operState_dport = MTC_OPER_STATE__ENABLED ;
}
if ( node_ptr->availStatus_dport != MTC_AVAIL_STATUS__DEGRADED )
{
wlog ("%s data port 'availStat' change from '%s' -> 'degraded'",
node_ptr->hostname.c_str(),
availStatus_enum_to_str(node_ptr->availStatus_dport).c_str());
node_ptr->availStatus_dport = MTC_AVAIL_STATUS__DEGRADED ;
}
break ;
}
case MTC_EVENT_AVS_CRITICAL:
{
if ( node_ptr->operState_dport != MTC_OPER_STATE__DISABLED )
{
elog ("%s data port 'operState' change from '%s' -> 'disabled'",
node_ptr->hostname.c_str(),
operState_enum_to_str(node_ptr->operState_dport).c_str());
node_ptr->operState_dport = MTC_OPER_STATE__DISABLED ;
}
if ( node_ptr->availStatus_dport != MTC_AVAIL_STATUS__FAILED )
{
elog ("%s data port 'availStat' change from '%s' -> 'failed'",
node_ptr->hostname.c_str(),
availStatus_enum_to_str(node_ptr->availStatus_dport).c_str());
node_ptr->availStatus_dport = MTC_AVAIL_STATUS__FAILED ;
}
break ;
}
default:
{
wlog_throttled (update_dport_states_throttle, 10, "Invalid port state (%x)\n", event );
rc = FAIL_BAD_CASE ;
}
}
return (rc);
}
/** Resource Monitor 'Raise' Event handler.
*
* The host will enter degrade state due to the specified resource
* threshold being surpased. The resource name is recorded in the
* 'degraded_resources_list' for specified host.
* Clearing degrade against this resource requires that host to
* send a clear event against that resource or for that host to
* fully re-enable */
int nodeLinkClass::degrade_resource_raise ( string & hostname,
string & resource )
{
/* lr - Log Prefix Rmond */
string lr = hostname ;
lr.append (" rmond:");
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s Unknown Host\n", lr.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
else if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
if ( is_string_in_string_list ( node_ptr->degraded_resources_list, resource ) == false )
{
string degraded_resources = "";
ilog ("%s '%s' Degraded\n", lr.c_str(), resource.c_str());
node_ptr->degraded_resources_list.push_back (resource);
node_ptr->degrade_mask |= DEGRADE_MASK_RESMON ;
/* Cleanup the list */
node_ptr->degraded_resources_list.sort ();
node_ptr->degraded_resources_list.unique ();
degraded_resources =
get_strings_in_string_list ( node_ptr->degraded_resources_list );
wlog ("%s Failing Resources: %s\n",
lr.c_str(), degraded_resources.c_str());
}
else
{
dlog ("%s '%s' Degraded (again)\n", lr.c_str(), resource.c_str());
}
}
return (PASS);
}
/** Process Monitor 'Critical Process Failed' Event handler.
*
* This utility handles critical process failure event notifications.
* Typically this interface will force a host re-enable through reset.
*
* For AIO Simplex this failure sets the auto recovery bool
* so that the main enable FSM can handle it through a thresholded
* self reboot.
*
* That as well as all other failure handling cases are deferred to
* the enable handler's from failure case.
*
**/
int nodeLinkClass::critical_process_failed( string & hostname,
string & process,
unsigned int nodetype )
{
UNUSED(nodetype);
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s pmon: Unknown host\n", hostname.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
elog ("%s has critical '%s' process failure\n", hostname.c_str(), process.c_str());
node_ptr->degrade_mask |= DEGRADE_MASK_PMON ;
/* Special critical process failure handling for AIO system */
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
{
if ( this->autorecovery_disabled == true )
{
dlog ("%s bypassing persistent critical process failure\n",
node_ptr->hostname.c_str());
return (PASS);
}
dlog ("%s critical process failure (aio)\n",
node_ptr->hostname.c_str()); /* dlog */
}
/* Start fresh the next time we enter graceful recovery handler */
node_ptr->graceful_recovery_counter = 0 ;
/* Set node as unlocked-disabled-failed */
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
dlog ("%s adminState:%s EnableStage:%s\n",
node_ptr->hostname.c_str(),
adminAction_enum_to_str(node_ptr->adminAction).c_str(),
get_enableStages_str(node_ptr->handlerStage.enable).c_str());
}
return (PASS);
}
/** Resource Monitor 'Failed' Event handler.
*
* The host will go out of service, be reset and
* automatically re-enabled. */
int nodeLinkClass::critical_resource_failed( string & hostname,
string & resource )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
wlog ("%s rmond: Unknown host\n", hostname.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
/* Start fresh the next time we enter graceful recovery handler */
node_ptr->graceful_recovery_counter = 0 ;
elog ("%s rmond: Critical Resource '%s' Failure\n", hostname.c_str(), resource.c_str());
/* Set node as unlocked-enabled */
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
}
return (PASS);
}
bool nodeLinkClass::is_active_controller ( string hostname )
{
if ( nodeLinkClass::my_hostname.compare(hostname) )
{
return (false) ;
}
return (true);
}
string nodeLinkClass::get_inactive_controller_hostname ( void )
{
return (inactive_controller_hostname);
}
void nodeLinkClass::set_inactive_controller_hostname ( string hostname )
{
inactive_controller_hostname = hostname ;
}
string nodeLinkClass::get_active_controller_hostname ( void )
{
return (active_controller_hostname);
}
void nodeLinkClass::set_active_controller_hostname ( string hostname )
{
active_controller_hostname = hostname ;
}
bool nodeLinkClass::inactive_controller_is_patched ( void )
{
nodeLinkClass::node * node_ptr = getNode ( inactive_controller_hostname ) ;
if ( node_ptr != NULL )
{
return ( node_ptr->patched );
}
return (false) ;
}
bool nodeLinkClass::inactive_controller_is_patching ( void )
{
nodeLinkClass::node * node_ptr = getNode ( inactive_controller_hostname ) ;
if ( node_ptr != NULL )
{
return ( node_ptr->patching );
}
return (false) ;
}
bool nodeLinkClass::is_inactive_controller_main_insv ( void )
{
nodeLinkClass::node * node_ptr = getNode ( inactive_controller_hostname ) ;
if ( node_ptr != NULL )
{
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
return (true) ;
}
}
return (false) ;
}
bool nodeLinkClass::is_inactive_controller_subf_insv ( void )
{
nodeLinkClass::node * node_ptr = getNode ( inactive_controller_hostname ) ;
if ( node_ptr != NULL )
{
if ( node_ptr->operState_subf == MTC_OPER_STATE__ENABLED )
{
return (true) ;
}
}
return (false) ;
}
int nodeLinkClass::set_subf_info ( string hostname,
string functions,
string operState_subf,
string availState_subf )
{
int rc = FAIL_HOSTNAME_LOOKUP ;
if ( functions.empty() )
{
elog ("%s called with empty 'functions' string\n", hostname.c_str());
return (FAIL_STRING_EMPTY);
}
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
node_ptr->functions = functions ;
node_ptr->operState_subf = operState_str_to_enum(operState_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum(availState_subf.data());
rc = update_host_functions ( hostname, functions );
}
return (rc);
}
/**********************************************************************************
*
* Name : update_host_functions
*
* Purpose: Loads a nodeLinkClass with function information based on a comma
* delimited function string like.
*
* controller
* compute
* storage
* controller,compute
* controller,storage
*
**********************************************************************************/
int nodeLinkClass::update_host_functions ( string hostname , string functions )
{
int rc = FAIL ;
if ( functions.empty() )
{
elog ("%s called with empty 'functions' string\n", hostname.c_str());
return (rc);
}
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
node_ptr->functions = functions ;
if ( set_host_functions ( functions, &node_ptr->nodetype, &node_ptr->function, &node_ptr->subfunction ) != PASS )
{
elog ("%s failed to extract nodetype\n", hostname.c_str());
rc = FAIL_NODETYPE;
}
else
{
if ( node_ptr->function == CONTROLLER_TYPE )
node_ptr->function_str = "controller" ;
else if ( node_ptr->function == COMPUTE_TYPE )
node_ptr->function_str = "compute" ;
else if ( node_ptr->function == STORAGE_TYPE )
node_ptr->function_str = "storage" ;
else
node_ptr->function_str = "" ;
if ( node_ptr->subfunction == COMPUTE_TYPE )
{
node_ptr->subfunction_str = "compute" ;
}
else if ( node_ptr->subfunction == STORAGE_TYPE )
{
node_ptr->subfunction_str = "storage" ;
}
else
node_ptr->subfunction_str = "" ;
}
rc = PASS ;
}
return (rc);
}
/** Fetch the node type (compute or controller) by hostname */
int nodeLinkClass::get_nodetype ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
return ( node_ptr->nodetype );
}
return (false);
}
/** Check if a node is a controller */
bool nodeLinkClass::is_controller ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr != NULL )
{
if ( (node_ptr->function & CONTROLLER_TYPE ) == CONTROLLER_TYPE )
{
return (true);
}
}
return (false);
}
/** Check if a node is a compute */
bool nodeLinkClass::is_compute_subfunction ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr != NULL )
{
if ( (node_ptr->subfunction & COMPUTE_TYPE ) == COMPUTE_TYPE )
{
return (true);
}
}
return (false);
}
/** Check if a node is a compute */
bool nodeLinkClass::is_compute ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr != NULL )
{
if ( (node_ptr->function & COMPUTE_TYPE ) == COMPUTE_TYPE )
{
return (true);
}
}
return (false);
}
/** Check if a node is a storage */
bool nodeLinkClass::is_storage ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr != NULL )
{
if ( (node_ptr->function & STORAGE_TYPE ) == STORAGE_TYPE )
{
return (true);
}
}
return (false);
}
string nodeLinkClass::get_node_function_str ( string hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
if ( node_ptr != NULL )
{
return node_ptr->function_str ;
}
return "unknown" ;
}
string nodeLinkClass::get_node_subfunction_str ( string hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
if ( node_ptr != NULL )
{
return node_ptr->subfunction_str ;
}
return "unknown" ;
}
/** Check if a node is a controller */
bool nodeLinkClass::is_controller ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
if ( node_ptr )
{
return is_controller(node_ptr);
}
return false ;
}
/** Check if a node is a compute */
bool nodeLinkClass::is_compute ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
if ( node_ptr )
{
return is_compute(node_ptr);
}
return false ;
}
/** Check if a node is a compute */
bool nodeLinkClass::is_compute_subfunction ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
if ( node_ptr )
{
return is_compute_subfunction(node_ptr);
}
return false ;
}
/** Check if a node is a storage */
bool nodeLinkClass::is_storage ( string & hostname )
{
nodeLinkClass::node * node_ptr = getNode ( hostname );
if ( node_ptr )
{
return is_storage(node_ptr);
}
return false ;
}
/** Maintenance FSM Test Case Setup procedure */
int nodeLinkClass::set_enableStage ( string & hostname,
mtc_enableStages_enum stage )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
node_ptr->handlerStage.enable = stage ;
return (PASS);
}
return (FAIL);
}
/* Set the reboot stage */
int nodeLinkClass::set_rebootStage ( string & hostname, mtc_resetProgStages_enum stage )
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
node_ptr->resetProgStage = stage ;
return (PASS);
}
return (FAIL);
}
/** Maintenance FSM Test Case Setup procedure */
mtc_enableStages_enum nodeLinkClass::get_enableStage ( string & hostname)
{
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
return ( node_ptr->handlerStage.enable ) ;
}
return (MTC_ENABLE__STAGES);
}
int nodeLinkClass::allStateChange ( struct nodeLinkClass::node * node_ptr,
mtc_nodeAdminState_enum adminState,
mtc_nodeOperState_enum operState,
mtc_nodeAvailStatus_enum availStatus )
{
int rc = FAIL ;
if (( adminState < MTC_ADMIN_STATES ) &&
( operState < MTC_OPER_STATES ) &&
( availStatus < MTC_AVAIL_STATUS ))
{
bool change = false ;
if (( node_ptr->adminState != adminState ) ||
( node_ptr->operState != operState ) ||
( node_ptr->availStatus != availStatus ))
{
change = true ;
}
string admin = mtc_nodeAdminState_str [adminState ] ;
string oper = mtc_nodeOperState_str [operState ] ;
string avail = mtc_nodeAvailStatus_str[availStatus] ;
rc = mtcInvApi_force_states ( node_ptr, admin, oper, avail );
admin_state_change ( node_ptr->hostname, admin );
if ((( operState == MTC_OPER_STATE__DISABLED ) && ( node_ptr->operState != MTC_OPER_STATE__DISABLED )) &&
(( availStatus == MTC_AVAIL_STATUS__FAILED ) && ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )))
{
failed_state_change ( node_ptr );
}
else
{
oper_state_change ( node_ptr->hostname, oper );
avail_status_change ( node_ptr->hostname, avail );
}
if ( change == true )
{
/* after */
ilog ("%s %s-%s-%s (seq:%d)\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminState_str [node_ptr->adminState ],
mtc_nodeOperState_str [node_ptr->operState ],
mtc_nodeAvailStatus_str[node_ptr->availStatus],
node_ptr->oper_sequence-1);
}
}
else
{
slog ("Invalid State (%d:%d:%d)\n", adminState, operState, availStatus );
}
return (rc);
}
int nodeLinkClass::subfStateChange ( struct nodeLinkClass::node * node_ptr,
mtc_nodeOperState_enum operState_subf,
mtc_nodeAvailStatus_enum availStatus_subf )
{
int rc = FAIL ;
if (( operState_subf < MTC_OPER_STATES ) &&
( availStatus_subf < MTC_AVAIL_STATUS ))
{
bool change = false ;
if (( node_ptr->operState_subf != operState_subf ) ||
( node_ptr->availStatus_subf != availStatus_subf ))
{
change = true ;
}
string oper = mtc_nodeOperState_str [operState_subf ] ;
string avail = mtc_nodeAvailStatus_str[availStatus_subf] ;
rc = mtcInvApi_subf_states ( node_ptr, oper, avail );
node_ptr->operState_subf = operState_subf ;
node_ptr->availStatus_subf = availStatus_subf;
if ( change == true )
{
/* after */
ilog ("%s-%s %s-%s-%s (seq:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->subfunction_str.c_str(),
mtc_nodeAdminState_str [node_ptr->adminState ],
mtc_nodeOperState_str [node_ptr->operState_subf ],
mtc_nodeAvailStatus_str[node_ptr->availStatus_subf],
node_ptr->oper_sequence-1);
}
}
else
{
slog ("Invalid State (%d:%d:%d)\n", node_ptr->adminState, availStatus_subf, availStatus_subf );
}
return (rc);
}
/**
* Set the required action and then let the FSP and handlers deal with it
* If we are in an action already then just add the action to the
* action todo list. When we chnage the action to none then query the
* todo list and pop it off and apply it
**/
int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr,
mtc_nodeAdminAction_enum newActionState )
{
int rc = FAIL ;
if (( newActionState < MTC_ADMIN_ACTIONS ) &&
( node_ptr->adminAction < MTC_ADMIN_ACTIONS ))
{
rc = PASS ;
if ( node_ptr->adminAction == newActionState )
{
/* no action change */
return (rc);
}
/**
* Any of these actions need to complete before any
* other action can take effect.
* If its not one of these action then just proceed with it
**/
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
{
clog ("%s Administrative Action '%s' -> '%s'\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminAction_str [node_ptr->adminAction],
mtc_nodeAdminAction_str [newActionState]);
}
/* handle queue'd requests if here are any and
* we are done with the curent action */
else if (( newActionState == MTC_ADMIN_ACTION__NONE ) &&
( !node_ptr->adminAction_todo_list.empty()))
{
newActionState = *(node_ptr->adminAction_todo_list.begin());
node_ptr->adminAction_todo_list.pop_front();
clog ("%s Administrative Action '%s' -> '%s' from queue\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminAction_str [node_ptr->adminAction],
mtc_nodeAdminAction_str [newActionState]);
}
/* queue the request if we are already acting on a current action
* ... handle unsupported action queueing conditions */
else if (( node_ptr->adminAction != MTC_ADMIN_ACTION__NONE ) &&
( newActionState != MTC_ADMIN_ACTION__NONE ))
{
/* refuse to add duplicate action */
if ( newActionState == node_ptr->adminAction )
{
wlog ("%s refusing to queue duplicate of current action (%s)\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminAction_str [node_ptr->adminAction] );
return (FAIL);
}
else if ( node_ptr->adminAction_todo_list.size() >= MTC_MAX_QUEUED_ACTIONS )
{
wlog ("%s rejecting action '%s' request ; max queued actions reached (%ld of %d)\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminAction_str [newActionState],
node_ptr->adminAction_todo_list.size(),
MTC_MAX_QUEUED_ACTIONS );
return (FAIL);
}
/* refuse to queue action that already exists in the queue */
else
{
list<mtc_nodeAdminAction_enum>::iterator adminAction_todo_list_ptr ;
for ( adminAction_todo_list_ptr = node_ptr->adminAction_todo_list.begin();
adminAction_todo_list_ptr != node_ptr->adminAction_todo_list.end();
adminAction_todo_list_ptr++ )
{
if ( *adminAction_todo_list_ptr == newActionState )
{
wlog ("%s refusing to queue duplicate already queued action (%s)\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminAction_str [*adminAction_todo_list_ptr]);
return (FAIL);
}
}
}
/* Add the action to the action todo list */
node_ptr->adminAction_todo_list.push_back( newActionState );
ilog ("%s Administrative Action '%s' queued ; already handling '%s' action\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminAction_str [newActionState],
mtc_nodeAdminAction_str [node_ptr->adminAction]);
return (PASS);
}
/* otherwise just take the action change */
else
{
clog ("%s Administrative Action '%s' -> '%s'\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminAction_str [node_ptr->adminAction],
mtc_nodeAdminAction_str [newActionState]);
}
mtc_nodeAdminAction_enum oldActionState = node_ptr->adminAction ;
log_adminAction ( node_ptr->hostname, oldActionState, newActionState );
node_ptr->adminAction = newActionState ;
node_ptr->action = mtc_nodeAdminAction_str [node_ptr->adminAction] ;
/* If we are starting a new ( not 'none' ) action ...
* be sure we start at the beginning */
if ( newActionState != MTC_ADMIN_ACTION__NONE )
{
if (( oldActionState == MTC_ADMIN_ACTION__POWERCYCLE ) &&
(( newActionState != MTC_ADMIN_ACTION__POWERCYCLE ) &&
( newActionState != MTC_ADMIN_ACTION__POWEROFF )))
{
blog ("%s (mon:%d:prov:%d)\n", node_ptr->hostname.c_str(), node_ptr->hwmond_monitor, node_ptr->bm_provisioned );
if (( node_ptr->hwmond_monitor == false ) && ( node_ptr->bm_provisioned == true ))
{
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
}
/* Lets ensure that the handlers start in the right stage
* The enable_handler -> MTC_ENABLE__START
* The disable_handler -> MTC_DISABLE__START
* The reset_handler -> MTC_RESET__START
* The reboot_handler -> MTC_RESET__START
*
* This is a little detailed but exists for maintainability
* All START stages are 0.
*/
switch ( newActionState )
{
case MTC_ADMIN_ACTION__UNLOCK:
{
if ( oldActionState != MTC_ADMIN_ACTION__UNLOCK )
{
node_ptr->node_unlocked_counter++ ;
}
if ( is_controller ( node_ptr ) )
autorecovery_clear (node_ptr->hostname);
node_ptr->handlerStage.enable = MTC_ENABLE__START ;
break ;
}
case MTC_ADMIN_ACTION__LOCK:
case MTC_ADMIN_ACTION__FORCE_LOCK:
{
node_ptr->handlerStage.disable = MTC_DISABLE__START ;
break ;
}
case MTC_ADMIN_ACTION__RESET:
{
node_ptr->resetStage = MTC_RESET__START ;
break ;
}
case MTC_ADMIN_ACTION__REBOOT:
{
break ;
}
case MTC_ADMIN_ACTION__REINSTALL:
{
node_ptr->reinstallStage = MTC_REINSTALL__START ;
break ;
}
case MTC_ADMIN_ACTION__POWERON:
{
node_ptr->powerStage = MTC_POWERON__START ;
break ;
}
case MTC_ADMIN_ACTION__RECOVER:
{
if ( node_ptr->mtcTimer.tid )
{
mtcTimer_stop ( node_ptr->mtcTimer ) ;
}
if ( node_ptr->mtcSwact_timer.tid )
{
mtcTimer_stop ( node_ptr->mtcSwact_timer ) ;
}
node_ptr->recoveryStage = MTC_RECOVERY__START ;
break ;
}
case MTC_ADMIN_ACTION__POWEROFF:
{
node_ptr->powerStage = MTC_POWEROFF__START ;
break ;
}
case MTC_ADMIN_ACTION__DELETE:
{
node_ptr->delStage = MTC_DEL__START ;
break ;
}
case MTC_ADMIN_ACTION__ENABLE:
default:
{
break ;
}
}
}
}
return (rc);
}
int nodeLinkClass::adminStateChange ( struct nodeLinkClass::node * node_ptr,
mtc_nodeAdminState_enum newAdminState )
{
int rc = FAIL ;
if (( newAdminState < MTC_ADMIN_STATES ) &&
( node_ptr->adminState < MTC_ADMIN_STATES ))
{
rc = PASS ;
/* See if we are actually changing the state */
if ( node_ptr->adminState != newAdminState )
{
ilog ("%s %s-%s-%s' -> %s-%s-%s\n", node_ptr->hostname.c_str(),
mtc_nodeAdminState_str [node_ptr->adminState],
mtc_nodeOperState_str [node_ptr->operState],
mtc_nodeAvailStatus_str[node_ptr->availStatus],
mtc_nodeAdminState_str [newAdminState],
mtc_nodeOperState_str [node_ptr->operState],
mtc_nodeAvailStatus_str[node_ptr->availStatus]);
node_ptr->adminState = newAdminState ;
}
}
else
{
slog ("Invalid Host Operational State (now:%d new:%d)\n",
node_ptr->adminState, newAdminState );
}
return (rc);
}
int nodeLinkClass::operStateChange ( struct nodeLinkClass::node * node_ptr,
mtc_nodeOperState_enum newOperState )
{
int rc = FAIL ;
if (( newOperState < MTC_OPER_STATES ) &&
( node_ptr->operState < MTC_OPER_STATES ))
{
rc = PASS ;
/* See if we are actually changing the state */
if ( node_ptr->operState != newOperState )
{
clog ("%s %s-%s-%s\n", node_ptr->hostname.c_str(),
mtc_nodeAdminState_str [node_ptr->adminState],
mtc_nodeOperState_str [node_ptr->operState],
mtc_nodeAvailStatus_str[node_ptr->availStatus]);
/* Push it to the database */
if ( node_ptr->uuid.length() == UUID_LEN )
{
string key = MTC_JSON_INV_OPER ;
string value = operState_enum_to_str(newOperState) ;
rc = mtcInvApi_update_state ( node_ptr, key, value );
}
else
{
wlog ("%s has invalid uuid:%s so %s state not written to database\n",
node_ptr->hostname.c_str(),
node_ptr->uuid.c_str(),
operState_enum_to_str(newOperState).c_str());
}
node_ptr->operState = newOperState ;
clog ("%s %s-%s-%s\n", node_ptr->hostname.c_str(),
mtc_nodeAdminState_str [node_ptr->adminState],
mtc_nodeOperState_str [node_ptr->operState],
mtc_nodeAvailStatus_str[node_ptr->availStatus]);
}
}
else
{
slog ("Invalid Host Operational State (now:%d new:%d)\n",
node_ptr->operState, newOperState );
}
return (rc);
}
int nodeLinkClass::availStatusChange ( struct nodeLinkClass::node * node_ptr,
mtc_nodeAvailStatus_enum newAvailStatus )
{
int rc = FAIL ;
if (( newAvailStatus < MTC_AVAIL_STATUS ) &&
( node_ptr->availStatus < MTC_AVAIL_STATUS ))
{
rc = PASS ;
/* See if we are actually changing the state */
if ( node_ptr->availStatus != newAvailStatus )
{
clog ("%s %s-%s-%s\n", node_ptr->hostname.c_str(),
mtc_nodeAdminState_str [node_ptr->adminState],
mtc_nodeOperState_str [node_ptr->operState],
mtc_nodeAvailStatus_str[node_ptr->availStatus]);
/* Push it to the database */
if ( node_ptr->uuid.length() == UUID_LEN )
{
string key = MTC_JSON_INV_AVAIL ;
string value = availStatus_enum_to_str(newAvailStatus) ;
rc = mtcInvApi_update_state ( node_ptr, key, value );
if ( rc != PASS )
{
wlog ("%s Failed to update availability '%s' to '%s'\n",
node_ptr->hostname.c_str(),
mtc_nodeAvailStatus_str[node_ptr->availStatus],
mtc_nodeAvailStatus_str[newAvailStatus]);
}
}
else
{
wlog ("%s has invalid uuid:%s so %s state not written to database\n",
node_ptr->hostname.c_str(),
node_ptr->uuid.c_str(),
availStatus_enum_to_str(newAvailStatus).c_str());
}
if (( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )) &&
( newAvailStatus == MTC_AVAIL_STATUS__FAILED ))
{
enableStageChange ( node_ptr, MTC_ENABLE__START );
}
/* if we go to the failed state then clear all mtcAlive counts
* so that the last ones don't look like we are online when we
* might not be - we should relearn the on/off line state */
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED ) &&
( newAvailStatus == MTC_AVAIL_STATUS__FAILED ))
{
node_ptr->mtcAlive_misses = 0 ;
node_ptr->mtcAlive_hits = 0 ;
node_ptr->mtcAlive_gate = false ;
}
/* check for need to generate power on log */
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) &&
( newAvailStatus != MTC_AVAIL_STATUS__POWERED_OFF ))
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERON )
{
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_MANUAL_POWER_ON );
}
else
{
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_POWER_ON );
}
}
/* check for need to generate power off log */
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__POWERED_OFF ) &&
( newAvailStatus == MTC_AVAIL_STATUS__POWERED_OFF ))
{
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWEROFF )
{
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF );
}
else
{
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_POWER_OFF );
}
}
/* check for need to generate online log */
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__ONLINE ) &&
( newAvailStatus == MTC_AVAIL_STATUS__ONLINE ))
{
if ( node_ptr->offline_log_reported == true )
{
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_ONLINE );
node_ptr->offline_log_reported = false ;
node_ptr->online_log_reported = true ;
}
}
/* check for need to generate offline log */
if (( node_ptr->availStatus != MTC_AVAIL_STATUS__OFFLINE ) &&
( newAvailStatus == MTC_AVAIL_STATUS__OFFLINE ))
{
if ( node_ptr->online_log_reported == true )
{
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_OFFLINE );
node_ptr->offline_log_reported = true ;
node_ptr->online_log_reported = false ;
}
}
/* If the availability status is moving away from off or online then
* be sure we cancel the mtcAlive timer */
if ((( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )) &&
(( newAvailStatus != MTC_AVAIL_STATUS__OFFLINE ) &&
( newAvailStatus != MTC_AVAIL_STATUS__ONLINE )))
{
/* Free the mtc timer if in use */
if ( node_ptr->mtcAlive_timer.tid )
{
tlog ("%s Stopping mtcAlive timer\n", node_ptr->hostname.c_str());
mtcTimer_stop ( node_ptr->mtcAlive_timer );
node_ptr->mtcAlive_timer.ring = false ;
node_ptr->mtcAlive_timer.tid = NULL ;
}
node_ptr->onlineStage = MTC_ONLINE__START ;
}
clog ("%s %s-%s-%s\n", node_ptr->hostname.c_str(),
mtc_nodeAdminState_str [node_ptr->adminState],
mtc_nodeOperState_str [node_ptr->operState],
mtc_nodeAvailStatus_str[node_ptr->availStatus]);
node_ptr->availStatus = newAvailStatus ;
}
}
else
{
slog ("Invalid Host Availability Status (now:%d new:%d)\n",
node_ptr->availStatus, newAvailStatus );
}
return (rc);
}
/** Host Enable Handler Stage Change member function */
int nodeLinkClass::enableStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_enableStages_enum newHdlrStage )
{
/* TODO: Consider converting stage to strings ... */
if (( newHdlrStage >= MTC_ENABLE__STAGES ) ||
( node_ptr->handlerStage.enable >= MTC_ENABLE__STAGES ))
{
slog ("%s has invalid Enable stage (%d:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->handlerStage.enable,
newHdlrStage );
node_ptr->handlerStage.enable = MTC_ENABLE__FAILURE ;
/* TODO: cause failed or degraded state ? */
return (FAIL);
}
else if ( node_ptr->handlerStage.enable != newHdlrStage )
{
clog ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_enableStages_str(node_ptr->handlerStage.enable).c_str(),
get_enableStages_str(newHdlrStage).c_str());
node_ptr->handlerStage.enable = newHdlrStage ;
return (PASS);
}
else
{
/* No state change */
dlog1 ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_enableStages_str(node_ptr->handlerStage.enable).c_str(),
get_enableStages_str(newHdlrStage).c_str());
return (PASS);
}
}
/** Host Disable Handler Stage Change member function */
int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_disableStages_enum newHdlrStage )
{
/* TODO: Consider converting stage to strings ... */
if (( newHdlrStage >= MTC_DISABLE__STAGES ) ||
( node_ptr->handlerStage.disable >= MTC_DISABLE__STAGES ))
{
slog ("%s has invalid disable stage (%d:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->handlerStage.disable,
newHdlrStage );
node_ptr->handlerStage.disable = MTC_DISABLE__DISABLED ;
/* TODO: cause failed or degraded state ? */
return (FAIL);
}
else
{
clog ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_disableStages_str(node_ptr->handlerStage.disable).c_str(),
get_disableStages_str(newHdlrStage).c_str());
node_ptr->handlerStage.disable = newHdlrStage ;
return (PASS);
}
}
/** Validate and log Recovery stage changes */
int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_recoveryStages_enum newHdlrStage )
{
int rc = PASS ;
if (( newHdlrStage >= MTC_RECOVERY__STAGES ) ||
( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES ))
{
slog ("%s Invalid recovery stage (%d:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->recoveryStage,
newHdlrStage );
if ( newHdlrStage < MTC_RECOVERY__STAGES )
{
clog ("%s ? -> %s\n",
node_ptr->hostname.c_str(),
get_recoveryStages_str(newHdlrStage).c_str());
node_ptr->recoveryStage = newHdlrStage ;
}
else
{
node_ptr->recoveryStage = MTC_RECOVERY__FAILURE ;
rc = FAIL ;
}
}
else
{
clog ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_recoveryStages_str(node_ptr->recoveryStage).c_str(),
get_recoveryStages_str(newHdlrStage).c_str());
node_ptr->recoveryStage = newHdlrStage ;
}
return (rc) ;
}
/** Validate and log Recovery stage changes */
int nodeLinkClass::configStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_configStages_enum newHdlrStage )
{
int rc = PASS ;
if (( newHdlrStage >= MTC_CONFIG__STAGES ) ||
( node_ptr->configStage >= MTC_CONFIG__STAGES ))
{
slog ("%s Invalid config stage (%d:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->configStage,
newHdlrStage );
if ( newHdlrStage < MTC_CONFIG__STAGES )
{
clog ("%s ? -> %s\n",
node_ptr->hostname.c_str(),
get_configStages_str(newHdlrStage).c_str());
node_ptr->configStage = newHdlrStage ;
}
else
{
node_ptr->configStage = MTC_CONFIG__FAILURE ;
rc = FAIL ;
}
}
else
{
clog ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_configStages_str(node_ptr->configStage).c_str(),
get_configStages_str(newHdlrStage).c_str());
node_ptr->configStage = newHdlrStage ;
}
return (rc) ;
}
/** Host Reset Handler Stage Change member function */
int nodeLinkClass::resetStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_resetStages_enum newHdlrStage )
{
if ( newHdlrStage < MTC_RESET__STAGES )
{
clog ("%s stage %s -> %s\n",
node_ptr->hostname.c_str(),
get_resetStages_str(node_ptr->resetStage).c_str(),
get_resetStages_str(newHdlrStage).c_str());
node_ptr->resetStage = newHdlrStage ;
return (PASS) ;
}
else
{
slog ("%s Invalid reset stage (%d)\n", node_ptr->hostname.c_str(), newHdlrStage );
node_ptr->resetStage = MTC_RESET__DONE ;
return (FAIL) ;
}
}
/* Host Reset Handler Stage Change member function */
int nodeLinkClass::reinstallStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_reinstallStages_enum newHdlrStage )
{
if ( newHdlrStage < MTC_REINSTALL__STAGES )
{
clog ("%s stage %s -> %s\n",
node_ptr->hostname.c_str(),
get_reinstallStages_str(node_ptr->reinstallStage).c_str(),
get_reinstallStages_str(newHdlrStage).c_str());
node_ptr->reinstallStage = newHdlrStage ;
return (PASS) ;
}
else
{
slog ("%s Invalid reinstall stage (%d)\n", node_ptr->hostname.c_str(), newHdlrStage );
node_ptr->reinstallStage = MTC_REINSTALL__DONE ;
return (FAIL) ;
}
}
/** Host Power control Handler Stage Change member function */
int nodeLinkClass::powerStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_powerStages_enum newHdlrStage )
{
if ( newHdlrStage < MTC_POWER__STAGES )
{
clog ("%s stage %s -> %s\n",
node_ptr->hostname.c_str(),
get_powerStages_str(node_ptr->powerStage).c_str(),
get_powerStages_str(newHdlrStage).c_str());
node_ptr->powerStage = newHdlrStage ;
return (PASS) ;
}
else
{
slog ("%s Invalid power control stage (%d)\n",
node_ptr->hostname.c_str(), newHdlrStage );
node_ptr->powerStage = MTC_POWER__DONE ;
return (FAIL) ;
}
}
/** Host Power Cycle control Handler Stage Change member function */
int nodeLinkClass::powercycleStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_powercycleStages_enum newHdlrStage )
{
if ( newHdlrStage < MTC_POWERCYCLE__STAGES )
{
clog ("%s stage %s -> %s\n",
node_ptr->hostname.c_str(),
get_powercycleStages_str(node_ptr->powercycleStage).c_str(),
get_powercycleStages_str(newHdlrStage).c_str());
node_ptr->powercycleStage = newHdlrStage ;
return (PASS) ;
}
else
{
slog ("%s Invalid powercycle stage (%d)\n",
node_ptr->hostname.c_str(), newHdlrStage );
node_ptr->powercycleStage = MTC_POWERCYCLE__DONE ;
return (FAIL) ;
}
}
/** Host Out-Of-Service Stage Change member function */
int nodeLinkClass::oosTestStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_oosTestStages_enum newHdlrStage )
{
if ( newHdlrStage < MTC_OOS_TEST__STAGES )
{
clog ("%s stage %s -> %s\n",
node_ptr->hostname.c_str(),
get_oosTestStages_str(node_ptr->oosTestStage).c_str(),
get_oosTestStages_str(newHdlrStage).c_str());
node_ptr->oosTestStage = newHdlrStage ;
return (PASS) ;
}
else
{
slog ("%s Invalid oos test stage (%d)\n", node_ptr->hostname.c_str(), newHdlrStage );
node_ptr->oosTestStage = MTC_OOS_TEST__DONE ;
return (FAIL) ;
}
}
/** Host in-Service Stage Change member function */
int nodeLinkClass::insvTestStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_insvTestStages_enum newHdlrStage )
{
if ( newHdlrStage < MTC_INSV_TEST__STAGES )
{
clog ("%s stage %s -> %s\n",
node_ptr->hostname.c_str(),
get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
get_insvTestStages_str(newHdlrStage).c_str());
node_ptr->insvTestStage = newHdlrStage ;
return (PASS) ;
}
else
{
slog ("%s Invalid insv test stage (%d)\n", node_ptr->hostname.c_str(), newHdlrStage );
node_ptr->insvTestStage = MTC_INSV_TEST__START ;
return (FAIL) ;
}
}
/** SubStage Change member function */
int nodeLinkClass::subStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_subStages_enum newHdlrStage )
{
if ( newHdlrStage < MTC_SUBSTAGE__STAGES )
{
clog ("%s stage %s -> %s\n",
node_ptr->hostname.c_str(),
get_subStages_str(node_ptr->subStage).c_str(),
get_subStages_str(newHdlrStage).c_str());
node_ptr->subStage = newHdlrStage ;
return (PASS) ;
}
else
{
slog ("%s Invalid 'subStage' stage (%d)\n",
node_ptr->hostname.c_str(), newHdlrStage );
node_ptr->subStage = MTC_SUBSTAGE__DONE ;
return (FAIL) ;
}
}
struct nodeLinkClass::node * nodeLinkClass::get_mtcTimer_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->mtcTimer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_mtcCmd_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->mtcCmd_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_host_services_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->host_services_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_http_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->http_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_thread_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->ipmitool_thread_ctrl.timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_ping_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->bm_ping_info.timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_bm_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->bm_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_bmc_access_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->bmc_access_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_mtcConfig_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->mtcConfig_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_powercycle_control_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->hwmon_powercycle.control_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_reset_control_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->hwmon_reset.control_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_powercycle_recovery_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->hwmon_powercycle.recovery_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_reset_recovery_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->hwmon_reset.recovery_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_mtcAlive_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->mtcAlive_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_offline_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->offline_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_mtcSwact_timer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->mtcSwact_timer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_oosTestTimer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->oosTestTimer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid )
{
/* check for empty list condition */
if ( tid != NULL )
{
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->insvTestTimer.tid == tid )
{
return ptr ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
return static_cast<struct node *>(NULL);
}
/*****************************************************************************
*
* Name : autorecovery_clear
*
* Assumptions: Applies when simplex.
*
* Description: Removes the auto recovery count file if it exists.
*
* Auto recovery count is tracked/preserved in a host named auto recovery
* counter file /tmp/hostname_ar_count.
*
*****************************************************************************/
void autorecovery_clear ( string hostname )
{
string ar_file = TMP_DIR_PATH + hostname + AUTO_RECOVERY_FILE_SUFFIX ;
if ( daemon_is_file_present (ar_file.data()))
{
wlog ("%s clearing autorecovery counter\n", hostname.c_str());
daemon_remove_file (ar_file.data());
}
}
/*****************************************************************************
*
* Name : manage_autorecovery
*
* Assumptions: Applies to the active controller only while simplex.
*
* Description: Issues an immediate lazy reboot if the autorecovery threshold
* is reached. Otherwise it disables autorecovery and returns
* do we don't get a rolling boot loop.
*
* Auto recovery count is tracked/preserved in a host named auto recovery
* counter file /etc/mtc/tmp/hostname_ar_count.
*
* in the event of a persistent autorecovery failure that results in a
* disable then the active controller goes enabled-degraded with a horizon
* status that indicates the active controller has a critical failure but
* auto recovery is disabled. The enable alarm is raised.
*
*****************************************************************************/
void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr )
{
/* manage removing the auto recovery threshold count file */
if ( ( THIS_HOST ) &&
( this->autorecovery_enabled == true ) &&
( this->autorecovery_disabled == false ) &&
( is_inactive_controller_main_insv() == false ))
{
int value = 0 ;
string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ;
int threshold = daemon_get_cfg_ptr()->autorecovery_threshold ;
if ( daemon_is_file_present (ar_file.data()))
{
/* if the file is there then read the count and increment it */
value = daemon_get_file_int ( ar_file.data() );
}
value++ ;
/* Save the new value in the file */
daemon_log_value ( ar_file.data(), value );
/* set rc to reflect what the caller should do */
if ( value > threshold )
{
elog ("%s auto recovery threshold exceeded (%d)\n",
node_ptr->hostname.c_str(), threshold );
this->autorecovery_disabled = true ;
if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
{
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ) ;
}
else
{
alarm_enabled_failure ( node_ptr ) ;
}
allStateChange ( node_ptr, node_ptr->adminState,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
mtcInvApi_update_task ( node_ptr,
MTC_TASK_AUTO_RECOVERY_DISABLED );
return ;
}
wlog ("%s auto recovery (try %d of %d)\n",
node_ptr->hostname.c_str(), value , threshold );
mtcInvApi_update_states_now ( node_ptr,
"unlocked",
"disabled",
"failed",
"disabled",
"failed" );
mtcInvApi_update_task_now ( node_ptr,
MTC_TASK_AUTO_RECOVERY );
lazy_graceful_fs_reboot ( node_ptr );
}
}
/****************************************************************************
*
* Name : report_dor_recovery
*
* Description: Create a specifically formatted log for the the specified
* hosts DOR recovery state and timing.
*
* Parameters : The node and a caller prefix string that states if the node
* is ENABELD
* is FAILED
* is ENMABLED-degraded
* etc.
*
***************************************************************************/
void nodeLinkClass::report_dor_recovery ( struct nodeLinkClass::node * node_ptr,
string node_state_log_prefix )
{
struct timespec ts ;
clock_gettime (CLOCK_MONOTONIC, &ts );
node_ptr->dor_recovery_time = ts.tv_sec ;
plog ("%-12s %s ; DOR Recovery %2d:%02d mins (%4d secs) (uptime:%2d:%02d mins)\n",
node_ptr->hostname.c_str(),
node_state_log_prefix.c_str(),
node_ptr->dor_recovery_time/60,
node_ptr->dor_recovery_time%60,
node_ptr->dor_recovery_time,
node_ptr->uptime/60,
node_ptr->uptime%60 );
node_ptr->dor_recovery_mode = false ;
node_ptr->was_dor_recovery_mode = false ;
}
void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
{
/* don't do a full enable if active controller in simplex mode */
if ( THIS_HOST && SIMPLEX )
{
wlog ("%s avoiding full enable of simplex system\n", node_ptr->hostname.c_str());
wlog ("%s ... lock and unlock host to force recovery\n", node_ptr->hostname.c_str());
return ;
}
if ( node_ptr->was_dor_recovery_mode )
{
report_dor_recovery ( node_ptr , "is FAILED " );
}
plog ("%s Forcing Full Enable Sequence\n", node_ptr->hostname.c_str());
/* Raise Critical Enable Alarm */
alarm_enabled_failure ( node_ptr );
allStateChange ( node_ptr, node_ptr->adminState, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); /* reset the fsm */
// don't override the add action or lock actions /
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ADD ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__FORCE_LOCK ))
{
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); // no action
}
else
{
wlog ("%s refusing to override '%s' action with 'none' action\n",
node_ptr->hostname.c_str(),
mtc_nodeAdminAction_str [node_ptr->adminAction]);
}
}
/*****************************************************************************
*
* Name : launch_host_services_cmd
*
* Description: This is a multi timeslice service that is executed
* by the command handler.
*
* This interface just determines the host type and loads the
* command handler with the host type corresponding host
* services command based on the start bool. If 'subf' is
* specified then the start or stop command defaults to COMPUTE.
*
* Supported Commands are defined in nodeBase.h
*
* start = False (means stop)
*
* MTC_CMD_STOP_CONTROL_SVCS
* MTC_CMD_STOP_COMPUTE_SVCS
* MTC_CMD_STOP_STORAGE_SVCS
*
* start = True
*
* MTC_CMD_START_CONTROL_SVCS
* MTC_CMD_START_COMPUTE_SVCS
* MTC_CMD_START_STORAGE_SVCS
*
* Returns : PASS = launch success
* !PASS = launch failure
*
****************************************************************************/
int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_ptr, bool start, bool subf )
{
if ( !node_ptr )
return (FAIL_NULL_POINTER);
/* Initialize the host's command request control structure */
mtcCmd_init ( node_ptr->host_services_req );
/* Service subfunction override first, efficiency. */
if ( subf == true )
{
/* only supported subfunction (right now) is COMPUTE */
if ( start == true )
node_ptr->host_services_req.cmd = MTC_CMD_START_COMPUTE_SVCS ;
else
node_ptr->host_services_req.cmd = MTC_CMD_STOP_COMPUTE_SVCS ;
}
else if ( start == true )
{
if ( is_controller (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_START_CONTROL_SVCS ;
else if ( is_compute (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_START_COMPUTE_SVCS ;
else if ( is_storage (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_START_STORAGE_SVCS ;
else
{
slog ("%s start host services is not supported for this host type\n",
node_ptr->hostname.c_str());
return (FAIL_BAD_CASE) ;
}
}
else
{
if ( is_controller (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_STOP_CONTROL_SVCS ;
else if ( is_compute (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_STOP_COMPUTE_SVCS ;
else if ( is_storage (node_ptr) )
node_ptr->host_services_req.cmd = MTC_CMD_STOP_STORAGE_SVCS ;
else
{
slog ("%s stop host services is not supported for this host type\n",
node_ptr->hostname.c_str());
return (FAIL_BAD_CASE);
}
}
/* Translate that command to its named string */
node_ptr->host_services_req.name =
get_mtcNodeCommand_str(node_ptr->host_services_req.cmd);
/* Get the host services timeout and add MTC_AGENT_TIMEOUT_EXTENSION
* seconds so that it is a bit longer than the mtcClient timeout */
int timeout = daemon_get_cfg_ptr()->host_services_timeout ;
timeout+= MTC_AGENT_TIMEOUT_EXTENSION ;
ilog ("%s %s launch\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str());
/* The launch part.
* init the */
mtcCmd_init ( node_ptr->cmd );
node_ptr->cmd.stage = MTC_CMD_STAGE__START ;
node_ptr->cmd.cmd = MTC_OPER__HOST_SERVICES_CMD ;
node_ptr->mtcCmd_work_fifo.clear() ;
node_ptr->mtcCmd_work_fifo.push_front(node_ptr->cmd);
/* start an unbrella timer and start waiting for the result,
* a little longer than the mtcClient version */
mtcTimer_reset ( node_ptr->host_services_timer );
mtcTimer_start ( node_ptr->host_services_timer, mtcTimer_handler, timeout ) ;
return (PASS);
}
int send_event ( string & hostname, unsigned int cmd, iface_enum iface );
int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false )
{
int rc = FAIL ;
if ( ! hostname.empty() )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->monitor[iface] = true_false ;
if ( true_false == true )
{
node_ptr->no_work_log_throttle = 0 ;
node_ptr->b2b_misses_count[iface] = 0 ;
node_ptr->max_count[iface] = 0 ;
node_ptr->hbs_failure[iface] = false ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ;
node_ptr->hbs_minor[iface] = false ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ;
node_ptr->hbs_degrade[iface] = false ;
}
return PASS ;
}
}
return ( rc );
}
/* store the current hardware monitor monitoring state */
void nodeLinkClass::set_hwmond_monitor_state ( string & hostname, bool state )
{
if ( hostname.length() )
{
struct nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
node_ptr->hwmond_monitor = state ;
}
}
}
/* get the current hardware monitor monitoring state */
bool nodeLinkClass::get_hwmond_monitor_state ( string & hostname )
{
bool state = false ;
if ( hostname.length() )
{
struct nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
state = node_ptr->hwmond_monitor ;
}
}
return (state);
}
/* get the current heartbeat monitoring state */
bool nodeLinkClass::get_hbs_monitor_state ( string & hostname, int iface )
{
bool state = false ;
if ( hostname.length() )
{
struct nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
state = node_ptr->monitor[iface] ;
}
}
return (state);
}
/* Manage the heartbeat pulse flags by hostname */
void nodeLinkClass::manage_pulse_flags ( string & hostname, unsigned int flags )
{
if ( hostname.length() )
{
struct nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
manage_pulse_flags ( node_ptr, flags );
}
}
}
/* Manage the heartbeat pulse flags by pulse_ptr */
void nodeLinkClass::manage_pulse_flags ( struct nodeLinkClass::node * node_ptr, unsigned int flags )
{
/* Do nothing with the flags for missing pulse
* responses (identified with flags=NULL_PULSE_FLAGS) */
if ( flags == NULL_PULSE_FLAGS )
{
return ;
}
/* Code that manages enabling of Infrastructrure network moonitoring
*
* Algorithm: Only monitor a hosts infrastructure network while the
* management network of that same host is being monitored
* and while that host indicates support for the infrastructure
* network by setting the INFRA_FLAG in its management network
* pulse responses. */
if ( node_ptr->monitor[MGMNT_IFACE] == false )
{
node_ptr->monitor[INFRA_IFACE] = false ;
}
else if ( flags & INFRA_FLAG )
{
/* TODO: Does this need to be debounced ??? */
node_ptr->monitor[INFRA_IFACE] = true ;
}
/* A host indicates that its process monitor is running by setting the
* PMOND_FLAG occasionally in its pulse response.
* The following if/else if clauses manage raising an alarm and degrading
* a host has stopped sending the PMOND_FLAG. */
if ( flags & PMOND_FLAG )
{
if ( node_ptr->pmon_degraded == true )
{
if ( node_ptr->alarms[HBS_ALARM_ID__PMOND] != FM_ALARM_SEVERITY_CLEAR )
{
alarm_clear ( node_ptr->hostname, PMOND_ALARM_ID, "pmond" );
}
if ( send_event ( node_ptr->hostname, MTC_EVENT_PMOND_CLEAR, MGMNT_IFACE ) == PASS )
{
node_ptr->alarms[HBS_ALARM_ID__PMOND] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->pmon_degraded = false ;
}
}
node_ptr->pmon_missing_count = 0 ;
node_ptr->stall_monitor_log_throttle = 0 ;
node_ptr->stall_recovery_log_throttle = 0 ;
}
else if ( ++node_ptr->pmon_missing_count > PMOND_MISSING_THRESHOLD )
{
if ( node_ptr->pmon_degraded == false )
{
wlog ("%s sending pmon degrade event to maintenance\n", node_ptr->hostname.c_str());
if ( send_event ( node_ptr->hostname, MTC_EVENT_PMOND_RAISE, MGMNT_IFACE ) == PASS )
{
node_ptr->pmon_degraded = true ;
node_ptr->alarms[HBS_ALARM_ID__PMOND] = FM_ALARM_SEVERITY_MAJOR ;
alarm_major ( node_ptr->hostname, PMOND_ALARM_ID, "pmond" );
}
}
}
/* A host indicates that a process stall condition exists by setting the
* STALL_REC_FLAG it its heartbeat pulse response messages */
if ( flags & STALL_REC_FLAG )
{
wlog ("%s hbsClient stall recovery action (flags:%08x)\n", node_ptr->hostname.c_str(), flags);
if ( node_ptr->stall_recovery_log_throttle++ == 0 )
{
send_event ( node_ptr->hostname, MTC_EVENT_HOST_STALLED , MGMNT_IFACE );
}
}
else if ( flags & STALL_MON_FLAG )
{
if ( node_ptr->stall_monitor_log_throttle++ == 0 )
{
wlog ("%s hbsClient running stall monitor (flags:%08x)\n", node_ptr->hostname.c_str(), flags );
}
else if ( flags & STALL_ERROR_FLAGS )
{
wlog ("%s hbsClient running stall monitor (flags:%08x)\n", node_ptr->hostname.c_str(), flags );
}
}
if ( node_ptr->stall_recovery_log_throttle > STALL_MSG_THLD )
{
node_ptr->stall_recovery_log_throttle = 0 ;
}
if ( node_ptr->stall_monitor_log_throttle > STALL_MSG_THLD )
{
node_ptr->stall_monitor_log_throttle = 0 ;
}
}
/* Create the monitored pulse list for the specified interface */
int nodeLinkClass::create_pulse_list ( iface_enum iface )
{
struct node * ptr = head ;
pulses[iface] = 0 ;
/* No check-in list if there is no inventory */
if (( head == NULL ) || ( hosts == 0 ))
{
return (pulses[iface]) ;
}
if ( iface >= MAX_IFACES )
{
dlog ("Invalid interface (%d)\n", iface );
return (pulses[iface]);
}
pulse_list[iface].last_ptr = NULL ;
pulse_list[iface].head_ptr = NULL ;
pulse_list[iface].tail_ptr = NULL ;
/* walk the node list looking for nodes that should be monitored */
for ( ; ptr != NULL ; ptr = ptr->next )
{
if ( ptr->monitor[iface] == true )
{
/* current monitored node pointer */
pulse_ptr = ptr ;
/* if first pulse node */
if ( pulse_list[iface].head_ptr == NULL )
{
/* need to keep track of the last node so we can deal with
* skipped nodes when they are not in monitor mode */
pulse_list[iface].last_ptr = pulse_ptr ;
pulse_list[iface].head_ptr = pulse_ptr ;
pulse_list[iface].tail_ptr = pulse_ptr ;
pulse_ptr->pulse_link[iface].prev_ptr = NULL ;
}
else
{
pulse_list[iface].last_ptr->pulse_link[iface].next_ptr = pulse_ptr ;
pulse_ptr->pulse_link[iface].prev_ptr = pulse_list[iface].last_ptr ;
pulse_list[iface].last_ptr = pulse_ptr ; /* save current to handle a skip */
pulse_list[iface].tail_ptr = pulse_ptr ; /* migrate tail as list is built */
}
pulse_ptr->pulse_link[iface].next_ptr = NULL ;
pulse_ptr->linknum[iface] = ++pulses[iface] ;
mlog2 ("%s %s Pulse Info: %d:%d - %d:%p\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->linknum[iface],
pulses[iface],
pulse_ptr->rri,
pulse_ptr);
}
}
print_pulse_list(iface);
return (pulses[iface]);
}
/** Build the Reasource Reference Array */
void nodeLinkClass::build_rra ( void )
{
struct node * ptr = NULL ;
int x = 1 ;
for ( ptr = head ; ptr != NULL ; ptr = ptr->next )
{
hbs_rra [x] = ptr ;
ptr->rri=x ;
x++ ;
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
if ( ptr != NULL )
{
dlog ("%s forced RRA build (%d)\n", ptr->hostname.c_str(), x-1);
}
/* fill the rest with NULL */
for ( ; x < MAX_NODES ; x++ )
hbs_rra[x] = NULL ;
/* Reset the "Running RRI" */
rrri = 0 ;
}
/** Gets the next hostname and resource reference identifier
* (the rra index) and updates the callers variables with them.
*
* This is a helper function in support of the fast resource lookup feature.
* During steady state operation the heartbeat agent cycles through
* all the resources , one per heartbeat request, sending new
* reference identifiers (name and index) to the monitored resources.
* Each time this is called it get the next set.
* */
void nodeLinkClass::get_rris ( string & hostname, int & rri )
{
if ( hosts )
{
hostname = "none" ;
rrri++ ;
if ( rrri > hosts )
{
rrri = 1 ;
}
hostname = hbs_rra[rrri]->hostname ;
rri = rrri ;
}
}
struct nodeLinkClass::node* nodeLinkClass::getPulseNode ( string & hostname , iface_enum iface )
{
/* check for empty list condition */
if ( pulse_list[iface].head_ptr == NULL )
return NULL ;
for ( pulse_ptr = pulse_list[iface].head_ptr ; ; pulse_ptr = pulse_ptr->pulse_link[iface].next_ptr )
{
if ( !hostname.compare ( pulse_ptr->hostname ))
{
return pulse_ptr ;
}
if (( pulse_ptr->pulse_link[iface].next_ptr == NULL ) ||
( pulse_ptr==pulse_list[iface].tail_ptr ))
{
break ;
}
}
return static_cast<struct node *>(NULL);
}
/* Find the node in the list of nodes being heartbeated and splice it out */
int nodeLinkClass::remPulse_by_index ( string hostname, int index, iface_enum iface, bool clear_b2b_misses_count, unsigned int flags )
{
int rc = FAIL ;
if (( index > 0 ) && ( !(index > hosts)))
{
if ( hbs_rra[index] != NULL )
{
struct nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
if (( hbs_rra[index] == node_ptr ) &&
( ! node_ptr->hostname.compare(hostname)))
{
node_ptr->lookup_mismatch_log_throttle = 0 ;
if ( node_ptr->monitor[iface] == true )
{
node_ptr->unexpected_pulse_log_throttle = 0 ;
return ( remPulse ( hbs_rra[index], iface, clear_b2b_misses_count, flags ));
}
else
{
wlog_throttled ( node_ptr->unexpected_pulse_log_throttle, 200, "%s is not being monitored\n", hostname.c_str());
rc = PASS;
}
}
else
{
rc = remPulse_by_name ( hostname, iface, clear_b2b_misses_count, flags );
wlog_throttled ( node_ptr->lookup_mismatch_log_throttle, 200, "%s rri lookup mismatch (%s:%d) ; %s\n", hostname.c_str(), node_ptr->hostname.c_str(), index, rc ? "" : "removed by hostname" );
return (rc);
}
}
else
{
dlog ("%s could not lookup by index or hostname (%d)\n", hostname.c_str(), index );
rc = FAIL_HOSTNAME_LOOKUP ;
}
}
}
return (rc);
}
/* Find the node in the list of nodes being heartbeated and splice it out */
int nodeLinkClass::remPulse_by_name ( string & hostname, iface_enum iface, bool clear_b2b_misses_count, unsigned int flags )
{
return ( remPulse ( getPulseNode ( hostname, iface ), iface, clear_b2b_misses_count, flags ));
}
/** WANT_LINKLIST_FIT is not defined by default.
* Needs to be explicitely defined and the undef commented out for testing
**/
#ifdef WANT_LINKLIST_FIT
#undef WANT_LINKLIST_FIT
#endif
#ifdef WANT_LINKLIST_FIT
static bool already_fit = false ;
#endif
/* Find the node in the list of nodes being heartbeated and splice it out */
int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool clear_b2b_misses_count, unsigned int flags )
{
/* This default RC allows the caller to filter out unexpected pulse responses */
int rc = ENXIO ;
if ( head == NULL )
{
return -ENODEV ;
}
else if ( node_ptr == NULL )
{
return (rc) ;
}
struct node * ptr = node_ptr ;
// dlog ("%s\n", node_ptr->hostname.c_str());
/* Splice the node out of the pulse monitor list */
/* Does the pulse monitor list exist and is the node in the list */
/* Need to gracefully handle being called when there is no pulse */
/* list and/or the specified host is not in the pulse list */
if (( pulse_list[iface].head_ptr != NULL ) && ( ptr != NULL ) && ( ptr->linknum[iface] != 0))
{
pulse_ptr = ptr ;
ptr->hbs_count[iface]++ ;
manage_pulse_flags ( pulse_ptr , flags );
if ( clear_b2b_misses_count == true )
{
if ( ptr->b2b_misses_count[iface] > hbs_degrade_threshold )
{
ilog ("%s %s Pulse Rxed (after %d misses)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
node_ptr->b2b_misses_count[iface]);
}
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CLEAR, iface );
ptr->b2b_misses_count[iface] = 0 ;
if ( pulse_ptr->hbs_degrade[iface] == true )
{
/* Send a degrade clear event to maintenance */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) == PASS )
{
pulse_ptr->hbs_degrade[iface] = false ;
}
}
if ( pulse_ptr->hbs_minor[iface] == true )
{
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) == PASS )
{
pulse_ptr->hbs_minor[iface] = false ;
}
}
}
rc = PASS ;
#ifdef WANT_LINKLIST_FIT
if ( already_fit == false )
{
if ( daemon_is_file_present ( MTC_CMD_FIT__LINKLIST ) == true )
{
if ( pulse_list[iface].head_ptr->pulse_link[iface].next_ptr != NULL )
{
slog ("FIT of next pointer\n");
pulse_list[iface].head_ptr->pulse_link[iface].next_ptr = NULL ;
already_fit = true ;
}
}
}
#endif
if ( pulse_list[iface].head_ptr == pulse_ptr )
{
if ( pulse_list[iface].head_ptr == pulse_list[iface].tail_ptr )
{
qlog2 ("%s Pulse: Single Node -> Head Case : %d of %d\n", node_ptr->hostname.c_str(), pulse_ptr->linknum[iface], pulses[iface] );
pulse_list[iface].head_ptr = NULL ;
pulse_list[iface].tail_ptr = NULL ;
}
else
{
qlog2 ("%s Pulse: Multiple Node -> Head Case : %d of %d\n", node_ptr->hostname.c_str(), pulse_ptr->linknum[iface], pulses[iface] );
if ( pulse_list[iface].head_ptr->pulse_link[iface].next_ptr == NULL )
{
slog ("%s unexpected NULL next_ptr ; aborting this pulse window\n", node_ptr->hostname.c_str());
pulse_list[iface].head_ptr = NULL ;
pulse_list[iface].tail_ptr = NULL ;
pulse_ptr->linknum[iface] = 0 ;
pulses[iface] = 0 ;
return (FAIL_NULL_POINTER);
}
else
{
pulse_list[iface].head_ptr = pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ;
pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ;
}
}
}
else if ( pulse_list[iface].tail_ptr == pulse_ptr )
{
qlog2 ("%s Pulse: Multiple Node -> Tail Case : %d of %d\n", node_ptr->hostname.c_str(), pulse_ptr->linknum[iface], pulses[iface] );
if ( pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr == NULL )
{
slog ("%s unexpected NULL prev_ptr ; aborting this pulse window\n", node_ptr->hostname.c_str());
pulse_list[iface].head_ptr = NULL ;
pulse_list[iface].tail_ptr = NULL ;
pulse_ptr->linknum[iface] = 0 ;
pulses[iface] = 0 ;
return (FAIL_NULL_POINTER);
}
else
{
pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ;
pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ;
}
}
else
{
/* July 1 emacdona: Make failure path case more robust */
if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].prev_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 4\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].next_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 5\n"); rc = FAIL; }
if ( rc == FAIL )
{
slog ("%s Null pointer error splicing %s out of pulse list with %d pulses remaining (Monitoring:%s)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulses[iface],
node_ptr->monitor[iface] ? "Yes" : "No" );
}
else
{
pulse_ptr->pulse_link[iface].prev_ptr->pulse_link[iface].next_ptr = pulse_ptr->pulse_link[iface].next_ptr ;
pulse_ptr->pulse_link[iface].next_ptr->pulse_link[iface].prev_ptr = pulse_ptr->pulse_link[iface].prev_ptr ;
}
}
if ( rc == PASS )
{
pulse_ptr->linknum[iface]-- ; // = 0 ;
}
pulses[iface]-- ;
}
return rc ;
}
/** This utility will try and remove a pluse from the pulse
* linked list first by index and then by hostname.
*
* By index does not require a lookup whereas hostname does */
int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags )
{
if ( index )
{
int rc = remPulse_by_index ( hostname, index , iface, true , flags );
switch (rc)
{
case PASS: return (rc) ;
case ENXIO: return (rc);
default: mlog ("%s RRI Miss (rri:%d) (rc:%d)\n", hostname.c_str(), index, rc );
}
}
else
{
if ( hostname.compare("localhost") )
{
if ( get_hbs_monitor_state ( hostname , iface ) == true )
{
wlog ("%s Not Offering RRI\n", hostname.c_str());
}
}
else
{
/* localhost is not a supported hostname and indicates
* an unconfigured host response ; return the ignore response */
return(ENXIO);
}
}
return ( remPulse_by_name ( hostname , iface, true, flags ));
}
void nodeLinkClass::clear_pulse_list ( iface_enum iface )
{
struct node * ptr = head ;
for ( ; ptr != NULL ; ptr = ptr->next )
{
ptr->pulse_link[iface].prev_ptr = NULL ;
ptr->pulse_link[iface].next_ptr = NULL ;
}
pulse_list[iface].head_ptr = NULL ;
pulse_list[iface].tail_ptr = NULL ;
if ( ptr != NULL )
{
ptr->linknum[iface] = 0 ;
pulses[iface] = 0 ;
}
}
/** Runs in the hbsAgent to set or clear heartbat alarms for all supported interfaces */
void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT sev, int iface )
{
if ( this->heartbeat != true )
return ;
bool make_alarm_call = false ;
alarm_id_enum id ;
EFmAlarmStateT state = FM_ALARM_STATE_SET ;
const char * alarm_id_ptr = NULL ;
const char * entity_ptr = NULL ;
if ( iface == MGMNT_IFACE )
{
entity_ptr = MGMNT_NAME ;
id = HBS_ALARM_ID__HB_MGMNT ;
alarm_id_ptr = MGMNT_HB_ALARM_ID;
}
else
{
entity_ptr = INFRA_NAME ;
id = HBS_ALARM_ID__HB_INFRA ;
alarm_id_ptr = INFRA_HB_ALARM_ID;
}
if ( sev == FM_ALARM_SEVERITY_CLEAR )
{
state = FM_ALARM_STATE_CLEAR ;
if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
{
make_alarm_call = true ;
node_ptr->alarms[id] = sev ;
}
}
else if ( sev == FM_ALARM_SEVERITY_MAJOR )
{
if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CRITICAL )
{
; /* we don't go from critical to degrade
need a clear first */
}
else if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_MAJOR )
{
make_alarm_call = true ;
node_ptr->alarms[id] = FM_ALARM_SEVERITY_MAJOR ;
}
}
else if ( sev == FM_ALARM_SEVERITY_CRITICAL )
{
if ( node_ptr->alarms[id] != sev )
{
make_alarm_call = true ;
node_ptr->alarms[id] = sev ;
}
}
else if ( sev == FM_ALARM_SEVERITY_MINOR )
{
if ( node_ptr->alarms[id] != sev )
{
make_alarm_call = true ;
node_ptr->alarms[id] = sev ;
}
}
else
{
if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_WARNING )
{
make_alarm_call = true ;
node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
}
}
if ( make_alarm_call == true )
{
alarm_ ( node_ptr->hostname, alarm_id_ptr, state, sev, entity_ptr , "");
}
}
int nodeLinkClass::lost_pulses ( iface_enum iface )
{
int rc = PASS ;
for ( ; pulse_list[iface].head_ptr != NULL ; )
{
daemon_signal_hdlr ();
pulse_ptr = pulse_list[iface].head_ptr ;
if ( active )
{
string flat = "Flat Line:" ;
pulse_ptr->b2b_misses_count[iface]++ ;
// pulse_ptr->max_count[iface]++ ;
/* Don't log single misses unless in debug mode */
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
{
// if ( pulse_ptr->b2b_misses_count[iface] >= 25 )
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
{
// if ( pulse_ptr->b2b_misses_count[iface] == 25 )
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
{
ilog ("%-13s %s Pulse Miss (%d) (log throttled to every %d)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
0xfff);
}
/* Once the misses exceed 25 then throttle the logging to avoid flooding */
if ( (pulse_ptr->b2b_misses_count[iface] & 0xfff) == 0 )
{
ilog ("%-13s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
}
else
{
if ( pulse_ptr->b2b_misses_count[iface] > hbs_failure_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (in failure)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
else if ( pulse_ptr->b2b_misses_count[iface] > hbs_degrade_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d) (in degrade)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
pulse_ptr->max_count[iface]);
}
else if ( pulse_ptr->b2b_misses_count[iface] > hbs_minor_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d) (in minor)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] ,
pulse_ptr->max_count[iface]);
}
else
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
pulse_ptr->max_count[iface]);
}
}
}
else
{
dlog ("%-13s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
mem_log ( flat, pulse_ptr->b2b_misses_count[iface], pulse_ptr->hostname.c_str());
if ( iface == MGMNT_IFACE )
{
if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold )
{
send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface );
pulse_ptr->hbs_minor[iface] = true ;
pulse_ptr->hbs_minor_count[iface]++ ;
wlog ("%s %s -> MINOR\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface));
}
}
if ( pulse_ptr->b2b_misses_count[iface] == hbs_degrade_threshold )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
{
pulse_ptr->hbs_degrade[iface] = true ;
}
wlog ("%s %s -> DEGRADED\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface));
pulse_ptr->hbs_degrade_count[iface]++ ;
}
/* Handle lost degrade event case */
if (( pulse_ptr->b2b_misses_count[iface] > hbs_degrade_threshold ) &&
( pulse_ptr->hbs_degrade[iface] == false ))
{
wlog ("%s -> DEGRADED - Auto-Correction\n", pulse_ptr->hostname.c_str());
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
{
pulse_ptr->hbs_degrade[iface] = true ;
}
}
/* Turn the infra heartbeat loss into a degrade only
* condition if the infra_degrade_only flag is set */
if (( iface == INFRA_IFACE ) &&
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
( infra_degrade_only == true ))
{
/* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
}
}
/* Turn the infra heartbeat loss into a degrade only
* condition for inactive controller on normal system. */
else if (( iface == INFRA_IFACE ) &&
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
( this->system_type == SYSTEM_TYPE__NORMAL ) &&
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
{
/* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
}
}
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) &&
( pulse_ptr->hbs_failure[iface] == false ))
{
elog ("%s %s -> FAILED\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
{
pulse_ptr->hbs_failure[iface] = true ;
}
pulse_ptr->hbs_failure_count[iface]++ ;
}
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
}
rc = remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS );
if ( rc != PASS )
{
elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface));
clear_pulse_list ( iface );
break ;
}
if ( pulse_list[iface].head_ptr == NULL )
{
// dlog ("Pulse list is Empty\n");
break ;
}
}
return (rc);
}
/* Return true if the specified interface is being monitored for this host */
bool nodeLinkClass::monitored_pulse ( string hostname , iface_enum iface )
{
if ( hostname.length() )
{
struct nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
return ( node_ptr->monitor[iface] ) ;
}
}
return(false);
}
/* Reports pulse list empty status.
* true if empty
* false if not empty
*/
bool nodeLinkClass::pulse_list_empty ( iface_enum iface )
{
if ( pulse_list[iface].head_ptr == NULL )
return true ;
return false ;
}
void nodeLinkClass::print_pulse_list ( iface_enum iface )
{
string pulse_host_list = "- " ;
if ( pulse_list[iface].head_ptr != NULL )
{
for ( pulse_ptr = pulse_list[iface].head_ptr ;
pulse_ptr != NULL ;
pulse_ptr = pulse_ptr->pulse_link[iface].next_ptr )
{
pulse_host_list.append(pulse_ptr->hostname.c_str());
pulse_host_list.append(" ");
}
dlog ("Patients: %s\n", pulse_host_list.c_str());
}
if ( pulses[iface] && !pulse_host_list.empty() )
{
string temp = get_iface_name_str(iface) ;
temp.append(" Patients :") ;
mem_log ( temp, pulses[iface], pulse_host_list );
}
}
/* Clear all degrade flags except for the HWMON one */
void clear_host_degrade_causes ( unsigned int & degrade_mask )
{
if ( degrade_mask & DEGRADE_MASK_HWMON )
{
degrade_mask = DEGRADE_MASK_HWMON ;
}
else
{
degrade_mask = 0 ;
}
}
/***************************************************************************/
/******************* State Dump Utilities ***********************/
/***************************************************************************/
void nodeLinkClass::mem_log_general ( void )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s %s %s %s:%s %s:%s \n",
my_hostname.c_str(),
my_local_ip.c_str(),
my_float_ip.c_str(),
daemon_get_cfg_ptr()->mgmnt_iface,
mgmnt_link_up_and_running ? "Up" : "Down",
daemon_get_cfg_ptr()->infra_iface,
infra_link_up_and_running ? "Up" : "Down");
mem_log (str);
}
void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s DOR - Active: %c Was: %c Time: %5d (00:%02d:%02d)\n",
node_ptr->hostname.c_str(),
node_ptr->dor_recovery_mode ? 'Y' : 'N',
node_ptr->was_dor_recovery_mode ? 'Y' : 'N',
node_ptr->dor_recovery_time,
node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time/60 : 0,
node_ptr->dor_recovery_time ? node_ptr->dor_recovery_time%60 : 0);
mem_log (str);
}
/* Multi-Node Failure Avoidance Data */
void nodeLinkClass::mem_log_mnfa ( void )
{
char str[MAX_MEM_LOG_DATA] ;
int temp = mnfa_threshold_number ;
if ( mnfa_threshold_type == MNFA_PERCENT )
temp = mnfa_threshold_percent ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: Mode:%s:%d State:%s Hosts:%d:%d Cases:%d Threshold:%d\n",
my_hostname.c_str(),
mnfa_threshold_type ? "Percent" : "Number",
temp,
mnfa_active ? "ACTIVE" : "inactive",
mnfa_host_count[MGMNT_IFACE],
mnfa_host_count[INFRA_IFACE],
mnfa_calculate_threshold( "" ),
mnfa_occurances);
mem_log (str);
}
void nodeLinkClass::mem_log_general_mtce_hosts ( void )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n",
my_hostname.c_str(),
num_controllers_enabled(),
enabled_compute_nodes(),
enabled_storage_nodes(),
get_storage_backend());
mem_log (str);
}
void nodeLinkClass::mem_log_bm ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tbm_ip:%s bm_un:%s bm_type:%s provisioned: %s\n",
node_ptr->hostname.c_str(),
node_ptr->bm_ip.c_str(),
node_ptr->bm_un.c_str(),
node_ptr->bm_type.c_str(),
node_ptr->bm_provisioned ? "Yes" : "No" );
mem_log (str);
}
void nodeLinkClass::mem_log_identity ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s (%u)\n",
node_ptr->hostname.c_str(),
node_ptr->uuid.c_str(),
node_ptr->type.c_str(),
node_ptr->nodetype);
mem_log (str);
}
void nodeLinkClass::mem_log_state1 ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
string ad = adminState_enum_to_str(node_ptr->adminState) ;
string op = operState_enum_to_str(node_ptr->operState) ;
string av = availStatus_enum_to_str(node_ptr->availStatus);
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s-%s-%s degrade_mask:%08x\n",
node_ptr->hostname.c_str(),
ad.c_str(),
op.c_str(),
av.c_str(),
node_ptr->degrade_mask);
mem_log (str);
op = operState_enum_to_str(node_ptr->operState_subf) ;
av = availStatus_enum_to_str(node_ptr->availStatus_subf);
if ( ! node_ptr->subfunction_str.empty() )
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSub-Functions: %s-%s %s-%s-%s\n",
node_ptr->hostname.c_str(),
node_ptr->function_str.c_str(),
node_ptr->subfunction_str.c_str(),
ad.c_str(),
op.c_str(),
av.c_str());
}
mem_log (str);
}
void nodeLinkClass::mem_log_state2 ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
string aa = adminAction_enum_to_str(node_ptr->adminAction) ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAction:%s invAction:%s Task:%s\n",
node_ptr->hostname.c_str(),
aa.c_str(),
node_ptr->action.c_str(),
node_ptr->task.c_str());
mem_log (str);
}
void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: on:%c off:%c Cnt:%d State:%s Misses:%d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_online ? 'Y' : 'N',
node_ptr->mtcAlive_offline ? 'Y' : 'N',
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_gate ? "gated" : "rxing",
node_ptr->mtcAlive_misses);
mem_log (str);
}
void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
node_ptr->hostname.c_str(),
node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .",
node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .",
node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .",
node_ptr->alarms[MTC_ALARM_ID__CH_CONT ] ? " Control" : " .",
node_ptr->alarms[MTC_ALARM_ID__CH_COMP ] ? " Compute" : " .",
node_ptr->alarms[MTC_ALARM_ID__BM ] ? " Brd Mgmt" : " .");
mem_log (str);
}
void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAdd:%d Offline:%d: Swact:%d Recovery:%d Able:%d\n",
node_ptr->hostname.c_str(),
node_ptr->addStage,
node_ptr->offlineStage,
node_ptr->swactStage,
node_ptr->recoveryStage,
node_ptr->handlerStage.raw);
mem_log (str);
}
void nodeLinkClass::mem_log_power_info ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tStage:%s Attempts:%d: Holdoff:%d Retry:%d State:%x ctid:%p rtid:%p\n",
node_ptr->hostname.c_str(),
get_powercycleStages_str(node_ptr->powercycleStage).c_str(),
node_ptr->hwmon_powercycle.attempts,
node_ptr->hwmon_powercycle.holdoff,
node_ptr->hwmon_powercycle.retries,
node_ptr->hwmon_powercycle.state,
node_ptr->hwmon_powercycle.control_timer.tid,
node_ptr->hwmon_powercycle.recovery_timer.tid);
mem_log (str);
}
void nodeLinkClass::mem_log_reset_info ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tStage:%s Attempts:%d: Holdoff:%d Retry:%d State:%x ctid:%p rtid:%p\n",
node_ptr->hostname.c_str(),
get_resetStages_str(node_ptr->resetStage).c_str(),
node_ptr->hwmon_reset.attempts,
node_ptr->hwmon_reset.holdoff,
node_ptr->hwmon_reset.retries,
node_ptr->hwmon_reset.state,
node_ptr->hwmon_reset.control_timer.tid,
node_ptr->hwmon_reset.recovery_timer.tid);
mem_log (str);
}
void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s infra_ip: %s Uptime: %u\n",
node_ptr->hostname.c_str(),
node_ptr->mac.c_str(),
node_ptr->ip.c_str(),
node_ptr->infra_ip.c_str(),
node_ptr->uptime );
mem_log (str);
}
void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
node_ptr->hostname.c_str(),
get_iface_name_str (iface),
node_ptr->hbs_minor[iface] ? "true " : "false",
node_ptr->hbs_degrade[iface] ? "true " : "false",
node_ptr->hbs_failure[iface] ? "true " : "false",
node_ptr->monitor[iface] ? "YES" : "no" );
mem_log (str);
}
}
void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Counts Minor:%d Degrade:%d Failed:%d Max:%d Cur:%d\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
node_ptr->hbs_minor_count[iface],
node_ptr->hbs_degrade_count[iface],
node_ptr->hbs_failure_count[iface],
node_ptr->max_count[iface],
node_ptr->hbs_count[iface]);
mem_log (str);
}
}
void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
node_ptr->hostname.c_str(),
get_oosTestStages_str(node_ptr->oosTestStage).c_str(),
node_ptr->oos_test_count,
get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
node_ptr->insv_test_count);
mem_log (str);
}
void nodeLinkClass::mem_log_thread_info ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tThread Stage:%d Runs:%d Progress:%d Ctrl Status:%d Thread Status:%d\n",
node_ptr->hostname.c_str(),
node_ptr->ipmitool_thread_ctrl.stage,
node_ptr->ipmitool_thread_ctrl.runcount,
node_ptr->ipmitool_thread_info.progress,
node_ptr->ipmitool_thread_ctrl.status,
node_ptr->ipmitool_thread_info.status);
mem_log (str);
}
void nodeLinkClass::mem_log_type_info ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSystem:%d NodeMask: %x Function: %s (%u)\n",
node_ptr->hostname.c_str(),
this->system_type,
node_ptr->nodetype,
node_ptr->function_str.c_str(),
node_ptr->function);
mem_log (str);
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSub-Function: %s (%u) (SubFunc Enabled:%c)\n",
node_ptr->hostname.c_str(),
node_ptr->subfunction_str.c_str(), node_ptr->subfunction,
node_ptr->subf_enabled ? 'Y' : 'n' );
mem_log (str);
}
}
void mem_log_delimit_host ( void )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "-------------------------------------------------------------\n");
mem_log (str);
}
void nodeLinkClass::memDumpNodeState ( string hostname )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
{
mem_log ( hostname, ": ", "Not Found\n" );
return ;
}
else
{
if ( maintenance == true )
{
mem_log_dor ( node_ptr );
mem_log_identity ( node_ptr );
mem_log_type_info ( node_ptr );
mem_log_network ( node_ptr );
mem_log_state1 ( node_ptr );
mem_log_state2 ( node_ptr );
// mem_log_reset_info ( node_ptr );
mem_log_power_info ( node_ptr );
mem_log_alarm1 ( node_ptr );
mem_log_mtcalive ( node_ptr );
mem_log_stage ( node_ptr );
mem_log_bm ( node_ptr );
mem_log_test_info ( node_ptr );
mem_log_thread_info( node_ptr );
workQueue_dump ( node_ptr );
}
if ( heartbeat == true )
{
mem_log_heartbeat( node_ptr );
mem_log_hbs_cnts ( node_ptr );
}
mem_log_delimit_host ();
}
}
void nodeLinkClass::memDumpAllState ( void )
{
mem_log_delimit_host ();
mem_log_general ();
if ( nodeLinkClass::maintenance == true )
{
mem_log_general_mtce_hosts();
mem_log_mnfa ();
}
mem_log_delimit_host ();
/* walk the node list looking for nodes that should be monitored */
for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
{
memDumpNodeState ( ptr->hostname );
}
}
/***************************************************************************
* *
* Module Test Head *
* *
***************************************************************************/
int nodeLinkClass::testhead ( int test )
{
UNUSED(test);
return (PASS) ;
}