
The current sensor list is shared across all hosts. On large systems, this can lead to list corruption when host sensor read threads output data concurrently. This update moves sensor_list to be thread local, so each thread gets its own unique instance. Although thread_local variables are not on the stack, their memory is tied to the thread’s resources. In many cases, this memory is drawn from the same per-thread region as the stack, also known as TLS (Thread-Local Storage). The TLS area is often allocated adjacent to or within the thread’s stack mapping. A large thread_local variable increases the TLS requirement, and if it exceeds the reserved space or overlaps with the stack, thread creation may fail with Resource temporarily unavailable. To accommodate this, the per-thread stack size was increased. The sensor_list allocates for up to 512 sensors per host, which is excessive. This update reduces the max sensors per host to 256, cutting the list size from 327 KB to 163 KB per thread. Even with this reduction, the thread stack size needed to be increased from 128 KB to 512 KB. The Mtce Thread utility was updated to support custom stack sizes. This allows mtcAgent to remain at 128 KB while hwmond threads can specify a larger size. This update also adds a debug feature to create dated sensor reading files for each host. While testing, it was found that output files were created with inconsistent permissions. This update fixes the file mode to 0644. 
Test Plan: Verified in 2+2+50 node system PASS: Verify large system install and sensor monitoring PASS: Verify large system sensor monitoring over DOR and Swact PASS: Verify the sensor_sample list storage is unique per thread PASS: Verify sensor read file permissions PASS: Verify dated debug sensor read files PASS: Verify added debug options are disabled by default PASS: Verify 24 hour provision/monitor/deprovision soak PASS: Verify sensor monitoring following host delete and readd PASS: Verify sensor model is deleted completely with host delete PASS: Verify sensor model is recreated over host readd Regression: PASS: Verify sensor monitoring and alarm management PASS: Verify hardware monitor process restart handling PASS: Verify no coredumps PASS: Verify logging for all test cases Closes-Bug: 2102671 Change-Id: I9263ec2242e03d46e9dc768af965fed7e1ac9175 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
375 lines
13 KiB
C++
Executable File
375 lines
13 KiB
C++
Executable File
#ifndef __DAEMON_COMMON_H__
|
|
#define __DAEMON_COMMON_H__
|
|
/*
|
|
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* Wind River CGTS Platform Common Maintenance Header
|
|
*/
|
|
|
|
#include <iostream>
|
|
#include <string>
|
|
|
|
using namespace std ;
|
|
|
|
#include "logMacros.h"
|
|
#include "returnCodes.h"
|
|
|
|
/* Mark a variable or expression as intentionally unused.
 * The argument is parenthesized so that compound expressions
 * (e.g. UNUSED(a + 1)) are cast to void as a whole. */
#ifndef UNUSED
#define UNUSED(_x_) ((void)(_x_))
#endif
|
|
|
|
/* Zero every byte of the object named by the argument.
 * The argument must be an lvalue whose sizeof() is the full object size
 * (i.e. an object or array, not a pointer to one). It is parenthesized so
 * that the address-of operator applies to the whole expression. */
#ifndef MEMSET_ZERO
#define MEMSET_ZERO(_y_) (memset (&(_y_), 0, sizeof(_y_)))
#endif
|
|
|
|
/* Log that the daemon is halted and then loop forever, calling only the
 * daemon signal handler.
 *
 * The expansion is a single brace-enclosed block so that the log and the
 * loop stay together when the macro is the body of an unbraced if/else or
 * loop statement. It may be invoked with or without a trailing semicolon. */
#define DEBUG_HALT                        \
    {                                     \
        ilog ("HALTED !!!!\n");           \
        for ( ;; )                        \
        {                                 \
            daemon_signal_hdlr() ;        \
        }                                 \
    }
|
|
|
|
|
|
/* List of different types */
|
|
/* System type identifiers; this is the return type of daemon_system_type(). */
typedef enum
{
    SYSTEM_TYPE__NORMAL             = 0,
    SYSTEM_TYPE__AIO__DUPLEX        = 1,
    SYSTEM_TYPE__AIO__DUPLEX_DIRECT = 2,
    SYSTEM_TYPE__AIO__SIMPLEX       = 3,
} system_type_enum ;
|
|
|
|
|
|
/** Called by signal handler on daemon exit
|
|
* Performs cleanup by closing open files
|
|
* and freeing used memory */
|
|
void daemon_exit ( void );
|
|
|
|
/** daemon_files.cpp cleanup utility */
|
|
void daemon_files_fini ( void );
|
|
|
|
/** daemon_files.cpp init utility
|
|
* Creates log file, process id file
|
|
* and a process fill script */
|
|
int daemon_files_init ( void );
|
|
int daemon_create_pidfile ( void );
|
|
void daemon_remove_pidfile ( void );
|
|
void daemon_remove_file ( const char * filename );
|
|
void daemon_rename_file ( const char * path, const char * old_filename, const char * new_filename );
|
|
void daemon_make_dir ( const char * dir );
|
|
int daemon_copy_file (string hostname, const char *source );
|
|
string daemon_read_file ( const char * filename );
|
|
|
|
void daemon_logfile_close ( void );
|
|
void daemon_logfile_open ( void );
|
|
|
|
int daemon_log ( const char * filename , const char * str );
|
|
int daemon_log_value ( const char * filename , int val );
|
|
int daemon_log_value ( const char * filename , const char * str, int val );
|
|
|
|
/* reads the first line of a file and if it contains a string
|
|
* that represents an integer value then return it */
|
|
int daemon_get_file_int ( const char * filename );
|
|
string daemon_get_file_str ( const char * filename );
|
|
|
|
string daemon_nodetype ( void );
|
|
string daemon_clstr_iface ( void );
|
|
string daemon_mgmnt_iface ( void );
|
|
string daemon_sw_version ( void );
|
|
string daemon_bmc_hosts_file ( void );
|
|
string daemon_bmc_hosts_dir ( void );
|
|
string daemon_md5sum_file ( const char * file );
|
|
|
|
system_type_enum daemon_system_type ( void );
|
|
|
|
char * daemon_get_iface_master ( char * iface_slave_ptr );
|
|
|
|
string get_shadow_signature ( char * shadowfile , const char * username,
|
|
char * shadowinfo, size_t infolen);
|
|
|
|
void daemon_healthcheck ( const char * sig );
|
|
void daemon_health_test ( void );
|
|
|
|
bool daemon_is_file_present ( const char * filename );
|
|
bool daemon_is_os_debian ( void );
|
|
int daemon_get_rmem_max ( void );
|
|
|
|
/* Simple tally of a total count plus warning and error counts. */
typedef struct
{
    int count    ;
    int warnings ;
    int errors   ;
} status_type ;
|
|
|
|
void daemon_dump_info ( void ); /**< Common info dump utility */
|
|
const char * daemon_stream_info ( void ); /**< Send the dump info as a string */
|
|
|
|
void get_debug_options ( const char * file , daemon_config_type * ptr );
|
|
|
|
|
|
/**
|
|
* Read and process mtc.ini file settings into the daemon configuration
|
|
*/
|
|
int daemon_configure ( void );
|
|
|
|
/* Set default config values.
|
|
* This is especially important for char * options that default to null. */
|
|
void daemon_config_default ( daemon_config_type * config_ptr );
|
|
|
|
/**
|
|
* Initialize the daemon main service
|
|
*
|
|
* @param iface
|
|
*- user can override the management interface via the -i option on the command line
|
|
*
|
|
*/
|
|
int daemon_init ( string iface , string nodetype );
|
|
|
|
/**
|
|
* Run the daemon service
|
|
*/
|
|
void daemon_service_run ( void );
|
|
|
|
/* Don't return from this call until the specified file exists
|
|
* or the timeout is exceeded. In the timeout case a FAIL_TIMEOUT
|
|
* is returned. */
|
|
int daemon_wait_for_file ( const char * filename, int timeout );
|
|
|
|
/**
|
|
* Daemon Signal management - init and main loop handler
|
|
*/
|
|
int daemon_signal_init ( void );
|
|
void daemon_signal_hdlr ( void );
|
|
void daemon_sigchld_hdlr ( void );
|
|
|
|
/**
|
|
* Control the enabled state of the signal handler latency monitor
|
|
* true = enabled
|
|
*/
|
|
void daemon_latency_monitor ( bool state );
|
|
|
|
void daemon_dump_cfg ( void );
|
|
|
|
int timeout_config_handler ( void * user,
|
|
const char * section,
|
|
const char * name,
|
|
const char * value);
|
|
|
|
int debug_config_handler ( void * user,
|
|
const char * section,
|
|
const char * name,
|
|
const char * value);
|
|
|
|
int sysinv_config_handler ( void * user,
|
|
const char * section,
|
|
const char * name,
|
|
const char * value);
|
|
|
|
int barbican_config_handler ( void * user,
|
|
const char * section,
|
|
const char * name,
|
|
const char * value);
|
|
|
|
int client_timeout_handler ( void * user,
|
|
const char * section,
|
|
const char * name,
|
|
const char * value);
|
|
|
|
/* User selectable heartbeat failure actions */
|
|
/* Heartbeat failure actions; this is the return type of
 * get_hbs_failure_action(). */
typedef enum
{
    HBS_FAILURE_ACTION__NONE    = 0, /* no heartbeat tally */
    HBS_FAILURE_ACTION__ALARM   = 1, /* alarm only         */
    HBS_FAILURE_ACTION__DEGRADE = 2, /* degrade and alarm  */
    HBS_FAILURE_ACTION__FAIL    = 3, /* fail and alarm     */
} hbs_failure_action_enum ;
|
|
|
|
/* String forms of the heartbeat failure action names. */
#define HBS_FAILURE_ACTION__NONE_STR    ((const char *)("none"))
#define HBS_FAILURE_ACTION__ALARM_STR   ((const char *)("alarm"))
#define HBS_FAILURE_ACTION__DEGRADE_STR ((const char *)("degrade"))
#define HBS_FAILURE_ACTION__FAIL_STR    ((const char *)("fail"))
|
|
|
|
hbs_failure_action_enum
|
|
get_hbs_failure_action ( daemon_config_type & config );
|
|
|
|
/** Test Head Entry */
|
|
int daemon_run_testhead ( void );
|
|
/**
|
|
* Debug API used to set module debug level.
|
|
*/
|
|
/* Configuration mask bits: each define is a distinct single bit (bits 0..30). */
#define CONFIG_AGENT_HBS_PERIOD      0x00000001 /**< Service period             */
#define CONFIG_AGENT_LOC_TIMEOUT     0x00000002 /**< Loss Of Comm Timeout       */
#define CONFIG_AGENT_MULTICAST       0x00000004 /**< Multicast Addr             */
#define CONFIG_SCHED_PRIORITY        0x00000008 /**< Scheduling priority        */
#define CONFIG_AGENT_HBS_MGMNT_PORT  0x00000010 /**< Management Pulse Rx Port   */
#define CONFIG_AGENT_HBS_CLSTR_PORT  0x00000020 /**< Cluster-host Pulse Rx Port */
#define CONFIG_AGENT_HBS_DEGRADE     0x00000040 /**< Heartbeat degrade          */
#define CONFIG_AGENT_HBS_FAILURE     0x00000080 /**< Heartbeat failure          */
#define CONFIG_AGENT_INV_PORT        0x00000100 /**< Inventory Port Number      */
#define CONFIG_AGENT_HA_PORT         0x00000200 /**< HA Framework Port Number   */
#define CONFIG_CLIENT_MTCALARM_PORT  0x00000400 /**< Send alarm requests to     */
#define CONFIG_AGENT_SM_CLIENT_PORT  0x00000800 /**< Port to Send SM data on    */
#define CONFIG_MTC_TO_HWMON_CMD_PORT 0x00001000 /**< HWmon Port Number          */
#define CONFIG_AGENT_KEY_PORT        0x00002000 /**< Keystone HTTP port         */
#define CONFIG_AGENT_HBS_MTC_PORT    0x00004000 /**< Heartbeat Service Port     */
#define CONFIG_AGENT_INV_EVENT_PORT  0x00008000 /**< Inventory Event Port       */
#define CONFIG_AGENT_API_RETRIES     0x00010000 /**< Num api retries b4 fail    */
#define CONFIG_AGENT_MTC_CLSTR_PORT  0x00020000 /**< Agent Clstr network port   */
#define CONFIG_AGENT_MTC_MGMNT_PORT  0x00040000 /**< Agent Mgmnt network port   */
#define CONFIG_AGENT_TOKEN_REFRESH   0x00080000 /**< Token refresh rate mask    */
#define CONFIG_CLIENT_MTC_CLSTR_PORT 0x00100000 /**< Client Clstr nwk mtc port  */
#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port  */
#define CONFIG_AGENT_SM_SERVER_PORT  0x00400000 /**< Port to RX data from SM    */
#define CONFIG_CLIENT_HBS_CLSTR_PORT 0x00800000 /**< Cluster-host ntwk Port     */
#define CONFIG_CLIENT_HBS_MGMNT_PORT 0x01000000 /**< Management network Port    */
#define CONFIG_CLIENT_HBS_EVENT_PORT 0x02000000 /**< Heartbeat Event Messaging  */
#define CONFIG_MTC_TO_HBS_CMD_PORT   0x04000000 /**< Mtce to Hbs Command Port   */
#define CONFIG_HBS_TO_MTC_EVENT_PORT 0x08000000 /**< Hbs to Mtc Event Port      */
#define CONFIG_CLIENT_PULSE_PORT     0x10000000 /**< Pmon pulse port            */
#define CONFIG_AGENT_SECRET_PORT     0x20000000 /**< Barbican HTTP port         */
#define CONFIG_AGENT_VIM_EVENT_PORT  0x40000000 /**< VIM Event Port Mask        */
|
|
|
|
/* Timestamp record taken by reference by gettime() and timedelta().
 * Holds a timespec, a broken-down struct tm and a 50 byte character
 * buffer (presumably for the formatted time string - confirm against
 * the gettime() implementation). */
typedef struct
{
    struct timespec ts ;
    struct tm       t  ;
    char            time_buff[50] ;
} time_debug_type ;
|
|
|
|
/* Elapsed time expressed as seconds and milliseconds fields;
 * passed by reference as the 'delta' argument of timedelta(). */
typedef struct
{
    long secs  ;
    long msecs ;
} time_delta_type ;
|
|
|
|
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta );
|
|
int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta );
|
|
int gettime ( time_debug_type & nowtime ) ;
|
|
unsigned long long gettime_monotonic_nsec ( void );
|
|
|
|
/* get formatted future time for number of seconds from now */
|
|
char * future_time ( int secs );
|
|
|
|
|
|
/*****************************************************************************************
|
|
*
|
|
* ####### ### ####### ##### # # ###### ###### ####### ###### #######
|
|
* # # # # # # # # # # # # # # # #
|
|
* # # # # # # # # # # # # # # #
|
|
* ##### # # ##### # # ###### ###### # # ###### #
|
|
* # # # # # # # # # # # # #
|
|
* # # # # # # # # # # # # # #
|
|
* # ### # ##### ##### # # ####### # # #
|
|
*
|
|
* Allows a single fault insertion condition to be created and monitored in a common way
|
|
* for any maintenance daemon.
|
|
*
|
|
* Here is how it works.
|
|
*
|
|
* Daemons that want fit support must add daemon_load_fit to their main loop which will
|
|
* detect and load any new fit requests.
|
|
*
|
|
* Create '/var/run/fit/fitinfo' file with the following labels (with no spaces)
|
|
*
|
|
* proc=hwmond ; specifies the process name to apply this fit to
|
|
* code=1 ; specifies the unique fit code to look for
|
|
* hits=2 ; specifies number of hits before clearing fit info ; defaults to 1
|
|
*
|
|
* if ( daemon_want_fit ( MY_FIT_CODE ) == true )
|
|
* do_fit_condition
|
|
*
|
|
* Add additional labels for further fit refinements ...
|
|
*
|
|
*
|
|
*
|
|
*
|
|
* host=compute-0
|
|
*
|
|
* if ( daemon_want_fit ( MY_FIT_CODE , hostname ) == true )
|
|
* do_fit_condition
|
|
*
|
|
*
|
|
*
|
|
* name=Temp_CPU0
|
|
*
|
|
* if ( daemon_want_fit ( MY_FIT_CODE, hostname, "Temp_CPU0" ) == true )
|
|
* do_fit_condition
|
|
*
|
|
*
|
|
*
|
|
* data=cr
|
|
*
|
|
* if ( daemon_want_fit ( MY_FIT_CODE, hostname, "Temp_CPU0", data ) == true )
|
|
* do_fit_condition_with data
|
|
*
|
|
*
|
|
*
|
|
* When the 'daemon_load_fit' sees this file it will load its content and rename
|
|
* /var/run/fit/fitinfo to /var/run/fit/fitinfo.renamed.
|
|
*
|
|
* daemon_want_fit returns a true when that fit condition is met and hits is decremented
|
|
* when hits becomes 0 the fit is removed from memory and requires fitinfo.renamed to be
|
|
* recopied to fitinfo for that fit to be seen and loaded again.
|
|
*
|
|
*****************************************************************************************/
|
|
|
|
// #define WANT_FIT_TESTING
|
|
|
|
#ifdef WANT_FIT_TESTING
|
|
|
|
/* Fit info file: full path, directory, file name and renamed file name */
#define FIT__INFO_FILE             ("/var/run/fit/fitinfo")
#define FIT__INFO_FILEPATH         ("/var/run/fit")
#define FIT__INFO_FILENAME         ("fitinfo")
#define FIT__INFO_FILENAME_RENAMED ("fitinfo.renamed")

/* Fit init file: full path, directory, file name and renamed file name */
#define FIT__INIT_FILE             ("/var/run/fit/fitinit")
#define FIT__INIT_FILEPATH         ("/var/run/fit")
#define FIT__INIT_FILENAME         ("fitinit")
#define FIT__INIT_FILENAME_RENAMED ("fitinit.renamed")
|
|
|
|
/* Common Fault Insertion Structure */
|
|
/* In-memory fault insertion (fit) record.
 * String members are spelled std::string so the type does not depend on a
 * using-directive being in effect where it is declared. */
typedef struct
{
    int         code ; /* the unique code specifying the condition to fault  */
    int         hits ; /* how many times to run fit before it auto clears    */
    std::string proc ; /* the daemon to apply the fit to                     */
    std::string host ; /* host to apply the fit to                           */
    std::string name ; /* refinement of the fit code to a specific condition */
    std::string data ; /* returned fit data for specified named condition    */
} daemon_fit_type ;
|
|
|
|
#endif
|
|
|
|
/* Init / Clear the in-memory fit info struct.
|
|
* Automatically called during files init.
|
|
* Can be explicitly called to force removal of the fit condition. */
|
|
void daemon_init_fit ( void );
|
|
|
|
/* Load fit info from /var/run/fit/fitinfo file.
|
|
* Add a call to this to the daemon's main loop */
|
|
int daemon_load_fit ( void ) ;
|
|
|
|
/* add hits to fit */
|
|
void daemon_hits_fit ( int hits );
|
|
|
|
/* Check for specific fit enabled conditions */
|
|
bool daemon_want_fit ( int code );
|
|
bool daemon_want_fit ( int code, string hostname );
|
|
bool daemon_want_fit ( int code, string hostname, string name );
|
|
|
|
/* ... and in this case update fit data reference string when hit */
|
|
bool daemon_want_fit ( int code, string hostname, string name, string & data );
|
|
|
|
/* Prints the in-memory loaded fit data.
|
|
* This is called on new fit info load (and file rename) */
|
|
void daemon_print_fit( bool hit );
|
|
|
|
|
|
void daemon_do_segfault ( void );
|
|
|
|
#endif /* __DAEMON_COMMON_H__ */
|