Merge "Make Hardware Monitor sensor list a thread local variable"

This commit is contained in:
Zuul 2025-03-31 12:33:40 +00:00 committed by Gerrit Code Review
commit 121425ce70
14 changed files with 171 additions and 38 deletions

@ -28,6 +28,8 @@ using namespace std;
#define BMC_PROTOCOL__REDFISH_STR ((const char *)("redfish"))
#define BMC_PROTOCOL__IPMITOOL_STR ((const char *)("ipmitool"))
#define BMC_PROTOCOL__REDFISHTOOL_STR ((const char *)("redfishtool"))
#define WANT_DATED_IPMI_SENSOR_DATA_FILES ((const char *)("/var/run/bmc/ipmitool/want_dated_sensor_data_files"))
#define WANT_DATED_REDFISH_SENSOR_DATA_FILES ((const char *)("/var/run/bmc/redfishtool/want_dated_sensor_data_files"))
/* learned graceful and immediate power control command strings */
typedef struct

@ -982,7 +982,7 @@ int httpUtil_api_request ( libEvent & event )
}
httpUtil_api_request_done:
daemon_signal_hdlr ();
httpUtil_free_conn ( event );
httpUtil_free_base ( event );
@ -1085,7 +1085,7 @@ void httpUtil_log_event ( libEvent * event_ptr )
}
snprintf (&rest_api_log_str[0], MAX_API_LOG_LEN-1,
"%s [%5d] %s %s '%s' seq:%d -> Status : %d {execution time %ld.%06ld secs}\n",
"%s [%5d] %s %s '%s' seq:%d -> Status : %d {execution time %ld.%ld secs}\n",
pt(), getpid(),
event_ptr->hostname.c_str(),
event_ptr->service.c_str(),

@ -2500,8 +2500,10 @@ static void redirect_stdout_stderr(const string& hostname,
stdout_copy = -1;
stderr_copy = -1;
// Open file with explicit permissions: rw-r--r-- (0644)
int redirect_fd = open(output_filename.c_str(),
O_CREAT | O_WRONLY | O_TRUNC);
O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); // 0644
if ( redirect_fd < 0 )
{
elog ("%s failed to open output filename: [%s] - error code = %d (%s)",

@ -50,7 +50,7 @@ static unsigned int __thread_init_sig ;
static std::string threadStages_str[THREAD_STAGE__STAGES+1];
int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ))
int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ), size_t stack_size )
{
/* preserve parent process timer handler */
thread_timer_handler = handler ;
@ -67,24 +67,23 @@ int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ))
/* setup to create a 'detached' threads */
pthread_attr_init(&__attr);
pthread_attr_setdetachstate(&__attr, PTHREAD_CREATE_DETACHED);
threadUtil_setstack_size ();
threadUtil_setstack_size ( stack_size );
__thread_init_sig = THREAD_INIT_SIG ;
return (PASS);
}
#define MTCE_PTHREAD_MAX_STACK_SIZE (0x20000) /* 128K */
void threadUtil_setstack_size ( void )
void threadUtil_setstack_size ( size_t stack_size )
{
size_t stack_size_before = 0 ;
size_t stack_size_after = 0 ;
/* manage pthread stack size */
if ( pthread_attr_getstacksize (&__attr,&stack_size_before) == PASS )
{
if ( stack_size_before > MTCE_PTHREAD_MAX_STACK_SIZE )
if ( stack_size_before > stack_size )
{
if ( pthread_attr_setstacksize ( &__attr, MTCE_PTHREAD_MAX_STACK_SIZE ) == PASS )
if ( pthread_attr_setstacksize ( &__attr, stack_size ) == PASS )
{
if ( pthread_attr_getstacksize (&__attr,&stack_size_after) == PASS )
{

@ -257,7 +257,7 @@ typedef struct
/* module init/fini */
void threadUtil_fini ( void );
int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ));
int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ), size_t stack_size);
#define DEFAULT_SYSTEM_REQUEST_LATENCY_SECS (unsigned long long)(15)
int threadUtil_bmcSystemCall (string hostname,
@ -265,7 +265,7 @@ int threadUtil_bmcSystemCall (string hostname,
string datafile,
unsigned long long latency_threshold_secs);
void threadUtil_setstack_size ( void );
void threadUtil_setstack_size ( size_t stack_size );
/* Onetime thread init setup */
void thread_init ( thread_ctrl_type & ctrl,

@ -62,7 +62,7 @@ void daemon_remove_pidfile ( void );
void daemon_remove_file ( const char * filename );
void daemon_rename_file ( const char * path, const char * old_filename, const char * new_filename );
void daemon_make_dir ( const char * dir );
int daemon_copy_file (string hostname, const char *source );
string daemon_read_file ( const char * filename );
void daemon_logfile_close ( void );

@ -121,6 +121,69 @@ bool daemon_is_file_present ( const char * filename )
return (false);
}
/* Write the current local time into 'date_str' as YYYY-MM-DD_HH-MM-SS.
 *
 * date_str - caller supplied buffer that receives the null terminated string
 * size     - size of 'date_str' in bytes ; 20 or more is needed to hold the
 *            19 character timestamp plus its null terminator
 *
 * 'date_str' is set to an empty string if the time cannot be converted or
 * the formatted string does not fit. Nothing is written when no buffer is
 * supplied. */
static void get_current_date_as_string ( char *date_str, size_t size )
{
    if (( date_str == NULL ) || ( size == 0 ))
        return ;

    /* localtime_r fills a caller owned struct. Plain localtime returns a
     * pointer to shared static storage that a concurrent caller can
     * overwrite before strftime has consumed it. */
    time_t now = time(NULL);
    struct tm local_time ;
    if ( localtime_r ( &now, &local_time ) == NULL )
    {
        date_str[0] = '\0' ;
        return ;
    }

    /* strftime returns 0 and leaves the buffer contents unspecified when
     * the result does not fit ; hand back an empty string in that case. */
    if ( strftime ( date_str, size, "%Y-%m-%d_%H-%M-%S", &local_time ) == 0 )
        date_str[0] = '\0' ;
}
// 16KB buffer for efficient copying
#define MAX_FILE_CONTENT_BUFFER_SIZE 0x4000 // 16 KBytes
/* Copy 'source' to a new file named '<source>_<YYYY-MM-DD_HH-MM-SS>'
 * in the same location as the source.
 *
 * hostname - used only as the prefix of the log messages
 * source   - path of the file to copy
 *
 * Returns PASS only when the whole file was read, written and flushed
 * to the destination. Returns FAIL when the source or destination
 * cannot be opened, the destination name does not fit, or a read,
 * write or close error occurs. A destination left behind by a failed
 * copy may be incomplete. */
int daemon_copy_file ( string hostname, const char *source )
{
    if ( source == NULL )
    {
        wlog ("%s unable to copy file ; no source filename", hostname.c_str());
        return FAIL ;
    }

    // Open source file in binary mode
    FILE *src = fopen (source, "rb");
    if (!src)
    {
        // Error path
        wlog ("%s unable to open source file: %s ; (%d:%m)", hostname.c_str(), source, errno);
        return FAIL ;
    }

    // Generate the date suffix for the destination filename.
    // Format: YYYY-MM-DD_HH-MM-SS is 19 chars ; 20 with null termination.
    char date_suffix[21] = { 0 };
    get_current_date_as_string (date_suffix, sizeof(date_suffix));

    // Max hostname size is 256 plus extra for path and the rest
    // of the filename size with the dated suffix. 512 is ample.
    // Example /var/run/bmc/redfishtool/hwmond_controller-0_thermal_sensor_data
    char destination[512];

    // Create the date suffixed destination filename.
    // snprintf reports the length it wanted to write ; anything that
    // does not fit would silently name a different (truncated) file.
    int name_len = snprintf(destination, sizeof(destination), "%s_%s", source, date_suffix);
    if (( name_len < 0 ) || ( static_cast<size_t>(name_len) >= sizeof(destination) ))
    {
        wlog ("%s destination filename for '%s' is too long for copy operation",
                  hostname.c_str(), source);
        fclose (src);
        return FAIL ;
    }

    // Open destination file in binary mode
    FILE *dest = fopen(destination, "wb");
    if (!dest)
    {
        // Error path
        wlog ("%s failed to open destination file '%s' for copy operation",
                  hostname.c_str(), destination);
        fclose (src);
        return FAIL ;
    }

    int rc = PASS ;

    // Chunk buffer ; this is a local array so it occupies
    // MAX_FILE_CONTENT_BUFFER_SIZE bytes of the caller's stack.
    char buffer [MAX_FILE_CONTENT_BUFFER_SIZE];
    size_t bytes_read;

    // Read from source and write to destination in chunks
    while ((bytes_read = fread(buffer, 1, sizeof(buffer), src)) > 0)
    {
        // A short write means the destination is incomplete (disk full, I/O error)
        if ( fwrite(buffer, 1, bytes_read, dest) != bytes_read )
        {
            wlog ("%s failed to write to destination file '%s' ; (%d:%m)",
                      hostname.c_str(), destination, errno);
            rc = FAIL ;
            break ;
        }
    }

    // fread returns 0 for both end-of-file and error ; tell them apart
    if (( rc == PASS ) && ( ferror(src) ))
    {
        wlog ("%s failed to read from source file '%s'", hostname.c_str(), source);
        rc = FAIL ;
    }

    // Close files ; buffered data is flushed on close so the
    // destination close can be the first place a write error shows up.
    fclose (src);
    if (( fclose (dest) != 0 ) && ( rc == PASS ))
    {
        wlog ("%s failed to close destination file '%s' ; (%d:%m)",
                  hostname.c_str(), destination, errno);
        rc = FAIL ;
    }

    if ( rc == PASS )
    {
        ilog ("%s file '%s' copied to '%s'\n", hostname.c_str(), source, destination);
    }
    return rc ;
}
void daemon_healthcheck ( const char * sig )
{
FILE * hc_file_stream ;

@ -49,8 +49,8 @@ using namespace std;
#endif
#define __AREA__ "mon"
#define MAX_HOST_SENSORS (512) // (100)
#define MAX_HOST_GROUPS (20)
#define MAX_HOST_SENSORS (256)
#define MAX_HOST_GROUPS (10)
#define MIN_SENSOR_GROUPS (4)
#define HWMON_DEFAULT_LARGE_INTERVAL (MTC_MINS_15)
#define HWMON_DEFAULT_AUDIT_INTERVAL (MTC_MINS_2)
@ -58,6 +58,7 @@ using namespace std;
#define DEGRADE_AUDIT_TRIGGER (2)
#define MAX_SENSORS_NOT_FOUND (5)
#define START_DEBOUCE_COUNT (1)
// Stack size hwmond passes to threadUtil_init for its threads.
#define HWMOND_STACK_SIZE (0x80000) // 512 KBytes
// Power sensor data for Dell R740-emc-1 needs 45KiB
// Thermal sensor readout on wolfpass requires 20KiB

@ -82,9 +82,9 @@ void sensor_data_init ( sensor_data_type & data )
*
*****************************************************************************/
void sensor_data_print ( const sensor_data_type & data )
void sensor_data_print ( string & hostname, const sensor_data_type & data )
{
blog3 ("%s is %s : %s (%s) %s %s %s %s %s %s %s\n",
blog3 ("%s %s is %s : %s (%s) %s %s %s %s %s %s %s\n", hostname.c_str(),
data.name.c_str(),
data.status.c_str(),
data.value.c_str(),
@ -107,7 +107,7 @@ void sensor_data_print ( const sensor_data_type & data )
*
*****************************************************************************/
int bmc_load_json_sensor ( sensor_data_type & sensor_data , string json_sensor_data )
int bmc_load_json_sensor ( string & hostname, sensor_data_type & sensor_data , string json_sensor_data )
{
int rc = FAIL_KEY_VALUE_PARSE ;
// ilog ("sensor data:%s\n", json_sensor_data.c_str() );
@ -126,7 +126,7 @@ int bmc_load_json_sensor ( sensor_data_type & sensor_data , string json_sensor_d
sensor_data.ucr = jsonUtil_get_key_value_string ( raw_obj, "ucr" ) ;
sensor_data.unc = jsonUtil_get_key_value_string ( raw_obj, "unc" ) ;
sensor_data_print ( sensor_data );
sensor_data_print ( hostname, sensor_data );
json_object_put(raw_obj);
rc = PASS ;
@ -310,7 +310,7 @@ int hwmonHostClass::bmc_load_sensor_samples ( struct hwmonHostClass::hwmon_host
rc = jsonUtil_get_array_idx ( msg_ptr, BMC_JSON__SENSORS_LABEL, index, sensor_data ) ;
if ( rc == PASS )
{
if ( bmc_load_json_sensor ( host_ptr->sample[host_ptr->samples], sensor_data ) == PASS )
if ( bmc_load_json_sensor ( host_ptr->hostname , host_ptr->sample[host_ptr->samples], sensor_data ) == PASS )
{
bool found = false ;
@ -635,7 +635,7 @@ int hwmonHostClass::bmc_update_sensors ( struct hwmonHostClass::hwmon_host * hos
host_ptr->sensor[i].sensorname.c_str(),
bmc_status);
sensor_data_print (host_ptr->sample[j]);
sensor_data_print ( host_ptr->hostname, host_ptr->sample[j]);
blog3 ("%s ... %s\n", host_ptr->hostname.c_str(), host_ptr->bmc_thread_info.data.c_str());
host_ptr->sensor[i].sample_severity = HWMON_SEVERITY_MINOR ;
@ -698,7 +698,7 @@ int hwmonHostClass::bmc_update_sensors ( struct hwmonHostClass::hwmon_host * hos
host_ptr->sensor[i].sensorname.c_str(),
bmc_status);
sensor_data_print (host_ptr->sample[j]);
sensor_data_print ( host_ptr->hostname, host_ptr->sample[j]);
blog3 ("%s ... %s\n", host_ptr->hostname.c_str(), host_ptr->bmc_thread_info.data.c_str());
host_ptr->sensor[i].sample_severity = HWMON_SEVERITY_MINOR ;

@ -41,9 +41,9 @@
#define MAX_IPMITOOL_PARSE_ERRORS (20)
void sensor_data_init ( sensor_data_type & data );
void sensor_data_print ( const sensor_data_type & data );
void sensor_data_print ( string & hostname, const sensor_data_type & data );
void sensor_data_copy ( sensor_data_type & from, sensor_data_type & to );
int bmc_load_json_sensor ( sensor_data_type & sensor_data , string json_sensor_data );
int bmc_load_json_sensor ( string & hostname, sensor_data_type & sensor_data , string json_sensor_data );
#endif

@ -272,8 +272,6 @@ int daemon_init ( string iface, string nodetype )
obj_ptr->system_type = daemon_system_type ();
threadUtil_init ( hwmonTimer_handler ) ;
/* Bind signal handlers */
if ( daemon_signal_init () != PASS )
{
@ -295,7 +293,7 @@ int daemon_init ( string iface, string nodetype )
rc = FAIL_SOCKET_INIT ;
}
threadUtil_init ( hwmonTimer_handler ) ;
threadUtil_init ( hwmonTimer_handler, HWMOND_STACK_SIZE ) ;
/* override the config reload for the startup case */
obj_ptr->config_reload = false ;

@ -38,17 +38,55 @@ using namespace std;
#include "hwmonClass.h" /* for ... thread_extra_info_type */
#include "nodeUtil.h" /* for ... fork_execv */
/***************************************************************************
*
* Name : bmc_sample_type
*
* Description: An array of sensor data.
*
* _sample_list
*
***************************************************************************/
static bmc_sample_type _sample_list[MAX_HOST_SENSORS] ;
/* One instance per thread. Uses the memory allocated for the stack.
 *
 * Although thread_local variables are not on the stack, they still
 * consume memory that is tied to the thread's overall resources,
 * and that memory often comes from the same per-thread allocation
 * that includes the stack ; refer to TLS (Thread-Local Storage).
 * The TLS area is often allocated adjacent to or within the thread's
 * stack mapping. A large thread_local variable increases the TLS
 * memory requirement, and if it exceeds the reserved space or
 * overlaps with the stack space, the OS may fail to allocate the
 * thread with an errno of "Resource temporarily unavailable".
 * This allocation required the per-thread stack size to be increased. */
thread_local bmc_sample_type _sample_list[MAX_HOST_SENSORS];
// #define WANT_SAMPLE_LIST_DEBUG
#ifdef WANT_SAMPLE_LIST_DEBUG
/* Debug aid: log every named entry of this thread's _sample_list.
 *
 * Walks all MAX_HOST_SENSORS slots. Slots with an empty name are not
 * logged ; the first named slot that follows one or more unnamed slots
 * is preceded by a 'sparse sensor list' log carrying that slot's index. */
void print_sample_list ( string & hostname )
{
    /* set when an unnamed slot is seen ; cleared once the gap is reported */
    bool gap_pending = false ;

    for ( int slot = 0 ; slot < MAX_HOST_SENSORS ; ++slot )
    {
        const auto & entry = _sample_list[slot] ;

        if ( strlen ( entry.name ) == 0 )
        {
            gap_pending = true ;
            continue ;
        }

        if ( gap_pending )
        {
            slog ("%s has sparse sensor list ; gap at %d", hostname.c_str(), slot);
            gap_pending = false ;
        }

        ilog ("%s Sample %d: %s - %s - %s - %s ... %s - %s - %s - %s - %s - %s",
                  hostname.c_str(), slot,
                  entry.name,
                  entry.value,
                  entry.unit,
                  entry.status,
                  entry.lnr,
                  entry.lcr,
                  entry.lnc,
                  entry.unc,
                  entry.ucr,
                  entry.unr);
    }
}
#endif // WANT_SAMPLE_LIST_DEBUG
/***************************************************************************
*
@ -164,6 +202,11 @@ static void _parse_sensor_data ( thread_info_type * info_ptr )
info_ptr->data.append (",\"");
info_ptr->data.append (BMC_JSON__SENSORS_LABEL);
info_ptr->data.append ("\":[");
#ifdef WANT_SAMPLE_LIST_DEBUG
print_sample_list ( info_ptr->hostname );
#endif // WANT_SAMPLE_LIST_DEBUG
for ( int i = 0 ; i < samples ; )
{
_add_json_sensor_tuple ( &_sample_list[i], info_ptr->data ) ;
@ -331,6 +374,7 @@ void * hwmonThread_ipmitool ( void * arg )
/* the number of sensors are learned */
extra_ptr->samples = samples = 0 ;
MEMSET_ZERO (_sample_list);
switch ( info_ptr->command )
{
case BMC_THREAD_CMD__POWER_STATUS:
@ -542,7 +586,17 @@ void * hwmonThread_ipmitool ( void * arg )
unlink(info_ptr->password_file.data());
daemon_remove_file (info_ptr->password_file.data());
// info_ptr->password_file.clear();
/* Debug Option - enable lane debug_bmgmt = 8 and touch
 * /var/run/bmc/ipmitool/want_dated_sensor_data_files for ipmi
 * or
 * /var/run/bmc/redfishtool/want_dated_sensor_data_files for redfish
 *
 * ... to save the current sensor read file with a dated extension
 * so that a read history is maintained for debug purposes. */
if(daemon_get_cfg_ptr()->debug_bmgmt&8)
if ( daemon_is_file_present (WANT_DATED_IPMI_SENSOR_DATA_FILES))
daemon_copy_file(info_ptr->hostname, sensor_datafile.data());
/* check for system call error case */
if ( rc != PASS )
@ -1227,6 +1281,17 @@ static int _parse_redfish_sensor_data_output_file( thread_info_type * info_ptr,
fread(buffer,(st.st_size + 2), 1, _fp);
fclose(_fp);
/* Debug Option - enable lane debug_bmgmt = 8 and touch
 * /var/run/bmc/ipmitool/want_dated_sensor_data_files for ipmi
 * or
 * /var/run/bmc/redfishtool/want_dated_sensor_data_files for redfish
 *
 * ... to save the current sensor read file with a dated extension
 * so that a read history is maintained for debug purposes. */
if(daemon_get_cfg_ptr()->debug_bmgmt&8)
if ( daemon_is_file_present (WANT_DATED_REDFISH_SENSOR_DATA_FILES))
daemon_copy_file(info_ptr->hostname, datafile.data());
switch (sensor_group)
{
case BMC_SENSOR_POWER_GROUP:
@ -1311,6 +1376,7 @@ void * hwmonThread_redfish ( void * arg )
/* the number of sensors learned */
extra_ptr->samples = samples = 0 ;
MEMSET_ZERO (_sample_list);
switch ( info_ptr->command )
{

@ -1179,7 +1179,7 @@ int daemon_init ( string iface, string nodetype )
mtcTimer_init ( mtcInv.mtcTimer, mtcInv.my_hostname, "mtc timer" ); /* Init general mtc timer */
mtcAlarm_init ();
mtc_stages_init ();
threadUtil_init ( mtcTimer_handler ) ;
threadUtil_init ( mtcTimer_handler, MTCAGENT_STACK_SIZE ) ;
/* Bind signal handlers */
rc = daemon_signal_init () ;

@ -22,6 +22,8 @@ typedef struct
string bm_cmd ;
} thread_extra_info_type ;
#define MTCAGENT_STACK_SIZE (0x20000) // 128 kBytes
void * mtcThread_bmc ( void * );
void * mtcThread_bmc_test ( void * arg );