diff --git a/mtce-common/src/common/bmcUtil.h b/mtce-common/src/common/bmcUtil.h
index 8f295d30..3c7dd94f 100644
--- a/mtce-common/src/common/bmcUtil.h
+++ b/mtce-common/src/common/bmcUtil.h
@@ -28,6 +28,8 @@ using namespace std;
 #define BMC_PROTOCOL__REDFISH_STR ((const char *)("redfish"))
 #define BMC_PROTOCOL__IPMITOOL_STR ((const char *)("ipmitool"))
 #define BMC_PROTOCOL__REDFISHTOOL_STR ((const char *)("redfishtool"))
+#define WANT_DATED_IPMI_SENSOR_DATA_FILES ((const char *)("/var/run/bmc/ipmitool/want_dated_sensor_data_files"))
+#define WANT_DATED_REDFISH_SENSOR_DATA_FILES ((const char *)("/var/run/bmc/redfishtool/want_dated_sensor_data_files"))
 
 /* learned graceful and immediate power control command strings */
 typedef struct
diff --git a/mtce-common/src/common/httpUtil.cpp b/mtce-common/src/common/httpUtil.cpp
index 12f7d9db..00337e53 100644
--- a/mtce-common/src/common/httpUtil.cpp
+++ b/mtce-common/src/common/httpUtil.cpp
@@ -982,7 +982,7 @@ int httpUtil_api_request ( libEvent & event )
     }
 
 httpUtil_api_request_done:
-
+    daemon_signal_hdlr ();
     httpUtil_free_conn ( event );
     httpUtil_free_base ( event );
 
diff --git a/mtce-common/src/common/nodeUtil.cpp b/mtce-common/src/common/nodeUtil.cpp
index 9684f0c8..ba1ae834 100755
--- a/mtce-common/src/common/nodeUtil.cpp
+++ b/mtce-common/src/common/nodeUtil.cpp
@@ -2500,8 +2500,10 @@ static void redirect_stdout_stderr(const string& hostname,
     stdout_copy = -1;
     stderr_copy = -1;
 
+    // Open file with explicit permissions: rw-r--r-- (0644)
     int redirect_fd = open(output_filename.c_str(),
-                           O_CREAT | O_WRONLY | O_TRUNC);
+                           O_CREAT | O_WRONLY | O_TRUNC,
+                           S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); // 0644
     if ( redirect_fd < 0 )
     {
         elog ("%s failed to open output filename: [%s] - error code = %d (%s)",
diff --git a/mtce-common/src/common/threadUtil.cpp b/mtce-common/src/common/threadUtil.cpp
index d8f513dd..d4d58ff4 100644
--- a/mtce-common/src/common/threadUtil.cpp
+++ b/mtce-common/src/common/threadUtil.cpp
@@ -50,7 +50,7 @@ static unsigned int __thread_init_sig ;
 
 static std::string threadStages_str[THREAD_STAGE__STAGES+1];
 
-int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ))
+int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ), size_t stack_size )
 {
     /* preserve parent process timer handler */
     thread_timer_handler = handler ;
@@ -67,24 +67,23 @@ int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ))
     /* setup to create a 'detached' threads */
     pthread_attr_init(&__attr);
     pthread_attr_setdetachstate(&__attr, PTHREAD_CREATE_DETACHED);
-    threadUtil_setstack_size ();
+    threadUtil_setstack_size ( stack_size );
 
     __thread_init_sig = THREAD_INIT_SIG ;
 
     return (PASS);
 }
 
-#define MTCE_PTHREAD_MAX_STACK_SIZE (0x20000) /* 128K */
-void threadUtil_setstack_size ( void )
+void threadUtil_setstack_size ( size_t stack_size )
 {
     size_t stack_size_before = 0 ;
     size_t stack_size_after = 0 ;
     /* manage pthread stack size */
     if ( pthread_attr_getstacksize (&__attr,&stack_size_before) == PASS )
     {
-        if ( stack_size_before > MTCE_PTHREAD_MAX_STACK_SIZE )
+        if ( stack_size_before > stack_size )
         {
-            if ( pthread_attr_setstacksize ( &__attr, MTCE_PTHREAD_MAX_STACK_SIZE ) == PASS )
+            if ( pthread_attr_setstacksize ( &__attr, stack_size ) == PASS )
             {
                 if ( pthread_attr_getstacksize (&__attr,&stack_size_after) == PASS )
                 {
diff --git a/mtce-common/src/common/threadUtil.h b/mtce-common/src/common/threadUtil.h
index 20682b70..96d6cf34 100644
--- a/mtce-common/src/common/threadUtil.h
+++ b/mtce-common/src/common/threadUtil.h
@@ -257,7 +257,7 @@ typedef struct
 
 /* module init/fini */
 void threadUtil_fini ( void );
-int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ));
+int threadUtil_init ( void (*handler)(int, siginfo_t*, void* ), size_t stack_size);
 
 #define DEFAULT_SYSTEM_REQUEST_LATENCY_SECS (unsigned long long)(15)
 int threadUtil_bmcSystemCall (string hostname,
@@ -265,7 +265,7 @@ int threadUtil_bmcSystemCall (string hostname,
                               string datafile,
                               unsigned long long latency_threshold_secs);
 
-void threadUtil_setstack_size ( void );
+void threadUtil_setstack_size ( size_t stack_size );
 
 /* Onetime thread init setup */
 void thread_init ( thread_ctrl_type & ctrl,
diff --git a/mtce-common/src/daemon/daemon_common.h b/mtce-common/src/daemon/daemon_common.h
index 2b1fd292..2ddb919e 100755
--- a/mtce-common/src/daemon/daemon_common.h
+++ b/mtce-common/src/daemon/daemon_common.h
@@ -62,7 +62,7 @@ void daemon_remove_pidfile ( void );
 void daemon_remove_file ( const char * filename );
 void daemon_rename_file ( const char * path, const char * old_filename, const char * new_filename );
 void daemon_make_dir ( const char * dir );
-
+int daemon_copy_file (string hostname, const char *source );
 string daemon_read_file ( const char * filename );
 void daemon_logfile_close ( void );
 
diff --git a/mtce-common/src/daemon/daemon_files.cpp b/mtce-common/src/daemon/daemon_files.cpp
index 4bdd66f6..6799eae3 100755
--- a/mtce-common/src/daemon/daemon_files.cpp
+++ b/mtce-common/src/daemon/daemon_files.cpp
@@ -121,6 +121,109 @@ bool daemon_is_file_present ( const char * filename )
     return (false);
 }
 
+/* Format the current local time as YYYY-MM-DD_HH-MM-SS into date_str.
+ * date_str is set to an empty string if the time cannot be formatted. */
+static void get_current_date_as_string ( char *date_str, size_t size )
+{
+    time_t now = time(NULL);
+    struct tm tm_now ;
+
+    // localtime_r rather than localtime ; callers include the hwmon
+    // bmc threads and localtime returns shared static storage.
+    if (( localtime_r (&now, &tm_now) == NULL ) ||
+        ( strftime (date_str, size, "%Y-%m-%d_%H-%M-%S", &tm_now) == 0 ))
+    {
+        if ( size )
+            date_str[0] = '\0' ;
+    }
+}
+
+// Chunk size for the copy ; the buffer is a local of daemon_copy_file
+// so the caller's stack must be able to hold it.
+#define MAX_FILE_CONTENT_BUFFER_SIZE 0x4000 // 16 KBytes
+
+/* Copy 'source' to a sibling file named '<source>_YYYY-MM-DD_HH-MM-SS'.
+ *
+ * hostname is only used to prefix the logs.
+ * Returns PASS if the whole file was copied, otherwise FAIL ; the
+ * failure reason is logged. */
+int daemon_copy_file ( string hostname, const char *source )
+{
+    if ( source == NULL )
+    {
+        wlog ("%s no source file specified for copy", hostname.c_str());
+        return FAIL ;
+    }
+
+    // Date string is 19 characters plus the null terminator.
+    char date_suffix[21];
+    get_current_date_as_string (date_suffix, sizeof(date_suffix));
+
+    // Max hostname size is 256 plus extra for path and the rest
+    // of the filename size with the dated suffix. 512 is ample.
+    // Example /var/run/bmc/redfishtool/hwmond_controller-0_thermal_sensor_data
+    char destination[512];
+
+    // Create the date suffixed destination filename ; refuse to copy
+    // to a truncated name.
+    int len = snprintf(destination, sizeof(destination), "%s_%s", source, date_suffix);
+    if (( len < 0 ) || ( (size_t)len >= sizeof(destination) ))
+    {
+        wlog ("%s destination filename too long for source file: %s",
+                  hostname.c_str(), source);
+        return FAIL ;
+    }
+
+    // Open source file in binary mode
+    FILE *src = fopen (source, "rb");
+    if (!src)
+    {
+        wlog ("%s unable to open source file: %s ; (%d:%m)", hostname.c_str(), source, errno);
+        return FAIL ;
+    }
+
+    // Open destination file in binary mode
+    FILE *dest = fopen(destination, "wb");
+    if (!dest)
+    {
+        wlog ("%s failed to open destination file '%s' for copy operation ; (%d:%m)",
+                  hostname.c_str(), destination, errno);
+        fclose (src);
+        return FAIL ;
+    }
+
+    char buffer [MAX_FILE_CONTENT_BUFFER_SIZE];
+    size_t bytes_read;
+    int rc = PASS ;
+
+    // Read from source and write to destination in chunks ; a short
+    // write or a read error fails the copy.
+    while ((bytes_read = fread(buffer, 1, sizeof(buffer), src)) > 0)
+    {
+        if ( fwrite(buffer, 1, bytes_read, dest) != bytes_read )
+        {
+            rc = FAIL ;
+            break ;
+        }
+    }
+    if ( ferror (src) )
+        rc = FAIL ;
+
+    // fclose flushes buffered writes so its status counts too
+    fclose (src);
+    if ( fclose (dest) != 0 )
+        rc = FAIL ;
+
+    if ( rc != PASS )
+    {
+        wlog ("%s failed to copy file '%s' to '%s'", hostname.c_str(), source, destination);
+        return FAIL ;
+    }
+
+    ilog ("%s file '%s' copied to '%s'\n", hostname.c_str(), source, destination);
+    return PASS ;
+}
+
 void daemon_healthcheck ( const char * sig )
 {
     FILE * hc_file_stream ;
diff --git a/mtce/src/hwmon/hwmon.h b/mtce/src/hwmon/hwmon.h
index 5703f66d..add7416c 100644
--- a/mtce/src/hwmon/hwmon.h
+++ b/mtce/src/hwmon/hwmon.h
@@ -49,8 +49,8 @@ using namespace std;
 #endif
 
 #define __AREA__ "mon"
-#define MAX_HOST_SENSORS (512) // (100)
-#define MAX_HOST_GROUPS (20)
+#define MAX_HOST_SENSORS (256)
+#define MAX_HOST_GROUPS (10)
 #define MIN_SENSOR_GROUPS (4)
 #define HWMON_DEFAULT_LARGE_INTERVAL (MTC_MINS_15)
 #define HWMON_DEFAULT_AUDIT_INTERVAL (MTC_MINS_2)
@@ -58,6 +58,7 @@ using namespace std;
 #define DEGRADE_AUDIT_TRIGGER (2)
 #define MAX_SENSORS_NOT_FOUND (5)
 #define START_DEBOUCE_COUNT (1)
+#define HWMOND_STACK_SIZE (0x80000) // 512 KBytes
 
 // Power sensor data for Dell R740-emc-1 needs 45KiB
 // Thermal sensor readout on wolfpass requires 20KiB
diff --git a/mtce/src/hwmon/hwmonBmc.cpp b/mtce/src/hwmon/hwmonBmc.cpp
index 3a1d6c55..1a6c1d47 100644
--- a/mtce/src/hwmon/hwmonBmc.cpp
+++ b/mtce/src/hwmon/hwmonBmc.cpp
@@ -82,9 +82,9 @@ void sensor_data_init ( sensor_data_type & data )
  *
  *****************************************************************************/
 
-void sensor_data_print ( const sensor_data_type & data )
+void sensor_data_print ( const string & hostname, const sensor_data_type & data )
 {
-    blog3 ("%s is %s : %s (%s) %s %s %s %s %s %s %s\n",
+    blog3 ("%s %s is %s : %s (%s) %s %s %s %s %s %s %s\n", hostname.c_str(),
             data.name.c_str(),
             data.status.c_str(),
             data.value.c_str(),
@@ -107,7 +107,7 @@ void sensor_data_print ( const sensor_data_type & data )
  *
  *****************************************************************************/
 
-int bmc_load_json_sensor ( sensor_data_type & sensor_data , string json_sensor_data )
+int bmc_load_json_sensor ( const string & hostname, sensor_data_type & sensor_data , string json_sensor_data )
 {
     int rc = FAIL_KEY_VALUE_PARSE ;
     // ilog ("sensor data:%s\n", json_sensor_data.c_str() );
@@ -126,7 +126,7 @@ int bmc_load_json_sensor ( sensor_data_type & sensor_data , string json_sensor_d
         sensor_data.ucr = jsonUtil_get_key_value_string ( raw_obj, "ucr" ) ;
         sensor_data.unc = jsonUtil_get_key_value_string ( raw_obj, "unc" ) ;
 
-        sensor_data_print ( sensor_data );
+        sensor_data_print ( hostname, sensor_data );
         json_object_put(raw_obj);
         rc = PASS ;
 
@@ -310,7 +310,7 @@ int hwmonHostClass::bmc_load_sensor_samples ( struct hwmonHostClass::hwmon_host
             rc = jsonUtil_get_array_idx ( msg_ptr, BMC_JSON__SENSORS_LABEL, index, sensor_data ) ;
             if ( rc == PASS )
             {
-                if ( bmc_load_json_sensor ( host_ptr->sample[host_ptr->samples], sensor_data ) == PASS )
+                if ( bmc_load_json_sensor ( host_ptr->hostname , host_ptr->sample[host_ptr->samples], sensor_data ) == PASS )
                 {
                     bool found = false ;
 
@@ -635,7 +635,7 @@ int hwmonHostClass::bmc_update_sensors ( struct hwmonHostClass::hwmon_host * hos
                           host_ptr->sensor[i].sensorname.c_str(),
                           bmc_status);
 
-                    sensor_data_print (host_ptr->sample[j]);
+                    sensor_data_print ( host_ptr->hostname, host_ptr->sample[j]);
                     blog3 ("%s ... %s\n", host_ptr->hostname.c_str(), host_ptr->bmc_thread_info.data.c_str());
                     host_ptr->sensor[i].sample_severity = HWMON_SEVERITY_MINOR ;
 
@@ -698,7 +698,7 @@ int hwmonHostClass::bmc_update_sensors ( struct hwmonHostClass::hwmon_host * hos
                           host_ptr->sensor[i].sensorname.c_str(),
                           bmc_status);
 
-                    sensor_data_print (host_ptr->sample[j]);
+                    sensor_data_print ( host_ptr->hostname, host_ptr->sample[j]);
                     blog3 ("%s ... %s\n", host_ptr->hostname.c_str(), host_ptr->bmc_thread_info.data.c_str());
                     host_ptr->sensor[i].sample_severity = HWMON_SEVERITY_MINOR ;
 
diff --git a/mtce/src/hwmon/hwmonBmc.h b/mtce/src/hwmon/hwmonBmc.h
index df84e522..6ffbb032 100644
--- a/mtce/src/hwmon/hwmonBmc.h
+++ b/mtce/src/hwmon/hwmonBmc.h
@@ -41,9 +41,9 @@
 #define MAX_IPMITOOL_PARSE_ERRORS (20)
 
 void sensor_data_init ( sensor_data_type & data );
-void sensor_data_print ( const sensor_data_type & data );
+void sensor_data_print ( const string & hostname, const sensor_data_type & data );
 void sensor_data_copy ( sensor_data_type & from, sensor_data_type & to );
-int bmc_load_json_sensor ( sensor_data_type & sensor_data , string json_sensor_data );
+int bmc_load_json_sensor ( const string & hostname, sensor_data_type & sensor_data , string json_sensor_data );
 
 #endif
 
diff --git a/mtce/src/hwmon/hwmonInit.cpp b/mtce/src/hwmon/hwmonInit.cpp
index f24d7b49..5666b3da 100644
--- a/mtce/src/hwmon/hwmonInit.cpp
+++ b/mtce/src/hwmon/hwmonInit.cpp
@@ -272,8 +272,6 @@ int daemon_init ( string iface, string nodetype )
 
     obj_ptr->system_type = daemon_system_type ();
 
-    threadUtil_init ( hwmonTimer_handler ) ;
-
     /* Bind signal handlers */
     if ( daemon_signal_init () != PASS )
     {
@@ -295,7 +293,7 @@ int daemon_init ( string iface, string nodetype )
         rc = FAIL_SOCKET_INIT ;
     }
 
-    threadUtil_init ( hwmonTimer_handler ) ;
+    threadUtil_init ( hwmonTimer_handler, HWMOND_STACK_SIZE ) ;
 
     /* override the config reload for the startup case */
     obj_ptr->config_reload = false ;
diff --git a/mtce/src/hwmon/hwmonThreads.cpp b/mtce/src/hwmon/hwmonThreads.cpp
index 183a6c49..b2a18da8 100644
--- a/mtce/src/hwmon/hwmonThreads.cpp
+++ b/mtce/src/hwmon/hwmonThreads.cpp
@@ -38,17 +38,55 @@ using namespace std;
 #include "hwmonClass.h" /* for ... thread_extra_info_type */
 #include "nodeUtil.h" /* for ... fork_execv */
 
-/***************************************************************************
- *
- * Name : bmc_sample_type
- *
- * Description: An array of sensor data.
- *
- * _sample_list
- *
- ***************************************************************************/
-static bmc_sample_type _sample_list[MAX_HOST_SENSORS] ;
+/* One instance per thread. Uses the memory allocated for the stack.
+ *
+ * Although thread_local variables are not on the stack, they still
+ * consume memory that's tied to the thread's overall resources,
+ * and that memory often comes from the same per-thread allocation
+ * that includes the stack ; refer to TLS (Thread-Local Storage).
+ * The TLS area is often allocated adjacent to or within the thread's
+ * stack mapping. A large thread_local variable increases the TLS
+ * memory requirement, and if it exceeds the reserved space or
+ * overlaps with the stack space, the OS may fail to allocate the
+ * thread with an errno "Resource temporarily unavailable".
+ * This allocation required the per thread stack to be increased. */
+thread_local bmc_sample_type _sample_list[MAX_HOST_SENSORS];
+
 
+// #define WANT_SAMPLE_LIST_DEBUG
+#ifdef WANT_SAMPLE_LIST_DEBUG
+void print_sample_list ( const string & hostname )
+{
+    bool empty = false ;
+    for ( int i = 0 ; i < MAX_HOST_SENSORS ; i++)
+    {
+        if ( strlen ( _sample_list[i].name ) != 0 )
+        {
+            if ( empty )
+            {
+                slog ("%s has sparse sensor list ; gap at %d", hostname.c_str(), i);
+                empty = false ;
+            }
+            ilog ("%s Sample %d: %s - %s - %s - %s ... %s - %s - %s - %s - %s - %s",
+                      hostname.c_str(), i,
+                      _sample_list[i].name,
+                      _sample_list[i].value,
+                      _sample_list[i].unit,
+                      _sample_list[i].status,
+                      _sample_list[i].lnr,
+                      _sample_list[i].lcr,
+                      _sample_list[i].lnc,
+                      _sample_list[i].unc,
+                      _sample_list[i].ucr,
+                      _sample_list[i].unr);
+        }
+        else
+        {
+            empty = true ;
+        }
+    }
+}
+#endif // WANT_SAMPLE_LIST_DEBUG
 
 /***************************************************************************
  *
@@ -164,6 +202,11 @@ static void _parse_sensor_data ( thread_info_type * info_ptr )
         info_ptr->data.append (",\"");
         info_ptr->data.append (BMC_JSON__SENSORS_LABEL);
         info_ptr->data.append ("\":[");
+
+#ifdef WANT_SAMPLE_LIST_DEBUG
+        print_sample_list ( info_ptr->hostname );
+#endif // WANT_SAMPLE_LIST_DEBUG
+
         for ( int i = 0 ; i < samples ; )
         {
             _add_json_sensor_tuple ( &_sample_list[i], info_ptr->data ) ;
@@ -331,6 +374,7 @@ void * hwmonThread_ipmitool ( void * arg )
 
     /* the number of sensors are learned */
     extra_ptr->samples = samples = 0 ;
+    MEMSET_ZERO (_sample_list);
     switch ( info_ptr->command )
     {
         case BMC_THREAD_CMD__POWER_STATUS:
@@ -542,7 +586,17 @@ void * hwmonThread_ipmitool ( void * arg )
 
                 unlink(info_ptr->password_file.data());
                 daemon_remove_file (info_ptr->password_file.data());
-                // info_ptr->password_file.clear();
+
+                /* Debug Option - enable debug_bmgmt bit 8 and touch
+                 * /var/run/bmc/ipmitool/want_dated_sensor_data_files for ipmi
+                 * or
+                 * /var/run/bmc/redfishtool/want_dated_sensor_data_files for redfish
+                 *
+                 * ... to save the current sensor read file with a dated extension
+                 * so that a read history is maintained for debug purposes. */
+                if (( daemon_get_cfg_ptr()->debug_bmgmt & 8 ) &&
+                    ( daemon_is_file_present (WANT_DATED_IPMI_SENSOR_DATA_FILES)))
+                    daemon_copy_file(info_ptr->hostname, sensor_datafile.data());
 
                 /* check for system call error case */
                 if ( rc != PASS )
@@ -1227,6 +1281,17 @@ static int _parse_redfish_sensor_data_output_file( thread_info_type * info_ptr,
     fread(buffer,(st.st_size + 2), 1, _fp);
     fclose(_fp);
 
+    /* Debug Option - enable debug_bmgmt bit 8 and touch
+     * /var/run/bmc/ipmitool/want_dated_sensor_data_files for ipmi
+     * or
+     * /var/run/bmc/redfishtool/want_dated_sensor_data_files for redfish
+     *
+     * ... to save the current sensor read file with a dated extension
+     * so that a read history is maintained for debug purposes. */
+    if (( daemon_get_cfg_ptr()->debug_bmgmt & 8 ) &&
+        ( daemon_is_file_present (WANT_DATED_REDFISH_SENSOR_DATA_FILES)))
+        daemon_copy_file(info_ptr->hostname, datafile.data());
+
     switch (sensor_group)
     {
         case BMC_SENSOR_POWER_GROUP:
@@ -1311,6 +1376,7 @@ void * hwmonThread_redfish ( void * arg )
 
     /* the number of sensors learned */
     extra_ptr->samples = samples = 0 ;
+    MEMSET_ZERO (_sample_list);
 
     switch ( info_ptr->command )
     {
diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp
index 900805d0..ab7874b2 100644
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@@ -1179,7 +1179,7 @@ int daemon_init ( string iface, string nodetype )
     mtcTimer_init ( mtcInv.mtcTimer, mtcInv.my_hostname, "mtc timer" ); /* Init general mtc timer */
     mtcAlarm_init ();
     mtc_stages_init ();
-    threadUtil_init ( mtcTimer_handler ) ;
+    threadUtil_init ( mtcTimer_handler, MTCAGENT_STACK_SIZE ) ;
 
     /* Bind signal handlers */
     rc = daemon_signal_init () ;
diff --git a/mtce/src/maintenance/mtcThreads.h b/mtce/src/maintenance/mtcThreads.h
index bb521a63..55063350 100644
--- a/mtce/src/maintenance/mtcThreads.h
+++ b/mtce/src/maintenance/mtcThreads.h
@@ -22,6 +22,8 @@ typedef struct
     string bm_cmd ;
 } thread_extra_info_type ;
 
+#define MTCAGENT_STACK_SIZE (0x20000) // 128 kBytes
+
 void * mtcThread_bmc ( void * );
 void * mtcThread_bmc_test ( void * arg );
 