From 0bd607fdcdaad8bf642a5f676ad919f86ef1428d Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Sun, 23 Mar 2025 13:34:14 +0000 Subject: [PATCH] Extend bmc secret timeout and randomize retry delay The bmc secret query utility and fsm are mostly common to both the mtcAgent and hwmond processes. This update extends the timeout for bmc secret requests from 5 to 20. This lines up with other http request timeouts and gives at scale systems more time to respond during busy controller use cases like swact or dead office recovery. In the event of a failure, this update also randomizes the retry delay between 10 and 100 seconds per host so that retries don't happen in bulk all at once. This spreads out the load that secret request query retries place on system inventory. Testing also revealed that the hardware monitor was not freeing its connection resources after a successful bmc secret query. This update fixes that and adds a failure handling path for the case where the bmc secret response payload is empty. Similar handling was fine for the mtcAgent. Test Plan: PASS: Verify large system bmc secret fetch over DOR soak;5 loops PASS: Verify randomized delay handling upon all failure case handling. PASS: Verify large system bmc provisioning/deprovisioning soak;10 loops PASS: Verify large system swact soak;30 loops PASS: Verify mtcAgent and hwmond logging over all test cases Closes-Bug: 2103925 Change-Id: I4a696a7d5e4452a8fbd9f25cf11ddf0f065dbe1a Signed-off-by: Eric MacDonald --- mtce-common/src/common/httpUtil.h | 4 ++-- mtce-common/src/common/secretUtil.cpp | 8 ++++++-- mtce/src/hwmon/hwmonFsm.cpp | 19 ++++++++++++++++--- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/mtce-common/src/common/httpUtil.h b/mtce-common/src/common/httpUtil.h index 313de54e..7ea45bf1 100644 --- a/mtce-common/src/common/httpUtil.h +++ b/mtce-common/src/common/httpUtil.h @@ -2,7 +2,7 @@ #define __INCLUDE_HTTPUTIL_H__ /* - * Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc. 
+ * Copyright (c) 2013, 2016, 2024, 2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -52,7 +52,7 @@ using namespace std; #define HTTP_KEYSTONE_GET_TIMEOUT (10) #define HTTP_SMGR_TIMEOUT (20) #define HTTP_VIM_TIMEOUT (20) -#define HTTP_SECRET_TIMEOUT (5) +#define HTTP_SECRET_TIMEOUT (20) #define SMGR_MAX_RETRIES (3) diff --git a/mtce-common/src/common/secretUtil.cpp b/mtce-common/src/common/secretUtil.cpp index f212d1f3..225d5588 100755 --- a/mtce-common/src/common/secretUtil.cpp +++ b/mtce-common/src/common/secretUtil.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Wind River Systems, Inc. + * Copyright (c) 2019,2025 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -247,6 +247,9 @@ barbicanSecret_type * secretUtil_manage_secret ( libEvent & event, case MTC_SECRET__GET_REF_FAIL: case MTC_SECRET__GET_PWD_FAIL: { + // Random number between 10 and 100 assuming SECRET_RETRY_DELAY is 10 + // The 91 ensures the result is between 0 and 90 + int random_retry_delay = (rand() % 91) + SECRET_RETRY_DELAY ; if ( it->second.stage == MTC_SECRET__GET_REF_FAIL ) { wlog ( "%s failed to get secret reference \n", hostname.c_str() ); @@ -257,7 +260,8 @@ barbicanSecret_type * secretUtil_manage_secret ( libEvent & event, } it->second.stage = MTC_SECRET__START ; mtcTimer_reset ( secret_timer ); - mtcTimer_start ( secret_timer, handler, SECRET_RETRY_DELAY ); + mtcTimer_start ( secret_timer, handler, random_retry_delay ); + ilog ("%s bmc secret query will retry in %d seconds", hostname.c_str(), random_retry_delay ); httpUtil_free_conn ( event ); httpUtil_free_base ( event ); break ; diff --git a/mtce/src/hwmon/hwmonFsm.cpp b/mtce/src/hwmon/hwmonFsm.cpp index 58eaf75e..517fc5dc 100644 --- a/mtce/src/hwmon/hwmonFsm.cpp +++ b/mtce/src/hwmon/hwmonFsm.cpp @@ -133,9 +133,22 @@ void hwmonHostClass::hwmon_fsm ( void ) if ( secret->stage == MTC_SECRET__GET_PWD_RECV ) { - host_ptr->bm_pw = host_ptr->thread_extra_info.bm_pw = secret->payload ; - ilog ("%s 
bmc credentials received", - hostname.c_str()); + /* Free the http connection and base resources */ + httpUtil_free_conn ( host_ptr->secretEvent ); + httpUtil_free_base ( host_ptr->secretEvent ); + + if ( secret->payload.empty() ) + { + wlog ("%s failed to acquire bmc password", hostname.c_str()); + secret->stage = MTC_SECRET__GET_PWD_FAIL ; + } + else + { + host_ptr->bm_pw = host_ptr->thread_extra_info.bm_pw = secret->payload ; + ilog ("%s bmc credentials received", hostname.c_str()); + /* put the FSM back to the start */ + secret->stage = MTC_SECRET__START ; + } } else {