Send Node Locked command on pxeboot network as corrective action.

This update affects only locked nodes.

If a remote node fails early config in a way that prevents IPSec
over management from being established, and no cluster interface
is configured or provisioned, then Node Locked commands sent from
mtcAgent over management and cluster networks are not received by
mtcClient.

This leads to a perpetual watchdog reset loop. The pmon process fails
to reach the configured state, and without the presence of
the .node_locked file, the watchdog treats the node as unlocked.
A quorum failure triggers a crashdump reset, repeating indefinitely.

The mtcAgent detects this and attempts corrective action by resending
the Node Locked command over the same failing networks, which also
fails.

This update adds a fallback: the Node Locked command is also sent
over the pxeboot network.

Testing also revealed that mtcClient socket recovery stops at the
first socket failure rather than trying to recover them all.

This update improves socket recovery by attempting all sockets in
order. The pxeboot socket is tried first, now followed by management
and cluster sockets.

Test Plan:

PASS: Verify mtcClient socket init and failure recovery handling.
PASS: Verify the mtcAgent sends the Node Locked command on the
      pxeboot network when it sees a node locked state mismatch.
PASS: Verify a locked node with failing management and cluster
      networking will get the node locked command serviced and
      node locked file produced as expected on the remote node.
      This event is noted by the following host specific mtcAgent log.

      "hostname mtcAlive reporting unlocked while locked ; correcting"

      Note: Before this update, the above 'correcting' log was seen
            every 5 seconds. With this update, that log is seen only
            once and the remote node does not go into a perpetual
            crashdump loop.

      Note: The host watchdog will not force a quorum failure
            crashdump if the /var/run/.node_locked file is present.

Closes-Bug: 2103863
Change-Id: I020c7ebe1e83254c52219546ec938f6cf3284c2e
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2025-03-22 15:33:07 +00:00
parent a0f8eb3fc4
commit cbcb19420c
2 changed files with 26 additions and 30 deletions

View File

@ -4671,6 +4671,7 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
{
wlog ("%s mtcAlive reporting unlocked while locked ; correcting",
node_ptr->hostname.c_str());
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, PXEBOOT_INTERFACE );
}
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE );
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, CLSTR_INTERFACE );

View File

@ -650,12 +650,6 @@ int mtc_socket_init ( void )
ctrl.mtcAgent_ip = getipbyname ( CONTROLLER );
ilog ("Controller : %s\n", ctrl.mtcAgent_ip.c_str());
/************************************************************/
/* Setup Mgmnt Network messaging sockets to/from mtcAgent */
/************************************************************/
setup_mgmt_rx_socket ();
setup_mgmt_tx_socket ();
/************************************************************/
/* Setup Pxeboot Network messaging sockets to/from mtcAgent */
/************************************************************/
@ -665,6 +659,12 @@ int mtc_socket_init ( void )
setup_pxeboot_tx_socket ();
}
/************************************************************/
/* Setup Mgmnt Network messaging sockets to/from mtcAgent */
/************************************************************/
setup_mgmt_rx_socket ();
setup_mgmt_tx_socket ();
/* Manage Cluster-host network setup */
string mgmnt_iface_name = daemon_mgmnt_iface();
string clstr_iface_name = daemon_clstr_iface();
@ -1697,7 +1697,7 @@ void daemon_service_run ( void )
}
if ( mtcTimer_expired ( ctrl.timer ) )
{
bool socket_reinit = true ;
bool socket_reinit = false ;
/**
* Look for failing sockets and try to recover them,
@ -1716,8 +1716,8 @@ void daemon_service_run ( void )
}
/* Mgmt Tx */
else if (( mtc_sock.mtc_client_mgmt_tx_socket == NULL ) ||
( mtc_sock.mtc_client_mgmt_tx_socket->sock_ok() == false ))
if (( mtc_sock.mtc_client_mgmt_tx_socket == NULL ) ||
( mtc_sock.mtc_client_mgmt_tx_socket->sock_ok() == false ))
{
wlog ("calling setup_mgmt_tx_socket (auto-recovery)");
setup_mgmt_tx_socket();
@ -1725,7 +1725,7 @@ void daemon_service_run ( void )
}
/* Pxeboot Rx */
else if ((ctrl.pxeboot_iface_provisioned == true) && (mtc_sock.pxeboot_rx_socket <= 0))
if ((ctrl.pxeboot_iface_provisioned == true) && (mtc_sock.pxeboot_rx_socket <= 0))
{
wlog ("calling setup_pxeboot_rx_socket (auto-recovery)");
setup_pxeboot_rx_socket();
@ -1733,7 +1733,7 @@ void daemon_service_run ( void )
}
/* Pxeboot Tx */
else if ((ctrl.pxeboot_iface_provisioned == true) && (mtc_sock.pxeboot_tx_socket == 0))
if ((ctrl.pxeboot_iface_provisioned == true) && (mtc_sock.pxeboot_tx_socket == 0))
{
wlog ("calling setup_pxeboot_tx_socket (auto-recovery)");
setup_pxeboot_tx_socket();
@ -1741,9 +1741,9 @@ void daemon_service_run ( void )
}
/* Clstr Rx */
else if (( ctrl.clstr_iface_provisioned == true ) &&
(( mtc_sock.mtc_client_clstr_rx_socket == NULL ) ||
( mtc_sock.mtc_client_clstr_rx_socket->sock_ok() == false )))
if (( ctrl.clstr_iface_provisioned == true ) &&
(( mtc_sock.mtc_client_clstr_rx_socket == NULL ) ||
( mtc_sock.mtc_client_clstr_rx_socket->sock_ok() == false )))
{
wlog ("calling setup_clstr_rx_socket (auto-recovery)");
setup_clstr_rx_socket();
@ -1751,10 +1751,10 @@ void daemon_service_run ( void )
}
/* Clstr Tx ; AIO SX */
else if ((ctrl.system_type == SYSTEM_TYPE__AIO__SIMPLEX) &&
( ctrl.clstr_iface_provisioned == true ) &&
(( mtc_sock.mtc_client_clstr_tx_socket_c0 == NULL ) ||
( mtc_sock.mtc_client_clstr_tx_socket_c0->sock_ok() == false )))
if ((ctrl.system_type == SYSTEM_TYPE__AIO__SIMPLEX) &&
( ctrl.clstr_iface_provisioned == true ) &&
(( mtc_sock.mtc_client_clstr_tx_socket_c0 == NULL ) ||
( mtc_sock.mtc_client_clstr_tx_socket_c0->sock_ok() == false )))
{
wlog ("calling setup_clstr_tx_sockets (auto-recovery)");
setup_clstr_tx_sockets();
@ -1762,28 +1762,24 @@ void daemon_service_run ( void )
}
/* Clstr Tx ; not AIO SX */
else if ((ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX) &&
( ctrl.clstr_iface_provisioned == true ) &&
(( mtc_sock.mtc_client_clstr_tx_socket_c0 == NULL ) ||
( mtc_sock.mtc_client_clstr_tx_socket_c1 == NULL ) ||
( mtc_sock.mtc_client_clstr_tx_socket_c0->sock_ok() == false ) ||
( mtc_sock.mtc_client_clstr_tx_socket_c1->sock_ok() == false )))
if ((ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX) &&
( ctrl.clstr_iface_provisioned == true ) &&
(( mtc_sock.mtc_client_clstr_tx_socket_c0 == NULL ) ||
( mtc_sock.mtc_client_clstr_tx_socket_c1 == NULL ) ||
( mtc_sock.mtc_client_clstr_tx_socket_c0->sock_ok() == false ) ||
( mtc_sock.mtc_client_clstr_tx_socket_c1->sock_ok() == false )))
{
wlog ("calling setup_clstr_tx_sockets (auto-recovery)");
setup_clstr_tx_sockets();
socket_reinit = true ;
}
else if ( mtc_sock.amon_socket <= 0 )
if ( mtc_sock.amon_socket <= 0 )
{
setup_amon_socket ();
wlog ("calling setup_amon_socket (auto-recovery)");
socket_reinit = true ;
}
else
{
socket_reinit = false ;
}
if ( socket_reinit )
{
@ -1796,7 +1792,6 @@ void daemon_service_run ( void )
/* re-get identity if interfaces are re-initialized */
string who_i_am = _self_identify ( ctrl.nodetype_str );
}
alog1 ("sending mtcAlive on all provisioned mtcAlive networks");
#ifdef WANT_FIT_TESTING
if ( ! daemon_want_fit ( FIT_CODE__FAIL_PXEBOOT_MTCALIVE ) )