Send Node Locked command on pxeboot network as corrective action.
This update affects only locked nodes. If a remote node fails early config in a way that prevents IPSec over management from being established, and no cluster interface is configured or provisioned, then Node Locked commands sent from mtcAgent over management and cluster networks are not received by mtcClient. This leads to a perpetual watchdog reset loop. The pmon process fails to reach the configured state, and without the presence of the .node_locked file, the watchdog treats the node as unlocked. A quorum failure triggers a crashdump reset, repeating indefinitely. The mtcAgent detects this and attempts corrective action by resending the Node Locked command over the same failing networks, which also fails. This update adds a fallback: the Node Locked command is also sent over the pxeboot network. Testing also revealed that mtcClient socket recovery stops at the first socket failure rather than trying to recover them all. This update improves socket recovery by attempting all sockets in order. The pxeboot socket is tried first, now followed by management and cluster sockets. Test Plan: PASS: Verify mtcClient socket init and failure recovery handling. PASS: Verify the mtcAgent sends the Node Locked command on the pxeboot network when it sees a node locked state mismatch. PASS: Verify a locked node with failing management and cluster networking will get the node locked command serviced and node locked file produced as expected on the remote node. This event is noted by the following host specific mtcAgent log. "hostname mtcAlive reporting unlocked while locked ; correcting" Note that before this update we see the above 'correcting' log every 5 seconds. With this update we see that log only once and the remote node does not go into a perpetual crashdump loop. Note: The host watchdog will not force a quorum failure crashdump if the /var/run/.node_locked file is present.
Closes-Bug: 2103863 Change-Id: I020c7ebe1e83254c52219546ec938f6cf3284c2e Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
a0f8eb3fc4
commit
cbcb19420c
@ -4671,6 +4671,7 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
|
||||
{
|
||||
wlog ("%s mtcAlive reporting unlocked while locked ; correcting",
|
||||
node_ptr->hostname.c_str());
|
||||
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, PXEBOOT_INTERFACE );
|
||||
}
|
||||
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE );
|
||||
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, CLSTR_INTERFACE );
|
||||
|
@ -650,12 +650,6 @@ int mtc_socket_init ( void )
|
||||
ctrl.mtcAgent_ip = getipbyname ( CONTROLLER );
|
||||
ilog ("Controller : %s\n", ctrl.mtcAgent_ip.c_str());
|
||||
|
||||
/************************************************************/
|
||||
/* Setup Mgmnt Network messaging sockets to/from mtcAgent */
|
||||
/************************************************************/
|
||||
setup_mgmt_rx_socket ();
|
||||
setup_mgmt_tx_socket ();
|
||||
|
||||
/************************************************************/
|
||||
/* Setup Pxeboot Network messaging sockets to/from mtcAgent */
|
||||
/************************************************************/
|
||||
@ -665,6 +659,12 @@ int mtc_socket_init ( void )
|
||||
setup_pxeboot_tx_socket ();
|
||||
}
|
||||
|
||||
/************************************************************/
|
||||
/* Setup Mgmnt Network messaging sockets to/from mtcAgent */
|
||||
/************************************************************/
|
||||
setup_mgmt_rx_socket ();
|
||||
setup_mgmt_tx_socket ();
|
||||
|
||||
/* Manage Cluster-host network setup */
|
||||
string mgmnt_iface_name = daemon_mgmnt_iface();
|
||||
string clstr_iface_name = daemon_clstr_iface();
|
||||
@ -1697,7 +1697,7 @@ void daemon_service_run ( void )
|
||||
}
|
||||
if ( mtcTimer_expired ( ctrl.timer ) )
|
||||
{
|
||||
bool socket_reinit = true ;
|
||||
bool socket_reinit = false ;
|
||||
|
||||
/**
|
||||
* Look for failing sockets and try to recover them,
|
||||
@ -1716,8 +1716,8 @@ void daemon_service_run ( void )
|
||||
}
|
||||
|
||||
/* Mgmt Tx */
|
||||
else if (( mtc_sock.mtc_client_mgmt_tx_socket == NULL ) ||
|
||||
( mtc_sock.mtc_client_mgmt_tx_socket->sock_ok() == false ))
|
||||
if (( mtc_sock.mtc_client_mgmt_tx_socket == NULL ) ||
|
||||
( mtc_sock.mtc_client_mgmt_tx_socket->sock_ok() == false ))
|
||||
{
|
||||
wlog ("calling setup_mgmt_tx_socket (auto-recovery)");
|
||||
setup_mgmt_tx_socket();
|
||||
@ -1725,7 +1725,7 @@ void daemon_service_run ( void )
|
||||
}
|
||||
|
||||
/* Pxeboot Rx */
|
||||
else if ((ctrl.pxeboot_iface_provisioned == true) && (mtc_sock.pxeboot_rx_socket <= 0))
|
||||
if ((ctrl.pxeboot_iface_provisioned == true) && (mtc_sock.pxeboot_rx_socket <= 0))
|
||||
{
|
||||
wlog ("calling setup_pxeboot_rx_socket (auto-recovery)");
|
||||
setup_pxeboot_rx_socket();
|
||||
@ -1733,7 +1733,7 @@ void daemon_service_run ( void )
|
||||
}
|
||||
|
||||
/* Pxeboot Tx */
|
||||
else if ((ctrl.pxeboot_iface_provisioned == true) && (mtc_sock.pxeboot_tx_socket == 0))
|
||||
if ((ctrl.pxeboot_iface_provisioned == true) && (mtc_sock.pxeboot_tx_socket == 0))
|
||||
{
|
||||
wlog ("calling setup_pxeboot_tx_socket (auto-recovery)");
|
||||
setup_pxeboot_tx_socket();
|
||||
@ -1741,9 +1741,9 @@ void daemon_service_run ( void )
|
||||
}
|
||||
|
||||
/* Clstr Rx */
|
||||
else if (( ctrl.clstr_iface_provisioned == true ) &&
|
||||
(( mtc_sock.mtc_client_clstr_rx_socket == NULL ) ||
|
||||
( mtc_sock.mtc_client_clstr_rx_socket->sock_ok() == false )))
|
||||
if (( ctrl.clstr_iface_provisioned == true ) &&
|
||||
(( mtc_sock.mtc_client_clstr_rx_socket == NULL ) ||
|
||||
( mtc_sock.mtc_client_clstr_rx_socket->sock_ok() == false )))
|
||||
{
|
||||
wlog ("calling setup_clstr_rx_socket (auto-recovery)");
|
||||
setup_clstr_rx_socket();
|
||||
@ -1751,10 +1751,10 @@ void daemon_service_run ( void )
|
||||
}
|
||||
|
||||
/* Clstr Tx ; AIO SX */
|
||||
else if ((ctrl.system_type == SYSTEM_TYPE__AIO__SIMPLEX) &&
|
||||
( ctrl.clstr_iface_provisioned == true ) &&
|
||||
(( mtc_sock.mtc_client_clstr_tx_socket_c0 == NULL ) ||
|
||||
( mtc_sock.mtc_client_clstr_tx_socket_c0->sock_ok() == false )))
|
||||
if ((ctrl.system_type == SYSTEM_TYPE__AIO__SIMPLEX) &&
|
||||
( ctrl.clstr_iface_provisioned == true ) &&
|
||||
(( mtc_sock.mtc_client_clstr_tx_socket_c0 == NULL ) ||
|
||||
( mtc_sock.mtc_client_clstr_tx_socket_c0->sock_ok() == false )))
|
||||
{
|
||||
wlog ("calling setup_clstr_tx_sockets (auto-recovery)");
|
||||
setup_clstr_tx_sockets();
|
||||
@ -1762,28 +1762,24 @@ void daemon_service_run ( void )
|
||||
}
|
||||
|
||||
/* Clstr Tx ; not AIO SX */
|
||||
else if ((ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX) &&
|
||||
( ctrl.clstr_iface_provisioned == true ) &&
|
||||
(( mtc_sock.mtc_client_clstr_tx_socket_c0 == NULL ) ||
|
||||
( mtc_sock.mtc_client_clstr_tx_socket_c1 == NULL ) ||
|
||||
( mtc_sock.mtc_client_clstr_tx_socket_c0->sock_ok() == false ) ||
|
||||
( mtc_sock.mtc_client_clstr_tx_socket_c1->sock_ok() == false )))
|
||||
if ((ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX) &&
|
||||
( ctrl.clstr_iface_provisioned == true ) &&
|
||||
(( mtc_sock.mtc_client_clstr_tx_socket_c0 == NULL ) ||
|
||||
( mtc_sock.mtc_client_clstr_tx_socket_c1 == NULL ) ||
|
||||
( mtc_sock.mtc_client_clstr_tx_socket_c0->sock_ok() == false ) ||
|
||||
( mtc_sock.mtc_client_clstr_tx_socket_c1->sock_ok() == false )))
|
||||
{
|
||||
wlog ("calling setup_clstr_tx_sockets (auto-recovery)");
|
||||
setup_clstr_tx_sockets();
|
||||
socket_reinit = true ;
|
||||
}
|
||||
|
||||
else if ( mtc_sock.amon_socket <= 0 )
|
||||
if ( mtc_sock.amon_socket <= 0 )
|
||||
{
|
||||
setup_amon_socket ();
|
||||
wlog ("calling setup_amon_socket (auto-recovery)");
|
||||
socket_reinit = true ;
|
||||
}
|
||||
else
|
||||
{
|
||||
socket_reinit = false ;
|
||||
}
|
||||
|
||||
if ( socket_reinit )
|
||||
{
|
||||
@ -1796,7 +1792,6 @@ void daemon_service_run ( void )
|
||||
/* re-get identity if interfaces are re-initialized */
|
||||
string who_i_am = _self_identify ( ctrl.nodetype_str );
|
||||
}
|
||||
alog1 ("sending mtcAlive on all provisioned mtcAlive networks");
|
||||
|
||||
#ifdef WANT_FIT_TESTING
|
||||
if ( ! daemon_want_fit ( FIT_CODE__FAIL_PXEBOOT_MTCALIVE ) )
|
||||
|
Loading…
x
Reference in New Issue
Block a user