Improve Cluster validation checks

1. Replace the URL used for the manager status check.
2. Change the order of validation checks, since the list action can
time out.
3. Run the status validation without retries, to make sure the node is
identified as DOWN more quickly.

Change-Id: I60501b544b5892dcc6eb1c4c897ee4add6262e0b
This commit is contained in:
Adit Sarfaty 2019-01-17 09:35:59 +02:00
parent bb3fb29fad
commit 2551a5382f
3 changed files with 22 additions and 15 deletions

View File

@ -116,10 +116,14 @@ class NsxLib(lib.NsxLibBase):
def validate_connection_method(self): def validate_connection_method(self):
"""Return a method that will validate the NSX manager status""" """Return a method that will validate the NSX manager status"""
def check_manager_status(client, manager_url): def check_manager_status(client, manager_url):
status = client.get('node/services/manager/status', silent=True) # Try to get the cluster status silently and with no retries
if (not status or 'runtime_state' not in status or status = client.get('operational/application/status',
status['runtime_state'] != 'running'): silent=True, with_retries=False)
msg = _("Manager is not in running state: %s") % status if (not status or
status.get('application_status') != 'WORKING' or
status.get('corfu_status') != 'CONNECTED' or
status.get('corfu_status') != 'CONNECTED'):
msg = _("Manager is not in working state: %s") % status
LOG.warning(msg) LOG.warning(msg)
raise exceptions.ResourceNotFound( raise exceptions.ResourceNotFound(
manager=manager_url, operation=msg) manager=manager_url, operation=msg)

View File

@ -89,8 +89,9 @@ class RESTClient(object):
def list(self, resource='', headers=None, silent=False): def list(self, resource='', headers=None, silent=False):
return self.url_list(resource, headers=headers, silent=silent) return self.url_list(resource, headers=headers, silent=silent)
def get(self, uuid, headers=None, silent=False): def get(self, uuid, headers=None, silent=False, with_retries=True):
return self.url_get(uuid, headers=headers, silent=silent) return self.url_get(uuid, headers=headers, silent=silent,
with_retries=with_retries)
def delete(self, uuid, headers=None, expected_results=None): def delete(self, uuid, headers=None, expected_results=None):
return self.url_delete(uuid, headers=headers, return self.url_delete(uuid, headers=headers,
@ -120,9 +121,9 @@ class RESTClient(object):
cursor = page.get('cursor', NULL_CURSOR_PREFIX) cursor = page.get('cursor', NULL_CURSOR_PREFIX)
return concatenate_response return concatenate_response
def url_get(self, url, headers=None, silent=False): def url_get(self, url, headers=None, silent=False, with_retries=True):
return self._rest_call(url, method='GET', headers=headers, return self._rest_call(url, method='GET', headers=headers,
silent=silent) silent=silent, with_retries=with_retries)
def url_delete(self, url, headers=None, expected_results=None): def url_delete(self, url, headers=None, expected_results=None):
return self._rest_call(url, method='DELETE', headers=headers, return self._rest_call(url, method='DELETE', headers=headers,
@ -195,7 +196,7 @@ class RESTClient(object):
return re.sub(pattern, '"password": "********"', json) return re.sub(pattern, '"password": "********"', json)
def _rest_call(self, url, method='GET', body=None, headers=None, def _rest_call(self, url, method='GET', body=None, headers=None,
silent=False, expected_results=None): silent=False, expected_results=None, **kwargs):
request_headers = headers.copy() if headers else {} request_headers = headers.copy() if headers else {}
request_headers.update(self._default_headers) request_headers.update(self._default_headers)
if utils.INJECT_HEADERS_CALLBACK: if utils.INJECT_HEADERS_CALLBACK:
@ -305,7 +306,7 @@ class NSX3Client(JSONRESTClient):
error_code=error_code) error_code=error_code)
def _rest_call(self, url, **kwargs): def _rest_call(self, url, **kwargs):
if self.rate_limit_retry: if self.rate_limit_retry and kwargs.get('with_retries', True):
# If too many requests are handled by the nsx at the same time, # If too many requests are handled by the nsx at the same time,
# error "429: Too Many Requests" or "503: Server Unavailable" # error "429: Too Many Requests" or "503: Server Unavailable"
# will be returned. # will be returned.

View File

@ -179,9 +179,15 @@ class NSXRequestsHTTPProvider(AbstractHTTPProvider):
conn, url_prefix=endpoint.provider.url, conn, url_prefix=endpoint.provider.url,
url_path_base=cluster_api.nsxlib_config.url_base, url_path_base=cluster_api.nsxlib_config.url_base,
default_headers=conn.default_headers) default_headers=conn.default_headers)
# Check the manager state directly
if cluster_api.nsxlib_config.validate_connection_method:
cluster_api.nsxlib_config.validate_connection_method(
client, endpoint.provider.url)
# If keeplive section returns a list, it is assumed to be non-empty
keepalive_section = cluster_api.nsxlib_config.keepalive_section keepalive_section = cluster_api.nsxlib_config.keepalive_section
result = client.get(keepalive_section, silent=True) result = client.get(keepalive_section, silent=True)
# If keeplive section returns a list, it is assumed to be non-empty
if not result or result.get('result_count', 1) <= 0: if not result or result.get('result_count', 1) <= 0:
msg = _("No %(section)s found " msg = _("No %(section)s found "
"for '%(url)s'") % {'section': keepalive_section, "for '%(url)s'") % {'section': keepalive_section,
@ -189,10 +195,6 @@ class NSXRequestsHTTPProvider(AbstractHTTPProvider):
LOG.warning(msg) LOG.warning(msg)
raise exceptions.ResourceNotFound( raise exceptions.ResourceNotFound(
manager=endpoint.provider.url, operation=msg) manager=endpoint.provider.url, operation=msg)
# Also check the manager state directly
if cluster_api.nsxlib_config.validate_connection_method:
cluster_api.nsxlib_config.validate_connection_method(
client, endpoint.provider.url)
def new_connection(self, cluster_api, provider): def new_connection(self, cluster_api, provider):
config = cluster_api.nsxlib_config config = cluster_api.nsxlib_config