Improve Cluster validation checks

1. Replace the URL used for the manager status check.
2. Change the order of the validation checks, since the list action can time out.
3. Run the status validation without retries, so that a failed node is identified as DOWN more quickly.

Change-Id: I60501b544b5892dcc6eb1c4c897ee4add6262e0b
2019-01-17 09:35:59 +02:00 · 2019-01-17 09:35:59 +02:00 · 2551a5382f
commit 2551a5382f
parent bb3fb29fad
3 changed files with 22 additions and 15 deletions
--- a/vmware_nsxlib/v3/init.py
+++ b/vmware_nsxlib/v3/init.py
@ -116,10 +116,14 @@ class NsxLib(lib.NsxLibBase):
    def validate_connection_method(self):
        """Return a method that will validate the NSX manager status"""
        def check_manager_status(client, manager_url):
-            status = client.get('node/services/manager/status', silent=True)
+            # Try to get the cluster status silently and with no retries
-            if (not status or 'runtime_state' not in status or
+            status = client.get('operational/application/status',
-                status['runtime_state'] != 'running'):
+                                silent=True, with_retries=False)
-                msg = _("Manager is not in running state: %s") % status
+            if (not status or
                status.get('application_status') != 'WORKING' or
                status.get('corfu_status') != 'CONNECTED' or
                status.get('corfu_status') != 'CONNECTED'):
                msg = _("Manager is not in working state: %s") % status
                LOG.warning(msg)
                raise exceptions.ResourceNotFound(
                    manager=manager_url, operation=msg)
--- a/vmware_nsxlib/v3/client.py
+++ b/vmware_nsxlib/v3/client.py
@ -89,8 +89,9 @@ class RESTClient(object):
    def list(self, resource='', headers=None, silent=False):
        return self.url_list(resource, headers=headers, silent=silent)
-    def get(self, uuid, headers=None, silent=False):
+    def get(self, uuid, headers=None, silent=False, with_retries=True):
-        return self.url_get(uuid, headers=headers, silent=silent)
+        return self.url_get(uuid, headers=headers, silent=silent,
                            with_retries=with_retries)
    def delete(self, uuid, headers=None, expected_results=None):
        return self.url_delete(uuid, headers=headers,
@ -120,9 +121,9 @@ class RESTClient(object):
            cursor = page.get('cursor', NULL_CURSOR_PREFIX)
        return concatenate_response
-    def url_get(self, url, headers=None, silent=False):
+    def url_get(self, url, headers=None, silent=False, with_retries=True):
        return self._rest_call(url, method='GET', headers=headers,
-                               silent=silent)
+                               silent=silent, with_retries=with_retries)
    def url_delete(self, url, headers=None, expected_results=None):
        return self._rest_call(url, method='DELETE', headers=headers,
@ -195,7 +196,7 @@ class RESTClient(object):
        return re.sub(pattern, '"password": "********"', json)
    def _rest_call(self, url, method='GET', body=None, headers=None,
-                   silent=False, expected_results=None):
+                   silent=False, expected_results=None, **kwargs):
        request_headers = headers.copy() if headers else {}
        request_headers.update(self._default_headers)
        if utils.INJECT_HEADERS_CALLBACK:
@ -305,7 +306,7 @@ class NSX3Client(JSONRESTClient):
                    error_code=error_code)
    def _rest_call(self, url, **kwargs):
-        if self.rate_limit_retry:
+        if self.rate_limit_retry and kwargs.get('with_retries', True):
            # If too many requests are handled by the nsx at the same time,
            # error "429: Too Many Requests" or "503: Server Unavailable"
            # will be returned.
--- a/vmware_nsxlib/v3/cluster.py
+++ b/vmware_nsxlib/v3/cluster.py
@ -179,9 +179,15 @@ class NSXRequestsHTTPProvider(AbstractHTTPProvider):
            conn, url_prefix=endpoint.provider.url,
            url_path_base=cluster_api.nsxlib_config.url_base,
            default_headers=conn.default_headers)
        # Check the manager state directly
        if cluster_api.nsxlib_config.validate_connection_method:
            cluster_api.nsxlib_config.validate_connection_method(
                client, endpoint.provider.url)
        # If keepalive section returns a list, it is assumed to be non-empty
        keepalive_section = cluster_api.nsxlib_config.keepalive_section
        result = client.get(keepalive_section, silent=True)
        # If keepalive section returns a list, it is assumed to be non-empty
        if not result or result.get('result_count', 1) <= 0:
            msg = _("No %(section)s found "
                    "for '%(url)s'") % {'section': keepalive_section,
@ -189,10 +195,6 @@ class NSXRequestsHTTPProvider(AbstractHTTPProvider):
            LOG.warning(msg)
            raise exceptions.ResourceNotFound(
                manager=endpoint.provider.url, operation=msg)
        # Also check the manager state directly
        if cluster_api.nsxlib_config.validate_connection_method:
            cluster_api.nsxlib_config.validate_connection_method(
                client, endpoint.provider.url)
    def new_connection(self, cluster_api, provider):
        config = cluster_api.nsxlib_config