diff --git a/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py b/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py
index f63a1266..27c01fe0 100644
--- a/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py
+++ b/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py
@@ -239,7 +239,8 @@ class NsxClientTestCase(NsxLibTestCase):
                 nsx_api_managers=nsx_api_managers or [NSX_MANAGER],
                 plugin_scope=PLUGIN_SCOPE,
                 plugin_tag=PLUGIN_TAG,
-                plugin_ver=PLUGIN_VER)
+                plugin_ver=PLUGIN_VER,
+                cluster_unavailable_retry=True)
             super(NsxClientTestCase.MockNSXClusteredAPI, self).__init__(
                 nsxlib_config)
 
diff --git a/vmware_nsxlib/tests/unit/v3/test_cluster.py b/vmware_nsxlib/tests/unit/v3/test_cluster.py
index 9ea14192..5140de0d 100644
--- a/vmware_nsxlib/tests/unit/v3/test_cluster.py
+++ b/vmware_nsxlib/tests/unit/v3/test_cluster.py
@@ -392,6 +392,7 @@ class ClusteredAPITestCase(nsxlib_testcase.NsxClientTestCase):
         max_attempts = 3
         api = self.mock_nsx_clustered_api(nsx_api_managers=conf_managers,
                                           max_attempts=max_attempts)
+        api.nsxlib_config.cluster_unavailable_retry = False
         api._validate = mock.Mock()
         eps = list(api._endpoints.values())
 
diff --git a/vmware_nsxlib/v3/cluster.py b/vmware_nsxlib/v3/cluster.py
index dc17b3d7..bfa42d66 100644
--- a/vmware_nsxlib/v3/cluster.py
+++ b/vmware_nsxlib/v3/cluster.py
@@ -460,6 +460,7 @@ class ClusteredAPI(object):
 
         self._http_provider = http_provider
         self._keepalive_interval = keepalive_interval
+        self._print_keepalive = 0
 
         def _init_cluster(*args, **kwargs):
             self._init_endpoints(providers,
@@ -511,13 +512,18 @@ class ClusteredAPI(object):
                 break
             eventlet.sleep(0.5)
 
-        for endpoint in self._endpoints.values():
-            # dynamic loop for each endpoint to ensure connectivity
-            loop = loopingcall.DynamicLoopingCall(
-                self._endpoint_keepalive, endpoint)
-            loop.start(initial_delay=self._keepalive_interval,
-                       periodic_interval_max=self._keepalive_interval,
-                       stop_on_exception=False)
+        if len(self._endpoints) > 1:
+            # Don't monitor connectivity when only one endpoint is
+            # available, since there is no alternative backend to query.
+            # If that single endpoint was down, we can tolerate an extra
+            # roundtrip to validate connectivity.
+            for endpoint in self._endpoints.values():
+                # dynamic loop for each endpoint to ensure connectivity
+                loop = loopingcall.DynamicLoopingCall(
+                    self._endpoint_keepalive, endpoint)
+                loop.start(initial_delay=self._keepalive_interval,
+                           periodic_interval_max=self._keepalive_interval,
+                           stop_on_exception=False)
 
         LOG.debug("Done initializing API endpoint(s). "
                   "API cluster health: %s", self.health)
@@ -526,6 +532,13 @@ class ClusteredAPI(object):
         delta = datetime.datetime.now() - endpoint.last_updated
         if delta.seconds >= self._keepalive_interval:
             # TODO(boden): backoff on validation failure
+            if self._print_keepalive % 10 == 0:
+                # Print the keepalive debug message once every 10 probes
+                LOG.debug("Running keepalive probe for cluster endpoint "
+                          "'%(ep)s'",
+                          {'ep': endpoint})
+            self._print_keepalive += 1
+
             self._validate(endpoint)
             return self._keepalive_interval
         return self._keepalive_interval - delta.seconds
diff --git a/vmware_nsxlib/v3/config.py b/vmware_nsxlib/v3/config.py
index f0bc0b64..c6ba896e 100644
--- a/vmware_nsxlib/v3/config.py
+++ b/vmware_nsxlib/v3/config.py
@@ -85,7 +85,11 @@ class NsxLibConfig(object):
     :param cluster_unavailable_retry: If True, skip fatal errors when no
                                       endpoint in the NSX management cluster is
                                       available to serve a request, and retry
-                                      the request instead.
+                                      the request instead. This setting
+                                      cannot be False when a single endpoint
+                                      is configured in the cluster, since no
+                                      keepalive probes will run in that
+                                      case.
 
     -- Additional parameters which are relevant only for the Policy manager:
     :param allow_passthrough: If True, use nsx manager api for cases which are
@@ -152,6 +156,13 @@ class NsxLibConfig(object):
         self.realization_max_attempts = realization_max_attempts
         self.realization_wait_sec = realization_wait_sec
 
+        if len(nsx_api_managers) == 1 and not self.cluster_unavailable_retry:
+            LOG.warning("When only one endpoint is provided, keepalive probes "
+                        "are disabled. For the system to be able to recover "
+                        "from a DOWN state, cluster_unavailable_retry is set "
+                        "to True, overriding the provided configuration")
+            self.cluster_unavailable_retry = True
+
         if dhcp_profile_uuid:
             # this is deprecated, and never used.
             versionutils.report_deprecated_feature(
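
A minimal, self-contained sketch of the behavior the config.py hunk introduces. The class below is a hypothetical stand-in, not the real NsxLibConfig, so it runs without vmware_nsxlib installed: a single-manager configuration silently forces cluster_unavailable_retry to True, while a multi-manager configuration keeps the caller's value.

import logging

LOG = logging.getLogger(__name__)


class SingleEndpointRetryConfig(object):
    # Hypothetical stand-in for NsxLibConfig; only the new check is modelled.
    def __init__(self, nsx_api_managers, cluster_unavailable_retry=False):
        self.nsx_api_managers = nsx_api_managers
        self.cluster_unavailable_retry = cluster_unavailable_retry

        # With one endpoint the keepalive loops are never started, so the
        # only way to recover from a DOWN backend is to keep retrying.
        if len(nsx_api_managers) == 1 and not self.cluster_unavailable_retry:
            LOG.warning("Single endpoint configured; forcing "
                        "cluster_unavailable_retry=True")
            self.cluster_unavailable_retry = True


single = SingleEndpointRetryConfig(["192.168.0.10"])
assert single.cluster_unavailable_retry is True

multi = SingleEndpointRetryConfig(["192.168.0.10", "192.168.0.11"])
assert multi.cluster_unavailable_retry is False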
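
The cluster.py keepalive hunk also throttles its debug output with a simple modulo counter. The sketch below (hypothetical names, no vmware_nsxlib imports) shows the same pattern in isolation: only every tenth probe emits a log line.

import logging

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)

PROBE_LOG_PERIOD = 10   # one debug line per 10 keepalive probes


class EndpointProbe(object):
    # Hypothetical stand-in for the ClusteredAPI keepalive counter.
    def __init__(self):
        self._print_keepalive = 0

    def probe(self, endpoint):
        if self._print_keepalive % PROBE_LOG_PERIOD == 0:
            LOG.debug("Running keepalive probe for cluster endpoint '%s'",
                      endpoint)
        self._print_keepalive += 1
        # the real method would call self._validate(endpoint) here


prober = EndpointProbe()
for _ in range(25):
    prober.probe("192.168.0.10")   # logs on probes 0, 10 and 20 only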