From 1b48b1f592cc6a5aee64d45ac5d325adc9128c79 Mon Sep 17 00:00:00 2001 From: Durgesh Rane Date: Wed, 23 Jan 2019 17:47:26 -0800 Subject: [PATCH] Added retries if API call fails due to MP cluster reconfig API calls can fail with corfudb exception if the MP cluster is reconfiguring. When this exception was encountered, MP recovered in a second and subsequent API calls went through. Added a retry with random exponential back-off mechanism if this exception is seen with an API call. Change-Id: I1bb4f432f15b9da025ae204a2f6e7646f268b8f1 --- vmware_nsxlib/tests/unit/v3/test_client.py | 4 ++++ vmware_nsxlib/tests/unit/v3/test_utils.py | 14 ++++++++++++++ vmware_nsxlib/v3/client.py | 19 ++++++++++++------- vmware_nsxlib/v3/exceptions.py | 4 ++++ 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/vmware_nsxlib/tests/unit/v3/test_client.py b/vmware_nsxlib/tests/unit/v3/test_client.py index 9160ce7a..96d66c64 100644 --- a/vmware_nsxlib/tests/unit/v3/test_client.py +++ b/vmware_nsxlib/tests/unit/v3/test_client.py @@ -302,6 +302,10 @@ class NsxV3RESTClientTestCase(nsxlib_testcase.NsxClientTestCase): utils.set_inject_headers_callback(None) self.assertIsNotNone(self.injected) + def test_http_error_to_exception(self): + exc = client.http_error_to_exception(500, 607) + self.assertEqual(exc, nsxlib_exc.APITransactionAborted) + class NsxV3JSONClientTestCase(nsxlib_testcase.NsxClientTestCase): diff --git a/vmware_nsxlib/tests/unit/v3/test_utils.py b/vmware_nsxlib/tests/unit/v3/test_utils.py index 9dfa378c..d2c7b088 100644 --- a/vmware_nsxlib/tests/unit/v3/test_utils.py +++ b/vmware_nsxlib/tests/unit/v3/test_utils.py @@ -291,6 +291,20 @@ class TestNsxV3Utils(nsxlib_testcase.NsxClientTestCase): self.assertRaises(exceptions.NsxLibInvalidInput, func_to_fail, 99) self.assertEqual(max_retries, total_count['val']) + def test_retry_random_tuple(self): + max_retries = 5 + total_count = {'val': 0} + + @utils.retry_random_upon_exception( + (exceptions.NsxLibInvalidInput, 
exceptions.APITransactionAborted), + max_attempts=max_retries) + def func_to_fail(x): + total_count['val'] = total_count['val'] + 1 + raise exceptions.NsxLibInvalidInput(error_message='foo') + + self.assertRaises(exceptions.NsxLibInvalidInput, func_to_fail, 99) + self.assertEqual(max_retries, total_count['val']) + @mock.patch.object(utils, '_update_max_nsgroups_criteria_tags') @mock.patch.object(utils, '_update_max_tags') @mock.patch.object(utils, '_update_tag_length') diff --git a/vmware_nsxlib/v3/client.py b/vmware_nsxlib/v3/client.py index 569ac8d9..816e1791 100644 --- a/vmware_nsxlib/v3/client.py +++ b/vmware_nsxlib/v3/client.py @@ -40,7 +40,8 @@ def http_error_to_exception(status_code, error_code): requests.codes.CONFLICT: exceptions.StaleRevision, requests.codes.PRECONDITION_FAILED: exceptions.StaleRevision, requests.codes.INTERNAL_SERVER_ERROR: - {'99': exceptions.ClientCertificateNotTrusted}, + {'99': exceptions.ClientCertificateNotTrusted, + '607': exceptions.APITransactionAborted}, requests.codes.FORBIDDEN: {'98': exceptions.BadXSRFToken}, requests.codes.TOO_MANY_REQUESTS: exceptions.TooManyRequests, @@ -306,15 +307,19 @@ class NSX3Client(JSONRESTClient): error_code=error_code) def _rest_call(self, url, **kwargs): - if self.rate_limit_retry and kwargs.get('with_retries', True): - # If too many requests are handled by the nsx at the same time, - # error "429: Too Many Requests" or "503: Server Unavailable" - # will be returned. + if kwargs.get('with_retries', True): + # Retry on "607: Persistence layer is currently reconfiguring" + retry_codes = [exceptions.APITransactionAborted] + if self.rate_limit_retry: + # If too many requests are handled by the nsx at the same time, + # error "429: Too Many Requests" or "503: Server Unavailable" + # will be returned. 
+ retry_codes.append(exceptions.ServerBusy) + # the client is expected to retry after a random 400-600 milli, # and later exponentially until 5 seconds wait @utils.retry_random_upon_exception( - exceptions.ServerBusy, - max_attempts=self.max_attempts) + tuple(retry_codes), max_attempts=self.max_attempts) def _rest_call_with_retry(self, url, **kwargs): return super(NSX3Client, self)._rest_call(url, **kwargs) return _rest_call_with_retry(self, url, **kwargs) diff --git a/vmware_nsxlib/v3/exceptions.py b/vmware_nsxlib/v3/exceptions.py index 305b09c2..cfeefba3 100644 --- a/vmware_nsxlib/v3/exceptions.py +++ b/vmware_nsxlib/v3/exceptions.py @@ -159,3 +159,7 @@ class NsxPendingDelete(NsxLibException): message = _("An object with the same name is marked for deletion. Either " "use another path or wait for the purge cycle to permanently " "remove the deleted object") + + +class APITransactionAborted(ServerBusy): + message = _("API transaction aborted as MP cluster is reconfiguring.")