Added retries if API call fails due to MP cluster reconfig

API calls can fail with a CorfuDB exception while the MP cluster is
reconfiguring. When this exception was encountered, the MP recovered
within a second and subsequent API calls went through. Added a retry
with a random exponential back-off mechanism for when this exception
is seen on an API call.

Change-Id: I1bb4f432f15b9da025ae204a2f6e7646f268b8f1
This commit is contained in:
Durgesh Rane 2019-01-23 17:47:26 -08:00 committed by Adit Sarfaty
parent fb84d8721f
commit 1b48b1f592
4 changed files with 34 additions and 7 deletions

View File

@ -302,6 +302,10 @@ class NsxV3RESTClientTestCase(nsxlib_testcase.NsxClientTestCase):
utils.set_inject_headers_callback(None)
self.assertIsNotNone(self.injected)
def test_http_error_to_exception(self):
    """HTTP status 500 with error code 607 maps to APITransactionAborted."""
    status_code, error_code = 500, 607
    mapped = client.http_error_to_exception(status_code, error_code)
    self.assertEqual(mapped, nsxlib_exc.APITransactionAborted)
class NsxV3JSONClientTestCase(nsxlib_testcase.NsxClientTestCase):

View File

@ -291,6 +291,20 @@ class TestNsxV3Utils(nsxlib_testcase.NsxClientTestCase):
self.assertRaises(exceptions.NsxLibInvalidInput, func_to_fail, 99)
self.assertEqual(max_retries, total_count['val'])
def test_retry_random_tuple(self):
    """A tuple of exception types is accepted and all attempts are used."""
    max_retries = 5
    # One entry is recorded per invocation of the decorated function.
    invocations = []
    retriable = (exceptions.NsxLibInvalidInput,
                 exceptions.APITransactionAborted)

    @utils.retry_random_upon_exception(retriable,
                                       max_attempts=max_retries)
    def func_to_fail(x):
        invocations.append(x)
        raise exceptions.NsxLibInvalidInput(error_message='foo')

    # The function always raises, so the exception must surface once the
    # attempts are exhausted.
    self.assertRaises(exceptions.NsxLibInvalidInput, func_to_fail, 99)
    self.assertEqual(max_retries, len(invocations))
@mock.patch.object(utils, '_update_max_nsgroups_criteria_tags')
@mock.patch.object(utils, '_update_max_tags')
@mock.patch.object(utils, '_update_tag_length')

View File

@ -40,7 +40,8 @@ def http_error_to_exception(status_code, error_code):
requests.codes.CONFLICT: exceptions.StaleRevision,
requests.codes.PRECONDITION_FAILED: exceptions.StaleRevision,
requests.codes.INTERNAL_SERVER_ERROR:
{'99': exceptions.ClientCertificateNotTrusted},
{'99': exceptions.ClientCertificateNotTrusted,
'607': exceptions.APITransactionAborted},
requests.codes.FORBIDDEN:
{'98': exceptions.BadXSRFToken},
requests.codes.TOO_MANY_REQUESTS: exceptions.TooManyRequests,
@ -306,15 +307,19 @@ class NSX3Client(JSONRESTClient):
error_code=error_code)
def _rest_call(self, url, **kwargs):
if self.rate_limit_retry and kwargs.get('with_retries', True):
# If too many requests are handled by the nsx at the same time,
# error "429: Too Many Requests" or "503: Server Unavailable"
# will be returned.
if kwargs.get('with_retries', True):
# Retry on "607: Persistence layer is currently reconfiguring"
retry_codes = [exceptions.APITransactionAborted]
if self.rate_limit_retry:
# If too many requests are handled by the nsx at the same time,
# error "429: Too Many Requests" or "503: Server Unavailable"
# will be returned.
retry_codes.append(exceptions.ServerBusy)
# the client is expected to retry after a random 400-600 milli,
# and later exponentially until 5 seconds wait
@utils.retry_random_upon_exception(
exceptions.ServerBusy,
max_attempts=self.max_attempts)
tuple(retry_codes), max_attempts=self.max_attempts)
def _rest_call_with_retry(self, url, **kwargs):
return super(NSX3Client, self)._rest_call(url, **kwargs)
return _rest_call_with_retry(self, url, **kwargs)

View File

@ -159,3 +159,7 @@ class NsxPendingDelete(NsxLibException):
message = _("An object with the same name is marked for deletion. Either "
"use another path or wait for the purge cycle to permanently "
"remove the deleted object")
class APITransactionAborted(ServerBusy):
    """API transaction aborted because the MP cluster is reconfiguring.

    Subclasses ServerBusy, so code that handles ServerBusy also handles
    this exception.
    """

    message = _(
        "API transaction aborted as MP cluster is reconfiguring."
    )