make DataSourceEc2 more resilient to slow metadata service (LP: #894279)

This increases the timeout for a metadata request to something that should
be easily satisfiable (50 seconds).  But hopefully does so while still keeping
the case of no-metadata service in mind.

Previously, there was a small timeout and many retries (30) would be done.
Now,
 - larger timeout (50 seconds) by default
 - retry until a given "max_wait" is reached (120 seconds default)

The end result is that if we're hitting the timeout, there will only end up
being a couple attempts made.  But if the requests are coming back quickly
then we'll still make several attempts.

There is one EC2 DataSource config change: 'retries' is no longer used; instead,
'max_wait' indicates generally how long the datasource should keep trying to
find a metadata service.
This commit is contained in:
Scott Moser 2011-12-19 12:00:48 -05:00
parent d88b22f965
commit 7445b03f51
2 changed files with 117 additions and 61 deletions

View File

@ -48,8 +48,10 @@ class DataSourceEc2(DataSource.DataSource):
try:
if not self.wait_for_metadata_service():
return False
start = time.time()
self.userdata_raw = boto_utils.get_instance_userdata(self.api_ver, None, self.metadata_address)
self.metadata = boto_utils.get_instance_metadata(self.api_ver, self.metadata_address)
log.debug("crawl of metadata service took %ds" % (time.time()-start))
return True
except Exception as e:
print e
@ -81,37 +83,31 @@ class DataSourceEc2(DataSource.DataSource):
except:
return fallback
def wait_for_metadata_service(self, sleeps = None):
def wait_for_metadata_service(self):
mcfg = self.ds_cfg
if sleeps is None:
sleeps = 30
try:
sleeps = int(mcfg.get("retries",sleeps))
except Exception as e:
util.logexc(log)
log.warn("Failed to get number of sleeps, using %s" % sleeps)
if sleeps == 0: return False
if not hasattr(mcfg, "get"):
mcfg = {}
timeout=3
max_wait = 120
try:
max_wait = int(mcfg.get("max_wait",max_wait))
except Exception as e:
util.logexc(log)
log.warn("Failed to get max wait. using %s" % max_wait)
if max_wait == 0:
return False
timeout = 50
try:
timeout = int(mcfg.get("timeout",timeout))
except Exception as e:
util.logexc(log)
log.warn("Failed to get timeout, using %s" % timeout)
sleeptime = 1
def_mdurls = ["http://169.254.169.254", "http://instance-data:8773"]
try:
mdurls = mcfg.get("metadata_urls", def_mdurls)
except Exception as e:
mdurls = def_mdurls
util.logexc(log)
log.warn("Failed to get metadata URLs, using defaults")
starttime = time.time()
mdurls = mcfg.get("metadata_urls", def_mdurls)
# Remove addresses from the list that wont resolve.
filtered = [x for x in mdurls if util.is_resolvable_url(x)]
@ -126,41 +122,25 @@ class DataSourceEc2(DataSource.DataSource):
log.warn("Empty metadata url list! using default list")
mdurls = def_mdurls
log.debug("Searching the following metadata urls: %s" % mdurls)
urls = [ ]
url2base = { False: False }
for url in mdurls:
cur = "%s/%s/meta-data/instance-id" % (url, self.api_ver)
urls.append(cur)
url2base[cur] = url
for x in range(sleeps):
for url in mdurls:
iurl="%s/%s/meta-data/instance-id" % (url, self.api_ver)
starttime = time.time()
url = wait_for_metadata_service(urls=urls, max_wait=max_wait,
timeout=timeout, status_cb=log.warn)
# given 100 sleeps, this ends up total sleep time of 1050 sec
sleeptime=int(x/5)+1
if url:
log.debug("Using metadata source: '%s'" % url2base[url])
else:
log.critical("giving up on md after %i seconds\n" %
int(time.time()-starttime))
reason = ""
try:
req = urllib2.Request(iurl)
resp = urllib2.urlopen(req, timeout=timeout)
if resp.read() != "":
self.metadata_address = url
log.debug("Using metadata source: '%s'" % url)
return True
reason = "empty data [%s]" % resp.getcode()
except urllib2.HTTPError as e:
reason = "http error [%s]" % e.code
except urllib2.URLError as e:
reason = "url error [%s]" % e.reason
except socket.timeout as e:
reason = "socket timeout [%s]" % e
#not needed? Addresses being checked are displayed above
#if x == 0:
# log.warn("waiting for metadata service at %s" % url)
log.warn("'%s' failed: %s" % (url, reason))
time.sleep(sleeptime)
log.critical("giving up on md after %i seconds\n" %
int(time.time()-starttime))
return False
self.metadata_address = url2base[url]
return (bool(url))
def device_name_to_device(self, name):
# consult metadata service, that has
@ -221,6 +201,84 @@ class DataSourceEc2(DataSource.DataSource):
return True
return False
def wait_for_metadata_service(urls, max_wait=None, timeout=None, status_cb=None):
    """Wait for one of a list of metadata-service urls to respond.

    urls: a list of urls to try
    max_wait: roughly the maximum time to wait before giving up
        The max time is *actually* len(urls)*timeout as each url will
        be tried once and given the timeout provided.
    timeout: the timeout provided to urllib2.urlopen
    status_cb: call method with string message when a url is not available

    Returns the first url that answered with non-empty data, or False if
    max_wait was exhausted (or was None / <= 0) without success.

    The idea of this routine is to wait for the EC2 metadata service to
    come up.  On both Eucalyptus and EC2 we have seen the case where
    the instance hit the MD before the MD service was up.  EC2 seems
    to have permanently fixed this, though.

    In OpenStack, the metadata service might be painfully slow, and
    unable to avoid hitting a timeout of even up to 10 seconds or more
    (LP: #894279) for a simple GET.

    Offset those needs with the need to not hang forever (and block boot)
    on a system where cloud-init is configured to look for EC2 Metadata
    service but is not going to find one.  It is possible that the instance
    data host (169.254.169.254) may be firewalled off entirely for a system,
    meaning that the connection will block forever unless a timeout is set.
    """
    starttime = time.time()

    if status_cb is None:
        # default to a no-op so the reporting below never needs a guard
        def status_cb(msg):
            return

    def timeup(max_wait, starttime):
        # "time is up" when there is no wait budget at all (None or <= 0),
        # or when the budget has been spent.  Check None first so the
        # numeric comparison is never attempted on None.
        return ((max_wait is None or max_wait <= 0) or
                (time.time() - starttime > max_wait))

    loop_n = 0
    while True:
        # back off slowly: 1s sleeps for the first 5 loops, 2s for the
        # next 5, and so on
        sleeptime = int(loop_n / 5) + 1
        for url in urls:
            now = time.time()
            if loop_n != 0:
                if timeup(max_wait, starttime):
                    break
                # only shorten the per-request timeout when there is a
                # finite max_wait to respect (arithmetic on None would raise)
                if (max_wait is not None and timeout and
                        (now + timeout > (starttime + max_wait))):
                    # shorten timeout to not run way over max_wait
                    timeout = int((starttime + max_wait) - now)

            reason = ""
            try:
                req = urllib2.Request(url)
                resp = urllib2.urlopen(req, timeout=timeout)
                if resp.read() != "":
                    return url
                reason = "empty data [%s]" % resp.getcode()
            except urllib2.HTTPError as e:
                reason = "http error [%s]" % e.code
            except urllib2.URLError as e:
                reason = "url error [%s]" % e.reason
            except socket.timeout as e:
                reason = "socket timeout [%s]" % e
            except Exception as e:
                reason = "unexpected error [%s]" % e

            # status_cb is always callable here (no-op by default), so
            # report unconditionally rather than guarding on a module
            # global as before
            status_cb("'%s' failed [%s/%ss]: %s" %
                      (url, int(time.time() - starttime), max_wait, reason))

        if timeup(max_wait, starttime):
            break

        loop_n = loop_n + 1
        time.sleep(sleeptime)

    return False
# The datasource constructors this module provides, each paired with the
# dependency set that must be satisfied before it is attempted: EC2 needs
# both a filesystem and networking to reach the metadata service.
datasources = [
( DataSourceEc2, ( DataSource.DEP_FILESYSTEM , DataSource.DEP_NETWORK ) ),
]

View File

@ -2,16 +2,14 @@
datasource:
# Ec2
Ec2:
# timeout: the timeout value for attempt at metadata service
timeout : 2
# the number of tries that should be attempted at the metadata service
# after each try, a sleep of int(try_number/5)+1 is done
# default is 30
retries : 30
# timeout: the timeout value for a request at metadata service
timeout : 50
# The length in seconds to wait before giving up on the metadata
# service. The actual total wait could be up to
# len(resolvable_metadata_urls)*timeout
max_wait : 120
#metadata_urls: a list of URLs to check for metadata services
metadata_urls:
- http://169.254.169.254:80
- http://instance-data:8773