make DataSourceEc2 more resilient to slow metadata service (LP: #894279)

This increases the timeout for a metadata request to something that should
be easily satisfiable (50 seconds).  But hopefully does so while still keeping
the case of no-metadata service in mind.

Previously, there was a small timeout and many retries (30) would be done.
Now,
 - larger timeout (50 seconds) by default
 - retry until a given "max_wait" is reached (120 seconds default)

The end result is that if we're hitting the timeout, there will only end up
being a couple attempts made.  But if the requests are coming back quickly
then we'll still make several attempts.

There is one EC2 DataSource config change: 'retries' is no longer used; instead,
'max_wait' indicates generally how long the datasource should keep trying to
find a metadata service.
This commit is contained in:
Scott Moser 2011-12-19 12:00:48 -05:00
parent d88b22f965
commit 7445b03f51
2 changed files with 117 additions and 61 deletions

View File

@ -48,8 +48,10 @@ class DataSourceEc2(DataSource.DataSource):
try:
if not self.wait_for_metadata_service():
return False
start = time.time()
self.userdata_raw = boto_utils.get_instance_userdata(self.api_ver, None, self.metadata_address)
self.metadata = boto_utils.get_instance_metadata(self.api_ver, self.metadata_address)
log.debug("crawl of metadata service took %ds" % (time.time()-start))
return True
except Exception as e:
print e
@ -81,37 +83,31 @@ class DataSourceEc2(DataSource.DataSource):
except:
return fallback
def wait_for_metadata_service(self, sleeps = None):
def wait_for_metadata_service(self):
mcfg = self.ds_cfg
if sleeps is None:
sleeps = 30
try:
sleeps = int(mcfg.get("retries",sleeps))
except Exception as e:
util.logexc(log)
log.warn("Failed to get number of sleeps, using %s" % sleeps)
if sleeps == 0: return False
if not hasattr(mcfg, "get"):
mcfg = {}
timeout=3
max_wait = 120
try:
max_wait = int(mcfg.get("max_wait",max_wait))
except Exception as e:
util.logexc(log)
log.warn("Failed to get max wait. using %s" % max_wait)
if max_wait == 0:
return False
timeout = 50
try:
timeout = int(mcfg.get("timeout",timeout))
except Exception as e:
util.logexc(log)
log.warn("Failed to get timeout, using %s" % timeout)
sleeptime = 1
def_mdurls = ["http://169.254.169.254", "http://instance-data:8773"]
try:
mdurls = mcfg.get("metadata_urls", def_mdurls)
except Exception as e:
mdurls = def_mdurls
util.logexc(log)
log.warn("Failed to get metadata URLs, using defaults")
starttime = time.time()
mdurls = mcfg.get("metadata_urls", def_mdurls)
# Remove addresses from the list that wont resolve.
filtered = [x for x in mdurls if util.is_resolvable_url(x)]
@ -126,41 +122,25 @@ class DataSourceEc2(DataSource.DataSource):
log.warn("Empty metadata url list! using default list")
mdurls = def_mdurls
log.debug("Searching the following metadata urls: %s" % mdurls)
urls = [ ]
url2base = { False: False }
for url in mdurls:
cur = "%s/%s/meta-data/instance-id" % (url, self.api_ver)
urls.append(cur)
url2base[cur] = url
for x in range(sleeps):
for url in mdurls:
iurl="%s/%s/meta-data/instance-id" % (url, self.api_ver)
starttime = time.time()
url = wait_for_metadata_service(urls=urls, max_wait=max_wait,
timeout=timeout, status_cb=log.warn)
# given 100 sleeps, this ends up total sleep time of 1050 sec
sleeptime=int(x/5)+1
if url:
log.debug("Using metadata source: '%s'" % url2base[url])
else:
log.critical("giving up on md after %i seconds\n" %
int(time.time()-starttime))
reason = ""
try:
req = urllib2.Request(iurl)
resp = urllib2.urlopen(req, timeout=timeout)
if resp.read() != "":
self.metadata_address = url
log.debug("Using metadata source: '%s'" % url)
return True
reason = "empty data [%s]" % resp.getcode()
except urllib2.HTTPError as e:
reason = "http error [%s]" % e.code
except urllib2.URLError as e:
reason = "url error [%s]" % e.reason
except socket.timeout as e:
reason = "socket timeout [%s]" % e
#not needed? Addresses being checked are displayed above
#if x == 0:
# log.warn("waiting for metadata service at %s" % url)
log.warn("'%s' failed: %s" % (url, reason))
time.sleep(sleeptime)
log.critical("giving up on md after %i seconds\n" %
int(time.time()-starttime))
return False
self.metadata_address = url2base[url]
return (bool(url))
def device_name_to_device(self, name):
# consult metadata service, that has
@ -221,6 +201,84 @@ class DataSourceEc2(DataSource.DataSource):
return True
return False
def wait_for_metadata_service(urls, max_wait=None, timeout=None, status_cb=None):
    """Wait for one of a list of metadata-service urls to respond.

    urls: a list of urls to try
    max_wait: roughly the maximum time to wait before giving up
        The max time is *actually* len(urls)*timeout as each url will
        be tried once and given the timeout provided.
    timeout: the timeout provided to urllib2.urlopen
    status_cb: call method with string message when a url is not available

    Returns the first url that answered with non-empty data, or False if
    max_wait was exhausted (or was None / <= 0) without success.

    The idea of this routine is to wait for the EC2 metadata service to
    come up.  On both Eucalyptus and EC2 we have seen the case where
    the instance hit the MD before the MD service was up.  EC2 seems
    to have permanently fixed this, though.

    In OpenStack, the metadata service might be painfully slow, and
    unable to avoid hitting a timeout of even up to 10 seconds or more
    (LP: #894279) for a simple GET.

    Offset those needs with the need to not hang forever (and block boot)
    on a system where cloud-init is configured to look for EC2 Metadata
    service but is not going to find one.  It is possible that the instance
    data host (169.254.169.254) may be firewalled off entirely for a system,
    meaning that the connection will block forever unless a timeout is set.
    """
    starttime = time.time()

    if status_cb is None:
        # default to a no-op so the reporting below never needs a guard
        def status_cb(msg):
            return

    def timeup(max_wait, starttime):
        # "time is up" when there is no wait budget at all (None or <= 0),
        # or when the budget has been spent.  Check None first so the
        # numeric comparison is never attempted on None.
        return ((max_wait is None or max_wait <= 0) or
                (time.time() - starttime > max_wait))

    loop_n = 0
    while True:
        # back off slowly: 1s sleeps for the first 5 loops, 2s for the
        # next 5, and so on
        sleeptime = int(loop_n / 5) + 1
        for url in urls:
            now = time.time()
            if loop_n != 0:
                if timeup(max_wait, starttime):
                    break
                # only shorten the per-request timeout when there is a
                # finite max_wait to respect (arithmetic on None would raise)
                if (max_wait is not None and timeout and
                        (now + timeout > (starttime + max_wait))):
                    # shorten timeout to not run way over max_wait
                    timeout = int((starttime + max_wait) - now)

            reason = ""
            try:
                req = urllib2.Request(url)
                resp = urllib2.urlopen(req, timeout=timeout)
                if resp.read() != "":
                    return url
                reason = "empty data [%s]" % resp.getcode()
            except urllib2.HTTPError as e:
                reason = "http error [%s]" % e.code
            except urllib2.URLError as e:
                reason = "url error [%s]" % e.reason
            except socket.timeout as e:
                reason = "socket timeout [%s]" % e
            except Exception as e:
                reason = "unexpected error [%s]" % e

            # status_cb is always callable here (no-op by default), so
            # report unconditionally rather than guarding on a module
            # global as before
            status_cb("'%s' failed [%s/%ss]: %s" %
                      (url, int(time.time() - starttime), max_wait, reason))

        if timeup(max_wait, starttime):
            break

        loop_n = loop_n + 1
        time.sleep(sleeptime)

    return False
# The datasource constructors this module provides, each paired with the
# dependency set that must be satisfied before it is attempted: EC2 needs
# both a filesystem and networking to reach the metadata service.
datasources = [
( DataSourceEc2, ( DataSource.DEP_FILESYSTEM , DataSource.DEP_NETWORK ) ),
]

View File

@ -2,16 +2,14 @@
datasource:
# Ec2
Ec2:
# timeout: the timeout value for attempt at metadata service
timeout : 2
# the number of tries that should be attempted at the metadata service
# after each try, a sleep of int(try_number/5)+1 is done
# default is 30
retries : 30
# timeout: the timeout value for a request at metadata service
timeout : 50
# The length in seconds to wait before giving up on the metadata
# service. The actual total wait could be up to
# len(resolvable_metadata_urls)*timeout
max_wait : 120
#metadata_urls: a list of URLs to check for metadata services
metadata_urls:
- http://169.254.169.254:80
- http://instance-data:8773