From: Zack Cerza Date: Mon, 6 Feb 2017 21:16:13 +0000 (-0700) Subject: cloud: Retry failed requests in libcloud X-Git-Tag: 1.1.0~456^2~6 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=b0956d71de5c309023b186631214231e6a09e356;p=teuthology.git cloud: Retry failed requests in libcloud It's common to see "429 Rate limit exceeded", at least with OVH. When we encounter the exception associated with that exception, backoff and retry for an interval before eventually giving up. Signed-off-by: Zack Cerza --- diff --git a/teuthology/provision/cloud/openstack.py b/teuthology/provision/cloud/openstack.py index 058b56d09e..36d4188e3b 100644 --- a/teuthology/provision/cloud/openstack.py +++ b/teuthology/provision/cloud/openstack.py @@ -6,6 +6,7 @@ import urllib import yaml from copy import deepcopy +from libcloud.common.exceptions import RateLimitReachedError from paramiko import AuthenticationException from paramiko.ssh_exception import NoValidConnectionsError @@ -21,26 +22,51 @@ from teuthology.provision.cloud.base import Provider log = logging.getLogger(__name__) +RETRY_EXCEPTIONS = (RateLimitReachedError, ) + + +def retry(function, *args, **kwargs): + """ + Call a function (returning its results), retrying if any of the exceptions + in RETRY_EXCEPTIONS are raised + """ + with safe_while(sleep=1, tries=24, increment=1) as proceed: + tries = 0 + while proceed(): + tries += 1 + try: + result = function(*args, **kwargs) + if tries > 1: + log.debug( + "'%s' succeeded after %s tries", + function.__name__, + tries, + ) + return result + except RETRY_EXCEPTIONS: + pass + + class OpenStackProvider(Provider): _driver_posargs = ['username', 'password'] @property def images(self): if not hasattr(self, '_images'): - self._images = self.driver.list_images() + self._images = retry(self.driver.list_images) return self._images @property def sizes(self): if not hasattr(self, '_sizes'): - self._sizes = self.driver.list_sizes() + self._sizes = retry(self.driver.list_sizes) return self._sizes @property def networks(self): if not hasattr(self, '_networks'): try: - self._networks = self.driver.ex_list_networks() + self._networks = retry(self.driver.ex_list_networks) except AttributeError: log.warn("Unable to list networks for %s", self.driver) self._networks = list() @@ -50,7 +76,9 @@ class OpenStackProvider(Provider): def security_groups(self): if not hasattr(self, '_security_groups'): try: - self._security_groups = self.driver.ex_list_security_groups() + self._security_groups = retry( + self.driver.ex_list_security_groups + ) except AttributeError: log.warn("Unable to list security groups for %s", self.driver) self._security_groups = list() @@ -134,11 +162,13 @@ class OpenStackProvisioner(base.Provisioner): security_groups = self.security_groups if security_groups: create_args['ex_security_groups'] = security_groups - self._node = self.provider.driver.create_node( + self._node = retry( + self.provider.driver.create_node, **create_args ) log.debug("Created node: %s", self.node) - results = self.provider.driver.wait_until_running( + results = retry( + self.provider.driver.wait_until_running, nodes=[self.node], ) self._node, self.ips = results[0] @@ -159,12 +189,14 @@ class OpenStackProvisioner(base.Provisioner): for i in range(vol_count)] try: for name in vol_names: - volume = self.provider.driver.create_volume( + volume = retry( + self.provider.driver.create_volume, vol_size, name, ) log.info("Created volume %s", volume) - self.provider.driver.attach_volume( + retry( + self.provider.driver.attach_volume, self.node, volume, device=None, @@ -176,16 +208,16 @@ class OpenStackProvisioner(base.Provisioner): return True def _destroy_volumes(self): - all_volumes = self.provider.driver.list_volumes() + all_volumes = retry(self.provider.driver.list_volumes) our_volumes = [vol for vol in all_volumes if vol.name.startswith("%s_" % self.name)] for vol in our_volumes: try: - self.provider.driver.detach_volume(vol) + retry(self.provider.driver.detach_volume, vol) except Exception: log.exception("Could not detach volume %s", vol) try: - self.provider.driver.destroy_volume(vol) + retry(self.provider.driver.destroy_volume, vol) except Exception: log.exception("Could not destroy volume %s", vol) @@ -306,7 +338,7 @@ class OpenStackProvisioner(base.Provisioner): @property def node(self): if not hasattr(self, '_node'): - nodes = self.provider.driver.list_nodes() + nodes = retry(self.provider.driver.list_nodes) for node in nodes: matches = [node for node in nodes if node.name == self.name] msg = "Unknown error locating %s" diff --git a/teuthology/provision/cloud/test/test_openstack.py b/teuthology/provision/cloud/test/test_openstack.py index f6f7ccb7fa..1b0f50f2f3 100644 --- a/teuthology/provision/cloud/test/test_openstack.py +++ b/teuthology/provision/cloud/test/test_openstack.py @@ -25,6 +25,34 @@ test_config = dict( ) +@patch('time.sleep') +def test_retry(m_sleep): + orig_exceptions = cloud.openstack.RETRY_EXCEPTIONS + new_exceptions = orig_exceptions + (RuntimeError, ) + + class test_cls(object): + def __init__(self, min_val): + self.min_val = min_val + self.cur_val = 0 + + def func(self): + self.cur_val += 1 + if self.cur_val < self.min_val: + raise RuntimeError + return self.cur_val + + with patch.object( + cloud.openstack, + 'RETRY_EXCEPTIONS', + new=new_exceptions, + ): + test_obj = test_cls(min_val=5) + assert cloud.openstack.retry(test_obj.func) == 5 + test_obj = test_cls(min_val=1000) + with raises(MaxWhileTries): + cloud.openstack.retry(test_obj.func) + + def get_fake_obj(mock_args=None, attributes=None): if mock_args is None: mock_args = dict()