git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
cephadm: Retry pull on transient error
author: Sebastian Wagner <sebastian.wagner@suse.com>
Tue, 30 Jun 2020 09:26:58 +0000 (11:26 +0200)
committer: Sebastian Wagner <sebastian.wagner@suse.com>
Thu, 23 Jul 2020 13:20:10 +0000 (15:20 +0200)
Fixes: https://tracker.ceph.com/issues/46271
Signed-off-by: Sebastian Wagner <sebastian.wagner@suse.com>
(cherry picked from commit f3e770f3541c716b4f568cf73255f25243c655e5)

src/cephadm/cephadm

index dd4c3c7a74f08a35e011549a680b3f3bde0e084c..2029b8acdf7efe8fc802ca9a38f023f87a2bbb6a 100755 (executable)
@@ -2389,10 +2389,35 @@ def command_version():
 @infer_image
 def command_pull():
     # type: () -> int
-    logger.info('Pulling latest %s...' % args.image)
-    call_throws([container_path, 'pull', args.image])
+
+    _pull_image(args.image)
     return command_inspect_image()
 
+
+def _pull_image(image):
+    # type: (str) -> None
+    """Pull `image`, retrying when stderr shows a known transient error.
+
+    Raises RuntimeError on any other failure or when retries run out.
+    """
+    logger.info('Pulling container image %s...' % image)
+    # stderr substrings that mark a failed pull as worth retrying
+    ignorelist = [
+        "error creating read-write layer with ID",
+        "net/http: TLS handshake timeout",
+    ]
+    cmd = [container_path, 'pull', image]
+    cmd_str = ' '.join(cmd)
+
+    for sleep_secs in [1, 4, 25]:  # seconds to wait after each failed attempt
+        out, err, ret = call(cmd)
+        if not ret:
+            return
+        if not any(pattern in err for pattern in ignorelist):
+            raise RuntimeError('Failed command: %s' % cmd_str)
+        logger.info('"%s" failed transiently. Retrying. waiting %s seconds...' % (cmd_str, sleep_secs))
+        time.sleep(sleep_secs)
+    raise RuntimeError('Failed command: %s: maximum retries reached' % cmd_str)
 ##################################
 
 
@@ -2538,8 +2563,7 @@ def command_bootstrap():
     config = cpf.getvalue()
 
     if not args.skip_pull:
-        logger.info('Pulling latest %s container...' % args.image)
-        call_throws([container_path, 'pull', args.image])
+        _pull_image(args.image)
 
     logger.info('Extracting ceph user uid/gid from container image...')
     (uid, gid) = extract_uid_gid()
@@ -3466,8 +3490,7 @@ def command_adopt():
     # type: () -> None
 
     if not args.skip_pull:
-        logger.info('Pulling latest %s container...' % args.image)
-        call_throws([container_path, 'pull', args.image])
+        _pull_image(args.image)
 
     (daemon_type, daemon_id) = args.name.split('.', 1)