git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
cephadm: Retry pull on transient error
author: Sebastian Wagner <sebastian.wagner@suse.com>
Tue, 30 Jun 2020 09:26:58 +0000 (11:26 +0200)
committer: Sebastian Wagner <sebastian.wagner@suse.com>
Thu, 2 Jul 2020 14:07:01 +0000 (16:07 +0200)
Fixes: https://tracker.ceph.com/issues/46271
Signed-off-by: Sebastian Wagner <sebastian.wagner@suse.com>
src/cephadm/cephadm

index 1a62a61f639968d5d8bef30f0bfcaf28aa5dc381..dd2c6b826283add5b629162940e4e23b1f2aedb5 100755 (executable)
@@ -2293,10 +2293,35 @@ def command_version():
 @infer_image
 def command_pull():
     # type: () -> int
-    logger.info('Pulling latest %s...' % args.image)
-    call_throws([container_path, 'pull', args.image])
+
+    _pull_image(args.image)
     return command_inspect_image()
 
+
+def _pull_image(image):
+    # type: (str) -> None
+    logger.info('Pulling container image %s...' % image)
+
+    ignorelist = [
+        "error creating read-write layer with ID",
+        "net/http: TLS handshake timeout",
+    ]
+
+    cmd = [container_path, 'pull', image]
+    cmd_str = ' '.join(cmd)
+
+    for sleep_secs in [1, 4, 25]:
+        out, err, ret = call(cmd)
+        if not ret:
+            return
+
+        if not any(pattern in err for pattern in ignorelist):
+            raise RuntimeError('Failed command: %s' % cmd_str)
+
+        logger.info('"%s failed transiently. Retrying. waiting %s seconds...' % (cmd_str, sleep_secs))
+        time.sleep(sleep_secs)
+
+    raise RuntimeError('Failed command: %s: maximum retries reached' % cmd_str)
 ##################################
 
 @infer_image
@@ -2439,8 +2464,7 @@ def command_bootstrap():
     config = cpf.getvalue()
 
     if not args.skip_pull:
-        logger.info('Pulling latest %s container...' % args.image)
-        call_throws([container_path, 'pull', args.image])
+        _pull_image(args.image)
 
     logger.info('Extracting ceph user uid/gid from container image...')
     (uid, gid) = extract_uid_gid()
@@ -3317,8 +3341,7 @@ def command_adopt():
     # type: () -> None
 
     if not args.skip_pull:
-        logger.info('Pulling latest %s container...' % args.image)
-        call_throws([container_path, 'pull', args.image])
+        _pull_image(args.image)
 
     (daemon_type, daemon_id) = args.name.split('.', 1)