From: Sebastian Wagner
Date: Tue, 30 Jun 2020 09:26:58 +0000 (+0200)
Subject: cephadm: Retry pull on transient error
X-Git-Tag: v15.2.5~105^2~6
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c8f2579f1a92f6e5a49a84d80d1e305e5624dc84;p=ceph.git

cephadm: Retry pull on transient error

Fixes: https://tracker.ceph.com/issues/46271
Signed-off-by: Sebastian Wagner
(cherry picked from commit f3e770f3541c716b4f568cf73255f25243c655e5)
---

diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm
index dd4c3c7a74f..2029b8acdf7 100755
--- a/src/cephadm/cephadm
+++ b/src/cephadm/cephadm
@@ -2389,10 +2389,48 @@ def command_version():
 @infer_image
 def command_pull():
     # type: () -> int
-    logger.info('Pulling latest %s...' % args.image)
-    call_throws([container_path, 'pull', args.image])
+
+    _pull_image(args.image)
     return command_inspect_image()
 
+
+def _pull_image(image):
+    # type: (str) -> None
+    """Pull a container image, retrying on known transient errors.
+
+    The pull command is run up to three times. A failed run is retried
+    only if the ``err`` output returned by ``call`` contains one of the
+    patterns in ``ignorelist``; the function sleeps 1, 4 and 25 seconds
+    after the first, second and third such failure respectively.
+
+    :param image: name of the container image to pull
+    :raises RuntimeError: if a run fails with an error that matches no
+        pattern, or if all three runs fail
+    """
+    logger.info('Pulling container image %s...' % image)
+
+    # Error fragments that mark a failed pull as transient, i.e. retryable.
+    ignorelist = [
+        "error creating read-write layer with ID",
+        "net/http: TLS handshake timeout",
+    ]
+
+    cmd = [container_path, 'pull', image]
+    cmd_str = ' '.join(cmd)
+
+    for sleep_secs in [1, 4, 25]:
+        out, err, ret = call(cmd)
+        if not ret:
+            return
+
+        # Anything not matching a known transient pattern fails immediately.
+        if not any(pattern in err for pattern in ignorelist):
+            raise RuntimeError('Failed command: %s' % cmd_str)
+
+        logger.info('"%s" failed transiently. Retrying. waiting %s seconds...' % (cmd_str, sleep_secs))
+        time.sleep(sleep_secs)
+
+    raise RuntimeError('Failed command: %s: maximum retries reached' % cmd_str)
 
 ##################################
 
@@ -2538,8 +2576,7 @@ def command_bootstrap():
         config = cpf.getvalue()
 
     if not args.skip_pull:
-        logger.info('Pulling latest %s container...' % args.image)
-        call_throws([container_path, 'pull', args.image])
+        _pull_image(args.image)
 
     logger.info('Extracting ceph user uid/gid from container image...')
     (uid, gid) = extract_uid_gid()
@@ -3466,8 +3503,7 @@ def command_adopt():
     # type: () -> None
 
     if not args.skip_pull:
-        logger.info('Pulling latest %s container...' % args.image)
-        call_throws([container_path, 'pull', args.image])
+        _pull_image(args.image)
 
     (daemon_type, daemon_id) = args.name.split('.', 1)
 