From: Sébastien Han Date: Tue, 18 Aug 2020 13:41:31 +0000 (+0200) Subject: ceph-volume: retry when acquiring lock fails X-Git-Tag: v15.2.8~12^2~2^2~37^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=c040cefa25bbb155bae69ae854e683df1c97e002;p=ceph.git ceph-volume: retry when acquiring lock fails When preparing the osd device with --mkfs, the ceph-osd binary tries to acquire an exclusive lock on the device (soon to become an OSD). Unfortunately, when running in containers, we have seen cases where there is a race between ceph-osd and systemd-udevd to acquire a lock on the device. Sometimes systemd-udevd gets the lock and releases it soon enough that ceph-osd can acquire it; at other times the lock is still held and, because ceph-osd uses LOCK_NB, the command fails. This commit retries if the lock cannot be acquired, up to 5 times (waiting 1 second between attempts, about 5 seconds in total); this should be more than enough to acquire the lock and proceed with the OSD mkfs. Unfortunately, the contention is so transient that locking earlier from ceph-volume (c-v) would not help. Fixes: https://tracker.ceph.com/issues/47010 Signed-off-by: Sébastien Han (cherry picked from commit a285cd08921f51ef1101950eb6b9addcdfbecc35) --- diff --git a/src/ceph-volume/ceph_volume/util/prepare.py b/src/ceph-volume/ceph_volume/util/prepare.py index 85c3480a95094..884112ae7fe50 100644 --- a/src/ceph-volume/ceph_volume/util/prepare.py +++ b/src/ceph-volume/ceph_volume/util/prepare.py @@ -4,9 +4,11 @@ but also a compounded ("single call") helper to do them in order. 
Some plugins may want to change some part of the process, while others might want to consume the single-call helper """ +import errno import os import logging import json +import time from ceph_volume import process, conf, __release__, terminal from ceph_volume.util import system, constants, str_to_int, disk @@ -458,9 +460,23 @@ def osd_mkfs_bluestore(osd_id, fsid, keyring=None, wal=False, db=False): command = base_command + supplementary_command - _, _, returncode = process.call(command, stdin=keyring, show_command=True) - if returncode != 0: - raise RuntimeError('Command failed with exit code %s: %s' % (returncode, ' '.join(command))) + """ + When running in containers the --mkfs on raw device sometimes fails + to acquire a lock through flock() on the device because systemd-udevd holds one temporarily. + See KernelDevice.cc and _lock() to understand how ceph-osd acquires the lock. + Because this is really transient, we retry up to 5 times and wait for 1 sec in-between + """ + for retry in range(5): + _, _, returncode = process.call(command, stdin=keyring, terminal_verbose=True, show_command=True) + if returncode == 0: + break + else: + if returncode == errno.EWOULDBLOCK: + time.sleep(1) + logger.info('disk is held by another process, trying to mkfs again... (%s/5 attempt)' % retry) + continue + else: + raise RuntimeError('Command failed with exit code %s: %s' % (returncode, ' '.join(command))) def osd_mkfs_filestore(osd_id, fsid, keyring):