From 3f9d90d58fc92c6ee466ef663230177b37ee53e2 Mon Sep 17 00:00:00 2001 From: =?utf8?q?S=C3=A9bastien=20Han?= Date: Tue, 18 Aug 2020 15:41:31 +0200 Subject: [PATCH] ceph-volume: retry when acquiring lock fails MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When preparing the osd device with --mkfs, the ceph-osd binary tries to acquire an exclusive lock on the device (soon to become an OSD). Unfortunately, when running in containers, we have seen cases where there is a race between ceph-osd and systemd-udevd to acquire a lock on the device. Sometimes systemd-udevd gets the lock and releases it soon enough that ceph-osd can acquire it; sometimes the lock is still held and, because ceph-osd uses LOCK_NB, the command fails. This commit retries if the lock cannot be acquired, up to 5 times with a 1 second wait between attempts (about 5 seconds in total), this should be more than enough to acquire the lock and proceed with the OSD mkfs. Unfortunately, the contention is so transient that taking the lock earlier from c-v (ceph-volume) would not help. Fixes: https://tracker.ceph.com/issues/47010 Signed-off-by: Sébastien Han (cherry picked from commit a285cd08921f51ef1101950eb6b9addcdfbecc35) --- src/ceph-volume/ceph_volume/util/prepare.py | 22 ++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/ceph-volume/ceph_volume/util/prepare.py b/src/ceph-volume/ceph_volume/util/prepare.py index 85c3480a95094..884112ae7fe50 100644 --- a/src/ceph-volume/ceph_volume/util/prepare.py +++ b/src/ceph-volume/ceph_volume/util/prepare.py @@ -4,9 +4,11 @@ but also a compounded ("single call") helper to do them in order. 
Some plugins may want to change some part of the process, while others might want to consume the single-call helper """ +import errno import os import logging import json +import time from ceph_volume import process, conf, __release__, terminal from ceph_volume.util import system, constants, str_to_int, disk @@ -458,9 +460,23 @@ def osd_mkfs_bluestore(osd_id, fsid, keyring=None, wal=False, db=False): command = base_command + supplementary_command - _, _, returncode = process.call(command, stdin=keyring, show_command=True) - if returncode != 0: - raise RuntimeError('Command failed with exit code %s: %s' % (returncode, ' '.join(command))) + """ + When running in containers the --mkfs on raw device sometimes fails + to acquire a lock through flock() on the device because systemd-udevd holds one temporarily. + See KernelDevice.cc and _lock() to understand how ceph-osd acquires the lock. + Because this is really transient, we retry up to 5 times and wait for 1 sec in-between + """ + for retry in range(5): + _, _, returncode = process.call(command, stdin=keyring, terminal_verbose=True, show_command=True) + if returncode == 0: + break + else: + if returncode == errno.EWOULDBLOCK: + time.sleep(1) + logger.info('disk is held by another process, trying to mkfs again... (%s/5 attempt)' % retry) + continue + else: + raise RuntimeError('Command failed with exit code %s: %s' % (returncode, ' '.join(command))) def osd_mkfs_filestore(osd_id, fsid, keyring): -- 2.39.5