From 632a1033aa2325cb216e2216b191c492c45ed19d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 1 Feb 2019 11:39:41 -0600 Subject: [PATCH] os/bluestore/KernelDevice: use flock(2) for block device lock The fcntl locks fail due to the classic posix lock gotcha: if you close *any* fd to the same inode from the process, the lock(s) go away. Use flock(2) instead. We have to be careful because we open the main bluestore device via two KernelDevice instances: one for bluestore and one for bluefs. Add a no-lock flag so that the bluefs instance does not try to lock and does not conflict with bluestore's. Fixes: http://tracker.ceph.com/issues/38150 Signed-off-by: Sage Weil --- src/os/bluestore/BlockDevice.h | 5 +++++ src/os/bluestore/BlueFS.cc | 9 +++++++-- src/os/bluestore/BlueFS.h | 3 ++- src/os/bluestore/BlueStore.cc | 3 ++- src/os/bluestore/KernelDevice.cc | 24 +++++++++++++----------- 5 files changed, 29 insertions(+), 15 deletions(-) diff --git a/src/os/bluestore/BlockDevice.h b/src/os/bluestore/BlockDevice.h index 07fd3e2168b52..315d46c19f852 100644 --- a/src/os/bluestore/BlockDevice.h +++ b/src/os/bluestore/BlockDevice.h @@ -142,6 +142,7 @@ protected: uint64_t block_size; bool support_discard = false; bool rotational = true; + bool lock_exclusive = true; public: aio_callback_t aio_callback; @@ -162,6 +163,10 @@ public: virtual void aio_submit(IOContext *ioc) = 0; + void set_no_exclusive_lock() { + lock_exclusive = false; + } + uint64_t get_size() const { return size; } uint64_t get_block_size() const { return block_size; } diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index a4a062a5de5e9..dbc7d35893013 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -157,12 +157,17 @@ void BlueFS::_update_logger_stats() } } -int BlueFS::add_block_device(unsigned id, const string& path, bool trim) +int BlueFS::add_block_device(unsigned id, const string& path, bool trim, + bool shared_with_bluestore) { dout(10) << __func__ << " bdev " << id << " path " << path << dendl; ceph_assert(id < bdev.size()); ceph_assert(bdev[id] == NULL); - BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, discard_cb[id], static_cast(this)); + BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, + discard_cb[id], static_cast(this)); + if (shared_with_bluestore) { + b->set_no_exclusive_lock(); + } int r = b->open(path); if (r < 0) { delete b; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index a22b9d924c22a..c0438232fc541 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -458,7 +458,8 @@ public: void set_slow_device_expander(BlueFSDeviceExpander* a) { slow_dev_expander = a; } - int add_block_device(unsigned bdev, const string& path, bool trim); + int add_block_device(unsigned bdev, const string& path, bool trim, + bool shared_with_bluestore=false); bool bdev_support_label(unsigned id); uint64_t get_block_device_size(unsigned bdev); diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index f52b4868979bc..3b03e0e2c4b77 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4982,7 +4982,8 @@ int BlueStore::_minimal_open_bluefs(bool create) // shared device bfn = path + "/block"; // never trim here - r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false); + r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false, + true /* shared with bluestore */); if (r < 0) { derr << __func__ << " add block device(" << bfn << ") returned: " << cpp_strerror(r) << dendl; diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc index 7008cad8115ee..df75086f43daa 100644 --- a/src/os/bluestore/KernelDevice.cc +++ b/src/os/bluestore/KernelDevice.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include "KernelDevice.h" #include "include/types.h" @@ -55,13 +56,12 @@ KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, ai int KernelDevice::_lock() { - struct flock l; - memset(&l, 0, sizeof(l)); - l.l_type = F_WRLCK; - l.l_whence = SEEK_SET; - int r = ::fcntl(fd_directs[WRITE_LIFE_NOT_SET], F_SETLK, &l); - if (r < 0) + dout(10) << __func__ << " " << fd_directs[WRITE_LIFE_NOT_SET] << dendl; + int r = ::flock(fd_directs[WRITE_LIFE_NOT_SET], LOCK_EX | LOCK_NB); + if (r < 0) { + derr << __func__ << " flock failed on " << path << dendl; return -errno; + } return 0; } @@ -122,11 +122,13 @@ int KernelDevice::open(const string& p) goto out_fail; } - r = _lock(); - if (r < 0) { - derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) - << dendl; - goto out_fail; + if (lock_exclusive) { + r = _lock(); + if (r < 0) { + derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) + << dendl; + goto out_fail; + } } struct stat st; -- 2.39.5