From 3cb26e0b1ca22cfb2d5cd73ee7c5f5c3ae0049f3 Mon Sep 17 00:00:00 2001 From: Prasanna Kumar Kalever Date: Tue, 12 Sep 2023 17:45:05 +0530 Subject: [PATCH] rbd-nbd: fix stuck with disable request Problem: ------- Trying to disable any feature on an rbd image mapped with nbd leads to stuck in rbd-nbd. The rbd-nbd registers a watcher callback to detect image resize in NBDWatchCtx::handle_notify(). The handle_notify calls image info method, which calls refresh_if_required and it got stuck there. It is getting stuck in ImageState::refresh_if_required() because DisableFeaturesRequest issues update notifications while still holding onto the exclusive lock with everything that has to do with it blocked. Solution: -------- Set only notify flag as part of NBDWatchCtx::handle_notify() and handle the resize detection part as part of a different thread. Fixes: https://tracker.ceph.com/issues/58740 Signed-off-by: Prasanna Kumar Kalever (cherry picked from commit dbb4daff404c5d2da32c33f4e852e84a257c0b8d) --- qa/workunits/rbd/rbd-nbd.sh | 10 ++++ src/tools/rbd_nbd/rbd-nbd.cc | 112 +++++++++++++++++++++++++---------- 2 files changed, 91 insertions(+), 31 deletions(-) diff --git a/qa/workunits/rbd/rbd-nbd.sh b/qa/workunits/rbd/rbd-nbd.sh index 122df3d6f35a5..bc89e9be5a185 100755 --- a/qa/workunits/rbd/rbd-nbd.sh +++ b/qa/workunits/rbd/rbd-nbd.sh @@ -472,6 +472,16 @@ DEV= rbd feature disable ${POOL}/${IMAGE} journaling rbd config image rm ${POOL}/${IMAGE} rbd_discard_granularity_bytes +# test that disabling a feature so that the op is proxied to rbd-nbd +# (arranged here by blkdiscard before "rbd feature disable") doesn't hang +DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}` +get_pid ${POOL} +rbd feature enable ${POOL}/${IMAGE} journaling +_sudo blkdiscard --offset 0 --length 4096 ${DEV} +rbd feature disable ${POOL}/${IMAGE} journaling +unmap_device ${DEV} ${PID} +DEV= + # test that rbd_op_threads setting takes effect EXPECTED=`ceph-conf --show-config-value librados_thread_count` DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}` diff --git a/src/tools/rbd_nbd/rbd-nbd.cc b/src/tools/rbd_nbd/rbd-nbd.cc index 3130e8bc750e2..e348bd8fe4310 100644 --- a/src/tools/rbd_nbd/rbd-nbd.cc +++ b/src/tools/rbd_nbd/rbd-nbd.cc @@ -738,7 +738,67 @@ private: bool use_netlink; librados::IoCtx &io_ctx; librbd::Image ℑ - unsigned long size; + uint64_t size; + std::thread handle_notify_thread; + ceph::condition_variable cond; + ceph::mutex lock = ceph::make_mutex("NBDWatchCtx::Locker"); + bool notify = false; + bool terminated = false; + + bool wait_notify() { + dout(10) << __func__ << dendl; + + std::unique_lock locker{lock}; + cond.wait(locker, [this] { return notify || terminated; }); + + if (terminated) { + return false; + } + + dout(10) << __func__ << ": got notify request" << dendl; + notify = false; + return true; + } + + void handle_notify_entry() { + dout(10) << __func__ << dendl; + + while (wait_notify()) { + uint64_t new_size; + int ret = image.size(&new_size); + if (ret < 0) { + derr << "getting image size failed: " << cpp_strerror(ret) << dendl; + continue; + } + if (new_size == size) { + continue; + } + dout(5) << "resize detected" << dendl; + if (ioctl(fd, BLKFLSBUF, NULL) < 0) { + derr << "invalidate page cache failed: " << cpp_strerror(errno) + << dendl; + } + if (use_netlink) { + ret = netlink_resize(nbd_index, new_size); + } else { + ret = ioctl(fd, NBD_SET_SIZE, new_size); + if (ret < 0) { + derr << "resize failed: " << cpp_strerror(errno) << dendl; + } + } + if (!ret) { + size = new_size; + } + if (ioctl(fd, BLKRRPART, NULL) < 0) { + derr << "rescan of partition table failed: " << cpp_strerror(errno) + << dendl; + } + if (image.invalidate_cache() < 0) { + derr << "invalidate rbd cache failed" << dendl; + } + } + } + public: NBDWatchCtx(int _fd, int _nbd_index, @@ -752,41 +812,31 @@ public: , io_ctx(_io_ctx) , image(_image) , size(_size) - { } + { + handle_notify_thread = make_named_thread("rbd_handle_notify", + &NBDWatchCtx::handle_notify_entry, + this); + } - ~NBDWatchCtx() override {} + ~NBDWatchCtx() override + { + dout(10) << __func__ << ": terminating" << dendl; + std::unique_lock locker{lock}; + terminated = true; + cond.notify_all(); + locker.unlock(); + + handle_notify_thread.join(); + dout(10) << __func__ << ": finish" << dendl; + } void handle_notify() override { - librbd::image_info_t info; - if (image.stat(info, sizeof(info)) == 0) { - unsigned long new_size = info.size; - int ret; - - if (new_size != size) { - dout(5) << "resize detected" << dendl; - if (ioctl(fd, BLKFLSBUF, NULL) < 0) - derr << "invalidate page cache failed: " << cpp_strerror(errno) - << dendl; - if (use_netlink) { - ret = netlink_resize(nbd_index, new_size); - } else { - ret = ioctl(fd, NBD_SET_SIZE, new_size); - if (ret < 0) - derr << "resize failed: " << cpp_strerror(errno) << dendl; - } - - if (!ret) - size = new_size; + dout(10) << __func__ << dendl; - if (ioctl(fd, BLKRRPART, NULL) < 0) { - derr << "rescan of partition table failed: " << cpp_strerror(errno) - << dendl; - } - if (image.invalidate_cache() < 0) - derr << "invalidate rbd cache failed" << dendl; - } - } + std::unique_lock locker{lock}; + notify = true; + cond.notify_all(); } }; -- 2.39.5