From: Ilya Dryomov Date: Sat, 4 Sep 2021 10:59:07 +0000 (+0200) Subject: librbd: fix pool validation lockup X-Git-Tag: v16.2.7~109^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b4060e13c07c9a0c7d055336251eee75f0477316;p=ceph.git librbd: fix pool validation lockup Concurrent rbd_pool_init() or rbd_create() operations on an unvalidated (uninitialized) pool trigger a lockup in ValidatePoolRequest state machine caused by blocking selfmanaged_snap_{create,remove}() calls. There are two reactor threads by default (librados_thread_count) but we effectively need N + 1 reactor threads for N concurrent pool validation requests, especially for small N. Switch to aio_selfmanaged_snap_{create,remove}(). At the time this code was initially written, these aio variants weren't available. The workqueue offload introduced later worked prior to the move to asio in pacific. Fixes: https://tracker.ceph.com/issues/52537 Signed-off-by: Ilya Dryomov (cherry picked from commit 56d41a9ada8c673256fed2768b32a5a13ff10245) --- diff --git a/src/librbd/image/ValidatePoolRequest.cc b/src/librbd/image/ValidatePoolRequest.cc index 9e685dc477c..af41106bb10 100644 --- a/src/librbd/image/ValidatePoolRequest.cc +++ b/src/librbd/image/ValidatePoolRequest.cc @@ -97,11 +97,11 @@ void ValidatePoolRequest::create_snapshot() { // allocate a self-managed snapshot id if this a new pool to force // self-managed snapshot mode - auto ctx = new LambdaContext([this](int r) { - r = m_io_ctx.selfmanaged_snap_create(&m_snap_id); - handle_create_snapshot(r); - }); - m_op_work_queue->queue(ctx, 0); + auto comp = create_rados_callback< + ValidatePoolRequest, + &ValidatePoolRequest::handle_create_snapshot>(this); + m_io_ctx.aio_selfmanaged_snap_create(&m_snap_id, comp); + comp->release(); } template @@ -161,11 +161,11 @@ template void ValidatePoolRequest::remove_snapshot() { ldout(m_cct, 5) << dendl; - auto ctx = new LambdaContext([this](int r) { - r = m_io_ctx.selfmanaged_snap_remove(m_snap_id); - handle_remove_snapshot(r); - }); - m_op_work_queue->queue(ctx, 0); + auto comp = create_rados_callback< + ValidatePoolRequest, + &ValidatePoolRequest::handle_remove_snapshot>(this); + m_io_ctx.aio_selfmanaged_snap_remove(m_snap_id, comp); + comp->release(); } template diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc index 4dbd2e2442a..d9f31ee9b6a 100644 --- a/src/test/librbd/test_librbd.cc +++ b/src/test/librbd/test_librbd.cc @@ -2089,6 +2089,35 @@ TEST_F(TestLibRBD, TestCreateLsRenameSnapPP) ioctx.close(); } +TEST_F(TestLibRBD, ConcurrentCreatesUnvalidatedPool) +{ + rados_ioctx_t ioctx; + ASSERT_EQ(0, rados_ioctx_create(_cluster, create_pool(true).c_str(), + &ioctx)); + + std::vector names; + for (int i = 0; i < 4; i++) { + names.push_back(get_temp_image_name()); + } + uint64_t size = 2 << 20; + + std::vector threads; + for (const auto& name : names) { + threads.emplace_back([ioctx, &name, size]() { + int order = 0; + ASSERT_EQ(0, create_image(ioctx, name.c_str(), size, &order)); + }); + } + for (auto& thread : threads) { + thread.join(); + } + + for (const auto& name : names) { + ASSERT_EQ(0, rbd_remove(ioctx, name.c_str())); + } + rados_ioctx_destroy(ioctx); +} + TEST_F(TestLibRBD, TestIO) { rados_ioctx_t ioctx;