From f91318d19c8c4a8f7b8cd08d86127e5574069598 Mon Sep 17 00:00:00 2001 From: Ramana Raja Date: Mon, 2 Oct 2023 12:39:26 -0400 Subject: [PATCH] librbd/ManagedLock: kickstart ExclusiveLock state machine ... that is stalled waiting for lock. Do this when trying to reacquire lock in the ImageWatcher's rewatch mechanism. This would enable the ExclusiveLock state machine to propagate the blocklist error to the caller trying to perform an image operation requiring an exclusive lock. Previous attempt, e66db763, to fix the hang due to exclusive lock acquisition (stuck waiting for lock) racing with client blocklisting did not always work. e66db763 kickstarted the ExclusiveLock state machine when the ImageWatcher tried to schedule an exclusive lock request and the blocklisting was detected. However, there is a short window between a watch getting deregistered and client blocklisting getting detected as part of rewatching. If hit when trying to schedule a lock request, the ExclusiveLock state machine wasn't kickstarted, blocklist error wasn't propagated, and the hang resurfaced. A more robust approach is taken to resume the ExclusiveLock state machine stuck waiting for lock during client blocklisting. Whenever a client's ImageWatcher loses connection to the cluster, as it happens during blocklisting, the ImageWatcher initiates a mechanism to rewatch the image and tries to reacquire the lock. Piggyback on this rewatch mechanism that gets triggered during client blocklisting. And when trying to reacquire the lock, kickstart the ExclusiveLock state machine stalled waiting for lock (STATE_WAITING_FOR_LOCK). 
Fixes: https://tracker.ceph.com/issues/63009 Signed-off-by: Ramana Raja (cherry picked from commit 18b018578cf8ac51a7e7a7d25f62d7bde345461a) --- src/librbd/ImageWatcher.cc | 4 ---- src/librbd/ManagedLock.cc | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc index f2427b129e679..4a4d8aa2bd11f 100644 --- a/src/librbd/ImageWatcher.cc +++ b/src/librbd/ImageWatcher.cc @@ -596,10 +596,6 @@ void ImageWatcher::schedule_request_lock(bool use_timer, int timer_delay) { } else { m_task_finisher->queue(TASK_CODE_REQUEST_LOCK, ctx); } - } else if (is_blocklisted()) { - lderr(m_image_ctx.cct) << this << " blocklisted waiting for exclusive lock" - << dendl; - m_image_ctx.exclusive_lock->handle_peer_notification(0); } } diff --git a/src/librbd/ManagedLock.cc b/src/librbd/ManagedLock.cc index bb11160cb5412..4cc49a1fe1916 100644 --- a/src/librbd/ManagedLock.cc +++ b/src/librbd/ManagedLock.cc @@ -207,7 +207,8 @@ void ManagedLock::reacquire_lock(Context *on_reacquired) { { std::lock_guard locker{m_lock}; - if (m_state == STATE_WAITING_FOR_REGISTER) { + if (m_state == STATE_WAITING_FOR_REGISTER || + m_state == STATE_WAITING_FOR_LOCK) { // restart the acquire lock process now that watch is valid ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl; Action active_action = get_active_action(); @@ -217,8 +218,7 @@ void ManagedLock::reacquire_lock(Context *on_reacquired) { } else if (!is_state_shutdown() && (m_state == STATE_LOCKED || m_state == STATE_ACQUIRING || - m_state == STATE_POST_ACQUIRING || - m_state == STATE_WAITING_FOR_LOCK)) { + m_state == STATE_POST_ACQUIRING)) { // interlock the lock operation with other state ops ldout(m_cct, 10) << dendl; execute_action(ACTION_REACQUIRE_LOCK, on_reacquired); -- 2.39.5