]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd/ManagedLock: kickstart ExclusiveLock state machine 53295/head
author Ramana Raja <rraja@redhat.com>
Mon, 2 Oct 2023 16:39:26 +0000 (12:39 -0400)
committer Ramana Raja <rraja@redhat.com>
Tue, 10 Oct 2023 17:09:40 +0000 (13:09 -0400)
... that is stalled waiting for lock. Do this when trying to reacquire
lock in the ImageWatcher's rewatch mechanism. This would enable the
ExclusiveLock state machine to propagate the blocklist error to the
caller trying to perform an image operation requiring an exclusive
lock.

Previous attempt, e66db763, to fix the hang due to exclusive lock
acquisition (stuck waiting for lock) racing with client blocklisting
did not always work. e66db763 kickstarted the ExclusiveLock state
machine when the ImageWatcher tried to schedule an exclusive lock
request and the blocklisting was detected. However, there is a short
window between a watch getting deregistered and client blocklisting
getting detected as part of rewatching. If hit when trying to schedule
a lock request, the ExclusiveLock state machine wasn't kickstarted,
blocklist error wasn't propagated, and the hang resurfaced.

A more robust approach is taken to resume the ExclusiveLock state
machine stuck waiting for lock during client blocklisting. Whenever
a client's ImageWatcher loses connection to the cluster, as it happens
during blocklisting, the ImageWatcher initiates a mechanism to rewatch
the image and tries to reacquire the lock. Piggyback on this rewatch
mechanism that gets triggered during client blocklisting. And when
trying to reacquire the lock, kickstart the ExclusiveLock state
machine stalled waiting for lock (STATE_WAITING_FOR_LOCK).

Fixes: https://tracker.ceph.com/issues/63009
Signed-off-by: Ramana Raja <rraja@redhat.com>
(cherry picked from commit 18b018578cf8ac51a7e7a7d25f62d7bde345461a)

src/librbd/ImageWatcher.cc
src/librbd/ManagedLock.cc

index 6cf86bae769b17444f39d55d2fec23d336571fb4..b26b7d9971c8d8fe98c9068bfc60d34deca402d1 100644 (file)
@@ -596,10 +596,6 @@ void ImageWatcher<I>::schedule_request_lock(bool use_timer, int timer_delay) {
     } else {
       m_task_finisher->queue(TASK_CODE_REQUEST_LOCK, ctx);
     }
-  } else if (is_blocklisted()) {
-    lderr(m_image_ctx.cct) << this << " blocklisted waiting for exclusive lock"
-                           << dendl;
-    m_image_ctx.exclusive_lock->handle_peer_notification(0);
   }
 }
 
index bb11160cb5412b07b16ab91ac04078f49cecd5da..4cc49a1fe19169ef9b7b3096cdf376453ed510f3 100644 (file)
@@ -207,7 +207,8 @@ void ManagedLock<I>::reacquire_lock(Context *on_reacquired) {
   {
     std::lock_guard locker{m_lock};
 
-    if (m_state == STATE_WAITING_FOR_REGISTER) {
+    if (m_state == STATE_WAITING_FOR_REGISTER ||
+        m_state == STATE_WAITING_FOR_LOCK) {
       // restart the acquire lock process now that watch is valid
       ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl;
       Action active_action = get_active_action();
@@ -217,8 +218,7 @@ void ManagedLock<I>::reacquire_lock(Context *on_reacquired) {
     } else if (!is_state_shutdown() &&
                (m_state == STATE_LOCKED ||
                 m_state == STATE_ACQUIRING ||
-                m_state == STATE_POST_ACQUIRING ||
-                m_state == STATE_WAITING_FOR_LOCK)) {
+                m_state == STATE_POST_ACQUIRING)) {
       // interlock the lock operation with other state ops
       ldout(m_cct, 10) << dendl;
       execute_action(ACTION_REACQUIRE_LOCK, on_reacquired);