]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: add more robust retry handling to maintenance ops
authorJason Dillaman <dillaman@redhat.com>
Wed, 14 Jan 2015 20:56:15 +0000 (15:56 -0500)
committerJosh Durgin <jdurgin@redhat.com>
Sat, 24 Jan 2015 23:05:49 +0000 (15:05 -0800)
When image locking is enabled, snapshot create, resize, and
flatten are coordinated with the lock owner.  Previously, if the
the lock owner changed during one of this operations, the
operation would fail.  Now librbd will attempt to restart the
operation with the new lock owner (or become the owner itself).

Signed-off-by: Jason Dillaman <dillaman@redhat.com>
src/librbd/ImageWatcher.cc
src/librbd/ImageWatcher.h
src/librbd/internal.cc
src/test/librbd/test_internal.cc

index 9f95918ce278aa40b9516dd94f563fa99899f73a..13cf69bee740a7e2c0b930256d28bb43f57aeb3e 100644 (file)
@@ -267,12 +267,21 @@ int ImageWatcher::request_lock(
 }
 
 bool ImageWatcher::try_request_lock() {
-  RWLock::WLocker l(m_image_ctx.owner_lock);
+  assert(m_image_ctx.owner_lock.is_locked());
   if (is_lock_owner()) {
     return true;
   }
 
-  int r = try_lock();
+  int r = 0;
+  m_image_ctx.owner_lock.put_read();
+  {
+    RWLock::WLocker l(m_image_ctx.owner_lock);
+    if (!is_lock_owner()) {
+      r = try_lock();
+    }
+  }
+  m_image_ctx.owner_lock.get_read();
+
   if (r < 0) {
     ldout(m_image_ctx.cct, 5) << "failed to acquire exclusive lock:"
                              << cpp_strerror(r) << dendl;
@@ -292,8 +301,14 @@ bool ImageWatcher::try_request_lock() {
 void ImageWatcher::finalize_request_lock() {
   cancel_retry_aio_requests();
 
-  if (try_request_lock()) {
+  bool owned_lock;
+  {
+    RWLock::RLocker l(m_image_ctx.owner_lock);
+    owned_lock = try_request_lock();
+  }
+  if (owned_lock) {
     retry_aio_requests();
+    
   } else {
     schedule_retry_aio_requests();
   }
@@ -450,6 +465,9 @@ int ImageWatcher::notify_async_complete(const RemoteAsyncRequest &request,
 }
 
 int ImageWatcher::notify_flatten(ProgressContext &prog_ctx) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(!is_lock_owner());
+
   bufferlist bl;
   uint64_t async_request_id;
   ENCODE_START(NOTIFY_VERSION, NOTIFY_VERSION, bl);
@@ -461,6 +479,9 @@ int ImageWatcher::notify_flatten(ProgressContext &prog_ctx) {
 }
 
 int ImageWatcher::notify_resize(uint64_t size, ProgressContext &prog_ctx) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(!is_lock_owner());
+
   bufferlist bl;
   uint64_t async_request_id;
   ENCODE_START(NOTIFY_VERSION, NOTIFY_VERSION, bl);
@@ -473,6 +494,9 @@ int ImageWatcher::notify_resize(uint64_t size, ProgressContext &prog_ctx) {
 }
 
 int ImageWatcher::notify_snap_create(const std::string &snap_name) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(!is_lock_owner());
+
   bufferlist bl;
   ENCODE_START(NOTIFY_VERSION, NOTIFY_VERSION, bl);
   ::encode(NOTIFY_OP_SNAP_CREATE, bl);
@@ -599,10 +623,14 @@ uint64_t ImageWatcher::encode_async_request(bufferlist &bl) {
 
 int ImageWatcher::decode_response_code(bufferlist &bl) {
   int r;
-  bufferlist::iterator iter = bl.begin();
-  DECODE_START(NOTIFY_VERSION, iter);
-  ::decode(r, iter);
-  DECODE_FINISH(iter);
+  try {
+    bufferlist::iterator iter = bl.begin();
+    DECODE_START(NOTIFY_VERSION, iter);
+    ::decode(r, iter);
+    DECODE_FINISH(iter);
+  } catch (const buffer::error &err) {
+    r = -EINVAL;
+  }
   return r;
 }
 
@@ -617,7 +645,9 @@ void ImageWatcher::notify_released_lock() {
 void ImageWatcher::notify_request_lock() {
   cancel_retry_aio_requests();
 
+  m_image_ctx.owner_lock.get_read();
   if (try_request_lock()) {
+    m_image_ctx.owner_lock.put_read();
     retry_aio_requests();
     return;
   }
@@ -629,6 +659,8 @@ void ImageWatcher::notify_request_lock() {
 
   bufferlist response;
   int r = notify_lock_owner(bl, response);
+  m_image_ctx.owner_lock.put_read();
+
   if (r == -ETIMEDOUT) {
     ldout(m_image_ctx.cct, 5) << "timed out requesting lock: retrying" << dendl;
     retry_aio_requests();
@@ -640,9 +672,15 @@ void ImageWatcher::notify_request_lock() {
 }
 
 int ImageWatcher::notify_lock_owner(bufferlist &bl, bufferlist& response) {
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  // since we need to ack our own notifications, release the owner lock just in
+  // case another notification occurs before this one and it requires the lock
   bufferlist response_bl;
+  m_image_ctx.owner_lock.put_read();
   int r = m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT,
                                     &response_bl);
+  m_image_ctx.owner_lock.get_read();
   if (r < 0 && r != -ETIMEDOUT) {
     lderr(m_image_ctx.cct) << "lock owner notification failed: "
                           << cpp_strerror(r) << dendl;
@@ -683,6 +721,8 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl, bufferlist& response) {
 int ImageWatcher::notify_async_request(uint64_t async_request_id,
                                       bufferlist &in,
                                       ProgressContext& prog_ctx) {
+  assert(m_image_ctx.owner_lock.is_locked());
+
   Mutex my_lock("librbd::ImageWatcher::notify_async_request::my_lock");
   Cond cond;
   bool done = false;
@@ -734,13 +774,16 @@ void ImageWatcher::handle_acquired_lock() {
 
 void ImageWatcher::handle_released_lock() {
   ldout(m_image_ctx.cct, 20) << "exclusive lock released" << dendl;
+  FunctionContext *ctx = new FunctionContext(
+    boost::bind(&ImageWatcher::cancel_async_requests, this, -ERESTART));
+  m_finisher->queue(ctx);
 
   Mutex::Locker l(m_aio_request_lock);
   if (!m_aio_requests.empty()) {
     ldout(m_image_ctx.cct, 20) << "queuing lock request" << dendl;
-    FunctionContext *ctx = new FunctionContext(
+    FunctionContext *req_ctx = new FunctionContext(
       boost::bind(&ImageWatcher::finalize_request_lock, this));
-    m_finisher->queue(ctx);
+    m_finisher->queue(req_ctx);
   }
 }
 
@@ -867,7 +910,6 @@ void ImageWatcher::handle_snap_create(bufferlist::iterator iter, bufferlist *out
     ::decode(snap_name, iter);
 
     ldout(m_image_ctx.cct, 20) << "remote snap_create request: " << snap_name << dendl;
-
     int r = librbd::snap_create(&m_image_ctx, snap_name.c_str(), false);
     ENCODE_START(NOTIFY_VERSION, NOTIFY_VERSION, *out);
     ::encode(r, *out);
index b30ab5ffcf7fa7edcc5cdc168fe9fa7876467962..1ef02f140f845f0f2645f9841aaadd1c07d7dde5 100644 (file)
@@ -151,7 +151,6 @@ namespace librbd {
     int notify_async_request(uint64_t async_request_id, bufferlist &in,
                             ProgressContext& prog_ctx);
     void notify_request_leadership();
-    int notify_leader(bufferlist &bl, bufferlist &response);
 
     void handle_header_update();
     void handle_acquired_lock();
index 5087a92ed7f4640ba6c41a0d635dabcc7a7ea02d..db2be22d5a7463b5e46c189ab261a16a23c8aace 100644 (file)
@@ -157,6 +157,10 @@ namespace librbd {
 
   void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx)
   {
+    assert(ictx->owner_lock.is_locked());
+    assert(!ictx->image_watcher->is_lock_supported() ||
+          ictx->image_watcher->is_lock_owner());
+
     Mutex my_lock("librbd::trim_image::my_lock");
     Cond cond;
     bool done;
@@ -445,7 +449,7 @@ namespace librbd {
   {
     assert(ictx->owner_lock.is_locked() && !ictx->owner_lock.is_wlocked());
     if (ictx->image_watcher == NULL) {
-      return -EROFS;;
+      return -EROFS;
     } else if (!ictx->image_watcher->is_lock_supported() ||
               ictx->image_watcher->is_lock_owner()) {
       return 0;
@@ -476,13 +480,19 @@ namespace librbd {
     if (r < 0)
       return r;
 
-    r = prepare_image_update(ictx);
-    if (r < 0) {
-      return -EROFS;
-    }
-    if (ictx->image_watcher->is_lock_supported() &&
-        !ictx->image_watcher->is_lock_owner()) {
-      return ictx->image_watcher->notify_snap_create(snap_name);
+    while (ictx->image_watcher->is_lock_supported()) {
+      r = prepare_image_update(ictx);
+      if (r < 0) {
+       return -EROFS;
+      } else if (ictx->image_watcher->is_lock_owner()) {
+       break;
+      }
+
+      r = ictx->image_watcher->notify_snap_create(snap_name);
+      if (r != -ETIMEDOUT) {
+       return r;
+      }
+      ldout(ictx->cct, 5) << "snap_create timed out notifying lock owner" << dendl;
     }
 
     RWLock::RLocker l2(ictx->md_lock);
@@ -1443,8 +1453,20 @@ reprotect_and_return_err:
       unknown_format = false;
       id = ictx->id;
 
+      ictx->owner_lock.get_read();
+      if (ictx->image_watcher->is_lock_supported()) {
+        r = prepare_image_update(ictx);
+        if (r < 0 || !ictx->image_watcher->is_lock_owner()) {
+         lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl;
+         ictx->owner_lock.put_read();
+         close_image(ictx);
+          return -EBUSY;
+        }
+      }
+
       if (ictx->snaps.size()) {
        lderr(cct) << "image has snapshots - not removing" << dendl;
+       ictx->owner_lock.put_read();
        close_image(ictx);
        return -ENOTEMPTY;
       }
@@ -1453,11 +1475,13 @@ reprotect_and_return_err:
       r = io_ctx.list_watchers(header_oid, &watchers);
       if (r < 0) {
         lderr(cct) << "error listing watchers" << dendl;
+       ictx->owner_lock.put_read();
         close_image(ictx);
         return r;
       }
       if (watchers.size() > 1) {
         lderr(cct) << "image has watchers - not removing" << dendl;
+       ictx->owner_lock.put_read();
         close_image(ictx);
         return -EBUSY;
       }
@@ -1476,9 +1500,12 @@ reprotect_and_return_err:
                                   parent_info.spec, id);
       if (r < 0 && r != -ENOENT) {
        lderr(cct) << "error removing child from children list" << dendl;
+       ictx->owner_lock.put_read();
         close_image(ictx);
        return r;
       }
+
+      ictx->owner_lock.put_read();
       close_image(ictx);
 
       ldout(cct, 2) << "removing header..." << dendl;
@@ -1532,39 +1559,50 @@ reprotect_and_return_err:
     ldout(cct, 20) << "resize " << ictx << " " << ictx->size << " -> "
                   << size << dendl;
 
-    {
-      RWLock::RLocker l(ictx->owner_lock);
-      int r = prepare_image_update(ictx);
-      if (r < 0) {
-       return -EROFS;
-      }
-      if (ictx->image_watcher->is_lock_supported() &&
-         !ictx->image_watcher->is_lock_owner()) {
-       return ictx->image_watcher->notify_resize(size, prog_ctx);
-      }
-    }
+    int r;
+    do {
+      Mutex my_lock("librbd::resize::my_lock");
+      Cond cond;
+      bool done;
+      {
+       RWLock::RLocker l(ictx->owner_lock);
+       while (ictx->image_watcher->is_lock_supported()) {
+         r = prepare_image_update(ictx);
+         if (r < 0) {
+           return -EROFS;
+         } else if (ictx->image_watcher->is_lock_owner()) {
+           break;
+         }
 
-    Mutex my_lock("librbd::resize::my_lock");
-    Cond cond;
-    bool done;
-    int ret;
-    Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &ret);
+         r = ictx->image_watcher->notify_resize(size, prog_ctx);
+         if (r != -ETIMEDOUT && r != -ERESTART) {
+           return r;
+         }
+         ldout(ictx->cct, 5) << "resize timed out notifying lock owner" << dendl;
+       }
 
-    ret = async_resize(ictx, ctx, size, prog_ctx);
-    if (ret < 0) {
-      delete ctx;
-      return ret;
-    }
+       Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &r);
+       r = async_resize(ictx, ctx, size, prog_ctx);
+       if (r < 0) {
+         delete ctx;
+         return r;
+       }
+      }
 
-    my_lock.Lock();
-    while (!done) {
-      cond.Wait(my_lock);
-    }
-    my_lock.Unlock();
+      my_lock.Lock();
+      while (!done) {
+       cond.Wait(my_lock);
+      }
+      my_lock.Unlock();
+
+      if (r == -ERESTART) {
+       ldout(ictx->cct, 5) << "resize interrupted: restarting" << dendl;
+      }
+    } while (r == -ERESTART);
 
     notify_change(ictx->md_ctx, ictx->header_oid, ictx);
     ldout(cct, 2) << "resize finished" << dendl;
-    return ret;
+    return r;
   }
 
   class AsyncResizeFinishContext : public Context {
@@ -1588,6 +1626,10 @@ reprotect_and_return_err:
   int async_resize(ImageCtx *ictx, Context *ctx, uint64_t size,
                   ProgressContext &prog_ctx)
   {
+    assert(ictx->owner_lock.is_locked());
+    assert(!ictx->image_watcher->is_lock_supported() ||
+          ictx->image_watcher->is_lock_owner());
+
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "async_resize " << ictx << " " << ictx->size << " -> "
                   << size << dendl;
@@ -1603,17 +1645,7 @@ reprotect_and_return_err:
 
     uint64_t original_size;
     {
-      RWLock::RLocker l(ictx->owner_lock);
-      r = prepare_image_update(ictx);
-      if (r < 0) {
-       return -EROFS;
-      }
-      if (ictx->image_watcher->is_lock_supported() &&
-         !ictx->image_watcher->is_lock_owner()) {
-       return -EROFS;
-      }
-
-      RWLock::RLocker l2(ictx->md_lock);
+      RWLock::RLocker l(ictx->md_lock);
       original_size = ictx->size;
       if (size < ictx->size) {
        ictx->wait_for_pending_copyup();
@@ -1662,7 +1694,7 @@ reprotect_and_return_err:
       RWLock::RLocker l(m_ictx->owner_lock);
       if (m_ictx->image_watcher->is_lock_supported() &&
           !m_ictx->image_watcher->is_lock_owner()) {
-        r = -EROFS;
+        r = -ERESTART;
         return;
       }
 
@@ -1745,7 +1777,7 @@ reprotect_and_return_err:
       RWLock::RLocker l(m_ictx->owner_lock);
       if (m_ictx->image_watcher->is_lock_supported() &&
           !m_ictx->image_watcher->is_lock_owner()) {
-        return -EROFS;
+        return -ERESTART;
       }
 
       string oid = m_ictx->get_object_name(m_object_no);
@@ -1782,7 +1814,7 @@ reprotect_and_return_err:
       RWLock::RLocker l(m_ictx->owner_lock);
       if (m_ictx->image_watcher->is_lock_supported() &&
           !m_ictx->image_watcher->is_lock_owner()) {
-       r = -EROFS;
+       r = -ERESTART;
        return;
       }
 
@@ -2588,7 +2620,7 @@ reprotect_and_return_err:
       RWLock::RLocker l(m_ictx->owner_lock);
       if (m_ictx->image_watcher->is_lock_supported() &&
           !m_ictx->image_watcher->is_lock_owner()) {
-       return -EROFS;
+       return -ERESTART;
       }
 
       RWLock::RLocker l2(m_ictx->md_lock);
@@ -2661,7 +2693,7 @@ reprotect_and_return_err:
       RWLock::RLocker l(m_ictx->owner_lock);
       if (m_ictx->image_watcher->is_lock_supported() &&
           !m_ictx->image_watcher->is_lock_owner()) {
-       r = -EROFS;
+       r = -ERESTART;
         return;
       }
 
@@ -2713,43 +2745,58 @@ reprotect_and_return_err:
       return -EROFS;
     }
 
-    {
-      RWLock::RLocker l(ictx->owner_lock);
-      int r = prepare_image_update(ictx);
-      if (r < 0) {
-        return -EROFS;
-      }
-      if (ictx->image_watcher->is_lock_supported() &&
-          !ictx->image_watcher->is_lock_owner()) {
-        return ictx->image_watcher->notify_flatten(prog_ctx);
-      }
-    }
+    int r;
+    do {
+      Mutex my_lock("librbd::flatten:my_lock");
+      Cond cond;
+      bool done;
+      {
+        RWLock::RLocker l(ictx->owner_lock);
+        while (ictx->image_watcher->is_lock_supported()) {
+          r = prepare_image_update(ictx);
+          if (r < 0) {
+            return -EROFS;
+          } else if (ictx->image_watcher->is_lock_owner()) {
+           break;
+         }
 
-    Mutex my_lock("librbd::flatten:my_lock");
-    Cond cond;
-    bool done;
-    int ret;
-    Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &ret);
+          r = ictx->image_watcher->notify_flatten(prog_ctx);
+          if (r != -ETIMEDOUT && r != -ERESTART) {
+            return r;
+          }
+          ldout(ictx->cct, 5) << "flatten timed out notifying lock owner" << dendl;
+        }
 
-    ret = async_flatten(ictx, ctx, prog_ctx);
-    if (ret < 0) {
-      delete ctx;
-      return ret;
-    }
+        Context *ctx = new C_SafeCond(&my_lock, &cond, &done, &r);
+        r = async_flatten(ictx, ctx, prog_ctx);
+        if (r < 0) {
+         delete ctx;
+         return r;
+        }
+      }
 
-    my_lock.Lock();
-    while (!done) {
-      cond.Wait(my_lock);
-    }
-    my_lock.Unlock();
+      my_lock.Lock();
+      while (!done) {
+        cond.Wait(my_lock);
+      }
+      my_lock.Unlock();
+
+      if (r == -ERESTART) {
+       ldout(ictx->cct, 5) << "flatten interrupted: restarting" << dendl;
+      }
+    } while (r == -ERESTART);
 
     notify_change(ictx->md_ctx, ictx->header_oid, ictx);
     ldout(cct, 20) << "flatten finished" << dendl;
-    return ret;
+    return r;
   }
 
   int async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx)
   {
+    assert(ictx->owner_lock.is_locked());
+    assert(!ictx->image_watcher->is_lock_supported() ||
+          ictx->image_watcher->is_lock_owner());
+
     CephContext *cct = ictx->cct;
     ldout(cct, 20) << "flatten" << dendl;
 
@@ -2793,19 +2840,6 @@ reprotect_and_return_err:
       overlap_objects = Striper::get_num_objects(ictx->layout, overlap); 
     }
 
-    {
-      RWLock::RLocker l(ictx->owner_lock);
-      r = prepare_image_update(ictx);
-      if (r < 0) {
-       return -EROFS;
-      }
-      if (ictx->image_watcher->is_lock_supported() &&
-         !ictx->image_watcher->is_lock_owner()) {
-       // TODO: temporary until request proxied to lock owner
-       return -EROFS;
-      }
-    }
-
     AsyncObjectThrottle::ContextFactory context_factory(
       boost::lambda::bind(boost::lambda::new_ptr<AsyncFlattenObjectContext>(),
         boost::lambda::_1, ictx, object_size, snapc,
index 0d13fd68b592616e3397057f7ab88d2d87992d54..f74e83ec769e853ec7fc880bcae4ffcfc93ef3be 100644 (file)
@@ -225,19 +225,19 @@ TEST_F(TestInternal, FlattenFailsToLockImage) {
   int order = ictx->order;
   ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap1", m_ioctx,
                              clone_name.c_str(), features, &order, 0, 0));
-  BOOST_SCOPE_EXIT( (&m_ioctx) (clone_name) ) {
+
+  TestInternal *parent = this;
+  librbd::ImageCtx *ictx2 = NULL;
+  BOOST_SCOPE_EXIT( (&m_ioctx) (clone_name) (parent) (&ictx2) ) {
+    if (ictx2 != NULL) {
+      parent->close_image(ictx2);
+      parent->unlock_image();
+    }
     librbd::NoOpProgressContext no_op;
     ASSERT_EQ(0, librbd::remove(m_ioctx, clone_name.c_str(), no_op));
   } BOOST_SCOPE_EXIT_END;
 
-  librbd::ImageCtx *ictx2;
   ASSERT_EQ(0, open_image(clone_name, &ictx2));
-
-  TestInternal *parent = this;
-  BOOST_SCOPE_EXIT( (parent) (ictx2) ) {
-    parent->close_image(ictx2);
-  } BOOST_SCOPE_EXIT_END;
-
   ASSERT_EQ(0, lock_image(*ictx2, LOCK_EXCLUSIVE, "manually locked"));
 
   librbd::NoOpProgressContext no_op;