]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: better handling for image watch errors
authorJason Dillaman <dillaman@redhat.com>
Mon, 2 Feb 2015 23:07:03 +0000 (18:07 -0500)
committerJason Dillaman <dillaman@redhat.com>
Tue, 3 Feb 2015 11:36:11 +0000 (06:36 -0500)
When the librados watcher fails, librbd will now continuously
attempt to re-register the watch until it succeeds or the image
is closed.

Signed-off-by: Jason Dillaman <dillaman@redhat.com>
src/librbd/ImageWatcher.cc
src/librbd/ImageWatcher.h
src/librbd/internal.cc

index 08210bbe355705800de9a4606b7fdcf45e650b27..d42da27645fd57348f6ee2c9e2953063d3d9300e 100644 (file)
@@ -61,12 +61,14 @@ enum {
 };
 
 ImageWatcher::ImageWatcher(ImageCtx &image_ctx)
-  : m_image_ctx(image_ctx), m_watch_ctx(*this), m_handle(0),
+  : m_image_ctx(image_ctx),
+    m_watch_lock("librbd::ImageWatcher::m_watch_lock"),
+    m_watch_ctx(*this), m_watch_handle(0),
+    m_watch_state(WATCH_STATE_UNREGISTERED),
     m_lock_owner_state(LOCK_OWNER_STATE_NOT_LOCKED),
     m_finisher(new Finisher(image_ctx.cct)),
     m_timer_lock("librbd::ImageWatcher::m_timer_lock"),
     m_timer(new SafeTimer(image_ctx.cct, m_timer_lock)),
-    m_watch_lock("librbd::ImageWatcher::m_watch_lock"), m_watch_error(0),
     m_async_request_lock("librbd::ImageWatcher::m_async_request_lock"),
     m_async_request_id(0),
     m_aio_request_lock("librbd::ImageWatcher::m_aio_request_lock"),
@@ -78,8 +80,14 @@ ImageWatcher::ImageWatcher(ImageCtx &image_ctx)
 
 ImageWatcher::~ImageWatcher()
 {
-  Mutex::Locker l(m_timer_lock);
-  m_timer->shutdown();
+  {
+    Mutex::Locker l(m_timer_lock);
+    m_timer->shutdown();
+  }
+  {
+    RWLock::RLocker l(m_watch_lock);
+    assert(m_watch_state != WATCH_STATE_REGISTERED);
+  }
   m_finisher->stop();
   delete m_finisher;
 }
@@ -101,14 +109,16 @@ int ImageWatcher::register_watch() {
   ldout(m_image_ctx.cct, 20) << "registering image watcher" << dendl;
 
   RWLock::WLocker l(m_watch_lock);
-  m_watch_error = m_image_ctx.md_ctx.watch2(m_image_ctx.header_oid, &m_handle,
-                                           &m_watch_ctx);
-  return m_watch_error;
-}
+  assert(m_watch_state == WATCH_STATE_UNREGISTERED);
+  int r = m_image_ctx.md_ctx.watch2(m_image_ctx.header_oid,
+                                   &m_watch_handle,
+                                   &m_watch_ctx);
+  if (r < 0) {
+    return r;
+  }
 
-int ImageWatcher::get_watch_error() {
-  RWLock::RLocker l(m_watch_lock);
-  return m_watch_error;
+  m_watch_state = WATCH_STATE_REGISTERED;
+  return 0;
 }
 
 int ImageWatcher::unregister_watch() {
@@ -121,8 +131,19 @@ int ImageWatcher::unregister_watch() {
 
   cancel_async_requests(-ESHUTDOWN);
 
-  RWLock::WLocker l(m_watch_lock);
-  return m_image_ctx.md_ctx.unwatch2(m_handle);
+  int r = 0;
+  {
+    RWLock::WLocker l(m_watch_lock);
+    assert(m_watch_state != WATCH_STATE_UNREGISTERED);
+    if (m_watch_state == WATCH_STATE_REGISTERED) {
+      r = m_image_ctx.md_ctx.unwatch2(m_watch_handle);
+    }
+    m_watch_state = WATCH_STATE_UNREGISTERED;
+  }
+
+  librados::Rados rados(m_image_ctx.md_ctx);
+  rados.watch_flush();
+  return r;
 }
 
 bool ImageWatcher::has_pending_aio_operations() {
@@ -216,6 +237,14 @@ int ImageWatcher::request_lock(
     }
   }
 
+  {
+    // aio operations will be retried once the the watch is re-established
+    RWLock::RLocker l(m_watch_lock);
+    if (m_watch_state == WATCH_STATE_ERROR) {
+      return 0;
+    }
+  }
+
   // run notify request in finisher to avoid blocking aio path
   FunctionContext *ctx = new FunctionContext(
     boost::bind(&ImageWatcher::notify_request_lock, this));
@@ -499,7 +528,7 @@ void ImageWatcher::notify_header_update(librados::IoCtx &io_ctx,
 std::string ImageWatcher::encode_lock_cookie() const {
   RWLock::RLocker l(m_watch_lock);
   std::ostringstream ss;
-  ss << WATCHER_LOCK_COOKIE_PREFIX << " " << m_handle;
+  ss << WATCHER_LOCK_COOKIE_PREFIX << " " << m_watch_handle;
   return ss.str();
 }
 
@@ -577,7 +606,7 @@ uint64_t ImageWatcher::encode_async_request(bufferlist &bl) {
   ++m_async_request_id;
 
   RemoteAsyncRequest request(m_image_ctx.md_ctx.get_instance_id(),
-                            m_handle, m_async_request_id);
+                            m_watch_handle, m_async_request_id);
   ::encode(request, bl);
 
   ldout(m_image_ctx.cct, 20) << "async request: " << request << dendl;
@@ -789,7 +818,7 @@ void ImageWatcher::handle_async_progress(bufferlist::iterator iter) {
   ::decode(offset, iter);
   ::decode(total, iter);
   if (request.gid == m_image_ctx.md_ctx.get_instance_id() &&
-      request.handle == m_handle) {
+      request.handle == m_watch_handle) {
     RWLock::RLocker l(m_async_request_lock);
     std::map<uint64_t, AsyncRequest>::iterator iter =
       m_async_requests.find(request.request_id);
@@ -809,7 +838,7 @@ void ImageWatcher::handle_async_complete(bufferlist::iterator iter) {
   int r;
   ::decode(r, iter);
   if (request.gid == m_image_ctx.md_ctx.get_instance_id() &&
-      request.handle == m_handle) {
+      request.handle == m_watch_handle) {
     Context *ctx = NULL;
     {
       RWLock::RLocker l(m_async_request_lock);
@@ -977,9 +1006,16 @@ void ImageWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
 void ImageWatcher::handle_error(uint64_t handle, int err) {
   lderr(m_image_ctx.cct) << "image watch failed: " << handle << ", "
                          << cpp_strerror(err) << dendl;
-  FunctionContext *ctx = new FunctionContext(
-    boost::bind(&ImageWatcher::reregister_watch, this));
-  m_finisher->queue(ctx);
+
+  RWLock::WLocker l(m_watch_lock);
+  if (m_watch_state == WATCH_STATE_REGISTERED) {
+    m_image_ctx.md_ctx.unwatch2(m_watch_handle);
+    m_watch_state = WATCH_STATE_ERROR;
+
+    FunctionContext *ctx = new FunctionContext(
+      boost::bind(&ImageWatcher::reregister_watch, this));
+    m_finisher->queue(ctx);
+  }
 }
 
 void ImageWatcher::acknowledge_notify(uint64_t notify_id, uint64_t handle,
@@ -993,22 +1029,30 @@ void ImageWatcher::reregister_watch() {
   {
     RWLock::WLocker l(m_image_ctx.owner_lock);
     bool lock_owner = (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED);
-    int r;
     if (lock_owner) {
       unlock();
     }
 
+    int r;
     {
       RWLock::WLocker l(m_watch_lock);
-      m_image_ctx.md_ctx.unwatch2(m_handle);
-      m_watch_error = m_image_ctx.md_ctx.watch2(m_image_ctx.header_oid,
-                                                &m_handle, &m_watch_ctx);
-      if (m_watch_error < 0) {
+      if (m_watch_state != WATCH_STATE_ERROR) {
+       return;
+      }
+
+      r = m_image_ctx.md_ctx.watch2(m_image_ctx.header_oid,
+                                    &m_watch_handle, &m_watch_ctx);
+      if (r < 0) {
         lderr(m_image_ctx.cct) << "failed to re-register image watch: "
-                               << cpp_strerror(m_watch_error) << dendl;
-       schedule_retry_aio_requests();
+                               << cpp_strerror(r) << dendl;
+       Mutex::Locker l(m_timer_lock);
+       FunctionContext *ctx = new FunctionContext(boost::bind(
+         &ImageWatcher::reregister_watch, this));
+       m_timer->add_event_after(RETRY_DELAY_SECONDS, ctx);
         return;
       }
+
+      m_watch_state = WATCH_STATE_REGISTERED;
     }
     handle_header_update();
 
index 1efd32f7b372828a6bebdd54871d3419dfa73420..0261b1307701304efa793d36b55ce37064892503 100644 (file)
@@ -55,7 +55,6 @@ namespace librbd {
 
     int register_watch();
     int unregister_watch();
-    int get_watch_error();
 
     bool has_pending_aio_operations();
     void flush_aio_operations();
@@ -88,6 +87,12 @@ namespace librbd {
       LOCK_OWNER_STATE_RELEASING
     };
 
+    enum WatchState {
+      WATCH_STATE_UNREGISTERED,
+      WATCH_STATE_REGISTERED,
+      WATCH_STATE_ERROR
+    };
+
     typedef std::pair<Context *, ProgressContext *> AsyncRequest;
     typedef std::pair<boost::function<int(AioCompletion *)>,
                      AioCompletion *> AioRequest;
@@ -151,8 +156,10 @@ namespace librbd {
 
     ImageCtx &m_image_ctx;
 
+    RWLock m_watch_lock;
     WatchCtx m_watch_ctx;
-    uint64_t m_handle;
+    uint64_t m_watch_handle;
+    WatchState m_watch_state;
 
     LockOwnerState m_lock_owner_state;
 
@@ -161,9 +168,6 @@ namespace librbd {
     Mutex m_timer_lock;
     SafeTimer *m_timer;
 
-    RWLock m_watch_lock;
-    int m_watch_error;
-
     RWLock m_async_request_lock;
     uint64_t m_async_request_id;
     std::map<uint64_t, AsyncRequest> m_async_requests;
index 285f0551eebe7fd60fd9968b16178ea4303a4626..fb59a0b2eae701900e9066b7990304e896f7c794 100644 (file)
@@ -1778,16 +1778,6 @@ reprotect_and_return_err:
     bool needs_refresh = ictx->last_refresh != ictx->refresh_seq;
     ictx->refresh_lock.Unlock();
 
-    if (ictx->image_watcher != NULL) {
-      // might have encountered an error re-registering a watch
-      int r = ictx->image_watcher->get_watch_error();
-      if (r < 0) {
-        lderr(cct) << "rbd header watch invalid: " << cpp_strerror(r)
-                   << dendl;
-       return r;
-      }
-    }
-
     if (needs_refresh) {
       RWLock::WLocker l(ictx->md_lock);