From 65e6daa7f198b18137919e121ff578cdca401da9 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 13 Jun 2018 16:53:20 +0800
Subject: [PATCH] client: introduce ceph_abort_conn()

the function forcibly close mds sessions, leave client state intact
on MDS.

Fixes: http://tracker.ceph.com/issues/24465
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 src/client/Client.cc           | 138 +++++++++++++++++++--------------
 src/client/Client.h            |  12 +--
 src/client/Delegation.cc       |   2 +-
 src/include/cephfs/libcephfs.h |   8 ++
 src/libcephfs.cc               |  13 ++++
 src/pybind/cephfs/cephfs.pyx   |  12 +++
 6 files changed, 122 insertions(+), 63 deletions(-)

diff --git a/src/client/Client.cc b/src/client/Client.cc
index 213b78465d4..e2d92c23712 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -248,8 +248,6 @@ Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
     objecter(objecter_),
     whoami(mc->get_global_id()), cap_epoch_barrier(0),
     last_tid(0), oldest_tid(0), last_flush_tid(1),
-    initialized(false),
-    mounted(false), unmounting(false), blacklisted(false),
     local_osd(-ENXIO), local_osd_epoch(0),
     unsafe_sync_write(0),
     client_lock("Client::client_lock"),
@@ -2421,29 +2419,8 @@ void Client::handle_osd_map(MOSDMap *m)
         });
     lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
     blacklisted = true;
-    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
-         p != mds_requests.end(); ) {
-      auto req = p->second;
-      ++p;
-      req->abort(-EBLACKLISTED);
-      if (req->caller_cond) {
-	req->kick = true;
-	req->caller_cond->Signal();
-      }
-    }
 
-    // Progress aborts on any requests that were on this waitlist.  Any
-    // requests that were on a waiting_for_open session waitlist
-    // will get kicked during close session below.
-    signal_cond_list(waiting_for_mdsmap);
-
-    // Force-close all sessions: assume this is not abandoning any state
-    // on the MDS side because the MDS will have seen the blacklist too.
-    while(!mds_sessions.empty()) {
-      auto i = mds_sessions.begin();
-      auto &session = i->second;
-      _closed_mds_session(&session);
-    }
+    _abort_mds_sessions(-EBLACKLISTED);
 
     // Since we know all our OSD ops will fail, cancel them all preemtively,
     // so that on an unhealthy cluster we can umount promptly even if e.g.
@@ -5784,17 +5761,58 @@ void Client::flush_mdlog(MetaSession *session)
 }
 
 
-void Client::_unmount()
+void Client::_abort_mds_sessions(int err)
+{
+  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
+    auto req = p->second;
+    ++p;
+    // unsafe requests will be removed during close session below.
+    if (req->got_unsafe)
+      continue;
+
+    req->abort(err);
+    if (req->caller_cond) {
+      req->kick = true;
+      req->caller_cond->Signal();
+    }
+  }
+
+  // Process aborts on any requests that were on this waitlist.
+  // Any requests that were on a waiting_for_open session waitlist
+  // will get kicked during close session below.
+  signal_cond_list(waiting_for_mdsmap);
+
+  // Force-close all sessions
+  while(!mds_sessions.empty()) {
+    auto& session = mds_sessions.begin()->second;
+    _closed_mds_session(&session);
+  }
+}
+
+void Client::_unmount(bool abort)
 {
   if (unmounting)
     return;
 
-  ldout(cct, 2) << "unmounting" << dendl;
+  if (abort || blacklisted) {
+    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
+  } else {
+    ldout(cct, 2) << "unmounting" << dendl;
+  }
   unmounting = true;
 
   deleg_timeout = 0;
 
-  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
+  if (abort) {
+    // Abort all mds sessions
+    _abort_mds_sessions(-ENOTCONN);
+
+    objecter->op_cancel_writes(-ENOTCONN);
+  } else {
+    // flush the mdlog for pending requests, if any
+    flush_mdlog_sync();
+  }
+
   while (!mds_requests.empty()) {
     ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
     mount_cond.Wait(client_lock);
@@ -5830,23 +5848,6 @@ void Client::_unmount()
 
   _ll_drop_pins();
 
-  if (blacklisted) {
-    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;
-
-    if (cct->_conf->client_oc) {
-      // Purge all cached data so that ObjectCacher doesn't get hung up
-      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
-      // is to just leave things marked dirty
-      // (http://tracker.ceph.com/issues/9105)
-      for (const auto &i : inode_map) {
-        objectcacher->purge_set(&(i.second->oset));
-      }
-    }
-
-    mounted = false;
-    return;
-  }
-
   while (unsafe_sync_write > 0) {
     ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"  << dendl;
     mount_cond.Wait(client_lock);
@@ -5854,27 +5855,40 @@ void Client::_unmount()
 
   if (cct->_conf->client_oc) {
     // flush/release all buffered data
-    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
-    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
-	 p != inode_map.end();
-	 p = next) {
-      next = p;
-      ++next;
-      Inode *in = p->second;
+    std::list<InodeRef> anchor;
+    for (auto& p : inode_map) {
+      Inode *in = p.second;
       if (!in) {
-	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
+	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
 	assert(in);
       }
-      if (!in->caps.empty()) {
-	InodeRef tmp_ref(in);
+
+      // prevent inode from getting freed
+      anchor.emplace_back(in);
+
+      if (abort || blacklisted) {
+        objectcacher->purge_set(&in->oset);
+      } else if (!in->caps.empty()) {
 	_release(in);
 	_flush(in, new C_Client_FlushComplete(this, in));
       }
     }
   }
 
-  flush_caps_sync();
-  wait_sync_caps(last_flush_tid);
+  if (abort || blacklisted) {
+    for (auto p = dirty_list.begin(); !p.end(); ) {
+      Inode *in = *p;
+      ++p;
+      if (in->dirty_caps) {
+	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
+	in->mark_caps_clean();
+	put_inode(in);
+      }
+    }
+  } else {
+    flush_caps_sync();
+    wait_sync_caps(last_flush_tid);
+  }
 
   // empty lru cache
   trim_cache();
@@ -5910,7 +5924,13 @@ void Client::_unmount()
 void Client::unmount()
 {
   Mutex::Locker lock(client_lock);
-  _unmount();
+  _unmount(false);
+}
+
+void Client::abort_conn()
+{
+  Mutex::Locker lock(client_lock);
+  _unmount(true);
 }
 
 void Client::flush_cap_releases()
@@ -9895,6 +9915,10 @@ void Client::_release_filelocks(Fh *fh)
   if (to_release.empty())
     return;
 
+  // mds has already released filelocks if session was closed.
+  if (in->caps.empty())
+    return;
+
   struct flock fl;
   memset(&fl, 0, sizeof(fl));
   fl.l_whence = SEEK_SET;
diff --git a/src/client/Client.h b/src/client/Client.h
index cb0af08c74e..c19f4de7edf 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -381,10 +381,10 @@ protected:
   void handle_client_reply(MClientReply *reply);
   bool is_dir_operation(MetaRequest *request);
 
-  bool   initialized;
-  bool   mounted;
-  bool   unmounting;
-  bool   blacklisted;
+  bool   initialized = false;
+  bool   mounted = false;
+  bool   unmounting = false;
+  bool   blacklisted = false;
 
   // When an MDS has sent us a REJECT, remember that and don't
   // contact it again.  Remember which inst rejected us, so that
@@ -492,8 +492,9 @@ protected:
   void put_inode(Inode *in, int n=1);
   void close_dir(Dir *dir);
 
+  void _abort_mds_sessions(int err);
   // same as unmount() but for when the client_lock is already held
-  void _unmount();
+  void _unmount(bool abort);
 
   friend class C_Client_FlushComplete; // calls put_inode()
   friend class C_Client_CacheInvalidate;  // calls ino_invalidate_cb
@@ -952,6 +953,7 @@ public:
   int mount(const std::string &mount_root, const UserPerm& perms,
 	    bool require_mds=false);
   void unmount();
+  void abort_conn();
 
   int mds_command(
     const std::string &mds_spec,
diff --git a/src/client/Delegation.cc b/src/client/Delegation.cc
index 6c972fde05a..c628cf66df0 100644
--- a/src/client/Delegation.cc
+++ b/src/client/Delegation.cc
@@ -23,7 +23,7 @@ public:
 	  ": delegation return timeout for inode 0x" <<
 	  std::hex << in->ino << ". Forcibly unmounting client. "<<
 	  client << std::dec << dendl;
-    client->_unmount();
+    client->_unmount(false);
   }
 };
 
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
index e23bcf24d1b..77ebb5107b5 100644
--- a/src/include/cephfs/libcephfs.h
+++ b/src/include/cephfs/libcephfs.h
@@ -298,6 +298,14 @@ void ceph_buffer_free(char *buf);
  */
 int ceph_unmount(struct ceph_mount_info *cmount);
 
+/**
+ * Abort mds connections
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_abort_conn(struct ceph_mount_info *cmount);
+
 /**
  * Destroy the mount handle.
  *
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index a65e763da7e..e2d4b513731 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -147,6 +147,14 @@ public:
     shutdown();
     return 0;
   }
+  int abort_conn()
+  {
+    if (mounted) {
+      client->abort_conn();
+      mounted = false;
+    }
+    return 0;
+  }
 
   void shutdown()
   {
@@ -349,6 +357,11 @@ extern "C" int ceph_unmount(struct ceph_mount_info *cmount)
   return cmount->unmount();
 }
 
+extern "C" int ceph_abort_conn(struct ceph_mount_info *cmount)
+{
+  return cmount->abort_conn();
+}
+
 extern "C" int ceph_release(struct ceph_mount_info *cmount)
 {
   if (cmount->is_mounted())
diff --git a/src/pybind/cephfs/cephfs.pyx b/src/pybind/cephfs/cephfs.pyx
index 47f2b1ea8bc..049d5be664d 100644
--- a/src/pybind/cephfs/cephfs.pyx
+++ b/src/pybind/cephfs/cephfs.pyx
@@ -111,6 +111,7 @@ cdef extern from "cephfs/libcephfs.h" nogil:
 
     int ceph_mount(ceph_mount_info *cmount, const char *root)
     int ceph_unmount(ceph_mount_info *cmount)
+    int ceph_abort_conn(ceph_mount_info *cmount)
     int ceph_fstatx(ceph_mount_info *cmount, int fd, statx *stx, unsigned want, unsigned flags)
     int ceph_statx(ceph_mount_info *cmount, const char *path, statx *stx, unsigned want, unsigned flags)
     int ceph_statfs(ceph_mount_info *cmount, const char *path, statvfs *stbuf)
@@ -559,6 +560,17 @@ cdef class LibCephFS(object):
             raise make_ex(ret, "error calling ceph_unmount")
         self.state = "initialized"
 
+    def abort_conn(self):
+        """
+        Abort mds connections.
+        """
+        self.require_state("mounted")
+        with nogil:
+            ret = ceph_abort_conn(self.cluster)
+        if ret != 0:
+            raise make_ex(ret, "error calling ceph_abort_conn")
+        self.state = "initialized"
+
     def statfs(self, path):
         """
         Perform a statfs on the ceph file system.  This call fills in file system wide statistics
-- 
2.39.5