git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: extend reconnect period when mds is busy 25784/head
author Yan, Zheng <zyan@redhat.com>
Thu, 13 Dec 2018 13:18:23 +0000 (21:18 +0800)
committer Yan, Zheng <zyan@redhat.com>
Mon, 7 Jan 2019 08:41:10 +0000 (16:41 +0800)
Fixes: https://tracker.ceph.com/issues/37644
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
(cherry picked from commit ad1f964395f65c5d3baed712c0949928998e47be)

 Conflicts:
src/mds/Server.cc

src/mds/Server.cc
src/mds/Server.h

index 3cb08b15e7157f2487d222653336b9df828cba27..90b2ef8e330a3f12a1a28822ec7c398c43c72dc9 100644 (file)
@@ -1074,6 +1074,8 @@ void Server::handle_client_reconnect(MClientReconnect *m)
   }
   mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
 
+  reconnect_last_seen = clock::now();
+
   // remove from gather set
   client_reconnect_gather.erase(from);
   if (client_reconnect_gather.empty())
@@ -1095,51 +1097,70 @@ void Server::reconnect_gather_finish()
 void Server::reconnect_tick()
 {
   if (reconnect_evicting) {
-    dout(4) << "reconnect_tick: waiting for evictions" << dendl;
+    dout(7) << "reconnect_tick: waiting for evictions" << dendl;
     return;
   }
 
-  auto elapse = std::chrono::duration<double>(clock::now() - reconnect_start).count();
-  if (elapse >= g_conf()->mds_reconnect_timeout &&
-      !client_reconnect_gather.empty()) {
-    dout(10) << "reconnect timed out" << dendl;
+  if (client_reconnect_gather.empty())
+    return;
 
-    // If we're doing blacklist evictions, use this to wait for them before
-    // proceeding to reconnect_gather_finish
-    MDSGatherBuilder gather(g_ceph_context);
+  auto now = clock::now();
+  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
+  if (elapse1 < g_conf->mds_reconnect_timeout)
+    return;
 
-    for (set<client_t>::iterator p = client_reconnect_gather.begin();
-        p != client_reconnect_gather.end();
-        ++p) {
-      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
-      assert(session);
-      dout(1) << "reconnect gave up on " << session->info.inst << dendl;
-
-      mds->clog->warn() << "evicting unresponsive client " << *session
-                        << ", after waiting " << g_conf->mds_reconnect_timeout
-                        << " seconds during MDS startup";
-
-      if (g_conf->mds_session_blacklist_on_timeout) {
-        std::stringstream ss;
-        mds->evict_client(session->info.inst.name.num(), false, true, ss,
-                          gather.new_sub());
-      } else {
-        kill_session(session, NULL);
-      }
+  vector<Session*> remaining_sessions;
+  remaining_sessions.reserve(client_reconnect_gather.size());
+  for (auto c : client_reconnect_gather) {
+    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
+    ceph_assert(session);
+    remaining_sessions.push_back(session);
+    // client re-sends cap flush messages before the reconnect message
+    if (session->last_seen > reconnect_last_seen)
+      reconnect_last_seen = session->last_seen;
+  }
 
-      failed_reconnects++;
-    }
-    client_reconnect_gather.clear();
+  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
+  if (elapse2 < g_conf->mds_reconnect_timeout / 2) {
+    dout(7) << "reconnect_tick: last seen " << elapse2
+            << " seconds ago, extending reconnect interval" << dendl;
+    return;
+  }
 
-    if (gather.has_subs()) {
-      dout(1) << "reconnect will complete once clients are evicted" << dendl;
-      gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
-            [this](int r){reconnect_gather_finish();})));
-      gather.activate();
-      reconnect_evicting = true;
+  dout(7) << "reconnect timed out, " << remaining_sessions.size()
+         << " clients have not reconnected in time" << dendl;
+
+  // If we're doing blacklist evictions, use this to wait for them before
+  // proceeding to reconnect_gather_finish
+  MDSGatherBuilder gather(g_ceph_context);
+
+  for (auto session : remaining_sessions) {
+    dout(1) << "reconnect gives up on " << session->info.inst << dendl;
+
+    mds->clog->warn() << "evicting unresponsive client " << *session
+                     << ", after waiting " << elapse1
+                     << " seconds during MDS startup";
+
+    if (g_conf->mds_session_blacklist_on_timeout) {
+      std::stringstream ss;
+      mds->evict_client(session->get_client().v, false, true, ss,
+                       gather.new_sub());
     } else {
-      reconnect_gather_finish();
+      kill_session(session, NULL);
     }
+
+    failed_reconnects++;
+  }
+  client_reconnect_gather.clear();
+
+  if (gather.has_subs()) {
+    dout(1) << "reconnect will complete once clients are evicted" << dendl;
+    gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
+           [this](int r){reconnect_gather_finish();})));
+    gather.activate();
+    reconnect_evicting = true;
+  } else {
+    reconnect_gather_finish();
   }
 }
 
index e369eb2bfffe655a61c185ce441b77cf0672fee4..4fff7ae6fe17c807e86f6831e0a46f70f61c7fe4 100644 (file)
@@ -90,6 +90,7 @@ private:
   bool reconnect_evicting;  // true if I am waiting for evictions to complete
                             // before proceeding to reconnect_gather_finish
   time reconnect_start = time::min();
+  time reconnect_last_seen = time::min();
   set<client_t> client_reconnect_gather;  // clients i need a reconnect msg from.
 
   double cap_revoke_eviction_timeout = 0;