From: Yan, Zheng Date: Thu, 13 Dec 2018 13:18:23 +0000 (+0800) Subject: mds: extend reconnect period when mds is busy X-Git-Tag: v12.2.11~36^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=222f17d15f6bcfa6199dc8b982f3a8614d2d433e;p=ceph.git mds: extend reconnect period when mds is busy Fixes: https://tracker.ceph.com/issues/37644 Signed-off-by: "Yan, Zheng" (cherry picked from commit ad1f964395f65c5d3baed712c0949928998e47be) Conflicts: src/mds/Server.cc --- diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 3cb08b15e715..90b2ef8e330a 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1074,6 +1074,8 @@ void Server::handle_client_reconnect(MClientReconnect *m) } mdcache->rejoin_recovered_client(session->get_client(), session->info.inst); + reconnect_last_seen = clock::now(); + // remove from gather set client_reconnect_gather.erase(from); if (client_reconnect_gather.empty()) @@ -1095,51 +1097,70 @@ void Server::reconnect_gather_finish() void Server::reconnect_tick() { if (reconnect_evicting) { - dout(4) << "reconnect_tick: waiting for evictions" << dendl; + dout(7) << "reconnect_tick: waiting for evictions" << dendl; return; } - auto elapse = std::chrono::duration<double>(clock::now() - reconnect_start).count(); - if (elapse >= g_conf()->mds_reconnect_timeout && - !client_reconnect_gather.empty()) { - dout(10) << "reconnect timed out" << dendl; + if (client_reconnect_gather.empty()) + return; - // If we're doing blacklist evictions, use this to wait for them before - // proceeding to reconnect_gather_finish - MDSGatherBuilder gather(g_ceph_context); + auto now = clock::now(); + auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count(); + if (elapse1 < g_conf->mds_reconnect_timeout) + return; - for (set<client_t>::iterator p = client_reconnect_gather.begin(); - p != client_reconnect_gather.end(); - ++p) { - Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v)); - assert(session); - dout(1) << "reconnect 
gave up on " << session->info.inst << dendl; - - mds->clog->warn() << "evicting unresponsive client " << *session - << ", after waiting " << g_conf->mds_reconnect_timeout - << " seconds during MDS startup"; - - if (g_conf->mds_session_blacklist_on_timeout) { - std::stringstream ss; - mds->evict_client(session->info.inst.name.num(), false, true, ss, - gather.new_sub()); - } else { - kill_session(session, NULL); - } + vector<Session*> remaining_sessions; + remaining_sessions.reserve(client_reconnect_gather.size()); + for (auto c : client_reconnect_gather) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v)); + ceph_assert(session); + remaining_sessions.push_back(session); + // client re-sends cap flush messages before the reconnect message + if (session->last_seen > reconnect_last_seen) + reconnect_last_seen = session->last_seen; + } - failed_reconnects++; - } - client_reconnect_gather.clear(); + auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count(); + if (elapse2 < g_conf->mds_reconnect_timeout / 2) { + dout(7) << "reconnect_tick: last seen " << elapse2 + << " seconds ago, extending reconnect interval" << dendl; + return; + } - if (gather.has_subs()) { - dout(1) << "reconnect will complete once clients are evicted" << dendl; - gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext( - [this](int r){reconnect_gather_finish();}))); - gather.activate(); - reconnect_evicting = true; + dout(7) << "reconnect timed out, " << remaining_sessions.size() + << " clients have not reconnected in time" << dendl; + + // If we're doing blacklist evictions, use this to wait for them before + // proceeding to reconnect_gather_finish + MDSGatherBuilder gather(g_ceph_context); + + for (auto session : remaining_sessions) { + dout(1) << "reconnect gives up on " << session->info.inst << dendl; + + mds->clog->warn() << "evicting unresponsive client " << *session + << ", after waiting " << elapse1 + << " seconds during MDS startup"; + + if 
(g_conf->mds_session_blacklist_on_timeout) { + std::stringstream ss; + mds->evict_client(session->get_client().v, false, true, ss, + gather.new_sub()); } else { - reconnect_gather_finish(); + kill_session(session, NULL); } + + failed_reconnects++; + } + client_reconnect_gather.clear(); + + if (gather.has_subs()) { + dout(1) << "reconnect will complete once clients are evicted" << dendl; + gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext( + [this](int r){reconnect_gather_finish();}))); + gather.activate(); + reconnect_evicting = true; + } else { + reconnect_gather_finish(); } } diff --git a/src/mds/Server.h b/src/mds/Server.h index e369eb2bfffe..4fff7ae6fe17 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -90,6 +90,7 @@ private: bool reconnect_evicting; // true if I am waiting for evictions to complete // before proceeding to reconnect_gather_finish time reconnect_start = time::min(); + time reconnect_last_seen = time::min(); set<client_t> client_reconnect_gather; // clients i need a reconnect msg from. double cap_revoke_eviction_timeout = 0;