From: Yan, Zheng Date: Fri, 6 May 2016 11:07:07 +0000 (+0800) Subject: mds: finish lock waiters in the same order that they were added. X-Git-Tag: v10.2.3~98^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=eea0e916640c3ac2d69ffb9c335dde6332b03938;p=ceph.git mds: finish lock waiters in the same order that they were added. The current code processes lock waiters with smaller wait masks first. Lock waiters with large wait masks can starve if a client keeps sending requests that add waiters with small masks. Signed-off-by: Yan, Zheng (cherry picked from commit d463107473382170c07d9250bb7ace0e5a2a7de2) --- diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 188750a195dc4..34b1fb1512eb7 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -251,13 +251,13 @@ void MDSRankDispatcher::shutdown() /** * Helper for simple callbacks that call a void fn with no args. */ -class C_VoidFn : public MDSInternalContext +class C_MDS_VoidFn : public MDSInternalContext { typedef void (MDSRank::*fn_ptr)(); protected: fn_ptr fn; public: - C_VoidFn(MDSRank *mds_, fn_ptr fn_) + C_MDS_VoidFn(MDSRank *mds_, fn_ptr fn_) : MDSInternalContext(mds_), fn(fn_) { assert(mds_); @@ -1160,7 +1160,7 @@ void MDSRank::resolve_start() reopen_log(); - mdcache->resolve_start(new C_VoidFn(this, &MDSRank::resolve_done)); + mdcache->resolve_start(new C_MDS_VoidFn(this, &MDSRank::resolve_done)); finish_contexts(g_ceph_context, waiting_for_resolve); } void MDSRank::resolve_done() @@ -1177,7 +1177,7 @@ void MDSRank::reconnect_start() reopen_log(); } - server->reconnect_clients(new C_VoidFn(this, &MDSRank::reconnect_done)); + server->reconnect_clients(new C_MDS_VoidFn(this, &MDSRank::reconnect_done)); finish_contexts(g_ceph_context, waiting_for_reconnect); } void MDSRank::reconnect_done() @@ -1194,7 +1194,7 @@ void MDSRank::rejoin_joint_start() void MDSRank::rejoin_start() { dout(1) << "rejoin_start" << dendl; - mdcache->rejoin_start(new C_VoidFn(this, 
&MDSRank::rejoin_done)); + mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done)); } void MDSRank::rejoin_done() { @@ -1299,7 +1299,7 @@ void MDSRank::boot_create() { dout(3) << "boot_create" << dendl; - MDSGatherBuilder fin(g_ceph_context, new C_VoidFn(this, &MDSRank::creating_done)); + MDSGatherBuilder fin(g_ceph_context, new C_MDS_VoidFn(this, &MDSRank::creating_done)); mdcache->init_layouts(); diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index faec26ad5a473..6d1d7fab9d55d 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -306,7 +306,7 @@ public: parent->take_waiting(mask << get_wait_shift(), ls); } void add_waiter(uint64_t mask, MDSInternalContextBase *c) { - parent->add_waiter(mask << get_wait_shift(), c); + parent->add_waiter((mask << get_wait_shift()) | MDSCacheObject::WAIT_ORDERED, c); } bool is_waiter_for(uint64_t mask) const { return parent->is_waiter_for(mask << get_wait_shift()); diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 066b26b3536fa..2fefbb7159d53 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -1038,6 +1038,8 @@ void cap_reconnect_t::generate_test_instances(list<cap_reconnect_t*>& ls) ls.back()->capinfo.cap_id = 1; } +uint64_t MDSCacheObject::last_wait_seq = 0; + void MDSCacheObject::dump(Formatter *f) const { f->dump_bool("is_auth", is_auth()); diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index e789856b36d7c..88f184de04290 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -1336,6 +1336,7 @@ class MDSCacheObject { // -- wait -- + const static uint64_t WAIT_ORDERED = (1ull<<61); const static uint64_t WAIT_SINGLEAUTH = (1ull<<60); const static uint64_t WAIT_UNFREEZE = (1ull<<59); // pka AUTHPINNABLE @@ -1544,7 +1545,8 @@ protected: // --------------------------------------------- // waiting protected: - compact_multimap<uint64_t, MDSInternalContextBase*> waiting; + compact_multimap<uint64_t, pair<uint64_t, MDSInternalContextBase*> > waiting; + static uint64_t last_wait_seq; public: bool is_waiter_for(uint64_t mask, uint64_t min=0) { @@ -1553,7 +1555,7 @@ 
protected: while (min & (min-1)) // if more than one bit is set min &= min-1; // clear LSB } - for (compact_multimap<uint64_t,MDSInternalContextBase*>::iterator p = waiting.lower_bound(min); + for (auto p = waiting.lower_bound(min); p != waiting.end(); ++p) { if (p->first & mask) return true; @@ -1564,7 +1566,15 @@ protected: virtual void add_waiter(uint64_t mask, MDSInternalContextBase *c) { if (waiting.empty()) get(PIN_WAITER); - waiting.insert(pair<uint64_t,MDSInternalContextBase*>(mask, c)); + + uint64_t seq = 0; + if (mask & WAIT_ORDERED) { + seq = ++last_wait_seq; + mask &= ~WAIT_ORDERED; + } + waiting.insert(pair<uint64_t, pair<uint64_t, MDSInternalContextBase*> >( + mask, + pair<uint64_t, MDSInternalContextBase*>(seq, c))); // pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this)) // << "add_waiter " << hex << mask << dec << " " << c // << " on " << *this @@ -1573,10 +1583,18 @@ protected: } virtual void take_waiting(uint64_t mask, list<MDSInternalContextBase*>& ls) { if (waiting.empty()) return; - compact_multimap<uint64_t,MDSInternalContextBase*>::iterator it = waiting.begin(); - while (it != waiting.end()) { + + // process ordered waiters in the same order that they were added. + std::map<uint64_t, MDSInternalContextBase*> ordered_waiters; + + for (auto it = waiting.begin(); + it != waiting.end(); ) { if (it->first & mask) { - ls.push_back(it->second); + + if (it->second.first > 0) + ordered_waiters.insert(it->second); + else + ls.push_back(it->second.second); // pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this)) // << "take_waiting mask " << hex << mask << dec << " took " << it->second // << " tag " << hex << it->first << dec @@ -1591,6 +1609,11 @@ protected: ++it; } } + for (auto it = ordered_waiters.begin(); + it != ordered_waiters.end(); + ++it) { + ls.push_back(it->second); + } if (waiting.empty()) put(PIN_WAITER); }