From: Sage Weil Date: Thu, 19 Aug 2010 22:19:33 +0000 (-0700) Subject: mds: add wait on auth change machinery X-Git-Tag: v0.22~236^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=aa226df5cae22c03f7015e26aa775f075be3ea6f;p=ceph.git mds: add wait on auth change machinery Special wait mask is passed through lock wait mask to parent object. Caller adds item to a list on the subtree root. Removal of wait item automatically removes from said list. Subtree topology changes adjust authchange wait lists. Migrator auth change update wakes waiters. Import/export should be protected by freeze/thaw or the blanket wakeups. --- diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc index 2d7979e8876f..6abba341d3ae 100644 --- a/src/mds/CDentry.cc +++ b/src/mds/CDentry.cc @@ -548,3 +548,8 @@ void CDentry::remove_client_lease(ClientLease *l, Locker *locker) } + +CDir *CDentry::get_containing_subtree() +{ + return get_dir()->get_containing_subtree(); +} diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 77aab7700ecc..e68842d8b763 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -258,6 +258,8 @@ public: lru_unpin(); } + CDir *get_containing_subtree(); + // auth pins bool can_auth_pin(); void auth_pin(void *by); diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index a4fb5fb944a3..ab6540bf3dee 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -139,7 +139,7 @@ ostream& CDir::print_db_line_prefix(ostream& out) // CDir CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : - item_dirty(this), item_new(this) + item_dirty(this), item_new(this), waiting_on_auth_change(member_offset(MDSCacheObject, item_waiting_on_auth_change)) { g_num_dir++; g_num_dira++; @@ -880,7 +880,7 @@ void CDir::add_waiter(uint64_t tag, Context *c) -/* NOTE: this checks dentry waiters too */ +/* NOTE: this checks dentry and authchange waiters too */ void CDir::take_waiting(uint64_t mask, list& ls) { if ((mask & WAIT_DENTRY) && waiting_on_dentry.size()) { @@ 
-895,6 +895,15 @@ void CDir::take_waiting(uint64_t mask, list& ls) } put(PIN_DNWAITER); } + + if (mask & MDSCacheObject::WAIT_AUTHCHANGE) { + elist::iterator p = waiting_on_auth_change.begin(); + while (!p.end()) { + MDSCacheObject *o = *p; + ++p; + o->take_waiting(MDSCacheObject::WAIT_AUTHCHANGE, ls); // careful, this removes *o from the elist + } + } // waiting MDSCacheObject::take_waiting(mask, ls); @@ -2217,5 +2226,9 @@ void CDir::unfreeze_dir() +CDir *CDir::get_containing_subtree() +{ + return cache->get_subtree_root(this); +} diff --git a/src/mds/CDir.h b/src/mds/CDir.h index a01e57311e62..7ae958163885 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -254,6 +254,16 @@ protected: int num_dentries_auth_subtree_nested; + // extra wait stuff + elist waiting_on_auth_change; // only on subtree roots + +public: + void add_auth_change_waiter(MDSCacheObject *o) { + waiting_on_auth_change.push_back(&o->item_waiting_on_auth_change); + } +protected: + + // friends friend class Migrator; friend class CInode; @@ -333,6 +343,8 @@ private: public: bool try_trim_snap_dentry(CDentry *dn, const set& snaps); + CDir *get_containing_subtree(); + public: void split(int bits, list& subs, list& waiters, bool replay); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index f5ea2e43fc0e..27febc85fec8 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -416,6 +416,10 @@ void CInode::put_stickydirs() +CDir *CInode::get_containing_subtree() +{ + return get_projected_parent_dn()->get_dir()->get_containing_subtree(); +} // pins diff --git a/src/mds/CInode.h b/src/mds/CInode.h index bfef15713135..19d4dd7bd389 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -264,6 +264,9 @@ public: void get_stickydirs(); void put_stickydirs(); + CDir *get_containing_subtree(); + + protected: // parent dentries in cache CDentry *parent; // primary link diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 4e84393c9c45..c6fc3e150fca 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ 
-808,11 +808,18 @@ bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mut, bool as_anon) } // wait! - int wait_on; + uint64_t wait_on; if (lock->get_parent()->is_auth() && lock->is_stable()) wait_on = SimpleLock::WAIT_RD; - else - wait_on = SimpleLock::WAIT_STABLE; // REQRDLOCK is ignored if lock is unstable, so we need to retry. + else { + // REQRDLOCK is ignored if lock is unstable, so we need to retry on stable OR auth change + wait_on = SimpleLock::WAIT_STABLE; + if (!lock->get_parent()->is_auth()) { + wait_on |= MDSCacheObject::WAIT_AUTHCHANGE; + CDir *subtree = lock->get_parent()->get_containing_subtree(); + subtree->add_auth_change_waiter(in); + } + } dout(7) << "rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl; lock->add_waiter(wait_on, new C_MDS_RetryRequest(mdcache, mut)); nudge_log(lock); @@ -904,7 +911,6 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait) } else { // replica. - // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case). 
int auth = lock->get_parent()->authority().first; dout(10) << "requesting scatter from auth on " << *lock << " on " << *lock->get_parent() << dendl; @@ -915,7 +921,13 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait) if (!nowait) { dout(7) << "wrlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut)); + uint64_t mask = SimpleLock::WAIT_STABLE; + if (!lock->get_parent()->is_auth()) { + mask |= MDSCacheObject::WAIT_AUTHCHANGE; + CDir *subtree = lock->get_parent()->get_containing_subtree(); + subtree->add_auth_change_waiter(in); + } + lock->add_waiter(mask, new C_MDS_RetryRequest(mdcache, mut)); nudge_log(lock); } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index c4872fa3198a..f7529266547c 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -752,6 +752,13 @@ void MDCache::try_subtree_merge_at(CDir *dir) subtrees.erase(dir); subtrees[parent].erase(dir); + // move auth change waiters + while (!dir->waiting_on_auth_change.empty()) { + MDSCacheObject *o = dir->waiting_on_auth_change.front(); + parent->add_auth_change_waiter(o); + dout(10) << " moved auth change waiter " << *o << dendl; + } + // adjust popularity? if (dir->is_auth()) { utime_t now = g_clock.now(); @@ -870,6 +877,17 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair::iterator q = root->waiting_on_auth_change.begin(); + while (!q.end()) { + MDSCacheObject *o = *q; + ++q; + if (o->get_containing_subtree() == dir) { + dout(20) << " moving auth change waiter " << *o << dendl; + dir->add_auth_change_waiter(o); // careful, this removes *o from root's list + } + } // i am a bound of the parent subtree. 
subtrees[root].insert(dir); @@ -1006,6 +1024,7 @@ void MDCache::remove_subtree(CDir *dir) assert(subtrees[p].count(dir)); subtrees[p].erase(dir); } + assert(dir->waiting_on_auth_change.empty()); } void MDCache::get_subtree_bounds(CDir *dir, set& bounds) diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 9b325c8e122e..70994f19fd0a 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -2249,6 +2249,15 @@ void Migrator::handle_export_notify(MExportDirNotify *m) set have; cache->map_dirfrag_set(m->get_bounds(), have); cache->adjust_bounded_subtree_auth(dir, have, new_auth); + + if (new_auth.second == CDIR_AUTH_UNKNOWN) { + // wake up any auth change waiters + list ls; + dir->take_waiting(MDSCacheObject::WAIT_AUTHCHANGE, ls); + if (!ls.empty()) + dout(10) << "handle_export_notify woke up some AUTHCHANGE waiters" << dendl; + mds->queue_waiters(ls); + } // induce a merge? cache->try_subtree_merge(dir); diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 7b2ac0c2574f..a686aa19ab41 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -288,7 +288,8 @@ public: parent->take_waiting(mask << get_wait_shift(), ls); } void add_waiter(uint64_t mask, Context *c) { - parent->add_waiter(mask << get_wait_shift(), c); + // preserve WAIT_AUTHCHANGE bit unshifted, if present. 
+ parent->add_waiter((mask << get_wait_shift()) | (mask & MDSCacheObject::WAIT_AUTHCHANGE), c); } bool is_waiter_for(uint64_t mask) { return parent->is_waiter_for(mask << get_wait_shift()); diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 0263df9942c8..b79682fd1fbf 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -16,6 +16,7 @@ using namespace std; #include "include/frag.h" #include "include/xlist.h" +#include "include/elist.h" #include @@ -1665,6 +1666,8 @@ class SimpleLock; class MDSCacheObject; +class CDir; + // -- authority delegation -- // directory authority types // >= 0 is the auth mds @@ -1772,6 +1775,7 @@ class MDSCacheObject { // -- wait -- const static uint64_t WAIT_SINGLEAUTH = (1ull<<60); const static uint64_t WAIT_UNFREEZE = (1ull<<59); // pka AUTHPINNABLE + const static uint64_t WAIT_AUTHCHANGE = (1ull<<58); // ============================================ @@ -1780,7 +1784,8 @@ class MDSCacheObject { MDSCacheObject() : state(0), ref(0), - replica_nonce(0) {} + replica_nonce(0), + waiting_on_auth_change(0) {} virtual ~MDSCacheObject() {} // printing @@ -1812,6 +1817,7 @@ class MDSCacheObject { bool is_ambiguous_auth() { return authority().second != CDIR_AUTH_UNKNOWN; } + virtual CDir *get_containing_subtree() = 0; // -------------------------------------------- // pins @@ -1957,7 +1963,10 @@ protected: // waiting protected: multimap waiting; - + int waiting_on_auth_change; + elist::item item_waiting_on_auth_change; + friend class CDir; + public: bool is_waiter_for(uint64_t mask, uint64_t min=0) { if (!min) { @@ -1977,6 +1986,10 @@ protected: if (waiting.empty()) get(PIN_WAITER); waiting.insert(pair(mask, c)); + + if (mask & WAIT_AUTHCHANGE) + waiting_on_auth_change++; + pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) << "add_waiter " << hex << mask << dec << " " << c << " on " << *this @@ -1989,6 +2002,13 @@ protected: while (it != waiting.end()) { if (it->first & mask) { ls.push_back(it->second); + + if (it->first & 
WAIT_AUTHCHANGE) { + waiting_on_auth_change--; + if (!waiting_on_auth_change) + item_waiting_on_auth_change.remove_myself(); + } + pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) << "take_waiting mask " << hex << mask << dec << " took " << it->second << " tag " << it->first