From: Yan, Zheng Date: Mon, 13 Jan 2014 02:26:08 +0000 (+0800) Subject: mds: introduce fine-grained discover dirfrag wait queue X-Git-Tag: v0.78~165^2~26 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1ff776669b1c332a3bad6e87581f790d0d45e3ce;p=ceph.git mds: introduce fine-grained discover dirfrag wait queue Current discover dirfrag code only allows discovering one dirfrag at a time. This can cause a deadlock if there are directories that are fragmented into several dirfrags. For example: mds.0 mds.1 ----------------------------------------------------------------- freeze subtree (1.*) with bound (2.1*) discover (2.0*) -> handle discover (2.0*), frozen tree, wait <- export subtree (1.*) to with bound (2.1*) discover (2.1*), wait Signed-off-by: Yan, Zheng --- diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 2c0b7e45a67c..3507f10b08d8 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -531,7 +531,7 @@ protected: map< inodeno_t, list > waiting_on_ino; public: - bool is_waiting_for_dentry(const char *dname, snapid_t snap) { + bool is_waiting_for_dentry(const string& dname, snapid_t snap) { return waiting_on_dentry.count(string_snap_t(dname, snap)); } void add_dentry_waiter(const string& dentry, snapid_t snap, Context *c); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 48ee839da9eb..82958ad3b4ff 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -1959,6 +1959,30 @@ bool CInode::is_freezing() return false; } +void CInode::add_dir_waiter(frag_t fg, Context *c) +{ + if (waiting_on_dir.empty()) + get(PIN_DIRWAITER); + waiting_on_dir[fg].push_back(c); + dout(10) << "add_dir_waiter frag " << fg << " " << c << " on " << *this << dendl; +} + +void CInode::take_dir_waiting(frag_t fg, list& ls) +{ + if (waiting_on_dir.empty()) + return; + + map >::iterator p = waiting_on_dir.find(fg); + if (p != waiting_on_dir.end()) { + dout(10) << "take_dir_waiting frag " << fg << " on " << *this << dendl; + ls.splice(ls.end(), p->second); + 
waiting_on_dir.erase(p); + + if (waiting_on_dir.empty()) + put(PIN_DIRWAITER); + } +} + void CInode::add_waiter(uint64_t tag, Context *c) { dout(10) << "add_waiter tag " << std::hex << tag << std::dec << " " << c @@ -1979,6 +2003,23 @@ void CInode::add_waiter(uint64_t tag, Context *c) MDSCacheObject::add_waiter(tag, c); } +void CInode::take_waiting(uint64_t mask, list& ls) +{ + if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) { + // take all dentry waiters + while (!waiting_on_dir.empty()) { + map >::iterator p = waiting_on_dir.begin(); + dout(10) << "take_waiting dirfrag " << p->first << " on " << *this << dendl; + ls.splice(ls.end(), p->second); + waiting_on_dir.erase(p); + } + put(PIN_DIRWAITER); + } + + // waiting + MDSCacheObject::take_waiting(mask, ls); +} + bool CInode::freeze_inode(int auth_pin_allowance) { assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 3b50560fb549..28400aa3744d 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -110,6 +110,7 @@ public: static const int PIN_DIRTYRSTAT = 21; static const int PIN_EXPORTINGCAPS = 22; static const int PIN_DIRTYPARENT = 23; + static const int PIN_DIRWAITER = 24; const char *pin_name(int p) { switch (p) { @@ -135,6 +136,7 @@ public: case PIN_NEEDSNAPFLUSH: return "needsnapflush"; case PIN_DIRTYRSTAT: return "dirtyrstat"; case PIN_DIRTYPARENT: return "dirtyparent"; + case PIN_DIRWAITER: return "dirwaiter"; default: return generic_pin_name(p); } } @@ -570,10 +572,17 @@ private: _decode_locks_state(p, is_new); } - // -- waiting -- +protected: + map > waiting_on_dir; +public: + void add_dir_waiter(frag_t fg, Context *c); + void take_dir_waiting(frag_t fg, list& ls); + bool is_waiting_for_dir(frag_t fg) { + return waiting_on_dir.count(fg); + } void add_waiter(uint64_t tag, Context *c); - + void take_waiting(uint64_t tag, list& ls); // -- encode/decode helpers -- void _encode_base(bufferlist& bl); @@ -584,7 +593,6 @@ 
private: void _decode_locks_state(bufferlist::iterator& p, bool is_new); void _decode_locks_rejoin(bufferlist::iterator& p, list& waiters); - // -- import/export -- void encode_export(bufferlist& bl); void finish_export(utime_t now); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 0e2073223ab5..a44e3a840235 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -9727,7 +9727,7 @@ void MDCache::discover_dir_frag(CInode *base, dout(7) << "discover_dir_frag " << df << " from mds." << from << dendl; - if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // FIXME: this is kind of weak! + if (!base->is_waiting_for_dir(approx_fg) || !onfinish) { discover_info_t& d = _create_discover(from); d.ino = base->ino(); d.frag = approx_fg; @@ -9736,7 +9736,7 @@ void MDCache::discover_dir_frag(CInode *base, } if (onfinish) - base->add_waiter(CInode::WAIT_DIR, onfinish); + base->add_dir_waiter(approx_fg, onfinish); } struct C_MDC_RetryDiscoverPath : public Context { @@ -9779,10 +9779,12 @@ void MDCache::discover_path(CInode *base, return; } + frag_t fg = base->pick_dirfrag(want_path[0]); if ((want_xlocked && want_path.depth() == 1) || - !base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // FIXME: weak! + !base->is_waiting_for_dir(fg) || !onfinish) { discover_info_t& d = _create_discover(from); d.ino = base->ino(); + d.frag = fg; d.snap = snap; d.want_path = want_path; d.want_base_dir = true; @@ -9792,7 +9794,7 @@ void MDCache::discover_path(CInode *base, // register + wait if (onfinish) - base->add_waiter(CInode::WAIT_DIR, onfinish); + base->add_dir_waiter(fg, onfinish); } struct C_MDC_RetryDiscoverPath2 : public Context { @@ -10329,46 +10331,36 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) if (who >= 0) dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; - // try again? 
- if (m->get_error_dentry().length()) { - // wanted a dentry - frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); - CDir *dir = cur->get_dirfrag(fg); - filepath relpath(m->get_error_dentry(), 0); - - if (cur->is_waiter_for(CInode::WAIT_DIR)) { - if (cur->is_auth() || dir) - cur->take_waiting(CInode::WAIT_DIR, finished); - else - discover_path(cur, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked(), who); - } else - dout(7) << " doing nothing, nobody is waiting for dir" << dendl; - - if (dir) { - // don't actaully need the hint, now - if (dir->is_waiting_for_dentry(m->get_error_dentry().c_str(), m->get_wanted_snapid())) { - if (dir->is_auth() || dir->lookup(m->get_error_dentry())) - dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(), - m->get_wanted_snapid(), finished); - else - discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked()); - } else - dout(7) << " doing nothing, have dir but nobody is waiting on dentry " - << m->get_error_dentry() << dendl; - } - } else { - // wanted dir or ino - frag_t fg = m->get_base_dir_frag(); - CDir *dir = cur->get_dirfrag(fg); + frag_t fg = m->get_base_dir_frag(); + CDir *dir = cur->get_dirfrag(fg); - if (cur->is_waiter_for(CInode::WAIT_DIR)) { - if (cur->is_auth() || dir) + if (m->get_wanted_base_dir()) { + if (cur->is_waiting_for_dir(fg)) { + if (cur->is_auth()) cur->take_waiting(CInode::WAIT_DIR, finished); + else if (dir) + cur->take_dir_waiting(fg, finished); else discover_dir_frag(cur, fg, 0, who); } else dout(7) << " doing nothing, nobody is waiting for dir" << dendl; + } + // try again? 
+ if (m->get_error_dentry().length()) { + // wanted a dentry + if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) { + if (dir->is_auth() || dir->lookup(m->get_error_dentry())) { + dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(), + m->get_wanted_snapid(), finished); + } else { + filepath relpath(m->get_error_dentry(), 0); + discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked()); + } + } else + dout(7) << " doing nothing, have dir but nobody is waiting on dentry " + << m->get_error_dentry() << dendl; + } else { if (dir && m->get_wanted_ino() && dir->is_waiting_for_ino(m->get_wanted_ino())) { if (dir->is_auth() || get_inode(m->get_wanted_ino())) dir->take_ino_waiting(m->get_wanted_ino(), finished); @@ -10428,7 +10420,7 @@ CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, int from, dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl; // get waiters - diri->take_waiting(CInode::WAIT_DIR, finished); + diri->take_dir_waiting(df.frag, finished); } return dir; @@ -11510,11 +11502,13 @@ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) // refragment list waiters; list resultfrags; - adjust_dir_fragments(diri, base, bits, - resultfrags, waiters, false); + adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false); if (g_conf->mds_debug_frag) diri->verify_dirfrags(); + for (list::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) + diri->take_dir_waiting((*p)->get_frag(), waiters); + /* // add new replica dirs values bufferlist::iterator p = notify->basebl.begin();