From: John Spray
Date: Tue, 10 Mar 2015 14:40:30 +0000 (+0000)
Subject: mds: give up replicas of a stopping mds's stuff
X-Git-Tag: v9.0.0~170^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F3752%2Fhead;p=ceph.git

mds: give up replicas of a stopping mds's stuff

In order for an MDS to make it through stopping
when it had some strays, the other ranks that
serviced the migrate_stray renames must ensure
that they give up any cache objects that belonged
to the stopping MDS, so that the stopping MDS
can finish emptying its cache.

Fixes: #10744

Signed-off-by: John Spray
---
Review notes (not part of the commit message):
 - handle_cache_expire(): the new dirfrag log line dereferenced 'dir'
   before the existing NULL check that follows it; it is now guarded.
 - Dropped two unused 'p' iterator declarations that were immediately
   shadowed by the for-loop's own 'p'.
 - Hunk headers were adjusted for the changed line counts.  Container
   template arguments, line breaks and indentation were rebuilt from a
   flattened copy of this patch, so context whitespace may not match the
   tree exactly.

diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index e13e80e075c0..e30e0c268c87 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -583,6 +583,9 @@ public:
   // -- accessors --
   bool is_root() const { return inode.ino == MDS_INO_ROOT; }
   bool is_stray() const { return MDS_INO_IS_STRAY(inode.ino); }
+  mds_rank_t get_stray_owner() const {
+    return (mds_rank_t)MDS_INO_STRAY_OWNER(inode.ino);
+  }
   bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); }
   bool is_base() const { return is_root() || is_mdsdir(); }
   bool is_system() const { return inode.ino < MDS_INO_SYSTEM_BASE; }
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 1bf5c2819c3e..64758f3db530 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3548,6 +3548,60 @@ void MDCache::remove_inode_recursive(CInode *in)
   remove_inode(in);
 }
 
+bool MDCache::expire_recursive(
+  CInode *in,
+  map<mds_rank_t, MCacheExpire*>& expiremap,
+  CDir *subtree)
+{
+  assert(!in->is_auth());
+
+  dout(10) << __func__ << ":" << *in << dendl;
+
+  mds_rank_t owner = subtree->dir_auth.first;
+  MCacheExpire *expire_msg = expiremap[owner];
+  assert(expire_msg);
+
+  // Recurse into any dirfrags beneath this inode
+  list<CDir*> ls;
+  in->get_dirfrags(ls);
+  for (std::list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+    CDir *subdir = *p;
+
+    dout(10) << __func__ << ": entering dirfrag " << subdir << dendl;
+    for (CDir::map_t::iterator q = subdir->items.begin();
+         q != subdir->items.end(); ++q) {
+      CDentry *dn = q->second;
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      if (dnl->is_primary()) {
+        CInode *tin = dnl->get_inode();
+        dout(10) << __func__ << ": tin="
+          << *tin << dendl;
+
+        /* Remote strays with linkage (i.e. hardlinks) should not be
+         * expired, because they may be the target of
+         * a rename() as the owning MDS shuts down */
+        if (!tin->is_dir() && tin->inode.nlink) {
+          dout(10) << __func__ << ": child still has linkage" << dendl;
+          return true;
+        }
+
+        const bool abort = expire_recursive(tin, expiremap, subtree);
+        if (abort) {
+          return true;
+        }
+      }
+      if (dn->lru_is_expireable()) {
+        trim_dentry(dn, expiremap);
+      } else {
+        dout(10) << __func__ << ": dn still has linkage " << *dn << dendl;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 void MDCache::trim_unlinked_inodes()
 {
   dout(7) << "trim_unlinked_inodes" << dendl;
@@ -6158,6 +6212,48 @@ bool MDCache::trim(int max, int count)
       trim_inode(0, root, 0, expiremap);
   }
 
+  // Trim remote stray dirs for stopping MDS ranks
+  std::list<CDir*> subtree_list;
+  list_subtrees(subtree_list);  // Take copy because will modify in loop
+  for (std::list<CDir*>::iterator s = subtree_list.begin();
+       s != subtree_list.end(); ++s) {
+    CDir *subtree = *s;
+    if (subtree->inode->is_mdsdir()) {
+      mds_rank_t owner = mds_rank_t(MDS_INO_MDSDIR_OWNER(subtree->inode->ino()));
+      if (owner == mds->whoami) {
+        continue;
+      }
+
+      dout(20) << __func__ << ": checking remote MDS dir " << *(subtree) << dendl;
+
+      const MDSMap::mds_info_t &owner_info = mds->mdsmap->get_mds_info(owner);
+      if (owner_info.state == MDSMap::STATE_STOPPING) {
+        dout(20) << __func__ << ": it's stopping, remove it" << dendl;
+        if (expiremap.count(owner) == 0)  {
+          expiremap[owner] = new MCacheExpire(mds->get_nodeid());
+        }
+
+        const bool aborted = expire_recursive(
+            subtree->inode, expiremap, subtree);
+        if (!aborted) {
+          dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
+          CInode *subtree_in = subtree->inode;
+          list<CDir*> ls;
+          subtree->inode->get_dirfrags(ls);
+          for (std::list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+            CDir *frag = *p;
+            trim_dirfrag(frag, subtree, expiremap);
+          }
+          trim_inode(NULL, subtree_in, NULL, expiremap);
+        } else {
+          dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
+        }
+      } else {
+        dout(20) << __func__ << ": not stopping, leaving it alone" << dendl;
+      }
+    }
+  }
+
   // send any expire messages
   send_expire_messages(expiremap);
 
@@ -6770,6 +6866,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
           assert(in);
         }
         assert(in->is_auth());
+        dout(20) << __func__ << ": expiring inode " << *in << dendl;
 
         // check nonce
         if (nonce == in->get_replica_nonce(from)) {
@@ -6793,6 +6890,11 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
          ++it) {
       CDir *dir = get_dirfrag(it->first);
       unsigned nonce = it->second;
+      // get_dirfrag() may return NULL (handled by the !dir branch below),
+      // so only dereference dir for logging when it is present.
+      if (dir) {
+        dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
+      }
 
       if (!dir) {
         CInode *diri = get_inode(it->first.ino);
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index c31ffdbd2a6e..dca96b958757 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -615,6 +615,25 @@ public:
       uncommitted_slave_rename_olddir.count(dir->inode) == 0;
   }
 
+  /**
+   * For all unreferenced inodes, dirs, dentries below an inode, compose
+   * expiry messages.  This is used when giving up all replicas of entities
+   * for an MDS peer in the 'stopping' state, such that the peer can
+   * empty its cache and finish shutting down.
+   *
+   * We have to make sure we're only expiring un-referenced items to
+   * avoid interfering with ongoing stray-movement (we can't distinguish
+   * between the "moving my strays" and "waiting for my cache to empty"
+   * phases within 'stopping')
+   *
+   * @return false if we completed cleanly, true if caller should stop
+   *         expiring because we hit something with refs.
+   */
+  bool expire_recursive(
+    CInode *in,
+    std::map<mds_rank_t, MCacheExpire*>& expiremap,
+    CDir *subtree);
+
   void trim_client_leases();
   void check_memory_usage();
 
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index ca6f3efb9363..8c479dc87453 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -64,6 +64,7 @@
 
 #define MDS_INO_IS_STRAY(i)  ((i) >= MDS_INO_STRAY_OFFSET  && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
 #define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
+#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
 #define MDS_INO_IS_BASE(i)   (MDS_INO_ROOT == (i) || MDS_INO_IS_MDSDIR(i))
 #define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
 #define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)