From ffcbcdd61f2b797aa17deaeca9c7f0ef3ea20a34 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 20 Mar 2014 11:30:46 +0800 Subject: [PATCH] mds: handle race between cache rejoin and fragmenting MDCache::handle_cache_expire() ignores mismatched dirfrags. This is OK during normal operation because the MDS doesn't trim replica inodes whose dirfrags are likely being fragmented (see commit 22535340). During recovery, the recovering MDS can receive a survivor MDS' cache expire message before it sends cache rejoin acks. In this case, there can still be mismatched dirfrags, but nothing prevents the survivor MDS from trimming the inodes of these mismatched dirfrags. So there can be unconnected dirfrags when the recovering MDS sends cache rejoin acks. The fix is: when a mismatched dirfrag is encountered during recovery, check whether the inode of the dirfrag is still replicated to the sender MDS. If the inode is not replicated, remove the sender MDS from the replica maps of all child dirfrags. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 92361ff296284..f9b01b70ba7a1 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6301,6 +6301,11 @@ bool MDCache::trim(int max) CDir *dir = p->first; ++p; if (!dir->is_auth() && !dir->get_inode()->is_auth()) { + // don't trim subtree root if its auth MDS is recovering. + // This simplify the cache rejoin code. + if (dir->is_subtree_root() && + rejoin_ack_gather.count(dir->get_dir_auth().first)) + continue; if (dir->get_num_ref() == 1) // subtree pin trim_dirfrag(dir, 0, expiremap); } @@ -6494,8 +6499,11 @@ bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map dfls; in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) - trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. 
root dirfrag), use *p + for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { + CDir *dir = *p; + assert(!dir->is_subtree_root()); + trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p + } } // INODE @@ -6916,6 +6924,22 @@ void MDCache::handle_cache_expire(MCacheExpire *m) if (!dir) { CInode *diri = get_inode(it->first.ino); if (diri) { + if (mds->is_rejoin() && + rejoin_ack_gather.count(mds->whoami) && // haven't sent rejoin ack yet + !diri->is_replica(from)) { + list ls; + diri->get_nested_dirfrags(ls); + dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from + << " while rejoining, inode isn't replicated" << dendl; + for (list::iterator q = ls.begin(); q != ls.end(); ++q) { + dir = *q; + if (dir->is_replica(from)) { + dout(7) << " dir expire on " << *dir << " from mds." << from << dendl; + dir->remove_replica(from); + } + } + continue; + } CDir *other = diri->get_approx_dirfrag(it->first.frag); if (other) { dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from -- 2.39.5