From: Yan, Zheng Date: Sun, 19 Jan 2014 10:37:22 +0000 (+0800) Subject: mds: avoid race between cache expire and pushing replicas X-Git-Tag: v0.78~165^2~6 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=22535340b402bd4a466f85d11d937f7b2513d6c5;p=ceph.git mds: avoid race between cache expire and pushing replicas MDentryLink and MMDSFragmentNotify push replica inode/dirfrags to other MDSs. Both are racy because, by the time the target MDS receives them, it may have expired the replicated inode/dirfrags' ancestor. The race creates unconnected replica inode/dirfrags. Unconnected replicas are problematic for subtree migration because the migrator sends MExportDirNotify according to the subtree dirfrag's replica list, so an MDS that contains unconnected replicas may not receive MExportDirNotify. The fix is, for MDentryLink and MMDSFragmentNotify messages that may be received later, to avoid trimming their parent replica objects. If a null replica dentry is not readable, we may receive an MDentryLink message later. If a replica inode's dirfragtreelock is not readable, it's likely that some dirfrags of the inode are being fragmented, so we may receive an MMDSFragmentNotify message later. Signed-off-by: Yan, Zheng --- diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 6d604d7df1ce..2bdb2290b48e 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6311,8 +6311,41 @@ bool MDCache::trim_dentry(CDentry *dn, map& expiremap) assert(dn->is_auth()); } + // adjust the dir state + // NOTE: we can safely remove a clean, null dentry without effecting + // directory completeness. + // (check this _before_ we unlink the inode, below!) + bool null_dentry = false; + bool clear_complete = false; + if (!(dnl->is_null() && dn->is_clean())) + clear_complete = true; + + // unlink the dentry + if (dnl->is_remote()) { + // just unlink. + dir->unlink_inode(dn); + } else if (dnl->is_primary()) { + // expire the inode, too. 
+ CInode *in = dnl->get_inode(); + assert(in); + if (trim_inode(dn, in, con, expiremap)) + return true; // purging stray instead of trimming + } else { + assert(dnl->is_null()); + null_dentry = true; + } + // notify dentry authority? if (!dn->is_auth()) { + // If null replica dentry is not readable, it's likely we will + // receive a MDentryLink message soon. MDentryLink message only + // replicates an inode, so we should avoid trimming the inode's + // parent dentry. This is because that unconnected replicas are + // problematic for subtree migration. + if (null_dentry && !dn->lock.can_read(-1) && + !dn->get_dir()->get_inode()->is_stray()) + return true; + pair auth = dn->authority(); for (int p=0; p<2; p++) { @@ -6331,32 +6364,6 @@ bool MDCache::trim_dentry(CDentry *dn, map& expiremap) } } - // adjust the dir state - // NOTE: we can safely remove a clean, null dentry without effecting - // directory completeness. - // (check this _before_ we unlink the inode, below!) - bool clear_complete = false; - if (!(dnl->is_null() && dn->is_clean())) - clear_complete = true; - - // unlink the dentry - if (dnl->is_remote()) { - // just unlink. - dir->unlink_inode(dn); - } - else if (dnl->is_primary()) { - // expire the inode, too. - CInode *in = dnl->get_inode(); - assert(in); - trim_inode(dn, in, con, expiremap); - // purging stray instead of trimming ? 
- if (dn->get_num_ref() > 0) - return true; - } - else { - assert(dnl->is_null()); - } - // remove dentry if (dir->is_auth()) dir->add_to_bloom(dn); @@ -6419,18 +6426,28 @@ void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expire in->close_dirfrag(dir->dirfrag().frag); } -void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap) +bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap) { dout(15) << "trim_inode " << *in << dendl; assert(in->get_num_ref() == 0); - - // DIR - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) - trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. root dirfrag), use *p + + if (in->is_dir()) { + // If replica inode's dirfragtreelock is not readable, it's likely + // some dirfrags of the inode are being fragmented and we will receive + // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new + // dirfrags, so we should avoid trimming these dirfrags' parent inode. + // This is because that unconnected replicas are problematic for + // subtree migration. + // + if (!in->is_auth() && !in->dirfragtreelock.can_read(-1)) + return true; + + // DIR + list dfls; + in->get_dirfrags(dfls); + for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) + trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. 
root dirfrag), use *p + } // INODE if (in->is_auth()) { @@ -6438,7 +6455,7 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, mapget_num_ref() > 0) - return; + return true; } } else { pair auth = in->authority(); @@ -6480,6 +6497,7 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, mapget_dir()->unlink_inode(dn); remove_inode(in); + return false; } @@ -10772,16 +10790,8 @@ void MDCache::handle_dentry_link(MDentryLink *m) ::decode(d_type, p); dir->link_remote_inode(dn, ino, d_type); } - } else if (m->get_is_primary()) { - CInode *in = add_replica_inode(p, NULL, finished); - assert(in->get_num_ref() == 0); - assert(in->get_parent_dn() == NULL); - map expiremap; - int from = m->get_source().num(); - expiremap[from] = new MCacheExpire(mds->get_nodeid()); - expiremap[from]->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce()); - send_expire_messages(expiremap); - remove_inode(in); + } else { + assert(0); } if (!finished.empty()) @@ -11704,6 +11714,8 @@ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) add_replica_dir(p, diri, notify->get_source().num(), waiters); mds->queue_waiters(waiters); + } else { + assert(0); } notify->put(); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 3aab1fa77a2d..d523c575254a 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -571,7 +571,7 @@ public: bool trim_dentry(CDentry *dn, map& expiremap); void trim_dirfrag(CDir *dir, CDir *con, map& expiremap); - void trim_inode(CDentry *dn, CInode *in, CDir *con, + bool trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap); void send_expire_messages(map& expiremap); void trim_non_auth(); // trim out trimmable non-auth items