From 9d271696b7735ab5b7384cdd386d6ac3eaafe437 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 17 Feb 2018 09:37:48 +0800 Subject: [PATCH] mds: fix check of underwater dentries Underwater dentry is dentry that is dirty in our cache from journal replay, but had already been flushed to disk before the mds failed. To decide if an dentry is underwater, original code compares dirty dentry's version to on-disk dirfrag's version. This method is racy because CDir::log_mark_dirty() can increase dirfrag's version without adding log event. After mds failover, version of dirfrag from journal replay can be less than on-disk dirfrag's version. So newly dirtied dentry can be equal to or less than the on-disk dirfrag's version. The race can cause incorrect fragstat/rstat Fixes: http://tracker.ceph.com/issues/23032 Signed-off-by: Yan, Zheng --- src/mds/CDir.cc | 92 ++++++++++++++++++++++++++----------------------- src/mds/CDir.h | 3 +- 2 files changed, 50 insertions(+), 45 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index a210ba23894d0..aeaa54d94ce1d 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1658,8 +1658,7 @@ CDentry *CDir::_load_dentry( bufferlist &bl, const int pos, const std::set *snaps, - bool *force_dirty, - list *undef_inodes) + bool *force_dirty) { bufferlist::iterator q = bl.begin(); @@ -1711,10 +1710,16 @@ CDentry *CDir::_load_dentry( } if (dn) { - if (dn->get_linkage()->get_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << dendl; - } else { - dout(12) << "_fetched had dentry " << *dn << dendl; + CDentry::linkage_t *dnl = dn->get_linkage(); + dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl; + if (committed_version == 0 && + dnl->is_remote() && + dn->is_dirty() && + ino == dnl->get_remote_ino() && + d_type == dnl->get_remote_d_type()) { + // see comment below + dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; + dn->mark_clean(); } } else { // (remote) link @@ -1747,15 +1752,35 @@ CDentry *CDir::_load_dentry( bool undef_inode = false; if (dn) { - CInode *in = dn->get_linkage()->get_inode(); - if (in) { - dout(12) << "_fetched had dentry " << *dn << dendl; - if (in->state_test(CInode::STATE_REJOINUNDEF)) { - undef_inodes->push_back(in); - undef_inode = true; - } - } else - dout(12) << "_fetched had NEG dentry " << *dn << dendl; + CDentry::linkage_t *dnl = dn->get_linkage(); + dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl; + + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + if (in->state_test(CInode::STATE_REJOINUNDEF)) { + undef_inode = true; + } else if (committed_version == 0 && + dn->is_dirty() && + inode_data.inode.ino == in->ino() && + inode_data.inode.version == in->get_version()) { + /* clean underwater item? + * Underwater item is something that is dirty in our cache from + * journal replay, but was previously flushed to disk before the + * mds failed. + * + * We only do this is committed_version == 0. that implies either + * - this is a fetch after from a clean/empty CDir is created + * (and has no effect, since the dn won't exist); or + * - this is a fetch after _recovery_, which is what we're worried + * about. Items that are marked dirty from the journal should be + * marked clean if they appear on disk. + */ + dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; + dn->mark_clean(); + dout(10) << "_fetched had underwater inode " << *dnl->get_inode() << ", marking clean" << dendl; + in->mark_clean(); + } + } } if (!dn || undef_inode) { @@ -1917,7 +1942,7 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, try { dn = _load_dentry( p->first, dname, last, p->second, pos, snaps, - &force_dirty, &undef_inodes); + &force_dirty); } catch (const buffer::error &err) { cache->mds->clog->warn() << "Corrupt dentry '" << dname << "' in " "dir frag " << dirfrag() << ": " @@ -1936,35 +1961,16 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, continue; } - if (dn && (wanted_items.count(mempool::mds_co::string(dname)) > 0 || !complete)) { - dout(10) << " touching wanted dn " << *dn << dendl; - inode->mdcache->touch_dentry(dn); - } + if (!dn) + continue; - /** clean underwater item? - * Underwater item is something that is dirty in our cache from - * journal replay, but was previously flushed to disk before the - * mds failed. - * - * We only do this is committed_version == 0. that implies either - * - this is a fetch after from a clean/empty CDir is created - * (and has no effect, since the dn won't exist); or - * - this is a fetch after _recovery_, which is what we're worried - * about. Items that are marked dirty from the journal should be - * marked clean if they appear on disk. - */ - if (committed_version == 0 && - dn && - dn->get_version() <= got_fnode.version && - dn->is_dirty()) { - dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; - dn->mark_clean(); + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary() && dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) + undef_inodes.push_back(dnl->get_inode()); - if (dn->get_linkage()->is_primary()) { - assert(dn->get_linkage()->get_inode()->get_version() <= got_fnode.version); - dout(10) << "_fetched had underwater inode " << *dn->get_linkage()->get_inode() << ", marking clean" << dendl; - dn->get_linkage()->get_inode()->mark_clean(); - } + if (wanted_items.count(mempool::mds_co::string(dname)) > 0 || !complete) { + dout(10) << " touching wanted dn " << *dn << dendl; + inode->mdcache->touch_dentry(dn); } } diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 3e6bc0eb2d290..5528c02aa60a9 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -605,8 +605,7 @@ protected: bufferlist &bl, int pos, const std::set *snaps, - bool *force_dirty, - std::list *undef_inodes); + bool *force_dirty); /** * Mark this fragment as BADFRAG (common part of go_bad and go_bad_dentry) -- 2.39.5