From 777bcba0a1f5f2b1118de4ea16e01c2f3a28c852 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 15 Apr 2011 10:02:46 -0700 Subject: [PATCH] mds: don't skip inodes in journal that may be trimmed during replay During replay we trim non-auth inodes on EExport or EImportFinish abort. Subtree trimming may be delayed, too. Skip parents if the diri is in the same blob, or if it is journaled in the current segment *and* it is in a subtree that is unambiguously auth. We can't easily be more precise than that because the actual event we care about on replay is EExport, but the migrator doesn't twiddle auth bits to false until later. Also, reset last_journaled on import. This fixes replay bugs like 2011-04-13 18:15:18.064029 7f65588ef710 mds1.journal EImportStart.replay 10000000015 bounds [] 2011-04-13 18:15:18.064034 7f65588ef710 mds1.journal EMetaBlob.replay 2 dirlumps by unknown0 2011-04-13 18:15:18.064040 7f65588ef710 mds1.journal EMetaBlob.replay dir 10000000010 2011-04-13 18:15:18.064046 7f65588ef710 mds1.journal EMetaBlob.replay missing dir ino 10000000010 mds/journal.cc: In function 'void EMetaBlob::replay(MDS*, LogSegment*)', in thread '0x7f65588ef710' mds/journal.cc: 407: FAILED assert(0) ceph version 0.25-683-g653580a (commit:653580ae84c471c34872f14a0308c78af71f7243) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x53) [0xa53d26] 2: (EMetaBlob::replay(MDS*, LogSegment*)+0x7eb) [0x7a737d] Fixes: #994 Signed-off-by: Sage Weil --- src/mds/Migrator.cc | 7 +++- src/mds/Migrator.h | 2 +- src/mds/Server.cc | 2 +- src/mds/events/EMetaBlob.h | 42 +-------------------- src/mds/journal.cc | 77 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 85 insertions(+), 45 deletions(-) diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index c5798cfb41864..cf714d549a144 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -2142,7 +2142,7 @@ void Migrator::import_finish(CDir *dir) void Migrator::decode_import_inode(CDentry *dn, 
bufferlist::iterator& blp, int oldauth, - LogSegment *ls, + LogSegment *ls, uint64_t log_offset, map >& cap_imports, list& updated_scatterlocks) { @@ -2165,6 +2165,9 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int o // state after link -- or not! -sage in->decode_import(blp, ls); // cap imports are noted for later action + // note that we are journaled at this log offset + in->last_journaled = log_offset; + // caps decode_import_inode_caps(in, blp, cap_imports); @@ -2338,7 +2341,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp, } else if (icode == 'I') { // inode - decode_import_inode(dn, blp, oldauth, ls, cap_imports, updated_scatterlocks); + decode_import_inode(dn, blp, oldauth, ls, le->get_start_off(), cap_imports, updated_scatterlocks); } // add dentry to journal entry diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h index 086e09ea75670..e7816339c6947 100644 --- a/src/mds/Migrator.h +++ b/src/mds/Migrator.h @@ -235,7 +235,7 @@ public: public: void decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, - LogSegment *ls, + LogSegment *ls, uint64_t log_offset, map >& cap_imports, list& updated_scatterlocks); void decode_import_inode_caps(CInode *in, diff --git a/src/mds/Server.cc b/src/mds/Server.cc index f3b339ed41cc1..4c5840c27cc91 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -5016,7 +5016,7 @@ version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferl list updated_scatterlocks; // we clear_updated explicitly below mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, - mdr->ls, + mdr->ls, 0, mdr->more()->cap_imports, updated_scatterlocks); srcdnl->get_inode()->filelock.clear_dirty(); srcdnl->get_inode()->nestlock.clear_dirty(); diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 35e762eb14fa0..deb4ed3b9ed2d 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -633,47 +633,7 @@ private: 
static const int TO_AUTH_SUBTREE_ROOT = 0; // default. static const int TO_ROOT = 1; - void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT) { - // already have this dir? (we must always add in order) - if (lump_map.count(dir->dirfrag())) - return; - - if (mode == TO_AUTH_SUBTREE_ROOT) { - //return; // hack: for comparison purposes.. what if NO context? - - // subtree root? - if (dir->is_subtree_root() && dir->is_auth()) - return; - - // was the inode journaled since the last subtree_map? - if (//false && // for benchmarking - last_subtree_map && - dir->inode->last_journaled >= last_subtree_map) { - /* - cout << " inode " << dir->inode->inode.ino - << " last journaled at " << dir->inode->last_journaled - << " and last_subtree_map is " << last_subtree_map - << std::endl; - */ - return; - } - } - - // stop at root/stray - CInode *diri = dir->get_inode(); - if (!diri->get_projected_parent_dn()) - return; - - // journaled? - - // add parent dn - CDentry *parent = diri->get_projected_parent_dn(); - add_dir_context(parent->get_dir(), mode); - add_dentry(parent, false); - } - - - + void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT); void print(ostream& out) const { out << "[metablob"; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 6a739e01bd11d..7e77443f9b5a6 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -317,6 +317,83 @@ EMetaBlob::EMetaBlob(MDLog *mdlog) : root(NULL), my_offset(mdlog ? mdlog->get_write_pos() : 0) //, _segment(0) { } +void EMetaBlob::add_dir_context(CDir *dir, int mode) +{ + MDS *mds = dir->cache->mds; + + list<CDentry*> parents; + + // it may be okay not to include the maybe items, if + // - we journaled the maybe child inode in this segment + // - that subtree turns out to be unambiguously auth + list<CDentry*> maybe; + bool maybenot = false; + + while (true) { + // already have this dir? 
(we must always add in order) + if (lump_map.count(dir->dirfrag())) { + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl; + break; + } + + // stop at root/stray + CInode *diri = dir->get_inode(); + CDentry *parent = diri->get_projected_parent_dn(); + + if (!parent) + break; + + if (mode == TO_AUTH_SUBTREE_ROOT) { + // subtree root? + if (dir->is_subtree_root()) { + if (dir->is_auth() && !dir->is_ambiguous_auth()) { + // it's an auth subtree, we don't need maybe (if any), and we're done. + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe + << " at " << *dir << dendl; + maybe.clear(); + break; + } else { + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe + << " at " << *dir << dendl; + // we need the maybe list after all! + parents.splice(parents.begin(), maybe); + maybenot = false; + } + } + + // was the inode journaled in this blob? + if (my_offset && diri->last_journaled == my_offset) { + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl; + break; + } + + // have we journaled this inode since the last subtree map? 
+ if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) { + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment (" + << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag " + << *diri << dendl; + maybenot = true; + } + } + + if (maybenot) { + dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl; + maybe.push_front(parent); + } else { + dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl; + parents.push_front(parent); + } + + dir = parent->get_dir(); + } + + parents.splice(parents.begin(), maybe); + + dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl; + for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); p++) + add_dentry(*p, false); +} + void EMetaBlob::update_segment(LogSegment *ls) { // atids? -- 2.39.5