From: Sam Lang Date: Tue, 5 Mar 2013 14:48:29 +0000 (-0600) Subject: mds: New backtrace handling X-Git-Tag: v0.60~53 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4d0448f87626a8c67a2a34a79d679b82f80eca57;p=ceph.git mds: New backtrace handling Add unified backtrace handling for storing a backtrace on file objects (the first data object) and dirs. The backtrace store operation is queued on the LogSegment (for performing the store on log segment expire). We encode the backtrace on queue to avoid keeping a reference around to the CInode, which may get dropped from the cache by the time the log segment is expired (and the backtrace is written out). Fetching the backtrace is implemented on the CInode. Also allow incrementing/decrementing the DIRTYPARENT pin ref as needed, instead of using a state semaphore to keep track of whether it's set or not. This allows us to remove the STATE_DIRTYPARENT field on CInode. Signed-off-by: Sam Lang Reviewed-by: Greg Farnum --- diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 231630e08af..47c5fbec215 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1962,12 +1962,6 @@ void CDir::_commit(version_t want) map_t::iterator committed_dn; unsigned max_write_size = cache->max_dir_commit_size; - // update parent pointer while we're here. - // NOTE: the pointer is ONLY required to be valid for the first frag. we put the xattr - // on other frags too because it can't hurt, but it won't necessarily be up to date - // in that case!! - max_write_size -= inode->encode_parent_mutation(m); - if (is_complete() && (num_dirty > (num_head_items*g_conf->mds_dir_commit_ratio))) { fnode.snap_purged_thru = realm->get_last_destroyed(); @@ -2025,16 +2019,6 @@ void CDir::_committed(version_t v, version_t lrv) bool stray = inode->is_stray(); - // did we update the parent pointer too? 
- if (get_frag() == frag_t() && // only counts on first frag - inode->state_test(CInode::STATE_DIRTYPARENT) && - lrv == inode->inode.last_renamed_version) { - inode->item_renamed_file.remove_myself(); - inode->state_clear(CInode::STATE_DIRTYPARENT); - inode->put(CInode::PIN_DIRTYPARENT); - dout(10) << "_committed stored parent pointer, removed from renamed_files list " << *inode << dendl; - } - // take note. assert(v > committed_version); assert(v <= committing_version); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 42137f37828..ae75840978f 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -127,7 +127,6 @@ ostream& operator<<(ostream& out, CInode& in) if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover"; if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering"; - if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent"; if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; if (in.is_frozen_inode()) out << " FROZEN"; if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; @@ -967,90 +966,68 @@ void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin) delete fin; } - - -// ------------------ -// parent dir - -void CInode::build_backtrace(inode_backtrace_t& bt) -{ - bt.ino = inode.ino; - bt.ancestors.clear(); - - CInode *in = this; - CDentry *pdn = get_parent_dn(); - while (pdn) { - CInode *diri = pdn->get_dir()->get_inode(); - bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); - in = diri; - pdn = in->get_parent_dn(); - } -} - -unsigned CInode::encode_parent_mutation(ObjectOperation& m) -{ - string path; - make_path_string(path); - m.setxattr("path", path); - - inode_backtrace_t bt; - build_backtrace(bt); - - bufferlist parent; - ::encode(bt, parent); - m.setxattr("parent", parent); - return path.length() + parent.length(); -} - -struct C_Inode_StoredParent : public 
Context { +class C_CInode_FetchedBacktrace : public Context { CInode *in; - version_t version; + inode_backtrace_t *backtrace; Context *fin; - C_Inode_StoredParent(CInode *i, version_t v, Context *f) : in(i), version(v), fin(f) {} +public: + bufferlist bl; + C_CInode_FetchedBacktrace(CInode *i, inode_backtrace_t *bt, Context *f) : + in(i), backtrace(bt), fin(f) {} + void finish(int r) { - in->_stored_parent(version, fin); + if (r == 0) { + in->_fetched_backtrace(&bl, backtrace, fin); + } else { + fin->finish(r); + } } }; -void CInode::store_parent(Context *fin) +void CInode::fetch_backtrace(inode_backtrace_t *bt, Context *fin) { - dout(10) << "store_parent" << dendl; - - ObjectOperation m; - encode_parent_mutation(m); - - // write it. - SnapContext snapc; - object_t oid = get_object_name(ino(), frag_t(), ""); - object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool()); - - mdcache->mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, - NULL, new C_Inode_StoredParent(this, inode.last_renamed_version, fin) ); + object_locator_t oloc(inode.layout.fl_pg_pool); + SnapContext snapc; + C_CInode_FetchedBacktrace *c = new C_CInode_FetchedBacktrace(this, bt, fin); + mdcache->mds->objecter->getxattr(oid, oloc, "parent", CEPH_NOSNAP, &c->bl, 0, c); } -void CInode::_stored_parent(version_t v, Context *fin) +void CInode::_fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin) { - if (state_test(STATE_DIRTYPARENT)) { - if (v == inode.last_renamed_version) { - dout(10) << "stored_parent committed v" << v << ", removing from list" << dendl; - item_renamed_file.remove_myself(); - state_clear(STATE_DIRTYPARENT); - put(PIN_DIRTYPARENT); - } else { - dout(10) << "stored_parent committed v" << v << " < " << inode.last_renamed_version - << ", renamed again, not removing from list" << dendl; - } - } else { - dout(10) << "stored_parent committed v" << v << ", tho i wasn't on the renamed_files list" << dendl; - } + ::decode(*bt, *bl); if (fin) 
{ fin->finish(0); - delete fin; } } +void CInode::build_backtrace(int64_t location, inode_backtrace_t* bt) +{ + bt->ino = inode.ino; + bt->ancestors.clear(); + + CInode *in = this; + CDentry *pdn = get_parent_dn(); + while (pdn) { + CInode *diri = pdn->get_dir()->get_inode(); + bt->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); + in = diri; + pdn = in->get_parent_dn(); + } + vector::iterator i = inode.old_layouts.begin(); + while(i != inode.old_layouts.end()) { + // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0) + if (i->fl_pg_pool == location) + continue; + bt->old_pools.insert(i->fl_pg_pool); + i++; + } +} + +// ------------------ +// parent dir + void CInode::encode_store(bufferlist& bl) { ENCODE_START(4, 4, bl); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index e43a4565c65..81747014046 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -151,7 +151,6 @@ public: static const int STATE_NEEDSRECOVER = (1<<11); static const int STATE_RECOVERING = (1<<12); static const int STATE_PURGING = (1<<13); - static const int STATE_DIRTYPARENT = (1<<14); static const int STATE_DIRTYRSTAT = (1<<15); static const int STATE_STRAYPINNED = (1<<16); static const int STATE_FROZENAUTHPIN = (1<<17); @@ -385,7 +384,6 @@ public: elist::item item_dirty; elist::item item_caps; elist::item item_open_file; - elist::item item_renamed_file; elist::item item_dirty_dirfrag_dir; elist::item item_dirty_dirfrag_nest; elist::item item_dirty_dirfrag_dirfragtree; @@ -426,7 +424,7 @@ private: parent(0), inode_auth(CDIR_AUTH_DEFAULT), replica_caps_wanted(0), - item_dirty(this), item_caps(this), item_open_file(this), item_renamed_file(this), + item_dirty(this), item_caps(this), item_open_file(this), item_dirty_dirfrag_dir(this), item_dirty_dirfrag_nest(this), item_dirty_dirfrag_dirfragtree(this), @@ -529,11 +527,10 @@ private: void fetch(Context *fin); void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin); - void 
store_parent(Context *fin); - void _stored_parent(version_t v, Context *fin); + void fetch_backtrace(inode_backtrace_t *bt, Context *fin); + void _fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin); - void build_backtrace(inode_backtrace_t& bt); - unsigned encode_parent_mutation(ObjectOperation& m); + void build_backtrace(int64_t location, inode_backtrace_t* bt); void encode_store(bufferlist& bl); void decode_store(bufferlist::iterator& bl); diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index 7c7360466a5..bcdd54b4432 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -33,6 +33,19 @@ class CDentry; class MDS; class MDSlaveUpdate; +// The backtrace info struct here is used to maintain the backtrace in +// a queue that we will eventually want to write out (on journal segment +// expiry). +class BacktraceInfo { +public: + int64_t location; + int64_t pool; + struct inode_backtrace_t bt; + elist::item item_logseg; + BacktraceInfo(int64_t l, CInode *i, LogSegment *ls, int64_t p = -1); + ~BacktraceInfo(); +}; + class LogSegment { public: uint64_t offset, end; @@ -45,11 +58,12 @@ class LogSegment { elist dirty_dentries; elist open_files; - elist renamed_files; elist dirty_dirfrag_dir; elist dirty_dirfrag_nest; elist dirty_dirfrag_dirfragtree; + elist update_backtraces; + elist slave_updates; set truncating_inodes; @@ -76,13 +90,20 @@ class LogSegment { dirty_inodes(member_offset(CInode, item_dirty)), dirty_dentries(member_offset(CDentry, item_dirty)), open_files(member_offset(CInode, item_open_file)), - renamed_files(member_offset(CInode, item_renamed_file)), dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)), dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)), dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree)), + update_backtraces(member_offset(BacktraceInfo, item_logseg)), slave_updates(0), // passed to begin() manually inotablev(0), sessionmapv(0) { } -}; + + // 
backtrace handling + void queue_backtrace_update(CInode *in, int64_t location, int64_t pool = -1); + void remove_pending_backtraces(inodeno_t ino, int64_t pool); + void store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin); + void _stored_backtrace(BacktraceInfo *info, Context *fin); + unsigned encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info); + }; #endif diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 9ed021035bc..0fabbc633e7 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -620,7 +620,6 @@ void MDLog::standby_trim_segments() seg->dirty_inodes.clear_list(); seg->dirty_dentries.clear_list(); seg->open_files.clear_list(); - seg->renamed_files.clear_list(); seg->dirty_dirfrag_dir.clear_list(); seg->dirty_dirfrag_nest.clear_list(); seg->dirty_dirfrag_dirfragtree.clear_list(); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 6d0519ff8a4..888ea12344f 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -6348,16 +6348,6 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen if (destdn->is_auth()) { in->pop_and_dirty_projected_inode(mdr->ls); - if (in->is_dir()) { - mdr->ls->renamed_files.push_back(&in->item_renamed_file); - if (!in->state_test(CInode::STATE_DIRTYPARENT)) { - in->state_set(CInode::STATE_DIRTYPARENT); - in->get(CInode::PIN_DIRTYPARENT); - dout(10) << "added dir to logsegment renamed_files list " << *in << dendl; - } else { - dout(10) << "re-added dir to logsegment renamed_files list " << *in << dendl; - } - } } else { // FIXME: fix up snaprealm! 
} diff --git a/src/mds/inode_backtrace.cc b/src/mds/inode_backtrace.cc index c0457b28ff7..e2ab809db10 100644 --- a/src/mds/inode_backtrace.cc +++ b/src/mds/inode_backtrace.cc @@ -55,15 +55,17 @@ void inode_backpointer_t::generate_test_instances(list& ls void inode_backtrace_t::encode(bufferlist& bl) const { - ENCODE_START(4, 4, bl); + ENCODE_START(5, 4, bl); ::encode(ino, bl); ::encode(ancestors, bl); + ::encode(pool, bl); + ::encode(old_pools, bl); ENCODE_FINISH(bl); } void inode_backtrace_t::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); if (struct_v < 3) return; // sorry, the old data was crap ::decode(ino, bl); @@ -77,6 +79,10 @@ void inode_backtrace_t::decode(bufferlist::iterator& bl) ancestors.back().decode_old(bl); } } + if (struct_v >= 5) { + ::decode(pool, bl); + ::decode(old_pools, bl); + } DECODE_FINISH(bl); } @@ -90,6 +96,12 @@ void inode_backtrace_t::dump(Formatter *f) const f->close_section(); } f->close_section(); + f->dump_int("pool", pool); + f->open_array_section("old_pools"); + for (set::iterator p = old_pools.begin(); p != old_pools.end(); ++p) { + f->dump_int("old_pool", *p); + } + f->close_section(); } void inode_backtrace_t::generate_test_instances(list& ls) @@ -101,5 +113,8 @@ void inode_backtrace_t::generate_test_instances(list& ls) ls.back()->ancestors.back().dirino = 123; ls.back()->ancestors.back().dname = "bar"; ls.back()->ancestors.back().version = 456; + ls.back()->pool = 0; + ls.back()->old_pools.insert(10); + ls.back()->old_pools.insert(7); } diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h index 6b512913fd9..d223f724a99 100644 --- a/src/mds/inode_backtrace.h +++ b/src/mds/inode_backtrace.h @@ -47,6 +47,11 @@ inline ostream& operator<<(ostream& out, const inode_backpointer_t& ib) { struct inode_backtrace_t { inodeno_t ino; // my ino vector ancestors; + int64_t pool; + // we use a set for old_pools to avoid duplicate entries, e.g. 
setlayout 0, 1, 0 + set old_pools; + + inode_backtrace_t() : pool(-1) {} void encode(bufferlist& bl) const; void decode(bufferlist::iterator &bl); @@ -56,7 +61,7 @@ struct inode_backtrace_t { WRITE_CLASS_ENCODER(inode_backtrace_t) inline ostream& operator<<(ostream& out, const inode_backtrace_t& it) { - return out << it.ino << ":" << it.ancestors; + return out << "(" << it.pool << ")" << it.ino << ":" << it.ancestors << "//" << it.old_pools; } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 5b3bd71c107..8ba70a23a98 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -178,12 +178,10 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) } } - // parent pointers on renamed dirs - for (elist::iterator p = renamed_files.begin(); !p.end(); ++p) { - CInode *in = *p; - dout(10) << "try_to_expire waiting for dir parent pointer update on " << *in << dendl; - assert(in->state_test(CInode::STATE_DIRTYPARENT)); - in->store_parent(gather_bld.new_sub()); + // backtraces to be stored/updated + for (elist::iterator p = update_backtraces.begin(); !p.end(); ++p) { + BacktraceInfo *btinfo = *p; + store_backtrace_update(mds, btinfo, gather_bld.new_sub()); } // slave updates @@ -260,6 +258,100 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) } } +// ---------------------------- +// backtrace handling + +// BacktraceInfo is used for keeping the +// current state of the backtrace to be stored later on +// logsegment expire. Constructing a BacktraceInfo +// automatically puts it on the LogSegment list that is passed in, +// after building the backtrace based on the current state of the inode. We +// construct the backtrace here to avoid keeping a ref to the inode. 
+BacktraceInfo::BacktraceInfo( + int64_t l, CInode *i, LogSegment *ls, int64_t p) : + location(l), pool(p) { + + // on setlayout cases, forward pointers mean + // pool != location, but for all others it does + if (pool == -1) pool = location; + + bt.pool = pool; + i->build_backtrace(l, bt); + ls->update_backtraces.push_back(&item_logseg); +} + +// When the info_t is destroyed, it just needs to remove itself +// from the LogSegment list +BacktraceInfo::~BacktraceInfo() { + item_logseg.remove_myself(); +} + +// Queue a backtrace for later +void LogSegment::queue_backtrace_update(CInode *inode, int64_t location, int64_t pool) { + // allocating a pointer here and not setting it to anything + // might look strange, but the constructor adds itself to the backtraces + // list of this LogSegment, which is how we keep track of it + new BacktraceInfo(location, inode, this, pool); +} + +void LogSegment::remove_pending_backtraces(inodeno_t ino, int64_t pool) { + elist::iterator i = update_backtraces.begin(); + while(!i.end()) { + ++i; + if((*i)->bt.ino == ino && (*i)->location == pool) { + delete (*i); + } + } +} + +unsigned LogSegment::encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info) +{ + bufferlist parent; + ::encode(info->bt, parent); + m.setxattr("parent", parent); + return parent.length(); +} + +struct C_LogSegment_StoredBacktrace : public Context { + LogSegment *ls; + BacktraceInfo *info; + Context *fin; + C_LogSegment_StoredBacktrace(LogSegment *l, BacktraceInfo *c, + Context *f) : ls(l), info(c), fin(f) {} + void finish(int r) { + ls->_stored_backtrace(info, fin); + } +}; + +void LogSegment::store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin) +{ + ObjectOperation m; + // prev_pool will be the target pool on create,mkdir,etc. + encode_parent_mutation(m, info); + + // write it. 
+ SnapContext snapc; + + object_t oid = CInode::get_object_name(info->bt.ino, frag_t(), ""); + + dout(10) << "store_parent for oid " << oid << " location " << info->location << " pool " << info->pool << dendl; + + // store the backtrace in the specified pool + object_locator_t oloc(info->location); + + mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, + NULL, new C_LogSegment_StoredBacktrace(this, info, fin) ); + +} + +void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin) +{ + delete info; + if (fin) { + fin->finish(0); + delete fin; + } +} #undef DOUT_COND #define DOUT_COND(cct, l) (l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_log) diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 99c302c7750..51103bd6d18 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -204,7 +204,7 @@ ostream& operator<<(ostream& out, const client_writeable_range_t& r) */ void inode_t::encode(bufferlist &bl) const { - ENCODE_START(6, 6, bl); + ENCODE_START(7, 6, bl); ::encode(ino, bl); ::encode(rdev, bl); @@ -237,13 +237,14 @@ void inode_t::encode(bufferlist &bl) const ::encode(file_data_version, bl); ::encode(xattr_version, bl); ::encode(last_renamed_version, bl); + ::encode(old_pools, bl); ENCODE_FINISH(bl); } void inode_t::decode(bufferlist::iterator &p) { - DECODE_START_LEGACY_COMPAT_LEN(6, 6, 6, p); + DECODE_START_LEGACY_COMPAT_LEN(7, 6, 6, p); ::decode(ino, p); ::decode(rdev, p); @@ -291,6 +292,8 @@ void inode_t::decode(bufferlist::iterator &p) ::decode(xattr_version, p); if (struct_v >= 2) ::decode(last_renamed_version, p); + if (struct_v >= 7) + ::decode(old_pools, p); DECODE_FINISH(p); } @@ -314,6 +317,13 @@ void inode_t::dump(Formatter *f) const ::dump(layout, f); f->close_section(); + f->open_array_section("old_pools"); + vector::const_iterator i = old_pools.begin(); + while(i != old_pools.end()) { + ::dump(*i, f); + } + f->close_section(); + f->dump_unsigned("size", size); f->dump_unsigned("truncate_seq", 
truncate_seq); f->dump_unsigned("truncate_size", truncate_size); diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 21cc60e9391..c7d678699ab 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -323,6 +323,7 @@ struct inode_t { // file (data access) ceph_dir_layout dir_layout; // [dir only] ceph_file_layout layout; + vector old_pools; uint64_t size; // on directory, # dentries uint32_t truncate_seq; uint64_t truncate_size, truncate_from; @@ -421,6 +422,10 @@ struct inode_t { } } + void add_old_pool(int64_t l) { + old_pools.push_back(l); + } + void encode(bufferlist &bl) const; void decode(bufferlist::iterator& bl); void dump(Formatter *f) const;