From fbe7c6c46ab3b609737e2bce114a75904e4943e1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 21 Jul 2008 12:42:44 -0700 Subject: [PATCH] mds: move Locker::predirty_nested to MDCache::predirty_journal_parents --- src/TODO | 10 +- src/mds/CInode.cc | 5 +- src/mds/Locker.cc | 206 +--------------------------------------- src/mds/Locker.h | 9 +- src/mds/MDCache.cc | 231 +++++++++++++++++++++++++++++++++++++++++++-- src/mds/MDCache.h | 11 ++- src/mds/Server.cc | 48 +++++----- src/mds/mdstypes.h | 6 ++ 8 files changed, 275 insertions(+), 251 deletions(-) diff --git a/src/TODO b/src/TODO index 1553792977dd3..9df7010e36e37 100644 --- a/src/TODO +++ b/src/TODO @@ -230,10 +230,12 @@ todo - make better sense of snap_highwater...? /- cdir fetch/store versioned dentries -- emetablob.. journaling a versioned update.. - - replay +/- emetablob.. journaling a versioned update.. +/ - replay - fetch may need to adjust loaded dentry first,last? +- client reconnect vs snaps + - hard link backpointers - anchor source dir - build snaprealm for any hardlinked file @@ -245,11 +247,9 @@ todo primary dir link -> multiversion inode remote link -> multiversion inode -** HRM, how to cope with split notifications from multiple mds's racing to client... - - for simplicity, don't replicate any snapshot data. -- need rrealms in fraginfo_t +- need rsnaprealms in fraginfo_t - rename() needs to create a new realm if src/dst realms differ and (rrealms, or open_children, or not subtree leaf) (similar logic to the anchor update) - will snapshots and CAS play nice? 
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index c5f7e1b25be9d..58fcb7f1a7c45 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -754,6 +754,8 @@ void CInode::clear_dirty_scattered(int type) void CInode::finish_scatter_gather_update(int type) { dout(10) << "finish_scatter_gather_update " << type << " on " << *this << dendl; + assert(is_auth()); + switch (type) { case CEPH_LOCK_IDIR: { @@ -788,9 +790,6 @@ void CInode::finish_scatter_gather_update(int type) break; case CEPH_LOCK_IDFT: - { - assert(is_auth()); - } break; default: diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 56c7fd2e0a3eb..911b1f11a791e 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -883,7 +883,7 @@ bool Locker::check_inode_max_size(CInode *in, bool forceupdate, __u64 new_size) EOpen *le = new EOpen(mds->mdlog); if (forceupdate) // FIXME if/when we do max_size nested accounting - predirty_nested(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); + mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); else le->metablob.add_dir_context(in->get_parent_dir()); mdcache->journal_dirty_inode(&le->metablob, in); @@ -1134,7 +1134,7 @@ void Locker::_do_cap_update(CInode *in, int had, int all_wanted, snapid_t follow mut->ls = mds->mdlog->get_current_segment(); file_wrlock_force(&in->filelock, mut); // wrlock for duration of journal mut->auth_pin(in); - predirty_nested(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(&le->metablob, in, follows); @@ -1305,204 +1305,6 @@ void Locker::revoke_client_leases(SimpleLock *lock) } -// nested --------------------------------------------------------------- - - -/* - * NOTE: we _have_ to delay the scatter if we are called during a - * rejoin, because we can't twiddle locks between when the - * rejoin_(weak|strong) is received and when we send the rejoin_ack. 
- * normally, this isn't a problem: a recover mds doesn't twiddle locks - * (no requests), and a survivor acks immediately. _except_ that - * during rejoin_(weak|strong) processing, we may complete a lock - * gather, and do a scatter_writebehind.. and we _can't_ twiddle the - * scatterlock state in that case or the lock states will get out of - * sync between the auth and replica. - * - * the simple solution is to never do the scatter here. instead, put - * the scatterlock on a list if it isn't already wrlockable. this is - * probably the best plan anyway, since we avoid too many - * scatters/locks under normal usage. - */ -void Locker::predirty_nested(Mutation *mut, EMetaBlob *blob, - CInode *in, CDir *parent, - int flags, int linkunlink) -{ - bool primary_dn = flags & PREDIRTY_PRIMARY; - bool do_parent_mtime = flags & PREDIRTY_DIR; - bool shallow = flags & PREDIRTY_SHALLOW; - - // declare now? - if (mut->now == utime_t()) - mut->now = g_clock.real_now(); - - dout(10) << "predirty_nested" - << (do_parent_mtime ? " do_parent_mtime":"") - << " linkunlink=" << linkunlink - << (primary_dn ? " primary_dn":" remote_dn") - << (shallow ? 
" SHALLOW":"") - << " " << *in << dendl; - - if (!parent) { - assert(primary_dn); - parent = in->get_projected_parent_dn()->get_dir(); - } - - if (flags == 0 && linkunlink == 0) { - dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl; - blob->add_dir_context(parent); - return; - } - - inode_t *curi = in->get_projected_inode(); - - __s64 drbytes = 1, drfiles = 0, drsubdirs = 0, dranchors = 0; - utime_t rctime; - - // build list of inodes to wrlock, dirty, and update - list lsi; - CInode *cur = in; - while (parent) { - //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack - assert(parent->is_auth()); - - // opportunistically adjust parent dirfrag - CInode *pin = parent->get_inode(); - - if (do_parent_mtime || linkunlink) { - assert(mut->wrlocks.count(&pin->dirlock) || - mut->is_slave()); // we are slave. master will have wrlocked the dir. - } - - // inode -> dirfrag - mut->auth_pin(parent); - mut->add_projected_fnode(parent); - - fnode_t *pf = parent->project_fnode(); - pf->version = parent->pre_dirty(); - - if (do_parent_mtime) { - pf->fragstat.mtime = mut->now; - if (mut->now > pf->fragstat.rctime) { - dout(10) << "predirty_nested updating mtime on " << *parent << dendl; - pf->fragstat.rctime = mut->now; - } else { - dout(10) << "predirty_nested updating mtime UNDERWATER on " << *parent << dendl; - } - } - if (linkunlink) { - dout(10) << "predirty_nested updating size on " << *parent << dendl; - if (in->is_dir()) - pf->fragstat.nsubdirs += linkunlink; - else - pf->fragstat.nfiles += linkunlink; - } - if (primary_dn) { - if (linkunlink == 0) { - drbytes = curi->dirstat.rbytes - curi->accounted_dirstat.rbytes; - drfiles = curi->dirstat.rfiles - curi->accounted_dirstat.rfiles; - drsubdirs = curi->dirstat.rsubdirs - curi->accounted_dirstat.rsubdirs; - dranchors = curi->dirstat.ranchors - curi->accounted_dirstat.ranchors; - } else if (linkunlink < 0) { - drbytes = 0 - curi->accounted_dirstat.rbytes; - drfiles = 
0 - curi->accounted_dirstat.rfiles; - drsubdirs = 0 - curi->accounted_dirstat.rsubdirs; - dranchors = 0 - curi->accounted_dirstat.ranchors; - } else { - drbytes = curi->dirstat.rbytes; - drfiles = curi->dirstat.rfiles; - drsubdirs = curi->dirstat.rsubdirs; - dranchors = curi->dirstat.ranchors; - } - rctime = MAX(curi->ctime, curi->dirstat.rctime); - - dout(10) << "predirty_nested delta " - << drbytes << " bytes / " << drfiles << " files / " << drsubdirs << " subdirs for " - << *parent << dendl; - pf->fragstat.rbytes += drbytes; - pf->fragstat.rfiles += drfiles; - pf->fragstat.rsubdirs += drsubdirs; - pf->fragstat.ranchors += dranchors; - pf->fragstat.rctime = rctime; - - curi->accounted_dirstat = curi->dirstat; - } else { - dout(10) << "predirty_nested no delta (remote dentry, or rename within same dir) in " << *parent << dendl; - pf->fragstat.rfiles += linkunlink; - } - - - // stop? - if (pin->is_base()) - break; - - bool stop = false; - if (!pin->is_auth() || pin->is_ambiguous_auth()) { - dout(10) << "predirty_nested !auth or ambig on " << *pin << dendl; - stop = true; - } - if (!stop && - mut->wrlocks.count(&pin->dirlock) == 0 && - (!pin->can_auth_pin() || - !pin->versionlock.can_wrlock() || // make sure we can take versionlock, too - !scatter_wrlock_try(&pin->dirlock, mut, false))) { // ** do not initiate.. see above comment ** - dout(10) << "predirty_nested can't wrlock one of " << pin->versionlock << " or " << pin->dirlock - << " on " << *pin << dendl; - stop = true; - } - if (stop) { - dout(10) << "predirty_nested stop. 
marking dirlock on " << *pin << dendl; - mark_updated_scatterlock(&pin->dirlock); - mut->ls->dirty_dirfrag_dir.push_back(&pin->xlist_dirty_dirfrag_dir); - mut->add_updated_scatterlock(&pin->dirlock); - break; - } - local_wrlock_grab(&pin->versionlock, mut); - - // dirfrag -> diri - mut->auth_pin(pin); - mut->add_projected_inode(pin); - lsi.push_front(pin); - - inode_t *pi = pin->project_inode(); - pi->version = pin->pre_dirty(); - pi->dirstat.version++; - dout(15) << "predirty_nested take_diff " << pf->fragstat << dendl; - dout(15) << "predirty_nested - " << pf->accounted_fragstat << dendl; - bool touched_mtime = false; - pi->dirstat.take_diff(pf->fragstat, pf->accounted_fragstat, touched_mtime); - if (touched_mtime) - pi->mtime = pi->ctime = pi->dirstat.mtime; - dout(15) << "predirty_nested gives " << pi->dirstat << " on " << *pin << dendl; - - // next parent! - cur = pin; - curi = pi; - parent = cur->get_projected_parent_dn()->get_dir(); - linkunlink = 0; - do_parent_mtime = false; - primary_dn = true; - } - - // now, stick it in the blob - assert(parent->is_auth()); - blob->add_dir_context(parent); - blob->add_dir(parent, true); - SnapRealm *realm = 0; - for (list::iterator p = lsi.begin(); - p != lsi.end(); - p++) { - CInode *cur = *p; - if (!realm) - realm = cur->find_snaprealm(); - else if (cur->snaprealm) - realm = cur->snaprealm; - mds->mdcache->journal_dirty_inode(blob, cur); - } - -} - - // locks ---------------------------------------------------------------- @@ -2420,15 +2222,13 @@ void Locker::scatter_writebehind(ScatterLock *lock) mut->locks.insert(lock); inode_t *pi = in->project_inode(); - - //????pi->mtime = in->inode.mtime; // make sure an intermediate version isn't goofing us up pi->version = in->pre_dirty(); lock->get_parent()->finish_scatter_gather_update(lock->get_type()); lock->clear_updated(); EUpdate *le = new EUpdate(mds->mdlog, "scatter_writebehind"); - predirty_nested(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, false); + 
mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(&le->metablob, in); mds->mdlog->submit_entry(le); diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 602be38a62644..1cd2e3adf3836 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -54,11 +54,6 @@ class ScatterLock; class LocalLock; class MDCache; -// flags for predirty_nested() -static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting -static const int PREDIRTY_DIR = 2; // update parent dir mtime/size -static const int PREDIRTY_SHALLOW = 4; // only go to immediate parrent (for easier rollback) - class Locker { private: MDS *mds; @@ -167,12 +162,10 @@ protected: xlist updated_scatterlocks; public: void mark_updated_scatterlock(ScatterLock *lock); - void predirty_nested(Mutation *mut, EMetaBlob *blob, CInode *in, CDir *dir, - int flags, int linkunlink=0); // local -protected: void local_wrlock_grab(LocalLock *lock, Mutation *mut); +protected: bool local_wrlock_start(LocalLock *lock, MDRequest *mut); void local_wrlock_finish(LocalLock *lock, Mutation *mut); bool local_xlock_start(LocalLock *lock, MDRequest *mut); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 332096a226155..7a392df19be75 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -1085,6 +1085,210 @@ inode_t *MDCache::journal_dirty_inode(EMetaBlob *metablob, CInode *in, snapid_t +// nested --------------------------------------------------------------- + + +/* + * NOTE: we _have_ to delay the scatter if we are called during a + * rejoin, because we can't twiddle locks between when the + * rejoin_(weak|strong) is received and when we send the rejoin_ack. + * normally, this isn't a problem: a recover mds doesn't twiddle locks + * (no requests), and a survivor acks immediately. _except_ that + * during rejoin_(weak|strong) processing, we may complete a lock + * gather, and do a scatter_writebehind.. 
and we _can't_ twiddle the + * scatterlock state in that case or the lock states will get out of + * sync between the auth and replica. + * + * the simple solution is to never do the scatter here. instead, put + * the scatterlock on a list if it isn't already wrlockable. this is + * probably the best plan anyway, since we avoid too many + * scatters/locks under normal usage. + */ +void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, + CInode *in, CDir *parent, + int flags, int linkunlink) +{ + bool primary_dn = flags & PREDIRTY_PRIMARY; + bool do_parent_mtime = flags & PREDIRTY_DIR; + bool shallow = flags & PREDIRTY_SHALLOW; + + // declare now? + if (mut->now == utime_t()) + mut->now = g_clock.real_now(); + + dout(10) << "predirty_journal_parents" + << (do_parent_mtime ? " do_parent_mtime":"") + << " linkunlink=" << linkunlink + << (primary_dn ? " primary_dn":" remote_dn") + << (shallow ? " SHALLOW":"") + << " " << *in << dendl; + + if (!parent) { + assert(primary_dn); + parent = in->get_projected_parent_dn()->get_dir(); + } + + if (flags == 0 && linkunlink == 0) { + dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl; + blob->add_dir_context(parent); + return; + } + + inode_t *curi = in->get_projected_inode(); + + __s64 drbytes = 1, drfiles = 0, drsubdirs = 0, dranchors = 0, drsnaprealms = 0; + utime_t rctime; + + // build list of inodes to wrlock, dirty, and update + list lsi; + CInode *cur = in; + while (parent) { + //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack + assert(parent->is_auth()); + + // opportunistically adjust parent dirfrag + CInode *pin = parent->get_inode(); + + if (do_parent_mtime || linkunlink) { + assert(mut->wrlocks.count(&pin->dirlock) || + mut->is_slave()); // we are slave. master will have wrlocked the dir. 
+ } + + // inode -> dirfrag + mut->auth_pin(parent); + mut->add_projected_fnode(parent); + + fnode_t *pf = parent->project_fnode(); + pf->version = parent->pre_dirty(); + + if (do_parent_mtime) { + pf->fragstat.mtime = mut->now; + if (mut->now > pf->fragstat.rctime) { + dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl; + pf->fragstat.rctime = mut->now; + } else { + dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl; + } + } + if (linkunlink) { + dout(10) << "predirty_journal_parents updating size on " << *parent << dendl; + if (in->is_dir()) + pf->fragstat.nsubdirs += linkunlink; + else + pf->fragstat.nfiles += linkunlink; + } + if (primary_dn) { + if (linkunlink == 0) { + drbytes = curi->dirstat.rbytes - curi->accounted_dirstat.rbytes; + drfiles = curi->dirstat.rfiles - curi->accounted_dirstat.rfiles; + drsubdirs = curi->dirstat.rsubdirs - curi->accounted_dirstat.rsubdirs; + dranchors = curi->dirstat.ranchors - curi->accounted_dirstat.ranchors; + drsnaprealms = curi->dirstat.rsnaprealms - curi->accounted_dirstat.rsnaprealms; + } else if (linkunlink < 0) { + drbytes = 0 - curi->accounted_dirstat.rbytes; + drfiles = 0 - curi->accounted_dirstat.rfiles; + drsubdirs = 0 - curi->accounted_dirstat.rsubdirs; + dranchors = 0 - curi->accounted_dirstat.ranchors; + drsnaprealms = 0 - curi->accounted_dirstat.rsnaprealms; + } else { + drbytes = curi->dirstat.rbytes; + drfiles = curi->dirstat.rfiles; + drsubdirs = curi->dirstat.rsubdirs; + dranchors = curi->dirstat.ranchors; + drsnaprealms = curi->dirstat.rsnaprealms; + } + rctime = MAX(curi->ctime, curi->dirstat.rctime); + + dout(10) << "predirty_journal_parents delta " + << drbytes << " bytes / " << drfiles << " files / " << drsubdirs << " subdirs for " + << *parent << dendl; + pf->fragstat.rbytes += drbytes; + pf->fragstat.rfiles += drfiles; + pf->fragstat.rsubdirs += drsubdirs; + pf->fragstat.ranchors += dranchors; + pf->fragstat.rsnaprealms += drsnaprealms; 
+ pf->fragstat.rctime = rctime; + + curi->accounted_dirstat = curi->dirstat; + } else { + dout(10) << "predirty_journal_parents no delta (remote dentry, or rename within same dir) in " << *parent << dendl; + pf->fragstat.rfiles += linkunlink; + } + + + // stop? + if (pin->is_base()) + break; + + bool stop = false; + if (!pin->is_auth() || pin->is_ambiguous_auth()) { + dout(10) << "predirty_journal_parents !auth or ambig on " << *pin << dendl; + stop = true; + } + if (!stop && + mut->wrlocks.count(&pin->dirlock) == 0 && + (!pin->can_auth_pin() || + !pin->versionlock.can_wrlock() || // make sure we can take versionlock, too + !mds->locker->scatter_wrlock_try(&pin->dirlock, mut, false))) { // ** do not initiate.. see above comment ** + dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->dirlock + << " on " << *pin << dendl; + stop = true; + } + if (stop) { + dout(10) << "predirty_journal_parents stop. marking dirlock on " << *pin << dendl; + mds->locker->mark_updated_scatterlock(&pin->dirlock); + mut->ls->dirty_dirfrag_dir.push_back(&pin->xlist_dirty_dirfrag_dir); + mut->add_updated_scatterlock(&pin->dirlock); + break; + } + mds->locker->local_wrlock_grab(&pin->versionlock, mut); + + // dirfrag -> diri + mut->auth_pin(pin); + mut->add_projected_inode(pin); + lsi.push_front(pin); + + inode_t *pi = pin->project_inode(); + pi->version = pin->pre_dirty(); + pi->dirstat.version++; + dout(15) << "predirty_journal_parents take_diff " << pf->fragstat << dendl; + dout(15) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl; + bool touched_mtime = false; + pi->dirstat.take_diff(pf->fragstat, pf->accounted_fragstat, touched_mtime); + if (touched_mtime) + pi->mtime = pi->ctime = pi->dirstat.mtime; + dout(15) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl; + + // next parent! 
+ cur = pin; + curi = pi; + parent = cur->get_projected_parent_dn()->get_dir(); + linkunlink = 0; + do_parent_mtime = false; + primary_dn = true; + } + + // now, stick it in the blob + assert(parent->is_auth()); + blob->add_dir_context(parent); + blob->add_dir(parent, true); + SnapRealm *realm = 0; + for (list::iterator p = lsi.begin(); + p != lsi.end(); + p++) { + CInode *cur = *p; + if (!realm) + realm = cur->find_snaprealm(); + else if (cur->snaprealm) + realm = cur->snaprealm; + journal_dirty_inode(blob, cur); + } + +} + + + + // =================================== // slave requests @@ -5253,7 +5457,7 @@ void MDCache::_anchor_prepared(CInode *in, version_t atid, bool add) Mutation *mut = new Mutation; mut->ls = mds->mdlog->get_current_segment(); EUpdate *le = new EUpdate(mds->mdlog, add ? "anchor_create":"anchor_destroy"); - mds->locker->predirty_nested(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); + predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); journal_dirty_inode(&le->metablob, in); le->metablob.add_table_transaction(TABLE_ANCHOR, atid); mds->mdlog->submit_entry(le, new C_MDC_AnchorLogged(this, in, atid, mut)); @@ -5277,7 +5481,9 @@ void MDCache::_anchor_logged(CInode *in, version_t atid, Mutation *mut) // tell the anchortable we've committed mds->anchorclient->commit(atid, mut->ls); + // drop locks and finish mds->locker->drop_locks(mut); + delete mut; // trigger waiters in->finish_waiting(CInode::WAIT_ANCHORED, 0); @@ -5290,10 +5496,12 @@ void MDCache::_anchor_logged(CInode *in, version_t atid, Mutation *mut) struct C_MDC_snaprealm_create_finish : public Context { MDCache *cache; MDRequest *mdr; + Mutation *mut; CInode *in; - C_MDC_snaprealm_create_finish(MDCache *c, MDRequest *m, CInode *i) : cache(c), mdr(m), in(i) {} + C_MDC_snaprealm_create_finish(MDCache *c, MDRequest *m, Mutation *mu, CInode *i) : + cache(c), mdr(m), mut(mu), in(i) {} void finish(int r) { - cache->_snaprealm_create_finish(mdr, in); + 
cache->_snaprealm_create_finish(mdr, mut, in); } }; @@ -5314,29 +5522,38 @@ void MDCache::snaprealm_create(MDRequest *mdr, CInode *in) return; } + Mutation *mut = new Mutation; + mut->ls = mds->mdlog->get_current_segment(); EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create"); le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid); inode_t *pi = in->project_inode(); pi->version = in->pre_dirty(); + pi->dirstat.rsnaprealms++; SnapRealm t(this, in); t.created = mdr->more()->stid; bufferlist snapbl; ::encode(t, snapbl); + predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); journal_cow_inode(&le->metablob, in); le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, 0, pi, 0, &snapbl); - mds->mdlog->submit_entry(le, new C_MDC_snaprealm_create_finish(this, mdr, in)); + mds->mdlog->submit_entry(le, new C_MDC_snaprealm_create_finish(this, mdr, mut, in)); } -void MDCache::_snaprealm_create_finish(MDRequest *mdr, CInode *in) +void MDCache::_snaprealm_create_finish(MDRequest *mdr, Mutation *mut, CInode *in) { dout(10) << "_snaprealm_create_finish " << *in << dendl; - in->pop_and_dirty_projected_inode(mdr->ls); - mdr->apply(); + // apply + in->pop_and_dirty_projected_inode(mut->ls); + mut->apply(); + + // tell table we've committed (reads mut->ls, so mut must still be alive here) + mds->snapclient->commit(mdr->more()->stid, mut->ls); + delete mut; // create in->open_snaprealm(); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 75bf39fb4a471..98489a1de65f6 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -341,6 +341,12 @@ struct MDSlaveUpdate { }; +// flags for predirty_journal_parents() +static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting +static const int PREDIRTY_DIR = 2; // update parent dir mtime/size +static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback) + + class MDCache { public: // my master @@ -476,6 +482,9 @@ public: void journal_cow_dentry(EMetaBlob *metablob, CDentry *dn, snapid_t 
follows=CEPH_NOSNAP); void journal_cow_inode(EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP); inode_t *journal_dirty_inode(EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP); + void predirty_journal_parents(Mutation *mut, EMetaBlob *blob, + CInode *in, CDir *parent, + int flags, int linkunlink=0); // slaves void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set &slaves) { @@ -788,7 +797,7 @@ protected: // -- snaprealms -- public: void snaprealm_create(MDRequest *mdr, CInode *in); - void _snaprealm_create_finish(MDRequest *mdr, CInode *in); + void _snaprealm_create_finish(MDRequest *mdr, Mutation *mut, CInode *in); // -- stray -- public: diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 7699d243fc055..37c604efebe8a 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1677,7 +1677,7 @@ void Server::handle_client_utime(MDRequest *mdr) mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "utime"); le->metablob.add_client_req(req->get_reqid()); - mds->locker->predirty_nested(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(&le->metablob, cur); mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); @@ -1719,7 +1719,7 @@ void Server::handle_client_chmod(MDRequest *mdr) mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "chmod"); le->metablob.add_client_req(req->get_reqid()); - mds->locker->predirty_nested(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(&le->metablob, cur); mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); @@ -1760,7 +1760,7 @@ void Server::handle_client_chown(MDRequest *mdr) mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "chown"); 
le->metablob.add_client_req(req->get_reqid()); - mds->locker->predirty_nested(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(&le->metablob, cur); mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); @@ -1814,7 +1814,7 @@ void Server::handle_client_setxattr(MDRequest *mdr) mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "setxattr"); le->metablob.add_client_req(req->get_reqid()); - mds->locker->predirty_nested(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_cow_inode(&le->metablob, cur); cur->xattrs.erase(name); @@ -1863,7 +1863,7 @@ void Server::handle_client_removexattr(MDRequest *mdr) mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "removexattr"); le->metablob.add_client_req(req->get_reqid()); - mds->locker->predirty_nested(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_cow_inode(&le->metablob, cur); cur->xattrs.erase(name); @@ -2081,7 +2081,7 @@ void Server::handle_client_mknod(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->inotable->get_version()); - mds->locker->predirty_nested(mdr, &le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); le->metablob.add_primary_dentry(dn, true, newi); // log + wait @@ -2129,7 +2129,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) EUpdate *le = new EUpdate(mdlog, "mkdir"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->inotable->get_version()); - mds->locker->predirty_nested(mdr, 
&le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); le->metablob.add_dir(newdir, true, true); // dirty AND complete @@ -2170,7 +2170,7 @@ void Server::handle_client_symlink(MDRequest *mdr) EUpdate *le = new EUpdate(mdlog, "symlink"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->inotable->get_version()); - mds->locker->predirty_nested(mdr, &le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); le->metablob.add_primary_dentry(dn, true, newi); // log + wait @@ -2314,8 +2314,8 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) // log + wait EUpdate *le = new EUpdate(mdlog, "link_local"); le->metablob.add_client_req(mdr->reqid); - mds->locker->predirty_nested(mdr, &le->metablob, targeti, dn->dir, PREDIRTY_DIR, 1); // new dn - mds->locker->predirty_nested(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti + mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->dir, PREDIRTY_DIR, 1); // new dn + mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti le->metablob.add_remote_dentry(dn, true, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); // new remote mdcache->journal_dirty_inode(&le->metablob, targeti); @@ -2407,12 +2407,12 @@ void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti if (inc) { dn->pre_dirty(); - mds->locker->predirty_nested(mdr, &le->metablob, targeti, dn->dir, PREDIRTY_DIR, 1); + mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->dir, PREDIRTY_DIR, 1); le->metablob.add_remote_dentry(dn, true, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); // new remote } else { dn->pre_dirty(); - 
mds->locker->predirty_nested(mdr, &le->metablob, targeti, dn->dir, PREDIRTY_DIR, -1); + mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->dir, PREDIRTY_DIR, -1); mdcache->journal_cow_dentry(&le->metablob, dn); le->metablob.add_null_dentry(dn, true); } @@ -2554,7 +2554,7 @@ void Server::handle_slave_link_prep(MDRequest *mdr) dout(10) << " projected inode " << pi << " v " << pi->version << dendl; // commit case - mds->locker->predirty_nested(mdr, &le->commit, dn->inode, 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY, 0); + mdcache->predirty_journal_parents(mdr, &le->commit, dn->inode, 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY, 0); mdcache->journal_dirty_inode(&le->commit, targeti); mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti)); @@ -2914,13 +2914,13 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) if (dn->is_primary()) { // primary link. add stray dentry. assert(straydn); - mds->locker->predirty_nested(mdr, &le->metablob, dn->inode, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, -1); - mds->locker->predirty_nested(mdr, &le->metablob, dn->inode, straydn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + mdcache->predirty_journal_parents(mdr, &le->metablob, dn->inode, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, -1); + mdcache->predirty_journal_parents(mdr, &le->metablob, dn->inode, straydn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); le->metablob.add_primary_dentry(straydn, true, dn->inode, pi); } else { // remote link. update remote inode. 
- mds->locker->predirty_nested(mdr, &le->metablob, dn->inode, dn->dir, PREDIRTY_DIR, -1); - mds->locker->predirty_nested(mdr, &le->metablob, dn->inode, 0, PREDIRTY_PRIMARY); + mdcache->predirty_journal_parents(mdr, &le->metablob, dn->inode, dn->dir, PREDIRTY_DIR, -1); + mdcache->predirty_journal_parents(mdr, &le->metablob, dn->inode, 0, PREDIRTY_PRIMARY); mdcache->journal_dirty_inode(&le->metablob, dn->inode); } @@ -3598,16 +3598,16 @@ void Server::_rename_prepare(MDRequest *mdr, // sub off target if (destdn->is_auth() && !destdn->is_null()) - mds->locker->predirty_nested(mdr, metablob, destdn->inode, destdn->dir, + mdcache->predirty_journal_parents(mdr, metablob, destdn->inode, destdn->dir, (destdn->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1); // move srcdn int predirty_primary = (srcdn->is_primary() && srcdn->dir != destdn->dir) ? PREDIRTY_PRIMARY:0; int flags = predirty_dir | predirty_primary; if (srcdn->is_auth()) - mds->locker->predirty_nested(mdr, metablob, srcdn->inode, srcdn->dir, flags, -1); + mdcache->predirty_journal_parents(mdr, metablob, srcdn->inode, srcdn->dir, flags, -1); if (destdn->is_auth()) - mds->locker->predirty_nested(mdr, metablob, srcdn->inode, destdn->dir, flags, 1); + mdcache->predirty_journal_parents(mdr, metablob, srcdn->inode, destdn->dir, flags, 1); metablob->add_dir_context(srcdn->dir); metablob->add_dir_context(destdn->dir); @@ -4424,7 +4424,7 @@ void Server::handle_client_truncate(MDRequest *mdr) pi->ctime = ctime; pi->version = pdv; pi->size = le64_to_cpu(req->head.args.truncate.length); - mds->locker->predirty_nested(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(&le->metablob, cur); mdlog->submit_entry(le, fin); @@ -4621,7 +4621,7 @@ void Server::handle_client_opent(MDRequest *mdr) pi->ctime = ctime; pi->version = pdv; pi->size = 0; - mds->locker->predirty_nested(mdr, &le->metablob, cur, 0, 
PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(&le->metablob, cur); mdlog->submit_entry(le, fin); @@ -4713,7 +4713,7 @@ void Server::handle_client_openc(MDRequest *mdr) EUpdate *le = new EUpdate(mdlog, "openc"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(in->ino(), mds->inotable->get_version()); - mds->locker->predirty_nested(mdr, &le->metablob, in, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); le->metablob.add_primary_dentry(dn, true, in); // log + wait @@ -4929,7 +4929,7 @@ void Server::handle_client_mksnap(MDRequest *mdr) EUpdate *le = new EUpdate(mdlog, "mksnap"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_table_transaction(TABLE_SNAP, stid); - mds->locker->predirty_nested(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false); + mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false); mdcache->journal_cow_inode(&le->metablob, diri); // project the snaprealm.. hack! diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 4c5e389ef7694..c0751c8c22cf6 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -89,7 +89,9 @@ struct frag_info_t { __s64 rfiles; __s64 rsubdirs; __s64 rsize() const { return rfiles + rsubdirs; } + __s64 ranchors; // for dirstat, includes inode's anchored flag. 
+ __s64 rsnaprealms; void zero() { memset(this, 0, sizeof(*this)); @@ -108,6 +110,7 @@ struct frag_info_t { rfiles += cur.rfiles - acc.rfiles; rsubdirs += cur.rsubdirs - acc.rsubdirs; ranchors += cur.ranchors - acc.ranchors; + rsnaprealms += cur.rsnaprealms - acc.rsnaprealms; acc = cur; acc.version = version; } @@ -121,6 +124,7 @@ struct frag_info_t { ::encode(rfiles, bl); ::encode(rsubdirs, bl); ::encode(ranchors, bl); + ::encode(rsnaprealms, bl); ::encode(rctime, bl); } void decode(bufferlist::iterator &bl) { @@ -132,6 +136,7 @@ struct frag_info_t { ::decode(rfiles, bl); ::decode(rsubdirs, bl); ::decode(ranchors, bl); + ::decode(rsnaprealms, bl); ::decode(rctime, bl); } }; @@ -148,6 +153,7 @@ inline ostream& operator<<(ostream &out, const frag_info_t &f) { << " rc" << f.rctime << " b" << f.rbytes << " a" << f.ranchors + << " sr" << f.rsnaprealms << " " << f.rsize() << "=" << f.rfiles << "+" << f.rsubdirs << ")"; } -- 2.39.5