From 5391bf441e82c52c95cc6d2e941c771c4b60e0b4 Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 3 Jul 2007 20:34:13 +0000 Subject: [PATCH] * journal trimming waits for scatterlocks to flush. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1470 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 8 +- branches/sage/cephmds2/mds/CInode.h | 2 +- branches/sage/cephmds2/mds/LocalLock.h | 2 + branches/sage/cephmds2/mds/Locker.cc | 47 ++++++--- branches/sage/cephmds2/mds/Locker.h | 12 +++ branches/sage/cephmds2/mds/ScatterLock.h | 16 +++- branches/sage/cephmds2/mds/Server.cc | 96 ++++++------------- branches/sage/cephmds2/mds/Server.h | 1 - branches/sage/cephmds2/mds/SimpleLock.h | 11 ++- branches/sage/cephmds2/mds/events/EMetaBlob.h | 9 ++ branches/sage/cephmds2/mds/journal.cc | 40 ++++++++ branches/sage/cephmds2/mds/mdstypes.h | 12 +-- 12 files changed, 163 insertions(+), 93 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 9b83945e248c3..3ec5548339cf3 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -57,10 +57,10 @@ sage mds - then, mtimes: / - avoid migration race concern (on auth). - - writeback and dirty on gather. - - cleaned up pv/pi makes writebehind play nice with concurrent updates. - - should pin lock in LOCK state? - - scatterlock 'updated' flag. +/ - writeback and dirty on gather. +/ - cleaned up pv/pi makes writebehind play nice with concurrent updates. +/ - should pin lock in LOCK state? (acdtually, gather) +/ - scatterlock 'updated' flag. - on replica, clear only on sync | rescatter. - make sure "dirty" scatterlock prevents journal expire. - EMetaBlob map dirty_scatter; diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index a0c0ccbf80575..243dadc17b98d 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -92,7 +92,7 @@ class CInode : public MDSCacheObject { static const int STATE_OPENINGDIR = (1<<9); // -- waiters -- - static const int WAIT_SLAVEAGREE = (1<<0); + //static const int WAIT_SLAVEAGREE = (1<<0); static const int WAIT_DIR = (1<<1); static const int WAIT_ANCHORED = (1<<2); static const int WAIT_UNANCHORED = (1<<3); diff --git a/branches/sage/cephmds2/mds/LocalLock.h b/branches/sage/cephmds2/mds/LocalLock.h index b6fc62afc2012..752fdcb4d3fd1 100644 --- a/branches/sage/cephmds2/mds/LocalLock.h +++ b/branches/sage/cephmds2/mds/LocalLock.h @@ -34,10 +34,12 @@ public: } void get_wrlock() { assert(can_wrlock()); + if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); ++num_wrlock; } void put_wrlock() { --num_wrlock; + if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); } bool is_wrlocked() { return num_wrlock > 0; } int get_num_wrlocks() { return num_wrlock; } diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc index 1ced7255ec05f..d268fbc7779d8 100644 --- a/branches/sage/cephmds2/mds/Locker.cc +++ b/branches/sage/cephmds2/mds/Locker.cc @@ -1422,17 +1422,10 @@ void Locker::scatter_eval_gather(ScatterLock *lock) // glockc -> lock? else if (lock->get_state() == LOCK_GLOCKC && !lock->is_gathering() && - !lock->is_wrlocked()) { + !lock->is_wrlocked() && + !lock->is_updated()) { dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock << " on " << *lock->get_parent() << endl; - - if (lock->is_updated()) { - // updated flag is set: we got new data during the gather. - // write-behind journal. - // version_t v - - } - lock->set_state(LOCK_LOCK); //lock->get_parent()->put(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); @@ -1478,7 +1471,8 @@ void Locker::scatter_eval_gather(ScatterLock *lock) else if ((lock->get_state() == LOCK_GTEMPSYNCC || lock->get_state() == LOCK_GTEMPSYNCL) && !lock->is_gathering() && - !lock->is_wrlocked()) { + !lock->is_wrlocked() && + !lock->is_updated()) { dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_TEMPSYNC); @@ -1746,6 +1740,7 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) lock->set_state(LOCK_SYNC); lock->decode_locked_state(m->get_data()); + lock->clear_updated(); lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); break; @@ -1780,6 +1775,7 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) case LOCK_AC_SCATTER: assert(lock->get_state() == LOCK_LOCK); lock->decode_locked_state(m->get_data()); + lock->clear_updated(); lock->set_state(LOCK_SCATTER); //lock->get_parent()->get(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); @@ -1802,7 +1798,25 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() << " from " << from << ", last one" << endl; - scatter_eval_gather(lock); + + if (lock->is_updated()) { + // journal write-behind. + CInode *in = (CInode*)lock->get_parent(); + inode_t *pi = in->project_inode(); + pi->version = in->pre_dirty(); + + EUpdate *le = new EUpdate("dir.mtime writebehind"); + le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); + le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); + + mds->mdlog->submit_entry(le); + mds->mdlog->wait_for_sync(new C_Locker_GatherWB(this, lock)); + } + else { + // WARNING: this is non-optimal, but simplest. + // just block the gather until we flush the writeback to the journal. + scatter_eval_gather(lock); + } } break; @@ -1822,6 +1836,17 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) delete m; } +void Locker::scatter_gather_writebehind(ScatterLock *lock) +{ + CInode *in = (CInode*)lock->get_parent(); + dout(10) << "scatter_gather_writebehind on " << *lock << " on " << *in << endl; + in->pop_and_dirty_projected_inode(); + lock->clear_updated(); + scatter_eval_gather(lock); +} + + + // ========================================================================== // local lock diff --git a/branches/sage/cephmds2/mds/Locker.h b/branches/sage/cephmds2/mds/Locker.h index 432f3f65a7e42..f630acb79f77a 100644 --- a/branches/sage/cephmds2/mds/Locker.h +++ b/branches/sage/cephmds2/mds/Locker.h @@ -110,6 +110,7 @@ public: void try_scatter_eval(ScatterLock *lock); void scatter_eval(ScatterLock *lock); // public for MDCache::adjust_subtree_auth() void scatter_eval_gather(ScatterLock *lock); + protected: void handle_scatter_lock(ScatterLock *lock, MLock *m); void scatter_sync(ScatterLock *lock); @@ -121,6 +122,17 @@ protected: bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr); + class C_Locker_GatherWB : public Context { + Locker *locker; + ScatterLock *lock; + public: + C_Locker_GatherWB(Locker *l, ScatterLock *sl) : locker(l), lock(sl) {} + void finish(int r) { + locker->scatter_gather_writebehind(lock); + } + }; + void scatter_gather_writebehind(ScatterLock *lock); + // local protected: bool local_wrlock_start(LocalLock *lock, MDRequest *mdr); diff --git a/branches/sage/cephmds2/mds/ScatterLock.h b/branches/sage/cephmds2/mds/ScatterLock.h index ae9944ca537a5..56153ebef8409 100644 --- a/branches/sage/cephmds2/mds/ScatterLock.h +++ b/branches/sage/cephmds2/mds/ScatterLock.h @@ -98,8 +98,18 @@ public: } } - void set_updated() { updated = true; } - void clear_updated() { updated = false; } + void set_updated() { + if (!updated) { + parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); + updated = true; + } + } + void clear_updated() { + if (updated) { + parent->put(MDSCacheObject::PIN_DIRTYSCATTERED); + updated = false; + } + } bool is_updated() { return updated; } void replicate_relax() { @@ -135,10 +145,12 @@ public: } void get_wrlock() { assert(can_wrlock()); + if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); ++num_wrlock; } void put_wrlock() { --num_wrlock; + if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); } bool is_wrlocked() { return num_wrlock > 0; } int get_num_wrlocks() { return num_wrlock; } diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index 7894b7cf4763b..3f263f1318ae6 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -1205,18 +1205,25 @@ version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) version_t dirpv = 0; CInode *diri = dn->dir->inode; - if (diri->is_auth() && - !diri->is_root() && - mdr->is_master()) { - assert(mdr->wrlocks.count(&diri->dirlock));// || // either we wrlocked, - //mdr->is_slave()); // or the master did. + if (diri->is_root()) return 0; + if (diri->is_auth()) { + assert(mdr->wrlocks.count(&diri->dirlock)); + dirpv = diri->pre_dirty(); + dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << endl; + + // predirty+journal inode_t *pi = diri->project_inode(); - pi->version = dirpv; + if (dirpv) pi->version = dirpv; pi->ctime = pi->mtime = mdr->now; blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi); - dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << endl; + } else { + // journal the mtime change anyway. + inode_t *ji = blob->add_primary_dentry(diri->get_parent_dn(), true); + ji->ctime = ji->mtime = mdr->now; + + blob->add_dirtied_inode_mtime(diri->ino(), mdr->now); } return dirpv; @@ -1229,66 +1236,19 @@ void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) { CInode *diri = dn->dir->inode; - // make the udpate - diri->inode.ctime = diri->inode.mtime = mtime; + if (diri->is_root()) return; if (dirpv) { + // we journaled and predirtied. assert(diri->is_auth() && !diri->is_root()); - - // we were before, too. diri->pop_and_dirty_projected_inode(); - //diri->mark_dirty(dirpv); dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << endl; } else { - /*assert(!dn->is_auth() || // slave - !diri->is_auth() || - diri->is_root() || - diri->is_frozen()); // then not auth, or still importing. - */ + dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " (non-dirty) on " << *diri << endl; // dirlock scatterlock will propagate the update. + diri->inode.ctime = diri->inode.mtime = mtime; + diri->dirlock.set_updated(); } - - /* any writebehind should be handled by the lock gather probably? - } else { - // write-behind. - if (!diri->is_dirty()) - dirty_diri_mtime_writebehind(diri, mtime); - // otherwise, if it's dirty, we know the mtime is journaled by another local update. - // (something after the import, or the import itself) - } - */ -} - - -class C_MDS_DirtyDiriMtimeWB : public Context { - Server *server; - CInode *diri; - version_t dirpv; -public: - C_MDS_DirtyDiriMtimeWB(Server *s, CInode *i, version_t v) : - server(s), diri(i), dirpv(v) {} - void finish(int r) { - diri->mark_dirty(dirpv); - diri->auth_unpin(); - } -}; - -void Server::dirty_diri_mtime_writebehind(CInode *diri, utime_t mtime) -{ - if (!diri->can_auth_pin()) - return; // oh well! hrm. - - diri->auth_pin(); - - // we're newly auth. write-behind. - EUpdate *le = new EUpdate("dir.mtime writebehind"); - le->metablob.add_dir_context(diri->get_parent_dn()->get_dir()); - inode_t *pi = diri->project_inode(); - pi->version = diri->pre_dirty(); - le->metablob.add_primary_dentry(diri->get_parent_dn(), true, 0, pi); - - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDS_DirtyDiriMtimeWB(this, diri, pi->version)); } @@ -2881,9 +2841,11 @@ void Server::_rename_prepare(MDRequest *mdr, bool linkmerge = (srcdn->inode == destdn->inode && (srcdn->is_primary() || destdn->is_primary())); - mdr->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); - if (destdn->dir != srcdn->dir) - mdr->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); + if (mdr->is_master()) { + mdr->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); + if (destdn->dir != srcdn->dir) + mdr->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); + } inode_t *ji = 0; // journaled inode getting nlink-- version_t ipv; // it's version @@ -2996,10 +2958,12 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen (srcdn->is_primary() || destdn->is_primary())); // dir mtimes - dirty_dn_diri(destdn, mdr->pvmap[destdn->dir->inode], mdr->now); - if (destdn->dir != srcdn->dir) - dirty_dn_diri(srcdn, mdr->pvmap[srcdn->dir->inode], mdr->now); - + if (mdr->is_master()) { + dirty_dn_diri(destdn, mdr->pvmap[destdn->dir->inode], mdr->now); + if (destdn->dir != srcdn->dir) + dirty_dn_diri(srcdn, mdr->pvmap[srcdn->dir->inode], mdr->now); + } + if (linkmerge) { if (destdn->is_primary()) { dout(10) << "merging remote onto primary link" << endl; diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h index fda67aae48727..4a34ff4793259 100644 --- a/branches/sage/cephmds2/mds/Server.h +++ b/branches/sage/cephmds2/mds/Server.h @@ -80,7 +80,6 @@ public: version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); void dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime); - void dirty_diri_mtime_writebehind(CInode *diri, utime_t mtime); // requests on existing inodes. diff --git a/branches/sage/cephmds2/mds/SimpleLock.h b/branches/sage/cephmds2/mds/SimpleLock.h index 23669126c72f9..faeb885d81777 100644 --- a/branches/sage/cephmds2/mds/SimpleLock.h +++ b/branches/sage/cephmds2/mds/SimpleLock.h @@ -165,19 +165,26 @@ public: // ref counting bool is_rdlocked() { return num_rdlock > 0; } - int get_rdlock() { return ++num_rdlock; } + int get_rdlock() { + if (!num_rdlock) parent->get(MDSCacheObject::PIN_LOCK); + return ++num_rdlock; + } int put_rdlock() { assert(num_rdlock>0); - return --num_rdlock; + --num_rdlock; + if (num_rdlock == 0) parent->put(MDSCacheObject::PIN_LOCK); + return num_rdlock; } int get_num_rdlocks() { return num_rdlock; } void get_xlock(MDRequest *who) { assert(xlock_by == 0); + parent->get(MDSCacheObject::PIN_LOCK); xlock_by = who; } void put_xlock() { assert(xlock_by); + parent->put(MDSCacheObject::PIN_LOCK); xlock_by = 0; } bool is_xlocked() { return xlock_by ? true:false; } diff --git a/branches/sage/cephmds2/mds/events/EMetaBlob.h b/branches/sage/cephmds2/mds/events/EMetaBlob.h index 9aff60c541052..1d7222104947e 100644 --- a/branches/sage/cephmds2/mds/events/EMetaBlob.h +++ b/branches/sage/cephmds2/mds/events/EMetaBlob.h @@ -212,6 +212,9 @@ class EMetaBlob { // anchor transactions included in this update. list atids; + // inode dirlocks (scatterlocks) i've touched. + map dirty_inode_mtimes; + // ino's i've allocated list allocated_inos; version_t alloc_tablev; @@ -232,6 +235,10 @@ class EMetaBlob { atids.push_back(atid); } + void add_dirtied_inode_mtime(inodeno_t ino, utime_t ctime) { + dirty_inode_mtimes[ino] = ctime; + } + void add_allocated_ino(inodeno_t ino, version_t tablev) { allocated_inos.push_back(ino); alloc_tablev = tablev; @@ -355,6 +362,7 @@ class EMetaBlob { lump_map[*i]._encode(bl); } ::_encode(atids, bl); + ::_encode(dirty_inode_mtimes, bl); ::_encode(allocated_inos, bl); if (!allocated_inos.empty()) ::_encode(alloc_tablev, bl); @@ -373,6 +381,7 @@ class EMetaBlob { lump_map[dirfrag]._decode(bl, off); } ::_decode(atids, bl, off); + ::_decode(dirty_inode_mtimes, bl, off); ::_decode(allocated_inos, bl, off); if (!allocated_inos.empty()) ::_decode(alloc_tablev, bl, off); diff --git a/branches/sage/cephmds2/mds/journal.cc b/branches/sage/cephmds2/mds/journal.cc index c337bf017359a..988bf6e2b15aa 100644 --- a/branches/sage/cephmds2/mds/journal.cc +++ b/branches/sage/cephmds2/mds/journal.cc @@ -138,6 +138,20 @@ bool EMetaBlob::has_expired(MDS *mds) return false; } } + + if (!dirty_inode_mtimes.empty()) + for (map::iterator p = dirty_inode_mtimes.begin(); + p != dirty_inode_mtimes.end(); + ++p) { + CInode *in = mds->mdcache->get_inode(p->first); + if (in) { + if (in->inode.ctime == p->second && + in->dirlock.is_updated()) { + dout(10) << "EMetaBlob.has_expired dirty mtime dirlock hasn't flushed on " << *in << endl; + return false; + } + } + } // allocated_ios if (!allocated_inos.empty()) { @@ -270,6 +284,22 @@ void EMetaBlob::expire(MDS *mds, Context *c) } } + // dirtied inode mtimes + if (!dirty_inode_mtimes.empty()) + for (map::iterator p = dirty_inode_mtimes.begin(); + p != dirty_inode_mtimes.end(); + ++p) { + CInode *in = mds->mdcache->get_inode(p->first); + if (in) { + if (in->inode.ctime == p->second && + in->dirlock.is_updated()) { + dout(10) << "EMetaBlob.expire dirty mtime dirlock hasn't flushed, waiting on " + << *in << endl; + in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); + } + } + } + // allocated_inos if (!allocated_inos.empty()) { version_t cv = mds->idalloc->get_committed_version(); @@ -437,6 +467,16 @@ void EMetaBlob::replay(MDS *mds) mds->anchorclient->got_journaled_agree(*p); } + // dirtied inode mtimes + if (!dirty_inode_mtimes.empty()) + for (map::iterator p = dirty_inode_mtimes.begin(); + p != dirty_inode_mtimes.end(); + ++p) { + CInode *in = mds->mdcache->get_inode(p->first); + dout(10) << "EMetaBlob.replay setting dirlock updated flag on " << *in << endl; + in->dirlock.set_updated(); + } + // allocated_inos if (!allocated_inos.empty()) { if (mds->idalloc->get_version() >= alloc_tablev) { diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h index 749de0e85e7f7..b2c3c302d99dd 100644 --- a/branches/sage/cephmds2/mds/mdstypes.h +++ b/branches/sage/cephmds2/mds/mdstypes.h @@ -292,19 +292,19 @@ class MDSCacheObject { // -- pins -- const static int PIN_REPLICATED = 1000; const static int PIN_DIRTY = 1001; - const static int PIN_RDLOCK = -1002; - const static int PIN_XLOCK = 1003; - const static int PIN_REQUEST = -1004; - const static int PIN_WAITER = 1005; + const static int PIN_LOCK = -1002; + const static int PIN_REQUEST = -1003; + const static int PIN_WAITER = 1004; + const static int PIN_DIRTYSCATTERED = 1005; const char *generic_pin_name(int p) { switch (p) { case PIN_REPLICATED: return "replicated"; case PIN_DIRTY: return "dirty"; - case PIN_RDLOCK: return "rdlock"; - case PIN_XLOCK: return "xlock"; + case PIN_LOCK: return "lock"; case PIN_REQUEST: return "request"; case PIN_WAITER: return "waiter"; + case PIN_DIRTYSCATTERED: return "dirtyscattered"; default: assert(0); } } -- 2.39.5