From f571124dc54c6dc937f0d6cb17874a03e38e2968 Mon Sep 17 00:00:00 2001 From: sageweil Date: Mon, 2 Jul 2007 17:55:24 +0000 Subject: [PATCH] * migrator: pin exporting dirs * migrator: send export_cancel to right person.. and dispatch it * locker: auth_pin in unstable states * locker: _eval functions broken up, cleaned up * server: projected_inodes tweaking * server: commit/rollback needs reworking. link() partly there. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1465 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 17 +- branches/sage/cephmds2/ebofs/FileJournal.cc | 2 +- branches/sage/cephmds2/mds/CInode.cc | 26 ++ branches/sage/cephmds2/mds/CInode.h | 41 +- branches/sage/cephmds2/mds/Locker.cc | 413 +++++++++--------- branches/sage/cephmds2/mds/Locker.h | 8 +- branches/sage/cephmds2/mds/MDCache.cc | 47 +- branches/sage/cephmds2/mds/Migrator.cc | 18 +- branches/sage/cephmds2/mds/Server.cc | 129 +++--- branches/sage/cephmds2/mds/Server.h | 6 +- .../sage/cephmds2/mds/events/ESlaveUpdate.h | 5 +- branches/sage/cephmds2/mds/journal.cc | 2 +- branches/sage/cephmds2/mon/MDSMonitor.cc | 4 + 13 files changed, 417 insertions(+), 301 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 1be4677f036ae..0c39ad48af2ef 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -33,7 +33,6 @@ general kernel planning - sage doc - mdsmonitor beacon semantics - cache expiration, cache invariants @@ -47,14 +46,26 @@ sage doc sage mds +- slave request rollback - why it's hard. + - we pipeline multiple updates to the same inode. +- e.g., link/unlink + - live metadata needs to be cleaned (fi slave survives) + - sufficient data for rollback (old ctime, ++ or --) needs to be journaled with original prepare + - slave recovery needs to apply prepare's rollback to it's live metadata... _and_ re-journal a rollback entry (if it needs to) + - rollback should include cleaned inode data, with (!!) 
incremented version so that reverted data gets flushed to storage properly. +- e.g., rename + - hrm.! + + - unlink needs to journal on witnesses (probably), since unlinked inodes may be in those journals -- rename_prep should create a subtree (auth,auth) so that cache expires are routed properly -- make locks auth_pin for unstable states. +- make locks auth_pin for unstable states. + - can locker *_eval() starve freezing? - fix rename to delay the _apply. - need to fix locking vs migration first. +- rename_prep should create a subtree (auth,auth) so that cache expires are routed properly - then, mtimes: - avoid migration race concern (on auth). diff --git a/branches/sage/cephmds2/ebofs/FileJournal.cc b/branches/sage/cephmds2/ebofs/FileJournal.cc index 74edecf41c71a..8bc942c861b92 100644 --- a/branches/sage/cephmds2/ebofs/FileJournal.cc +++ b/branches/sage/cephmds2/ebofs/FileJournal.cc @@ -23,7 +23,7 @@ #include "config.h" #undef dout -#define dout(x) if (true || x <= g_conf.debug_ebofs) cout << "ebofs(" << ebofs->dev.get_device_name() << ").journal " +#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << ebofs->dev.get_device_name() << ").journal " #define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << ebofs->dev.get_device_name() << ").journal " diff --git a/branches/sage/cephmds2/mds/CInode.cc b/branches/sage/cephmds2/mds/CInode.cc index 05018b8265502..ef2cf83c19a35 100644 --- a/branches/sage/cephmds2/mds/CInode.cc +++ b/branches/sage/cephmds2/mds/CInode.cc @@ -97,6 +97,32 @@ void CInode::print(ostream& out) } + +inode_t *CInode::project_inode() +{ + if (projected_inode.empty()) { + projected_inode.push_back(new inode_t(inode)); + } else { + projected_inode.push_back(new inode_t(*projected_inode.back())); + } + dout(15) << "project_inode " << projected_inode.back() << endl; + return projected_inode.back(); +} + +void CInode::pop_and_dirty_projected_inode() +{ + assert(!projected_inode.empty()); + dout(15) << "pop_and_dirty_projected_inode " 
<< projected_inode.front() + << " v" << projected_inode.front()->version << endl; + mark_dirty(projected_inode.front()->version); + inode = *projected_inode.front(); + delete projected_inode.front(); + projected_inode.pop_front(); +} + + + + // ====== CInode ======= // dirfrags diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index c0fc9c5881ffb..04412a0af8261 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -122,16 +122,33 @@ class CInode : public MDSCacheObject { off_t last_open_journaled; // log offset for the last journaled EOpen // projected values (only defined while dirty) - list projected_inode; + list projected_inode; list projected_dirfragtree; - + + + inode_t *project_inode(); + void pop_and_dirty_projected_inode(); + + /* inode_t *project_inode() { - if (projected_inode.empty()) - projected_inode.push_back(inode); - else - projected_inode.push_back(projected_inode.back()); - return &projected_inode.back(); + if (projected_inode.empty()) { + projected_inode.push_back(new inode_t(inode)); + } else { + inode_t *lastback = projected_inode.back(); + projected_inode.push_back(new inode_t); + *projected_inode.back() = *lastback; + } + return projected_inode.back(); } + void pop_and_dirty_projected_inode() { + assert(!projected_inode.empty()); + mark_dirty(projected_inode.front()->version); + inode = *projected_inode.front(); + delete projected_inode.front(); + projected_inode.pop_front(); + }*/ + + /* fragtree_t *project_dirfragtree() { if (projected_dirfragtree.empty()) projected_dirfragtree.push_back(dirfragtree); @@ -139,16 +156,12 @@ class CInode : public MDSCacheObject { projected_dirfragtree.push_back(projected_dirfragtree.back()); return &projected_dirfragtree.back(); } - void pop_and_dirty_projected_inode() { - mark_dirty(projected_inode.front().version); - inode = projected_inode.front(); - projected_inode.pop_front(); - } void pop_and_dirty_projected_dirfragtree() { - 
mark_dirty(projected_inode.front().version); + assert(!projected_dirfragtree.empty()); + mark_dirty(projected_dirfragtree.front().version); dirfragtree = projected_dirfragtree.front(); projected_dirfragtree.pop_front(); - } + }*/ // -- cache infrastructure -- diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc index 2332208dde482..26ff41f90ebf7 100644 --- a/branches/sage/cephmds2/mds/Locker.cc +++ b/branches/sage/cephmds2/mds/Locker.cc @@ -427,7 +427,8 @@ Capability* Locker::issue_new_caps(CInode *in, if (in->is_auth()) { // [auth] twiddle mode? - file_eval(&in->filelock); + if (in->filelock.is_stable()) + file_eval(&in->filelock); } else { // [replica] tell auth about any new caps wanted request_inode_file_caps(in); @@ -584,7 +585,8 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m) else in->mds_caps_wanted.erase(m->get_from()); - try_file_eval(&in->filelock); // ** may or may not be auth_pinned ** + if (in->filelock.is_stable()) + try_file_eval(&in->filelock); // ** may or may not be auth_pinned ** delete m; } @@ -674,7 +676,8 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) } // reevaluate, waiters - try_file_eval(&in->filelock); // ** may or may not be auth_pinned ** + if (in->filelock.is_stable()) + try_file_eval(&in->filelock); // ** may or may not be auth_pinned ** in->finish_waiting(CInode::WAIT_CAPS, 0); delete m; @@ -853,6 +856,7 @@ void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) delete m; } +/* unused, currently. class C_Locker_SimpleEval : public Context { Locker *locker; @@ -860,10 +864,38 @@ class C_Locker_SimpleEval : public Context { public: C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {} void finish(int r) { - locker->simple_eval(lock); + locker->try_simple_eval(lock); } }; +void Locker::try_simple_eval(SimpleLock *lock) +{ + // unstable and ambiguous auth? 
+ if (!lock->is_stable() && + lock->get_parent()->is_ambiguous_auth()) { + dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl; + //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock)); + return; + } + + if (!lock->get_parent()->is_auth()) { + dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << endl; + return; + } + + if (!lock->get_parent()->can_auth_pin()) { + dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << endl; + //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_Locker_SimpleEval(this, lock)); + return; + } + + if (lock->is_stable()) + simple_eval(lock); +} +*/ + void Locker::simple_eval_gather(SimpleLock *lock) { dout(10) << "simple_eval_gather " << *lock << " on " << *lock->get_parent() << endl; @@ -884,9 +916,10 @@ void Locker::simple_eval_gather(SimpleLock *lock) lock->set_state(LOCK_LOCK); lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); + lock->get_parent()->auth_unpin(); // re-eval? - if (lock->get_parent()->can_auth_pin()) + //if (lock->get_parent()->can_auth_pin()) simple_eval(lock); } } @@ -895,35 +928,11 @@ void Locker::simple_eval(SimpleLock *lock) { dout(10) << "simple_eval " << *lock << " on " << *lock->get_parent() << endl; - // unstable and ambiguous auth? - if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock)); - return; - } - - // finished remote xlock? - /* hmm: why did i do this here, and not in simple_xlock_finish()? 
- if (lock->get_state() == LOCK_REMOTEXLOCK && - !lock->is_xlocked()) { - // tell auth - assert(!lock->get_parent()->is_auth()); // should be auth_pinned on the auth - dout(7) << "simple_eval releasing remote xlock on " << *lock->get_parent() << endl; - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(); - mds->send_message_mds(new MLock(lock, LOCK_AC_UNXLOCK, mds->get_nodeid()), - auth, MDS_PORT_LOCKER); - lock->set_state(LOCK_LOCK); - } - */ + assert(lock->get_parent()->is_auth()); + assert(lock->is_stable()); // stable -> sync? - if (lock->get_parent()->is_auth() && - lock->is_stable() && - !lock->is_xlocked() && + if (!lock->is_xlocked() && lock->get_state() != LOCK_SYNC && !lock->is_waiter_for(SimpleLock::WAIT_WR)) { dout(7) << "simple_eval stable, syncing " << *lock @@ -940,12 +949,11 @@ void Locker::simple_sync(SimpleLock *lock) { dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << endl; assert(lock->get_parent()->is_auth()); + assert(lock->is_stable()); // check state if (lock->get_state() == LOCK_SYNC) return; // already sync - if (lock->get_state() == LOCK_GLOCKR) - assert(0); // um... hmm! assert(lock->get_state() == LOCK_LOCK); // sync. 
@@ -969,11 +977,10 @@ void Locker::simple_lock(SimpleLock *lock) { dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << endl; assert(lock->get_parent()->is_auth()); + assert(lock->is_stable()); // check state - if (lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_GLOCKR) - return; // already lock or locking + if (lock->get_state() == LOCK_LOCK) return; assert(lock->get_state() == LOCK_SYNC); if (lock->get_parent()->is_replicated()) { @@ -983,6 +990,7 @@ void Locker::simple_lock(SimpleLock *lock) // change lock lock->set_state(LOCK_GLOCKR); lock->init_gather(); + lock->get_parent()->auth_pin(); } else { lock->set_state(LOCK_LOCK); } @@ -1130,8 +1138,9 @@ void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr) // others waiting? lock->finish_waiters(SimpleLock::WAIT_WR, 0); - // eval - simple_eval_gather(lock); + // eval? + if (lock->get_parent()->is_auth()) + simple_eval(lock); } @@ -1301,11 +1310,39 @@ class C_Locker_ScatterEval : public Context { public: C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {} void finish(int r) { - locker->scatter_eval(lock); + locker->try_scatter_eval(lock); } }; +void Locker::try_scatter_eval(ScatterLock *lock) +{ + // unstable and ambiguous auth? 
+ if (!lock->is_stable() && + lock->get_parent()->is_ambiguous_auth()) { + dout(7) << "scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl; + //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); + return; + } + + if (!lock->get_parent()->is_auth()) { + dout(7) << "try_scatter_eval not auth for " << *lock->get_parent() << endl; + return; + } + + if (!lock->get_parent()->can_auth_pin()) { + dout(7) << "try_scatter_eval can't auth_pin, waiting on " << *lock->get_parent() << endl; + //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_Locker_ScatterEval(this, lock)); + return; + } + + if (lock->is_stable()) + scatter_eval(lock); +} + + void Locker::scatter_eval_gather(ScatterLock *lock) { dout(10) << "scatter_eval_gather " << *lock << " on " << *lock->get_parent() << endl; @@ -1339,16 +1376,15 @@ void Locker::scatter_eval_gather(ScatterLock *lock) << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_LOCK); lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } // glockc -> lock? - if (lock->get_state() == LOCK_GLOCKC && - !lock->is_gathering() && - !lock->is_wrlocked()) { + else if (lock->get_state() == LOCK_GLOCKC && + !lock->is_gathering() && + !lock->is_wrlocked()) { dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_LOCK); - //lock->get_parent()->put(CInode::PIN_SCATTERED); if (lock->is_updated()) { // updated flag is set: we got new data during the gather. 
@@ -1357,59 +1393,62 @@ void Locker::scatter_eval_gather(ScatterLock *lock) } + lock->set_state(LOCK_LOCK); + //lock->get_parent()->put(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } // gSyncL -> sync? - if (lock->get_state() == LOCK_GSYNCL && - !lock->is_wrlocked()) { + else if (lock->get_state() == LOCK_GSYNCL && + !lock->is_wrlocked()) { dout(7) << "scatter_eval finished sync un-wrlock on " << *lock << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - if (lock->get_parent()->is_replicated()) { // encode and bcast bufferlist data; lock->encode_locked_state(data); send_lock_message(lock, LOCK_AC_SYNC, data); } + lock->set_state(LOCK_SYNC); + lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } // gscattert|gscatters -> scatter? - if ((lock->get_state() == LOCK_GSCATTERT || - lock->get_state() == LOCK_GSCATTERS) && - !lock->is_gathering() && - !lock->is_rdlocked()) { + else if ((lock->get_state() == LOCK_GSCATTERT || + lock->get_state() == LOCK_GSCATTERS) && + !lock->is_gathering() && + !lock->is_rdlocked()) { dout(7) << "scatter_eval finished scatter un-rdlock(/gather) on " << *lock << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_SCATTER); - //lock->get_parent()->get(CInode::PIN_SCATTERED); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - if (lock->get_parent()->is_replicated()) { // encode and bcast bufferlist data; lock->encode_locked_state(data); send_lock_message(lock, LOCK_AC_SCATTER, data); } + lock->set_state(LOCK_SCATTER); + //lock->get_parent()->get(CInode::PIN_SCATTERED); + lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } // gTempsyncC|gTempsyncL -> tempsync - if ((lock->get_state() == LOCK_GTEMPSYNCC || - lock->get_state() == 
LOCK_GTEMPSYNCL) && - !lock->is_gathering() && - !lock->is_wrlocked()) { + else if ((lock->get_state() == LOCK_GTEMPSYNCC || + lock->get_state() == LOCK_GTEMPSYNCL) && + !lock->is_gathering() && + !lock->is_wrlocked()) { dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_TEMPSYNC); lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } // re-eval? - if (lock->is_stable() && - lock->get_parent()->can_auth_pin()) + if (lock->is_stable()) // && lock->get_parent()->can_auth_pin()) scatter_eval(lock); } } @@ -1418,31 +1457,22 @@ void Locker::scatter_eval(ScatterLock *lock) { dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << endl; - // unstable and ambiguous auth? - if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); - return; - } + assert(lock->get_parent()->is_auth()); + assert(lock->is_stable()); - if (lock->get_parent()->is_auth() && - lock->is_stable()) { - if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) { - // i _should_ be scattered. - if (!lock->is_rdlocked() && - !lock->is_xlocked()) { - dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << endl; - scatter_scatter(lock); - } - } else { - // i _should_ be sync. - if (!lock->is_wrlocked() && - !lock->is_xlocked()) { - dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << endl; - scatter_sync(lock); - } + if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) { + // i _should_ be scattered. 
+ if (!lock->is_rdlocked() && + !lock->is_xlocked()) { + dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << endl; + scatter_scatter(lock); + } + } else { + // i _should_ be sync. + if (!lock->is_wrlocked() && + !lock->is_xlocked()) { + dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << endl; + scatter_sync(lock); } } } @@ -1453,7 +1483,6 @@ void Locker::scatter_sync(ScatterLock *lock) dout(10) << "scatter_sync " << *lock << " on " << *lock->get_parent() << endl; assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); switch (lock->get_state()) { @@ -1466,6 +1495,7 @@ void Locker::scatter_sync(ScatterLock *lock) case LOCK_LOCK: if (lock->is_wrlocked() || lock->is_xlocked()) { lock->set_state(LOCK_GSYNCL); + lock->get_parent()->auth_pin(); return; } break; // do it. @@ -1482,6 +1512,7 @@ void Locker::scatter_sync(ScatterLock *lock) } } lock->set_state(LOCK_GLOCKC); + lock->get_parent()->auth_pin(); return; default: @@ -1517,6 +1548,7 @@ void Locker::scatter_scatter(ScatterLock *lock) lock->init_gather(); } lock->set_state(LOCK_GSCATTERS); + lock->get_parent()->auth_pin(); return; case LOCK_LOCK: @@ -1530,6 +1562,7 @@ void Locker::scatter_scatter(ScatterLock *lock) case LOCK_TEMPSYNC: if (lock->is_rdlocked()) { lock->set_state(LOCK_GSCATTERT); + lock->get_parent()->auth_pin(); return; } break; // do it @@ -1555,7 +1588,6 @@ void Locker::scatter_lock(ScatterLock *lock) dout(10) << "scatter_lock " << *lock << " on " << *lock->get_parent() << endl; assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); switch (lock->get_state()) { @@ -1569,6 +1601,7 @@ void Locker::scatter_lock(ScatterLock *lock) lock->init_gather(); } lock->set_state(LOCK_GLOCKS); + lock->get_parent()->auth_pin(); return; case LOCK_LOCK: @@ -1586,11 +1619,13 @@ void Locker::scatter_lock(ScatterLock *lock) lock->init_gather(); } lock->set_state(LOCK_GLOCKC); + lock->get_parent()->auth_pin(); return; case LOCK_TEMPSYNC: 
if (lock->is_rdlocked()) { lock->set_state(LOCK_GLOCKT); + lock->get_parent()->auth_pin(); return; } break; // do it. @@ -1606,7 +1641,6 @@ void Locker::scatter_tempsync(ScatterLock *lock) dout(10) << "scatter_tempsync " << *lock << " on " << *lock->get_parent() << endl; assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); switch (lock->get_state()) { @@ -1617,6 +1651,7 @@ void Locker::scatter_tempsync(ScatterLock *lock) if (lock->is_wrlocked() || lock->is_xlocked()) { lock->set_state(LOCK_GTEMPSYNCL); + lock->get_parent()->auth_pin(); return; } break; // do it. @@ -1633,6 +1668,7 @@ void Locker::scatter_tempsync(ScatterLock *lock) lock->init_gather(); } lock->set_state(LOCK_GTEMPSYNCC); + lock->get_parent()->auth_pin(); return; case LOCK_TEMPSYNC: @@ -1726,7 +1762,7 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() << " from " << from << ", last one" << endl; - scatter_eval(lock); + scatter_eval_gather(lock); } break; @@ -1882,9 +1918,8 @@ void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) // others waiting? lock->finish_waiters(SimpleLock::WAIT_WR, 0); - //// drop lock? 
- //if (!lock->is_waiter_for(SimpleLock::WAIT_STABLE)) - file_eval_gather(lock); + if (lock->get_parent()->is_auth()) + file_eval(lock); } @@ -1901,7 +1936,7 @@ class C_Locker_FileEval : public Context { public: C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {} void finish(int r) { - locker->file_eval(lock); + locker->try_file_eval(lock); } }; @@ -1918,6 +1953,11 @@ void Locker::try_file_eval(FileLock *lock) return; } + if (!lock->get_parent()->is_auth()) { + dout(7) << "try_file_eval not auth for " << *lock->get_parent() << endl; + return; + } + if (!lock->get_parent()->can_auth_pin()) { dout(7) << "try_file_eval can't auth_pin, waiting on " << *in << endl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) @@ -1925,7 +1965,8 @@ void Locker::try_file_eval(FileLock *lock) return; } - file_eval(lock); + if (lock->is_stable()) + file_eval(lock); } @@ -1935,9 +1976,10 @@ void Locker::file_eval_gather(FileLock *lock) CInode *in = (CInode*)lock->get_parent(); int issued = in->get_caps_issued(); + assert(!lock->is_stable()); + // [auth] finished gather? 
if (in->is_auth() && - !lock->is_stable() && !lock->is_gathering()) { dout(7) << "file_eval finished mds gather on " << *lock << " on " << *lock->get_parent() << endl; @@ -1953,6 +1995,7 @@ void Locker::file_eval_gather(FileLock *lock) lock->get_rdlock(); lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD); lock->put_rdlock(); + lock->get_parent()->auth_unpin(); } break; @@ -1961,6 +2004,7 @@ void Locker::file_eval_gather(FileLock *lock) if ((issued & ~(CAP_FILE_RD)) == 0) { lock->set_state(LOCK_MIXED); lock->finish_waiters(SimpleLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } break; @@ -1978,6 +2022,7 @@ void Locker::file_eval_gather(FileLock *lock) } lock->finish_waiters(SimpleLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } break; @@ -1986,6 +2031,7 @@ void Locker::file_eval_gather(FileLock *lock) if (issued == 0) { lock->set_state(LOCK_LONER); lock->finish_waiters(SimpleLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } break; @@ -1993,6 +2039,7 @@ void Locker::file_eval_gather(FileLock *lock) if ((issued & ~CAP_FILE_WR) == 0) { lock->set_state(LOCK_LONER); lock->finish_waiters(SimpleLock::WAIT_STABLE); + lock->get_parent()->auth_unpin(); } break; @@ -2013,6 +2060,7 @@ void Locker::file_eval_gather(FileLock *lock) lock->get_rdlock(); lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); lock->put_rdlock(); + lock->get_parent()->auth_unpin(); } break; @@ -2021,11 +2069,14 @@ void Locker::file_eval_gather(FileLock *lock) } issue_caps(in); + + // stable re-eval? + if (lock->is_stable()) //&& lock->get_parent()->can_auth_pin()) + file_eval(lock); } // [replica] finished caps gather? - if (!in->is_auth() && - !lock->is_stable()) { + if (!in->is_auth()) { switch (lock->get_state()) { case LOCK_GMIXEDR: if ((issued & ~(CAP_FILE_RD)) == 0) { @@ -2053,81 +2104,61 @@ void Locker::file_eval_gather(FileLock *lock) } - // re-eval? 
- if (lock->get_parent()->is_auth() && - lock->is_stable() && - lock->get_parent()->can_auth_pin()) - file_eval(lock); } void Locker::file_eval(FileLock *lock) { CInode *in = (CInode*)lock->get_parent(); + int wanted = in->get_caps_wanted(); + bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); + dout(7) << "file_eval wanted=" << cap_string(wanted) + << " filelock=" << *lock << " on " << *lock->get_parent() + << " loner=" << loner + << endl; - // unstable and ambiguous auth? - if (!lock->is_stable() && - in->is_ambiguous_auth()) { - dout(7) << "file_eval not stable and ambiguous auth, waiting on " << *in << endl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock)); - return; - } - - // !stable -> do nothing. - if (!lock->is_stable()) return; - - // stable. + assert(lock->get_parent()->is_auth()); assert(lock->is_stable()); - if (in->is_auth() && - !lock->is_xlocked()) { - // [auth] - // and not xlocked! - int wanted = in->get_caps_wanted(); - bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "file_eval wanted=" << cap_string(wanted) - << " filelock=" << *lock << " on " << *lock->get_parent() - << " loner=" << loner - << endl; - - // * -> loner? - if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_WR) && - loner && - lock->get_state() != LOCK_LONER) { - dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << endl; - file_loner(lock); - } - - // * -> mixed? - else if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_RD) && - (wanted & CAP_FILE_WR) && - !(loner && lock->get_state() == LOCK_LONER) && - lock->get_state() != LOCK_MIXED) { - dout(7) << "file_eval stable, bump to mixed " << *lock << " on " << *lock->get_parent() << endl; - file_mixed(lock); - } - - // * -> sync? 
- else if (!in->filelock.is_waiter_for(SimpleLock::WAIT_WR) && - !(wanted & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) && - ((wanted & CAP_FILE_RD) || - in->is_replicated() || - (!loner && lock->get_state() == LOCK_LONER)) && - lock->get_state() != LOCK_SYNC) { - dout(7) << "file_eval stable, bump to sync " << *lock << " on " << *lock->get_parent() << endl; - file_sync(lock); - } - - // * -> lock? (if not replicated or open) - else if (!in->is_replicated() && - wanted == 0 && - lock->get_state() != LOCK_LOCK) { - file_lock(lock); - } + // not xlocked! + if (lock->is_xlocked()) return; + + // * -> loner? + if (!lock->is_rdlocked() && + !lock->is_waiter_for(SimpleLock::WAIT_WR) && + (wanted & CAP_FILE_WR) && + loner && + lock->get_state() != LOCK_LONER) { + dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << endl; + file_loner(lock); + } + + // * -> mixed? + else if (!lock->is_rdlocked() && + !lock->is_waiter_for(SimpleLock::WAIT_WR) && + (wanted & CAP_FILE_RD) && + (wanted & CAP_FILE_WR) && + !(loner && lock->get_state() == LOCK_LONER) && + lock->get_state() != LOCK_MIXED) { + dout(7) << "file_eval stable, bump to mixed " << *lock << " on " << *lock->get_parent() << endl; + file_mixed(lock); + } + + // * -> sync? + else if (!in->filelock.is_waiter_for(SimpleLock::WAIT_WR) && + !(wanted & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) && + ((wanted & CAP_FILE_RD) || + in->is_replicated() || + (!loner && lock->get_state() == LOCK_LONER)) && + lock->get_state() != LOCK_SYNC) { + dout(7) << "file_eval stable, bump to sync " << *lock << " on " << *lock->get_parent() << endl; + file_sync(lock); + } + + // * -> lock? 
(if not replicated or open) + else if (!in->is_replicated() && + wanted == 0 && + lock->get_state() != LOCK_LOCK) { + file_lock(lock); } } @@ -2140,13 +2171,6 @@ bool Locker::file_sync(FileLock *lock) dout(7) << "file_sync " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); - - // check state - if (lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_GSYNCL || - lock->get_state() == LOCK_GSYNCM) - return true; - assert(lock->is_stable()); int issued = in->get_caps_issued(); @@ -2172,6 +2196,7 @@ bool Locker::file_sync(FileLock *lock) if (issued & CAP_FILE_WR) { // gather client write caps lock->set_state(LOCK_GSYNCM); + lock->get_parent()->auth_pin(); issue_caps(in); } else { // no writers, go straight to sync @@ -2192,6 +2217,7 @@ bool Locker::file_sync(FileLock *lock) if (issued & CAP_FILE_WR) { // gather client write caps lock->set_state(LOCK_GSYNCL); + lock->get_parent()->auth_pin(); issue_caps(in); } else { // no writers, go straight to sync @@ -2220,14 +2246,6 @@ void Locker::file_lock(FileLock *lock) dout(7) << "inode_file_lock " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); - - // check state - if (lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_GLOCKR || - lock->get_state() == LOCK_GLOCKM || - lock->get_state() == LOCK_GLOCKL) - return; // lock or locking - assert(lock->is_stable()); int issued = in->get_caps_issued(); @@ -2240,6 +2258,7 @@ void Locker::file_lock(FileLock *lock) // change lock lock->set_state(LOCK_GLOCKR); + lock->get_parent()->auth_pin(); // call back caps if (issued) @@ -2248,6 +2267,7 @@ void Locker::file_lock(FileLock *lock) if (issued) { // call back caps lock->set_state(LOCK_GLOCKR); + lock->get_parent()->auth_pin(); issue_caps(in); } else { lock->set_state(LOCK_LOCK); @@ -2263,6 +2283,7 @@ void Locker::file_lock(FileLock *lock) // change lock lock->set_state(LOCK_GLOCKM); + lock->get_parent()->auth_pin(); // call back caps issue_caps(in); @@ -2271,6 +2292,7 
@@ void Locker::file_lock(FileLock *lock) if (issued) { // change lock lock->set_state(LOCK_GLOCKM); + lock->get_parent()->auth_pin(); // call back caps issue_caps(in); @@ -2284,7 +2306,8 @@ void Locker::file_lock(FileLock *lock) if (issued & CAP_FILE_WR) { // change lock lock->set_state(LOCK_GLOCKL); - + lock->get_parent()->auth_pin(); + // call back caps issue_caps(in); } else { @@ -2302,12 +2325,6 @@ void Locker::file_mixed(FileLock *lock) CInode *in = (CInode*)lock->get_parent(); assert(in->is_auth()); - - // check state - if (lock->get_state() == LOCK_GMIXEDR || - lock->get_state() == LOCK_GMIXEDL) - return; // mixed or mixing - assert(lock->is_stable()); int issued = in->get_caps_issued(); @@ -2319,11 +2336,14 @@ void Locker::file_mixed(FileLock *lock) lock->init_gather(); lock->set_state(LOCK_GMIXEDR); + lock->get_parent()->auth_pin(); + issue_caps(in); } else { if (issued) { lock->set_state(LOCK_GMIXEDR); - issue_caps(in); + lock->get_parent()->auth_pin(); + issue_caps(in); } else { lock->set_state(LOCK_MIXED); } @@ -2349,6 +2369,7 @@ void Locker::file_mixed(FileLock *lock) if (issued & CAP_FILE_WRBUFFER) { // gather up WRBUFFER caps lock->set_state(LOCK_GMIXEDL); + lock->get_parent()->auth_pin(); issue_caps(in); } else if (in->is_replicated()) { @@ -2373,14 +2394,8 @@ void Locker::file_loner(FileLock *lock) dout(7) << "inode_file_loner " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); - - // check state - if (lock->get_state() == LOCK_LONER || - lock->get_state() == LOCK_GLONERR || - lock->get_state() == LOCK_GLONERM) - return; - assert(lock->is_stable()); + assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); if (lock->get_state() == LOCK_SYNC) { @@ -2391,6 +2406,7 @@ void Locker::file_loner(FileLock *lock) // change lock lock->set_state(LOCK_GLONERR); + lock->get_parent()->auth_pin(); } else { // only one guy with file open, who gets it all, so lock->set_state(LOCK_LONER); @@ -2412,6 +2428,7 @@ void 
Locker::file_loner(FileLock *lock) // change lock lock->set_state(LOCK_GLONERM); + lock->get_parent()->auth_pin(); } else { lock->set_state(LOCK_LONER); issue_caps(in); @@ -2478,7 +2495,7 @@ void Locker::handle_file_lock(FileLock *lock, MLock *m) issue_caps(in); break; } - if (lock->is_rdlocked()) { + else if (lock->is_rdlocked()) { dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << endl; break; } diff --git a/branches/sage/cephmds2/mds/Locker.h b/branches/sage/cephmds2/mds/Locker.h index f5d297186bc74..7fe6c8c41decd 100644 --- a/branches/sage/cephmds2/mds/Locker.h +++ b/branches/sage/cephmds2/mds/Locker.h @@ -86,10 +86,11 @@ protected: // simple public: + void try_simple_eval(SimpleLock *lock); void simple_eval_gather(SimpleLock *lock); - void simple_eval(SimpleLock *lock); bool simple_rdlock_try(SimpleLock *lock, Context *con); protected: + void simple_eval(SimpleLock *lock); void handle_simple_lock(SimpleLock *lock, MLock *m); void simple_sync(SimpleLock *lock); void simple_lock(SimpleLock *lock); @@ -105,8 +106,9 @@ public: // scatter public: + void try_scatter_eval(ScatterLock *lock); + void scatter_eval(ScatterLock *lock); // public for MDCache::adjust_subtree_auth() void scatter_eval_gather(ScatterLock *lock); - void scatter_eval(ScatterLock *lock); protected: void handle_scatter_lock(ScatterLock *lock, MLock *m); void scatter_sync(ScatterLock *lock); @@ -121,9 +123,9 @@ protected: // file public: void file_eval_gather(FileLock *lock); - void file_eval(FileLock *lock); void try_file_eval(FileLock *lock); protected: + void file_eval(FileLock *lock); void handle_file_lock(FileLock *lock, MLock *m); bool file_sync(FileLock *lock); void file_lock(FileLock *lock); diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 2a6eeff95da22..806353ded0738 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -402,8 +402,14 @@ void MDCache::adjust_subtree_auth(CDir 
*dir, pair<int,int> auth) // evaluate subtree inode dirlock? // (we should scatter the dirlock on subtree bounds) - if (dir->inode->is_auth()) - mds->locker->scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned ** + if (dir->inode->is_auth() && + dir->inode->dirlock.is_stable()) { + // force the issue a bit + if (!dir->inode->is_frozen()) + mds->locker->scatter_eval(&dir->inode->dirlock); + else + mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned ** + } show_subtrees(); } @@ -1330,7 +1336,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) if (mds->is_resolve()) { assert(uncommitted_slave_updates[from].count(*p)); uncommitted_slave_updates[from].erase(*p); - mds->mdlog->submit_entry(new ESlaveUpdate("unknown", *p, from, ESlaveUpdate::OP_ABORT)); + mds->mdlog->submit_entry(new ESlaveUpdate("unknown", *p, from, ESlaveUpdate::OP_ROLLBACK)); } else { MDRequest *mdr = request_get(*p); if (mdr->slave_commit) { @@ -2820,16 +2826,14 @@ void MDCache::inode_remove_replica(CInode *in, int from) // note: this code calls _eval more often than it needs to! // fix lock - if (in->authlock.remove_replica(from)) - mds->locker->simple_eval_gather(&in->authlock); - if (in->linklock.remove_replica(from)) - mds->locker->simple_eval_gather(&in->linklock); - if (in->dirfragtreelock.remove_replica(from)) - mds->locker->simple_eval_gather(&in->dirfragtreelock); - if (in->filelock.remove_replica(from)) - mds->locker->simple_eval_gather(&in->filelock); + if (in->authlock.remove_replica(from)) mds->locker->simple_eval_gather(&in->authlock); + if (in->linklock.remove_replica(from)) mds->locker->simple_eval_gather(&in->linklock); + if (in->dirfragtreelock.remove_replica(from)) mds->locker->simple_eval_gather(&in->dirfragtreelock); + if (in->filelock.remove_replica(from)) mds->locker->simple_eval_gather(&in->filelock); + if (in->dirlock.remove_replica(from)) mds->locker->simple_eval_gather(&in->dirlock); // alone now?
+ /* if (!in->is_replicated()) { mds->locker->simple_eval_gather(&in->authlock); mds->locker->simple_eval_gather(&in->linklock); @@ -2837,6 +2841,7 @@ void MDCache::inode_remove_replica(CInode *in, int from) mds->locker->file_eval_gather(&in->filelock); mds->locker->scatter_eval_gather(&in->dirlock); } + */ } void MDCache::dentry_remove_replica(CDentry *dn, int from) @@ -2846,7 +2851,7 @@ void MDCache::dentry_remove_replica(CDentry *dn, int from) // fix lock if (dn->lock.remove_replica(from) || !dn->is_replicated()) - mds->locker->simple_eval(&dn->lock); + mds->locker->simple_eval_gather(&dn->lock); } @@ -2990,7 +2995,7 @@ bool MDCache::shutdown_pass() show_subtrees(); migrator->show_importing(); migrator->show_exporting(); - //show_cache(); + show_cache(); return false; } assert(subtrees.empty()); @@ -4264,11 +4269,11 @@ void MDCache::handle_discover(MDiscover *dis) // add dir if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "not adding unwanted base dir " << *curdir << endl; + dout(7) << "handle_discover not adding unwanted base dir " << *curdir << endl; } else { assert(!curdir->is_ambiguous_auth()); // would be frozen. reply->add_dir( curdir->replicate_to(dis->get_asker()) ); - dout(7) << "added dir " << *curdir << endl; + dout(7) << "handle_discover added dir " << *curdir << endl; } if (dis->get_want().depth() == 0) break; @@ -4322,9 +4327,9 @@ void MDCache::handle_discover(MDiscover *dis) // is this the last (tail) item in the discover traversal? 
bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1); if (tailitem && dis->wants_xlocked()) { - dout(7) << "allowing discovery of xlocked tail " << *dn << endl; + dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << endl; } else { - dout(7) << "blocking on xlocked " << *dn << endl; + dout(7) << "handle_discover blocking on xlocked " << *dn << endl; dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis)); delete reply; return; @@ -4333,7 +4338,7 @@ void MDCache::handle_discover(MDiscover *dis) // add dentry reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "added dentry " << *dn << endl; + dout(7) << "handle_discover added dentry " << *dn << endl; if (!dn->is_primary()) break; // stop on null or remote link. @@ -4342,7 +4347,7 @@ void MDCache::handle_discover(MDiscover *dis) assert(next->is_auth()); reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "added inode " << *next << endl; + dout(7) << "handle_discover added inode " << *next << endl; // descend, keep going. cur = next; @@ -4351,10 +4356,10 @@ void MDCache::handle_discover(MDiscover *dis) // how did we do? if (reply->is_empty()) { - dout(7) << "dropping this empty reply)." << endl; + dout(7) << "handle_discover dropping this empty reply)." 
<< endl; delete reply; } else { - dout(7) << "sending result back to asker mds" << dis->get_asker() << endl; + dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << endl; mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); } diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index 83189d094acbd..9d59e232638b0 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -69,6 +69,9 @@ void Migrator::dispatch(Message *m) case MSG_MDS_EXPORTDIRFINISH: handle_export_finish((MExportDirFinish*)m); break; + case MSG_MDS_EXPORTDIRCANCEL: + handle_export_cancel((MExportDirCancel*)m); + break; // export case MSG_MDS_EXPORTDIRDISCOVERACK: @@ -165,7 +168,7 @@ void Migrator::handle_mds_failure_or_stop(int who) // abort exports: // - that are going to the failed node - // - that aren't frozen yet (to about auth_pin deadlock) + // - that aren't frozen yet (to avoid auth_pin deadlock) if (export_peer[dir] == who || p->second == EXPORT_DISCOVERING || p->second == EXPORT_FREEZING) { // the guy i'm exporting to failed, or we're just freezing. @@ -178,8 +181,9 @@ void Migrator::handle_mds_failure_or_stop(int who) dir->auth_unpin(); export_state.erase(dir); // clean up dir->state_clear(CDir::STATE_EXPORTING); + dir->put(CDir::PIN_EXPORTING); if (export_peer[dir] != who) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), who, MDS_PORT_MIGRATOR); + mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); break; case EXPORT_FREEZING: @@ -187,8 +191,9 @@ void Migrator::handle_mds_failure_or_stop(int who) dir->unfreeze_tree(); // cancel the freeze export_state.erase(dir); // clean up dir->state_clear(CDir::STATE_EXPORTING); + dir->put(CDir::PIN_EXPORTING); if (export_peer[dir] != who) // tell them. 
- mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), who, MDS_PORT_MIGRATOR); + mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); break; // NOTE: state order reversal, warning comes after loggingstart+prepping @@ -213,6 +218,7 @@ void Migrator::handle_mds_failure_or_stop(int who) cache->try_subtree_merge(dir); export_state.erase(dir); // clean up dir->state_clear(CDir::STATE_EXPORTING); + dir->put(CDir::STATE_EXPORTING); break; case EXPORT_EXPORTING: @@ -220,6 +226,7 @@ void Migrator::handle_mds_failure_or_stop(int who) export_reverse(dir); export_state.erase(dir); // clean up dir->state_clear(CDir::STATE_EXPORTING); + dir->put(CDir::PIN_EXPORTING); break; case EXPORT_LOGGINGFINISH: @@ -508,6 +515,7 @@ void Migrator::export_dir(CDir *dir, int dest) export_peer[dir] = dest; dir->state_set(CDir::STATE_EXPORTING); + dir->get(CDir::PIN_EXPORTING); // send ExportDirDiscover (ask target) mds->send_message_mds(new MExportDirDiscover(dir), export_peer[dir], MDS_PORT_MIGRATOR); @@ -1157,6 +1165,7 @@ void Migrator::export_finish(CDir *dir) // remove from exporting list, clean up state dir->state_clear(CDir::STATE_EXPORTING); + dir->put(CDir::PIN_EXPORTING); export_state.erase(dir); export_peer.erase(dir); export_bounds.erase(dir); @@ -1722,9 +1731,6 @@ void Migrator::import_finish(CDir *dir, bool now) cache->show_subtrees(); audit(); - // re-eval scatterlock? - if (dir->inode->is_auth()) - mds->locker->scatter_eval(&dir->inode->dirlock); // is it empty? if (dir->get_size() == 0 && diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index ae35c48abc9db..53c9299032a20 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -1232,7 +1232,8 @@ void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) assert(diri->is_auth() && !diri->is_root()); // we were before, too. 
- diri->mark_dirty(dirpv); + diri->pop_and_dirty_projected_inode(); + //diri->mark_dirty(dirpv); dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << endl; } else { assert(!diri->is_auth() || diri->is_root() || @@ -1974,14 +1975,14 @@ class C_MDS_SlaveLinkPrep : public Context { Server *server; MDRequest *mdr; CInode *targeti; - version_t tpv; + utime_t old_ctime; bool inc; public: - C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t, version_t v, bool in) : - server(s), mdr(r), targeti(t), tpv(v), inc(in) { } + C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t, utime_t oct, bool in) : + server(s), mdr(r), targeti(t), old_ctime(oct), inc(in) { } void finish(int r) { assert(r == 0); - server->_logged_slave_link(mdr, targeti, tpv, inc); + server->_logged_slave_link(mdr, targeti, old_ctime, inc); } }; @@ -2012,13 +2013,6 @@ void Server::handle_slave_link_prep(MDRequest *mdr) } } - // journal it - ESlaveUpdate *le = new ESlaveUpdate("slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - - version_t tpv = targeti->pre_dirty(); - - // add to event - le->metablob.add_dir_context(targeti->get_parent_dir()); inode_t *pi = dn->inode->project_inode(); @@ -2031,78 +2025,117 @@ void Server::handle_slave_link_prep(MDRequest *mdr) inc = false; pi->nlink--; } + utime_t old_ctime = pi->ctime; pi->ctime = mdr->now; - pi->version = tpv; - le->metablob.add_primary_dentry(dn, true, targeti, pi); // update old primary + pi->version = targeti->pre_dirty(); + + dout(10) << " projected inode " << pi << " v " << pi->version << endl; - mds->mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, tpv, inc)); + // journal it + ESlaveUpdate *le = new ESlaveUpdate("slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); + le->metablob.add_dir_context(targeti->get_parent_dir()); + le->metablob.add_primary_dentry(dn, true, targeti, pi); // update old primary + 
mds->mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); } class C_MDS_SlaveLinkCommit : public Context { Server *server; MDRequest *mdr; CInode *targeti; - version_t tpv; + utime_t old_ctime; bool inc; public: - C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t, version_t v, bool in) : - server(s), mdr(r), targeti(t), tpv(v), inc(in) { } + C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t, utime_t oct, bool in) : + server(s), mdr(r), targeti(t), old_ctime(oct), inc(in) { } void finish(int r) { - server->_commit_slave_link(mdr, r, targeti, tpv, inc); + server->_commit_slave_link(mdr, r, targeti, old_ctime, inc); } }; -void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, version_t tpv, bool inc) +void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc) { dout(10) << "_logged_slave_link " << *mdr << " inc=" << inc << " " << *targeti << endl; + // update the target + targeti->pop_and_dirty_projected_inode(); + // ack MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK); mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); // set up commit waiter - mdr->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, tpv, inc); + mdr->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, inc); // done. 
delete mdr->slave_request; mdr->slave_request = 0; } -void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, version_t tpv, bool inc) + +class C_MDS_SlaveLinkRollback : public Context { + Server *server; + CInode *targeti; +public: + C_MDS_SlaveLinkRollback(Server *s, CInode *t) : + server(s), targeti(t) { } + void finish(int r) { + targeti->pop_and_dirty_projected_inode(); + } +}; + +void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, utime_t old_ctime, bool inc) { dout(10) << "_commit_slave_link " << *mdr << " r=" << r << " inc=" << inc << " " << *targeti << endl; - ESlaveUpdate *le; - if (r == 0) { - // commit. + // write a commit to the journal + ESlaveUpdate *le = new ESlaveUpdate("slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); + mds->mdlog->submit_entry(le); + } else { + // rollback: undo nlink change. - // update the target - if (inc) + // -- rollback in journal -- + ESlaveUpdate *le = new ESlaveUpdate("slave_link_rollback", + mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); + + inode_t *pi = targeti->project_inode(); + if (inc) + pi->nlink++; + else + pi->nlink--; + if (pi->ctime == mdr->now) + pi->ctime = old_ctime; + le->metablob.add_primary_dentry(targeti->parent, true, 0, pi); + mds->mdlog->submit_entry(le); + mds->mdlog->wait_for_sync(new C_MDS_SlaveLinkRollback(this, targeti)); + + // -- rollback in memory -- + // in inode. + if (inc) targeti->inode.nlink++; else targeti->inode.nlink--; - targeti->inode.ctime = mdr->now; - targeti->mark_dirty(tpv); - - // write a commit to the journal - le = new ESlaveUpdate("slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - } else { - // abort - le = new ESlaveUpdate("slave_link_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ABORT); - } + if (targeti->inode.ctime == mdr->now) + targeti->inode.ctime = old_ctime; - mds->mdlog->submit_entry(le); + // in any queued projected items. 
+ for (list<inode_t*>::iterator p = targeti->projected_inode.begin(); + p != targeti->projected_inode.end(); + ++p) + if (inc) + (*p)->nlink++; + else + (*p)->nlink--; + } } - void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) { dout(10) << "handle_slave_link_prep_ack " << *mdr @@ -2250,18 +2283,16 @@ class C_MDS_unlink_local_finish : public Context { MDRequest *mdr; CDentry *dn; CDentry *straydn; - version_t ipv; // referred inode version_t dnpv; // deleted dentry version_t dirpv; public: C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, - version_t v, version_t dirpv_) : + version_t dirpv_) : mds(m), mdr(r), dn(d), straydn(sd), - ipv(v), dnpv(d->get_projected_version()), dirpv(dirpv_) { } void finish(int r) { assert(r == 0); - mds->server->_unlink_local_finish(mdr, dn, straydn, ipv, dnpv, dirpv); + mds->server->_unlink_local_finish(mdr, dn, straydn, dnpv, dirpv); } }; @@ -2308,7 +2339,7 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) // finisher C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, - ipv, dirpv); + dirpv); journal_opens(); // journal pending opens, just in case @@ -2321,7 +2352,7 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) void Server::_unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, - version_t ipv, version_t dnpv, version_t dirpv) + version_t dnpv, version_t dirpv) { dout(10) << "_unlink_local_finish " << *dn << endl; @@ -2332,11 +2363,9 @@ void Server::_unlink_local_finish(MDRequest *mdr, // relink as stray? (i.e. was primary link?)
if (straydn) straydn->dir->link_inode(straydn, in); - // nlink-- - in->inode.ctime = mdr->now; - in->inode.nlink--; - in->mark_dirty(ipv); // dirty inode - dn->mark_dirty(dnpv); // dirty old dentry + // nlink--, dirty old dentry + in->pop_and_dirty_projected_inode(); + dn->mark_dirty(dnpv); // dir inode's mtime dirty_dn_diri(dn, dirpv, mdr->now); @@ -3185,7 +3214,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, le = new ESlaveUpdate("slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); } else { // abort - le = new ESlaveUpdate("slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ABORT); + le = new ESlaveUpdate("slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); } mds->mdlog->submit_entry(le); } diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h index ccfb6a16ddf27..6a60b4cc96ed5 100644 --- a/branches/sage/cephmds2/mds/Server.h +++ b/branches/sage/cephmds2/mds/Server.h @@ -128,8 +128,8 @@ public: version_t, version_t); void handle_slave_link_prep(MDRequest *mdr); - void _logged_slave_link(MDRequest *mdr, CInode *targeti, version_t tpv, bool inc); - void _commit_slave_link(MDRequest *mdr, int r, CInode *targeti, version_t tpv, bool inc); + void _logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc); + void _commit_slave_link(MDRequest *mdr, int r, CInode *targeti, utime_t old_ctime, bool inc); void handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); // unlink @@ -138,7 +138,7 @@ public: void _unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn); void _unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, - version_t, version_t, version_t); + version_t, version_t); void _unlink_remote(MDRequest *mdr, CDentry *dn); void _unlink_remote_finish(MDRequest *mdr, diff --git a/branches/sage/cephmds2/mds/events/ESlaveUpdate.h b/branches/sage/cephmds2/mds/events/ESlaveUpdate.h index 
f83dc87778a0f..6ac1ca26f4bc9 100644 --- a/branches/sage/cephmds2/mds/events/ESlaveUpdate.h +++ b/branches/sage/cephmds2/mds/events/ESlaveUpdate.h @@ -22,13 +22,14 @@ class ESlaveUpdate : public LogEvent { public: const static int OP_PREPARE = 1; const static int OP_COMMIT = 2; - const static int OP_ABORT = 3; + const static int OP_ROLLBACK = 3; string type; metareqid_t reqid; int master; int op; // prepare, commit, abort EMetaBlob metablob; + bufferlist rollback_data; // any special sauce needed for a correct rollback. (*) ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } ESlaveUpdate(const char *s, metareqid_t ri, int mastermds, int o) : @@ -53,6 +54,7 @@ public: ::_encode(master, bl); ::_encode(op, bl); metablob._encode(bl); + ::_encode(rollback_data, bl); } void decode_payload(bufferlist& bl, int& off) { ::_decode(type, bl, off); @@ -60,6 +62,7 @@ public: ::_decode(master, bl, off); ::_decode(op, bl, off); metablob._decode(bl, off); + ::_decode(rollback_data, bl, off); } bool has_expired(MDS *mds); diff --git a/branches/sage/cephmds2/mds/journal.cc b/branches/sage/cephmds2/mds/journal.cc index da57c55e471e2..af247042442cc 100644 --- a/branches/sage/cephmds2/mds/journal.cc +++ b/branches/sage/cephmds2/mds/journal.cc @@ -750,7 +750,7 @@ void ESlaveUpdate::replay(MDS *mds) } break; - case ESlaveUpdate::OP_ABORT: + case ESlaveUpdate::OP_ROLLBACK: if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master << ": discarding previously saved blob" << endl; diff --git a/branches/sage/cephmds2/mon/MDSMonitor.cc b/branches/sage/cephmds2/mon/MDSMonitor.cc index 8644f769eaaed..995b6053f1c60 100644 --- a/branches/sage/cephmds2/mon/MDSMonitor.cc +++ b/branches/sage/cephmds2/mon/MDSMonitor.cc @@ -284,6 +284,10 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) } } + // can't go from stopping -> active + if (state == MDSMap::STATE_ACTIVE && mdsmap.mds_state[from] == MDSMap::STATE_STOPPING) + 
state = MDSMap::STATE_STOPPING; // dummy + // if creating -> active, go to standby instead if (state == MDSMap::STATE_ACTIVE && mdsmap.is_creating(from)) { mdsmap.mds_created.insert(from); -- 2.39.5