From: Sage Weil Date: Mon, 2 Jun 2008 17:32:53 +0000 (-0700) Subject: mds: rollback fix. avoid scatterlock change during rejoin. X-Git-Tag: v0.3~170^2~19 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=db02ed71b9adc373d859cae63ba0b00de69b91fb;p=ceph.git mds: rollback fix. avoid scatterlock change during rejoin. --- diff --git a/src/TODO b/src/TODO index 182849161d24..3c5c5f3d5a66 100644 --- a/src/TODO +++ b/src/TODO @@ -64,14 +64,15 @@ mon - some sort of tester for PaxosService... - osdmon needs to lower-bound old osdmap versions it keeps around? -mds nested -- fix rejoin vs updated dirfrag nested/dirlocks - mds mustfix +- fix rejoin vs updated dirfrag nested/dirlocks + - not inode_full, now.. send the fnode. but for auth dirfrags, _to_ inode auth... which is sorta outside the subtree..? +/- locks (esp scatter) vs rejoin, scatter_writebehind - make sure locker avoids frozen inodes - make sure predirty_nested stops if it can't wrlock versionlock (acquire_locks normally hides that detail for us) - make sure stray inode is always opened on startup +- make sure inode cache expire for frozen inode behaves - look at the client_map session opening code.. versus rollback (of import, or slave request) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 44da27011312..53152da7e984 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -86,6 +86,13 @@ void Locker::dispatch(Message *m) } +/* + * locks vs rejoin + * + * + * + */ + void Locker::send_lock_message(SimpleLock *lock, int msg) { for (map::iterator it = lock->get_parent()->replicas_begin(); @@ -1214,6 +1221,24 @@ void Locker::revoke_client_leases(SimpleLock *lock) // nested --------------------------------------------------------------- + +/* + * NOTE: we _have_ to delay the scatter if we are called during a + * rejoin, because we can't twiddle locks between when the + * rejoin_(weak|strong) is received and when we send the rejoin_ack. + * normally, this isn't a problem: a recover mds doesn't twiddle locks + * (no requests), and a survivor acks immediately. _except_ that + * during rejoin_(weak|strong) processing, we may complete a lock + * gather, and do a scatter_writebehind.. and we _can't_ twiddle the + * scatterlock state in that case or the lock states will get out of + * sync between the auth and replica. + * + * the simple solution is to never do the scatter here. instead, put + * the scatterlock on a list if it isn't already wrlockable. this is + * probably the best plan anyway, since we avoid too many + * scatters/locks under normal usage. + * + */ void Locker::predirty_nested(Mutation *mut, EMetaBlob *blob, CInode *in, CDir *parent, int flags, int linkunlink) @@ -1320,7 +1345,7 @@ void Locker::predirty_nested(Mutation *mut, EMetaBlob *blob, bool stop = false; if (mut->wrlocks.count(&pin->dirlock) == 0 && - !scatter_wrlock_try(&pin->dirlock, mut)) { + !scatter_wrlock_try(&pin->dirlock, mut, false)) { // ** do not initiate.. see above comment ** dout(10) << "predirty_nested can't wrlock " << pin->dirlock << " on " << *pin << dendl; stop = true; } @@ -1955,7 +1980,7 @@ void Locker::scatter_rdlock_finish(ScatterLock *lock, Mutation *mut) } -bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut) +bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut, bool initiate) { dout(7) << "scatter_wrlock_try on " << *lock << " on " << *lock->get_parent() << dendl; @@ -1981,7 +2006,7 @@ bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut) } // initiate scatter or lock? - if (lock->is_stable()) { + if (initiate && lock->is_stable()) { if (lock->get_parent()->is_auth()) { if (want_scatter) scatter_scatter(lock); @@ -2005,7 +2030,7 @@ bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mut) dout(7) << "scatter_wrlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - if (scatter_wrlock_try(lock, mut)) + if (scatter_wrlock_try(lock, mut, true)) // initiate return true; // wait for write. diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 8690c6a6aaee..a0c76b53c971 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -145,7 +145,7 @@ protected: void scatter_tempsync(ScatterLock *lock); bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mut); void scatter_rdlock_finish(ScatterLock *lock, Mutation *mut); - bool scatter_wrlock_try(ScatterLock *lock, Mutation *mut); + bool scatter_wrlock_try(ScatterLock *lock, Mutation *mut, bool initiate); bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mut); void scatter_wrlock_finish(ScatterLock *lock, Mutation *mut); bool scatter_xlock_start(ScatterLock *lock, MDRequest *mut); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index bdbe43a1dd6c..c79e6ebc573a 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2502,7 +2502,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr) ::decode(rollback, p); dout(10) << "do_link_rollback on " << rollback.reqid - << (rollback.was_inc ? "inc":"dec") + << (rollback.was_inc ? " inc":" dec") << " ino " << rollback.ino << dendl; @@ -3011,6 +3011,15 @@ public: /** handle_client_rename * + * rename master is the destdn auth. this is because cached inodes + * must remain connected. thus, any replica of srci, must also + * replicate destdn, and possibly straydn, so that srci (and + * destdn->inode) remain connected during the rename. + * + * to do this, we freeze srci, then master (destdn auth) verifies that + * all other nodes have also replciated destdn and straydn. note that + * destdn replicas need not also replicate srci. this only works when + * destdn is master. */ void Server::handle_client_rename(MDRequest *mdr) { @@ -3775,6 +3784,43 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl; } + // encode everything we'd need to roll this back... basically, just the original state. + rename_rollback rollback; + + rollback.reqid = mdr->reqid; + + rollback.orig_src.dirfrag = srcdn->dir->dirfrag(); + rollback.orig_src.dirfrag_old_mtime = srcdn->dir->get_projected_fnode()->fragstat.mtime; + rollback.orig_src.dirfrag_old_rctime = srcdn->dir->get_projected_fnode()->fragstat.rctime; + rollback.orig_src.dname = srcdn->name; + if (srcdn->is_primary()) + rollback.orig_src.ino = srcdn->inode->ino(); + else { + assert(srcdn->is_remote()); + rollback.orig_src.remote_ino = srcdn->get_remote_ino(); + rollback.orig_src.remote_ino = srcdn->get_remote_d_type(); + } + + rollback.orig_dest.dirfrag = destdn->dir->dirfrag(); + rollback.orig_dest.dirfrag_old_mtime = destdn->dir->get_projected_fnode()->fragstat.mtime; + rollback.orig_dest.dirfrag_old_rctime = destdn->dir->get_projected_fnode()->fragstat.rctime; + rollback.orig_dest.dname = destdn->name; + if (destdn->is_primary()) + rollback.orig_dest.ino = destdn->inode->ino(); + else if (destdn->is_remote()) { + rollback.orig_dest.remote_ino = destdn->get_remote_ino(); + rollback.orig_dest.remote_ino = destdn->get_remote_d_type(); + } + + if (straydn) { + rollback.stray.dirfrag = straydn->dir->dirfrag(); + rollback.stray.dirfrag_old_mtime = straydn->dir->get_projected_fnode()->fragstat.mtime; + rollback.stray.dirfrag_old_rctime = straydn->dir->get_projected_fnode()->fragstat.rctime; + rollback.stray.dname = straydn->name; + } + ::encode(rollback, mdr->more()->rollback_bl); + dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl; + // journal it? if (srcdn->is_auth() || (destdn->inode && destdn->inode->is_auth()) || @@ -3783,45 +3829,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) mdr->ls = mdlog->get_current_segment(); ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME); - - // encode everything we'd need to roll this back... basically, just the original state. - rename_rollback rollback; - - rollback.reqid = mdr->reqid; - - rollback.orig_src.dirfrag = srcdn->dir->dirfrag(); - rollback.orig_src.dirfrag_old_mtime = srcdn->dir->get_projected_fnode()->fragstat.mtime; - rollback.orig_src.dirfrag_old_rctime = srcdn->dir->get_projected_fnode()->fragstat.rctime; - rollback.orig_src.dname = srcdn->name; - if (srcdn->is_primary()) - rollback.orig_src.ino = srcdn->inode->ino(); - else { - assert(srcdn->is_remote()); - rollback.orig_src.remote_ino = srcdn->get_remote_ino(); - rollback.orig_src.remote_ino = srcdn->get_remote_d_type(); - } - - rollback.orig_dest.dirfrag = destdn->dir->dirfrag(); - rollback.orig_dest.dirfrag_old_mtime = destdn->dir->get_projected_fnode()->fragstat.mtime; - rollback.orig_dest.dirfrag_old_rctime = destdn->dir->get_projected_fnode()->fragstat.rctime; - rollback.orig_dest.dname = destdn->name; - if (destdn->is_primary()) - rollback.orig_dest.ino = destdn->inode->ino(); - else if (destdn->is_remote()) { - rollback.orig_dest.remote_ino = destdn->get_remote_ino(); - rollback.orig_dest.remote_ino = destdn->get_remote_d_type(); - } - - if (straydn) { - rollback.stray.dirfrag = straydn->dir->dirfrag(); - rollback.stray.dirfrag_old_mtime = straydn->dir->get_projected_fnode()->fragstat.mtime; - rollback.stray.dirfrag_old_rctime = straydn->dir->get_projected_fnode()->fragstat.rctime; - rollback.stray.dname = straydn->name; - } - ::encode(rollback, le->rollback); - dout(10) << " rollback is " << le->rollback.length() << " bytes" << dendl; - mdr->more()->rollback_bl = le->rollback; - dout(10) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl; + le->rollback = mdr->more()->rollback_bl; bufferlist blah; // inode import data... obviously not used if we're the slave _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);