]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: rollback fix. avoid scatterlock change during rejoin.
authorSage Weil <sage@newdream.net>
Mon, 2 Jun 2008 17:32:53 +0000 (10:32 -0700)
committerSage Weil <sage@newdream.net>
Mon, 2 Jun 2008 17:32:53 +0000 (10:32 -0700)
src/TODO
src/mds/Locker.cc
src/mds/Locker.h
src/mds/Server.cc

index 182849161d2414c26d107804de2930d840793d00..3c5c5f3d5a6671af56ae501fb6dce2622cd4938e 100644 (file)
--- a/src/TODO
+++ b/src/TODO
@@ -64,14 +64,15 @@ mon
 - some sort of tester for PaxosService...
 - osdmon needs to lower-bound old osdmap versions it keeps around?
 
-mds nested
-- fix rejoin vs updated dirfrag nested/dirlocks
-
 mds mustfix
+- fix rejoin vs updated dirfrag nested/dirlocks
+  - not inode_full, now.. send the fnode.  but for auth dirfrags, _to_ inode auth... which is sorta outside the subtree..?
 
+/- locks (esp scatter) vs rejoin, scatter_writebehind
 - make sure locker avoids frozen inodes
 - make sure predirty_nested stops if it can't wrlock versionlock (acquire_locks normally hides that detail for us)
 - make sure stray inode is always opened on startup
+- make sure inode cache expire for frozen inode behaves
 
 - look at the client_map session opening code.. versus rollback (of import, or slave request)
 
index 44da27011312d0f20c180973ab0cf0aac9e2114e..53152da7e9840eb84d9bc12f75b24412a512f365 100644 (file)
@@ -86,6 +86,13 @@ void Locker::dispatch(Message *m)
 }
 
 
+/*
+ * locks vs rejoin
+ *
+ * 
+ *
+ */
+
 void Locker::send_lock_message(SimpleLock *lock, int msg)
 {
   for (map<int,int>::iterator it = lock->get_parent()->replicas_begin(); 
@@ -1214,6 +1221,24 @@ void Locker::revoke_client_leases(SimpleLock *lock)
 
 // nested ---------------------------------------------------------------
 
+
+/*
+ * NOTE: we _have_ to delay the scatter if we are called during a
+ * rejoin, because we can't twiddle locks between when the
+ * rejoin_(weak|strong) is received and when we send the rejoin_ack.
+ * normally, this isn't a problem: a recover mds doesn't twiddle locks
+ * (no requests), and a survivor acks immediately.  _except_ that
+ * during rejoin_(weak|strong) processing, we may complete a lock
+ * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
+ * scatterlock state in that case or the lock states will get out of
+ * sync between the auth and replica.
+ *
+ * the simple solution is to never do the scatter here.  instead, put
+ * the scatterlock on a list if it isn't already wrlockable.  this is
+ * probably the best plan anyway, since we avoid too many
+ * scatters/locks under normal usage.
+ *
+ */
 void Locker::predirty_nested(Mutation *mut, EMetaBlob *blob,
                             CInode *in, CDir *parent,
                             int flags, int linkunlink)
@@ -1320,7 +1345,7 @@ void Locker::predirty_nested(Mutation *mut, EMetaBlob *blob,
 
     bool stop = false;
     if (mut->wrlocks.count(&pin->dirlock) == 0 &&
-       !scatter_wrlock_try(&pin->dirlock, mut)) {
+       !scatter_wrlock_try(&pin->dirlock, mut, false)) {  // ** do not initiate.. see above comment **
       dout(10) << "predirty_nested can't wrlock " << pin->dirlock << " on " << *pin << dendl;
       stop = true;
     }
@@ -1955,7 +1980,7 @@ void Locker::scatter_rdlock_finish(ScatterLock *lock, Mutation *mut)
 }
 
 
-bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut)
+bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut, bool initiate)
 {
   dout(7) << "scatter_wrlock_try  on " << *lock
          << " on " << *lock->get_parent() << dendl;  
@@ -1981,7 +2006,7 @@ bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut)
   }
 
   // initiate scatter or lock?
-  if (lock->is_stable()) {
+  if (initiate && lock->is_stable()) {
     if (lock->get_parent()->is_auth()) {
       if (want_scatter) 
        scatter_scatter(lock);
@@ -2005,7 +2030,7 @@ bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mut)
   dout(7) << "scatter_wrlock_start  on " << *lock
          << " on " << *lock->get_parent() << dendl;  
   
-  if (scatter_wrlock_try(lock, mut))
+  if (scatter_wrlock_try(lock, mut, true)) // initiate
     return true;
 
   // wait for write.
index 8690c6a6aaeeea89172c9bb50924a2b9d6f100f8..a0c76b53c971b12da531452809466c31b24bc4dc 100644 (file)
@@ -145,7 +145,7 @@ protected:
   void scatter_tempsync(ScatterLock *lock);
   bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mut);
   void scatter_rdlock_finish(ScatterLock *lock, Mutation *mut);
-  bool scatter_wrlock_try(ScatterLock *lock, Mutation *mut);
+  bool scatter_wrlock_try(ScatterLock *lock, Mutation *mut, bool initiate);
   bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mut);
   void scatter_wrlock_finish(ScatterLock *lock, Mutation *mut);
   bool scatter_xlock_start(ScatterLock *lock, MDRequest *mut);
index bdbe43a1dd6cd773fd5821afbc5c04f860f3c308..c79e6ebc573a599575f58571a4edb93be45be596 100644 (file)
@@ -2502,7 +2502,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
   ::decode(rollback, p);
 
   dout(10) << "do_link_rollback on " << rollback.reqid 
-          << (rollback.was_inc ? "inc":"dec") 
+          << (rollback.was_inc ? " inc":" dec") 
           << " ino " << rollback.ino
           << dendl;
 
@@ -3011,6 +3011,15 @@ public:
 
 /** handle_client_rename
  *
+ * rename master is the destdn auth.  this is because cached inodes
+ * must remain connected.  thus, any replica of srci, must also
+ * replicate destdn, and possibly straydn, so that srci (and
+ * destdn->inode) remain connected during the rename.
+ *
+ * to do this, we freeze srci, then master (destdn auth) verifies that
+ * all other nodes have also replciated destdn and straydn.  note that
+ * destdn replicas need not also replicate srci.  this only works when 
+ * destdn is master.
  */
 void Server::handle_client_rename(MDRequest *mdr)
 {
@@ -3775,6 +3784,43 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
     dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
   }
 
+  // encode everything we'd need to roll this back... basically, just the original state.
+  rename_rollback rollback;
+  
+  rollback.reqid = mdr->reqid;
+  
+  rollback.orig_src.dirfrag = srcdn->dir->dirfrag();
+  rollback.orig_src.dirfrag_old_mtime = srcdn->dir->get_projected_fnode()->fragstat.mtime;
+  rollback.orig_src.dirfrag_old_rctime = srcdn->dir->get_projected_fnode()->fragstat.rctime;
+  rollback.orig_src.dname = srcdn->name;
+  if (srcdn->is_primary())
+    rollback.orig_src.ino = srcdn->inode->ino();
+  else {
+    assert(srcdn->is_remote());
+    rollback.orig_src.remote_ino = srcdn->get_remote_ino();
+    rollback.orig_src.remote_ino = srcdn->get_remote_d_type();
+  }
+  
+  rollback.orig_dest.dirfrag = destdn->dir->dirfrag();
+  rollback.orig_dest.dirfrag_old_mtime = destdn->dir->get_projected_fnode()->fragstat.mtime;
+  rollback.orig_dest.dirfrag_old_rctime = destdn->dir->get_projected_fnode()->fragstat.rctime;
+  rollback.orig_dest.dname = destdn->name;
+  if (destdn->is_primary())
+    rollback.orig_dest.ino = destdn->inode->ino();
+  else if (destdn->is_remote()) {
+    rollback.orig_dest.remote_ino = destdn->get_remote_ino();
+    rollback.orig_dest.remote_ino = destdn->get_remote_d_type();
+  }
+  
+  if (straydn) {
+    rollback.stray.dirfrag = straydn->dir->dirfrag();
+    rollback.stray.dirfrag_old_mtime = straydn->dir->get_projected_fnode()->fragstat.mtime;
+    rollback.stray.dirfrag_old_rctime = straydn->dir->get_projected_fnode()->fragstat.rctime;
+    rollback.stray.dname = straydn->name;
+  }
+  ::encode(rollback, mdr->more()->rollback_bl);
+  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+
   // journal it?
   if (srcdn->is_auth() ||
       (destdn->inode && destdn->inode->is_auth()) ||
@@ -3783,45 +3829,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
     mdr->ls = mdlog->get_current_segment();
     ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
                                        ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
-
-    // encode everything we'd need to roll this back... basically, just the original state.
-    rename_rollback rollback;
-    
-    rollback.reqid = mdr->reqid;
-
-    rollback.orig_src.dirfrag = srcdn->dir->dirfrag();
-    rollback.orig_src.dirfrag_old_mtime = srcdn->dir->get_projected_fnode()->fragstat.mtime;
-    rollback.orig_src.dirfrag_old_rctime = srcdn->dir->get_projected_fnode()->fragstat.rctime;
-    rollback.orig_src.dname = srcdn->name;
-    if (srcdn->is_primary())
-      rollback.orig_src.ino = srcdn->inode->ino();
-    else {
-      assert(srcdn->is_remote());
-      rollback.orig_src.remote_ino = srcdn->get_remote_ino();
-      rollback.orig_src.remote_ino = srcdn->get_remote_d_type();
-    }
-    
-    rollback.orig_dest.dirfrag = destdn->dir->dirfrag();
-    rollback.orig_dest.dirfrag_old_mtime = destdn->dir->get_projected_fnode()->fragstat.mtime;
-    rollback.orig_dest.dirfrag_old_rctime = destdn->dir->get_projected_fnode()->fragstat.rctime;
-    rollback.orig_dest.dname = destdn->name;
-    if (destdn->is_primary())
-      rollback.orig_dest.ino = destdn->inode->ino();
-    else if (destdn->is_remote()) {
-      rollback.orig_dest.remote_ino = destdn->get_remote_ino();
-      rollback.orig_dest.remote_ino = destdn->get_remote_d_type();
-    }
-
-    if (straydn) {
-      rollback.stray.dirfrag = straydn->dir->dirfrag();
-      rollback.stray.dirfrag_old_mtime = straydn->dir->get_projected_fnode()->fragstat.mtime;
-      rollback.stray.dirfrag_old_rctime = straydn->dir->get_projected_fnode()->fragstat.rctime;
-      rollback.stray.dname = straydn->name;
-    }
-    ::encode(rollback, le->rollback);
-    dout(10) << " rollback is " << le->rollback.length() << " bytes" << dendl;
-    mdr->more()->rollback_bl = le->rollback;
-    dout(10) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+    le->rollback = mdr->more()->rollback_bl;
 
     bufferlist blah;  // inode import data... obviously not used if we're the slave
     _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);