}
+/*
+ * locks vs rejoin
+ *
+ *
+ *
+ */
+
void Locker::send_lock_message(SimpleLock *lock, int msg)
{
for (map<int,int>::iterator it = lock->get_parent()->replicas_begin();
// nested ---------------------------------------------------------------
+
+/*
+ * NOTE: we _have_ to delay the scatter if we are called during a
+ * rejoin, because we can't twiddle locks between when the
+ * rejoin_(weak|strong) is received and when we send the rejoin_ack.
+ * normally, this isn't a problem: a recovering mds doesn't twiddle locks
+ * (no requests), and a survivor acks immediately. _except_ that
+ * during rejoin_(weak|strong) processing, we may complete a lock
+ * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
+ * scatterlock state in that case or the lock states will get out of
+ * sync between the auth and replica.
+ *
+ * the simple solution is to never do the scatter here. instead, put
+ * the scatterlock on a list if it isn't already wrlockable. this is
+ * probably the best plan anyway, since we avoid too many
+ * scatters/locks under normal usage.
+ *
+ */
void Locker::predirty_nested(Mutation *mut, EMetaBlob *blob,
CInode *in, CDir *parent,
int flags, int linkunlink)
bool stop = false;
if (mut->wrlocks.count(&pin->dirlock) == 0 &&
- !scatter_wrlock_try(&pin->dirlock, mut)) {
+ !scatter_wrlock_try(&pin->dirlock, mut, false)) { // ** do not initiate.. see above comment **
dout(10) << "predirty_nested can't wrlock " << pin->dirlock << " on " << *pin << dendl;
stop = true;
}
}
-bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut)
+bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut, bool initiate)
{
dout(7) << "scatter_wrlock_try on " << *lock
<< " on " << *lock->get_parent() << dendl;
}
// initiate scatter or lock?
- if (lock->is_stable()) {
+ if (initiate && lock->is_stable()) {
if (lock->get_parent()->is_auth()) {
if (want_scatter)
scatter_scatter(lock);
dout(7) << "scatter_wrlock_start on " << *lock
<< " on " << *lock->get_parent() << dendl;
- if (scatter_wrlock_try(lock, mut))
+ if (scatter_wrlock_try(lock, mut, true)) // initiate
return true;
// wait for write.
::decode(rollback, p);
dout(10) << "do_link_rollback on " << rollback.reqid
- << (rollback.was_inc ? "inc":"dec")
+ << (rollback.was_inc ? " inc":" dec")
<< " ino " << rollback.ino
<< dendl;
/** handle_client_rename
*
+ * rename master is the destdn auth. this is because cached inodes
+ * must remain connected. thus, any replica of srci must also
+ * replicate destdn, and possibly straydn, so that srci (and
+ * destdn->inode) remain connected during the rename.
+ *
+ * to do this, we freeze srci, then master (destdn auth) verifies that
+ * all other nodes have also replicated destdn and straydn. note that
+ * destdn replicas need not also replicate srci. this only works when
+ * destdn is master.
*/
void Server::handle_client_rename(MDRequest *mdr)
{
dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
}
+ // encode everything we'd need to roll this back: the pre-rename state of srcdn, destdn and (if any) straydn.
+ rename_rollback rollback;
+
+ rollback.reqid = mdr->reqid;
+
+ rollback.orig_src.dirfrag = srcdn->dir->dirfrag();
+ rollback.orig_src.dirfrag_old_mtime = srcdn->dir->get_projected_fnode()->fragstat.mtime;
+ rollback.orig_src.dirfrag_old_rctime = srcdn->dir->get_projected_fnode()->fragstat.rctime;
+ rollback.orig_src.dname = srcdn->name;
+ if (srcdn->is_primary())
+ rollback.orig_src.ino = srcdn->inode->ino();
+ else {
+ assert(srcdn->is_remote());
+ rollback.orig_src.remote_ino = srcdn->get_remote_ino();
+ rollback.orig_src.remote_d_type = srcdn->get_remote_d_type();  // keep d_type in its own field; do not clobber remote_ino
+ }
+
+ rollback.orig_dest.dirfrag = destdn->dir->dirfrag();
+ rollback.orig_dest.dirfrag_old_mtime = destdn->dir->get_projected_fnode()->fragstat.mtime;
+ rollback.orig_dest.dirfrag_old_rctime = destdn->dir->get_projected_fnode()->fragstat.rctime;
+ rollback.orig_dest.dname = destdn->name;
+ if (destdn->is_primary())
+ rollback.orig_dest.ino = destdn->inode->ino();
+ else if (destdn->is_remote()) {  // neither primary nor remote: leave ino/remote fields unset
+ rollback.orig_dest.remote_ino = destdn->get_remote_ino();
+ rollback.orig_dest.remote_d_type = destdn->get_remote_d_type();  // keep d_type in its own field; do not clobber remote_ino
+ }
+
+ if (straydn) {  // stray record is only filled in when a stray dentry is involved
+ rollback.stray.dirfrag = straydn->dir->dirfrag();
+ rollback.stray.dirfrag_old_mtime = straydn->dir->get_projected_fnode()->fragstat.mtime;
+ rollback.stray.dirfrag_old_rctime = straydn->dir->get_projected_fnode()->fragstat.rctime;
+ rollback.stray.dname = straydn->name;
+ }
+ ::encode(rollback, mdr->more()->rollback_bl);  // serialized form is stashed on the request
+ dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+
// journal it?
if (srcdn->is_auth() ||
(destdn->inode && destdn->inode->is_auth()) ||
mdr->ls = mdlog->get_current_segment();
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
-
- // encode everything we'd need to roll this back... basically, just the original state.
- rename_rollback rollback;
-
- rollback.reqid = mdr->reqid;
-
- rollback.orig_src.dirfrag = srcdn->dir->dirfrag();
- rollback.orig_src.dirfrag_old_mtime = srcdn->dir->get_projected_fnode()->fragstat.mtime;
- rollback.orig_src.dirfrag_old_rctime = srcdn->dir->get_projected_fnode()->fragstat.rctime;
- rollback.orig_src.dname = srcdn->name;
- if (srcdn->is_primary())
- rollback.orig_src.ino = srcdn->inode->ino();
- else {
- assert(srcdn->is_remote());
- rollback.orig_src.remote_ino = srcdn->get_remote_ino();
- rollback.orig_src.remote_ino = srcdn->get_remote_d_type();
- }
-
- rollback.orig_dest.dirfrag = destdn->dir->dirfrag();
- rollback.orig_dest.dirfrag_old_mtime = destdn->dir->get_projected_fnode()->fragstat.mtime;
- rollback.orig_dest.dirfrag_old_rctime = destdn->dir->get_projected_fnode()->fragstat.rctime;
- rollback.orig_dest.dname = destdn->name;
- if (destdn->is_primary())
- rollback.orig_dest.ino = destdn->inode->ino();
- else if (destdn->is_remote()) {
- rollback.orig_dest.remote_ino = destdn->get_remote_ino();
- rollback.orig_dest.remote_ino = destdn->get_remote_d_type();
- }
-
- if (straydn) {
- rollback.stray.dirfrag = straydn->dir->dirfrag();
- rollback.stray.dirfrag_old_mtime = straydn->dir->get_projected_fnode()->fragstat.mtime;
- rollback.stray.dirfrag_old_rctime = straydn->dir->get_projected_fnode()->fragstat.rctime;
- rollback.stray.dname = straydn->name;
- }
- ::encode(rollback, le->rollback);
- dout(10) << " rollback is " << le->rollback.length() << " bytes" << dendl;
- mdr->more()->rollback_bl = le->rollback;
- dout(10) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+ le->rollback = mdr->more()->rollback_bl;
bufferlist blah; // inode import data... obviously not used if we're the slave
_rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);