From a25374e40ab585bef7314bb258f0c831f881bf96 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 15 Jul 2011 16:36:07 -0700 Subject: [PATCH] mds: witness rmdir when subtrees are on other hosts If there is an rmdir with an empty subtree on another mds, we need to witness/journal that on the dirfrag's auth mds so that replay correctly updates the subtree map. This is simpler than the rename witnesses (and the link/unlink ones) because we aren't actually journaling a modification to any actual metadata; it's just the subtree map that is changing. The projection of the subtree map update needs work still, but that is also the case for renames. Signed-off-by: Sage Weil --- src/mds/MDCache.cc | 21 ++- src/mds/MDCache.h | 2 +- src/mds/Server.cc | 251 +++++++++++++++++++++++++++++++- src/mds/Server.h | 9 +- src/mds/events/ESlaveUpdate.h | 38 ++++- src/messages/MMDSSlaveRequest.h | 6 + 6 files changed, 312 insertions(+), 15 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index f2772dda74f60..577117bc99bb4 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2740,12 +2740,19 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) // perform rollback (and journal a rollback entry) // note: this will hold up the resolve a bit, until the rollback entries journal. 
- if (uncommitted_slave_updates[from][*p]->origop == ESlaveUpdate::LINK) + switch (uncommitted_slave_updates[from][*p]->origop) { + case ESlaveUpdate::LINK: mds->server->do_link_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); - else if (uncommitted_slave_updates[from][*p]->origop == ESlaveUpdate::RENAME) - mds->server->do_rename_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); - else + break; + case ESlaveUpdate::RENAME: + mds->server->do_rename_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); + break; + case ESlaveUpdate::RMDIR: + mds->server->do_rmdir_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); + break; + default: assert(0); + } delete uncommitted_slave_updates[from][*p]; uncommitted_slave_updates[from].erase(*p); @@ -9096,13 +9103,17 @@ void MDCache::handle_dentry_link(MDentryLink *m) // UNLINK -void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn) +void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequest *mdr) { dout(10) << "send_dentry_unlink " << *dn << dendl; // share unlink news with replicas for (map<int,int>::iterator it = dn->replicas_begin(); it != dn->replicas_end(); it++) { + // don't tell (rmdir) witnesses; they already know + if (mdr && mdr->more()->witnessed.count(it->first)) + continue; + MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name); if (straydn) replicate_stray(straydn, it->first, unlink->straybl); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 0e9f023c333e1..fcbc17759d4e2 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -1142,7 +1142,7 @@ public: // -- namespace -- public: void send_dentry_link(CDentry *dn); - void send_dentry_unlink(CDentry *dn, CDentry *straydn); + void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequest *mdr); protected: void handle_dentry_link(MDentryLink *m); void handle_dentry_unlink(MDentryUnlink *m); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 
a30ec88dd5f43..c0e7d13b431d8 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1281,6 +1281,13 @@ void Server::handle_slave_request(MMDSSlaveRequest *m) } break; + case MMDSSlaveRequest::OP_RMDIRPREPACK: + { + MDRequest *mdr = mdcache->request_get(m->get_reqid()); + handle_slave_rmdir_prep_ack(mdr, m); + } + break; + case MMDSSlaveRequest::OP_RENAMEPREPACK: { MDRequest *mdr = mdcache->request_get(m->get_reqid()); @@ -1430,6 +1437,10 @@ void Server::dispatch_slave_request(MDRequest *mdr) handle_slave_link_prep(mdr); break; + case MMDSSlaveRequest::OP_RMDIRPREP: + handle_slave_rmdir_prep(mdr); + break; + case MMDSSlaveRequest::OP_RENAMEPREP: handle_slave_rename_prep(mdr); break; @@ -3955,7 +3966,7 @@ void Server::_link_remote_finish(MDRequest *mdr, bool inc, if (inc) mds->mdcache->send_dentry_link(dn); else - mds->mdcache->send_dentry_unlink(dn, NULL); + mds->mdcache->send_dentry_unlink(dn, NULL, NULL); // commit anchor update? if (mdr->more()->dst_reanchor_atid) @@ -4395,6 +4406,34 @@ void Server::handle_client_unlink(MDRequest *mdr) } } + if (in->is_dir() && in->has_subtree_root_dirfrag()) { + // subtree root auths need to be witnesses + set<int> witnesses; + list<CDir*> ls; + in->get_subtree_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + int auth = dir->authority().first; + witnesses.insert(auth); + dout(10) << " need mds" << auth << " to witness for dirfrag " << *dir << dendl; + } + dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl; + + for (set<int>::iterator p = witnesses.begin(); + p != witnesses.end(); + ++p) { + if (mdr->more()->witnessed.count(*p)) { + dout(10) << " already witnessed by mds" << *p << dendl; + } else if (mdr->more()->waiting_on_slave.count(*p)) { + dout(10) << " already waiting on witness mds" << *p << dendl; + } else { + _rmdir_prepare_witness(mdr, *p, dn, straydn); + } + } + if (!mdr->more()->waiting_on_slave.empty()) + return; // we're waiting for a witness. 
+ } + // ok! if (dnl->is_remote() && !dnl->get_inode()->is_auth()) _link_remote(mdr, false, dn, dnl->get_inode()); @@ -4402,8 +4441,6 @@ void Server::handle_client_unlink(MDRequest *mdr) _unlink_local(mdr, dn, straydn); } - - class C_MDS_unlink_local_finish : public Context { MDS *mds; MDRequest *mdr; @@ -4420,7 +4457,6 @@ public: } }; - void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) { dout(10) << "_unlink_local " << *dn << dendl; @@ -4516,7 +4552,7 @@ void Server::_unlink_local_finish(MDRequest *mdr, if (snap_is_new) //only new if straydnl exists mdcache->do_realm_invalidate_and_update_notify(straydnl->get_inode(), CEPH_SNAP_OP_SPLIT, true); - mds->mdcache->send_dentry_unlink(dn, straydn); + mds->mdcache->send_dentry_unlink(dn, straydn, mdr); // update subtree map? if (straydn && straydnl->get_inode()->is_dir()) @@ -4541,6 +4577,209 @@ void Server::_unlink_local_finish(MDRequest *mdr, dn->get_dir()->try_remove_unlinked_dn(dn); } +void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn) +{ + dout(10) << "_rmdir_prepare_witness mds" << who << " for " << *mdr << dendl; + + MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RMDIRPREP); + dn->make_path(req->srcdnpath); + straydn->make_path(req->destdnpath); + req->now = mdr->now; + + mdcache->replicate_stray(straydn, who, req->stray); + + mds->send_message_mds(req, who); + + assert(mdr->more()->waiting_on_slave.count(who) == 0); + mdr->more()->waiting_on_slave.insert(who); +} + +struct C_MDS_SlaveRmdirPrep : public Context { + Server *server; + MDRequest *mdr; + CDentry *dn, *straydn; + C_MDS_SlaveRmdirPrep(Server *s, MDRequest *r, CDentry *d, CDentry *st) + : server(s), mdr(r), dn(d), straydn(st) {} + void finish(int r) { + server->_rmdir_logged_witness(mdr, dn, straydn); + } +}; + +void Server::handle_slave_rmdir_prep(MDRequest *mdr) +{ + dout(10) << "handle_slave_rmdir_prep " << *mdr + << " " << mdr->slave_request->srcdnpath + 
<< " to " << mdr->slave_request->destdnpath + << dendl; + + vector<CDentry*> trace; + filepath srcpath(mdr->slave_request->srcdnpath); + dout(10) << " src " << srcpath << dendl; + CInode *in; + int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK); + assert(r == 0); + CDentry *dn = trace[trace.size()-1]; + dout(10) << " dn " << *dn << dendl; + mdr->pin(dn); + + assert(mdr->slave_request->stray.length() > 0); + CDentry *straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds); + assert(straydn); + mdr->pin(straydn); + dout(10) << " straydn " << *straydn << dendl; + + mdr->now = mdr->slave_request->now; + + rmdir_rollback rollback; + rollback.reqid = mdr->reqid; + rollback.src_dir = dn->get_dir()->dirfrag(); + rollback.src_dname = dn->name; + rollback.dest_dir = straydn->get_dir()->dirfrag(); + rollback.dest_dname = straydn->name; + ::encode(rollback, mdr->more()->rollback_bl); + dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl; + + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds, + ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR); + mdlog->start_entry(le); + le->rollback = mdr->more()->rollback_bl; + + le->commit.add_dir_context(dn->get_dir()); + le->commit.add_dir_context(straydn->get_dir()); + le->commit.add_primary_dentry(straydn, true, in); + le->commit.add_null_dentry(dn, true); + + dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; + le->commit.renamed_dirino = in->ino(); + + mdlog->submit_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn)); + mdlog->flush(); +} + +void Server::_rmdir_logged_witness(MDRequest *mdr, CDentry *dn, CDentry *straydn) +{ + dout(10) << "_rmdir_logged_witness " << *mdr << " on " << *dn << dendl; + + // update our cache now, so we are consistent with what is in the journal + // when we journal a subtree map + CInode *in = dn->get_linkage()->get_inode(); + 
dn->get_dir()->unlink_inode(dn); + straydn->get_dir()->link_primary_inode(straydn, in); + mdcache->adjust_subtree_after_rename(in, dn->get_dir()); + + MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RMDIRPREPACK); + mds->send_message_mds(reply, mdr->slave_to_mds); + + // done. + mdr->slave_request->put(); + mdr->slave_request = 0; +} + +void Server::handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack) +{ + dout(10) << "handle_slave_rmdir_prep_ack " << *mdr + << " " << *ack << dendl; + + int from = ack->get_source().num(); + + mdr->more()->slaves.insert(from); + mdr->more()->witnessed.insert(from); + + // remove from waiting list + assert(mdr->more()->waiting_on_slave.count(from)); + mdr->more()->waiting_on_slave.erase(from); + + if (mdr->more()->waiting_on_slave.empty()) + dispatch_client_request(mdr); // go again! + else + dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; +} + +void Server::_commit_slave_rmdir(MDRequest *mdr, int r, CDentry *dn, CDentry *straydn) +{ + dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl; + + if (r == 0) { + // write a commit to the journal + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid, mdr->slave_to_mds, + ESlaveUpdate::OP_COMMIT, ESlaveUpdate::RMDIR); + mdlog->start_entry(le); + mdr->cleanup(); + + mdlog->submit_entry(le, new C_MDS_CommittedSlave(this, mdr)); + mdlog->flush(); + } else { + // abort + do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); + } +} + +struct C_MDS_LoggedRmdirRollback : public Context { + Server *server; + MDRequest *mdr; + metareqid_t reqid; + CDentry *dn; + CDentry *straydn; + C_MDS_LoggedRmdirRollback(Server *s, MDRequest *m, metareqid_t mr, CDentry *d, CDentry *st) + : server(s), mdr(m), reqid(mr), dn(d), straydn(st) {} + void finish(int r) { + server->_rmdir_rollback_finish(mdr, reqid, dn, straydn); + } +}; + +void Server::do_rmdir_rollback(bufferlist &rbl, int 
master, MDRequest *mdr) +{ + rmdir_rollback rollback; + bufferlist::iterator p = rbl.begin(); + ::decode(rollback, p); + + dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl; + if (!mdr) { + assert(mds->is_resolve()); + mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes + } + Mutation *mut = new Mutation(rollback.reqid); + mut->ls = mds->mdlog->get_current_segment(); + + CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir); + CDentry *dn = dir->lookup(rollback.src_dname); + dout(10) << " dn " << *dn << dendl; + dir = mds->mdcache->get_dirfrag(rollback.dest_dir); + CDentry *straydn = dir->lookup(rollback.dest_dname); + dout(10) << " straydn " << *straydn << dendl; + CInode *in = straydn->get_linkage()->get_inode(); + + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master, + ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR); + mdlog->start_entry(le); + + le->commit.add_dir_context(dn->get_dir()); + le->commit.add_dir_context(straydn->get_dir()); + le->commit.add_primary_dentry(dn, true, in); + le->commit.add_null_dentry(straydn, true); + + dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; + le->commit.renamed_dirino = in->ino(); + + mdlog->submit_entry(le, new C_MDS_LoggedRmdirRollback(this, mdr, rollback.reqid, dn, straydn)); + mdlog->flush(); +} + +void Server::_rmdir_rollback_finish(MDRequest *mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn) +{ + dout(10) << "_rmdir_rollback_finish " << reqid << dendl; // mdr is NULL on the resolve path, so log the reqid + + CInode *in = straydn->get_linkage()->get_inode(); + straydn->get_dir()->unlink_inode(straydn); // the inode is linked under straydn here, not dn + dn->get_dir()->link_primary_inode(dn, in); + + mdcache->adjust_subtree_after_rename(in, straydn->get_dir()); + + if (mdr) + mds->mdcache->request_finish(mdr); + else + mds->mdcache->finish_rollback(reqid); +} /** _dir_is_nonempty[_unlocked] @@ -5919,8 +6158,6 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest 
*mdr) ::decode(rollback, p); dout(10) << "do_rename_rollback on " << rollback.reqid << dendl; - - //Mutation *mut = mdr; if (!mdr) { assert(mds->is_resolve()); mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes diff --git a/src/mds/Server.h b/src/mds/Server.h index ec1ee66793fd9..2a227f8d6b026 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -191,7 +191,14 @@ public: void _unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn); void _unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, - version_t); + version_t); + void _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn); + void handle_slave_rmdir_prep(MDRequest *mdr); + void _rmdir_logged_witness(MDRequest *mdr, CDentry *srcdn, CDentry *straydn); + void handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack); + void _commit_slave_rmdir(MDRequest *mdr, int r, CDentry *dn, CDentry *straydn); + void do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr); + void _rmdir_rollback_finish(MDRequest *mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn); // rename void handle_client_rename(MDRequest *mdr); diff --git a/src/mds/events/ESlaveUpdate.h b/src/mds/events/ESlaveUpdate.h index b5741a5ba6def..c5f8e10847981 100644 --- a/src/mds/events/ESlaveUpdate.h +++ b/src/mds/events/ESlaveUpdate.h @@ -54,6 +54,40 @@ struct link_rollback { }; WRITE_CLASS_ENCODER(link_rollback) +/* + * this is only used on an empty dir with a dirfrag on a remote node. + * we are auth for nothing. all we need to do is relink the directory + * in the hierarchy properly during replay to avoid breaking the + * subtree map. 
+ */ +struct rmdir_rollback { + metareqid_t reqid; + dirfrag_t src_dir; + string src_dname; + dirfrag_t dest_dir; + string dest_dname; + + void encode(bufferlist& bl) const { + __u8 struct_v = 1; + ::encode(struct_v, bl); + ::encode(reqid, bl); + ::encode(src_dir, bl); + ::encode(src_dname, bl); + ::encode(dest_dir, bl); + ::encode(dest_dname, bl); + } + void decode(bufferlist::iterator& bl) { + __u8 struct_v; + ::decode(struct_v, bl); + ::decode(reqid, bl); + ::decode(src_dir, bl); + ::decode(src_dname, bl); + ::decode(dest_dir, bl); + ::decode(dest_dname, bl); + } +}; +WRITE_CLASS_ENCODER(rmdir_rollback) + struct rename_rollback { struct drec { dirfrag_t dirfrag; @@ -75,7 +109,7 @@ struct rename_rollback { ::encode(dname, bl); ::encode(remote_d_type, bl); ::encode(old_ctime, bl); - } + } void decode(bufferlist::iterator &bl) { __u8 struct_v; ::decode(struct_v, bl); @@ -126,6 +160,8 @@ public: const static int LINK = 1; const static int RENAME = 2; + const static int RMDIR = 3; + /* * we journal a rollback metablob that contains the unmodified metadata * too, because we may be updating previously dirty metadata, which diff --git a/src/messages/MMDSSlaveRequest.h b/src/messages/MMDSSlaveRequest.h index 855088755b2fd..7bdf93025e8d5 100644 --- a/src/messages/MMDSSlaveRequest.h +++ b/src/messages/MMDSSlaveRequest.h @@ -38,6 +38,9 @@ class MMDSSlaveRequest : public Message { static const int OP_WRLOCKACK = -8; static const int OP_UNWRLOCK = 9; + static const int OP_RMDIRPREP = 10; + static const int OP_RMDIRPREPACK = -10; + static const int OP_FINISH = 17; static const int OP_COMMITTED = -18; @@ -67,6 +70,9 @@ class MMDSSlaveRequest : public Message { case OP_WRLOCKACK: return "wrlock_ack"; case OP_UNWRLOCK: return "unwrlock"; + case OP_RMDIRPREP: return "rmdir_prep"; + case OP_RMDIRPREPACK: return "rmdir_prep_ack"; + case OP_ABORT: return "abort"; //case OP_COMMIT: return "commit"; -- 2.39.5