if (!p->second->is_slave() || !p->second->slave_did_prepare())
continue;
int master = p->second->slave_to_mds;
- if (resolve_set.count(master)) {
+ if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
dout(10) << " including uncommitted " << *p->second << dendl;
if (!resolves.count(master))
resolves[master] = new MMDSResolve;
resolve_gather.insert(who);
discard_delayed_resolve(who);
+ ambiguous_slave_updates.erase(who);
rejoin_gather.insert(who);
rejoin_sent.erase(who); // i need to send another
finish.push_back(p->second);
}
}
+
+ if (p->second->is_slave() &&
+ p->second->slave_did_prepare() && p->second->more()->srcdn_auth_mds == who &&
+ mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
+ // rename srcdn's auth mds failed, resolve even I'm a survivor.
+ dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
+ add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
+ }
// failed node is slave?
if (p->second->is_master() && !p->second->committing) {
+ if (p->second->more()->srcdn_auth_mds == who) {
+ dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
+ << who << " to recover" << dendl;
+ assert(p->second->more()->witnessed.count(who) == 0);
+ if (p->second->more()->is_ambiguous_auth)
+ p->second->clear_ambiguous_auth();
+ // rename srcdn's auth mds failed, all witnesses will rollback
+ p->second->more()->witnessed.clear();
+ pending_masters.erase(p->first);
+ }
+
if (p->second->more()->witnessed.count(who)) {
- dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." << who
- << dendl;
- // discard this peer's prepare (if any)
- p->second->more()->witnessed.erase(who);
+ int srcdn_auth = p->second->more()->srcdn_auth_mds;
+ if (srcdn_auth >= 0 && p->second->more()->waiting_on_slave.count(srcdn_auth)) {
+ dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
+ << p->second->more()->srcdn_auth_mds << " to reply" << dendl;
+ // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
+ // until either the request is committing or the slave also fails.
+ assert(p->second->more()->waiting_on_slave.size() == 1);
+ pending_masters.insert(p->first);
+ } else {
+ dout(10) << " master request " << *p->second << " no longer witnessed by slave mds."
+ << who << " to recover" << dendl;
+ if (srcdn_auth >= 0)
+ assert(p->second->more()->witnessed.count(srcdn_auth) == 0);
+
+ // discard this peer's prepare (if any)
+ p->second->more()->witnessed.erase(who);
+ }
}
if (p->second->more()->waiting_on_slave.count(who)) {
<< " to recover" << dendl;
// retry request when peer recovers
p->second->more()->waiting_on_slave.erase(who);
- mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
- }
-
- if (p->second->has_more() && p->second->more()->is_ambiguous_auth &&
- p->second->more()->rename_inode->authority().first == who) {
- dout(10) << " master request " << *p->second << " waiting for renamed inode's auth mds." << who
- << " to recover" << dendl;
- p->second->clear_ambiguous_auth();
+ if (p->second->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
}
if (p->second->locking && p->second->locking_target_mds == who)
dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
int from = ack->get_source().num();
- if (!resolve_ack_gather.count(from)) {
+ if (!resolve_ack_gather.count(from) ||
+ mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
ack->put();
return;
}
+ if (ambiguous_slave_updates.count(from)) {
+ assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
+ assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+ }
+
for (vector<metareqid_t>::iterator p = ack->commit.begin();
p != ack->commit.end();
++p) {
dout(10) << " commit on slave " << *p << dendl;
+ if (ambiguous_slave_updates.count(from)) {
+ remove_ambiguous_slave_update(*p, from);
+ continue;
+ }
+
if (mds->is_resolve()) {
// replay
MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
}
}
- if (!mds->is_resolve()) {
- for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
- p != active_requests.end(); ++p)
- assert(p->second->slave_to_mds != from);
- }
-
- resolve_ack_gather.erase(from);
+ if (!ambiguous_slave_updates.count(from))
+ resolve_ack_gather.erase(from);
if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
send_subtree_resolves();
process_delayed_resolve();
map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set
set<metareqid_t> pending_masters;
+ map<int, set<metareqid_t> > ambiguous_slave_updates;
- //map<metareqid_t, bool> ambiguous_slave_updates; // for log trimming.
- //map<metareqid_t, Context*> waiting_for_slave_update_commit;
friend class ESlaveUpdate;
friend class ECommitted;
public:
void remove_inode_recursive(CInode *in);
+ bool is_ambiguous_slave_update(metareqid_t reqid, int master) {
+ return ambiguous_slave_updates.count(master) &&
+ ambiguous_slave_updates[master].count(reqid);
+ }
+ void add_ambiguous_slave_update(metareqid_t reqid, int master) {
+ ambiguous_slave_updates[master].insert(reqid);
+ }
+ void remove_ambiguous_slave_update(metareqid_t reqid, int master) {
+ assert(ambiguous_slave_updates[master].count(reqid));
+ ambiguous_slave_updates[master].erase(reqid);
+ if (ambiguous_slave_updates[master].empty())
+ ambiguous_slave_updates.erase(master);
+ }
+
void add_rollback(metareqid_t reqid, int master) {
need_resolve_rollback[reqid] = master;
}
// for rename
set<int> extra_witnesses; // replica list from srcdn auth (rename)
+ int srcdn_auth_mds;
version_t src_reanchor_atid; // src->dst
version_t dst_reanchor_atid; // dst->stray
bufferlist inode_import;
bufferlist rollback_bl;
More() :
+ srcdn_auth_mds(-1),
src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false),
is_remote_frozen_authpin(false), is_inode_exporter(false),
if (mdr->now == utime_t())
mdr->now = ceph_clock_now(g_ceph_context);
+ // -- prepare anchor updates --
+ if (!linkmerge || srcdnl->is_primary()) {
+ C_GatherBuilder anchorgather(g_ceph_context);
+
+ if (srcdnl->is_primary() &&
+ (srcdnl->get_inode()->is_anchored() ||
+ (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
+ srcdnl->get_inode()->nested_anchors ||
+ !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
+ !mdr->more()->src_reanchor_atid) {
+ dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
+ vector<Anchor> trace;
+ destdn->make_anchor_trace(trace, srcdnl->get_inode());
+ mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
+ trace, &mdr->more()->src_reanchor_atid,
+ anchorgather.new_sub());
+ }
+ if (destdnl->is_primary() &&
+ destdnl->get_inode()->is_anchored() &&
+ !mdr->more()->dst_reanchor_atid) {
+ dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
+
+ assert(straydn);
+ vector<Anchor> trace;
+ straydn->make_anchor_trace(trace, destdnl->get_inode());
+
+ mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
+ &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
+ }
+
+ if (anchorgather.has_subs()) {
+ anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
+ anchorgather.activate();
+ return; // waiting for anchor prepares
+ }
+
+ assert(g_conf->mds_kill_rename_at != 2);
+ }
+
// -- prepare witnesses --
// do srcdn auth last
int last = -1;
if (!srcdn->is_auth()) {
last = srcdn->authority().first;
+ mdr->more()->srcdn_auth_mds = last;
// ask auth of srci to mark srci as ambiguous auth if more than two MDS
// are involved in the rename operation.
if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
if (!mdr->more()->waiting_on_slave.empty())
return; // we're waiting for a witness.
- if (last >= 0 &&
- mdr->more()->witnessed.count(last) == 0 &&
- mdr->more()->waiting_on_slave.count(last) == 0) {
+ if (last >= 0 && mdr->more()->witnessed.count(last) == 0) {
dout(10) << " preparing last witness (srcdn auth)" << dendl;
+ assert(mdr->more()->waiting_on_slave.count(last) == 0);
_rename_prepare_witness(mdr, last, witnesses, srcdn, destdn, straydn);
return;
}
// test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
if (!mdr->more()->slaves.empty() && !srci->is_dir())
- assert(g_conf->mds_kill_rename_at != 2);
+ assert(g_conf->mds_kill_rename_at != 3);
if (!mdr->more()->slaves.empty() && srci->is_dir())
- assert(g_conf->mds_kill_rename_at != 3);
-
- // -- prepare anchor updates --
- if (!linkmerge || srcdnl->is_primary()) {
- C_GatherBuilder anchorgather(g_ceph_context);
-
- if (srcdnl->is_primary() &&
- (srcdnl->get_inode()->is_anchored() ||
- (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
- srcdnl->get_inode()->nested_anchors ||
- !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
- !mdr->more()->src_reanchor_atid) {
- dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
- vector<Anchor> trace;
- destdn->make_anchor_trace(trace, srcdnl->get_inode());
- mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
- trace, &mdr->more()->src_reanchor_atid,
- anchorgather.new_sub());
- }
- if (destdnl->is_primary() &&
- destdnl->get_inode()->is_anchored() &&
- !mdr->more()->dst_reanchor_atid) {
- dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
-
- assert(straydn);
- vector<Anchor> trace;
- straydn->make_anchor_trace(trace, destdnl->get_inode());
-
- mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
- &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
- }
-
- if (anchorgather.has_subs()) {
- anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
- anchorgather.activate();
- return; // waiting for anchor prepares
- }
-
assert(g_conf->mds_kill_rename_at != 4);
- }
// -- prepare journal entry --
mdr->ls = mdlog->get_current_segment();
// abort
// rollback_bl may be empty if we froze the inode but had to provide an expanded
// witness list from the master, and they failed before we tried prep again.
- if (mdr->more()->rollback_bl.length())
- do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
- else
+ if (mdr->more()->rollback_bl.length()) {
+ if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
+ mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
+ // rollback but preserve the slave request
+ do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL);
+ } else
+ do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
+ } else {
dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
+ mds->mdcache->request_finish(mdr);
+ }
}
}
dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
// need to finish this update before sending resolve to claim the subtree
mds->mdcache->add_rollback(rollback.reqid, master);
- assert(mdr || mds->is_resolve());
Mutation *mut = new Mutation(rollback.reqid);
mut->ls = mds->mdlog->get_current_segment();