From 0c5470465d26baf75dc4e4586117fb3343003c25 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 14 Dec 2011 18:41:10 -0800 Subject: [PATCH] osd: preserve write order when waiting on src_oids We need to preserve the order of write operations on each object. If we have a write on X that needs to read from Y, and Y is degraded, then we need to wait for Y to repair. Doing so blindly will allow other writes to X to proceed while our clone op is still waiting, violating the ordering. Fix this by adding blocked_by and blocking vars to the ObjectContext. If we wait on a src_oid, the oid is "blocked" by that object, and any subsequent writes should also wait on the same queue. Use a helper to do the cleanup when we complete recovery, or when the pg resets. Signed-off-by: Sage Weil --- src/osd/ReplicatedPG.cc | 40 ++++++++++++++++++++++++++++++++++------ src/osd/ReplicatedPG.h | 6 ++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index dc0892fde4a8b..19ad90d6fdbf6 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -519,6 +519,15 @@ void ReplicatedPG::do_op(MOSDOp *op) dout(10) << "do_op mode now " << mode << dendl; + // are writes blocked by another object? + if (obc->blocked_by) { + dout(10) << "do_op writes for " << obc->obs.oi.soid << " blocked by " + << obc->blocked_by->obs.oi.soid << dendl; + wait_for_degraded_object(obc->blocked_by->obs.oi.soid, op); + put_object_context(obc); + return; + } + // src_oids map src_obc; for (vector::iterator p = op->ops.begin(); p != op->ops.end(); p++) { @@ -543,6 +552,10 @@ void ReplicatedPG::do_op(MOSDOp *op) osd->reply_op_error(op, r); } else if (is_degraded_object(sobc->obs.oi.soid)) { wait_for_degraded_object(sobc->obs.oi.soid, op); + dout(10) << " writes for " << obc->obs.oi.soid << " now blocked by " + << sobc->obs.oi.soid << dendl; + obc->blocked_by = sobc; + sobc->blocking.insert(obc); } else if (sobc->obs.oi.oloc.key != obc->obs.oi.oloc.key && sobc->obs.oi.oloc.key != obc->obs.oi.soid.oid.name && sobc->obs.oi.soid.oid.name != obc->obs.oi.oloc.key) { @@ -4181,11 +4194,7 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply) osd->requeue_ops(this, waiting_for_degraded_object[soid]); waiting_for_degraded_object.erase(soid); } - map::iterator i = - object_contexts.find(soid); - if (i != object_contexts.end()) { - populate_obc_watchers(i->second); - } + finish_degraded_object(soid); } else { dout(10) << "pushed " << soid << ", still waiting for push ack from " << pushing[soid].size() << " others" << dendl; @@ -4195,6 +4204,20 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply) reply->put(); } +void ReplicatedPG::finish_degraded_object(const hobject_t& oid) +{ + dout(10) << "finish_degraded_object " << oid << dendl; + map::iterator i = object_contexts.find(oid); + if (i != object_contexts.end()) { + populate_obc_watchers(i->second); + for (set::iterator j = i->second->blocking.begin(); + j != i->second->blocking.end(); + i->second->blocking.erase(j++)) { + dout(10) << " no longer blocking writes for " << (*j)->obs.oi.soid << dendl; + (*j)->blocked_by = NULL; + } + } +} /** op_pull * process request to pull an entire object. @@ -4942,7 +4965,12 @@ void ReplicatedPG::on_change() // take object waiters requeue_object_waiters(waiting_for_missing_object); - requeue_object_waiters(waiting_for_degraded_object); + for (map >::iterator p = waiting_for_degraded_object.begin(); + p != waiting_for_degraded_object.end(); + waiting_for_degraded_object.erase(p++)) { + osd->requeue_ops(this, p->second); + finish_degraded_object(p->first); + } osd->requeue_ops(this, waiting_for_all_missing); waiting_for_all_missing.clear(); diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 1f0d83d935732..f9eda15ffcc9d 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -271,6 +271,10 @@ public: Cond cond; int unstable_writes, readers, writers_waiting, readers_waiting; + // set if writes for this object are blocked on another objects recovery + ObjectContext *blocked_by; // object blocking our writes + set blocking; // objects whose writes we block + // any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers. map watchers; map unconnected_watchers; @@ -584,6 +588,8 @@ protected: map >& clone_subsets); void send_push_op_blank(const hobject_t& soid, int peer); + void finish_degraded_object(const hobject_t& oid); + // Cancels/resets pulls from peer void check_recovery_op_pulls(const OSDMapRef map); int pull(const hobject_t& oid, eversion_t v); -- 2.39.5