From: Samuel Just Date: Mon, 27 Jul 2015 20:12:25 +0000 (-0700) Subject: ReplicatedPG: enforce write ordering on rollback X-Git-Tag: v9.1.0~345^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=35af63b982e246eddd3ec4f8492331b930a7e230;p=ceph.git ReplicatedPG: enforce write ordering on rollback Previously, rollback ops could reorder w.r.t. other writes due to waiting on degraded snaps other than head. To fix that, we'll introduce a new map tracking objects blocked on degraded snaps. A particular object can only be blocked on one snap at a time (subsequent writes won't get far enough to add another entry). It might have been possible to use the blocked_by machinery for this, but it requires that the object have an extant obc, which we may not have for a missing object. Also, that machinery exists primarily to support clone_range, which I hope to remove soon. Signed-off-by: Samuel Just --- diff --git a/src/osd/PG.h b/src/osd/PG.h index 444a8c1a333..7c658e95ea8 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -747,6 +747,7 @@ protected: map > waiting_for_unreadable_object, waiting_for_degraded_object, waiting_for_blocked_object; + map objects_blocked_on_degraded_snap; // Callbacks should assume pg (and nothing else) is locked map > callbacks_for_degraded_object; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 7d68fe069e5..9f02f7efde9 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -371,22 +371,20 @@ bool ReplicatedPG::is_missing_object(const hobject_t& soid) const return pg_log.get_missing().missing.count(soid); } -void ReplicatedPG::wait_for_unreadable_object( - const hobject_t& soid, OpRequestRef op) +void ReplicatedPG::maybe_kick_recovery( + const hobject_t &soid) { - assert(is_unreadable_object(soid)); - eversion_t v; - bool needs_recovery = missing_loc.needs_recovery(soid, &v); - assert(needs_recovery); + if (!missing_loc.needs_recovery(soid, &v)) + return; map::const_iterator p = 
recovering.find(soid); if (p != recovering.end()) { - dout(7) << "missing " << soid << " v " << v << ", already recovering." << dendl; + dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl; } else if (missing_loc.is_unfound(soid)) { - dout(7) << "missing " << soid << " v " << v << ", is unfound." << dendl; + dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl; } else { - dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl; + dout(7) << "object " << soid << " v " << v << ", recovering." << dendl; PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); if (is_missing_object(soid)) { recover_missing(soid, v, cct->_conf->osd_client_op_priority, h); @@ -395,6 +393,14 @@ void ReplicatedPG::wait_for_unreadable_object( } pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority); } +} + +void ReplicatedPG::wait_for_unreadable_object( + const hobject_t& soid, OpRequestRef op) +{ + assert(is_unreadable_object(soid)); + + maybe_kick_recovery(soid); waiting_for_unreadable_object[soid].push_back(op); op->mark_delayed("waiting for missing object"); } @@ -434,43 +440,22 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef { assert(is_degraded_or_backfilling_object(soid)); - // we don't have it (yet). 
- if (recovering.count(soid)) { - dout(7) << "degraded " - << soid - << ", already recovering" - << dendl; - } else if (missing_loc.is_unfound(soid)) { - dout(7) << "degraded " - << soid - << ", still unfound, waiting" - << dendl; - } else { - dout(7) << "degraded " - << soid - << ", recovering" - << dendl; - eversion_t v; - assert(!actingbackfill.empty()); - for (set::iterator i = actingbackfill.begin(); - i != actingbackfill.end(); - ++i) { - if (*i == get_primary()) continue; - pg_shard_t peer = *i; - if (peer_missing.count(peer) && - peer_missing[peer].missing.count(soid)) { - v = peer_missing[peer].missing[soid].need; - break; - } - } - PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); - prep_object_replica_pushes(soid, v, h); - pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority); - } + maybe_kick_recovery(soid); waiting_for_degraded_object[soid].push_back(op); op->mark_delayed("waiting for degraded object"); } +void ReplicatedPG::block_write_on_degraded_snap( + const hobject_t& snap, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << snap.get_head() + << " on degraded snap " << snap << dendl; + // otherwise, we'd have blocked in do_op + assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0); + objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap; + wait_for_degraded_object(snap, op); +} + bool ReplicatedPG::maybe_await_blocked_snapset( const hobject_t &hoid, OpRequestRef op) @@ -1428,6 +1413,16 @@ void ReplicatedPG::do_op(OpRequestRef& op) return; } + // blocked on snap? + map::iterator blocked_iter = + objects_blocked_on_degraded_snap.find(head); + if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) { + hobject_t to_wait_on(head); + to_wait_on.snap = blocked_iter->second; + wait_for_degraded_object(to_wait_on, op); + return; + } + // missing snapdir? 
hobject_t snapdir(m->get_oid(), m->get_object_locator().key, CEPH_SNAPDIR, m->get_pg().ps(), info.pgid.pool(), @@ -5493,7 +5488,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) assert(is_missing_object(missing_oid)); dout(20) << "_rollback_to attempted to roll back to a missing object " << missing_oid << " (requested snapid: ) " << snapid << dendl; - wait_for_unreadable_object(missing_oid, ctx->op); + block_write_on_degraded_snap(missing_oid, ctx->op); return ret; } if (maybe_handle_cache(ctx->op, true, rollback_to, ret, missing_oid, true)) { @@ -5520,7 +5515,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) if (is_degraded_or_backfilling_object(rollback_to_sobject)) { dout(20) << "_rollback_to attempted to roll back to a degraded object " << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl; - wait_for_degraded_object(rollback_to_sobject, ctx->op); + block_write_on_degraded_snap(rollback_to_sobject, ctx->op); ret = -EAGAIN; } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) { // rolling back to the head; we just need to clone it. @@ -8635,6 +8630,11 @@ void ReplicatedPG::finish_degraded_object(const hobject_t& oid) (*i)->complete(0); } } + map::iterator i = objects_blocked_on_degraded_snap.find( + oid.get_head()); + if (i != objects_blocked_on_degraded_snap.end() && + i->second == oid.snap) + objects_blocked_on_degraded_snap.erase(i); } void ReplicatedPG::_committed_pushed_object( @@ -9232,6 +9232,9 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t) // NOTE: we actually assert that all currently live references are dead // by the time the flush for the next interval completes. 
object_contexts.clear(); + + // should have been cleared above by finishing all of the degraded objects + assert(objects_blocked_on_degraded_snap.empty()); } void ReplicatedPG::on_role_change() diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 81f5130fa86..0f4924abfd4 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -1577,11 +1577,13 @@ public: return is_missing_object(oid) || !missing_loc.readable_with_acting(oid, actingset); } + void maybe_kick_recovery(const hobject_t &soid); void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op); void wait_for_all_missing(OpRequestRef op); bool is_degraded_or_backfilling_object(const hobject_t& oid); void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op); + void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op); bool maybe_await_blocked_snapset(const hobject_t &soid, OpRequestRef op); void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);