From: Samuel Just
Date: Tue, 11 Mar 2014 17:31:55 +0000 (-0700)
Subject: PG: do not serve requests until replicas have activated
X-Git-Tag: v0.78~32^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a576eb320463ee79feff7b0a973cad117cd98cf9;p=ceph.git

PG: do not serve requests until replicas have activated

There are two problems:

1) We choose the min last_update among peers with the max local-les value
as an upper bound on requests which could have been reported to the client
as committed.  We then, for ec pools, roll back to that point to ensure
that we don't inadvertently commit to an update which fewer than K
replicas actually saw.  If the primary sets local-les, accepts an update
from a client, and there is a new interval before any of the replicas have
been activated, we will end up being forced to use that update, which no
other replica has seen, as the new last_update.  This will cause the
object to become unfound.  We don't have this problem as long as all
active replicas agree on last_update before we accept IO.

2) Even for replicated pools, we would then immediately respond to the
request which created the primary-only update with a commit, since it is
in the log and we have no outstanding repops.  If we then lose that
primary before any of the replicas in the new interval record the new
log, we will not only lose the object, but also the log entry recording
it, which will result in a lost write.

For these reasons, it seems like we need to wait for the replicas to
activate before we can process new requests: whatever update we select as
last_update is effectively regarded as committed as soon as we accept IO.

Fixes: #7649
Signed-off-by: Samuel Just
---

diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index fe759efe1422f..b704556aab66f 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1417,7 +1417,6 @@ void PG::activate(ObjectStore::Transaction& t,
   }
 
   // twiddle pg state
-  state_set(PG_STATE_ACTIVE);
   state_clear(PG_STATE_DOWN);
 
   send_notify = false;
@@ -1440,12 +1439,8 @@ void PG::activate(ObjectStore::Transaction& t,
   dirty_info = true;
   dirty_big_info = true; // maybe
 
-  // verify that there are no stray objects
-  if (is_primary())
-    check_local();
-
   // find out when we commit
-  tfin.push_back(new C_PG_ActivateCommitted(this, query_epoch));
+  t.register_on_complete(new C_PG_ActivateCommitted(this, query_epoch));
 
   // initialize snap_trimq
   if (is_primary()) {
@@ -1628,42 +1623,7 @@ void PG::activate(ObjectStore::Transaction& t,
     if (get_osdmap()->get_pg_size(info.pgid.pgid) > acting.size())
       state_set(PG_STATE_DEGRADED);
 
-    // all clean?
-    if (needs_recovery()) {
-      dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
-      queue_peering_event(
-        CephPeeringEvtRef(
-          new CephPeeringEvt(
-            get_osdmap()->get_epoch(),
-            get_osdmap()->get_epoch(),
-            DoRecovery())));
-    } else if (needs_backfill()) {
-      dout(10) << "activate queueing backfill" << dendl;
-      queue_peering_event(
-        CephPeeringEvtRef(
-          new CephPeeringEvt(
-            get_osdmap()->get_epoch(),
-            get_osdmap()->get_epoch(),
-            RequestBackfill())));
-    } else {
-      dout(10) << "activate all replicas clean, no recovery" << dendl;
-      queue_peering_event(
-        CephPeeringEvtRef(
-          new CephPeeringEvt(
-            get_osdmap()->get_epoch(),
-            get_osdmap()->get_epoch(),
-            AllReplicasRecovered())));
-    }
-
-    publish_stats_to_osd();
-  }
-
-  // waiters
-  if (!is_replay() && flushes_in_progress == 0) {
-    requeue_ops(waiting_for_active);
   }
-
-  on_activate();
 }
 
 bool PG::op_has_sufficient_caps(OpRequestRef op)
@@ -1737,7 +1697,7 @@ void PG::queue_op(OpRequestRef op)
 
 void PG::replay_queued_ops()
 {
-  assert(is_replay() && is_active());
+  assert(is_replay());
   eversion_t c = info.last_update;
   list<OpRequestRef> replay;
   dout(10) << "replay_queued_ops" << dendl;
@@ -1788,6 +1748,12 @@ void PG::_activate_committed(epoch_t e)
     i.info.history.last_epoch_started = e;
     m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
     osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
+
+    state_set(PG_STATE_ACTIVE);
+    // waiters
+    if (flushes_in_progress == 0) {
+      requeue_ops(waiting_for_active);
+    }
   }
 
   if (dirty_info) {
@@ -6165,7 +6131,6 @@ PG::RecoveryState::Active::Active(my_context ctx)
 	       *context< RecoveryMachine >().get_query_map(),
 	       context< RecoveryMachine >().get_info_map(),
 	       context< RecoveryMachine >().get_recovery_ctx());
-  assert(pg->is_active());
   dout(10) << "Activate Finished" << dendl;
 }
 
@@ -6215,7 +6180,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
 {
   PG *pg = context< RecoveryMachine >().pg;
   dout(10) << "Active: handling ActMap" << dendl;
-  assert(pg->is_active());
   assert(pg->is_primary());
 
   if (pg->have_unfound()) {
@@ -6253,7 +6217,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  assert(pg->is_active());
   assert(pg->is_primary());
   if (pg->peer_info.count(notevt.from)) {
     dout(10) << "Active: got notify from " << notevt.from
@@ -6278,7 +6241,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& not
 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  assert(pg->is_active());
   assert(pg->is_primary());
   assert(!pg->actingbackfill.empty());
 
@@ -6293,10 +6255,10 @@ boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoe
     dout(10) << " peer osd."
	     << infoevt.from << " activated and committed" << dendl;
     pg->peer_activated.insert(infoevt.from);
-  }
-  if (pg->peer_activated.size() == pg->actingbackfill.size()) {
-    pg->all_activated_and_committed();
+    if (pg->peer_activated.size() == pg->actingbackfill.size()) {
+      pg->all_activated_and_committed();
+    }
   }
   return discard_event();
 }
 
@@ -6378,7 +6340,20 @@ boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
 
 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
 {
+  PG *pg = context< RecoveryMachine >().pg;
   all_replicas_activated = true;
+
+  pg->state_set(PG_STATE_ACTIVE);
+
+  pg->check_local();
+
+  // waiters
+  if (!pg->is_replay() && pg->flushes_in_progress == 0) {
+    pg->requeue_ops(pg->waiting_for_active);
+  }
+
+  pg->on_activate();
+
   return discard_event();
 }
 
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 8ea8a624850c1..5552e861ce0ef 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -8919,6 +8919,35 @@ void ReplicatedPG::on_shutdown()
 
 void ReplicatedPG::on_activate()
 {
+  // all clean?
+  if (needs_recovery()) {
+    dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
+    queue_peering_event(
+      CephPeeringEvtRef(
+        new CephPeeringEvt(
+          get_osdmap()->get_epoch(),
+          get_osdmap()->get_epoch(),
+          DoRecovery())));
+  } else if (needs_backfill()) {
+    dout(10) << "activate queueing backfill" << dendl;
+    queue_peering_event(
+      CephPeeringEvtRef(
+        new CephPeeringEvt(
+          get_osdmap()->get_epoch(),
+          get_osdmap()->get_epoch(),
+          RequestBackfill())));
+  } else {
+    dout(10) << "activate all replicas clean, no recovery" << dendl;
+    queue_peering_event(
+      CephPeeringEvtRef(
+        new CephPeeringEvt(
+          get_osdmap()->get_epoch(),
+          get_osdmap()->get_epoch(),
+          AllReplicasRecovered())));
+  }
+
+  publish_stats_to_osd();
+
   if (!backfill_targets.empty()) {
     last_backfill_started = earliest_backfill();
     new_backfill = true;
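
To make the ordering the patch enforces concrete, here is a minimal standalone
C++ sketch. It is not Ceph code: MiniPG, queue_op, on_peer_activated, and the
osd ids are invented for illustration. Client ops that arrive before activation
completes are parked in waiting_for_active, and they are only replayed once
every member of the acting set has reported that its activation transaction
committed, mirroring _activate_committed() on each OSD and the
AllReplicasActivated reaction on the primary.

// Standalone sketch (not Ceph code) of "do not serve requests until
// replicas have activated".  All names here are illustrative only.
#include <cassert>
#include <deque>
#include <iostream>
#include <set>
#include <string>

struct MiniPG {
  std::set<int> acting;                        // acting set, primary included
  std::set<int> peer_activated;                // members whose activation committed
  std::deque<std::string> waiting_for_active;  // client ops held back
  bool active = false;

  // Client I/O arriving before activation completes is queued, not served.
  void queue_op(const std::string& op) {
    if (!active) {
      waiting_for_active.push_back(op);
      return;
    }
    serve(op);
  }

  // A member of the acting set reports that its activation transaction
  // committed (the role _activate_committed()/MInfoRec play in the patch).
  void on_peer_activated(int osd) {
    peer_activated.insert(osd);
    if (peer_activated.size() == acting.size())
      all_replicas_activated();
  }

  // Analogue of the AllReplicasActivated reaction: only now do we go
  // active and replay the held-back client ops.
  void all_replicas_activated() {
    active = true;
    while (!waiting_for_active.empty()) {
      serve(waiting_for_active.front());
      waiting_for_active.pop_front();
    }
  }

  void serve(const std::string& op) {
    assert(active);  // nothing is served while the PG is not yet active
    std::cout << "serving " << op << "\n";
  }
};

int main() {
  MiniPG pg;
  pg.acting = {0, 1, 2};      // primary osd.0 plus two replicas
  pg.queue_op("write A");     // arrives early: must wait
  pg.on_peer_activated(0);    // primary's own activation commits
  pg.on_peer_activated(1);
  pg.queue_op("write B");     // still waiting: osd.2 has not activated
  pg.on_peer_activated(2);    // last member activates -> queued ops drain
  pg.queue_op("write C");     // now served immediately
}

Before this change, the equivalent of state_set(PG_STATE_ACTIVE) and the
requeue of waiting_for_active happened in activate() itself, before any peer
had acknowledged; that is the window both problems described above exploit.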