From: Samuel Just
Date: Tue, 11 Mar 2014 17:31:55 +0000 (-0700)
Subject: PG: do not serve requests until replicas have activated
X-Git-Tag: v0.78~32^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a576eb320463ee79feff7b0a973cad117cd98cf9;p=ceph.git

PG: do not serve requests until replicas have activated

There are two problems:

1) We choose the min last_update among peers with the max local-les value
as an upper bound on requests which could have been reported to the client
as committed.  We then, for ec pools, roll back to that point to ensure
that we don't inadvertently commit to an update which fewer than K
replicas actually saw.  If the primary sets local-les, accepts an update
from a client, and there is a new interval before any of the replicas have
been activated, we will end up being forced to use that update, which no
other replica has seen, as the new last_update.  This will cause the
object to become unfound.  We don't have this problem as long as all
active replicas agree on last_update before we accept IO.

2) Even for replicated pools, we would then immediately respond to the
request which created the primary-only update with a commit, since it is
in the log and we have no outstanding repops.  If we then lose that
primary before any of the replicas in the new interval record the new
log, we will not only lose the object, but also the log entry recording
it, which will result in a lost write.

For these reasons, it seems like we need to wait for the replicas to
activate before we can process new requests: whatever update we select as
last_update is effectively regarded as committed as soon as we accept IO.

Fixes: #7649
Signed-off-by: Samuel Just
---

diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index fe759efe1422f..b704556aab66f 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1417,7 +1417,6 @@ void PG::activate(ObjectStore::Transaction& t,
   }
 
   // twiddle pg state
-  state_set(PG_STATE_ACTIVE);
   state_clear(PG_STATE_DOWN);
 
   send_notify = false;
@@ -1440,12 +1439,8 @@ void PG::activate(ObjectStore::Transaction& t,
   dirty_info = true;
   dirty_big_info = true; // maybe
 
-  // verify that there are no stray objects
-  if (is_primary())
-    check_local();
-
   // find out when we commit
-  tfin.push_back(new C_PG_ActivateCommitted(this, query_epoch));
+  t.register_on_complete(new C_PG_ActivateCommitted(this, query_epoch));
 
   // initialize snap_trimq
   if (is_primary()) {
@@ -1628,42 +1623,7 @@ void PG::activate(ObjectStore::Transaction& t,
     if (get_osdmap()->get_pg_size(info.pgid.pgid) > acting.size())
       state_set(PG_STATE_DEGRADED);
 
-    // all clean?
-    if (needs_recovery()) {
-      dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
-      queue_peering_event(
-        CephPeeringEvtRef(
-          new CephPeeringEvt(
-            get_osdmap()->get_epoch(),
-            get_osdmap()->get_epoch(),
-            DoRecovery())));
-    } else if (needs_backfill()) {
-      dout(10) << "activate queueing backfill" << dendl;
-      queue_peering_event(
-        CephPeeringEvtRef(
-          new CephPeeringEvt(
-            get_osdmap()->get_epoch(),
-            get_osdmap()->get_epoch(),
-            RequestBackfill())));
-    } else {
-      dout(10) << "activate all replicas clean, no recovery" << dendl;
-      queue_peering_event(
-        CephPeeringEvtRef(
-          new CephPeeringEvt(
-            get_osdmap()->get_epoch(),
-            get_osdmap()->get_epoch(),
-            AllReplicasRecovered())));
-    }
-
-    publish_stats_to_osd();
-  }
-
-  // waiters
-  if (!is_replay() && flushes_in_progress == 0) {
-    requeue_ops(waiting_for_active);
   }
-
-  on_activate();
 }
 
 bool PG::op_has_sufficient_caps(OpRequestRef op)
@@ -1737,7 +1697,7 @@ void PG::queue_op(OpRequestRef op)
 
 void PG::replay_queued_ops()
 {
-  assert(is_replay() && is_active());
+  assert(is_replay());
   eversion_t c = info.last_update;
   list<OpRequestRef> replay;
   dout(10) << "replay_queued_ops" << dendl;
@@ -1788,6 +1748,12 @@ void PG::_activate_committed(epoch_t e)
     i.info.history.last_epoch_started = e;
     m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
     osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
+
+    state_set(PG_STATE_ACTIVE);
+    // waiters
+    if (flushes_in_progress == 0) {
+      requeue_ops(waiting_for_active);
+    }
   }
 
   if (dirty_info) {
@@ -6165,7 +6131,6 @@ PG::RecoveryState::Active::Active(my_context ctx)
 	       *context< RecoveryMachine >().get_query_map(),
 	       context< RecoveryMachine >().get_info_map(),
 	       context< RecoveryMachine >().get_recovery_ctx());
-  assert(pg->is_active());
   dout(10) << "Activate Finished" << dendl;
 }
 
@@ -6215,7 +6180,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
 {
   PG *pg = context< RecoveryMachine >().pg;
   dout(10) << "Active: handling ActMap" << dendl;
-  assert(pg->is_active());
   assert(pg->is_primary());
 
   if (pg->have_unfound()) {
@@ -6253,7 +6217,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  assert(pg->is_active());
   assert(pg->is_primary());
   if (pg->peer_info.count(notevt.from)) {
     dout(10) << "Active: got notify from " << notevt.from
@@ -6278,7 +6241,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& not
 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  assert(pg->is_active());
   assert(pg->is_primary());
   assert(!pg->actingbackfill.empty());
 
@@ -6293,10 +6255,10 @@ boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoe
     dout(10) << " peer osd."
	     << infoevt.from << " activated and committed" << dendl;
     pg->peer_activated.insert(infoevt.from);
-  }
-  if (pg->peer_activated.size() == pg->actingbackfill.size()) {
-    pg->all_activated_and_committed();
+    if (pg->peer_activated.size() == pg->actingbackfill.size()) {
+      pg->all_activated_and_committed();
+    }
   }
   return discard_event();
 }
 
@@ -6378,7 +6340,20 @@ boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
 
 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
 {
+  PG *pg = context< RecoveryMachine >().pg;
   all_replicas_activated = true;
+
+  pg->state_set(PG_STATE_ACTIVE);
+
+  pg->check_local();
+
+  // waiters
+  if (!pg->is_replay() && pg->flushes_in_progress == 0) {
+    pg->requeue_ops(pg->waiting_for_active);
+  }
+
+  pg->on_activate();
+
   return discard_event();
 }
 
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 8ea8a624850c1..5552e861ce0ef 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -8919,6 +8919,35 @@ void ReplicatedPG::on_shutdown()
 
 void ReplicatedPG::on_activate()
 {
+  // all clean?
+  if (needs_recovery()) {
+    dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
+    queue_peering_event(
+      CephPeeringEvtRef(
+        new CephPeeringEvt(
+          get_osdmap()->get_epoch(),
+          get_osdmap()->get_epoch(),
+          DoRecovery())));
+  } else if (needs_backfill()) {
+    dout(10) << "activate queueing backfill" << dendl;
+    queue_peering_event(
+      CephPeeringEvtRef(
+        new CephPeeringEvt(
+          get_osdmap()->get_epoch(),
+          get_osdmap()->get_epoch(),
+          RequestBackfill())));
+  } else {
+    dout(10) << "activate all replicas clean, no recovery" << dendl;
+    queue_peering_event(
+      CephPeeringEvtRef(
+        new CephPeeringEvt(
+          get_osdmap()->get_epoch(),
+          get_osdmap()->get_epoch(),
+          AllReplicasRecovered())));
+  }
+
+  publish_stats_to_osd();
+
   if (!backfill_targets.empty()) {
     last_backfill_started = earliest_backfill();
     new_backfill = true;
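
To make the ordering the patch enforces concrete, here is a minimal standalone
C++ sketch. It is not Ceph code: MiniPG, queue_op, on_peer_activated, and the
osd ids are invented for illustration. Client ops that arrive before activation
completes are parked in waiting_for_active, and they are only replayed once
every member of the acting set has reported that its activation transaction
committed, mirroring _activate_committed() on each OSD and the
AllReplicasActivated reaction on the primary.

// Standalone sketch (not Ceph code) of "do not serve requests until
// replicas have activated".  All names here are illustrative only.
#include <cassert>
#include <deque>
#include <iostream>
#include <set>
#include <string>

struct MiniPG {
  std::set<int> acting;                        // acting set, primary included
  std::set<int> peer_activated;                // members whose activation committed
  std::deque<std::string> waiting_for_active;  // client ops held back
  bool active = false;

  // Client I/O arriving before activation completes is queued, not served.
  void queue_op(const std::string& op) {
    if (!active) {
      waiting_for_active.push_back(op);
      return;
    }
    serve(op);
  }

  // A member of the acting set reports that its activation transaction
  // committed (the role _activate_committed()/MInfoRec play in the patch).
  void on_peer_activated(int osd) {
    peer_activated.insert(osd);
    if (peer_activated.size() == acting.size())
      all_replicas_activated();
  }

  // Analogue of the AllReplicasActivated reaction: only now do we go
  // active and replay the held-back client ops.
  void all_replicas_activated() {
    active = true;
    while (!waiting_for_active.empty()) {
      serve(waiting_for_active.front());
      waiting_for_active.pop_front();
    }
  }

  void serve(const std::string& op) {
    assert(active);  // nothing is served while the PG is not yet active
    std::cout << "serving " << op << "\n";
  }
};

int main() {
  MiniPG pg;
  pg.acting = {0, 1, 2};      // primary osd.0 plus two replicas
  pg.queue_op("write A");     // arrives early: must wait
  pg.on_peer_activated(0);    // primary's own activation commits
  pg.on_peer_activated(1);
  pg.queue_op("write B");     // still waiting: osd.2 has not activated
  pg.on_peer_activated(2);    // last member activates -> queued ops drain
  pg.queue_op("write C");     // now served immediately
}

Before this change, the equivalent of state_set(PG_STATE_ACTIVE) and the
requeue of waiting_for_active happened in activate() itself, before any peer
had acknowledged; that is the window both problems described above exploit.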