From 818b91b7e2af56d9564717a9cef2d353cf42cf3d Mon Sep 17 00:00:00 2001
From: Samuel Just <sam.just@inktank.com>
Date: Fri, 7 Nov 2014 15:20:02 -0800
Subject: [PATCH] osd/: go "peered" instead of "active" when < min_size

In the case of a replicated pool, the pg will transition to "peered"
rather than "active", where it can perform backfill bringing itself up
to min_size peers.  To that end, most operations now require
is_peered rather than is_active (OSDOps being the primary exception).
Also, rather than using the query_epoch on the activation message as the
activation epoch (for last_epoch_started) on the replica, we instead
use the last_epoch_started in the info sent by the primary.  This
allows the primary to not advance last_epoch_started past the last
known actual activation.  This will prevent later peering epochs from
requiring the last_update from a peered epoch to go active (might be
divergent).

Fixes: #7862
Signed-off-by: Samuel Just <sam.just@inktank.com>
---
 src/include/ceph_features.h |  2 +
 src/osd/PG.cc               | 96 +++++++++++++++++++++++--------------
 src/osd/PG.h                | 10 ++--
 src/osd/ReplicatedPG.cc     |  4 +-
 4 files changed, 70 insertions(+), 42 deletions(-)

diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index ed2b7ee47010e..2fb51375cfc94 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -60,6 +60,7 @@
 #define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */
 #define CEPH_FEATURE_MDS_QUOTA      (1ULL<<47)
 #define CEPH_FEATURE_CRUSH_V4      (1ULL<<48)  /* straw2 buckets */
+#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
 
 #define CEPH_FEATURE_RESERVED2 (1ULL<<61)  /* slow down, we are almost out... */
 #define CEPH_FEATURE_RESERVED  (1ULL<<62)  /* DO NOT USE THIS ... last bit! */
@@ -144,6 +145,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
          CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT |   \
 	 CEPH_FEATURE_MDS_QUOTA | \
          CEPH_FEATURE_CRUSH_V4 |	     \
+         CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY |		 \
 	 0ULL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index c8b02b0dbfd48..d120b21a98c92 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -270,7 +270,7 @@ void PG::proc_master_log(
 {
   dout(10) << "proc_master_log for osd." << from << ": "
 	   << olog << " " << omissing << dendl;
-  assert(!is_active() && is_primary());
+  assert(!is_peered() && is_primary());
 
   // merge log into our own log to build master log.  no need to
   // make any adjustments to their missing map; we are taking their
@@ -1307,11 +1307,14 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
       ++num_want_acting;
   }
 
-  // This is a bit of a problem, if we allow the pg to go active with
-  // want.size() < min_size, we won't consider the pg to have been
-  // maybe_went_rw in build_prior.
-  if (num_want_acting < pool.info.min_size) {
+  // We go incomplete if below min_size for ec_pools since backfill
+  // does not currently maintain rollbackability
+  // Otherwise, we will go "peered", but not "active"
+  if (num_want_acting < pool.info.min_size &&
+      (pool.info.ec_pool() ||
+       !(get_min_peer_features() & CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY))) {
     want_acting.clear();
+    dout(10) << "choose_acting failed, below min size" << dendl;
     return false;
   }
 
@@ -1328,6 +1331,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
   }
   if (!(*recoverable_predicate)(have)) {
     want_acting.clear();
+    dout(10) << "choose_acting failed, not recoverable" << dendl;
     return false;
   }
 
@@ -1427,15 +1431,16 @@ void PG::build_might_have_unfound()
 struct C_PG_ActivateCommitted : public Context {
   PGRef pg;
   epoch_t epoch;
-  C_PG_ActivateCommitted(PG *p, epoch_t e)
-    : pg(p), epoch(e) {}
+  epoch_t activation_epoch;
+  C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
+    : pg(p), epoch(e), activation_epoch(ae) {}
   void finish(int r) {
-    pg->_activate_committed(epoch);
+    pg->_activate_committed(epoch, activation_epoch);
   }
 };
 
 void PG::activate(ObjectStore::Transaction& t,
-		  epoch_t query_epoch,
+		  epoch_t activation_epoch,
 		  list<Context*>& tfin,
 		  map<int, map<spg_t,pg_query_t> >& query_map,
 		  map<int,
@@ -1444,12 +1449,13 @@ void PG::activate(ObjectStore::Transaction& t,
 			     pg_interval_map_t> > > *activator_map,
                   RecoveryCtx *ctx)
 {
-  assert(!is_active());
+  assert(!is_peered());
   assert(scrubber.callbacks.empty());
   assert(callbacks_for_degraded_object.empty());
 
   // -- crash recovery?
-  if (is_primary() &&
+  if (acting.size() >= pool.info.min_size &&
+      is_primary() &&
       pool.info.crash_replay_interval > 0 &&
       may_need_replay(get_osdmap())) {
     replay_until = ceph_clock_now(cct);
@@ -1470,8 +1476,14 @@ void PG::activate(ObjectStore::Transaction& t,
 
   send_notify = false;
 
-  if (is_acting(pg_whoami))
-    info.last_epoch_started = query_epoch;
+  if (is_primary()) {
+    // only update primary last_epoch_started if we will go active
+    if (acting.size() >= pool.info.min_size)
+      info.last_epoch_started = activation_epoch;
+  } else if (is_acting(pg_whoami)) {
+    // update last_epoch_started on acting replica to whatever the primary sent
+    info.last_epoch_started = activation_epoch;
+  }
 
   const pg_missing_t &missing = pg_log.get_missing();
 
@@ -1489,7 +1501,11 @@ void PG::activate(ObjectStore::Transaction& t,
   dirty_big_info = true; // maybe
 
   // find out when we commit
-  t.register_on_complete(new C_PG_ActivateCommitted(this, query_epoch));
+  t.register_on_complete(
+    new C_PG_ActivateCommitted(
+      this,
+      get_osdmap()->get_epoch(),
+      activation_epoch));
   
   // initialize snap_trimq
   if (is_primary()) {
@@ -1573,6 +1589,7 @@ void PG::activate(ObjectStore::Transaction& t,
 	pi.last_update = info.last_update;
 	pi.last_complete = info.last_update;
 	pi.last_backfill = hobject_t();
+	pi.last_epoch_started = info.last_epoch_started;
 	pi.history = info.history;
 	pi.hit_set = info.hit_set;
 	pi.stats.stats.clear();
@@ -1802,32 +1819,40 @@ void PG::replay_queued_ops()
   publish_stats_to_osd();
 }
 
-void PG::_activate_committed(epoch_t e)
+void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
 {
   lock();
-  if (pg_has_reset_since(e)) {
-    dout(10) << "_activate_committed " << e << ", that was an old interval" << dendl;
+  if (pg_has_reset_since(epoch)) {
+    dout(10) << "_activate_committed " << epoch
+	     << ", that was an old interval" << dendl;
   } else if (is_primary()) {
     peer_activated.insert(pg_whoami);
-    dout(10) << "_activate_committed " << e << " peer_activated now " << peer_activated 
+    dout(10) << "_activate_committed " << epoch
+	     << " peer_activated now " << peer_activated 
 	     << " last_epoch_started " << info.history.last_epoch_started
 	     << " same_interval_since " << info.history.same_interval_since << dendl;
     assert(!actingbackfill.empty());
     if (peer_activated.size() == actingbackfill.size())
       all_activated_and_committed();
   } else {
-    dout(10) << "_activate_committed " << e << " telling primary" << dendl;
-    MOSDPGInfo *m = new MOSDPGInfo(e);
+    dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
+    MOSDPGInfo *m = new MOSDPGInfo(epoch);
     pg_notify_t i = pg_notify_t(
       get_primary().shard, pg_whoami.shard,
       get_osdmap()->get_epoch(),
       get_osdmap()->get_epoch(),
       info);
-    i.info.history.last_epoch_started = e;
+
+    i.info.history.last_epoch_started = activation_epoch;
+    if (acting.size() >= pool.info.min_size) {
+      state_set(PG_STATE_ACTIVE);
+    } else {
+      state_set(PG_STATE_PEERED);
+    }
+
     m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
     osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
 
-    state_set(PG_STATE_ACTIVE);
     // waiters
     if (flushes_in_progress == 0) {
       requeue_ops(waiting_for_peered);
@@ -2264,7 +2289,7 @@ void PG::_update_calc_stats()
   info.stats.stats.calc_copies(MAX(target, actingbackfill.size()));
   info.stats.stats.sum.num_objects_degraded = 0;
   info.stats.stats.sum.num_objects_misplaced = 0;
-  if ((is_degraded() || is_undersized() || !is_clean()) && is_active()) {
+  if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
     // NOTE: we only generate copies, degraded, unfound values for
     // the summation, not individual stat categories.
     uint64_t num_objects = info.stats.stats.sum.num_objects;
@@ -4729,6 +4754,7 @@ void PG::start_peering_interval(
 
   // deactivate.
   state_clear(PG_STATE_ACTIVE);
+  state_clear(PG_STATE_PEERED);
   state_clear(PG_STATE_DOWN);
   state_clear(PG_STATE_RECOVERY_WAIT);
   state_clear(PG_STATE_RECOVERING);
@@ -4860,7 +4886,7 @@ ostream& operator<<(ostream& out, const PG& pg)
 	<< "/" << pg.past_intervals.size();
   }
 
-  if (pg.is_active()) {
+  if (pg.is_peered()) {
     if (pg.last_update_ondisk != pg.info.last_update)
       out << " luod=" << pg.last_update_ondisk;
     if (pg.last_update_applied != pg.info.last_update)
@@ -5569,7 +5595,7 @@ PG::RecoveryState::Peering::Peering(my_context ctx)
   context< RecoveryMachine >().log_enter(state_name);
 
   PG *pg = context< RecoveryMachine >().pg;
-  assert(!pg->is_active());
+  assert(!pg->is_peered());
   assert(!pg->is_peering());
   assert(pg->is_primary());
   pg->state_set(PG_STATE_PEERING);
@@ -6410,10 +6436,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoe
   // may be telling us they have activated (and committed) but we can't
   // share that until _everyone_ does the same.
   if (pg->is_actingbackfill(infoevt.from)) {
-    assert(pg->info.history.last_epoch_started < 
-	   pg->info.history.same_interval_since);
-    assert(infoevt.info.history.last_epoch_started >= 
-	   pg->info.history.same_interval_since);
     dout(10) << " peer osd." << infoevt.from << " activated and committed" 
 	     << dendl;
     pg->peer_activated.insert(infoevt.from);
@@ -6507,7 +6529,11 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi
 
   pg->state_clear(PG_STATE_ACTIVATING);
   pg->state_clear(PG_STATE_CREATING);
-  pg->state_set(PG_STATE_ACTIVE);
+  if (pg->acting.size() >= pg->pool.info.min_size) {
+    pg->state_set(PG_STATE_ACTIVE);
+  } else {
+    pg->state_set(PG_STATE_PEERED);
+  }
 
   // info.last_epoch_started is set during activate()
   pg->info.history.last_epoch_started = pg->info.last_epoch_started;
@@ -6518,7 +6544,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi
   pg->check_local();
 
   // waiters
-  if (!pg->is_replay() && pg->flushes_in_progress == 0) {
+  if (pg->flushes_in_progress == 0) {
     pg->requeue_ops(pg->waiting_for_peered);
   }
 
@@ -6569,7 +6595,7 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react(
   PG *pg = context< RecoveryMachine >().pg;
   map<int, map<spg_t, pg_query_t> > query_map;
   pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
-	       actevt.query_epoch,
+	       actevt.activation_epoch,
 	       *context< RecoveryMachine >().get_on_safe_context_list(),
 	       query_map, NULL, NULL);
   dout(10) << "Activate Finished" << dendl;
@@ -6648,7 +6674,7 @@ PG::RecoveryState::Stray::Stray(my_context ctx)
   context< RecoveryMachine >().log_enter(state_name);
 
   PG *pg = context< RecoveryMachine >().pg;
-  assert(!pg->is_active());
+  assert(!pg->is_peered());
   assert(!pg->is_peering());
   assert(!pg->is_primary());
   pg->start_flush(
@@ -6683,7 +6709,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
 
   assert(pg->pg_log.get_head() == pg->info.last_update);
 
-  post_event(Activate(logevt.msg->get_epoch()));
+  post_event(Activate(logevt.msg->info.last_epoch_started));
   return transit<ReplicaActive>();
 }
 
@@ -6703,7 +6729,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoev
   assert(infoevt.info.last_update == pg->info.last_update);
   assert(pg->pg_log.get_head() == pg->info.last_update);
 
-  post_event(Activate(infoevt.msg_epoch));
+  post_event(Activate(infoevt.info.last_epoch_started));
   return transit<ReplicaActive>();
 }
 
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 4408b8ca8ba71..df4c156580ab7 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -970,13 +970,13 @@ public:
   void replay_queued_ops();
   void activate(
     ObjectStore::Transaction& t,
-    epoch_t query_epoch,
+    epoch_t activation_epoch,
     list<Context*>& tfin,
     map<int, map<spg_t,pg_query_t> >& query_map,
     map<int,
       vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map,
     RecoveryCtx *ctx);
-  void _activate_committed(epoch_t e);
+  void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
   void all_activated_and_committed();
 
   void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
@@ -1369,11 +1369,11 @@ public:
     }
   };
   struct Activate : boost::statechart::event< Activate > {
-    epoch_t query_epoch;
+    epoch_t activation_epoch;
     Activate(epoch_t q) : boost::statechart::event< Activate >(),
-			  query_epoch(q) {}
+			  activation_epoch(q) {}
     void print(std::ostream *out) const {
-      *out << "Activate from " << query_epoch;
+      *out << "Activate from " << activation_epoch;
     }
   };
   struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 67dd16423dbe7..afb53d60cf645 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1284,7 +1284,7 @@ void ReplicatedPG::do_request(
     }
   }
 
-  assert(is_active() && flushes_in_progress == 0);
+  assert(is_peered() && flushes_in_progress == 0);
   if (pgbackend->handle_message(op))
     return;
 
@@ -10176,7 +10176,7 @@ void ReplicatedPG::on_flushed()
   if (flushes_in_progress == 0) {
     requeue_ops(waiting_for_peered);
   }
-  if (!is_active() || !is_primary()) {
+  if (!is_peered() || !is_primary()) {
     pair<hobject_t, ObjectContextRef> i;
     while (object_contexts.get_next(i.first, &i)) {
       derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
-- 
2.39.5