osd/: go "peered" instead of "active" when < min_size

author Samuel Just <sam.just@inktank.com>

Fri, 7 Nov 2014 23:20:02 +0000 (15:20 -0800)

committer Samuel Just <sjust@redhat.com>

Fri, 30 Jan 2015 19:45:37 +0000 (11:45 -0800)
author Samuel Just <sam.just@inktank.com>
Fri, 7 Nov 2014 23:20:02 +0000 (15:20 -0800)
committer Samuel Just <sjust@redhat.com>
Fri, 30 Jan 2015 19:45:37 +0000 (11:45 -0800)
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h

index ed2b7ee47010ea13e59a4939213e41e10ef42a06..2fb51375cfc94d04e641f132e41721dfc41eb3a1 100644 (file)
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -60,6 +60,7 @@
  #define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */
  #define CEPH_FEATURE_MDS_QUOTA      (1ULL<<47)
  #define CEPH_FEATURE_CRUSH_V4      (1ULL<<48)  /* straw2 buckets */
+#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
  
  #define CEPH_FEATURE_RESERVED2 (1ULL<<61)  /* slow down, we are almost out... */
  #define CEPH_FEATURE_RESERVED  (1ULL<<62)  /* DO NOT USE THIS ... last bit! */
@@ -144,6 +145,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
           CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT |   \
          CEPH_FEATURE_MDS_QUOTA | \
           CEPH_FEATURE_CRUSH_V4 |            \
+         CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY |           \
          0ULL)
  
  #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
diff --git a/src/osd/PG.cc b/src/osd/PG.cc

index c8b02b0dbfd48cf3a5b8fcbbc029aa0daa8accce..d120b21a98c9207ee638314a2d49df7ba2ad2ca6 100644 (file)
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -270,7 +270,7 @@ void PG::proc_master_log(
  {
    dout(10) << "proc_master_log for osd." << from << ": "
            << olog << " " << omissing << dendl;
-  assert(!is_active() && is_primary());
+  assert(!is_peered() && is_primary());
  
    // merge log into our own log to build master log.  no need to
    // make any adjustments to their missing map; we are taking their
@@ -1307,11 +1307,14 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
        ++num_want_acting;
    }
  
-  // This is a bit of a problem, if we allow the pg to go active with
-  // want.size() < min_size, we won't consider the pg to have been
-  // maybe_went_rw in build_prior.
-  if (num_want_acting < pool.info.min_size) {
+  // We go incomplete if below min_size for ec_pools since backfill
+  // does not currently maintain rollbackability
+  // Otherwise, we will go "peered", but not "active"
+  if (num_want_acting < pool.info.min_size &&
+      (pool.info.ec_pool() ||
+       !(get_min_peer_features() & CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY))) {
      want_acting.clear();
+    dout(10) << "choose_acting failed, below min size" << dendl;
      return false;
    }
  
@@ -1328,6 +1331,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
    }
    if (!(*recoverable_predicate)(have)) {
      want_acting.clear();
+    dout(10) << "choose_acting failed, not recoverable" << dendl;
      return false;
    }
  
@@ -1427,15 +1431,16 @@ void PG::build_might_have_unfound()
  struct C_PG_ActivateCommitted : public Context {
    PGRef pg;
    epoch_t epoch;
-  C_PG_ActivateCommitted(PG *p, epoch_t e)
-    : pg(p), epoch(e) {}
+  epoch_t activation_epoch;
+  C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
+    : pg(p), epoch(e), activation_epoch(ae) {}
    void finish(int r) {
-    pg->_activate_committed(epoch);
+    pg->_activate_committed(epoch, activation_epoch);
    }
  };
  
  void PG::activate(ObjectStore::Transaction& t,
-                 epoch_t query_epoch,
+                 epoch_t activation_epoch,
                   list<Context*>& tfin,
                   map<int, map<spg_t,pg_query_t> >& query_map,
                   map<int,
@@ -1444,12 +1449,13 @@ void PG::activate(ObjectStore::Transaction& t,
                              pg_interval_map_t> > > *activator_map,
                    RecoveryCtx *ctx)
  {
-  assert(!is_active());
+  assert(!is_peered());
    assert(scrubber.callbacks.empty());
    assert(callbacks_for_degraded_object.empty());
  
    // -- crash recovery?
-  if (is_primary() &&
+  if (acting.size() >= pool.info.min_size &&
+      is_primary() &&
        pool.info.crash_replay_interval > 0 &&
        may_need_replay(get_osdmap())) {
      replay_until = ceph_clock_now(cct);
@@ -1470,8 +1476,14 @@ void PG::activate(ObjectStore::Transaction& t,
  
    send_notify = false;
  
-  if (is_acting(pg_whoami))
-    info.last_epoch_started = query_epoch;
+  if (is_primary()) {
+    // only update primary last_epoch_started if we will go active
+    if (acting.size() >= pool.info.min_size)
+      info.last_epoch_started = activation_epoch;
+  } else if (is_acting(pg_whoami)) {
+    // update last_epoch_started on acting replica to whatever the primary sent
+    info.last_epoch_started = activation_epoch;
+  }
  
    const pg_missing_t &missing = pg_log.get_missing();
  
@@ -1489,7 +1501,11 @@ void PG::activate(ObjectStore::Transaction& t,
    dirty_big_info = true; // maybe
  
    // find out when we commit
-  t.register_on_complete(new C_PG_ActivateCommitted(this, query_epoch));
+  t.register_on_complete(
+    new C_PG_ActivateCommitted(
+      this,
+      get_osdmap()->get_epoch(),
+      activation_epoch));
    
    // initialize snap_trimq
    if (is_primary()) {
@@ -1573,6 +1589,7 @@ void PG::activate(ObjectStore::Transaction& t,
         pi.last_update = info.last_update;
         pi.last_complete = info.last_update;
         pi.last_backfill = hobject_t();
+       pi.last_epoch_started = info.last_epoch_started;
         pi.history = info.history;
         pi.hit_set = info.hit_set;
         pi.stats.stats.clear();
@@ -1802,32 +1819,40 @@ void PG::replay_queued_ops()
    publish_stats_to_osd();
  }
  
-void PG::_activate_committed(epoch_t e)
+void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
  {
    lock();
-  if (pg_has_reset_since(e)) {
-    dout(10) << "_activate_committed " << e << ", that was an old interval" << dendl;
+  if (pg_has_reset_since(epoch)) {
+    dout(10) << "_activate_committed " << epoch
+            << ", that was an old interval" << dendl;
    } else if (is_primary()) {
      peer_activated.insert(pg_whoami);
-    dout(10) << "_activate_committed " << e << " peer_activated now " << peer_activated 
+    dout(10) << "_activate_committed " << epoch
+            << " peer_activated now " << peer_activated 
              << " last_epoch_started " << info.history.last_epoch_started
              << " same_interval_since " << info.history.same_interval_since << dendl;
      assert(!actingbackfill.empty());
      if (peer_activated.size() == actingbackfill.size())
        all_activated_and_committed();
    } else {
-    dout(10) << "_activate_committed " << e << " telling primary" << dendl;
-    MOSDPGInfo *m = new MOSDPGInfo(e);
+    dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
+    MOSDPGInfo *m = new MOSDPGInfo(epoch);
      pg_notify_t i = pg_notify_t(
        get_primary().shard, pg_whoami.shard,
        get_osdmap()->get_epoch(),
        get_osdmap()->get_epoch(),
        info);
-    i.info.history.last_epoch_started = e;
+
+    i.info.history.last_epoch_started = activation_epoch;
+    if (acting.size() >= pool.info.min_size) {
+      state_set(PG_STATE_ACTIVE);
+    } else {
+      state_set(PG_STATE_PEERED);
+    }
+
      m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
      osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
  
-    state_set(PG_STATE_ACTIVE);
      // waiters
      if (flushes_in_progress == 0) {
        requeue_ops(waiting_for_peered);
@@ -2264,7 +2289,7 @@ void PG::_update_calc_stats()
    info.stats.stats.calc_copies(MAX(target, actingbackfill.size()));
    info.stats.stats.sum.num_objects_degraded = 0;
    info.stats.stats.sum.num_objects_misplaced = 0;
-  if ((is_degraded() || is_undersized() || !is_clean()) && is_active()) {
+  if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
      // NOTE: we only generate copies, degraded, unfound values for
      // the summation, not individual stat categories.
      uint64_t num_objects = info.stats.stats.sum.num_objects;
@@ -4729,6 +4754,7 @@ void PG::start_peering_interval(
  
    // deactivate.
    state_clear(PG_STATE_ACTIVE);
+  state_clear(PG_STATE_PEERED);
    state_clear(PG_STATE_DOWN);
    state_clear(PG_STATE_RECOVERY_WAIT);
    state_clear(PG_STATE_RECOVERING);
@@ -4860,7 +4886,7 @@ ostream& operator<<(ostream& out, const PG& pg)
         << "/" << pg.past_intervals.size();
    }
  
-  if (pg.is_active()) {
+  if (pg.is_peered()) {
      if (pg.last_update_ondisk != pg.info.last_update)
        out << " luod=" << pg.last_update_ondisk;
      if (pg.last_update_applied != pg.info.last_update)
@@ -5569,7 +5595,7 @@ PG::RecoveryState::Peering::Peering(my_context ctx)
    context< RecoveryMachine >().log_enter(state_name);
  
    PG *pg = context< RecoveryMachine >().pg;
-  assert(!pg->is_active());
+  assert(!pg->is_peered());
    assert(!pg->is_peering());
    assert(pg->is_primary());
    pg->state_set(PG_STATE_PEERING);
@@ -6410,10 +6436,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoe
    // may be telling us they have activated (and committed) but we can't
    // share that until _everyone_ does the same.
    if (pg->is_actingbackfill(infoevt.from)) {
-    assert(pg->info.history.last_epoch_started < 
-          pg->info.history.same_interval_since);
-    assert(infoevt.info.history.last_epoch_started >= 
-          pg->info.history.same_interval_since);
      dout(10) << " peer osd." << infoevt.from << " activated and committed" 
              << dendl;
      pg->peer_activated.insert(infoevt.from);
@@ -6507,7 +6529,11 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi
  
    pg->state_clear(PG_STATE_ACTIVATING);
    pg->state_clear(PG_STATE_CREATING);
-  pg->state_set(PG_STATE_ACTIVE);
+  if (pg->acting.size() >= pg->pool.info.min_size) {
+    pg->state_set(PG_STATE_ACTIVE);
+  } else {
+    pg->state_set(PG_STATE_PEERED);
+  }
  
    // info.last_epoch_started is set during activate()
    pg->info.history.last_epoch_started = pg->info.last_epoch_started;
@@ -6518,7 +6544,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi
    pg->check_local();
  
    // waiters
-  if (!pg->is_replay() && pg->flushes_in_progress == 0) {
+  if (pg->flushes_in_progress == 0) {
      pg->requeue_ops(pg->waiting_for_peered);
    }
  
@@ -6569,7 +6595,7 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react(
    PG *pg = context< RecoveryMachine >().pg;
    map<int, map<spg_t, pg_query_t> > query_map;
    pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
-              actevt.query_epoch,
+              actevt.activation_epoch,
                *context< RecoveryMachine >().get_on_safe_context_list(),
                query_map, NULL, NULL);
    dout(10) << "Activate Finished" << dendl;
@@ -6648,7 +6674,7 @@ PG::RecoveryState::Stray::Stray(my_context ctx)
    context< RecoveryMachine >().log_enter(state_name);
  
    PG *pg = context< RecoveryMachine >().pg;
-  assert(!pg->is_active());
+  assert(!pg->is_peered());
    assert(!pg->is_peering());
    assert(!pg->is_primary());
    pg->start_flush(
@@ -6683,7 +6709,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
  
    assert(pg->pg_log.get_head() == pg->info.last_update);
  
-  post_event(Activate(logevt.msg->get_epoch()));
+  post_event(Activate(logevt.msg->info.last_epoch_started));
    return transit<ReplicaActive>();
  }
  
@@ -6703,7 +6729,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoev
    assert(infoevt.info.last_update == pg->info.last_update);
    assert(pg->pg_log.get_head() == pg->info.last_update);
  
-  post_event(Activate(infoevt.msg_epoch));
+  post_event(Activate(infoevt.info.last_epoch_started));
    return transit<ReplicaActive>();
  }
  
diff --git a/src/osd/PG.h b/src/osd/PG.h

index 4408b8ca8ba71fd2da9e6a024f3c6c96511cc787..df4c156580ab74c4ca35a8fe1c923c524e3dbc7a 100644 (file)
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -970,13 +970,13 @@ public:
    void replay_queued_ops();
    void activate(
      ObjectStore::Transaction& t,
-    epoch_t query_epoch,
+    epoch_t activation_epoch,
      list<Context*>& tfin,
      map<int, map<spg_t,pg_query_t> >& query_map,
      map<int,
        vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map,
      RecoveryCtx *ctx);
-  void _activate_committed(epoch_t e);
+  void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
    void all_activated_and_committed();
  
    void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
@@ -1369,11 +1369,11 @@ public:
      }
    };
    struct Activate : boost::statechart::event< Activate > {
-    epoch_t query_epoch;
+    epoch_t activation_epoch;
      Activate(epoch_t q) : boost::statechart::event< Activate >(),
-                         query_epoch(q) {}
+                         activation_epoch(q) {}
      void print(std::ostream *out) const {
-      *out << "Activate from " << query_epoch;
+      *out << "Activate from " << activation_epoch;
      }
    };
    struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc

index 67dd16423dbe7ac1cdb3b3f3d34e836f3c9d421a..afb53d60cf6452ad72734ac6093a74202c4c0cdd 100644 (file)
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1284,7 +1284,7 @@ void ReplicatedPG::do_request(
      }
    }
  
-  assert(is_active() && flushes_in_progress == 0);
+  assert(is_peered() && flushes_in_progress == 0);
    if (pgbackend->handle_message(op))
      return;
  
@@ -10176,7 +10176,7 @@ void ReplicatedPG::on_flushed()
    if (flushes_in_progress == 0) {
      requeue_ops(waiting_for_peered);
    }
-  if (!is_active() || !is_primary()) {
+  if (!is_peered() || !is_primary()) {
      pair<hobject_t, ObjectContextRef> i;
      while (object_contexts.get_next(i.first, &i)) {
        derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
author	Samuel Just <sam.just@inktank.com>
	Fri, 7 Nov 2014 23:20:02 +0000 (15:20 -0800)
committer	Samuel Just <sjust@redhat.com>
	Fri, 30 Jan 2015 19:45:37 +0000 (11:45 -0800)
src/include/ceph_features.h		patch \| blob \| history
src/osd/PG.cc		patch \| blob \| history
src/osd/PG.h		patch \| blob \| history
src/osd/ReplicatedPG.cc		patch \| blob \| history