]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/: add pool min_size parameter for min acting set size
authorSamuel Just <sam.just@inktank.com>
Mon, 29 Oct 2012 22:35:09 +0000 (15:35 -0700)
committerSamuel Just <sam.just@inktank.com>
Wed, 7 Nov 2012 19:37:42 +0000 (11:37 -0800)
Otherwise, a pg might go active with a single osd in the
acting set.  If that osd subsequently dies, we potentially
lose client writes.  Note: it's still possible for the
acting set to exceed min_size but fail to obey the spirit
of the user's crush settings (e.g., min_size is 2, but both
osds happen to be on the same node).

Signed-off-by: Samuel Just <sam.just@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
src/common/config_opts.h
src/mon/OSDMonitor.cc
src/osd/OSD.cc
src/osd/OSDMap.cc
src/osd/PG.cc
src/osd/PG.h
src/osd/osd_types.cc
src/osd/osd_types.h

index a691ee4e67771f0d1f64092a5afab3a61aaa42ab..8a5f7e994d7c8baf86a80569420a355e3512bf43 100644 (file)
@@ -299,6 +299,7 @@ OPTION(osd_min_rep, OPT_INT, 1)
 OPTION(osd_max_rep, OPT_INT, 10)
 OPTION(osd_pool_default_crush_rule, OPT_INT, 0)
 OPTION(osd_pool_default_size, OPT_INT, 2)
+OPTION(osd_pool_default_min_size, OPT_INT, 2)
 OPTION(osd_pool_default_pg_num, OPT_INT, 8)
 OPTION(osd_pool_default_pgp_num, OPT_INT, 8)
 OPTION(osd_map_dedup, OPT_BOOL, true)
index bbc42a0ecc54b8a29383c528f3eb33b1229b66e3..a9bf4f582905e855c1c6ae2d7fa6b0e33b873e2d 100644 (file)
@@ -1876,6 +1876,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
   pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP;
 
   pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size;
+  pending_inc.new_pools[pool].min_size =
+    g_conf->osd_pool_default_min_size;
   if (crush_rule >= 0)
     pending_inc.new_pools[pool].crush_ruleset = crush_rule;
   else
@@ -2653,11 +2655,22 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
              if (pending_inc.new_pools.count(pool) == 0)
                pending_inc.new_pools[pool] = *p;
              pending_inc.new_pools[pool].size = n;
+             if (n < p->min_size)
+               pending_inc.new_pools[pool].min_size = n;
              pending_inc.new_pools[pool].last_change = pending_inc.epoch;
              ss << "set pool " << pool << " size to " << n;
              getline(ss, rs);
              paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version()));
              return true;
+           } else if (m->cmd[4] == "min_size") {
+             if (pending_inc.new_pools.count(pool) == 0)
+               pending_inc.new_pools[pool] = *p;
+             pending_inc.new_pools[pool].min_size = n;
+             pending_inc.new_pools[pool].last_change = pending_inc.epoch;
+             ss << "set pool " << pool << " min_size to " << n;
+             getline(ss, rs);
+             paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version()));
+             return true;
            } else if (m->cmd[4] == "crash_replay_interval") {
              if (pending_inc.new_pools.count(pool) == 0)
                pending_inc.new_pools[pool] = *p;
index 20843021d886e92aad8d1b17f2f31a155d5dcc9a..4aa33d9a9dac9c5752da8f84d8ff1a24148a4b16 100644 (file)
@@ -1543,6 +1543,7 @@ void OSD::build_past_intervals_parallel()
                                                            p.same_interval_since,
                                                            pg->info.history.last_epoch_clean,
                                                            cur_map, last_map,
+                                                           pg->info.pgid.pool(),
                                                            &pg->past_intervals,
                                                            &debug);
       if (new_interval) {
index 2c38c807a2c0bc43ba1c596f25c35e06201b098b..e0c4779d9004f376959000462131102318ca9ed4 100644 (file)
@@ -1609,6 +1609,7 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
     int64_t pool = ++pool_max;
     pools[pool].type = pg_pool_t::TYPE_REP;
     pools[pool].size = cct->_conf->osd_pool_default_size;
+    pools[pool].min_size = cct->_conf->osd_pool_default_min_size;
     pools[pool].crush_ruleset = p->first;
     pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
     pools[pool].set_pg_num(poolbase << pg_bits);
@@ -1730,6 +1731,7 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid,
     int64_t pool = ++pool_max;
     pools[pool].type = pg_pool_t::TYPE_REP;
     pools[pool].size = cct->_conf->osd_pool_default_size;
+    pools[pool].min_size = cct->_conf->osd_pool_default_min_size;
     pools[pool].crush_ruleset = p->first;
     pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
     pools[pool].set_pg_num((numosd + 1) << pg_bits);
index 51dae5458f11da7f9ccd0fbdf03fb407693f4061..7fdc372ed200f503808d1bbb0b76350dd9e7d984 100644 (file)
@@ -843,6 +843,7 @@ void PG::generate_past_intervals()
       info.history.last_epoch_clean,
       cur_map,
       last_map,
+      info.pgid.pool(),
       &past_intervals,
       &debug);
     if (new_interval) {
@@ -1243,6 +1244,11 @@ bool PG::choose_acting(int& newest_update_osd)
     return false;
   }
 
+  if (want.size() < pool.info.min_size) {
+    want_acting.clear();
+    return false;
+  }
+
   if (want != acting) {
     dout(10) << "choose_acting want " << want << " != acting " << acting
             << ", requesting pg_temp change" << dendl;
@@ -4350,7 +4356,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
       info.history.same_interval_since,
       info.history.last_epoch_clean,
       osdmap,
-      lastmap, &past_intervals);
+      lastmap, info.pgid.pool(), &past_intervals);
     if (new_interval) {
       dout(10) << " noting past " << past_intervals.rbegin()->second << dendl;
       dirty_info = true;
@@ -6126,6 +6132,20 @@ PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
   pg->update_stats();
 }
 
+boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
+  PG *pg = context< RecoveryMachine >().pg;
+  int64_t poolnum = pg->info.pgid.pool();
+
+  // Reset if min_size changed, pg might now be able to go active
+  if (advmap.lastmap->get_pools().find(poolnum)->second.min_size !=
+      advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
+    post_event(advmap);
+    return transit< Reset >();
+  }
+
+  return forward_event();
+}
+
 void PG::RecoveryState::Incomplete::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
index 5d9f2280c796e03d3fae897570fd8f7e8b77496d..384aef725b03084ab31f6ad321407902422ba79b 100644 (file)
@@ -1510,7 +1510,11 @@ public:
     };
 
     struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
+      typedef boost::mpl::list <
+       boost::statechart::custom_reaction< AdvMap >
+       > reactions;
       Incomplete(my_context ctx);
+      boost::statechart::result react(const AdvMap &advmap);
       void exit();
     };
 
index 5c7411043cfc90fc5af01cc9ccba10c48e5b3584..404dec8aa782676be3f1f3cadc8996c79b32ed0b 100644 (file)
@@ -695,7 +695,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
     return;
   }
 
-  ENCODE_START(6, 5, bl);
+  ENCODE_START(7, 5, bl);
   ::encode(type, bl);
   ::encode(size, bl);
   ::encode(crush_ruleset, bl);
@@ -713,12 +713,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(auid, bl);
   ::encode(flags, bl);
   ::encode(crash_replay_interval, bl);
+  ::encode(min_size, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_pool_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
   ::decode(type, bl);
   ::decode(size, bl);
   ::decode(crush_ruleset, bl);
@@ -762,6 +763,11 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
     else
       crash_replay_interval = 0;
   }
+  if (struct_v >= 7) {
+    ::decode(min_size, bl);
+  } else {
+    min_size = MAX(size - 1, 1);
+  }
   DECODE_FINISH(bl);
   calc_pg_masks();
 }
@@ -1462,18 +1468,23 @@ bool pg_interval_t::check_new_interval(
   epoch_t last_epoch_clean,
   OSDMapRef osdmap,
   OSDMapRef lastmap,
+  int64_t pool_id,
   map<epoch_t, pg_interval_t> *past_intervals,
   std::ostream *out)
 {
   // remember past interval
-  if (new_acting != old_acting || new_up != old_up) {
+  if (new_acting != old_acting || new_up != old_up ||
+      (!(lastmap->get_pools().count(pool_id))) ||
+      lastmap->get_pools().find(pool_id)->second.min_size !=
+      osdmap->get_pools().find(pool_id)->second.min_size) {
     pg_interval_t& i = (*past_intervals)[same_interval_since];
     i.first = same_interval_since;
     i.last = osdmap->get_epoch() - 1;
     i.acting = old_acting;
     i.up = old_up;
 
-    if (i.acting.size()) {
+    if (i.acting.size() >=
+       osdmap->get_pools().find(pool_id)->second.min_size) {
       if (lastmap->get_up_thru(i.acting[0]) >= i.first &&
          lastmap->get_up_from(i.acting[0]) <= i.first) {
        i.maybe_went_rw = true;
index 64a958e3a4a70188ada1614813d12f55f73c4076..6250810b48b3ff1253f53d798cf42f7f85b19ba1 100644 (file)
@@ -622,7 +622,7 @@ struct pg_pool_t {
 
   uint64_t flags;           /// FLAG_* 
   __u8 type;                /// TYPE_*
-  __u8 size;                /// number of osds in each pg
+  __u8 size, min_size;      /// number of osds in each pg
   __u8 crush_ruleset;       /// crush placement rule set
   __u8 object_hash;         /// hash mapping object name to ps
 private:
@@ -651,7 +651,8 @@ public:
   int pg_num_mask, pgp_num_mask;
 
   pg_pool_t()
-    : flags(0), type(0), size(0), crush_ruleset(0), object_hash(0),
+    : flags(0), type(0), size(0), min_size(0),
+      crush_ruleset(0), object_hash(0),
       pg_num(0), pgp_num(0),
       last_change(0),
       snap_seq(0), snap_epoch(0),
@@ -1139,6 +1140,7 @@ struct pg_interval_t {
     epoch_t last_epoch_clean,                   ///< [in] current
     std::tr1::shared_ptr<const OSDMap> osdmap,  ///< [in] current map
     std::tr1::shared_ptr<const OSDMap> lastmap, ///< [in] last map
+    int64_t poolid,                             ///< [in] pool for pg
     map<epoch_t, pg_interval_t> *past_intervals,///< [out] intervals
     ostream *out = 0                            ///< [out] debug ostream
     );