From 13486857cf3940d9fe910cbcc57fe5c882af3f3d Mon Sep 17 00:00:00 2001
From: Samuel Just
Date: Mon, 29 Oct 2012 15:35:09 -0700
Subject: [PATCH] osd/: add pool min_size parameter for min acting set size

Otherwise, a pg might go active with a single osd in the acting set.
If that osd subsequently dies, we potentially lose client writes.

Note: it's still possible for the acting set to exceed min_size but
fail to obey the spirit of the user's crush settings (e.g., min_size
is 2, but both osds happen to be on the same node).

Signed-off-by: Samuel Just
Reviewed-by: Sage Weil
---
 src/common/config_opts.h |  1 +
 src/mon/OSDMonitor.cc    | 13 +++++++++++++
 src/osd/OSD.cc           |  1 +
 src/osd/OSDMap.cc        |  2 ++
 src/osd/PG.cc            | 22 +++++++++++++++++++++-
 src/osd/PG.h             |  4 ++++
 src/osd/osd_types.cc     | 19 +++++++++++++++----
 src/osd/osd_types.h      |  6 ++++--
 8 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index a691ee4e67771..8a5f7e994d7c8 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -299,6 +299,7 @@ OPTION(osd_min_rep, OPT_INT, 1)
 OPTION(osd_max_rep, OPT_INT, 10)
 OPTION(osd_pool_default_crush_rule, OPT_INT, 0)
 OPTION(osd_pool_default_size, OPT_INT, 2)
+OPTION(osd_pool_default_min_size, OPT_INT, 2)
 OPTION(osd_pool_default_pg_num, OPT_INT, 8)
 OPTION(osd_pool_default_pgp_num, OPT_INT, 8)
 OPTION(osd_map_dedup, OPT_BOOL, true)
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index bbc42a0ecc54b..a9bf4f582905e 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1876,6 +1876,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
   pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP;
   pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size;
+  pending_inc.new_pools[pool].min_size =
+    g_conf->osd_pool_default_min_size;
   if (crush_rule >= 0)
     pending_inc.new_pools[pool].crush_ruleset = crush_rule;
   else
@@ -2653,11 +2655,22 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
       if (pending_inc.new_pools.count(pool) == 0)
         pending_inc.new_pools[pool] = *p;
       pending_inc.new_pools[pool].size = n;
+      if (n < p->min_size)
+        pending_inc.new_pools[pool].min_size = n;
       pending_inc.new_pools[pool].last_change = pending_inc.epoch;
       ss << "set pool " << pool << " size to " << n;
       getline(ss, rs);
       paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version()));
       return true;
+    } else if (m->cmd[4] == "min_size") {
+      if (pending_inc.new_pools.count(pool) == 0)
+        pending_inc.new_pools[pool] = *p;
+      pending_inc.new_pools[pool].min_size = n;
+      pending_inc.new_pools[pool].last_change = pending_inc.epoch;
+      ss << "set pool " << pool << " min_size to " << n;
+      getline(ss, rs);
+      paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version()));
+      return true;
     } else if (m->cmd[4] == "crash_replay_interval") {
       if (pending_inc.new_pools.count(pool) == 0)
         pending_inc.new_pools[pool] = *p;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 20843021d886e..4aa33d9a9dac9 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1543,6 +1543,7 @@ void OSD::build_past_intervals_parallel()
       p.same_interval_since,
       pg->info.history.last_epoch_clean,
       cur_map, last_map,
+      pg->info.pgid.pool(),
       &pg->past_intervals,
       &debug);
     if (new_interval) {
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 2c38c807a2c0b..e0c4779d9004f 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1609,6 +1609,7 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
     int64_t pool = ++pool_max;
     pools[pool].type = pg_pool_t::TYPE_REP;
     pools[pool].size = cct->_conf->osd_pool_default_size;
+    pools[pool].min_size = cct->_conf->osd_pool_default_min_size;
     pools[pool].crush_ruleset = p->first;
     pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
     pools[pool].set_pg_num(poolbase << pg_bits);
@@ -1730,6 +1731,7 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid,
     int64_t pool = ++pool_max;
     pools[pool].type = pg_pool_t::TYPE_REP;
     pools[pool].size = cct->_conf->osd_pool_default_size;
+    pools[pool].min_size = cct->_conf->osd_pool_default_min_size;
     pools[pool].crush_ruleset = p->first;
     pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
     pools[pool].set_pg_num((numosd + 1) << pg_bits);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 51dae5458f11d..7fdc372ed200f 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -843,6 +843,7 @@ void PG::generate_past_intervals()
       info.history.last_epoch_clean,
       cur_map,
       last_map,
+      info.pgid.pool(),
       &past_intervals,
       &debug);
     if (new_interval) {
@@ -1243,6 +1244,11 @@ bool PG::choose_acting(int& newest_update_osd)
     return false;
   }
 
+  if (want.size() < pool.info.min_size) {
+    want_acting.clear();
+    return false;
+  }
+
   if (want != acting) {
     dout(10) << "choose_acting want " << want << " != acting " << acting
              << ", requesting pg_temp change" << dendl;
@@ -4350,7 +4356,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
                                  info.history.same_interval_since,
                                  info.history.last_epoch_clean,
                                  osdmap,
-                                 lastmap, &past_intervals);
+                                 lastmap, info.pgid.pool(), &past_intervals);
   if (new_interval) {
     dout(10) << " noting past " << past_intervals.rbegin()->second << dendl;
     dirty_info = true;
@@ -6126,6 +6132,20 @@ PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
   pg->update_stats();
 }
 
+boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
+  PG *pg = context< RecoveryMachine >().pg;
+  int64_t poolnum = pg->info.pgid.pool();
+
+  // Reset if min_size changed, pg might now be able to go active
+  if (advmap.lastmap->get_pools().find(poolnum)->second.min_size !=
+      advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
+    post_event(advmap);
+    return transit< Reset >();
+  }
+
+  return forward_event();
+}
+
 void PG::RecoveryState::Incomplete::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 5d9f2280c796e..384aef725b030 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1510,7 +1510,11 @@ public:
   };
 
   struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
+    typedef boost::mpl::list <
+      boost::statechart::custom_reaction< AdvMap >
+      > reactions;
     Incomplete(my_context ctx);
+    boost::statechart::result react(const AdvMap &advmap);
     void exit();
   };

diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 5c7411043cfc9..404dec8aa7826 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -695,7 +695,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
     return;
   }
 
-  ENCODE_START(6, 5, bl);
+  ENCODE_START(7, 5, bl);
   ::encode(type, bl);
   ::encode(size, bl);
   ::encode(crush_ruleset, bl);
@@ -713,12 +713,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(auid, bl);
   ::encode(flags, bl);
   ::encode(crash_replay_interval, bl);
+  ::encode(min_size, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_pool_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
   ::decode(type, bl);
   ::decode(size, bl);
   ::decode(crush_ruleset, bl);
@@ -762,6 +763,11 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
     else
       crash_replay_interval = 0;
   }
+  if (struct_v >= 7) {
+    ::decode(min_size, bl);
+  } else {
+    min_size = MAX(size - 1, 1);
+  }
   DECODE_FINISH(bl);
   calc_pg_masks();
 }
@@ -1462,18 +1468,23 @@ bool pg_interval_t::check_new_interval(
   epoch_t last_epoch_clean,
   OSDMapRef osdmap,
   OSDMapRef lastmap,
+  int64_t pool_id,
   map<epoch_t, pg_interval_t> *past_intervals,
   std::ostream *out)
 {
   // remember past interval
-  if (new_acting != old_acting || new_up != old_up) {
+  if (new_acting != old_acting || new_up != old_up ||
+      (!(lastmap->get_pools().count(pool_id))) ||
+      lastmap->get_pools().find(pool_id)->second.min_size !=
+      osdmap->get_pools().find(pool_id)->second.min_size) {
     pg_interval_t& i = (*past_intervals)[same_interval_since];
     i.first = same_interval_since;
     i.last = osdmap->get_epoch() - 1;
     i.acting = old_acting;
     i.up = old_up;
-    if (i.acting.size()) {
+    if (i.acting.size() >=
+        osdmap->get_pools().find(pool_id)->second.min_size) {
       if (lastmap->get_up_thru(i.acting[0]) >= i.first &&
           lastmap->get_up_from(i.acting[0]) <= i.first) {
         i.maybe_went_rw = true;
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 64a958e3a4a70..6250810b48b3f 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -622,7 +622,7 @@ struct pg_pool_t {
   uint64_t flags;       /// FLAG_*
   __u8 type;            /// TYPE_*
-  __u8 size;            /// number of osds in each pg
+  __u8 size, min_size;  /// number of osds in each pg
   __u8 crush_ruleset;   /// crush placement rule set
   __u8 object_hash;     /// hash mapping object name to ps
 private:
@@ -651,7 +651,8 @@ public:
   int pg_num_mask, pgp_num_mask;
 
   pg_pool_t()
-    : flags(0), type(0), size(0), crush_ruleset(0), object_hash(0),
+    : flags(0), type(0), size(0), min_size(0),
+      crush_ruleset(0), object_hash(0),
       pg_num(0), pgp_num(0),
       last_change(0),
       snap_seq(0), snap_epoch(0),
@@ -1139,6 +1140,7 @@ struct pg_interval_t {
     epoch_t last_epoch_clean,                     ///< [in] current
     std::tr1::shared_ptr<const OSDMap> osdmap,    ///< [in] current map
     std::tr1::shared_ptr<const OSDMap> lastmap,   ///< [in] last map
+    int64_t poolid,                               ///< [in] pool for pg
     map<epoch_t, pg_interval_t> *past_intervals,  ///< [out] intervals
     ostream *out = 0                              ///< [out] debug ostream
     );
-- 
2.39.5
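
As a quick illustration of the behavior this patch introduces, the sketch
below (standalone C++, not Ceph code; PoolSketch and can_go_active are
invented stand-ins for pg_pool_t and PG::choose_acting) shows the two new
rules: a PG whose wanted acting set is smaller than the pool's min_size
refuses to go active, and maps encoded before struct_v 7 fall back to
min_size = MAX(size - 1, 1). At runtime the value is presumably adjusted
through the new "min_size" branch in OSDMonitor::prepare_command (i.e. an
"osd pool set <pool> min_size <n>" monitor command) or defaulted from the
new osd_pool_default_min_size option.

// Standalone sketch, not Ceph code: PoolSketch and can_go_active are
// invented stand-ins for pg_pool_t and the new check in PG::choose_acting().
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct PoolSketch {
  uint8_t size;      // replica count (osd_pool_default_size, 2 by default)
  uint8_t min_size;  // new knob (osd_pool_default_min_size, 2 by default)

  // Mirrors the struct_v < 7 fallback in pg_pool_t::decode():
  // old encodings carry no min_size, so derive one from size.
  void apply_legacy_default() {
    min_size = std::max(static_cast<int>(size) - 1, 1);
  }
};

// Mirrors the new early return in PG::choose_acting(): a PG whose wanted
// acting set is smaller than min_size must not be allowed to go active.
bool can_go_active(const std::vector<int>& want, const PoolSketch& pool) {
  return want.size() >= pool.min_size;
}

int main() {
  PoolSketch pool = {2, 2};      // size = 2, min_size = 2
  std::vector<int> want = {3};   // only one OSD left in the wanted acting set

  std::cout << can_go_active(want, pool) << "\n";  // 0: blocked by min_size

  pool.apply_legacy_default();   // pre-v7 map: min_size = max(size - 1, 1) = 1
  std::cout << can_go_active(want, pool) << "\n";  // 1: one OSD is now enough
  return 0;
}

The legacy fallback keeps pools from older maps writable with one fewer
replica than their size while never dropping below a single OSD, so existing
clusters see no behavior change until an operator raises min_size explicitly.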