From: Guang G Yang Date: Thu, 2 Jul 2015 05:29:47 +0000 (+0000) Subject: osd: pg_interval_t::check_new_interval should not rely on pool.min_size to determine... X-Git-Tag: v9.1.0~497^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=684927442d81ea08f95878a8af69d08d3a14d973;p=ceph.git osd: pg_interval_t::check_new_interval should not rely on pool.min_size to determine if the PG was active If the pool's min_size is set improperly, during peering, pg_interval_t::check_new_interval might wrongly determine the PG's state and cause the PG to stuck at down+peering forever Fixes: #12162 Signed-off-by: Guang Yang yguang@yahoo-inc.com --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7c2d8501a96..ebc2d6d8ee2 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3049,6 +3049,8 @@ void OSD::build_past_intervals_parallel() } assert(last_map); + boost::scoped_ptr recoverable( + pg->get_is_recoverable_predicate()); std::stringstream debug; bool new_interval = pg_interval_t::check_new_interval( p.primary, @@ -3061,6 +3063,7 @@ void OSD::build_past_intervals_parallel() pg->info.history.last_epoch_clean, cur_map, last_map, pgid, + recoverable.get(), &pg->past_intervals, &debug); if (new_interval) { diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 7bb81bfdca4..fe653036279 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -709,6 +709,8 @@ void PG::generate_past_intervals() pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool())); cur_map->pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &primary); + boost::scoped_ptr recoverable( + get_is_recoverable_predicate()); std::stringstream debug; bool new_interval = pg_interval_t::check_new_interval( old_primary, @@ -724,6 +726,7 @@ void PG::generate_past_intervals() cur_map, last_map, pgid, + recoverable.get(), &past_intervals, &debug); if (new_interval) { @@ -4656,6 +4659,8 @@ void PG::start_peering_interval( } else { std::stringstream debug; assert(info.history.same_interval_since != 0); + boost::scoped_ptr recoverable( + get_is_recoverable_predicate()); bool new_interval = pg_interval_t::check_new_interval( old_acting_primary.osd, new_acting_primary, @@ -4668,6 +4673,7 @@ void PG::start_peering_interval( osdmap, lastmap, info.pgid.pgid, + recoverable.get(), &past_intervals, &debug); dout(10) << __func__ << ": check_new_interval output: " diff --git a/src/osd/PG.h b/src/osd/PG.h index b1aff2addc7..6c089b316f7 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -197,6 +197,10 @@ public: void update_snap_mapper_bits(uint32_t bits) { snap_mapper.update_bits(bits); } + /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done + IsPGRecoverablePredicate *get_is_recoverable_predicate() { + return get_pgbackend()->get_is_recoverable_predicate(); + } protected: // Ops waiting for map, should be queued at back Mutex map_lock; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 4bc95e4dd18..dbcbd3dab35 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -947,6 +947,16 @@ void pg_pool_t::dump(Formatter *f) const f->dump_unsigned("expected_num_objects", expected_num_objects); } +void pg_pool_t::convert_to_pg_shards(const vector &from, set* to) const { + for (size_t i = 0; i < from.size(); ++i) { + if (from[i] != CRUSH_ITEM_NONE) { + to->insert( + pg_shard_t( + from[i], + ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + } +} int pg_pool_t::calc_bits_of(int t) { @@ -2630,6 +2640,7 @@ bool pg_interval_t::check_new_interval( OSDMapRef osdmap, OSDMapRef lastmap, pg_t pgid, + IsPGRecoverablePredicate *could_have_gone_active, map *past_intervals, std::ostream *out) { @@ -2663,9 +2674,14 @@ bool pg_interval_t::check_new_interval( if (*p != CRUSH_ITEM_NONE) ++num_acting; + const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second; + set old_acting_shards; + old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards); + if (num_acting && i.primary != -1 && - num_acting >= lastmap->get_pools().find(pgid.pool())->second.min_size) { + num_acting >= old_pg_pool.min_size && + (*could_have_gone_active)(old_acting_shards)) { if (out) *out << "generate_past_intervals " << i << ": not rw," diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 05ae133c13e..0c45f413ebc 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -961,6 +961,9 @@ struct pg_pool_t { return 0; } + /// converts the acting/up vector to a set of pg shards + void convert_to_pg_shards(const vector &from, set* to) const; + typedef enum { CACHEMODE_NONE = 0, ///< no caching CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later @@ -1978,6 +1981,7 @@ struct pg_interval_t { ceph::shared_ptr osdmap, ///< [in] current map ceph::shared_ptr lastmap, ///< [in] last map pg_t pgid, ///< [in] pgid for pg + IsPGRecoverablePredicate *could_have_gone_active, /// [in] predicate whether the pg can be active map *past_intervals,///< [out] intervals ostream *out = 0 ///< [out] debug ostream ); diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc index fae8607af67..63b64486217 100644 --- a/src/test/osd/types.cc +++ b/src/test/osd/types.cc @@ -21,6 +21,7 @@ #include "gtest/gtest.h" #include "common/Thread.h" #include "include/stringify.h" +#include "osd/ReplicatedBackend.h" #include @@ -140,6 +141,7 @@ TEST(pg_interval_t, check_new_interval) int64_t pool_id = 200; int pg_num = 4; __u8 min_size = 2; + boost::scoped_ptr recoverable(new ReplicatedBackend::RPCRecPred()); { OSDMap::Incremental inc(epoch + 1); inc.new_pools[pool_id].min_size = min_size; @@ -184,6 +186,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_TRUE(past_intervals.empty()); } @@ -213,6 +216,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -245,6 +249,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); old_primary = new_primary; ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -278,6 +283,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -309,6 +315,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -347,6 +354,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -385,6 +393,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals)); ASSERT_EQ((unsigned int)1, past_intervals.size()); ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); @@ -418,6 +427,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -469,6 +479,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -503,6 +514,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -547,6 +559,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size()); @@ -595,6 +608,7 @@ TEST(pg_interval_t, check_new_interval) osdmap, lastmap, pgid, + recoverable.get(), &past_intervals, &out)); ASSERT_EQ((unsigned int)1, past_intervals.size());