From 01e8e9482ce7194d347e02ef41acfa6d8d14f614 Mon Sep 17 00:00:00 2001
From: xie xingguo
Date: Sat, 23 Mar 2019 09:50:27 +0800
Subject: [PATCH] osd/OSDMap: calc_pg_upmaps - restrict optimization to origin
 pools only

The current implementation tries to cancel any pg_upmap that would
otherwise re-map a PG out from an underfull osd. This is wrong
because, e.g., it can reliably fire the following assert:

src/osd/OSDMap.cc: 4405: FAILED assert(osd_weight.count(i.first))

It also defeats the admin's expectation when automatic balancing
has been strictly restricted to some specific pools.

Fix by excluding any wild PG that does not belong to the origin
pools passed in when trying to do upmap/unmap.

Fixes: http://tracker.ceph.com/issues/38897
Signed-off-by: xie xingguo
---
 src/osd/OSDMap.cc          |   2 +
 src/test/osd/TestOSDMap.cc | 230 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 230 insertions(+), 2 deletions(-)

diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 5affd814f4b93..41ec432c63d57 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -4666,6 +4666,8 @@ int OSDMap::calc_pg_upmaps(
     for (auto& i : tmp.pg_upmap_items) {
       if (to_skip.count(i.first))
         continue;
+      if (!only_pools.empty() && !only_pools.count(i.first.pool()))
+        continue;
       candidates.push_back(make_pair(i.first, i.second));
     }
     if (aggressive) {
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index 507b147e1b5a6..b51f73ecbe972 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -29,7 +29,7 @@ int main(int argc, char **argv) {
 }
 
 class OSDMapTest : public testing::Test {
-  const static int num_osds = 6;
+  int num_osds = 6;
 public:
   OSDMap osdmap;
   OSDMapMapping mapping;
@@ -39,7 +39,8 @@ public:
 
   OSDMapTest() {}
 
-  void set_up_map() {
+  void set_up_map(int new_num_osds = 6, bool no_default_pools = false) {
+    num_osds = new_num_osds;
     uuid_d fsid;
     osdmap.build_simple(g_ceph_context, 0, fsid, num_osds);
     OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
@@ -59,6 +60,8 @@ public:
       pending_inc.new_uuid[i] = sample_uuid;
     }
     osdmap.apply_incremental(pending_inc);
+    if (no_default_pools) // do not create any default pool(s)
+      return;
 
     // Create an EC ruleset and a pool using it
     int r = osdmap.crush->add_simple_rule(
@@ -1084,6 +1087,229 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
   }
 }
 
+TEST_F(OSDMapTest, BUG_38897) {
+  // http://tracker.ceph.com/issues/38897
+  // build a fresh map with 12 OSDs, without any default pools
+  set_up_map(12, true);
+  const string pool_1("pool1");
+  const string pool_2("pool2");
+  int64_t pool_1_id = -1;
+
+  {
+    // build customized crush rule for "pool1"
+    string host_name = "host_for_pool_1";
+    // build a customized host to capture osd.1~4
+    for (int i = 1; i < 5; i++) {
+      stringstream osd_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      move_to.push_back("root=default");
+      string host_loc = "host=" + host_name;
+      move_to.push_back(host_loc);
+      auto r = crush_move(osdmap, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+    CrushWrapper crush;
+    get_crush(osdmap, crush);
+    auto host_id = crush.get_item_id(host_name);
+    ASSERT_TRUE(host_id < 0);
+    string rule_name = "rule_for_pool1";
+    int rule_type = pg_pool_t::TYPE_REPLICATED;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    int min_size = 3;
+    int max_size = 3;
+    int steps = 7;
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    // always choose osd.0
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    // then pick any other random osds
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+
+    // create "pool1"
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    pending_inc.new_pool_max = osdmap.get_pool_max();
+    auto pool_id = ++pending_inc.new_pool_max;
+    pool_1_id = pool_id;
+    pg_pool_t empty;
+    auto p = pending_inc.get_new_pool(pool_id, &empty);
+    p->size = 3;
+    p->min_size = 1;
+    p->set_pg_num(3);
+    p->set_pgp_num(3);
+    p->type = pg_pool_t::TYPE_REPLICATED;
+    p->crush_rule = rno;
+    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+    pending_inc.new_pool_names[pool_id] = pool_1;
+    osdmap.apply_incremental(pending_inc);
+    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
+    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1);
+    {
+      for (unsigned i = 0; i < 3; i++) {
+        // 1.x -> [1]
+        pg_t rawpg(i, pool_id);
+        pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+        vector<int> up;
+        int up_primary;
+        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+        ASSERT_TRUE(up.size() == 3);
+        ASSERT_TRUE(up[0] == 0);
+
+        // insert a new pg_upmap
+        vector<int32_t> new_up;
+        // and remap 1.x to osd.1 only
+        // this way osd.0 is deemed to be *underfull*
+        // and osd.1 is deemed to be *overfull*
+        new_up.push_back(1);
+        {
+          OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+          pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+            new_up.begin(), new_up.end());
+          osdmap.apply_incremental(pending_inc);
+        }
+        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+        ASSERT_TRUE(up.size() == 1);
+        ASSERT_TRUE(up[0] == 1);
+      }
+    }
+  }
+
+  {
+    // build customized crush rule for "pool2"
+    string host_name = "host_for_pool_2";
+    // build a customized host to capture osd.6~11
+    for (int i = 6; i < (int)get_num_osds(); i++) {
+      stringstream osd_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      move_to.push_back("root=default");
+      string host_loc = "host=" + host_name;
+      move_to.push_back(host_loc);
+      auto r = crush_move(osdmap, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+    CrushWrapper crush;
+    get_crush(osdmap, crush);
+    auto host_id = crush.get_item_id(host_name);
+    ASSERT_TRUE(host_id < 0);
+    string rule_name = "rule_for_pool2";
+    int rule_type = pg_pool_t::TYPE_REPLICATED;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    int min_size = 3;
+    int max_size = 3;
+    int steps = 7;
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    // always choose osd.0
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    // then pick any other random osds
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+
+    // create "pool2"
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    pending_inc.new_pool_max = osdmap.get_pool_max();
+    auto pool_id = ++pending_inc.new_pool_max;
+    pg_pool_t empty;
+    auto p = pending_inc.get_new_pool(pool_id, &empty);
+    p->size = 3;
+    // include a single PG
+    p->set_pg_num(1);
+    p->set_pgp_num(1);
+    p->type = pg_pool_t::TYPE_REPLICATED;
+    p->crush_rule = rno;
+    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+    pending_inc.new_pool_names[pool_id] = pool_2;
+    osdmap.apply_incremental(pending_inc);
+    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
+    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2);
+    pg_t rawpg(0, pool_id);
+    pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+    EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid));
+    vector<int> up;
+    int up_primary;
+    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+    ASSERT_TRUE(up.size() == 3);
+    ASSERT_TRUE(up[0] == 0);
+
+    {
+      // build a pg_upmap_item that will
+      // remap pg out from *underfull* osd.0
+      vector<pair<int32_t, int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[pgid] =
+        mempool::osdmap::vector<pair<int32_t, int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      osdmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(osdmap.have_pg_upmaps(pgid));
+      vector<int> up;
+      int up_primary;
+      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+      ASSERT_TRUE(up.size() == 3);
+      ASSERT_TRUE(up[0] == 10);
+    }
+  }
+
+  // ready to go
+  {
+    // require perfect distribution!
+    auto ret = g_ceph_context->_conf.set_val(
+      "osd_calc_pg_upmaps_max_stddev", "0");
+    ASSERT_EQ(0, ret);
+    g_ceph_context->_conf.apply_changes(nullptr);
+    set<int64_t> only_pools;
+    ASSERT_TRUE(pool_1_id >= 0);
+    only_pools.insert(pool_1_id);
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    osdmap.calc_pg_upmaps(g_ceph_context,
+      0, // so we can force optimizing
+      100,
+      only_pools,
+      &pending_inc);
+    osdmap.apply_incremental(pending_inc);
+  }
+}
+
 TEST(PGTempMap, basic)
 {
   PGTempMap m;
-- 
2.39.5
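
Note for reviewers: below is a minimal, standalone sketch of the behavior
the one-line guard in calc_pg_upmaps() is meant to enforce. The names
pg_upmap_items, to_skip, only_pools, and the candidate scan come from the
patch above; the simplified pg_id struct and the sample data are
illustrative stand-ins, not Ceph's real pg_t or mempool containers.

// Standalone illustration of the candidate filter (simplified types).
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <tuple>
#include <utility>
#include <vector>

// stand-in for Ceph's pg_t: a PG is identified by its pool and seed;
// the real type exposes the pool via a pool() accessor instead
struct pg_id {
  int64_t pool;
  uint32_t seed;
  bool operator<(const pg_id& r) const {
    return std::tie(pool, seed) < std::tie(r.pool, r.seed);
  }
};

int main() {
  // existing upmap exceptions: pg -> list of (from_osd, to_osd) swaps
  std::map<pg_id, std::vector<std::pair<int32_t, int32_t>>> pg_upmap_items = {
    { {1, 0}, { {0, 1} } },   // pg 1.0 (pool 1): osd.0 -> osd.1
    { {2, 0}, { {0, 10} } },  // pg 2.0 (pool 2): osd.0 -> osd.10
  };
  std::set<pg_id> to_skip;           // PGs already handled this round
  std::set<int64_t> only_pools{1};   // balancing restricted to pool 1

  // the patched candidate scan: a PG may only have its upmap cancelled
  // if it belongs to one of the pools being balanced
  std::vector<std::pair<pg_id, std::vector<std::pair<int32_t, int32_t>>>>
      candidates;
  for (auto& i : pg_upmap_items) {
    if (to_skip.count(i.first))
      continue;
    if (!only_pools.empty() && !only_pools.count(i.first.pool))
      continue;  // the new guard: leave foreign pools' upmaps alone
    candidates.push_back(std::make_pair(i.first, i.second));
  }

  // prints only "1.0": pool 2's upmap survives untouched, so the
  // optimizer can never trip over an osd it has no weight entry for
  for (auto& c : candidates)
    std::cout << c.first.pool << "." << c.first.seed << "\n";
}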