osd/OSDMap: calc_pg_upmaps - restrict optimization to origin pools only
author     xie xingguo <xie.xingguo@zte.com.cn>
           Sat, 23 Mar 2019 01:50:27 +0000 (09:50 +0800)
committer  xie xingguo <xie.xingguo@zte.com.cn>
           Thu, 28 Mar 2019 02:29:50 +0000 (10:29 +0800)
The current implementation tries to cancel any pg_upmaps that
would otherwise remap a PG away from an underfull osd. This is
wrong, e.g., because it can reliably fire the following assert:

src/osd/OSDMap.cc: 4405: FAILED assert(osd_weight.count(i.first))

It also defeats the admin's expectation when automatic balancing
has been strictly restricted to specific pools.

Fix by excluding any stray PG that does not belong to the origin
pools passed in when trying to upmap/unmap.

Fixes: http://tracker.ceph.com/issues/38897
Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
(cherry picked from commit 01e8e9482ce7194d347e02ef41acfa6d8d14f614)
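
For context, here is a minimal, self-contained sketch of the guard this
commit adds. The PgId struct and collect_candidates helper are
hypothetical stand-ins for pg_t and the candidate-collection loop inside
calc_pg_upmaps; an empty only_pools set preserves the old behavior of
considering every pool:

    #include <cstdint>
    #include <map>
    #include <set>
    #include <tuple>
    #include <utility>
    #include <vector>

    // Hypothetical, simplified stand-ins for pg_t and the
    // pg_upmap_items map, used only to model the pool filter.
    struct PgId {
      int64_t pool_id;
      uint32_t seed;
      int64_t pool() const { return pool_id; }
      bool operator<(const PgId& o) const {
        return std::tie(pool_id, seed) < std::tie(o.pool_id, o.seed);
      }
    };
    using UpmapItems = std::vector<std::pair<int32_t,int32_t>>;

    // Collect pg_upmap cancellation candidates, honoring only_pools:
    // an empty set means "consider every pool" (the old behavior).
    std::vector<std::pair<PgId, UpmapItems>> collect_candidates(
        const std::map<PgId, UpmapItems>& pg_upmap_items,
        const std::set<PgId>& to_skip,
        const std::set<int64_t>& only_pools)
    {
      std::vector<std::pair<PgId, UpmapItems>> candidates;
      for (const auto& i : pg_upmap_items) {
        if (to_skip.count(i.first))
          continue;
        // The fix: never touch upmaps of PGs outside the origin pools.
        if (!only_pools.empty() && !only_pools.count(i.first.pool()))
          continue;
        candidates.push_back(std::make_pair(i.first, i.second));
      }
      return candidates;
    }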

src/osd/OSDMap.cc
src/test/osd/TestOSDMap.cc

diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 78a5e9957616a669ec3306c73441a8c7f6c159e9..837a1a246d074b87a4151c715480cbba3f638b21 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -4331,6 +4331,8 @@ int OSDMap::calc_pg_upmaps(
       for (auto& i : tmp.pg_upmap_items) {
         if (to_skip.count(i.first))
           continue;
+        if (!only_pools.empty() && !only_pools.count(i.first.pool()))
+          continue;
         candidates.push_back(make_pair(i.first, i.second));
       }
       if (aggressive) {
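
The unit test below exercises this restriction end to end. For reference,
its final step calls calc_pg_upmaps restricted to a single origin pool;
the following sketch mirrors that call shape (the balance_one_pool wrapper
is hypothetical, and it assumes a populated osdmap and pool id prepared as
in the test):

    #include <set>
    #include "osd/OSDMap.h"

    // Restrict calc_pg_upmaps to one origin pool, so pg_upmaps
    // belonging to other pools are left alone.
    void balance_one_pool(CephContext* cct, OSDMap& osdmap,
                          int64_t pool_id) {
      std::set<int64_t> only_pools;
      only_pools.insert(pool_id);
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      osdmap.calc_pg_upmaps(cct,
                            0,           // max deviation: 0 forces optimizing
                            100,         // max iterations
                            only_pools,  // restrict optimization to these pools
                            &pending_inc);
      osdmap.apply_incremental(pending_inc);
    }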
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index 57ee3930ead53bb073f0e26f296c3413e9b5879a..cfb83d153ae1110c1140922ba6cfd4450bef5b28 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -28,7 +28,7 @@ int main(int argc, char **argv) {
 }
 
 class OSDMapTest : public testing::Test {
-  const static int num_osds = 6;
+  int num_osds = 6;
 public:
   OSDMap osdmap;
   OSDMapMapping mapping;
@@ -38,7 +38,8 @@ public:
 
   OSDMapTest() {}
 
-  void set_up_map() {
+  void set_up_map(int new_num_osds = 6, bool no_default_pools = false) {
+    num_osds = new_num_osds;
     uuid_d fsid;
     osdmap.build_simple(g_ceph_context, 0, fsid, num_osds);
     OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
@@ -57,6 +58,8 @@ public:
       pending_inc.new_uuid[i] = sample_uuid;
     }
     osdmap.apply_incremental(pending_inc);
+    if (no_default_pools) // do not create any default pool(s)
+      return;
 
     // Create an EC ruleset and a pool using it
     int r = osdmap.crush->add_simple_rule(
@@ -1067,6 +1070,229 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
   }
 }
 
+TEST_F(OSDMapTest, BUG_38897) {
+  // http://tracker.ceph.com/issues/38897
+  // build a fresh map with 12 OSDs, without any default pools
+  set_up_map(12, true);
+  const string pool_1("pool1");
+  const string pool_2("pool2");
+  int64_t pool_1_id = -1;
+
+  {
+    // build customized crush rule for "pool1"
+    string host_name = "host_for_pool_1";
+    // build a customized host to capture osd.1~4
+    for (int i = 1; i < 5; i++) {
+      stringstream osd_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      move_to.push_back("root=default");
+      string host_loc = "host=" + host_name;
+      move_to.push_back(host_loc);
+      auto r = crush_move(osdmap, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+    CrushWrapper crush;
+    get_crush(osdmap, crush);
+    auto host_id = crush.get_item_id(host_name);
+    ASSERT_TRUE(host_id < 0);
+    string rule_name = "rule_for_pool1";
+    int rule_type = pg_pool_t::TYPE_REPLICATED;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    int min_size = 3;
+    int max_size = 3;
+    int steps = 7;
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    // always choose osd.0
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    // then pick 2 other osds from the customized host
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+
+    // create "pool1"
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    pending_inc.new_pool_max = osdmap.get_pool_max();
+    auto pool_id = ++pending_inc.new_pool_max;
+    pool_1_id = pool_id;
+    pg_pool_t empty;
+    auto p = pending_inc.get_new_pool(pool_id, &empty);
+    p->size = 3;
+    p->min_size = 1;
+    p->set_pg_num(3);
+    p->set_pgp_num(3);
+    p->type = pg_pool_t::TYPE_REPLICATED;
+    p->crush_rule = rno;
+    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+    pending_inc.new_pool_names[pool_id] = pool_1;
+    osdmap.apply_incremental(pending_inc);
+    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
+    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1);
+    {
+      for (unsigned i = 0; i < 3; i++) {
+        // 1.x -> [1]
+        pg_t rawpg(i, pool_id);
+        pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+        vector<int> up;
+        int up_primary;
+        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+        ASSERT_TRUE(up.size() == 3);
+        ASSERT_TRUE(up[0] == 0);
+
+        // insert a new pg_upmap
+        vector<int32_t> new_up;
+        // and remap 1.x to osd.1 only
+        // this way osd.0 is deemed to be *underfull*
+        // and osd.1 is deemed to be *overfull*
+        new_up.push_back(1);
+        {
+          OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+          pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+            new_up.begin(), new_up.end());
+          osdmap.apply_incremental(pending_inc);
+        }
+        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+        ASSERT_TRUE(up.size() == 1);
+        ASSERT_TRUE(up[0] == 1);
+      }
+    }
+  }
+
+  {
+    // build customized crush rule for "pool2"
+    string host_name = "host_for_pool_2";
+    // build a customized host to capture osd.6~11
+    for (int i = 6; i < (int)get_num_osds(); i++) {
+      stringstream osd_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      move_to.push_back("root=default");
+      string host_loc = "host=" + host_name;
+      move_to.push_back(host_loc);
+      auto r = crush_move(osdmap, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+    CrushWrapper crush;
+    get_crush(osdmap, crush);
+    auto host_id = crush.get_item_id(host_name);
+    ASSERT_TRUE(host_id < 0);
+    string rule_name = "rule_for_pool2";
+    int rule_type = pg_pool_t::TYPE_REPLICATED;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    int min_size = 3;
+    int max_size = 3;
+    int steps = 7;
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    // always choose osd.0
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    // then pick 2 other osds from the customized host
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+
+    // create "pool2"
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    pending_inc.new_pool_max = osdmap.get_pool_max();
+    auto pool_id = ++pending_inc.new_pool_max;
+    pg_pool_t empty;
+    auto p = pending_inc.get_new_pool(pool_id, &empty);
+    p->size = 3;
+    // include a single PG
+    p->set_pg_num(1);
+    p->set_pgp_num(1);
+    p->type = pg_pool_t::TYPE_REPLICATED;
+    p->crush_rule = rno;
+    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+    pending_inc.new_pool_names[pool_id] = pool_2;
+    osdmap.apply_incremental(pending_inc);
+    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
+    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2);
+    pg_t rawpg(0, pool_id);
+    pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+    EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid));
+    vector<int> up;
+    int up_primary;
+    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+    ASSERT_TRUE(up.size() == 3);
+    ASSERT_TRUE(up[0] == 0);
+
+    {
+      // build a pg_upmap_item that will
+      // remap pg out from *underfull* osd.0
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      osdmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(osdmap.have_pg_upmaps(pgid));
+      vector<int> up;
+      int up_primary;
+      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+      ASSERT_TRUE(up.size() == 3);
+      ASSERT_TRUE(up[0] == 10);
+    }
+  }
+
+  // ready to go
+  {
+    // require perfect distribution!
+    auto ret = g_ceph_context->_conf->set_val(
+      "osd_calc_pg_upmaps_max_stddev", "0");
+    ASSERT_EQ(0, ret);
+    g_ceph_context->_conf->apply_changes(nullptr);
+    set<int64_t> only_pools;
+    ASSERT_TRUE(pool_1_id >= 0);
+    only_pools.insert(pool_1_id);
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    osdmap.calc_pg_upmaps(g_ceph_context,
+                          0, // so we can force optimizing
+                          100,
+                          only_pools,
+                          &pending_inc);
+    osdmap.apply_incremental(pending_inc);
+  }
+}
+
 TEST(PGTempMap, basic)
 {
   PGTempMap m;