From 09528ac4663cda9f206eb8d4dcc15926b8e4586f Mon Sep 17 00:00:00 2001 From: huangjun Date: Thu, 24 Oct 2019 23:44:12 +0800 Subject: [PATCH] crush: remove invalid upmap items We can not cancel in verify_upmap if remap an osd to different root bucket, cluster topology: osd.0 ~ osd.29 belongs to datacenter 1 osd30 ~ osd.59 belongs to datacenter 2 crush rule: take datacenter 1 chooseleaf 2 host emit take datacenter 2 chooseleaf 2 host emit The pg's primary osd in datacenter 1. We should cancel the pgid from upmap_items like below 1) from [26,12,54,46] to [30,12,54,46] pg_upmap_items [26,30] 2) from [16,25,53,31] to [40,25,53,0] pg_upmap_items [16,20,31,0] Signed-off-by: huangjun (cherry picked from commit 712a39e5c9d9848f618ad55a768103d84c0a460f) --- src/crush/CrushWrapper.cc | 41 +++++++ src/test/osd/TestOSDMap.cc | 217 +++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+) diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 55d627b0474..db6628b4b6d 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -891,16 +891,27 @@ int CrushWrapper::verify_upmap(CephContext *cct, << dendl; return -ENOENT; } + int root_bucket = 0; + int cursor = 0; for (unsigned step = 0; step < rule->len; ++step) { auto curstep = &rule->steps[step]; ldout(cct, 10) << __func__ << " step " << step << dendl; switch (curstep->op) { + case CRUSH_RULE_TAKE: + { + root_bucket = curstep->arg1; + } + break; case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSELEAF_INDEP: { + int numrep = curstep->arg1; int type = curstep->arg2; + int real_rep = 0; if (type == 0) // osd break; + if (numrep <= 0) + numrep += pool_size; map> osds_by_parent; // parent_of_desired_type -> osds for (auto osd : up) { auto parent = get_parent_of_type(osd, type, rule_id); @@ -912,6 +923,8 @@ int CrushWrapper::verify_upmap(CephContext *cct, << dendl; } } + ldout(cct, 10) << __func__ << " osds_by_parent " << osds_by_parent << dendl; + ceph_assert(root_bucket < 0); for (auto i : osds_by_parent) { if (i.second.size() > 1) { lderr(cct) << __func__ << " multiple osds " << i.second @@ -919,6 +932,22 @@ int CrushWrapper::verify_upmap(CephContext *cct, << dendl; return -EINVAL; } + if (subtree_contains(root_bucket, i.first)) { + real_rep++; + } + } + if (real_rep != numrep) { + lderr(cct) << __func__ << " expected " << numrep << " items in bucket " << root_bucket + << " real " << real_rep << dendl; + return -EINVAL; + } + // validate the osd's in subtree + for (int c = 0; cursor < up.size() && c < numrep; ++cursor, ++c) { + int osd = up[cursor]; + if (!subtree_contains(root_bucket, osd)) { + lderr(cct) << __func__ << " osd " << osd << " not in bucket " << root_bucket << dendl; + return -EINVAL; + } } } break; @@ -928,6 +957,7 @@ int CrushWrapper::verify_upmap(CephContext *cct, { int numrep = curstep->arg1; int type = curstep->arg2; + int real_rep = 0; if (type == 0) // osd break; if (numrep <= 0) @@ -949,6 +979,17 @@ int CrushWrapper::verify_upmap(CephContext *cct, << dendl; return -EINVAL; } + ceph_assert(root_bucket < 0); + for (auto & i : parents_of_type) { + if (subtree_contains(root_bucket, i)) { + real_rep++; + } + } + if (real_rep != numrep) { + lderr(cct) << __func__ << " expected " << numrep << " items in bucket " << root_bucket + << " real " << real_rep << dendl; + return -EINVAL; + } } break; diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc index 42c1bc590d5..af7e2336bfb 100644 --- a/src/test/osd/TestOSDMap.cc +++ b/src/test/osd/TestOSDMap.cc @@ -1447,6 +1447,223 @@ TEST_F(OSDMapTest, BUG_42052) { } } +TEST_F(OSDMapTest, CleanPGUpmapsFixLeakedUpmaps) { + set_up_map(60); + { + // build a temporary crush topology of 2datacenters, 3racks per dc, + // 1host per rack, 10osds per host + OSDMap tmp; // use a tmpmap here, so we do not dirty origin map.. + tmp.deepish_copy_from(osdmap); + const int expected_host_num = 6; + int osd_per_host = (int)get_num_osds() / expected_host_num; + ASSERT_GE(osd_per_host, 10); + int host_per_dc = 3; + int index = 0; + int dc_index = 0; + for (int i = 0; i < (int)get_num_osds(); i++) { + if (i && i % osd_per_host == 0) { + ++index; + } + if (i && i % (host_per_dc * osd_per_host) == 0) { + ++dc_index; + } + stringstream osd_name; + stringstream host_name; + stringstream rack_name; + stringstream dc_name; + vector move_to; + osd_name << "osd." << i; + host_name << "host-" << index; + rack_name << "rack-" << index; + dc_name << "dc-" << dc_index; + move_to.push_back("root=default"); + string dc_loc = "datacenter=" + dc_name.str(); + move_to.push_back(dc_loc); + string rack_loc = "rack=" + rack_name.str(); + move_to.push_back(rack_loc); + string host_loc = "host=" + host_name.str(); + move_to.push_back(host_loc); + auto r = crush_move(tmp, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + + // build crush rule + CrushWrapper crush; + get_crush(tmp, crush); + string rule_name = "rule_xeus_993_1"; + int rule_type = pg_pool_t::TYPE_REPLICATED; + ASSERT_TRUE(!crush.rule_exists(rule_name)); + int rno; + for (rno = 0; rno < crush.get_max_rules(); rno++) { + if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) + break; + } + string root_name = "default"; + string dc_1 = "dc-0"; + int dc1 = crush.get_item_id(dc_1); + string dc_2 = "dc-1"; + int dc2 = crush.get_item_id(dc_2); + int min_size = 1; + int max_size = 20; + int steps = 8; + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size); + int step = 0; + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, dc1, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 3 /* rack */); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, dc2, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 3 /* rack */); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + ASSERT_TRUE(step == steps); + auto r = crush_add_rule(crush.get_crush_map(), rule, rno); + ASSERT_TRUE(r >= 0); + crush.set_rule_name(rno, rule_name); + { + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + tmp.apply_incremental(pending_inc); + } + int64_t pool_xeus_993; + { + OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1); + new_pool_inc.new_pool_max = tmp.get_pool_max(); + new_pool_inc.fsid = tmp.get_fsid(); + pg_pool_t empty; + pool_xeus_993 = ++new_pool_inc.new_pool_max; + pg_pool_t *p = new_pool_inc.get_new_pool(pool_xeus_993, &empty); + p->size = 4; + p->set_pg_num(4096); + p->set_pgp_num(4096); + p->type = pg_pool_t::TYPE_REPLICATED; + p->crush_rule = rno; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + new_pool_inc.new_pool_names[pool_xeus_993] = "pool_xeus_993"; + tmp.apply_incremental(new_pool_inc); + } + + pg_t rep_pg(0, pool_xeus_993); + pg_t rep_pgid = tmp.raw_pg_to_pg(rep_pg); + { + int from = -1; + int to = -1; + vector rep_up; + int rep_up_primary; + tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary); + std::cout << "pgid " << rep_up << " up " << rep_up << std::endl; + ASSERT_TRUE(rep_up.size() == 4); + from = *(rep_up.begin()); + ASSERT_TRUE(from >= 0); + auto dc_parent = tmp.crush->get_parent_of_type(from, 8 /* dc */, rno); + if (dc_parent == dc1) + dc_parent = dc2; + else + dc_parent = dc1; + auto rack_parent = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno); + ASSERT_TRUE(dc_parent < 0); + ASSERT_TRUE(rack_parent < 0); + set rack_parents; + for (auto &i: rep_up) { + if (i == from) continue; + auto rack_parent = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno); + rack_parents.insert(rack_parent); + } + for (int i = 0; i < (int)get_num_osds(); i++) { + if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) { + auto dc_p = tmp.crush->get_parent_of_type(i, 8 /* dc */, rno); + auto rack_p = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno); + if (dc_p == dc_parent && + rack_parents.find(rack_p) == rack_parents.end()) { + to = i; + break; + } + } + } + ASSERT_TRUE(to >= 0); + ASSERT_TRUE(from != to); + std::cout << "from " << from << " to " << to << std::endl; + vector> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(from, to)); + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + pending_inc.new_pg_upmap_items[rep_pgid] = + mempool::osdmap::vector>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + tmp.apply_incremental(pending_inc); + ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid)); + } + pg_t rep_pg2(2, pool_xeus_993); + pg_t rep_pgid2 = tmp.raw_pg_to_pg(rep_pg2); + { + pg_t rep_pgid = rep_pgid2; + vector from_osds{-1, -1}; + vector rep_up; + int rep_up_primary; + tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary); + ASSERT_TRUE(rep_up.size() == 4); + from_osds[0] = *(rep_up.begin()); + from_osds[1] = *(rep_up.rbegin()); + std::cout << "pgid " << rep_pgid2 << " up " << rep_up << std::endl; + ASSERT_TRUE(*(from_osds.begin()) >= 0); + ASSERT_TRUE(*(from_osds.rbegin()) >= 0); + vector> new_pg_upmap_items; + for (auto &from: from_osds) { + int to = -1; + auto dc_parent = tmp.crush->get_parent_of_type(from, 8 /* dc */, rno); + if (dc_parent == dc1) + dc_parent = dc2; + else + dc_parent = dc1; + auto rack_parent = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno); + ASSERT_TRUE(dc_parent < 0); + ASSERT_TRUE(rack_parent < 0); + set rack_parents; + for (auto &i: rep_up) { + if (i == from) continue; + auto rack_parent = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno); + rack_parents.insert(rack_parent); + } + for (auto &i: new_pg_upmap_items) { + auto rack_from = tmp.crush->get_parent_of_type(i.first, 3, rno); + auto rack_to = tmp.crush->get_parent_of_type(i.second, 3, rno); + rack_parents.insert(rack_from); + rack_parents.insert(rack_to); + } + for (int i = 0; i < (int)get_num_osds(); i++) { + if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) { + auto dc_p = tmp.crush->get_parent_of_type(i, 8 /* dc */, rno); + auto rack_p = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno); + if (dc_p == dc_parent && + rack_parents.find(rack_p) == rack_parents.end()) { + to = i; + break; + } + } + } + ASSERT_TRUE(to >= 0); + ASSERT_TRUE(from != to); + std::cout << "from " << from << " to " << to << std::endl; + new_pg_upmap_items.push_back(make_pair(from, to)); + } + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + pending_inc.new_pg_upmap_items[rep_pgid] = + mempool::osdmap::vector>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + tmp.apply_incremental(pending_inc); + ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid)); + } + { + // *maybe_remove_pg_upmaps* should remove the above upmap_item + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, tmp, pending_inc); + tmp.apply_incremental(pending_inc); + ASSERT_FALSE(tmp.have_pg_upmaps(rep_pgid)); + ASSERT_FALSE(tmp.have_pg_upmaps(rep_pgid2)); + } + } +} + TEST(PGTempMap, basic) { PGTempMap m; -- 2.47.3