From: huangjun
Date: Thu, 24 Oct 2019 15:44:12 +0000 (+0800)
Subject: crush: remove invalid upmap items
X-Git-Tag: v15.1.0~543^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=55ed94a345b968adf2a0a7bf9519568d9c2df488;p=ceph.git

crush: remove invalid upmap items

verify_upmap cannot cancel an upmap item that remaps an osd into a
different root bucket. Consider this cluster topology:

  osd.0 ~ osd.29 belong to datacenter 1
  osd.30 ~ osd.59 belong to datacenter 2

  crush rule:
    take datacenter 1
    chooseleaf 2 host
    emit
    take datacenter 2
    chooseleaf 2 host
    emit

The pg's primary osd is in datacenter 1. We should cancel the pgid
from upmap_items in cases like the following:

  1) from [26,12,54,46] to [30,12,54,46]
     pg_upmap_items [26,30]
  2) from [16,25,53,31] to [40,25,53,0]
     pg_upmap_items [16,40,31,0]

Fixes: https://tracker.ceph.com/issues/42485
Fixes: https://tracker.ceph.com/issues/43124
Signed-off-by: huangjun
---

diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index a4c699bf0039..04d527fec12e 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -931,14 +931,26 @@ int CrushWrapper::verify_upmap(CephContext *cct,
                << dendl;
     return -ENOENT;
   }
+  int root_bucket = 0;
+  int cursor = 0;
+  std::map<int, int> type_stack;  // bucket type -> expected item count (numrep)
   for (unsigned step = 0; step < rule->len; ++step) {
     auto curstep = &rule->steps[step];
     ldout(cct, 10) << __func__ << " step " << step << dendl;
     switch (curstep->op) {
+    case CRUSH_RULE_TAKE:
+      {
+        root_bucket = curstep->arg1;
+      }
+      break;
     case CRUSH_RULE_CHOOSELEAF_FIRSTN:
     case CRUSH_RULE_CHOOSELEAF_INDEP:
       {
+        int numrep = curstep->arg1;
         int type = curstep->arg2;
+        if (numrep <= 0)
+          numrep += pool_size;
+        type_stack.emplace(type, numrep);
         if (type == 0) // osd
           break;
         map<int, set<int>> osds_by_parent; // parent_of_desired_type -> osds
@@ -968,10 +980,11 @@ int CrushWrapper::verify_upmap(CephContext *cct,
       {
         int numrep = curstep->arg1;
         int type = curstep->arg2;
-        if (type == 0) // osd
-          break;
         if (numrep <= 0)
           numrep += pool_size;
+        type_stack.emplace(type, numrep);
+        if (type == 0) // osd
+          break;
         set<int> parents_of_type;
         for (auto osd : up) {
           auto parent = get_parent_of_type(osd, type, rule_id);
@@ -992,6 +1005,26 @@ int CrushWrapper::verify_upmap(CephContext *cct,
         }
       }
       break;
+    case CRUSH_RULE_EMIT:
+      {
+        if (root_bucket < 0) {
+          int num_osds = 1;
+          for (auto &item : type_stack) {
+            num_osds *= item.second;
+          }
+          // validate the osds in the subtree of the current root bucket
+          for (int c = 0; cursor < (int)up.size() && c < num_osds; ++cursor, ++c) {
+            int osd = up[cursor];
+            if (!subtree_contains(root_bucket, osd)) {
+              lderr(cct) << __func__ << " osd " << osd << " not in bucket " << root_bucket << dendl;
+              return -EINVAL;
+            }
+          }
+        }
+        type_stack.clear();
+        root_bucket = 0;
+      }
+      break;
     default:
       // ignore
       break;
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index 0f4b5108cef8..84d05a3de6cd 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -1483,6 +1483,224 @@ TEST_F(OSDMapTest, BUG_42052) {
   }
 }
 
+TEST_F(OSDMapTest, BUG_42485) {
+  set_up_map(60);
+  {
+    // build a temporary crush topology of 2 datacenters, 3 racks per dc,
+    // 1 host per rack, 10 osds per host
+    OSDMap tmp; // use a tmp map here, so we do not dirty the origin map..
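+    // note: with 60 osds the loop below puts osd.0~osd.29 under dc-0 and
+    // osd.30~osd.59 under dc-1, the two-datacenter scenario from the
+    // commit message above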
+    tmp.deepish_copy_from(osdmap);
+    const int expected_host_num = 6;
+    int osd_per_host = (int)get_num_osds() / expected_host_num;
+    ASSERT_GE(osd_per_host, 10);
+    int host_per_dc = 3;
+    int index = 0;
+    int dc_index = 0;
+    for (int i = 0; i < (int)get_num_osds(); i++) {
+      if (i && i % osd_per_host == 0) {
+        ++index;
+      }
+      if (i && i % (host_per_dc * osd_per_host) == 0) {
+        ++dc_index;
+      }
+      stringstream osd_name;
+      stringstream host_name;
+      stringstream rack_name;
+      stringstream dc_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      host_name << "host-" << index;
+      rack_name << "rack-" << index;
+      dc_name << "dc-" << dc_index;
+      move_to.push_back("root=default");
+      string dc_loc = "datacenter=" + dc_name.str();
+      move_to.push_back(dc_loc);
+      string rack_loc = "rack=" + rack_name.str();
+      move_to.push_back(rack_loc);
+      string host_loc = "host=" + host_name.str();
+      move_to.push_back(host_loc);
+      auto r = crush_move(tmp, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+
+    // build crush rule
+    CrushWrapper crush;
+    get_crush(tmp, crush);
+    string rule_name = "rule_xeus_993_1";
+    int rule_type = pg_pool_t::TYPE_REPLICATED;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    string root_name = "default";
+    string dc_1 = "dc-0";
+    int dc1 = crush.get_item_id(dc_1);
+    string dc_2 = "dc-1";
+    int dc2 = crush.get_item_id(dc_2);
+    int min_size = 1;
+    int max_size = 20;
+    int steps = 8;
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, dc1, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 3 /* rack */);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, dc2, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 3 /* rack */);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      tmp.apply_incremental(pending_inc);
+    }
+    // create a replicated pool referencing the above rule
+    int64_t pool_xeus_993;
+    {
+      OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
+      new_pool_inc.new_pool_max = tmp.get_pool_max();
+      new_pool_inc.fsid = tmp.get_fsid();
+      pg_pool_t empty;
+      pool_xeus_993 = ++new_pool_inc.new_pool_max;
+      pg_pool_t *p = new_pool_inc.get_new_pool(pool_xeus_993, &empty);
+      p->size = 4;
+      p->set_pg_num(4096);
+      p->set_pgp_num(4096);
+      p->type = pg_pool_t::TYPE_REPLICATED;
+      p->crush_rule = rno;
+      p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+      new_pool_inc.new_pool_names[pool_xeus_993] = "pool_xeus_993";
+      tmp.apply_incremental(new_pool_inc);
+    }
+
+    pg_t rep_pg(0, pool_xeus_993);
+    pg_t rep_pgid = tmp.raw_pg_to_pg(rep_pg);
+    {
+      int from = -1;
+      int to = -1;
+      vector<int> rep_up;
+      int rep_up_primary;
+      tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
+      std::cout << "pgid " << rep_pgid << " up " << rep_up << std::endl;
+      ASSERT_TRUE(rep_up.size() == 4);
+      from = *(rep_up.begin());
+      ASSERT_TRUE(from >= 0);
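+      // pick a destination osd in the *other* datacenter, in a rack not
+      // used by the rest of the up set; such an upmap item crosses the
+      // root bucket taken by the rule, so the verify_upmap check added
+      // in this commit should cancel it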
+      auto dc_parent = tmp.crush->get_parent_of_type(from, 8 /* dc */, rno);
+      if (dc_parent == dc1)
+        dc_parent = dc2;
+      else
+        dc_parent = dc1;
+      auto rack_parent = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
+      ASSERT_TRUE(dc_parent < 0);
+      ASSERT_TRUE(rack_parent < 0);
+      set<int> rack_parents;
+      for (auto &i: rep_up) {
+        if (i == from) continue;
+        auto rack_parent = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
+        rack_parents.insert(rack_parent);
+      }
+      for (int i = 0; i < (int)get_num_osds(); i++) {
+        if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
+          auto dc_p = tmp.crush->get_parent_of_type(i, 8 /* dc */, rno);
+          auto rack_p = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
+          if (dc_p == dc_parent &&
+              rack_parents.find(rack_p) == rack_parents.end()) {
+            to = i;
+            break;
+          }
+        }
+      }
+      ASSERT_TRUE(to >= 0);
+      ASSERT_TRUE(from != to);
+      std::cout << "from " << from << " to " << to << std::endl;
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(from, to));
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[rep_pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      tmp.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
+    }
+    pg_t rep_pg2(2, pool_xeus_993);
+    pg_t rep_pgid2 = tmp.raw_pg_to_pg(rep_pg2);
+    {
+      pg_t rep_pgid = rep_pgid2;
+      vector<int> from_osds{-1, -1};
+      vector<int> rep_up;
+      int rep_up_primary;
+      tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
+      ASSERT_TRUE(rep_up.size() == 4);
+      from_osds[0] = *(rep_up.begin());
+      from_osds[1] = *(rep_up.rbegin());
+      std::cout << "pgid " << rep_pgid2 << " up " << rep_up << std::endl;
+      ASSERT_TRUE(*(from_osds.begin()) >= 0);
+      ASSERT_TRUE(*(from_osds.rbegin()) >= 0);
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      for (auto &from: from_osds) {
+        int to = -1;
+        auto dc_parent = tmp.crush->get_parent_of_type(from, 8 /* dc */, rno);
+        if (dc_parent == dc1)
+          dc_parent = dc2;
+        else
+          dc_parent = dc1;
+        auto rack_parent = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
+        ASSERT_TRUE(dc_parent < 0);
+        ASSERT_TRUE(rack_parent < 0);
+        set<int> rack_parents;
+        for (auto &i: rep_up) {
+          if (i == from) continue;
+          auto rack_parent = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
+          rack_parents.insert(rack_parent);
+        }
+        for (auto &i: new_pg_upmap_items) {
+          auto rack_from = tmp.crush->get_parent_of_type(i.first, 3, rno);
+          auto rack_to = tmp.crush->get_parent_of_type(i.second, 3, rno);
+          rack_parents.insert(rack_from);
+          rack_parents.insert(rack_to);
+        }
+        for (int i = 0; i < (int)get_num_osds(); i++) {
+          if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
+            auto dc_p = tmp.crush->get_parent_of_type(i, 8 /* dc */, rno);
+            auto rack_p = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
+            if (dc_p == dc_parent &&
+                rack_parents.find(rack_p) == rack_parents.end()) {
+              to = i;
+              break;
+            }
+          }
+        }
+        ASSERT_TRUE(to >= 0);
+        ASSERT_TRUE(from != to);
+        std::cout << "from " << from << " to " << to << std::endl;
+        new_pg_upmap_items.push_back(make_pair(from, to));
+      }
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[rep_pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      tmp.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
+    }
+    {
+      // *clean_pg_upmaps* should remove the above upmap_items
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      clean_pg_upmaps(g_ceph_context, tmp,
+                      pending_inc);
+      tmp.apply_incremental(pending_inc);
+      ASSERT_FALSE(tmp.have_pg_upmaps(rep_pgid));
+      ASSERT_FALSE(tmp.have_pg_upmaps(rep_pgid2));
+    }
+  }
+}
+
 TEST(PGTempMap, basic)
 {
   PGTempMap m;
@@ -1499,3 +1717,154 @@ TEST(PGTempMap, basic)
   ASSERT_EQ(998u, m.size());
 }
 
+TEST_F(OSDMapTest, BUG_43124) {
+  set_up_map(200);
+  {
+    // https://tracker.ceph.com/issues/43124
+
+    // build a temporary crush topology of 5 racks,
+    // 4 hosts per rack, 10 osds per host
+    OSDMap tmp; // use a tmp map here, so we do not dirty the origin map..
+    tmp.deepish_copy_from(osdmap);
+    const int expected_host_num = 20;
+    int osd_per_host = (int)get_num_osds() / expected_host_num;
+    ASSERT_GE(osd_per_host, 10);
+    int host_per_rack = 4;
+    int index = 0;
+    int rack_index = 0;
+    for (int i = 0; i < (int)get_num_osds(); i++) {
+      if (i && i % osd_per_host == 0) {
+        ++index;
+      }
+      if (i && i % (host_per_rack * osd_per_host) == 0) {
+        ++rack_index;
+      }
+      stringstream osd_name;
+      stringstream host_name;
+      stringstream rack_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      host_name << "host-" << index;
+      rack_name << "rack-" << rack_index;
+      move_to.push_back("root=default");
+      string rack_loc = "rack=" + rack_name.str();
+      move_to.push_back(rack_loc);
+      string host_loc = "host=" + host_name.str();
+      move_to.push_back(host_loc);
+      auto r = crush_move(tmp, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+
+    // build crush rule
+    CrushWrapper crush;
+    get_crush(tmp, crush);
+    string rule_name = "rule_angel_1944";
+    int rule_type = pg_pool_t::TYPE_ERASURE;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    int min_size = 1;
+    int max_size = 20;
+    int steps = 6;
+    string root_name = "default";
+    int root = crush.get_item_id(root_name);
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_FIRSTN, 4, 3 /* rack */);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_INDEP, 3, 1 /* host */);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      tmp.apply_incremental(pending_inc);
+    }
+    {
+      stringstream oss;
+      crush.dump_tree(&oss, NULL);
+      std::cout << oss.str() << std::endl;
+      Formatter *f = Formatter::create("json-pretty");
+      f->open_object_section("crush_rules");
+      crush.dump_rules(f);
+      f->close_section();
+      f->flush(cout);
+      delete f;
+    }
+    // create an erasure-coded pool referencing the above rule
+    int64_t pool_angel_1944;
+    {
+      OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
+      new_pool_inc.new_pool_max = tmp.get_pool_max();
+      new_pool_inc.fsid = tmp.get_fsid();
+      pg_pool_t empty;
+      pool_angel_1944 = ++new_pool_inc.new_pool_max;
+      pg_pool_t *p = new_pool_inc.get_new_pool(pool_angel_1944, &empty);
+      p->size = 12;
+      p->set_pg_num(4096);
+      p->set_pgp_num(4096);
+      p->type = pg_pool_t::TYPE_ERASURE;
+      p->crush_rule = rno;
+      p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
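+      // note: pool size 12 matches the rule above: choose 4 racks,
+      // then 3 osds (chooseleaf host) under each rack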
+      new_pool_inc.new_pool_names[pool_angel_1944] = "pool_angel_1944";
+      tmp.apply_incremental(new_pool_inc);
+    }
+
+    pg_t rep_pg(0, pool_angel_1944);
+    pg_t rep_pgid = tmp.raw_pg_to_pg(rep_pg);
+    {
+      // insert a pg_upmap_item
+      int from = -1;
+      int to = -1;
+      vector<int> rep_up;
+      int rep_up_primary;
+      tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
+      std::cout << "pgid " << rep_pgid << " up " << rep_up << std::endl;
+      ASSERT_TRUE(rep_up.size() == 12);
+      from = *(rep_up.begin());
+      ASSERT_TRUE(from >= 0);
+      auto from_rack = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
+      set<int> failure_domains;
+      for (auto &osd : rep_up) {
+        failure_domains.insert(tmp.crush->get_parent_of_type(osd, 1 /* host */, rno));
+      }
+      for (int i = 0; i < (int)get_num_osds(); i++) {
+        if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
+          auto to_rack = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
+          auto to_host = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
+          if (to_rack != from_rack && failure_domains.count(to_host) == 0) {
+            to = i;
+            break;
+          }
+        }
+      }
+      ASSERT_TRUE(to >= 0);
+      ASSERT_TRUE(from != to);
+      std::cout << "from " << from << " to " << to << std::endl;
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(from, to));
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[rep_pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      tmp.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
+    }
+    {
+      // *clean_pg_upmaps* should not remove the above upmap_item
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
+      tmp.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
+    }
+  }
+}
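
A note on the EMIT-step validation added above: the sketch below is a
minimal standalone model of the idea, not Ceph code; the toy subtree
table, the fixed per-take width, and the verify() helper are invented
for illustration. Each TAKE records a root bucket, and at EMIT the osds
produced since that TAKE must all live in that root's subtree, so an
upmap item that points into the other datacenter fails the check and is
cancelled.

  #include <cstdio>
  #include <map>
  #include <set>
  #include <vector>

  // toy topology: root bucket id -> osds beneath it
  // (ids are negative, as CRUSH bucket ids are)
  static const std::map<int, std::set<int>> subtree = {
    {-1, {0, 1, 2}},   // stands in for datacenter 1
    {-2, {3, 4, 5}},   // stands in for datacenter 2
  };

  static bool subtree_contains(int root, int osd) {
    auto it = subtree.find(root);
    return it != subtree.end() && it->second.count(osd) > 0;
  }

  // models "take dc1 / chooseleaf 2 / emit / take dc2 / chooseleaf 2 / emit":
  // walk the up set with a cursor, checking each take's slice of osds
  static bool verify(const std::vector<int>& up) {
    const std::vector<int> takes = {-1, -2}; // root recorded at each TAKE
    const int num_osds = 2;                  // osds expected per EMIT
    size_t cursor = 0;
    for (int root : takes) {
      for (int c = 0; c < num_osds && cursor < up.size(); ++c, ++cursor) {
        if (!subtree_contains(root, up[cursor])) // the EMIT-time check
          return false;
      }
    }
    return true;
  }

  int main() {
    printf("%d\n", verify({0, 1, 3, 4})); // 1: each half in its own datacenter
    printf("%d\n", verify({0, 4, 3, 5})); // 0: osd.4 is not under datacenter 1
    return 0;
  }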