From 119d8cb2a1d35473a77b4392b86d0f841fbba819 Mon Sep 17 00:00:00 2001
From: xie xingguo
Date: Sat, 19 Jan 2019 17:19:10 +0800
Subject: [PATCH] crush: fix upmap overkill

It appears that OSDMap::maybe_remove_pg_upmaps's sanity checks are
overzealous. With some customized crush rules it is possible for
osdmaptool to generate valid upmaps, but maybe_remove_pg_upmaps will
cancel them.

Fixes: http://tracker.ceph.com/issues/37968
Signed-off-by: xie xingguo
(cherry picked from commit 5c4d241c7f796cb685e9944bf237028162122725)

Conflicts:
	- maybe_remove_pg_upmaps input changing
	- slight c++11 auto conflicts
---
 src/crush/CrushWrapper.cc  |  87 +++++++++++++++++++----
 src/crush/CrushWrapper.h   |  13 ++--
 src/osd/OSDMap.cc          |  58 +++++++--------
 src/test/osd/TestOSDMap.cc | 142 ++++++++++++++++++++++++++++++++++---
 4 files changed, 240 insertions(+), 60 deletions(-)

diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index ca8d9059fd99d..d3c2a7af48ca3 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -878,23 +878,84 @@ void CrushWrapper::get_children_of_type(int id,
   }
 }
 
-int CrushWrapper::get_rule_failure_domain(int rule_id)
-{
-  crush_rule *rule = get_rule(rule_id);
-  if (IS_ERR(rule)) {
+int CrushWrapper::verify_upmap(CephContext *cct,
+                               int rule_id,
+                               int pool_size,
+                               const vector<int>& up)
+{
+  auto rule = get_rule(rule_id);
+  if (IS_ERR(rule) || !rule) {
+    lderr(cct) << __func__ << " rule " << rule_id << " does not exist"
+               << dendl;
     return -ENOENT;
   }
-  int type = 0; // default to osd-level
-  for (unsigned s = 0; s < rule->len; ++s) {
-    if ((rule->steps[s].op == CRUSH_RULE_CHOOSE_FIRSTN ||
-         rule->steps[s].op == CRUSH_RULE_CHOOSE_INDEP ||
-         rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_FIRSTN ||
-         rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_INDEP) &&
-        rule->steps[s].arg2 > type) {
-      type = rule->steps[s].arg2;
+  for (unsigned step = 0; step < rule->len; ++step) {
+    auto curstep = &rule->steps[step];
+    ldout(cct, 10) << __func__ << " step " << step << dendl;
+    switch (curstep->op) {
+    case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+      {
+        int type = curstep->arg2;
+        if (type == 0) // osd
+          break;
+        map<int, set<int>> osds_by_parent; // parent_of_desired_type -> osds
+        for (auto osd : up) {
+          auto parent = get_parent_of_type(osd, type, rule_id);
+          if (parent < 0) {
+            osds_by_parent[parent].insert(osd);
+          } else {
+            ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd
+                          << ", skipping for now"
+                          << dendl;
+          }
+        }
+        for (auto i : osds_by_parent) {
+          if (i.second.size() > 1) {
+            lderr(cct) << __func__ << " multiple osds " << i.second
+                       << " come from same failure domain " << i.first
+                       << dendl;
+            return -EINVAL;
+          }
+        }
+      }
+      break;
+
+    case CRUSH_RULE_CHOOSE_FIRSTN:
+    case CRUSH_RULE_CHOOSE_INDEP:
+      {
+        int numrep = curstep->arg1;
+        int type = curstep->arg2;
+        if (type == 0) // osd
+          break;
+        if (numrep <= 0)
+          numrep += pool_size;
+        set<int> parents_of_type;
+        for (auto osd : up) {
+          auto parent = get_parent_of_type(osd, type, rule_id);
+          if (parent < 0) {
+            parents_of_type.insert(parent);
+          } else {
+            ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd
+                          << ", skipping for now"
+                          << dendl;
+          }
+        }
+        if ((int)parents_of_type.size() > numrep) {
+          lderr(cct) << __func__ << " number of buckets "
+                     << parents_of_type.size() << " exceeds desired " << numrep
+                     << dendl;
+          return -EINVAL;
+        }
+      }
+      break;
+
+    default:
+      // ignore
+      break;
     }
   }
-  return type;
+  return 0;
 }
 
 int CrushWrapper::_get_leaves(int id, list<int> *leaves)
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 8063e8ab82a36..39d74fad0f77f 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -733,12 +733,15 @@ public:
                             set<int> *children,
                             bool exclude_shadow = true) const;
 
+
   /**
-   * get failure-domain type of a specific crush rule
-   * @param rule_id crush rule id
-   * @return type of failure-domain or a negative errno on error.
-   */
-  int get_rule_failure_domain(int rule_id);
+   * verify upmapping results.
+   * return 0 on success or a negative errno on error.
+   */
+  int verify_upmap(CephContext *cct,
+                   int rule_id,
+                   int pool_size,
+                   const vector<int>& up);
 
   /**
    * enumerate leaves(devices) of given node
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index f95eae4b5c774..9ce294820a703 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1642,7 +1642,30 @@ void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
       to_cancel.insert(pg);
       continue;
     }
+    vector<int> raw_up;
+    int primary;
+    tmpmap.pg_to_raw_up(pg, &raw_up, &primary);
+    vector<int> up;
+    up.reserve(raw_up.size());
+    for (auto osd : raw_up) {
+      // skip non-existent/down osd for erasure-coded PGs
+      if (osd == CRUSH_ITEM_NONE)
+        continue;
+      up.push_back(osd);
+    }
     auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
+    auto r = tmpmap.crush->verify_upmap(cct,
+                                        crush_rule,
+                                        tmpmap.get_pg_pool_size(pg),
+                                        up);
+    if (r < 0) {
+      ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
+                    << " returning " << r
+                    << dendl;
+      to_cancel.insert(pg);
+      continue;
+    }
+    // below we check against crush-topology changing..
     map<int, float> weight_map;
     auto it = rule_weight_map.find(crush_rule);
     if (it == rule_weight_map.end()) {
@@ -1656,43 +1679,10 @@ void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
     } else {
       weight_map = it->second;
     }
-    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
-    if (type < 0) {
-      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
-                 << pg << dendl;
-      continue;
-    }
     ldout(cct, 10) << __func__ << " pg " << pg
-                   << " crush-rule-id " << crush_rule
                    << " weight_map " << weight_map
-                   << " failure-domain-type " << type
                    << dendl;
-    vector<int> raw;
-    int primary;
-    tmpmap.pg_to_raw_up(pg, &raw, &primary);
-    set<int> parents;
-    for (auto osd : raw) {
-      // skip non-existent/down osd for erasure-coded PGs
-      if (osd == CRUSH_ITEM_NONE)
-        continue;
-      if (type > 0) {
-        auto parent = tmpmap.crush->get_parent_of_type(osd, type, crush_rule);
-        if (parent < 0) {
-          auto r = parents.insert(parent);
-          if (!r.second) {
-            // two up-set osds come from same parent
-            to_cancel.insert(pg);
-            break;
-          }
-        } else {
-          lderr(cct) << __func__ << " unable to get parent of raw osd."
-                     << osd << " of pg " << pg
-                     << dendl;
-          // continue to do checks below
-        }
-      }
-      // the above check validates collision only
-      // below we continue to check against crush-topology changing..
+    for (auto osd : up) {
       auto it = weight_map.find(osd);
       if (it == weight_map.end()) {
         // osd is gone or has been moved out of the specific crush-tree
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index 251ae4d994064..57ee3930ead53 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -92,17 +92,17 @@ public:
     osdmap.apply_incremental(new_pool_inc);
   }
   unsigned int get_num_osds() { return num_osds; }
-  void get_crush(CrushWrapper& newcrush) {
+  void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) {
     bufferlist bl;
-    osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+    tmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
     bufferlist::iterator p = bl.begin();
     newcrush.decode(p);
   }
-  int crush_move(const string &name, const vector<string> &argvec) {
+  int crush_move(OSDMap& tmap, const string &name, const vector<string> &argvec) {
     map<string,string> loc;
     CrushWrapper::parse_loc_map(argvec, &loc);
     CrushWrapper newcrush;
-    get_crush(newcrush);
+    get_crush(tmap, newcrush);
     if (!newcrush.name_exists(name)) {
       return -ENOENT;
     }
@@ -115,10 +115,10 @@ public:
       err = newcrush.move_bucket(g_ceph_context, id, loc);
     }
     if (err >= 0) {
-      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      OSDMap::Incremental pending_inc(tmap.get_epoch() + 1);
       pending_inc.crush.clear();
       newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
-      osdmap.apply_incremental(pending_inc);
+      tmap.apply_incremental(pending_inc);
       err = 0;
     }
   } else {
@@ -134,7 +134,7 @@ public:
       return osdmap.crush->get_rule_id(name);
     }
     CrushWrapper newcrush;
-    get_crush(newcrush);
+    get_crush(osdmap, newcrush);
     string device_class;
     stringstream ss;
     int ruleno = newcrush.add_simple_rule(
@@ -559,7 +559,7 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
       move_to.push_back("root=default");
       string host_loc = "host=" + host_name.str();
       move_to.push_back(host_loc);
-      int r = crush_move(osd_name.str(), move_to);
+      int r = crush_move(osdmap, osd_name.str(), move_to);
       ASSERT_EQ(0, r);
     }
     const string upmap_rule = "upmap";
@@ -735,6 +735,132 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
     }
   }
 
+  {
+    // http://tracker.ceph.com/issues/37968
+
+    // build a temporary crush topology of 2 hosts, 3 osds per host
+    OSDMap tmp; // use a tmp map here, so we do not dirty the original map..
+    tmp.deepish_copy_from(osdmap);
+    const int expected_host_num = 2;
+    int osd_per_host = get_num_osds() / expected_host_num;
+    ASSERT_GE(osd_per_host, 3);
+    int index = 0;
+    for (int i = 0; i < (int)get_num_osds(); i++) {
+      if (i && i % osd_per_host == 0) {
+        ++index;
+      }
+      stringstream osd_name;
+      stringstream host_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      host_name << "host-" << index;
+      move_to.push_back("root=default");
+      string host_loc = "host=" + host_name.str();
+      move_to.push_back(host_loc);
+      auto r = crush_move(tmp, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+
+    // build crush rule
+    CrushWrapper crush;
+    get_crush(tmp, crush);
+    string rule_name = "rule_37968";
+    int rule_type = pg_pool_t::TYPE_ERASURE;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    string root_name = "default";
+    int root = crush.get_item_id(root_name);
+    int min_size = 3;
+    int max_size = 4;
+    int steps = 6;
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 1 /* host */);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 0 /* osd */);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      tmp.apply_incremental(pending_inc);
+    }
+
+    // create an erasure-coded pool referencing the above rule
+    int64_t pool_37968;
+    {
+      OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
+      new_pool_inc.new_pool_max = tmp.get_pool_max();
+      new_pool_inc.fsid = tmp.get_fsid();
+      pg_pool_t empty;
+      pool_37968 = ++new_pool_inc.new_pool_max;
+      pg_pool_t *p = new_pool_inc.get_new_pool(pool_37968, &empty);
+      p->size = 4;
+      p->set_pg_num(8);
+      p->set_pgp_num(8);
+      p->type = pg_pool_t::TYPE_ERASURE;
+      p->crush_rule = rno;
+      p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+      new_pool_inc.new_pool_names[pool_37968] = "pool_37968";
+      tmp.apply_incremental(new_pool_inc);
+    }
+
+    pg_t ec_pg(0, pool_37968);
+    pg_t ec_pgid = tmp.raw_pg_to_pg(ec_pg);
+    int from = -1;
+    int to = -1;
+    {
+      // insert a valid pg_upmap_item
+      vector<int> ec_up;
+      int ec_up_primary;
+      tmp.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
+      ASSERT_TRUE(ec_up.size() == 4);
+      from = *(ec_up.begin());
+      ASSERT_TRUE(from >= 0);
+      auto parent = tmp.crush->get_parent_of_type(from, 1 /* host */, rno);
+      ASSERT_TRUE(parent < 0);
+      // pick an osd with the same parent as *from*
+      for (int i = 0; i < (int)get_num_osds(); i++) {
+        if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
+          auto p = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
+          if (p == parent) {
+            to = i;
+            break;
+          }
+        }
+      }
+      ASSERT_TRUE(to >= 0);
+      ASSERT_TRUE(from != to);
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(from, to));
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[ec_pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      tmp.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
+    }
+    {
+      // *maybe_remove_pg_upmaps* should not remove the above upmap_item
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      OSDMap nextmap;
+      nextmap.deepish_copy_from(tmp);
+      nextmap.maybe_remove_pg_upmaps(g_ceph_context, nextmap, &pending_inc);
+      tmp.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
+    }
+  }
+
   {
     // TEST pg_upmap
     {
-- 
2.39.5
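Editor's note, not part of the patch: the following is a minimal standalone sketch contrasting the behaviour the commit message describes. It assumes the same topology the new unit test builds (2 hosts, 3 osds each, rule "choose indep 2 type host / choose indep 2 type osd"). The helper parent_host_of() is a hypothetical stand-in for CrushWrapper::get_parent_of_type(); the real code queries the crush map and is not reproduced here.

```cpp
#include <iostream>
#include <map>
#include <set>
#include <vector>

// Hypothetical stand-in for CrushWrapper::get_parent_of_type():
// returns the (negative) host bucket id of an osd, or 0 if unknown.
static int parent_host_of(int osd) {
  static const std::map<int, int> host_of = {
    {0, -1}, {1, -1}, {2, -1},   // host-0
    {3, -2}, {4, -2}, {5, -2},   // host-1
  };
  auto it = host_of.find(osd);
  return it == host_of.end() ? 0 : it->second;
}

// Pre-patch idea: cancel the upmap as soon as two up-set osds share a parent.
static bool old_check(const std::vector<int>& up) {
  std::set<int> parents;
  for (int osd : up) {
    int parent = parent_host_of(osd);
    if (parent < 0 && !parents.insert(parent).second)
      return false;  // two up-set osds come from the same failure domain
  }
  return true;
}

// Post-patch idea for a "choose indep <numrep> type host" step: only the
// number of distinct failure domains is bounded by numrep.
static bool new_check(const std::vector<int>& up, int numrep) {
  std::set<int> parents;
  for (int osd : up) {
    int parent = parent_host_of(osd);
    if (parent < 0)
      parents.insert(parent);  // negative ids are valid CRUSH bucket ids
  }
  return (int)parents.size() <= numrep;
}

int main() {
  std::vector<int> up = {0, 1, 3, 4};  // two osds on each of the two hosts
  std::cout << "old check keeps upmap: " << old_check(up) << "\n";     // 0
  std::cout << "new check keeps upmap: " << new_check(up, 2) << "\n";  // 1
  return 0;
}
```

Under such a rule every valid placement necessarily puts two osds on the same host, so the old collision-only test cancelled even correct upmaps; checking per rule step, as verify_upmap() does, keeps them.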