From 5fa467e06bd0cb4f6cc6e1de4283d73693c10eda Mon Sep 17 00:00:00 2001
From: xie xingguo <xie.xingguo@zte.com.cn>
Date: Wed, 28 Feb 2018 10:50:00 +0800
Subject: [PATCH] mon, osd: fix potential collided *Up Set* after PG remapping

The mgr balancer module basically performs optimizations based on
snapshots of the OSDMap taken at certain moments, which turns out to be
a potential cause of data loss, since in upmap mode it can sometimes
produce bad PG mappings. For example:

1) original cluster topology:

-5       2.00000     host host-a
 0   ssd 1.00000         osd.0      up  1.00000 1.00000
 1   ssd 1.00000         osd.1      up  1.00000 1.00000
-7       2.00000     host host-b
 2   ssd 1.00000         osd.2      up  1.00000 1.00000
 3   ssd 1.00000         osd.3      up  1.00000 1.00000
-9       2.00000     host host-c
 4   ssd 1.00000         osd.4      up  1.00000 1.00000
 5   ssd 1.00000         osd.5      up  1.00000 1.00000

2) mgr balancer applies an optimization for PG 3.f:

           pg-upmap-items[3.f : 1->4]
3.f [1 3] +-------------------------> [4 3]

3) osd.3 is marked out, reweighted, etc., so the original crush mapping
of 3.f changes (while pg-upmap-items does not):

           pg-upmap-items[3.f : 1->4]
3.f [1 5] +-------------------------> [4 5]

4) we are now mapping PG 3.f to two OSDs (osd.4 & osd.5) on the same
host (host-c).

Fix the above problem by adding a guard procedure before these *unsafe*
upmap remappings can finally be encoded into the OSDMap. If any of them
turns out to be inappropriate, we can simply cancel it, since the
balancer can still re-calculate and re-generate it later if necessary.
(The core of this check is sketched in standalone form after the
patch.)

Fixes: http://tracker.ceph.com/issues/23118
Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
---
 src/crush/CrushWrapper.cc  |  19 +++
 src/crush/CrushWrapper.h   |   7 +
 src/mon/OSDMonitor.cc      |   3 +
 src/osd/OSDMap.cc          | 106 ++++++++++++++
 src/osd/OSDMap.h           |  13 ++
 src/test/osd/TestOSDMap.cc | 291 ++++++++++++++++++++++++++++++++++++-
 6 files changed, 438 insertions(+), 1 deletion(-)

diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index b4643158d35af..b7fa7270351e8 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -768,6 +768,25 @@ int CrushWrapper::get_children(int id, list<int> *children) const
   return b->size;
 }
 
+int CrushWrapper::get_rule_failure_domain(int rule_id)
+{
+  crush_rule *rule = get_rule(rule_id);
+  if (IS_ERR(rule)) {
+    return -ENOENT;
+  }
+  int type = 0; // default to osd-level
+  for (unsigned s = 0; s < rule->len; ++s) {
+    if ((rule->steps[s].op == CRUSH_RULE_CHOOSE_FIRSTN ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSE_INDEP ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_INDEP) &&
+        rule->steps[s].arg2 > type) {
+      type = rule->steps[s].arg2;
+    }
+  }
+  return type;
+}
+
 int CrushWrapper::_get_leaves(int id, list<int> *leaves) const
 {
   assert(leaves);
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 4026b50c71558..8671306455fbf 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -729,6 +729,13 @@ public:
    */
   int get_children(int id, list<int> *children) const;
 
+  /**
+   * get failure-domain type of a specific crush rule
+   * @param rule_id crush rule id
+   * @return type of failure-domain or a negative errno on error.
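+   *         (i.e. the coarsest bucket type referenced by the rule's
+   *         CHOOSE/CHOOSELEAF steps, or 0 (osd) if there is none)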
+   */
+  int get_rule_failure_domain(int rule_id);
+
   /**
    * enumerate leaves(devices) of given node
    *
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 03909a9d80685..e98ea3898bde7 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1237,6 +1237,9 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
     }
   }
 
+  // clean inappropriate pg_upmap/pg_upmap_items (if any)
+  osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);
+
   // features for osdmap and its incremental
   uint64_t features = mon->get_quorum_con_features();
 
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index fb7019a9b4c71..5a5664d6d2cb9 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1614,6 +1614,112 @@ void OSDMap::clean_temps(CephContext *cct,
   }
 }
 
+void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
+                                    const OSDMap& osdmap,
+                                    Incremental *pending_inc)
+{
+  ldout(cct, 10) << __func__ << dendl;
+  OSDMap tmpmap;
+  tmpmap.deepish_copy_from(osdmap);
+  tmpmap.apply_incremental(*pending_inc);
+
+  for (auto& p : tmpmap.pg_upmap) {
+    ldout(cct, 10) << __func__ << " pg_upmap entry "
+                   << "[" << p.first << ":" << p.second << "]"
+                   << dendl;
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+    if (crush_rule < 0) {
+      lderr(cct) << __func__ << " unable to load crush-rule of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+    if (type < 0) {
+      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    ldout(cct, 10) << __func__ << " pg " << p.first
+                   << " crush-rule-id " << crush_rule
+                   << " failure-domain-type " << type
+                   << dendl;
+    vector<int> raw;
+    int primary;
+    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+    set<int> parents;
+    bool collide = false;
+    for (auto osd : raw) {
+      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
+      auto r = parents.insert(parent);
+      if (!r.second) {
+        collide = true;
+        break;
+      }
+    }
+    if (collide) {
+      ldout(cct, 10) << __func__ << " removing invalid pg_upmap "
+                     << "[" << p.first << ":" << p.second << "]"
+                     << ", final mapping result will be: " << raw
+                     << dendl;
+      auto it = pending_inc->new_pg_upmap.find(p.first);
+      if (it != pending_inc->new_pg_upmap.end()) {
+        pending_inc->new_pg_upmap.erase(it);
+      }
+      if (osdmap.pg_upmap.count(p.first)) {
+        pending_inc->old_pg_upmap.insert(p.first);
+      }
+    }
+  }
+  for (auto& p : tmpmap.pg_upmap_items) {
+    ldout(cct, 10) << __func__ << " pg_upmap_items entry "
+                   << "[" << p.first << ":" << p.second << "]"
+                   << dendl;
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+    if (crush_rule < 0) {
+      lderr(cct) << __func__ << " unable to load crush-rule of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+    if (type < 0) {
+      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    ldout(cct, 10) << __func__ << " pg " << p.first
+                   << " crush_rule_id " << crush_rule
+                   << " failure_domain_type " << type
+                   << dendl;
+    vector<int> raw;
+    int primary;
+    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+    set<int> parents;
+    bool collide = false;
+    for (auto osd : raw) {
+      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
+      auto r = parents.insert(parent);
+      if (!r.second) {
+        collide = true;
+        break;
+      }
+    }
+    if (collide) {
+      ldout(cct, 10) << __func__ << " removing invalid pg_upmap_items "
+                     << "[" << p.first << ":" << p.second << "]"
+                     << ", final mapping result will be: " << raw
+                     << dendl;
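+      // cancel this remapping outright rather than trying to repair it;
+      // the balancer can always generate a fresh one later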
+      // This is overkill, but simpler..
+      auto it = pending_inc->new_pg_upmap_items.find(p.first);
+      if (it != pending_inc->new_pg_upmap_items.end()) {
+        pending_inc->new_pg_upmap_items.erase(it);
+      }
+      if (osdmap.pg_upmap_items.count(p.first)) {
+        pending_inc->old_pg_upmap_items.insert(p.first);
+      }
+    }
+  }
+}
+
 int OSDMap::apply_incremental(const Incremental &inc)
 {
   new_blacklist_entries = false;
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 28e6f9bcfc74a..efd7ac92e52d0 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -988,6 +988,10 @@ public:
    */
   uint64_t get_up_osd_features() const;
 
+  void maybe_remove_pg_upmaps(CephContext *cct,
+                              const OSDMap& osdmap,
+                              Incremental *pending_inc);
+
   int apply_incremental(const Incremental &inc);
 
   /// try to re-use/reference addrs in oldmap from newmap
@@ -1068,6 +1072,15 @@ public:
     return p->get_size();
   }
 
+  int get_pg_pool_crush_rule(pg_t pgid) const {
+    if (!pg_exists(pgid)) {
+      return -ENOENT;
+    }
+    const pg_pool_t *p = get_pg_pool(pgid.pool());
+    assert(p);
+    return p->get_crush_rule();
+  }
+
 private:
   /// pg -> (raw osd list)
   void _pg_to_raw_osds(
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index 20037de02132b..e561727aa27ec 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -92,7 +92,62 @@ public:
     osdmap.apply_incremental(new_pool_inc);
   }
   unsigned int get_num_osds() { return num_osds; }
-
+  void get_crush(CrushWrapper& newcrush) {
+    bufferlist bl;
+    osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+    bufferlist::iterator p = bl.begin();
+    newcrush.decode(p);
+  }
+  int crush_move(const string &name, const vector<string> &argvec) {
+    map<string,string> loc;
+    CrushWrapper::parse_loc_map(argvec, &loc);
+    CrushWrapper newcrush;
+    get_crush(newcrush);
+    if (!newcrush.name_exists(name)) {
+      return -ENOENT;
+    }
+    int id = newcrush.get_item_id(name);
+    int err;
+    if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
+      if (id >= 0) {
+        err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
+      } else {
+        err = newcrush.move_bucket(g_ceph_context, id, loc);
+      }
+      if (err >= 0) {
+        OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+        pending_inc.crush.clear();
+        newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+        osdmap.apply_incremental(pending_inc);
+        err = 0;
+      }
+    } else {
+      // already there
+      err = 0;
+    }
+    return err;
+  }
+  int crush_rule_create_replicated(const string &name,
+                                   const string &root,
+                                   const string &type) {
+    if (osdmap.crush->rule_exists(name)) {
+      return osdmap.crush->get_rule_id(name);
+    }
+    CrushWrapper newcrush;
+    get_crush(newcrush);
+    string device_class;
+    stringstream ss;
+    int ruleno = newcrush.add_simple_rule(
+      name, root, type, device_class,
+      "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
+    if (ruleno >= 0) {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+    return ruleno;
+  }
   void test_mappings(int pool,
                      int num,
                      vector<int> *any,
@@ -481,6 +536,240 @@ TEST_F(OSDMapTest, parse_osd_id_list) {
   ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout));
 }
 
+TEST_F(OSDMapTest, CleanPGUpmaps) {
+  set_up_map();
+
+  // build a crush rule of type host
+  const int expected_host_num = 3;
+  int osd_per_host = get_num_osds() / expected_host_num;
+  ASSERT_GE(osd_per_host, 2);
+  int index = 0;
+  for (int i = 0; i < (int)get_num_osds(); i++) {
+    if (i && i % osd_per_host == 0) {
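+      // current host has received its full share; move on to the next one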
+      ++index;
+    }
+    stringstream osd_name;
+    stringstream host_name;
+    vector<string> move_to;
+    osd_name << "osd." << i;
+    host_name << "host-" << index;
+    move_to.push_back("root=default");
+    string host_loc = "host=" + host_name.str();
+    move_to.push_back(host_loc);
+    int r = crush_move(osd_name.str(), move_to);
+    ASSERT_EQ(0, r);
+  }
+  const string upmap_rule = "upmap";
+  int upmap_rule_no = crush_rule_create_replicated(
+    upmap_rule, "default", "host");
+  ASSERT_LT(0, upmap_rule_no);
+
+  // create a replicated pool which references the above rule
+  OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+  new_pool_inc.new_pool_max = osdmap.get_pool_max();
+  new_pool_inc.fsid = osdmap.get_fsid();
+  pg_pool_t empty;
+  uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max;
+  pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty);
+  p->size = 2;
+  p->set_pg_num(64);
+  p->set_pgp_num(64);
+  p->type = pg_pool_t::TYPE_REPLICATED;
+  p->crush_rule = upmap_rule_no;
+  p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+  new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool";
+  osdmap.apply_incremental(new_pool_inc);
+
+  pg_t rawpg(0, upmap_pool_id);
+  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+  vector<int> up;
+  int up_primary;
+  osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+  ASSERT_LT(1U, up.size());
+  {
+    // validate we won't have two OSDs from the same host
+    int parent_0 = osdmap.crush->get_parent_of_type(up[0],
+      osdmap.crush->get_type_id("host"));
+    int parent_1 = osdmap.crush->get_parent_of_type(up[1],
+      osdmap.crush->get_type_id("host"));
+    ASSERT_TRUE(parent_0 != parent_1);
+  }
+
+  {
+    // TEST pg_upmap
+    {
+      // STEP-1: enumerate all children of up[0]'s parent,
+      // replace up[1] with one of them (other than up[0])
+      int parent = osdmap.crush->get_parent_of_type(up[0],
+        osdmap.crush->get_type_id("host"));
+      set<int> candidates;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), &candidates);
+      ASSERT_LT(1U, candidates.size());
+      int replaced_by = -1;
+      for (auto c: candidates) {
+        if (c != up[0]) {
+          replaced_by = c;
+          break;
+        }
+      }
+      ASSERT_NE(-1, replaced_by);
+      // generate a new pg_upmap item and apply
+      vector<int32_t> new_pg_upmap;
+      new_pg_upmap.push_back(up[0]);
+      new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+        new_pg_upmap.begin(), new_pg_upmap.end());
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap is there
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        ASSERT_TRUE(new_up[0] == new_pg_upmap[0]);
+        ASSERT_TRUE(new_up[1] == new_pg_upmap[1]);
+        // and we shall have two OSDs from the same host now..
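+        // (exactly the collided state maybe_remove_pg_upmaps() must detect)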
+        int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
+          osdmap.crush->get_type_id("host"));
+        int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
+          osdmap.crush->get_type_id("host"));
+        ASSERT_TRUE(parent_0 == parent_1);
+      }
+    }
+    {
+      // STEP-2: apply cure
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap is gone (reverted)
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(new_up == up);
+        ASSERT_TRUE(new_up_primary == up_primary);
+      }
+    }
+  }
+
+  {
+    // TEST pg_upmap_items
+    // enumerate all used hosts first
+    set<int> parents;
+    for (auto u: up) {
+      int parent = osdmap.crush->get_parent_of_type(u,
+        osdmap.crush->get_type_id("host"));
+      ASSERT_GT(0, parent);
+      parents.insert(parent);
+    }
+    int candidate_parent = 0;
+    set<int> candidate_children;
+    vector<int> up_after_out;
+    {
+      // STEP-1: try marking out up[1] and all other OSDs from the same host
+      int parent = osdmap.crush->get_parent_of_type(up[1],
+        osdmap.crush->get_type_id("host"));
+      set<int> children;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
+        &children);
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      for (auto c: children) {
+        pending_inc.new_weight[c] = CEPH_OSD_OUT;
+      }
+      OSDMap tmpmap;
+      tmpmap.deepish_copy_from(osdmap);
+      tmpmap.apply_incremental(pending_inc);
+      vector<int> new_up;
+      int new_up_primary;
+      tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+      // verify that we'll have OSDs from a different host..
+      int will_choose = -1;
+      for (auto o: new_up) {
+        int parent = tmpmap.crush->get_parent_of_type(o,
+          osdmap.crush->get_type_id("host"));
+        if (!parents.count(parent)) {
+          will_choose = o;
+          candidate_parent = parent; // record
+          break;
+        }
+      }
+      ASSERT_LT(-1, will_choose); // it is an OSD!
+      ASSERT_TRUE(candidate_parent != 0);
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent),
+        &candidate_children);
+      ASSERT_TRUE(candidate_children.count(will_choose));
+      candidate_children.erase(will_choose);
+      ASSERT_TRUE(!candidate_children.empty());
+      up_after_out = new_up; // needed for verification..
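+      // candidate_children now holds the spare OSDs under candidate_parent
+      // (a host this PG currently does not touch); one of them will serve
+      // as the pg_upmap_items target in STEP-2 below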
+    }
+    {
+      // STEP-2: generate a new pg_upmap_items entry by
+      // replacing up[0] with one coming from candidate_children
+      int victim = up[0];
+      int replaced_by = *candidate_children.begin();
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
+      // apply
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap_items is there
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) !=
+          new_up.end());
+        // and up[1] too
+        ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) !=
+          new_up.end());
+      }
+    }
+    {
+      // STEP-3: mark out up[1] and all other OSDs from the same host
+      int parent = osdmap.crush->get_parent_of_type(up[1],
+        osdmap.crush->get_type_id("host"));
+      set<int> children;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
+        &children);
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      for (auto c: children) {
+        pending_inc.new_weight[c] = CEPH_OSD_OUT;
+      }
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate we have two OSDs from the same host now..
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
+          osdmap.crush->get_type_id("host"));
+        int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
+          osdmap.crush->get_type_id("host"));
+        ASSERT_TRUE(parent_0 == parent_1);
+      }
+    }
+    {
+      // STEP-4: apply cure
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap_items is gone (reverted)
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(new_up == up_after_out);
+      }
+    }
+  }
+}
+
 TEST(PGTempMap, basic)
 {
   PGTempMap m;
-- 
2.39.5
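
For reference, the guard above reduces to a duplicate-parent check over
a PG's final up set. Below is a minimal, self-contained sketch of that
check, assuming a fixed host-level failure domain and using a
hypothetical flat osd->host table (host_of) as a stand-in for
CrushWrapper::get_parent_of_type(); it illustrates the idea only and is
not the actual OSDMap code:

#include <iostream>
#include <map>
#include <set>
#include <vector>

// hypothetical stand-in for CrushWrapper::get_parent_of_type(osd, type)
static int parent_of(const std::map<int, int>& host_of, int osd) {
  auto it = host_of.find(osd);
  return it == host_of.end() ? -1 : it->second;
}

// true if two OSDs of the up set share a failure domain, i.e. the
// corresponding pg_upmap/pg_upmap_items entry would have to be cancelled
static bool collides(const std::map<int, int>& host_of,
                     const std::vector<int>& up) {
  std::set<int> parents;
  for (int osd : up) {
    if (!parents.insert(parent_of(host_of, osd)).second)
      return true;  // insertion failed: duplicate parent, i.e. a collision
  }
  return false;
}

int main() {
  // topology from the commit message:
  // host-a(-5)={0,1}, host-b(-7)={2,3}, host-c(-9)={4,5}
  std::map<int, int> host_of = {
    {0, -5}, {1, -5}, {2, -7}, {3, -7}, {4, -9}, {5, -9},
  };
  std::cout << collides(host_of, {4, 3}) << "\n";  // 0: step 2), still valid
  std::cout << collides(host_of, {4, 5}) << "\n";  // 1: step 3), must cancel
}

Run against the topology from the commit message, the sketch accepts
the mapping [4 3] from step 2) and flags [4 5] from step 3) as a
collision that maybe_remove_pg_upmaps() would cancel.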