mon, osd: fix potential *Up Set* collision after PG remapping (20653/head)
author     xie xingguo <xie.xingguo@zte.com.cn>
           Wed, 28 Feb 2018 02:50:00 +0000 (10:50 +0800)
committer  xie xingguo <xie.xingguo@zte.com.cn>
           Thu, 1 Mar 2018 11:35:49 +0000 (19:35 +0800)
The mgr balancer module basically performs its optimizations against
snapshots of the OSDMap taken at certain moments, which turns out to be
a potential culprit of data loss, since in upmap mode it can sometimes
produce bad PG mapping results.
For example:
1) original cluster topology:

-5       2.00000     host host-a
 0   ssd 1.00000         osd.0       up  1.00000 1.00000
 1   ssd 1.00000         osd.1       up  1.00000 1.00000
-7       2.00000     host host-b
 2   ssd 1.00000         osd.2       up  1.00000 1.00000
 3   ssd 1.00000         osd.3       up  1.00000 1.00000
-9       2.00000     host host-c
 4   ssd 1.00000         osd.4       up  1.00000 1.00000
 5   ssd 1.00000         osd.5       up  1.00000 1.00000

2) the mgr balancer applies an optimization for PG 3.f:

            pg-upmap-items[3.f : 1->4]
3.f [1 3] + -------------------------> [4 3]

3) osd.3 is marked out, reweighted, etc., so the original crush mapping
   of 3.f changes (while pg-upmap-items does not):

            pg-upmap-items[3.f : 1->4]
3.f [1 5] + -------------------------> [4 5]

4) we are now mapping PG 3.f to two OSDs (osd.4 and osd.5) on the same
   host (host-c).

Fix the above problem by adding a guard procedure that runs before these
*unsafe* upmap remappings are finally encoded into the OSDMap.
If any of them turns out to be inappropriate, we can simply cancel it,
since the balancer can still re-calculate and re-generate it later if
necessary.
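
The core of the guard can be sketched as follows (a simplified,
standalone rendering of the collision check added to
OSDMap::maybe_remove_pg_upmaps() below; `tmpmap` is a scratch OSDMap
with the pending incremental applied, and `type` is the failure-domain
type of the PG's crush rule):

    // Walk the PG's resulting up set and record the failure-domain
    // ancestor (e.g. the host bucket) of each OSD; a duplicate ancestor
    // means two replicas would land in the same failure domain.
    vector<int> raw;
    int primary;
    tmpmap.pg_to_raw_up(pgid, &raw, &primary);
    set<int> parents;
    bool collide = false;
    for (auto osd : raw) {
      int parent = tmpmap.crush->get_parent_of_type(osd, type);
      if (!parents.insert(parent).second) { // insertion failed: duplicate
        collide = true;
        break;
      }
    }
    // if collide is set, the pg_upmap/pg_upmap_items entry is cancelled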

Fixes: http://tracker.ceph.com/issues/23118
Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
src/crush/CrushWrapper.cc
src/crush/CrushWrapper.h
src/mon/OSDMonitor.cc
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/test/osd/TestOSDMap.cc

index b4643158d35af771b69f1464a9893620243748a5..b7fa7270351e8c2cb960b3ef402087c08c4dea79 100644 (file)
@@ -768,6 +768,25 @@ int CrushWrapper::get_children(int id, list<int> *children) const
   return b->size;
 }
 
+int CrushWrapper::get_rule_failure_domain(int rule_id)
+{
+  crush_rule *rule = get_rule(rule_id);
+  if (IS_ERR(rule)) {
+    return -ENOENT;
+  }
+  int type = 0; // default to osd-level
+  for (unsigned s = 0; s < rule->len; ++s) {
+    if ((rule->steps[s].op == CRUSH_RULE_CHOOSE_FIRSTN ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSE_INDEP ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_INDEP) &&
+         rule->steps[s].arg2 > type) {
+      type = rule->steps[s].arg2;
+    }
+  }
+  return type;
+}
+
 int CrushWrapper::_get_leaves(int id, list<int> *leaves) const
 {
   assert(leaves);
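
A minimal usage sketch of the new helper (hypothetical caller code, not
part of this patch; assumes a populated CrushWrapper `crush` and a valid
`rule_id`):

    int type = crush.get_rule_failure_domain(rule_id);
    if (type < 0) {
      // no such rule
    } else {
      // 'type' is the bucket type the rule's CHOOSE/CHOOSELEAF steps
      // spread replicas across (the largest such type id if there are
      // several); 0 means replicas are separated at the osd level only.
      const char *name = crush.get_type_name(type); // e.g. "host"
    }
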
index 4026b50c715585e2f733e082af9a569738b115c5..8671306455fbf2bdce3ca39ba4f947b1d4762578 100644 (file)
@@ -729,6 +729,13 @@ public:
    */
   int get_children(int id, list<int> *children) const;
 
+  /**
+    * get failure-domain type of a specific crush rule
+    * @param rule_id crush rule id
+    * @return type of failure-domain or a negative errno on error.
+    */
+  int get_rule_failure_domain(int rule_id);
+
   /**
     * enumerate leaves(devices) of given node
     *
index 03909a9d80685d4593bac41d6c2415eccdf6701c..e98ea3898bde78c533b1f9b3791ffe9f632c7914 100644 (file)
@@ -1237,6 +1237,9 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
     }
   }
 
+  // clean inappropriate pg_upmap/pg_upmap_items (if any)
+  osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);
+
   // features for osdmap and its incremental
   uint64_t features = mon->get_quorum_con_features();
 
index fb7019a9b4c711b12a55c084faa72137e942ec3a..5a5664d6d2cb93d9fc3a8619db5bc589b3953cf2 100644 (file)
@@ -1614,6 +1614,112 @@ void OSDMap::clean_temps(CephContext *cct,
   }
 }
 
+void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
+                                    const OSDMap& osdmap,
+                                    Incremental *pending_inc)
+{
+  ldout(cct, 10) << __func__ << dendl;
+  OSDMap tmpmap;
+  tmpmap.deepish_copy_from(osdmap);
+  tmpmap.apply_incremental(*pending_inc);
+
+  for (auto& p : tmpmap.pg_upmap) {
+    ldout(cct, 10) << __func__ << " pg_upmap entry "
+                   << "[" << p.first << ":" << p.second << "]"
+                   << dendl;
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+    if (crush_rule < 0) {
+      lderr(cct) << __func__ << " unable to load crush-rule of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+    if (type < 0) {
+      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    ldout(cct, 10) << __func__ << " pg " << p.first
+                   << " crush-rule-id " << crush_rule
+                   << " failure-domain-type " << type
+                   << dendl;
+    vector<int> raw;
+    int primary;
+    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+    set<int> parents;
+    bool collide = false;
+    for (auto osd : raw) {
+      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
+      auto r = parents.insert(parent);
+      if (!r.second) {
+        collide = true;
+        break;
+      }
+    }
+    if (collide) {
+      ldout(cct, 10) << __func__ << " removing invalid pg_upmap "
+                     << "[" << p.first << ":" << p.second << "]"
+                     << ", final mapping result will be: " << raw
+                     << dendl;
+      auto it = pending_inc->new_pg_upmap.find(p.first);
+      if (it != pending_inc->new_pg_upmap.end()) {
+        pending_inc->new_pg_upmap.erase(it);
+      }
+      if (osdmap.pg_upmap.count(p.first)) {
+        pending_inc->old_pg_upmap.insert(p.first);
+      }
+    }
+  }
+  for (auto& p : tmpmap.pg_upmap_items) {
+    ldout(cct, 10) << __func__ << " pg_upmap_items entry "
+                   << "[" << p.first << ":" << p.second << "]"
+                   << dendl;
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+    if (crush_rule < 0) {
+      lderr(cct) << __func__ << " unable to load crush-rule of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+    if (type < 0) {
+      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    ldout(cct, 10) << __func__ << " pg " << p.first
+                   << " crush_rule_id " << crush_rule
+                   << " failure_domain_type " << type
+                   << dendl;
+    vector<int> raw;
+    int primary;
+    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+    set<int> parents;
+    bool collide = false;
+    for (auto osd : raw) {
+      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
+      auto r = parents.insert(parent);
+      if (!r.second) {
+        collide = true;
+        break;
+      }
+    }
+    if (collide) {
+      ldout(cct, 10) << __func__ << " removing invalid pg_upmap_items "
+                     << "[" << p.first << ":" << p.second << "]"
+                     << ", final mapping result will be: " << raw
+                     << dendl;
+      // This is overkill, but simpler.
+      auto it = pending_inc->new_pg_upmap_items.find(p.first);
+      if (it != pending_inc->new_pg_upmap_items.end()) {
+        pending_inc->new_pg_upmap_items.erase(it);
+      }
+      if (osdmap.pg_upmap_items.count(p.first)) {
+        pending_inc->old_pg_upmap_items.insert(p.first);
+      }
+    }
+  }
+}
+
 int OSDMap::apply_incremental(const Incremental &inc)
 {
   new_blacklist_entries = false;
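
Note how an unsafe entry is cancelled in two steps above; written as a
hypothetical standalone helper (not part of the patch), the pattern for
pg_upmap would be:

    // Drop the entry if it is still only proposed in the pending
    // incremental; if it is already committed in the OSDMap, schedule
    // its removal instead.
    void cancel_pg_upmap(const OSDMap& osdmap,
                         OSDMap::Incremental *pending_inc,
                         pg_t pgid) {
      auto it = pending_inc->new_pg_upmap.find(pgid);
      if (it != pending_inc->new_pg_upmap.end())
        pending_inc->new_pg_upmap.erase(it);    // pending only: drop it
      if (osdmap.pg_upmap.count(pgid))
        pending_inc->old_pg_upmap.insert(pgid); // committed: remove it
    }
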
index 28e6f9bcfc74a116b72285c1fd3257c3e8e1e332..efd7ac92e52d071cc60448ee72ae59b44d2d1f91 100644 (file)
@@ -988,6 +988,10 @@ public:
    */
   uint64_t get_up_osd_features() const;
 
+  void maybe_remove_pg_upmaps(CephContext *cct,
+                              const OSDMap& osdmap,
+                              Incremental *pending_inc);
+
   int apply_incremental(const Incremental &inc);
 
   /// try to re-use/reference addrs in oldmap from newmap
@@ -1068,6 +1072,15 @@ public:
     return p->get_size();
   }
 
+  int get_pg_pool_crush_rule(pg_t pgid) const {
+    if (!pg_exists(pgid)) {
+      return -ENOENT;
+    }
+    const pg_pool_t *p = get_pg_pool(pgid.pool());
+    assert(p);
+    return p->get_crush_rule();
+  }
+
 private:
   /// pg -> (raw osd list)
   void _pg_to_raw_osds(
index 20037de02132b8d9d86e5a5feb2c516e78a1ab3f..e561727aa27ec11d116b39dc55bc2c9df949724d 100644 (file)
@@ -92,7 +92,62 @@ public:
     osdmap.apply_incremental(new_pool_inc);
   }
   unsigned int get_num_osds() { return num_osds; }
-
+  void get_crush(CrushWrapper& newcrush) {
+    bufferlist bl;
+    osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+    bufferlist::iterator p = bl.begin();
+    newcrush.decode(p);
+  }
+  int crush_move(const string &name, const vector<string> &argvec) {
+    map<string,string> loc;
+    CrushWrapper::parse_loc_map(argvec, &loc);
+    CrushWrapper newcrush;
+    get_crush(newcrush);
+    if (!newcrush.name_exists(name)) {
+       return -ENOENT;
+    }
+    int id = newcrush.get_item_id(name);
+    int err;
+    if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
+      if (id >= 0) {
+        err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
+      } else {
+        err = newcrush.move_bucket(g_ceph_context, id, loc);
+      }
+      if (err >= 0) {
+        OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+        pending_inc.crush.clear();
+        newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+        osdmap.apply_incremental(pending_inc);
+        err = 0;
+      }
+    } else {
+      // already there
+      err = 0;
+    }
+    return err;
+  }
+  int crush_rule_create_replicated(const string &name,
+                                   const string &root,
+                                   const string &type) {
+    if (osdmap.crush->rule_exists(name)) {
+      return osdmap.crush->get_rule_id(name);
+    }
+    CrushWrapper newcrush;
+    get_crush(newcrush);
+    string device_class;
+    stringstream ss;
+    int ruleno = newcrush.add_simple_rule(
+              name, root, type, device_class,
+              "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
+    if (ruleno >= 0) {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+    return ruleno;
+  }
   void test_mappings(int pool,
                     int num,
                     vector<int> *any,
@@ -481,6 +536,240 @@ TEST_F(OSDMapTest, parse_osd_id_list) {
   ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout));
 }
 
+TEST_F(OSDMapTest, CleanPGUpmaps) {
+  set_up_map();
+
+  // build a crush rule of type host
+  const int expected_host_num = 3;
+  int osd_per_host = get_num_osds() / expected_host_num;
+  ASSERT_GE(osd_per_host, 2); // need >= 2 OSDs per host to build a collision
+  int index = 0;
+  for (int i = 0; i < (int)get_num_osds(); i++) {
+    if (i && i % osd_per_host == 0) {
+      ++index;
+    }
+    stringstream osd_name;
+    stringstream host_name;
+    vector<string> move_to;
+    osd_name << "osd." << i;
+    host_name << "host-" << index;
+    move_to.push_back("root=default");
+    string host_loc = "host=" + host_name.str();
+    move_to.push_back(host_loc);
+    int r = crush_move(osd_name.str(), move_to);
+    ASSERT_EQ(0, r);
+  }
+  const string upmap_rule = "upmap";
+  int upmap_rule_no = crush_rule_create_replicated(
+    upmap_rule, "default", "host");
+  ASSERT_LT(0, upmap_rule_no);
+
+  // create a replicated pool which references the above rule
+  OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+  new_pool_inc.new_pool_max = osdmap.get_pool_max();
+  new_pool_inc.fsid = osdmap.get_fsid();
+  pg_pool_t empty;
+  uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max;
+  pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty);
+  p->size = 2;
+  p->set_pg_num(64);
+  p->set_pgp_num(64);
+  p->type = pg_pool_t::TYPE_REPLICATED;
+  p->crush_rule = upmap_rule_no;
+  p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+  new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool";
+  osdmap.apply_incremental(new_pool_inc);
+
+  pg_t rawpg(0, upmap_pool_id);
+  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+  vector<int> up;
+  int up_primary;
+  osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+  ASSERT_LT(1U, up.size());
+  {
+    // validate we won't have two OSDs from the same host
+    int parent_0 = osdmap.crush->get_parent_of_type(up[0],
+      osdmap.crush->get_type_id("host"));
+    int parent_1 = osdmap.crush->get_parent_of_type(up[1],
+      osdmap.crush->get_type_id("host"));
+    ASSERT_TRUE(parent_0 != parent_1);
+  }
+
+  {
+    // TEST pg_upmap
+    {
+      // STEP-1: enumerate all children of up[0]'s parent,
+      // replace up[1] with one of them (other than up[0])
+      int parent = osdmap.crush->get_parent_of_type(up[0],
+        osdmap.crush->get_type_id("host"));
+      set<int> candidates;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), &candidates);
+      ASSERT_LT(1U, candidates.size());
+      int replaced_by = -1;
+      for (auto c: candidates) {
+        if (c != up[0]) {
+          replaced_by = c;
+          break;
+        }
+      }
+      ASSERT_NE(-1, replaced_by);
+      // generate a new pg_upmap item and apply
+      vector<int32_t> new_pg_upmap;
+      new_pg_upmap.push_back(up[0]);
+      new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+        new_pg_upmap.begin(), new_pg_upmap.end());
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap is there
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        ASSERT_TRUE(new_up[0] == new_pg_upmap[0]);
+        ASSERT_TRUE(new_up[1] == new_pg_upmap[1]);
+        // and we should now have two OSDs from the same host
+        int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
+          osdmap.crush->get_type_id("host"));
+        int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
+          osdmap.crush->get_type_id("host"));
+        ASSERT_TRUE(parent_0 == parent_1);
+      }
+    }
+    {
+      // STEP-2: apply cure
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap is gone (reverted)
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(new_up == up);
+        ASSERT_TRUE(new_up_primary == up_primary);
+      }
+    }
+  }
+
+  {
+    // TEST pg_upmap_items
+    // enumerate all used hosts first
+    set<int> parents;
+    for (auto u: up) {
+      int parent = osdmap.crush->get_parent_of_type(u,
+        osdmap.crush->get_type_id("host"));
+      ASSERT_GT(0, parent);
+      parents.insert(parent);
+    }
+    int candidate_parent = 0;
+    set<int> candidate_children;
+    vector<int> up_after_out;
+    {
+      // STEP-1: try marking out up[1] and all other OSDs from the same host
+      int parent = osdmap.crush->get_parent_of_type(up[1],
+        osdmap.crush->get_type_id("host"));
+      set<int> children;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
+        &children);
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      for (auto c: children) {
+        pending_inc.new_weight[c] = CEPH_OSD_OUT;
+      }
+      OSDMap tmpmap;
+      tmpmap.deepish_copy_from(osdmap);
+      tmpmap.apply_incremental(pending_inc);
+      vector<int> new_up;
+      int new_up_primary;
+      tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+      // verify that we will now have an OSD from a previously unused host..
+      int will_choose = -1;
+      for (auto o: new_up) {
+        int parent = tmpmap.crush->get_parent_of_type(o,
+          osdmap.crush->get_type_id("host"));
+        if (!parents.count(parent)) {
+          will_choose = o;
+          candidate_parent = parent; // record
+          break;
+        }
+      }
+      ASSERT_LT(-1, will_choose); // it is an OSD!
+      ASSERT_TRUE(candidate_parent != 0);
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent),
+        &candidate_children);
+      ASSERT_TRUE(candidate_children.count(will_choose));
+      candidate_children.erase(will_choose);
+      ASSERT_TRUE(!candidate_children.empty());
+      up_after_out = new_up; // needed for verification..
+    }
+    {
+      // STEP-2: generate a new pg_upmap_items entry by
+      // replacing up[0] with one coming from candidate_children
+      int victim = up[0];
+      int replaced_by = *candidate_children.begin();
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
+      // apply
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap_items is there
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) !=
+          new_up.end());
+        // and up[1] too
+        ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) !=
+          new_up.end());
+      }
+    }
+    {
+      // STEP-3: mark out up[1] and all other OSDs from the same host
+      int parent = osdmap.crush->get_parent_of_type(up[1],
+        osdmap.crush->get_type_id("host"));
+      set<int> children;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
+        &children);
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      for (auto c: children) {
+        pending_inc.new_weight[c] = CEPH_OSD_OUT;
+      }
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate we have two OSDs from the same host now..
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
+          osdmap.crush->get_type_id("host"));
+        int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
+          osdmap.crush->get_type_id("host"));
+        ASSERT_TRUE(parent_0 == parent_1);
+      }
+    }
+    {
+      // STEP-4: apply cure
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap_items is gone (reverted)
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(new_up == up_after_out);
+      }
+    }
+  }
+}
+
 TEST(PGTempMap, basic)
 {
   PGTempMap m;