mon, osd: fix potentially collided *Up Set* after PG remapping
author     xie xingguo <xie.xingguo@zte.com.cn>
           Wed, 28 Feb 2018 02:50:00 +0000 (10:50 +0800)
committer  xie xingguo <xie.xingguo@zte.com.cn>
           Sat, 10 Mar 2018 02:56:24 +0000 (10:56 +0800)
The mgr balancer module basically performs optimizations against
snapshots of the OSDMap taken at certain moments, which turns out to be
a potential culprit of data loss since it can sometimes produce bad PG
mapping results while in upmap mode.
For example:
1) original cluster topology:

-5       2.00000     host host-a
 0   ssd 1.00000         osd.0       up  1.00000 1.00000
 1   ssd 1.00000         osd.1       up  1.00000 1.00000
-7       2.00000     host host-b
 2   ssd 1.00000         osd.2       up  1.00000 1.00000
 3   ssd 1.00000         osd.3       up  1.00000 1.00000
-9       2.00000     host host-c
 4   ssd 1.00000         osd.4       up  1.00000 1.00000
 5   ssd 1.00000         osd.5       up  1.00000 1.00000

2) mgr balancer applies optimization for PG 3.f:

            pg-upmap-items[3.f : 1->4]
3.f [1 3] + -------------------------> [4 3]

3) osd.3 is marked out/reweighted etc., so the original crush mapping of
   3.f changes (while pg-upmap-items does not):

            pg-upmap-items[3.f : 1->4]
3.f [1 5] + -------------------------> [4 5]

4) PG 3.f is now mapped to two OSDs (osd.4 & osd.5) on the same host
   (host-c).
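
For context, upmap entries are applied only after CRUSH has computed the
raw mapping, and they are never re-validated against the rule's failure
domain. A minimal sketch of that application step (simplified from
OSDMap::_apply_upmap; names here are illustrative, not the verbatim code):

  // CRUSH computes the raw up set first ...
  vector<int> raw = run_crush_rule(pgid);     // e.g. [1 5] once osd.3 is out
  // ... then pg_upmap_items rewrites OSDs one-for-one, with no re-check
  // against the rule's failure domain
  for (auto& i : pg_upmap_items[pgid]) {      // e.g. {1 -> 4}
    std::replace(raw.begin(), raw.end(), i.first, i.second);  // -> [4 5]
  }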

Fix the above problem by adding a guard procedure before these *unsafe*
upmap remappings can finally be encoded into the OSDMap.
If any of them turns out to be inappropriate, we simply cancel it, since
the balancer can still re-calculate and re-generate it later if necessary.
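
The guard's collision check is straightforward: map each OSD in the
would-be up set to its ancestor of the rule's failure-domain type, and
cancel the upmap if any ancestor repeats. Condensed from
OSDMap::maybe_remove_pg_upmaps below:

  set<int> parents;
  bool collide = false;
  for (auto osd : raw) {     // raw = up set with the upmap applied
    auto parent = crush->get_parent_of_type(osd, type);
    if (!parents.insert(parent).second) {  // duplicate failure domain
      collide = true;        // -> revert this pg_upmap/pg_upmap_items entry
      break;
    }
  }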

Fixes: http://tracker.ceph.com/issues/23118
Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
(cherry picked from commit 5fa467e06bd0cb4f6cc6e1de4283d73693c10eda)

Conflicts:
src/crush/CrushWrapper.cc

drop the **const** qualifier of CrushWrapper::_get_leaves present in master

src/crush/CrushWrapper.cc
src/crush/CrushWrapper.h
src/mon/OSDMonitor.cc
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/test/osd/TestOSDMap.cc

index 52af91f6f47d1f9c9815dd3f112b659c9878a8aa..18596f3bf97798ce1c7fa7e7449361b5dcad817a 100644 (file)
@@ -770,6 +770,25 @@ int CrushWrapper::get_children(int id, list<int> *children)
   return b->size;
 }
 
+int CrushWrapper::get_rule_failure_domain(int rule_id)
+{
+  crush_rule *rule = get_rule(rule_id);
+  if (IS_ERR(rule)) {
+    return -ENOENT;
+  }
+  int type = 0; // default to osd-level
+  for (unsigned s = 0; s < rule->len; ++s) {
+    if ((rule->steps[s].op == CRUSH_RULE_CHOOSE_FIRSTN ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSE_INDEP ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+         rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_INDEP) &&
+         rule->steps[s].arg2 > type) {
+      type = rule->steps[s].arg2;
+    }
+  }
+  return type;
+}
+
 int CrushWrapper::_get_leaves(int id, list<int> *leaves)
 {
   assert(leaves);
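
Note: get_rule_failure_domain() keeps the numerically largest bucket type
(arg2) seen across the rule's CHOOSE/CHOOSELEAF steps, defaulting to 0
(osd) when the rule has no such step. A hypothetical caller, assuming a
rule that contains "step chooseleaf firstn 0 type host":

  int type = crush->get_rule_failure_domain(rule_id);  // type id of "host"
  int parent = crush->get_parent_of_type(osd, type);   // host bucket holding osd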
index 8325b76c87be8c675b91df7dbbaa7840acf87a49..64e63760c7545019f5acc36b53318b4454025e8d 100644 (file)
@@ -727,6 +727,13 @@ public:
    */
   int get_children(int id, list<int> *children);
 
+  /**
+    * get failure-domain type of a specific crush rule
+    * @param rule_id crush rule id
+    * @return type of failure-domain or a negative errno on error.
+    */
+  int get_rule_failure_domain(int rule_id);
+
   /**
     * enumerate leaves(devices) of given node
     *
index 2783356f9b186a6af36b483074d68a513d16b1f8..90ddeac7985f15bb93ff1df5201930d41ed9b8c8 100644 (file)
@@ -1310,6 +1310,9 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
     }
   }
 
+  // clean inappropriate pg_upmap/pg_upmap_items (if any)
+  osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);
+
   // features for osdmap and its incremental
   uint64_t features = mon->get_quorum_con_features();
 
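Hooking the cleanup into OSDMonitor::encode_pending() puts it at the
single choke point where a pending incremental becomes part of the
authoritative OSDMap, so every source of upmap changes (the balancer,
the "ceph osd pg-upmap" / "ceph osd pg-upmap-items" commands, etc.) is
sanitized the same way.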
index 3143bbccf0cc9a67dbe89a6756dbb07241c114d5..3b790cd92a56fe4efbd9cdfed2ae61be86bd4194 100644 (file)
@@ -1600,6 +1600,112 @@ void OSDMap::clean_temps(CephContext *cct,
   }
 }
 
+void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
+                                    const OSDMap& osdmap,
+                                    Incremental *pending_inc)
+{
+  ldout(cct, 10) << __func__ << dendl;
+  OSDMap tmpmap;
+  tmpmap.deepish_copy_from(osdmap);
+  tmpmap.apply_incremental(*pending_inc);
+
+  for (auto& p : tmpmap.pg_upmap) {
+    ldout(cct, 10) << __func__ << " pg_upmap entry "
+                   << "[" << p.first << ":" << p.second << "]"
+                   << dendl;
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+    if (crush_rule < 0) {
+      lderr(cct) << __func__ << " unable to load crush-rule of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+    if (type < 0) {
+      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    ldout(cct, 10) << __func__ << " pg " << p.first
+                   << " crush-rule-id " << crush_rule
+                   << " failure-domain-type " << type
+                   << dendl;
+    vector<int> raw;
+    int primary;
+    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+    set<int> parents;
+    bool collide = false;
+    for (auto osd : raw) {
+      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
+      auto r = parents.insert(parent);
+      if (!r.second) {
+        collide = true;
+        break;
+      }
+    }
+    if (collide) {
+      ldout(cct, 10) << __func__ << " removing invalid pg_upmap "
+                     << "[" << p.first << ":" << p.second << "]"
+                     << ", final mapping result will be: " << raw
+                     << dendl;
+      auto it = pending_inc->new_pg_upmap.find(p.first);
+      if (it != pending_inc->new_pg_upmap.end()) {
+        pending_inc->new_pg_upmap.erase(it);
+      }
+      if (osdmap.pg_upmap.count(p.first)) {
+        pending_inc->old_pg_upmap.insert(p.first);
+      }
+    }
+  }
+  for (auto& p : tmpmap.pg_upmap_items) {
+    ldout(cct, 10) << __func__ << " pg_upmap_items entry "
+                   << "[" << p.first << ":" << p.second << "]"
+                   << dendl;
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+    if (crush_rule < 0) {
+      lderr(cct) << __func__ << " unable to load crush-rule of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
+    if (type < 0) {
+      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
+                 << p.first << dendl;
+      continue;
+    }
+    ldout(cct, 10) << __func__ << " pg " << p.first
+                   << " crush_rule_id " << crush_rule
+                   << " failure_domain_type " << type
+                   << dendl;
+    vector<int> raw;
+    int primary;
+    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+    set<int> parents;
+    bool collide = false;
+    for (auto osd : raw) {
+      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
+      auto r = parents.insert(parent);
+      if (!r.second) {
+        collide = true;
+        break;
+      }
+    }
+    if (collide) {
+      ldout(cct, 10) << __func__ << " removing invalid pg_upmap_items "
+                     << "[" << p.first << ":" << p.second << "]"
+                     << ", final mapping result will be: " << raw
+                     << dendl;
+      // This is overkill, but simpler..
+      auto it = pending_inc->new_pg_upmap_items.find(p.first);
+      if (it != pending_inc->new_pg_upmap_items.end()) {
+        pending_inc->new_pg_upmap_items.erase(it);
+      }
+      if (osdmap.pg_upmap_items.count(p.first)) {
+        pending_inc->old_pg_upmap_items.insert(p.first);
+      }
+    }
+  }
+}
+
 int OSDMap::apply_incremental(const Incremental &inc)
 {
   new_blacklist_entries = false;
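
Usage is mirrored by the unit test below: build (or reuse) a pending
Incremental, let maybe_remove_pg_upmaps() cancel any entry that would
collide, then apply:

  OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
  osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
  osdmap.apply_incremental(pending_inc);  // colliding upmaps are reverted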
index 6ba56511823d1390ad4aa5a4bfcd7b8cfcaabe40..c75a874c75639e94a4ab256973569be5de99c927 100644 (file)
@@ -979,6 +979,10 @@ public:
    */
   uint64_t get_up_osd_features() const;
 
+  void maybe_remove_pg_upmaps(CephContext *cct,
+                              const OSDMap& osdmap,
+                              Incremental *pending_inc);
+
   int apply_incremental(const Incremental &inc);
 
   /// try to re-use/reference addrs in oldmap from newmap
@@ -1059,6 +1063,15 @@ public:
     return p->get_size();
   }
 
+  int get_pg_pool_crush_rule(pg_t pgid) const {
+    if (!pg_exists(pgid)) {
+      return -ENOENT;
+    }
+    const pg_pool_t *p = get_pg_pool(pgid.pool());
+    assert(p);
+    return p->get_crush_rule();
+  }
+
 private:
   /// pg -> (raw osd list)
   void _pg_to_raw_osds(
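
get_pg_pool_crush_rule() is the lookup glue that lets
maybe_remove_pg_upmaps() go from a pg to its rule's failure domain; a
hypothetical chained lookup (callers must check for a negative errno
first, as the code above does):

  int rule = osdmap.get_pg_pool_crush_rule(pgid);          // -ENOENT if pg absent
  int type = osdmap.crush->get_rule_failure_domain(rule);  // failure-domain type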
index f94f7402a5e74c9ccb3f18dce0e3ebfd79b77f65..6031464dd52c67a14622b3a65737f2704addfa2e 100644 (file)
@@ -92,7 +92,62 @@ public:
     osdmap.apply_incremental(new_pool_inc);
   }
   unsigned int get_num_osds() { return num_osds; }
-
+  void get_crush(CrushWrapper& newcrush) {
+    bufferlist bl;
+    osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+    bufferlist::iterator p = bl.begin();
+    newcrush.decode(p);
+  }
+  int crush_move(const string &name, const vector<string> &argvec) {
+    map<string,string> loc;
+    CrushWrapper::parse_loc_map(argvec, &loc);
+    CrushWrapper newcrush;
+    get_crush(newcrush);
+    if (!newcrush.name_exists(name)) {
+       return -ENOENT;
+    }
+    int id = newcrush.get_item_id(name);
+    int err;
+    if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
+      if (id >= 0) {
+        err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
+      } else {
+        err = newcrush.move_bucket(g_ceph_context, id, loc);
+      }
+      if (err >= 0) {
+        OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+        pending_inc.crush.clear();
+        newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+        osdmap.apply_incremental(pending_inc);
+        err = 0;
+      }
+    } else {
+      // already there
+      err = 0;
+    }
+    return err;
+  }
+  int crush_rule_create_replicated(const string &name,
+                                   const string &root,
+                                   const string &type) {
+    if (osdmap.crush->rule_exists(name)) {
+      return osdmap.crush->get_rule_id(name);
+    }
+    CrushWrapper newcrush;
+    get_crush(newcrush);
+    string device_class;
+    stringstream ss;
+    int ruleno = newcrush.add_simple_rule(
+              name, root, type, device_class,
+              "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
+    if (ruleno >= 0) {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+    return ruleno;
+  }
   void test_mappings(int pool,
                     int num,
                     vector<int> *any,
@@ -484,6 +539,240 @@ TEST_F(OSDMapTest, parse_osd_id_list) {
   ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout));
 }
 
+TEST_F(OSDMapTest, CleanPGUpmaps) {
+  set_up_map();
+
+  // build a crush rule of type host
+  const int expected_host_num = 3;
+  int osd_per_host = get_num_osds() / expected_host_num;
+  ASSERT_LE(2, osd_per_host);
+  int index = 0;
+  for (int i = 0; i < (int)get_num_osds(); i++) {
+    if (i && i % osd_per_host == 0) {
+      ++index;
+    }
+    stringstream osd_name;
+    stringstream host_name;
+    vector<string> move_to;
+    osd_name << "osd." << i;
+    host_name << "host-" << index;
+    move_to.push_back("root=default");
+    string host_loc = "host=" + host_name.str();
+    move_to.push_back(host_loc);
+    int r = crush_move(osd_name.str(), move_to);
+    ASSERT_EQ(0, r);
+  }
+  const string upmap_rule = "upmap";
+  int upmap_rule_no = crush_rule_create_replicated(
+    upmap_rule, "default", "host");
+  ASSERT_LT(0, upmap_rule_no);
+
+  // create a replicated pool which references the above rule
+  OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+  new_pool_inc.new_pool_max = osdmap.get_pool_max();
+  new_pool_inc.fsid = osdmap.get_fsid();
+  pg_pool_t empty;
+  uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max;
+  pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty);
+  p->size = 2;
+  p->set_pg_num(64);
+  p->set_pgp_num(64);
+  p->type = pg_pool_t::TYPE_REPLICATED;
+  p->crush_rule = upmap_rule_no;
+  p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+  new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool";
+  osdmap.apply_incremental(new_pool_inc);
+
+  pg_t rawpg(0, upmap_pool_id);
+  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+  vector<int> up;
+  int up_primary;
+  osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+  ASSERT_LT(1U, up.size());
+  {
+    // validate we won't have two OSDs from the same host
+    int parent_0 = osdmap.crush->get_parent_of_type(up[0],
+      osdmap.crush->get_type_id("host"));
+    int parent_1 = osdmap.crush->get_parent_of_type(up[1],
+      osdmap.crush->get_type_id("host"));
+    ASSERT_TRUE(parent_0 != parent_1);
+  }
+
+  {
+    // TEST pg_upmap
+    {
+      // STEP-1: enumerate all children of up[0]'s parent,
+      // replace up[1] with one of them (other than up[0])
+      int parent = osdmap.crush->get_parent_of_type(up[0],
+        osdmap.crush->get_type_id("host"));
+      set<int> candidates;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), &candidates);
+      ASSERT_LT(1U, candidates.size());
+      int replaced_by = -1;
+      for (auto c: candidates) {
+        if (c != up[0]) {
+          replaced_by = c;
+          break;
+        }
+      }
+      ASSERT_NE(-1, replaced_by);
+      // generate a new pg_upmap item and apply
+      vector<int32_t> new_pg_upmap;
+      new_pg_upmap.push_back(up[0]);
+      new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+        new_pg_upmap.begin(), new_pg_upmap.end());
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap is there
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        ASSERT_TRUE(new_up[0] == new_pg_upmap[0]);
+        ASSERT_TRUE(new_up[1] == new_pg_upmap[1]);
+        // and we shall have two OSDs from the same host now..
+        int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
+          osdmap.crush->get_type_id("host"));
+        int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
+          osdmap.crush->get_type_id("host"));
+        ASSERT_TRUE(parent_0 == parent_1);
+      }
+    }
+    {
+      // STEP-2: apply cure
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap is gone (reverted)
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(new_up == up);
+        ASSERT_TRUE(new_up_primary == up_primary);
+      }
+    }
+  }
+
+  {
+    // TEST pg_upmap_items
+    // enumerate all used hosts first
+    set<int> parents;
+    for (auto u: up) {
+      int parent = osdmap.crush->get_parent_of_type(u,
+        osdmap.crush->get_type_id("host"));
+      ASSERT_GT(0, parent);
+      parents.insert(parent);
+    }
+    int candidate_parent = 0;
+    set<int> candidate_children;
+    vector<int> up_after_out;
+    {
+      // STEP-1: try mark out up[1] and all other OSDs from the same host
+      int parent = osdmap.crush->get_parent_of_type(up[1],
+        osdmap.crush->get_type_id("host"));
+      set<int> children;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
+        &children);
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      for (auto c: children) {
+        pending_inc.new_weight[c] = CEPH_OSD_OUT;
+      }
+      OSDMap tmpmap;
+      tmpmap.deepish_copy_from(osdmap);
+      tmpmap.apply_incremental(pending_inc);
+      vector<int> new_up;
+      int new_up_primary;
+      tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+      // verify that we'll have OSDs from a different host..
+      int will_choose = -1;
+      for (auto o: new_up) {
+        int parent = tmpmap.crush->get_parent_of_type(o,
+          osdmap.crush->get_type_id("host"));
+        if (!parents.count(parent)) {
+          will_choose = o;
+          candidate_parent = parent; // record
+          break;
+        }
+      }
+      ASSERT_LT(-1, will_choose); // it is an OSD!
+      ASSERT_TRUE(candidate_parent != 0);
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent),
+        &candidate_children);
+      ASSERT_TRUE(candidate_children.count(will_choose));
+      candidate_children.erase(will_choose);
+      ASSERT_TRUE(!candidate_children.empty());
+      up_after_out = new_up; // needed for verification..
+    }
+    {
+      // STEP-2: generating a new pg_upmap_items entry by
+      // replacing up[0] with one coming from candidate_children
+      int victim = up[0];
+      int replaced_by = *candidate_children.begin();
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
+      // apply
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap_items is there
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) !=
+          new_up.end());
+        // and up[1] too
+        ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) !=
+          new_up.end());
+      }
+    }
+    {
+      // STEP-3: mark out up[1] and all other OSDs from the same host
+      int parent = osdmap.crush->get_parent_of_type(up[1],
+        osdmap.crush->get_type_id("host"));
+      set<int> children;
+      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
+        &children);
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      for (auto c: children) {
+        pending_inc.new_weight[c] = CEPH_OSD_OUT;
+      }
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate we have two OSDs from the same host now..
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(up.size() == new_up.size());
+        int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
+          osdmap.crush->get_type_id("host"));
+        int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
+          osdmap.crush->get_type_id("host"));
+        ASSERT_TRUE(parent_0 == parent_1);
+      } 
+    }
+    {
+      // STEP-4: apply cure
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+      osdmap.apply_incremental(pending_inc);
+      {
+        // validate pg_upmap_items is gone (reverted)
+        vector<int> new_up;
+        int new_up_primary;
+        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
+        ASSERT_TRUE(new_up == up_after_out);
+      }
+    }
+  }
+}
+
 TEST(PGTempMap, basic)
 {
   PGTempMap m;