git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/OSDMap: don't map all pgs each time in calc_pg_upmaps
author huangjun <huangjun@xsky.com>
Fri, 24 Aug 2018 14:47:02 +0000 (22:47 +0800)
committer xie xingguo <xie.xingguo@zte.com.cn>
Fri, 25 Jan 2019 01:37:08 +0000 (09:37 +0800)
We have a cluster pool with 32768 pgs and 400 osds; running upmap with
'--upmap-max 32768 --upmap-deviation 0.01' takes 600 seconds, which is pretty slow.
After adding some debug code, we found that most of the time is spent in pg_to_up_acting_osds.
One pg_to_up_acting_osds call takes about 12us on average, so mapping the whole pool's pgs costs
about 500ms per iteration. We ended up with 1429 pgs that need upmap, so it costs about 600 seconds in total.
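With this patch, the pg-to-osd mapping is built once before the loop and pgs_by_osd is updated
incrementally whenever a pg_upmap_items entry is added or dropped, so it only takes 5 seconds to get the job done.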

Signed-off-by: huangjun <huangjun@xsky.com>
(cherry picked from commit da45e4e352b30cc4f6fd52f2f030bf7569eaee57)

src/osd/OSDMap.cc

index 3b7ddfa4a40fab9f6ffd0320ca0a5817d7abcb97..a78fd3240efcf62ea10f85611b95de802e1c8c7a 100644 (file)
@@ -4024,15 +4024,14 @@ int OSDMap::calc_pg_upmaps(
   float start_deviation = 0;
   float end_deviation = 0;
   int num_changed = 0;
-  while (true) {
-    map<int,set<pg_t>> pgs_by_osd;
-    int total_pgs = 0;
-    float osd_weight_total = 0;
-    map<int,float> osd_weight;
-    for (auto& i : pools) {
-      if (!only_pools.empty() && !only_pools.count(i.first))
+  map<int,set<pg_t>> pgs_by_osd;
+  int total_pgs = 0;
+  float osd_weight_total = 0;
+  map<int,float> osd_weight;
+  for (auto& i : pools) {
+    if (!only_pools.empty() && !only_pools.count(i.first))
        continue;
-      for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
+    for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
        pg_t pg(ps, i.first);
        vector<int> up;
        tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
@@ -4040,43 +4039,43 @@ int OSDMap::calc_pg_upmaps(
          if (osd != CRUSH_ITEM_NONE)
            pgs_by_osd[osd].insert(pg);
        }
-      }
-      total_pgs += i.second.get_size() * i.second.get_pg_num();
+    }
+    total_pgs += i.second.get_size() * i.second.get_pg_num();
 
-      map<int,float> pmap;
-      int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
+    map<int,float> pmap;
+    int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
                                        i.second.get_type(),
                                        i.second.get_size());
-      tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
-      ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
-      for (auto p : pmap) {
+    tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
+    ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
+    for (auto p : pmap) {
        auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
-        if (adjusted_weight == 0) {
-          continue;
-        }
+      if (adjusted_weight == 0) {
+        continue;
+      }
        osd_weight[p.first] += adjusted_weight;
        osd_weight_total += adjusted_weight;
-      }
     }
-    for (auto& i : osd_weight) {
-      int pgs = 0;
-      auto p = pgs_by_osd.find(i.first);
-      if (p != pgs_by_osd.end())
+  }
+  for (auto& i : osd_weight) {
+    int pgs = 0;
+    auto p = pgs_by_osd.find(i.first);
+    if (p != pgs_by_osd.end())
        pgs = p->second.size();
-      else
+    else
        pgs_by_osd.emplace(i.first, set<pg_t>());
-      ldout(cct, 20) << " osd." << i.first << " weight " << i.second
+    ldout(cct, 20) << " osd." << i.first << " weight " << i.second
                     << " pgs " << pgs << dendl;
-    }
-
-    if (osd_weight_total == 0) {
-      lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
-      break;
-    }
-    float pgs_per_weight = total_pgs / osd_weight_total;
-    ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
-    ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
+  }
+  if (osd_weight_total == 0) {
+    lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
+    return 0;
+  }
+  float pgs_per_weight = total_pgs / osd_weight_total;
+  ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
+  ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
 
+  while (true) {
     // osd deviation
     float total_deviation = 0;
     map<int,float> osd_deviation;       // osd, deviation(pgs)
@@ -4148,6 +4147,10 @@ int OSDMap::calc_pg_upmaps(
            if (q.second == osd) {
              ldout(cct, 10) << "  dropping pg_upmap_items " << pg
                             << " " << p->second << dendl;
+              for (auto i : p->second) {
+                pgs_by_osd[i.second].erase(pg);
+                pgs_by_osd[i.first].insert(pg);
+              }
              tmp.pg_upmap_items.erase(p);
              pending_inc->old_pg_upmap_items.insert(pg);
              ++num_changed;
@@ -4183,6 +4186,10 @@ int OSDMap::calc_pg_upmaps(
            rmi.push_back(make_pair(orig[i], out[i]));
          }
        }
+        for (auto i : rmi) {
+          pgs_by_osd[i.first].erase(pg);
+          pgs_by_osd[i.second].insert(pg);
+        }
        pending_inc->new_pg_upmap_items[pg] = rmi;
        ldout(cct, 10) << "  " << pg << " pg_upmap_items " << rmi << dendl;
        restart = true;