]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/OSDMap: don't map all pgs each time in calc_pg_upmaps
authorhuangjun <huangjun@xsky.com>
Fri, 24 Aug 2018 14:47:02 +0000 (22:47 +0800)
committerxie xingguo <xie.xingguo@zte.com.cn>
Fri, 25 Jan 2019 03:14:44 +0000 (11:14 +0800)
We have a cluster pool with 32768 pgs and 400 osds; it takes 600 seconds to do upmap
with '--upmap-max 32768 --upmap-deviation 0.01', which is pretty slow.
After adding some debug code, we found that the time is mostly spent in pg_to_up_acting_osds: the average
time for one pg_to_up_acting_osds call is about 12us, so mapping the whole pool's pgs costs about 500ms each
time. We ended up with 1429 pgs that needed an upmap, so it cost about 600 seconds in total.
With this patch, it only takes 5 seconds to get the job done.

Signed-off-by: huangjun <huangjun@xsky.com>
(cherry picked from commit da45e4e352b30cc4f6fd52f2f030bf7569eaee57)

src/osd/OSDMap.cc

index fa9937b3f7562eca2aae1e933e30b3086d5ac16b..37740efc658a8cdf87f62904d5c99a835453758e 100644 (file)
@@ -4158,15 +4158,14 @@ int OSDMap::calc_pg_upmaps(
   float start_deviation = 0;
   float end_deviation = 0;
   int num_changed = 0;
-  while (true) {
-    map<int,set<pg_t>> pgs_by_osd;
-    int total_pgs = 0;
-    float osd_weight_total = 0;
-    map<int,float> osd_weight;
-    for (auto& i : pools) {
-      if (!only_pools.empty() && !only_pools.count(i.first))
+  map<int,set<pg_t>> pgs_by_osd;
+  int total_pgs = 0;
+  float osd_weight_total = 0;
+  map<int,float> osd_weight;
+  for (auto& i : pools) {
+    if (!only_pools.empty() && !only_pools.count(i.first))
        continue;
-      for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
+    for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
        pg_t pg(ps, i.first);
        vector<int> up;
        tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
@@ -4174,43 +4173,43 @@ int OSDMap::calc_pg_upmaps(
          if (osd != CRUSH_ITEM_NONE)
            pgs_by_osd[osd].insert(pg);
        }
-      }
-      total_pgs += i.second.get_size() * i.second.get_pg_num();
+    }
+    total_pgs += i.second.get_size() * i.second.get_pg_num();
 
-      map<int,float> pmap;
-      int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
+    map<int,float> pmap;
+    int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
                                        i.second.get_type(),
                                        i.second.get_size());
-      tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
-      ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
-      for (auto p : pmap) {
+    tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
+    ldout(cct,30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
+    for (auto p : pmap) {
        auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
-        if (adjusted_weight == 0) {
-          continue;
-        }
+      if (adjusted_weight == 0) {
+        continue;
+      }
        osd_weight[p.first] += adjusted_weight;
        osd_weight_total += adjusted_weight;
-      }
     }
-    for (auto& i : osd_weight) {
-      int pgs = 0;
-      auto p = pgs_by_osd.find(i.first);
-      if (p != pgs_by_osd.end())
+  }
+  for (auto& i : osd_weight) {
+    int pgs = 0;
+    auto p = pgs_by_osd.find(i.first);
+    if (p != pgs_by_osd.end())
        pgs = p->second.size();
-      else
+    else
        pgs_by_osd.emplace(i.first, set<pg_t>());
-      ldout(cct, 20) << " osd." << i.first << " weight " << i.second
+    ldout(cct, 20) << " osd." << i.first << " weight " << i.second
                     << " pgs " << pgs << dendl;
-    }
-
-    if (osd_weight_total == 0) {
-      lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
-      break;
-    }
-    float pgs_per_weight = total_pgs / osd_weight_total;
-    ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
-    ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
+  }
+  if (osd_weight_total == 0) {
+    lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
+    return 0;
+  }
+  float pgs_per_weight = total_pgs / osd_weight_total;
+  ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
+  ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
 
+  while (true) {
     // osd deviation
     float total_deviation = 0;
     map<int,float> osd_deviation;       // osd, deviation(pgs)
@@ -4282,6 +4281,10 @@ int OSDMap::calc_pg_upmaps(
            if (q.second == osd) {
              ldout(cct, 10) << "  dropping pg_upmap_items " << pg
                             << " " << p->second << dendl;
+              for (auto i : p->second) {
+                pgs_by_osd[i.second].erase(pg);
+                pgs_by_osd[i.first].insert(pg);
+              }
              tmp.pg_upmap_items.erase(p);
              pending_inc->old_pg_upmap_items.insert(pg);
              ++num_changed;
@@ -4317,6 +4320,10 @@ int OSDMap::calc_pg_upmaps(
            rmi.push_back(make_pair(orig[i], out[i]));
          }
        }
+        for (auto i : rmi) {
+          pgs_by_osd[i.first].erase(pg);
+          pgs_by_osd[i.second].insert(pg);
+        }
        pending_inc->new_pg_upmap_items[pg] = rmi;
        ldout(cct, 10) << "  " << pg << " pg_upmap_items " << rmi << dendl;
        restart = true;