git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/OSDMap: implement remap_pgs
author Sage Weil <sage@redhat.com>
Wed, 15 Mar 2017 16:47:21 +0000 (12:47 -0400)
committer Sage Weil <sage@redhat.com>
Tue, 28 Mar 2017 14:12:09 +0000 (10:12 -0400)
Run a specified number of iterations trying to install new
pg_remap_items entries that improve the PG distribution.

Signed-off-by: Sage Weil <sage@redhat.com>
src/osd/OSDMap.cc
src/osd/OSDMap.h

index 3c4e90a712da1335472dea40a2d3b5fbf847c98d..f33ae5c1af35030c9ae8a7b964e3cfa5b138c4b6 100644 (file)
@@ -3314,3 +3314,208 @@ int OSDMap::summarize_mapping_stats(
     *out = ss.str();
   return 0;
 }
+
+bool OSDMap::try_pg_remap(
+  CephContext *cct,
+  pg_t pg,                       ///< pg to potentially remap
+  const set<int>& overfull,      ///< osds we'd want to evacuate
+  const vector<int>& underfull,  ///< osds to move to, in order of preference
+  vector<int> *orig,
+  vector<int> *out)              ///< resulting alternative mapping
+{
+  const pg_pool_t *pool = get_pg_pool(pg.pool());
+  if (!pool)
+    return false;
+  int rule = crush->find_rule(pool->get_crush_ruleset(), pool->get_type(),
+                             pool->get_size());
+  if (rule < 0)
+    return false;
+
+  // get original mapping
+  _pg_to_raw_osds(*pool, pg, orig, NULL);
+
+  // make sure there is something there to remap
+  bool any = false;
+  for (auto osd : *orig) {
+    if (overfull.count(osd)) {
+      any = true;
+      break;
+    }
+  }
+  if (!any) {
+    return false;
+  }
+
+  int r = crush->try_remap_rule(
+    cct,
+    rule,
+    pool->get_size(),
+    overfull, underfull,
+    *orig,
+    out);
+  if (r < 0)
+    return false;
+  if (*out == *orig)
+    return false;
+  return true;
+}
+
+int OSDMap::remap_pgs(
+  CephContext *cct,
+  float max_deviation,
+  int max,
+  const set<int64_t>& only_pools,
+  OSDMap::Incremental *pending_inc)
+{
+  OSDMap tmp;
+  tmp.deepish_copy_from(*this);
+  int num_changed = 0;
+  while (true) {
+    map<int,set<pg_t>> pgs_by_osd;
+    int total_pgs = 0;
+    for (auto& i : pools) {
+      if (!only_pools.empty() && !only_pools.count(i.first))
+       continue;
+      for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
+       pg_t pg(ps, i.first);
+       vector<int> up;
+       tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
+       for (auto osd : up) {
+         if (osd != CRUSH_ITEM_NONE)
+           pgs_by_osd[osd].insert(pg);
+       }
+      }
+      total_pgs += i.second.get_size() * i.second.get_pg_num();
+    }
+    float osd_weight_total = 0;
+    map<int,float> osd_weight;
+    for (auto& i : pgs_by_osd) {
+      float w = crush->get_item_weightf(i.first);
+      osd_weight[i.first] = w;
+      osd_weight_total += w;
+      ldout(cct, 20) << " osd." << i.first << " weight " << w
+                    << " pgs " << i.second.size() << dendl;
+    }
+
+    // NOTE: we assume we touch all osds with CRUSH!
+    float pgs_per_weight = total_pgs / osd_weight_total;
+    ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
+    ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
+
+    // osd deviation
+    map<int,float> osd_deviation;       // osd, deviation(pgs)
+    multimap<float,int> deviation_osd;  // deviation(pgs), osd
+    set<int> overfull;
+    for (auto& i : pgs_by_osd) {
+      float target = osd_weight[i.first] * pgs_per_weight;
+      float deviation = (float)i.second.size() - target;
+      ldout(cct, 20) << " osd." << i.first
+                    << "\tpgs " << i.second.size()
+                    << "\ttarget " << target
+                    << "\tdeviation " << deviation
+                    << dendl;
+      osd_deviation[i.first] = deviation;
+      deviation_osd.insert(make_pair(deviation, i.first));
+      if (deviation > 0)
+       overfull.insert(i.first);
+    }
+
+    // build underfull, sorted from least-full to most-average
+    vector<int> underfull;
+    for (auto i = deviation_osd.begin();
+        i != deviation_osd.end();
+        ++i) {
+      if (i->first >= -.999)
+       break;
+      underfull.push_back(i->second);
+    }
+    ldout(cct, 10) << " overfull " << overfull
+                  << " underfull " << underfull << dendl;
+    if (overfull.empty() || underfull.empty())
+      break;
+
+    // pick fullest
+    bool restart = false;
+    for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
+      int osd = p->second;
+      float target = osd_weight[osd] * pgs_per_weight;
+      float deviation = deviation_osd.rbegin()->first;
+      if (deviation/target < max_deviation) {
+       ldout(cct, 10) << " osd." << osd
+                      << " target " << target
+                      << " deviation " << deviation
+                      << " -> " << deviation/target
+                      << " < max " << max_deviation << dendl;
+       break;
+      }
+      int num_to_move = deviation;
+      ldout(cct, 10) << " osd." << osd << " move " << num_to_move << dendl;
+      if (num_to_move < 1)
+       break;
+
+      set<pg_t>& pgs = pgs_by_osd[osd];
+
+      // look for remaps we can un-remap
+      for (auto pg : pgs) {
+       auto p = tmp.pg_remap_items.find(pg);
+       if (p != tmp.pg_remap_items.end()) {
+         for (auto q : p->second) {
+           if (q.second == osd) {
+             ldout(cct, 10) << "  dropping pg_remap_items " << pg
+                            << " " << p->second << dendl;
+             tmp.pg_remap_items.erase(p);
+             pending_inc->old_pg_remap_items.insert(pg);
+             ++num_changed;
+             restart = true;
+           }
+         }
+       }
+       if (restart)
+         break;
+      } // pg loop
+      if (restart)
+       break;
+
+      for (auto pg : pgs) {
+       if (tmp.pg_remap.count(pg) ||
+           tmp.pg_remap_items.count(pg)) {
+         ldout(cct, 20) << "  already remapped " << pg << dendl;
+         continue;
+       }
+       ldout(cct, 10) << "  trying " << pg << dendl;
+       vector<int> orig, out;
+       if (!try_pg_remap(cct, pg, overfull, underfull, &orig, &out)) {
+         continue;
+       }
+       ldout(cct, 10) << "  " << pg << " " << orig << " -> " << out << dendl;
+       if (orig.size() != out.size()) {
+         continue;
+       }
+       assert(orig != out);
+       vector<pair<int,int>>& rmi = tmp.pg_remap_items[pg];
+       for (unsigned i = 0; i < out.size(); ++i) {
+         if (orig[i] != out[i]) {
+           rmi.push_back(make_pair(orig[i], out[i]));
+         }
+       }
+       pending_inc->new_pg_remap_items[pg] = rmi;
+       ldout(cct, 10) << "  " << pg << " pg_remap_items " << rmi << dendl;
+       restart = true;
+       ++num_changed;
+       break;
+      } // pg loop
+      if (restart)
+       break;
+    } // osd loop
+
+    if (!restart) {
+      ldout(cct, 10) << " failed to find any changes to make" << dendl;
+      break;
+    }
+    if (--max == 0) {
+      ldout(cct, 10) << " hit max iterations, stopping" << dendl;
+      break;
+    }
+  }
+  return num_changed;
+}
index 7c17ecad2e441556060d1afb4cb2b184590653bb..4221b2e0fb34b1b3e2b7fd62ea7ab24a406b58e3 100644 (file)
@@ -862,6 +862,22 @@ public:
   }
 
 
+  bool try_pg_remap(               // true only if an alternative mapping differing from *orig was produced in *out
+    CephContext *cct,              ///< context handed through to crush->try_remap_rule()
+    pg_t pg,                       ///< pg to potentially remap
+    const set<int>& overfull,      ///< osds we'd want to evacuate
+    const vector<int>& underfull,  ///< osds to move to, in order of preference
+    vector<int> *orig,             ///< filled with the pg's original mapping (_pg_to_raw_osds)
+    vector<int> *out);             ///< resulting alternative mapping
+
+  int remap_pgs(                     // returns the number of pg_remap_items changes made
+    CephContext *cct,                ///< context used for debug logging and try_pg_remap()
+    float max_deviation, ///< max deviation/target ratio tolerated per osd (value < 1.0)
+    int max_iterations,  ///< max iterations to run; each iteration makes at most one change
+    const set<int64_t>& pools,        ///< [optional] restrict to these pools; empty means all pools
+    OSDMap::Incremental *pending_inc  ///< receives new_pg_remap_items / old_pg_remap_items
+    );
+
   /*
    * handy helpers to build simple maps...
    */