From afcce93591c7e5c53596cec03165033f049a0c63 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Sep 2017 18:00:08 -0400 Subject: [PATCH] mgr/balancer: make crush-compat mode work! - it does multiple iterations, like the upmap optimizer. - it decreases the step size if it isn't improving, in the hope that it is overshooting the minimum - debug output is cleaned up a bit (the info level should be genuinely useful) Signed-off-by: Sage Weil (cherry picked from commit d9a31595ba15de5fda104a0154778e3200fc46a0) --- src/pybind/mgr/balancer/module.py | 86 ++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index d1133e6cbafc5..8d8d6a74810be 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -3,6 +3,7 @@ Balance PG distribution across OSDs. """ +import copy import errno import json import math @@ -160,7 +161,8 @@ class Eval: r[t] = { 'avg': avg, 'stddev': stddev, - 'score': sum_weight, + 'sum_weight': sum_weight, + 'score': score, } return r @@ -577,11 +579,19 @@ class Module(MgrModule): def do_crush_compat(self, plan): self.log.info('do_crush_compat') + max_iterations = self.get_config('crush_compat_max_iterations', 25) + if max_iterations < 1: + return False + step = self.get_config('crush_compat_step', .2) + if step <= 0 or step >= 1.0: + return False + osdmap = self.get_osdmap() crush = osdmap.get_crush() # get current compat weight-set weights - old_ws = self.get_compat_weight_set_weights() + orig_ws = self.get_compat_weight_set_weights() + orig_ws = { a: b for a, b in orig_ws.iteritems() if a >= 0 } ms = plan.initial pe = self.calc_eval(ms) @@ -606,28 +616,56 @@ class Module(MgrModule): key = 'pgs' # pgs objects or bytes # go - random.shuffle(roots) - for root in roots: - pools = pe.root_pools[root] - self.log.info('Balancing root %s (pools %s) by %s' % - (root, pools, key)) - target = pe.target_by_root[root] - actual = pe.actual_by_root[root][key] - queue = sorted(actual.keys(), - key=lambda osd: -abs(target[osd] - actual[osd])) - self.log.debug('queue %s' % queue) - for osd in queue: - deviation = target[osd] - actual[osd] - if deviation == 0: - break - self.log.debug('osd.%d deviation %f', osd, deviation) - weight = old_ws[osd] - calc_weight = target[osd] / actual[osd] * weight - new_weight = weight * .7 + calc_weight * .3 - self.log.debug('Reweight osd.%d %f -> %f', osd, weight, - new_weight) - plan.compat_ws[osd] = new_weight - return True + best_ws = copy.deepcopy(orig_ws) + cur_pe = pe + left = max_iterations + while left > 0: + # adjust + self.log.debug('best_ws %s' % best_ws) + next_ws = copy.deepcopy(best_ws) + random.shuffle(roots) + for root in roots: + pools = cur_pe.root_pools[root] + self.log.info('Balancing root %s (pools %s) by %s' % + (root, pools, key)) + target = cur_pe.target_by_root[root] + actual = cur_pe.actual_by_root[root][key] + queue = sorted(actual.keys(), + key=lambda osd: -abs(target[osd] - actual[osd])) + for osd in queue: + deviation = target[osd] - actual[osd] + if deviation == 0: + break + self.log.debug('osd.%d deviation %f', osd, deviation) + weight = best_ws[osd] + calc_weight = target[osd] / actual[osd] * weight + new_weight = weight * (1.0 - step) + calc_weight * step + self.log.debug('Reweight osd.%d %f -> %f', osd, weight, + new_weight) + next_ws[osd] = new_weight + + # recalc + plan.compat_ws = copy.deepcopy(next_ws) + next_ms = plan.final_state() + next_pe = self.calc_eval(next_ms) + self.log.debug('Step result score %f -> %f', cur_pe.score, + next_pe.score) + if next_pe.score > cur_pe.score * 1.01: + step /= 2.0 + self.log.debug('Score got worse, trying smaller step %f' % step) + else: + cur_pe = next_pe + best_ws = next_ws + left -= 1 + + if cur_pe.score < pe.score: + self.log.info('Success, score %f -> %f', pe.score, cur_pe.score) + plan.compat_ws = best_ws + return True + else: + self.log.info('Failed to find further optimization, score %f', + pe.score) + return False def compat_weight_set_reweight(self, osd, new_weight): self.log.debug('ceph osd crush weight-set reweight-compat') -- 2.39.5