From a57b803e7027a6bfb9d64e3e9810dbf0ef3217ac Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Mon, 5 Mar 2018 10:09:52 +0800 Subject: [PATCH] pybind/mgr/balancer: fix pool-deletion vs auto-optimization race This patch fixes the error below: ``` File "/usr/lib/ceph/mgr/balancer/module.py", line 722, in optimize return self.do_crush_compat(plan) File "/usr/lib/ceph/mgr/balancer/module.py", line 781, in do_crush_compat pe = self.calc_eval(ms, plan.pools) File "/usr/lib/ceph/mgr/balancer/module.py", line 570, in calc_eval objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects'] KeyError: ('5.1b',) ``` The root cause is that the balancer collects cluster information from two separate maps (OSDMap and PGMap), and hence there is a small window in which the pool statistics of the two maps might diverge. E.g.: 1) auto-optimization begins 2) get osdmap 3) a pool is gone (deleted by admin); pg_dump refreshed 4) get pg_dump (the balancer now holds both the newest pg_dump and an obsolete osdmap) 5) execute optimization; the balancer complains that some PGs are missing from the pg_dump map. Fix the above problem by tracking only the pools that exist in both maps. 
Signed-off-by: xie xingguo --- src/pybind/mgr/balancer/module.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index faf5cfd1c509..cd2e4a8af28b 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -31,7 +31,9 @@ class MappingState: self.pg_stat = { i['pgid']: i['stat_sum'] for i in pg_dump.get('pg_stats', []) } - self.poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])] + osd_poolids = [p['pool'] for p in self.osdmap_dump.get('pools', [])] + pg_poolids = [p['poolid'] for p in pg_dump.get('pool_stats', [])] + self.poolids = set(osd_poolids) & set(pg_poolids) self.pg_up = {} self.pg_up_by_poolid = {} for poolid in self.poolids: @@ -408,6 +410,9 @@ class Module(MgrModule): for p in ms.osdmap_dump.get('pools',[]): if len(pools) and p['pool_name'] not in pools: continue + # skip dead or not-yet-ready pools too + if p['pool'] not in ms.poolids: + continue pe.pool_name[p['pool']] = p['pool_name'] pe.pool_id[p['pool_name']] = p['pool'] pool_rule[p['pool_name']] = p['crush_rule'] -- 2.47.3