From 116004de58d30a3170807bec4ee276d22e239405 Mon Sep 17 00:00:00 2001
From: Kamoltat
Date: Fri, 1 Jul 2022 14:42:40 +0000
Subject: [PATCH] qa/tasks/ceph_manager.py: improve loggings

1. When `test_pool_min_size` hits the case where
   `not all PGs are active or peered`, dump the state of each PG
   that is neither active nor peered.

2. Improve the log message in `inject_pause()`.

3. Add logging to `test_map_discontinuity()`.

4. In `choose_action()`, add more logging for `chance_down`.

5. Add more logging to `primary_affinity()`, `thrash_pg_upmap_items()`,
   and `thrash_pg_upmap()`.

6. Make `self.is_clean()` dump the PGs that are not active+clean.

Signed-off-by: Kamoltat
(cherry picked from commit 9ed53c82cf5d56bd4555084d3bf1c3c9948c4fe3)
---
 qa/tasks/ceph_manager.py | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 8768a40e4f28d..b837cddeb696a 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -655,6 +655,7 @@ class OSDThrasher(Thrasher):
                                           options['max_change'])
 
     def primary_affinity(self, osd=None):
+        self.log("primary_affinity")
         if osd is None:
             osd = random.choice(self.in_osds)
         if random.random() >= .5:
@@ -681,6 +682,7 @@ class OSDThrasher(Thrasher):
         """
         Install or remove random pg_upmap entries in OSDMap
         """
+        self.log("thrash_pg_upmap")
         from random import shuffle
         out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
         j = json.loads(out)
@@ -689,12 +691,14 @@ class OSDThrasher(Thrasher):
             if random.random() >= .3:
                 pgs = self.ceph_manager.get_pg_stats()
                 if not pgs:
+                    self.log('No pgs; doing nothing')
                     return
                 pg = random.choice(pgs)
                 pgid = str(pg['pgid'])
                 poolid = int(pgid.split('.')[0])
                 sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
                 if len(sizes) == 0:
+                    self.log('No pools; doing nothing')
                     return
                 n = sizes[0]
                 osds = self.in_osds + self.out_osds
@@ -723,6 +727,7 @@ class OSDThrasher(Thrasher):
         """
         Install or remove random pg_upmap_items entries in OSDMap
         """
+        self.log("thrash_pg_upmap_items")
         from random import shuffle
         out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
         j = json.loads(out)
@@ -731,12 +736,14 @@ class OSDThrasher(Thrasher):
             if random.random() >= .3:
                 pgs = self.ceph_manager.get_pg_stats()
                 if not pgs:
+                    self.log('No pgs; doing nothing')
                     return
                 pg = random.choice(pgs)
                 pgid = str(pg['pgid'])
                 poolid = int(pgid.split('.')[0])
                 sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
                 if len(sizes) == 0:
+                    self.log('No pools; doing nothing')
                     return
                 n = sizes[0]
                 osds = self.in_osds + self.out_osds
@@ -1000,7 +1007,7 @@ class OSDThrasher(Thrasher):
         Pause injection testing. Check for osd being down when finished.
""" the_one = random.choice(self.live_osds) - self.log("inject_pause on {osd}".format(osd=the_one)) + self.log("inject_pause on osd.{osd}".format(osd=the_one)) self.log( "Testing {key} pause injection for duration {duration}".format( key=conf_key, @@ -1175,6 +1182,7 @@ class OSDThrasher(Thrasher): This sequence should cause the revived osd to have to handle a map gap since the mons would have trimmed """ + self.log("test_map_discontinuity") while len(self.in_osds) < (self.minin + 1): self.in_osd() self.log("Waiting for recovery") @@ -1216,8 +1224,9 @@ class OSDThrasher(Thrasher): mindead = int(self.config.get("min_dead", 0)) self.log('choose_action: min_in %d min_out ' - '%d min_live %d min_dead %d' % - (minin, minout, minlive, mindead)) + '%d min_live %d min_dead %d ' + 'chance_down %.2f' % + (minin, minout, minlive, mindead, chance_down)) actions = [] if len(self.in_osds) > minin: actions.append((self.out_osd, 1.0,)) @@ -2679,7 +2688,11 @@ class CephManager: True if all pgs are clean """ pgs = self.get_pg_stats() - return self._get_num_active_clean(pgs) == len(pgs) + if self._get_num_active_clean(pgs) == len(pgs): + return True + else: + self.dump_pgs_not_active_clean() + return False def is_recovered(self): """ @@ -2725,6 +2738,12 @@ class CephManager: self.log('PG %s is not active' % pg['pgid']) self.log(pg) + def dump_pgs_not_active_peered(self, pgs): + for pg in pgs: + if (not pg['state'].count('active')) and (not pg['state'].count('peered')): + self.log('PG %s is not active or peered' % pg['pgid']) + self.log(pg) + def wait_for_clean(self, timeout=1200): """ Returns true when all pgs are clean. @@ -2910,7 +2929,11 @@ class CephManager: Wrapper to check if all PGs are active or peered """ pgs = self.get_pg_stats() - return self._get_num_active(pgs) + self._get_num_peered(pgs) == len(pgs) + if self._get_num_active(pgs) + self._get_num_peered(pgs) == len(pgs): + return True + else: + self.dump_pgs_not_active_peered(pgs) + return False def wait_till_active(self, timeout=None): """ -- 2.39.5