From: Kamoltat
Date: Fri, 1 Jul 2022 14:42:40 +0000 (+0000)
Subject: qa/tasks/ceph_manager.py: improve logging
X-Git-Tag: v16.2.11~368^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=71e2061c02c4102b8c58fef5251aa28443cea609;p=ceph.git

qa/tasks/ceph_manager.py: improve logging

1. When `test_pool_min_size` hits the case where not all PGs are active
   or peered, dump the state of each PG that is neither active nor peered.

2. Improve the log message in `inject_pause()`.

3. Add logging to `test_map_discontinuity()`.

4. In `choose_action()`, also log the value of `chance_down`.

5. Add more logging to `primary_affinity()`, `thrash_pg_upmap_items()`
   and `thrash_pg_upmap()`.

6. Make `self.is_clean()` dump the PGs that are not active+clean.

Signed-off-by: Kamoltat
(cherry picked from commit 9ed53c82cf5d56bd4555084d3bf1c3c9948c4fe3)
---

diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index a274b8f29254..849b401529f4 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -645,6 +645,7 @@ class OSDThrasher(Thrasher):
                                              options['max_change'])
 
     def primary_affinity(self, osd=None):
+        self.log("primary_affinity")
         if osd is None:
             osd = random.choice(self.in_osds)
         if random.random() >= .5:
@@ -671,6 +672,7 @@ class OSDThrasher(Thrasher):
         """
         Install or remove random pg_upmap entries in OSDMap
         """
+        self.log("thrash_pg_upmap")
         from random import shuffle
         out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
         j = json.loads(out)
@@ -679,12 +681,14 @@ class OSDThrasher(Thrasher):
             if random.random() >= .3:
                 pgs = self.ceph_manager.get_pg_stats()
                 if not pgs:
+                    self.log('No pgs; doing nothing')
                     return
                 pg = random.choice(pgs)
                 pgid = str(pg['pgid'])
                 poolid = int(pgid.split('.')[0])
                 sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
                 if len(sizes) == 0:
+                    self.log('No pools; doing nothing')
                     return
                 n = sizes[0]
                 osds = self.in_osds + self.out_osds
@@ -713,6 +717,7 @@ class OSDThrasher(Thrasher):
         """
         Install or remove random pg_upmap_items entries in OSDMap
         """
+        self.log("thrash_pg_upmap_items")
         from random import shuffle
         out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
         j = json.loads(out)
@@ -721,12 +726,14 @@ class OSDThrasher(Thrasher):
             if random.random() >= .3:
                 pgs = self.ceph_manager.get_pg_stats()
                 if not pgs:
+                    self.log('No pgs; doing nothing')
                     return
                 pg = random.choice(pgs)
                 pgid = str(pg['pgid'])
                 poolid = int(pgid.split('.')[0])
                 sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
                 if len(sizes) == 0:
+                    self.log('No pools; doing nothing')
                     return
                 n = sizes[0]
                 osds = self.in_osds + self.out_osds
@@ -990,7 +997,7 @@ class OSDThrasher(Thrasher):
         Pause injection testing. Check for osd being down when finished.
""" the_one = random.choice(self.live_osds) - self.log("inject_pause on {osd}".format(osd=the_one)) + self.log("inject_pause on osd.{osd}".format(osd=the_one)) self.log( "Testing {key} pause injection for duration {duration}".format( key=conf_key, @@ -1165,6 +1172,7 @@ class OSDThrasher(Thrasher): This sequence should cause the revived osd to have to handle a map gap since the mons would have trimmed """ + self.log("test_map_discontinuity") while len(self.in_osds) < (self.minin + 1): self.in_osd() self.log("Waiting for recovery") @@ -1206,8 +1214,9 @@ class OSDThrasher(Thrasher): mindead = int(self.config.get("min_dead", 0)) self.log('choose_action: min_in %d min_out ' - '%d min_live %d min_dead %d' % - (minin, minout, minlive, mindead)) + '%d min_live %d min_dead %d ' + 'chance_down %.2f' % + (minin, minout, minlive, mindead, chance_down)) actions = [] if len(self.in_osds) > minin: actions.append((self.out_osd, 1.0,)) @@ -2657,7 +2666,11 @@ class CephManager: True if all pgs are clean """ pgs = self.get_pg_stats() - return self._get_num_active_clean(pgs) == len(pgs) + if self._get_num_active_clean(pgs) == len(pgs): + return True + else: + self.dump_pgs_not_active_clean() + return False def is_recovered(self): """ @@ -2703,6 +2716,12 @@ class CephManager: self.log('PG %s is not active' % pg['pgid']) self.log(pg) + def dump_pgs_not_active_peered(self, pgs): + for pg in pgs: + if (not pg['state'].count('active')) and (not pg['state'].count('peered')): + self.log('PG %s is not active or peered' % pg['pgid']) + self.log(pg) + def wait_for_clean(self, timeout=1200): """ Returns true when all pgs are clean. @@ -2888,7 +2907,11 @@ class CephManager: Wrapper to check if all PGs are active or peered """ pgs = self.get_pg_stats() - return self._get_num_active(pgs) + self._get_num_peered(pgs) == len(pgs) + if self._get_num_active(pgs) + self._get_num_peered(pgs) == len(pgs): + return True + else: + self.dump_pgs_not_active_peered(pgs) + return False def wait_till_active(self, timeout=None): """