From 116004de58d30a3170807bec4ee276d22e239405 Mon Sep 17 00:00:00 2001
From: Kamoltat
Date: Fri, 1 Jul 2022 14:42:40 +0000
Subject: [PATCH] qa/tasks/ceph_manager.py: improve loggings

1. When `test_pool_min_size` hits the case where
   `not all PGs are active or peered`, dump the state of each PG
   that is neither active nor peered.

2. Improve the log message in `inject_pause()`.

3. Add logging to `test_map_discontinuity()`.

4. In `choose_action()`, add more logging for `chance_down`.

5. Add more logging to `primary_affinity()`, `thrash_pg_upmap_items()`,
   and `thrash_pg_upmap()`.

6. Make `self.is_clean()` dump the PGs that are not active+clean.

Signed-off-by: Kamoltat
(cherry picked from commit 9ed53c82cf5d56bd4555084d3bf1c3c9948c4fe3)
---
 qa/tasks/ceph_manager.py | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 8768a40e4f28d..b837cddeb696a 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -655,6 +655,7 @@ class OSDThrasher(Thrasher):
                                           options['max_change'])
 
     def primary_affinity(self, osd=None):
+        self.log("primary_affinity")
         if osd is None:
             osd = random.choice(self.in_osds)
         if random.random() >= .5:
@@ -681,6 +682,7 @@ class OSDThrasher(Thrasher):
         """
         Install or remove random pg_upmap entries in OSDMap
         """
+        self.log("thrash_pg_upmap")
         from random import shuffle
         out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
         j = json.loads(out)
@@ -689,12 +691,14 @@ class OSDThrasher(Thrasher):
             if random.random() >= .3:
                 pgs = self.ceph_manager.get_pg_stats()
                 if not pgs:
+                    self.log('No pgs; doing nothing')
                     return
                 pg = random.choice(pgs)
                 pgid = str(pg['pgid'])
                 poolid = int(pgid.split('.')[0])
                 sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
                 if len(sizes) == 0:
+                    self.log('No pools; doing nothing')
                     return
                 n = sizes[0]
                 osds = self.in_osds + self.out_osds
@@ -723,6 +727,7 @@ class OSDThrasher(Thrasher):
         """
         Install or remove random pg_upmap_items entries in OSDMap
         """
+        self.log("thrash_pg_upmap_items")
         from random import shuffle
         out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
         j = json.loads(out)
@@ -731,12 +736,14 @@ class OSDThrasher(Thrasher):
             if random.random() >= .3:
                 pgs = self.ceph_manager.get_pg_stats()
                 if not pgs:
+                    self.log('No pgs; doing nothing')
                     return
                 pg = random.choice(pgs)
                 pgid = str(pg['pgid'])
                 poolid = int(pgid.split('.')[0])
                 sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
                 if len(sizes) == 0:
+                    self.log('No pools; doing nothing')
                     return
                 n = sizes[0]
                 osds = self.in_osds + self.out_osds
@@ -1000,7 +1007,7 @@ class OSDThrasher(Thrasher):
         Pause injection testing. Check for osd being down when finished.
""" the_one = random.choice(self.live_osds) - self.log("inject_pause on {osd}".format(osd=the_one)) + self.log("inject_pause on osd.{osd}".format(osd=the_one)) self.log( "Testing {key} pause injection for duration {duration}".format( key=conf_key, @@ -1175,6 +1182,7 @@ class OSDThrasher(Thrasher): This sequence should cause the revived osd to have to handle a map gap since the mons would have trimmed """ + self.log("test_map_discontinuity") while len(self.in_osds) < (self.minin + 1): self.in_osd() self.log("Waiting for recovery") @@ -1216,8 +1224,9 @@ class OSDThrasher(Thrasher): mindead = int(self.config.get("min_dead", 0)) self.log('choose_action: min_in %d min_out ' - '%d min_live %d min_dead %d' % - (minin, minout, minlive, mindead)) + '%d min_live %d min_dead %d ' + 'chance_down %.2f' % + (minin, minout, minlive, mindead, chance_down)) actions = [] if len(self.in_osds) > minin: actions.append((self.out_osd, 1.0,)) @@ -2679,7 +2688,11 @@ class CephManager: True if all pgs are clean """ pgs = self.get_pg_stats() - return self._get_num_active_clean(pgs) == len(pgs) + if self._get_num_active_clean(pgs) == len(pgs): + return True + else: + self.dump_pgs_not_active_clean() + return False def is_recovered(self): """ @@ -2725,6 +2738,12 @@ class CephManager: self.log('PG %s is not active' % pg['pgid']) self.log(pg) + def dump_pgs_not_active_peered(self, pgs): + for pg in pgs: + if (not pg['state'].count('active')) and (not pg['state'].count('peered')): + self.log('PG %s is not active or peered' % pg['pgid']) + self.log(pg) + def wait_for_clean(self, timeout=1200): """ Returns true when all pgs are clean. @@ -2910,7 +2929,11 @@ class CephManager: Wrapper to check if all PGs are active or peered """ pgs = self.get_pg_stats() - return self._get_num_active(pgs) + self._get_num_peered(pgs) == len(pgs) + if self._get_num_active(pgs) + self._get_num_peered(pgs) == len(pgs): + return True + else: + self.dump_pgs_not_active_peered(pgs) + return False def wait_till_active(self, timeout=None): """ -- 2.39.5