From: Kamoltat
Date: Tue, 13 Jul 2021 19:14:43 +0000 (+0000)
Subject: mgr/test_progress.py: Delay recover in test_progress
X-Git-Tag: v16.2.7~84^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a8003f3449a9ae282e1d5b3ef9b8dceb27c38a65;p=ceph.git

mgr/test_progress.py: Delay recover in test_progress

Changes some of the tests in teuthology to make them more deterministic,
using `ceph osd set norecover` and `ceph osd set nobackfill` when marking
OSDs in or out. This delays recovery and makes sure the test cases get the
chance to check that events actually pop up in the progress module.

Took out test_osd_cannot_recover from tasks/mgr/test_progress.py since it
is no longer a relevant test case: recovery will get triggered regardless
of whether the PG is unmoved.

Ignoring `OSDMAP_FLAGS` in teuthology because we are using norecover and
nobackfill to delay the recovery process; those flags raise a health
warning that would otherwise fail the teuthology test.

Signed-off-by: Kamoltat

(cherry picked from commit 5f33f2f6e0609b452db47b341aaf6d5889917563)
---

diff --git a/qa/suites/rados/mgr/tasks/progress.yaml b/qa/suites/rados/mgr/tasks/progress.yaml
index 4a0e802b2896..73bbe3c99564 100644
--- a/qa/suites/rados/mgr/tasks/progress.yaml
+++ b/qa/suites/rados/mgr/tasks/progress.yaml
@@ -17,6 +17,7 @@ tasks:
         - \(FS_WITH_FAILED_MDS\)
         - \(FS_DEGRADED\)
         - \(PG_
+        - \(OSDMAP_FLAGS\)
         - replacing it with standby
         - No standby daemons available
   - cephfs_test_runner:
diff --git a/qa/tasks/mgr/test_progress.py b/qa/tasks/mgr/test_progress.py
index 5ec611cb57e9..082653f62529 100644
--- a/qa/tasks/mgr/test_progress.py
+++ b/qa/tasks/mgr/test_progress.py
@@ -2,9 +2,8 @@
 import json
 import logging
 import time
-
 from .mgr_test_case import MgrTestCase
-
+from contextlib import contextmanager
 
 log = logging.getLogger(__name__)
 
@@ -14,7 +13,7 @@ class TestProgress(MgrTestCase):
 
     # How long we expect to wait at most between taking an OSD out
     # and seeing the progress event pop up.
-    EVENT_CREATION_PERIOD = 15
+    EVENT_CREATION_PERIOD = 60
 
     WRITE_PERIOD = 30
 
@@ -149,6 +148,18 @@ class TestProgress(MgrTestCase):
         osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
         return len(osd_map['osds'])
 
+    @contextmanager
+    def recovery_backfill_disabled(self):
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'set', 'nobackfill')
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'set', 'norecover')
+        yield
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'unset', 'nobackfill')
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'unset', 'norecover')
+
     def setUp(self):
         super(TestProgress, self).setUp()
         # Ensure we have at least four OSDs
@@ -181,15 +192,16 @@ class TestProgress(MgrTestCase):
 
         self._setup_pool()
         self._write_some_data(self.WRITE_PERIOD)
+        with self.recovery_backfill_disabled():
+            for osd_id in osd_ids:
+                self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'out', str(osd_id))
 
-        for osd_id in osd_ids:
-            self.mgr_cluster.mon_manager.raw_cluster_cmd(
-                'osd', 'out', str(osd_id))
+        # Wait for a progress event to pop up
+        self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
+                              timeout=self.EVENT_CREATION_PERIOD,
+                              period=1)
 
-        # Wait for a progress event to pop up
-        self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
-                              timeout=self.EVENT_CREATION_PERIOD*2,
-                              period=1)
         ev = self._get_osd_in_out_events('out')[0]
         log.info(json.dumps(ev, indent=1))
         self.assertIn("Rebalancing after osd.0 marked out", ev['message'])
@@ -202,20 +214,23 @@ class TestProgress(MgrTestCase):
         # First Event should complete promptly
         self.wait_until_true(lambda: self._is_complete(initial_event['id']),
-                             timeout=self.EVENT_CREATION_PERIOD)
-        try:
-            # Wait for progress event marked in to pop up
-            self.wait_until_equal(lambda: self._osd_in_out_events_count('in'), 1,
-                                  timeout=self.EVENT_CREATION_PERIOD*2,
-                                  period=1)
-        except RuntimeError as ex:
-            if not "Timed out after" in str(ex):
-                raise ex
+                             timeout=self.RECOVERY_PERIOD)
+
+        with self.recovery_backfill_disabled():
+
+            try:
+                # Wait for progress event marked in to pop up
+                self.wait_until_equal(lambda: self._osd_in_out_events_count('in'), 1,
+                                      timeout=self.EVENT_CREATION_PERIOD,
+                                      period=1)
+            except RuntimeError as ex:
+                if not "Timed out after" in str(ex):
+                    raise ex
 
-            log.info("There was no PGs affected by osd being marked in")
-            return None
+                log.info("There was no PGs affected by osd being marked in")
+                return None
 
-        new_event = self._get_osd_in_out_events('in')[0]
+            new_event = self._get_osd_in_out_events('in')[0]
 
         return new_event
 
     def _no_events_anywhere(self):
@@ -254,6 +269,11 @@ class TestProgress(MgrTestCase):
         if self.POOL in self.mgr_cluster.mon_manager.pools:
             self.mgr_cluster.mon_manager.remove_pool(self.POOL)
 
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'unset', 'nobackfill')
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'osd', 'unset', 'norecover')
+
         osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
         for osd in osd_map['osds']:
             if osd['weight'] == 0.0:
@@ -287,7 +307,7 @@ class TestProgress(MgrTestCase):
         # Event should complete promptly
         self.wait_until_true(lambda: self._is_complete(ev['id']),
-                             timeout=self.EVENT_CREATION_PERIOD)
+                             timeout=self.RECOVERY_PERIOD)
 
         self.assertEqual(self._osd_in_out_events_count(), 0)
 
     def test_osd_came_back(self):
@@ -308,57 +328,6 @@
 
         self.assertEqual(self._osd_in_out_events_count(), 0)
 
-    def test_osd_cannot_recover(self):
-        """
-        When the cluster cannot recover from a lost OSD, e.g.
-        because there is no suitable new placement for it.
-        (a size=3 pool when there are only 2 OSDs left)
-        (a size=3 pool when the remaining osds are only on 2 hosts)
-
-        Progress event should not be created.
-        """
-
-        pool_size = 3
-
-        self._setup_pool(size=pool_size)
-        self._write_some_data(self.WRITE_PERIOD)
-
-        # Fail enough OSDs so there are less than N_replicas OSDs
-        # available.
-        osd_count = self._osd_count()
-
-        # First do some failures that will result in a normal rebalance
-        # (Assumption: we're in a test environment that is configured
-        # not to require replicas be on different hosts, like teuthology)
-        for osd_id in range(0, osd_count - pool_size):
-            self.mgr_cluster.mon_manager.raw_cluster_cmd(
-                'osd', 'out', str(osd_id))
-
-        # We should see an event for each of the OSDs we took out
-        self.wait_until_equal(
-            lambda: self._osd_in_out_events_count('out'),
-            osd_count - pool_size,
-            timeout=self.EVENT_CREATION_PERIOD*(osd_count - pool_size))
-
-        # Those should complete cleanly
-        self.wait_until_equal(
-            lambda: self._osd_in_out_completed_events_count('out'),
-            osd_count - pool_size,
-            timeout=self.RECOVERY_PERIOD*(osd_count - pool_size)
-        )
-
-        # Fail one last OSD, at the point the PGs have nowhere to go
-        victim_osd = osd_count - pool_size
-        self.mgr_cluster.mon_manager.raw_cluster_cmd(
-            'osd', 'out', str(victim_osd))
-
-        # Check that no event is created
-        time.sleep(self.EVENT_CREATION_PERIOD)
-
-        self.assertEqual(
-            self._osd_in_out_completed_events_count('out'),
-            osd_count - pool_size)
-
     def test_turn_off_module(self):
         """
         When the the module is turned off, there should not
@@ -371,18 +340,19 @@
 
         pool_size = 3
         self._setup_pool(size=pool_size)
         self._write_some_data(self.WRITE_PERIOD)
-
         self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", "off")
 
-        self.mgr_cluster.mon_manager.raw_cluster_cmd(
-            'osd', 'out', '0')
+        with self.recovery_backfill_disabled():
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                'osd', 'out', '0')
 
-        time.sleep(self.EVENT_CREATION_PERIOD)
+        time.sleep(self.EVENT_CREATION_PERIOD/2)
 
-        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+        with self.recovery_backfill_disabled():
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
             'osd', 'in', '0')
 
-        time.sleep(self.EVENT_CREATION_PERIOD)
+        time.sleep(self.EVENT_CREATION_PERIOD/2)
 
         self.assertTrue(self._no_events_anywhere())
@@ -390,13 +360,15 @@
 
         self._write_some_data(self.WRITE_PERIOD)
 
-        self.mgr_cluster.mon_manager.raw_cluster_cmd(
-            'osd', 'out', '0')
+        with self.recovery_backfill_disabled():
 
-        # Wait for a progress event to pop up
-        self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
-                              timeout=self.EVENT_CREATION_PERIOD*2,
-                              period=1)
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                'osd', 'out', '0')
+
+            # Wait for a progress event to pop up
+            self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
+                                  timeout=self.EVENT_CREATION_PERIOD,
+                                  period=1)
 
         ev1 = self._get_osd_in_out_events('out')[0]
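
For reference, below is a minimal standalone sketch of the flag-toggling
context manager this patch introduces, assuming only that a `ceph` CLI is
reachable from the shell; the `ceph()` helper is hypothetical and stands in
for the test suite's `mon_manager.raw_cluster_cmd()`. It is an illustration
of the pattern, not part of the patch itself.

    from contextlib import contextmanager
    import subprocess


    def ceph(*args):
        # Hypothetical helper: shell out to the `ceph` CLI. The actual test
        # uses self.mgr_cluster.mon_manager.raw_cluster_cmd() instead.
        subprocess.check_call(('ceph',) + args)


    @contextmanager
    def recovery_backfill_disabled():
        # Pause backfill and recovery so progress events stay observable
        # long enough for the caller to inspect them.
        ceph('osd', 'set', 'nobackfill')
        ceph('osd', 'set', 'norecover')
        yield
        # Mirrors the patch: no try/finally here; in the test suite,
        # tearDown() also unsets both flags, so an exception in the body
        # still gets cleaned up.
        ceph('osd', 'unset', 'nobackfill')
        ceph('osd', 'unset', 'norecover')


    if __name__ == '__main__':
        with recovery_backfill_disabled():
            # For example, mark an OSD out while recovery is paused.
            ceph('osd', 'out', '0')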