]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr: force purge normal ceph entities from service map
authorVenky Shankar <vshankar@redhat.com>
Fri, 27 Mar 2020 04:00:08 +0000 (00:00 -0400)
committerVenky Shankar <vshankar@redhat.com>
Tue, 31 Mar 2020 06:41:00 +0000 (02:41 -0400)
Normal ceph services can send task status updates to manager.
Task status is tracked in service map implying that normal
ceph services have entries in service map and daemon tracking
index (daemon state). But the manager prunes entries from daemon
state when it receives an updated map (fs, mon, etc...). This
causes periodic pruning of service map entries to fail for normal
ceph services (those which send task status updates) since it
expects a corresponding entry in daemon state.

Signed-off-by: Venky Shankar <vshankar@redhat.com>
qa/tasks/cephfs/test_scrub_checks.py
src/mgr/DaemonServer.cc

index e3f5609afe50c6997d1e748990dfb690a422464e..012b6c009fda2884a278074bdac9dc8e3432359f 100644 (file)
@@ -16,7 +16,7 @@ class TestScrubControls(CephFSTestCase):
     Test basic scrub control operations such as abort, pause and resume.
     """
 
-    MDSS_REQUIRED = 1
+    MDSS_REQUIRED = 2
     CLIENTS_REQUIRED = 1
 
     def _abort_scrub(self, expected):
@@ -129,6 +129,34 @@ class TestScrubControls(CephFSTestCase):
         time.sleep(10)
         self._check_task_status("idle")
 
+    def test_scrub_task_status_on_mds_failover(self):
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+
+        (original_active, ) = self.fs.get_active_names()
+        original_standbys = self.mds_cluster.get_standby_daemons()
+        self._check_task_status("idle")
+
+        # Kill the rank 0
+        self.fs.mds_stop(original_active)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        def promoted():
+            active = self.fs.get_active_names()
+            return active and active[0] in original_standbys
+
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
+        self.wait_until_true(promoted, timeout=grace*2)
+
+        mgr_beacon_grace = float(self.fs.get_config("mgr_service_beacon_grace", service_type="mon"))
+
+        def status_check():
+            task_status = self.fs.get_task_status("scrub status")
+            return original_active not in task_status
+        self.wait_until_true(status_check, timeout=mgr_beacon_grace*2)
+
 class TestScrubChecks(CephFSTestCase):
     """
     Run flush and scrub commands on the specified files in the filesystem. This
index de0a99cb870d8f040b18444c5c50ef5fd41db071..4b5f3316342d199a6eff8e6250bbc055aeb39c9a 100644 (file)
@@ -2332,10 +2332,19 @@ void DaemonServer::_prune_pending_service_map()
     while (q != p->second.daemons.end()) {
       DaemonKey key{p->first, q->first};
       if (!daemon_state.exists(key)) {
-       derr << "missing key " << key << dendl;
-       ++q;
-       continue;
+        if (ServiceMap::is_normal_ceph_entity(p->first)) {
+          dout(10) << "daemon " << key << " in service map but not in daemon state "
+                   << "index -- force pruning" << dendl;
+          q = p->second.daemons.erase(q);
+          pending_service_map_dirty = pending_service_map.epoch;
+        } else {
+          derr << "missing key " << key << dendl;
+          ++q;
+        }
+
+        continue;
       }
+
       auto daemon = daemon_state.get(key);
       std::lock_guard l(daemon->lock);
       if (daemon->last_service_beacon == utime_t()) {