mgr: force purge normal ceph entities from service map

author Venky Shankar <vshankar@redhat.com>

Fri, 27 Mar 2020 04:00:08 +0000 (00:00 -0400)

committer Venky Shankar <vshankar@redhat.com>

Tue, 31 Mar 2020 06:41:00 +0000 (02:41 -0400)
author Venky Shankar <vshankar@redhat.com>
Fri, 27 Mar 2020 04:00:08 +0000 (00:00 -0400)
committer Venky Shankar <vshankar@redhat.com>
Tue, 31 Mar 2020 06:41:00 +0000 (02:41 -0400)
diff --git a/qa/tasks/cephfs/test_scrub_checks.py b/qa/tasks/cephfs/test_scrub_checks.py

index e3f5609afe50c6997d1e748990dfb690a422464e..012b6c009fda2884a278074bdac9dc8e3432359f 100644 (file)
--- a/qa/tasks/cephfs/test_scrub_checks.py
+++ b/qa/tasks/cephfs/test_scrub_checks.py
@@ -16,7 +16,7 @@ class TestScrubControls(CephFSTestCase):
      Test basic scrub control operations such as abort, pause and resume.
      """
  
-    MDSS_REQUIRED = 1
+    MDSS_REQUIRED = 2
      CLIENTS_REQUIRED = 1
  
      def _abort_scrub(self, expected):
@@ -129,6 +129,34 @@ class TestScrubControls(CephFSTestCase):
          time.sleep(10)
          self._check_task_status("idle")
  
+    def test_scrub_task_status_on_mds_failover(self):
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+
+        (original_active, ) = self.fs.get_active_names()
+        original_standbys = self.mds_cluster.get_standby_daemons()
+        self._check_task_status("idle")
+
+        # Kill the rank 0
+        self.fs.mds_stop(original_active)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        def promoted():
+            active = self.fs.get_active_names()
+            return active and active[0] in original_standbys
+
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
+        self.wait_until_true(promoted, timeout=grace*2)
+
+        mgr_beacon_grace = float(self.fs.get_config("mgr_service_beacon_grace", service_type="mon"))
+
+        def status_check():
+            task_status = self.fs.get_task_status("scrub status")
+            return original_active not in task_status
+        self.wait_until_true(status_check, timeout=mgr_beacon_grace*2)
+
  class TestScrubChecks(CephFSTestCase):
      """
      Run flush and scrub commands on the specified files in the filesystem. This
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc

index de0a99cb870d8f040b18444c5c50ef5fd41db071..4b5f3316342d199a6eff8e6250bbc055aeb39c9a 100644 (file)
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -2332,10 +2332,19 @@ void DaemonServer::_prune_pending_service_map()
      while (q != p->second.daemons.end()) {
        DaemonKey key{p->first, q->first};
        if (!daemon_state.exists(key)) {
-       derr << "missing key " << key << dendl;
-       ++q;
-       continue;
+        if (ServiceMap::is_normal_ceph_entity(p->first)) {
+          dout(10) << "daemon " << key << " in service map but not in daemon state "
+                   << "index -- force pruning" << dendl;
+          q = p->second.daemons.erase(q);
+          pending_service_map_dirty = pending_service_map.epoch;
+        } else {
+          derr << "missing key " << key << dendl;
+          ++q;
+        }
+
+        continue;
        }
+
        auto daemon = daemon_state.get(key);
        std::lock_guard l(daemon->lock);
        if (daemon->last_service_beacon == utime_t()) {
author	Venky Shankar <vshankar@redhat.com>
	Fri, 27 Mar 2020 04:00:08 +0000 (00:00 -0400)
committer	Venky Shankar <vshankar@redhat.com>
	Tue, 31 Mar 2020 06:41:00 +0000 (02:41 -0400)
qa/tasks/cephfs/test_scrub_checks.py		patch | blob | history
src/mgr/DaemonServer.cc		patch | blob | history