git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr: force purge normal ceph entities from service map
authorVenky Shankar <vshankar@redhat.com>
Fri, 27 Mar 2020 04:00:08 +0000 (00:00 -0400)
committerVicente Cheng <freeze.bilsted@gmail.com>
Tue, 21 Apr 2020 16:38:42 +0000 (16:38 +0000)
Normal ceph services can send task status updates to manager.
Task status is tracked in service map implying that normal
ceph services have entries in service map and daemon tracking
index (daemon state). But the manager prunes entries from daemon
state when it receives an updated map (fs, mon, etc.). This
causes periodic pruning of service map entries to fail for normal
ceph services (those which send task status updates) since it
expects a corresponding entry in daemon state.

Signed-off-by: Venky Shankar <vshankar@redhat.com>
(cherry picked from commit bccbf1fa03ed2fb02ad2e50e6aaf963b36d8bd30)

qa/tasks/cephfs/test_scrub_checks.py
src/mgr/DaemonServer.cc

index f36b2303cad71f123804ebf78fb00232e2f6a69c..87b759e5d7a2a7f28a992551c33702babf439c35 100644 (file)
@@ -16,7 +16,7 @@ class TestScrubControls(CephFSTestCase):
     Test basic scrub control operations such as abort, pause and resume.
     """
 
-    MDSS_REQUIRED = 1
+    MDSS_REQUIRED = 2
     CLIENTS_REQUIRED = 1
 
     def _abort_scrub(self, expected):
@@ -129,6 +129,34 @@ class TestScrubControls(CephFSTestCase):
         time.sleep(10)
         self._check_task_status("idle")
 
+    def test_scrub_task_status_on_mds_failover(self):
+        # sleep enough to fetch updated task status
+        time.sleep(10)
+
+        (original_active, ) = self.fs.get_active_names()
+        original_standbys = self.mds_cluster.get_standby_daemons()
+        self._check_task_status("idle")
+
+        # Kill the rank 0
+        self.fs.mds_stop(original_active)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        def promoted():
+            active = self.fs.get_active_names()
+            return active and active[0] in original_standbys
+
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
+        self.wait_until_true(promoted, timeout=grace*2)
+
+        mgr_beacon_grace = float(self.fs.get_config("mgr_service_beacon_grace", service_type="mon"))
+
+        def status_check():
+            task_status = self.fs.get_task_status("scrub status")
+            return original_active not in task_status
+        self.wait_until_true(status_check, timeout=mgr_beacon_grace*2)
+
 class TestScrubChecks(CephFSTestCase):
     """
     Run flush and scrub commands on the specified files in the filesystem. This
index e485e8216163b7ac113488f373ef4f12bd4c69b5..93e770ad9cec105f3e64219227c3bf0ddaaa56ad 100644 (file)
@@ -2336,10 +2336,19 @@ void DaemonServer::_prune_pending_service_map()
     while (q != p->second.daemons.end()) {
       DaemonKey key(p->first, q->first);
       if (!daemon_state.exists(key)) {
-       derr << "missing key " << key << dendl;
-       ++q;
-       continue;
+        if (ServiceMap::is_normal_ceph_entity(p->first)) {
+          dout(10) << "daemon " << key << " in service map but not in daemon state "
+                   << "index -- force pruning" << dendl;
+          q = p->second.daemons.erase(q);
+          pending_service_map_dirty = pending_service_map.epoch;
+        } else {
+          derr << "missing key " << key << dendl;
+          ++q;
+        }
+
+        continue;
       }
+
       auto daemon = daemon_state.get(key);
       std::lock_guard l(daemon->lock);
       if (daemon->last_service_beacon == utime_t()) {