From: Venky Shankar Date: Fri, 27 Mar 2020 04:00:08 +0000 (-0400) Subject: mgr: force purge normal ceph entities from service map X-Git-Tag: v14.2.10~93^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8b2528803fa47cfcf77c4b33840ef262692925ea;p=ceph.git mgr: force purge normal ceph entities from service map Normal ceph services can send task status updates to manager. Task status is tracked in service map implying that normal ceph services have entries in service map and daemon tracking index (daemon state). But the manager prunes entries from daemon state when it receives an updated map (fs, mon, etc...). This causes periodic pruning of service map entries to fail for normal ceph services (those which send task status updates) since it expects a corresponding entry in daemon state. Signed-off-by: Venky Shankar (cherry picked from commit bccbf1fa03ed2fb02ad2e50e6aaf963b36d8bd30) --- diff --git a/qa/tasks/cephfs/test_scrub_checks.py b/qa/tasks/cephfs/test_scrub_checks.py index f36b2303cad7..87b759e5d7a2 100644 --- a/qa/tasks/cephfs/test_scrub_checks.py +++ b/qa/tasks/cephfs/test_scrub_checks.py @@ -16,7 +16,7 @@ class TestScrubControls(CephFSTestCase): Test basic scrub control operations such as abort, pause and resume. """ - MDSS_REQUIRED = 1 + MDSS_REQUIRED = 2 CLIENTS_REQUIRED = 1 def _abort_scrub(self, expected): @@ -129,6 +129,34 @@ class TestScrubControls(CephFSTestCase): time.sleep(10) self._check_task_status("idle") + def test_scrub_task_status_on_mds_failover(self): + # sleep enough to fetch updated task status + time.sleep(10) + + (original_active, ) = self.fs.get_active_names() + original_standbys = self.mds_cluster.get_standby_daemons() + self._check_task_status("idle") + + # Kill the rank 0 + self.fs.mds_stop(original_active) + + grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) + + def promoted(): + active = self.fs.get_active_names() + return active and active[0] in original_standbys + + log.info("Waiting for promotion of one of the original standbys {0}".format( + original_standbys)) + self.wait_until_true(promoted, timeout=grace*2) + + mgr_beacon_grace = float(self.fs.get_config("mgr_service_beacon_grace", service_type="mon")) + + def status_check(): + task_status = self.fs.get_task_status("scrub status") + return original_active not in task_status + self.wait_until_true(status_check, timeout=mgr_beacon_grace*2) + class TestScrubChecks(CephFSTestCase): """ Run flush and scrub commands on the specified files in the filesystem. This diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc index e485e8216163..93e770ad9cec 100644 --- a/src/mgr/DaemonServer.cc +++ b/src/mgr/DaemonServer.cc @@ -2336,10 +2336,19 @@ void DaemonServer::_prune_pending_service_map() while (q != p->second.daemons.end()) { DaemonKey key(p->first, q->first); if (!daemon_state.exists(key)) { - derr << "missing key " << key << dendl; - ++q; - continue; + if (ServiceMap::is_normal_ceph_entity(p->first)) { + dout(10) << "daemon " << key << " in service map but not in daemon state " + << "index -- force pruning" << dendl; + q = p->second.daemons.erase(q); + pending_service_map_dirty = pending_service_map.epoch; + } else { + derr << "missing key " << key << dendl; + ++q; + } + + continue; } + auto daemon = daemon_state.get(key); std::lock_guard l(daemon->lock); if (daemon->last_service_beacon == utime_t()) {