mgr/cephadm: don't mark daemons created/removed in the last minute as stray

author Adam King <adking@redhat.com>

Wed, 17 Apr 2024 13:36:13 +0000 (09:36 -0400)

committer Adam King <adking@redhat.com>

Mon, 22 Apr 2024 20:14:57 +0000 (16:14 -0400)
author Adam King <adking@redhat.com>
Wed, 17 Apr 2024 13:36:13 +0000 (09:36 -0400)
committer Adam King <adking@redhat.com>
Mon, 22 Apr 2024 20:14:57 +0000 (16:14 -0400)
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index 49eab78fe70e6a63d4089666fa1d2f32e9df2f3c..2a75a08e0f60e8ca91bd625f94635ecf5d9484fc 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -730,6 +730,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
          self.offline_watcher = OfflineHostWatcher(self)
          self.offline_watcher.start()
  
+        # Maps daemon names to timestamps (creation/removal time) for recently created or
+        # removed daemons. Entries are added upon daemon creation or removal, and entries
+        # older than 60 seconds are pruned as part of the handling of stray daemons.
+        self.recently_altered_daemons: Dict[str, datetime.datetime] = {}
+
      def shutdown(self) -> None:
          self.log.debug('shutdown')
          self._worker_pool.close()
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py

index 4c7889bd18fe0528611b9194180285f1460b7d2a..1094ecb83137a7100f10636eee32ce295e26a666 100644 (file)
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -466,6 +466,11 @@ class CephadmServe:
          for k in ['CEPHADM_STRAY_HOST',
                    'CEPHADM_STRAY_DAEMON']:
              self.mgr.remove_health_warning(k)
+        # clear recently altered daemons that were created/removed more than 60 seconds ago
+        self.mgr.recently_altered_daemons = {
+            d: t for (d, t) in self.mgr.recently_altered_daemons.items()
+            if ((datetime_now() - t).total_seconds() < 60)
+        }
          if self.mgr.warn_on_stray_hosts or self.mgr.warn_on_stray_daemons:
              ls = self.mgr.list_servers()
              self.log.debug(ls)
@@ -504,6 +509,11 @@ class CephadmServe:
                          # and don't have a way to check if the daemon is part of iscsi service
                          # we assume that all tcmu-runner daemons are managed by cephadm
                          managed.append(name)
+                    # Don't mark daemons we just created/removed in the last minute as stray.
+                    # It may take some time for the mgr to become aware that the daemon
+                    # has been created/removed.
+                    if name in self.mgr.recently_altered_daemons:
+                        continue
                      if host not in self.mgr.inventory:
                          missing_names.append(name)
                          host_num_daemons += 1
@@ -1409,6 +1419,7 @@ class CephadmServe:
                      what = 'reconfigure' if reconfig else 'deploy'
                      self.mgr.events.for_daemon(
                          daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
+                self.mgr.recently_altered_daemons[daemon_spec.name()] = datetime_now()
                  return msg
              except OrchestratorError:
                  redeploy = daemon_spec.name() in self.mgr.cache.get_daemon_names()
@@ -1508,6 +1519,7 @@ class CephadmServe:
                                                              daemon_type)].post_remove(daemon, is_failed_deploy=False))
                      self.mgr._kick_serve_loop()
  
+            self.mgr.recently_altered_daemons[name] = datetime_now()
              return "Removed {} from host '{}'".format(name, host)
  
      async def _run_cephadm_json(self,
author	Adam King <adking@redhat.com>
	Wed, 17 Apr 2024 13:36:13 +0000 (09:36 -0400)
committer	Adam King <adking@redhat.com>
	Mon, 22 Apr 2024 20:14:57 +0000 (16:14 -0400)
src/pybind/mgr/cephadm/module.py		patch | blob | history
src/pybind/mgr/cephadm/serve.py		patch | blob | history