self.offline_watcher = OfflineHostWatcher(self)
self.offline_watcher.start()
+ # Maps daemon names to timestamps (creation/removal time) for recently created or
+ # removed daemons. Daemons are added to the dict upon creation or removal and cleared
+ # as part of the handling of stray daemons
+ self.recently_altered_daemons: Dict[str, datetime.datetime] = {}
+
def shutdown(self) -> None:
self.log.debug('shutdown')
self._worker_pool.close()
for k in ['CEPHADM_STRAY_HOST',
'CEPHADM_STRAY_DAEMON']:
self.mgr.remove_health_warning(k)
+ # clear recently altered daemons that were created/removed more than 60 seconds ago
+ self.mgr.recently_altered_daemons = {
+ d: t for (d, t) in self.mgr.recently_altered_daemons.items()
+ if ((datetime_now() - t).total_seconds() < 60)
+ }
if self.mgr.warn_on_stray_hosts or self.mgr.warn_on_stray_daemons:
ls = self.mgr.list_servers()
self.log.debug(ls)
# and don't have a way to check if the daemon is part of iscsi service
# we assume that all tcmu-runner daemons are managed by cephadm
managed.append(name)
+ # Don't mark daemons we just created/removed in the last minute as stray.
+ # It may take some time for the mgr to become aware the daemon
+ # had been created/removed.
+ if name in self.mgr.recently_altered_daemons:
+ continue
if host not in self.mgr.inventory:
missing_names.append(name)
host_num_daemons += 1
what = 'reconfigure' if reconfig else 'deploy'
self.mgr.events.for_daemon(
daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
+ self.mgr.recently_altered_daemons[daemon_spec.name()] = datetime_now()
return msg
except OrchestratorError:
redeploy = daemon_spec.name() in self.mgr.cache.get_daemon_names()
daemon_type)].post_remove(daemon, is_failed_deploy=False))
self.mgr._kick_serve_loop()
+ self.mgr.recently_altered_daemons[name] = datetime_now()
return "Removed {} from host '{}'".format(name, host)
async def _run_cephadm_json(self,