mgr/cephadm: better handling of offline hosts with agent

author Adam King <adking@redhat.com>

Wed, 1 Sep 2021 15:03:16 +0000 (11:03 -0400)

committer Adam King <adking@redhat.com>

Fri, 24 Sep 2021 11:23:51 +0000 (07:23 -0400)
author Adam King <adking@redhat.com>
Wed, 1 Sep 2021 15:03:16 +0000 (11:03 -0400)
committer Adam King <adking@redhat.com>
Fri, 24 Sep 2021 11:23:51 +0000 (07:23 -0400)
diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py

index cdb42d4bc7293bbe6f526dccc9fc9de19c8864dd..9b16a4af7eb28b66e654556c2bf4e99d5be88c56 100644 (file)
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -106,6 +106,9 @@ class HostData:
          except Exception as e:
              self.mgr.log.warning(f'Received bad metadata from an agent: {e}')
          else:
+            # if we got here, we've already verified the keyring of the agent. If the
+            # host the agent is reporting on is marked offline, it shouldn't be any more
+            self.mgr.offline_hosts_remove(data['host'])
              self.handle_metadata(data)
  
      def check_request_fields(self, data: Dict[str, Any]) -> None:
@@ -271,9 +274,12 @@ class CephadmAgentHelpers:
  
      def _agent_down(self, host: str) -> bool:
          # if we don't have a timestamp, it's likely because of a mgr fail over.
-        # just set the timestamp to now
+        # just set the timestamp to now. However, if the host was offline before, we
+        # should not let the newly created timestamp cause it to be marked online
          if host not in self.mgr.cache.agent_timestamp:
              self.mgr.cache.agent_timestamp[host] = datetime_now()
+            if host in self.mgr.offline_hosts:
+                return False
          # agent hasn't reported in 2.5 * it's refresh rate. Something is likely wrong with it.
          time_diff = datetime_now() - self.mgr.cache.agent_timestamp[host]
          if time_diff.total_seconds() > 2.5 * float(self.mgr.agent_refresh_rate):
diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py

index 3ea558607aaa765a0a0a853e1796a5c60efa99c2..fd41ebb2c4f1ffdc4e9d6ad6a856b08244cb3dd9 100644 (file)
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -947,7 +947,12 @@ class HostCache():
          return True
  
      def all_host_metadata_up_to_date(self) -> bool:
-        if [h for h in self.get_hosts() if not self.host_metadata_up_to_date(h)]:
+        unreachables = [h.hostname for h in self.mgr._unreachable_hosts()]
+        if [h for h in self.get_hosts() if (not self.host_metadata_up_to_date(h) and h not in unreachables)]:
+            # this function is primarily for telling if it's safe to try and apply a service
+            # spec. Since offline/maintenance hosts aren't considered in that process anyway
+            # we don't want to return False if the host without up-to-date metadata is in one
+            # of those two categories.
              return False
          return True
  
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py

index e11a0e7e8e2fab74b74be88d60d59d99ca65d2e9..50c29e2f43902896abdf58a2c783b77aae076fea 100644 (file)
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -266,6 +266,8 @@ class CephadmServe:
                  return
  
              if self.mgr.use_agent and self.mgr.agent_helpers._agent_down(host):
+                agents_down.append(host)
+                self.mgr.cache.metadata_up_to_date[host] = False
                  if host in self.mgr.offline_hosts:
                      return
                  self.mgr.offline_hosts.add(host)
@@ -273,9 +275,9 @@ class CephadmServe:
                  # a long timeout trying to use an existing connection to an offline host
                  # REVISIT AFTER https://github.com/ceph/ceph/pull/42919
                  # self.mgr.ssh._reset_con(host)
-                agents_down.append(host)
-                # try to schedule redeploy of agent in case it is individually down
+
                  try:
+                    # try to schedule redeploy of agent in case it is individually down
                      agent = [a for a in self.mgr.cache.get_daemons_by_host(
                          host) if a.hostname == host][0]
                      self.mgr._schedule_daemon_action(agent.name(), 'redeploy')
@@ -283,8 +285,6 @@ class CephadmServe:
                      self.log.debug(
                          f'Failed to find entry for agent deployed on host {host}. Agent possibly never deployed: {e}')
                  return
-            elif self.mgr.use_agent:
-                self.mgr.offline_hosts_remove(host)
  
              if self.mgr.cache.host_needs_check(host):
                  r = self._check_host(host)
author	Adam King <adking@redhat.com>
	Wed, 1 Sep 2021 15:03:16 +0000 (11:03 -0400)
committer	Adam King <adking@redhat.com>
	Fri, 24 Sep 2021 11:23:51 +0000 (07:23 -0400)
src/pybind/mgr/cephadm/agent.py		patch \| blob \| history
src/pybind/mgr/cephadm/inventory.py		patch \| blob \| history
src/pybind/mgr/cephadm/serve.py		patch \| blob \| history