git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: offline host handling improvements for agent
author: Adam King <adking@redhat.com>
Mon, 6 Sep 2021 23:11:34 +0000 (19:11 -0400)
committer: Adam King <adking@redhat.com>
Fri, 24 Sep 2021 11:23:51 +0000 (07:23 -0400)
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/agent.py
src/pybind/mgr/cephadm/schedule.py
src/pybind/mgr/cephadm/serve.py

index 6671b02ad2abca25752f90f57fb14f3efbd33bb9..3a8028a4d0851ae8f6d899510a53434652334d55 100644 (file)
@@ -146,6 +146,7 @@ class HostData:
     def handle_metadata(self, data: Dict[str, Any]) -> None:
         try:
             host = data['host']
+            self.mgr.cache.agent_ports[host] = int(data['port'])
             if host not in self.mgr.cache.agent_counter:
                 self.mgr.log.debug(
                     f'Got metadata from agent on host {host} with no known counter entry. Starting counter at 1 and requesting new metadata')
@@ -153,7 +154,6 @@ class HostData:
                 self.mgr.agent_helpers._request_agent_acks({host})
                 return
 
-            self.mgr.cache.agent_ports[host] = int(data['port'])
             # update timestamp of most recent agent update
             self.mgr.cache.agent_timestamp[host] = datetime_now()
             up_to_date = False
index 7d45b57eae4d05f092f979f0da7765a26f552520..8f4f02e5e4b325ed51c13f3d1f041c07e277a014 100644 (file)
@@ -322,9 +322,7 @@ class HostAssignment(object):
         existing = existing_active + existing_standby
 
         # build to_add
-        if self.service_name == 'agent':
-            to_add = [dd for dd in others]
-        elif not count:
+        if not count:
             to_add = [dd for dd in others if dd.hostname not in [
                 h.hostname for h in self.unreachable_hosts]]
         else:
index a952fd625c55d37f8116def6c749d92a1e485f7c..c737e481209d63f9c98abec34b52e3eb2419b62d 100644 (file)
@@ -276,7 +276,7 @@ class CephadmServe:
                 self.mgr.cache.metadata_up_to_date[host] = False
                 if host in self.mgr.offline_hosts:
                     return
-                self.mgr.offline_hosts.add(host)
+
                 # In case host is actually offline, it's best to reset the connection to avoid
                 # a long timeout trying to use an existing connection to an offline host
                 # REVISIT AFTER https://github.com/ceph/ceph/pull/42919
@@ -375,11 +375,11 @@ class CephadmServe:
             for agent in agents_down:
                 detail.append((f'Cephadm agent on host {agent} has not reported in '
                               f'{2.5 * self.mgr.agent_refresh_rate} seconds. Agent is assumed '
-                               'down and host has been marked offline.'))
+                               'down and host may be offline.'))
             self.mgr.health_checks['CEPHADM_AGENT_DOWN'] = {
                 'severity': 'warning',
                 'summary': '%d Cephadm Agent(s) are not reporting. '
-                'Hosts marked offline' % (len(agents_down)),
+                'Hosts may be offline' % (len(agents_down)),
                 'count': len(agents_down),
                 'detail': detail,
             }