From: Adam King Date: Mon, 6 Sep 2021 23:11:34 +0000 (-0400) Subject: mgr/cephadm: offline host handling improvements for agent X-Git-Tag: v17.1.0~816^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f0139dd983b2886b1a6f68789672a53f67405cea;p=ceph.git mgr/cephadm: offline host handling improvements for agent Signed-off-by: Adam King --- diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index 6671b02ad2ab..3a8028a4d085 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -146,6 +146,7 @@ class HostData: def handle_metadata(self, data: Dict[str, Any]) -> None: try: host = data['host'] + self.mgr.cache.agent_ports[host] = int(data['port']) if host not in self.mgr.cache.agent_counter: self.mgr.log.debug( f'Got metadata from agent on host {host} with no known counter entry. Starting counter at 1 and requesting new metadata') @@ -153,7 +154,6 @@ class HostData: self.mgr.agent_helpers._request_agent_acks({host}) return - self.mgr.cache.agent_ports[host] = int(data['port']) # update timestamp of most recent agent update self.mgr.cache.agent_timestamp[host] = datetime_now() up_to_date = False diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py index 7d45b57eae4d..8f4f02e5e4b3 100644 --- a/src/pybind/mgr/cephadm/schedule.py +++ b/src/pybind/mgr/cephadm/schedule.py @@ -322,9 +322,7 @@ class HostAssignment(object): existing = existing_active + existing_standby # build to_add - if self.service_name == 'agent': - to_add = [dd for dd in others] - elif not count: + if not count: to_add = [dd for dd in others if dd.hostname not in [ h.hostname for h in self.unreachable_hosts]] else: diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index a952fd625c55..c737e481209d 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -276,7 +276,7 @@ class CephadmServe: self.mgr.cache.metadata_up_to_date[host] = False if host in self.mgr.offline_hosts: return - self.mgr.offline_hosts.add(host) + # In case host is actually offline, it's best to reset the connection to avoid # a long timeout trying to use an existing connection to an offline host # REVISIT AFTER https://github.com/ceph/ceph/pull/42919 @@ -375,11 +375,11 @@ class CephadmServe: for agent in agents_down: detail.append((f'Cephadm agent on host {agent} has not reported in ' f'{2.5 * self.mgr.agent_refresh_rate} seconds. Agent is assumed ' - 'down and host has been marked offline.')) + 'down and host may be offline.')) self.mgr.health_checks['CEPHADM_AGENT_DOWN'] = { 'severity': 'warning', 'summary': '%d Cephadm Agent(s) are not reporting. ' - 'Hosts marked offline' % (len(agents_down)), + 'Hosts may be offline' % (len(agents_down)), 'count': len(agents_down), 'detail': detail, }