except Exception as e:
self.mgr.log.warning(f'Received bad metadata from an agent: {e}')
else:
+ # if we got here, we've already verified the keyring of the agent. If the
+ # host the agent is reporting on is marked offline, it shouldn't be any more
+ self.mgr.offline_hosts_remove(data['host'])
self.handle_metadata(data)
def check_request_fields(self, data: Dict[str, Any]) -> None:
def _agent_down(self, host: str) -> bool:
# if we don't have a timestamp, it's likely because of a mgr fail over.
- # just set the timestamp to now
+ # just set the timestamp to now. However, if the host was offline before, we
+ # should not allow creating a new timestamp to cause it to be marked online
if host not in self.mgr.cache.agent_timestamp:
self.mgr.cache.agent_timestamp[host] = datetime_now()
+ if host in self.mgr.offline_hosts:
+ return False
# agent hasn't reported in 2.5 * it's refresh rate. Something is likely wrong with it.
time_diff = datetime_now() - self.mgr.cache.agent_timestamp[host]
if time_diff.total_seconds() > 2.5 * float(self.mgr.agent_refresh_rate):
return True
def all_host_metadata_up_to_date(self) -> bool:
- if [h for h in self.get_hosts() if not self.host_metadata_up_to_date(h)]:
+ unreachables = [h.hostname for h in self.mgr._unreachable_hosts()]
+ if [h for h in self.get_hosts() if (not self.host_metadata_up_to_date(h) and h not in unreachables)]:
+ # this function is primarily for telling whether it's safe to try to apply a
+ # service spec. Since offline/maintenance hosts aren't considered in that process
+ # anyway, we don't want to return False if the host without up-to-date metadata
+ # is in one of those two categories.
return False
return True
return
if self.mgr.use_agent and self.mgr.agent_helpers._agent_down(host):
+ agents_down.append(host)
+ self.mgr.cache.metadata_up_to_date[host] = False
if host in self.mgr.offline_hosts:
return
self.mgr.offline_hosts.add(host)
# a long timeout trying to use an existing connection to an offline host
# REVISIT AFTER https://github.com/ceph/ceph/pull/42919
# self.mgr.ssh._reset_con(host)
- agents_down.append(host)
- # try to schedule redeploy of agent in case it is individually down
+
try:
+ # try to schedule redeploy of agent in case it is individually down
agent = [a for a in self.mgr.cache.get_daemons_by_host(
host) if a.hostname == host][0]
self.mgr._schedule_daemon_action(agent.name(), 'redeploy')
self.log.debug(
f'Failed to find entry for agent deployed on host {host}. Agent possibly never deployed: {e}')
return
- elif self.mgr.use_agent:
- self.mgr.offline_hosts_remove(host)
if self.mgr.cache.host_needs_check(host):
r = self._check_host(host)