From 549a0775146cae2b860063b6a6aadb7d68399408 Mon Sep 17 00:00:00 2001 From: Adam King Date: Wed, 24 Nov 2021 18:52:10 -0500 Subject: [PATCH] mgr/cephadm: agent: allow agent down multiplier to be configured Signed-off-by: Adam King --- src/pybind/mgr/cephadm/agent.py | 8 +++++--- src/pybind/mgr/cephadm/module.py | 7 +++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index 624c3b2015472..523e945a9ccea 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -371,9 +371,10 @@ class CephadmAgentHelpers: self.mgr.cache.agent_timestamp[host] = datetime_now() if host in self.mgr.offline_hosts: return False - # agent hasn't reported in 2.5 * it's refresh rate. Something is likely wrong with it. + # agent hasn't reported in down multiplier * its refresh rate. Something is likely wrong with it. + down_mult: float = max(self.mgr.agent_down_multiplier, 1.5) time_diff = datetime_now() - self.mgr.cache.agent_timestamp[host] - if time_diff.total_seconds() > 2.5 * float(self.mgr.agent_refresh_rate): + if time_diff.total_seconds() > down_mult * float(self.mgr.agent_refresh_rate): return True return False @@ -381,9 +382,10 @@ class CephadmAgentHelpers: self.mgr.remove_health_warning('CEPHADM_AGENT_DOWN') if down_agent_hosts: detail: List[str] = [] + down_mult: float = max(self.mgr.agent_down_multiplier, 1.5) for agent in down_agent_hosts: detail.append((f'Cephadm agent on host {agent} has not reported in ' - f'{2.5 * self.mgr.agent_refresh_rate} seconds. Agent is assumed ' + f'{down_mult * self.mgr.agent_refresh_rate} seconds. 
Agent is assumed ' 'down and host may be offline.')) for dd in [d for d in self.mgr.cache.get_daemons_by_type('agent') if d.hostname in down_agent_hosts]: dd.status = DaemonDescriptionStatus.error diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index db7568a3e980f..cd928382d0340 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -355,6 +355,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, default=4721, desc='First port agent will try to bind to (will also try up to next 1000 subsequent ports if blocked)' ), + Option( + 'agent_down_multiplier', + type='float', + default=3.0, + desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down' + ), Option( 'max_osd_draining_count', type='int', @@ -423,6 +429,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.ssh_pub: Optional[str] = None self.use_agent = False self.agent_refresh_rate = 0 + self.agent_down_multiplier = 0.0 self.agent_starting_port = 0 self.apply_spec_fails: List[Tuple[str, str]] = [] self.max_osd_draining_count = 10 -- 2.39.5