mgr/cephadm: agent: allow agent down multiplier to be configured

author Adam King <adking@redhat.com>

Wed, 24 Nov 2021 23:52:10 +0000 (18:52 -0500)

committer Adam King <adking@redhat.com>

Wed, 24 Nov 2021 23:52:10 +0000 (18:52 -0500)
author Adam King <adking@redhat.com>
Wed, 24 Nov 2021 23:52:10 +0000 (18:52 -0500)
committer Adam King <adking@redhat.com>
Wed, 24 Nov 2021 23:52:10 +0000 (18:52 -0500)
diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py

index 624c3b20154725301012e62942fd1cdb864f1f68..523e945a9ccea6c271f554edf50354dd9198251b 100644 (file)
--- a/src/pybind/mgr/cephadm/agent.py
+++ b/src/pybind/mgr/cephadm/agent.py
@@ -371,9 +371,10 @@ class CephadmAgentHelpers:
              self.mgr.cache.agent_timestamp[host] = datetime_now()
              if host in self.mgr.offline_hosts:
                  return False
-        # agent hasn't reported in 2.5 * it's refresh rate. Something is likely wrong with it.
+        # agent hasn't reported in down multiplier * its refresh rate. Something is likely wrong with it.
+        down_mult: float = max(self.mgr.agent_down_multiplier, 1.5)
          time_diff = datetime_now() - self.mgr.cache.agent_timestamp[host]
-        if time_diff.total_seconds() > 2.5 * float(self.mgr.agent_refresh_rate):
+        if time_diff.total_seconds() > down_mult * float(self.mgr.agent_refresh_rate):
              return True
          return False
  
@@ -381,9 +382,10 @@ class CephadmAgentHelpers:
          self.mgr.remove_health_warning('CEPHADM_AGENT_DOWN')
          if down_agent_hosts:
              detail: List[str] = []
+            down_mult: float = max(self.mgr.agent_down_multiplier, 1.5)
              for agent in down_agent_hosts:
                  detail.append((f'Cephadm agent on host {agent} has not reported in '
-                              f'{2.5 * self.mgr.agent_refresh_rate} seconds. Agent is assumed '
+                              f'{down_mult * self.mgr.agent_refresh_rate} seconds. Agent is assumed '
                                 'down and host may be offline.'))
              for dd in [d for d in self.mgr.cache.get_daemons_by_type('agent') if d.hostname in down_agent_hosts]:
                  dd.status = DaemonDescriptionStatus.error
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index db7568a3e980fead33035601414d80c473cf773e..cd928382d03405deda043cad605ec630d1635829 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -355,6 +355,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
              default=4721,
              desc='First port agent will try to bind to (will also try up to next 1000 subsequent ports if blocked)'
          ),
+        Option(
+            'agent_down_multiplier',
+            type='float',
+            default=3.0,
+            desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down'
+        ),
          Option(
              'max_osd_draining_count',
              type='int',
@@ -423,6 +429,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
              self.ssh_pub: Optional[str] = None
              self.use_agent = False
              self.agent_refresh_rate = 0
+            self.agent_down_multiplier = 0.0
              self.agent_starting_port = 0
              self.apply_spec_fails: List[Tuple[str, str]] = []
              self.max_osd_draining_count = 10
author	Adam King <adking@redhat.com>
	Wed, 24 Nov 2021 23:52:10 +0000 (18:52 -0500)
committer	Adam King <adking@redhat.com>
	Wed, 24 Nov 2021 23:52:10 +0000 (18:52 -0500)
src/pybind/mgr/cephadm/agent.py		patch | blob | history
src/pybind/mgr/cephadm/module.py		patch | blob | history