git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: agent: allow agent down multiplier to be configured 44101/head
author: Adam King <adking@redhat.com>
Wed, 24 Nov 2021 23:52:10 +0000 (18:52 -0500)
committer: Adam King <adking@redhat.com>
Wed, 24 Nov 2021 23:52:10 +0000 (18:52 -0500)
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/agent.py
src/pybind/mgr/cephadm/module.py

index 624c3b20154725301012e62942fd1cdb864f1f68..523e945a9ccea6c271f554edf50354dd9198251b 100644 (file)
@@ -371,9 +371,10 @@ class CephadmAgentHelpers:
             self.mgr.cache.agent_timestamp[host] = datetime_now()
             if host in self.mgr.offline_hosts:
                 return False
-        # agent hasn't reported in 2.5 * it's refresh rate. Something is likely wrong with it.
+        # agent hasn't reported in down multiplier * it's refresh rate. Something is likely wrong with it.
+        down_mult: float = max(self.mgr.agent_down_multiplier, 1.5)
         time_diff = datetime_now() - self.mgr.cache.agent_timestamp[host]
-        if time_diff.total_seconds() > 2.5 * float(self.mgr.agent_refresh_rate):
+        if time_diff.total_seconds() > down_mult * float(self.mgr.agent_refresh_rate):
             return True
         return False
 
@@ -381,9 +382,10 @@ class CephadmAgentHelpers:
         self.mgr.remove_health_warning('CEPHADM_AGENT_DOWN')
         if down_agent_hosts:
             detail: List[str] = []
+            down_mult: float = max(self.mgr.agent_down_multiplier, 1.5)
             for agent in down_agent_hosts:
                 detail.append((f'Cephadm agent on host {agent} has not reported in '
-                              f'{2.5 * self.mgr.agent_refresh_rate} seconds. Agent is assumed '
+                              f'{down_mult * self.mgr.agent_refresh_rate} seconds. Agent is assumed '
                                'down and host may be offline.'))
             for dd in [d for d in self.mgr.cache.get_daemons_by_type('agent') if d.hostname in down_agent_hosts]:
                 dd.status = DaemonDescriptionStatus.error
index db7568a3e980fead33035601414d80c473cf773e..cd928382d03405deda043cad605ec630d1635829 100644 (file)
@@ -355,6 +355,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             default=4721,
             desc='First port agent will try to bind to (will also try up to next 1000 subsequent ports if blocked)'
         ),
+        Option(
+            'agent_down_multiplier',
+            type='float',
+            default=3.0,
+            desc='Multiplied by agent refresh rate to calculate how long agent must not report before being marked down'
+        ),
         Option(
             'max_osd_draining_count',
             type='int',
@@ -423,6 +429,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.ssh_pub: Optional[str] = None
             self.use_agent = False
             self.agent_refresh_rate = 0
+            self.agent_down_multiplier = 0.0
             self.agent_starting_port = 0
             self.apply_spec_fails: List[Tuple[str, str]] = []
             self.max_osd_draining_count = 10