]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: agent: simplify handling of agent reports 44031/head
authorAdam King <adking@redhat.com>
Fri, 19 Nov 2021 18:58:02 +0000 (13:58 -0500)
committerAdam King <adking@redhat.com>
Wed, 15 Dec 2021 17:28:24 +0000 (12:28 -0500)
Don't try to do extra things like checking other agents
or updating health checks when agents report. Rely on
serve loop for that

This also should help with thrashing we've been seeing

Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/agent.py
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py

index c1609e042be4668b05214742e554622c66b3c163..0284a799321561135ddfbdbb72246caaa70d82da 100644 (file)
@@ -18,14 +18,13 @@ from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
 
 from datetime import datetime, timedelta
 from OpenSSL import crypto
-from contextlib import contextmanager
 from cryptography import x509
 from cryptography.x509.oid import NameOID
 from cryptography.hazmat.primitives.asymmetric import rsa
 from cryptography.hazmat.primitives import hashes, serialization
 from cryptography.hazmat.backends import default_backend
 
-from typing import Any, Dict, Iterator, List, Set, Tuple, TYPE_CHECKING
+from typing import Any, Dict, List, Set, Tuple, TYPE_CHECKING
 
 if TYPE_CHECKING:
     from cephadm.module import CephadmOrchestrator
@@ -83,11 +82,6 @@ class CherryPyThread(threading.Thread):
             self.mgr.log.debug('Starting cherrypy engine...')
             self.start_engine()
             self.mgr.log.debug('Cherrypy engine started.')
-            agents_down = []
-            for h in self.mgr.cache.get_hosts():
-                if self.mgr.agent_helpers._check_agent(h):
-                    agents_down.append(h)
-            self.mgr.agent_helpers._update_agent_down_healthcheck(agents_down)
             # wait for the shutdown event
             self.cherrypy_shutdown_event.wait()
             self.cherrypy_shutdown_event.clear()
@@ -205,12 +199,6 @@ class HostData:
             error_daemons_old = set([dd.name() for dd in self.mgr.cache.get_error_daemons()])
             daemon_count_old = len(self.mgr.cache.get_daemons_by_host(host))
 
-            agents_down = []
-            for h in self.mgr.cache.get_hosts():
-                if self.mgr.agent_helpers._check_agent(h):
-                    agents_down.append(h)
-            self.mgr.agent_helpers._update_agent_down_healthcheck(agents_down)
-
             up_to_date = False
 
             int_ack = int(data['ack'])
@@ -342,7 +330,6 @@ class AgentLockException(Exception):
 class CephadmAgentHelpers:
     def __init__(self, mgr: "CephadmOrchestrator"):
         self.mgr: "CephadmOrchestrator" = mgr
-        self.agent_locks: Dict[str, threading.Lock] = {}
 
     def _request_agent_acks(self, hosts: Set[str], increment: bool = False) -> None:
         for host in hosts:
@@ -431,18 +418,6 @@ class CephadmAgentHelpers:
             self.mgr.cache.agent_ports = {}
         return need_apply
 
-    @contextmanager
-    def agent_lock(self, host: str) -> Iterator[threading.Lock]:
-        if host not in self.mgr.agent_helpers.agent_locks:
-            self.mgr.agent_helpers.agent_locks[host] = threading.Lock()
-        lock = self.mgr.agent_helpers.agent_locks[host]
-        if not lock.acquire(False):
-            raise AgentLockException()
-        try:
-            yield lock
-        finally:
-            lock.release()
-
     def _check_agent(self, host: str) -> bool:
         try:
             assert self.mgr.cherrypy_thread
@@ -452,24 +427,6 @@ class CephadmAgentHelpers:
                 f'Delaying checking agent on {host} until cephadm endpoint finished creating root cert')
             return False
         if self.mgr.agent_helpers._agent_down(host):
-            if host not in self.mgr.offline_hosts:
-                self.mgr.cache.metadata_up_to_date[host] = False
-                # In case host is actually offline, it's best to reset the connection to avoid
-                # a long timeout trying to use an existing connection to an offline host
-                self.mgr.ssh._reset_con(host)
-
-                try:
-                    # try to schedule redeploy of agent in case it is individually down
-                    agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0]
-                    with self.mgr.agent_helpers.agent_lock(host):
-                        daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
-                        self.mgr._daemon_action(daemon_spec, action='redeploy')
-                except AgentLockException:
-                    self.mgr.log.debug(
-                        f'Could not redeploy agent on host {host}. Someone else holds agent\'s lock')
-                except Exception as e:
-                    self.mgr.log.debug(
-                        f'Failed to redeploy agent on host {host}. Agent possibly never deployed: {e}')
             return True
         else:
             try:
@@ -487,25 +444,17 @@ class CephadmAgentHelpers:
                     host, agent.name())
                 if not last_config or last_deps != deps:
                     daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
-                    with self.mgr.agent_helpers.agent_lock(host):
-                        self.mgr._daemon_action(daemon_spec, action='reconfig')
+                    self.mgr._daemon_action(daemon_spec, action='reconfig')
                     return False
-            except AgentLockException:
-                self.mgr.log.debug(
-                    f'Could not reconfig agent on host {host}. Someone else holds agent\'s lock')
             except Exception as e:
                 self.mgr.log.debug(
                     f'Agent on host {host} not ready to have config and deps checked: {e}')
             action = self.mgr.cache.get_scheduled_daemon_action(agent.hostname, agent.name())
             if action:
                 try:
-                    with self.mgr.agent_helpers.agent_lock(host):
-                        daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
-                        self.mgr._daemon_action(daemon_spec, action=action)
-                        self.mgr.cache.rm_scheduled_daemon_action(agent.hostname, agent.name())
-                except AgentLockException:
-                    self.mgr.log.debug(
-                        f'Could not {action} agent on host {host}. Someone else holds agent\'s lock')
+                    daemon_spec = CephadmDaemonDeploySpec.from_daemon_description(agent)
+                    self.mgr._daemon_action(daemon_spec, action=action)
+                    self.mgr.cache.rm_scheduled_daemon_action(agent.hostname, agent.name())
                 except Exception as e:
                     self.mgr.log.debug(
                         f'Agent on host {host} not ready to {action}: {e}')
index e60ce48d743e71bfc9b4ee03e8a7816b23cdbebc..ac276e181bf4768500cd386e7d81f37541e4c34b 100644 (file)
@@ -750,12 +750,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.offline_hosts.remove(host)
 
     def update_failed_daemon_health_check(self) -> None:
-        self.remove_health_warning('CEPHADM_FAILED_DAEMON')
         failed_daemons = []
         for dd in self.cache.get_error_daemons():
             failed_daemons.append('daemon %s on %s is in %s state' % (
                 dd.name(), dd.hostname, dd.status_desc
             ))
+        self.remove_health_warning('CEPHADM_FAILED_DAEMON')
         if failed_daemons:
             self.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len(
                 failed_daemons), failed_daemons)
index 9e3f9c0927f84a7ed1ca0a0cb1f665f67af4185c..bac455d465b2c54bf649fbc1bced2a3059655119 100644 (file)
@@ -855,9 +855,9 @@ class CephadmServe:
             if dd.daemon_type == 'agent':
                 try:
                     self.mgr.agent_helpers._check_agent(dd.hostname)
-                except Exception:
+                except Exception as e:
                     self.log.debug(
-                        f'Agent {dd.name()} could not be checked in _check_daemons')
+                        f'Agent {dd.name()} could not be checked in _check_daemons: {e}')
                 continue
 
             # These daemon types require additional configs after creation