From: Sebastian Wagner <sebastian.wagner@suse.com>
Date: Fri, 21 Aug 2020 14:25:31 +0000 (+0200)
Subject: mgr/cephadm: Make daemon actions asynchronous
X-Git-Tag: v15.2.9~122^2~117^2~10
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3304bdeb506be1d9ffef0068d5e610ca968aa420;p=ceph.git

mgr/cephadm: Make daemon actions asynchronous

Fixes broken `ceph orch redeploy <current active mgr>`

Signed-off-by: Sebastian Wagner <sebastian.wagner@suse.com>
(cherry picked from commit 365f839a79ac5e321dc088aaec07644c7bc554eb)
---

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index dd01469f578..1c66cdedb6c 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1620,23 +1620,12 @@ To check that the host is reachable:
             daemon_type=daemon_type,
         )
 
-        if image is not None:
-            if action != 'redeploy':
-                raise OrchestratorError(
-                    f'Cannot execute {action} with new image. `action` needs to be `redeploy`')
-            if daemon_type not in CEPH_TYPES:
-                raise OrchestratorError(
-                    f'Cannot redeploy {daemon_type}.{daemon_id} with a new image: Supported '
-                    f'types are: {", ".join(CEPH_TYPES)}')
-
-            self.check_mon_command({
-                'prefix': 'config set',
-                'name': 'container_image',
-                'value': image,
-                'who': utils.name_to_config_section(daemon_type + '.' + daemon_id),
-            })
+        self._daemon_action_set_image(action, image, daemon_type, daemon_id)
 
         if action == 'redeploy':
+            if self.daemon_is_self(daemon_type, daemon_id):
+                self.mgr_service.fail_over()
+                return # unreachable.
             # stop, recreate the container+unit, then restart
             return self._create_daemon(daemon_spec)
         elif action == 'reconfig':
@@ -1660,13 +1649,50 @@ To check that the host is reachable:
         self.events.for_daemon(name, 'INFO', msg)
         return msg
 
+    def _daemon_action_set_image(self, action: str, image: Optional[str], daemon_type: str, daemon_id: str):
+        if image is not None:
+            if action != 'redeploy':
+                raise OrchestratorError(
+                    f'Cannot execute {action} with new image. `action` needs to be `redeploy`')
+            if daemon_type not in CEPH_TYPES:
+                raise OrchestratorError(
+                    f'Cannot redeploy {daemon_type}.{daemon_id} with a new image: Supported '
+                    f'types are: {", ".join(CEPH_TYPES)}')
+
+            self.check_mon_command({
+                'prefix': 'config set',
+                'name': 'container_image',
+                'value': image,
+                'who': utils.name_to_config_section(daemon_type + '.' + daemon_id),
+            })
+
     @trivial_completion
     def daemon_action(self, action: str, daemon_name: str, image: Optional[str]=None) -> str:
         d = self.cache.get_daemon(daemon_name)
 
-        self.log.info(f'{action} daemon {daemon_name}')
-        return self._daemon_action(d.daemon_type, d.daemon_id,
-                                 d.hostname, action, image=image)
+        if action == 'redeploy' and self.daemon_is_self(d.daemon_type, d.daemon_id) \
+                and not self.mgr_service.mgr_map_has_standby():
+            raise OrchestratorError(
+                f'Unable to schedule redeploy for {daemon_name}: No standby MGRs')
+
+        self._daemon_action_set_image(action, image, d.daemon_type, d.daemon_id)
+
+        self.log.info(f'Schedule {action} daemon {daemon_name}')
+        return self._schedule_daemon_action(daemon_name, action)
+
+    def daemon_is_self(self, daemon_type: str, daemon_id: str) -> bool:
+        return daemon_type == 'mgr' and daemon_id == self.get_mgr_id()
+
+    def _schedule_daemon_action(self, daemon_name: str, action: str):
+        dd = self.cache.get_daemon(daemon_name)
+        if action == 'redeploy' and self.daemon_is_self(dd.daemon_type, dd.daemon_id) \
+                and not self.mgr_service.mgr_map_has_standby():
+            raise OrchestratorError(
+                f'Unable to schedule redeploy for {daemon_name}: No standby MGRs')
+        self.cache.schedule_daemon_action(dd.hostname, dd.name(), action)
+        msg = "Scheduled to {} {} on host '{}'".format(action, daemon_name, dd.hostname)
+        self._kick_serve_loop()
+        return msg
 
     @trivial_completion
     def remove_daemons(self, names):
@@ -2155,30 +2181,34 @@ To check that the host is reachable:
                 dd.hostname, dd.name())
             if last_deps is None:
                 last_deps = []
-            reconfig = False
+            action = self.cache.get_scheduled_daemon_action(dd.hostname, dd.name())
             if not last_config:
                 self.log.info('Reconfiguring %s (unknown last config time)...'% (
                     dd.name()))
-                reconfig = True
+                action = 'reconfig'
             elif last_deps != deps:
                 self.log.debug('%s deps %s -> %s' % (dd.name(), last_deps,
                                                      deps))
                 self.log.info('Reconfiguring %s (dependencies changed)...' % (
                     dd.name()))
-                reconfig = True
+                action = 'reconfig'
             elif self.last_monmap and \
                     self.last_monmap > last_config and \
                dd.daemon_type in CEPH_TYPES:
                 self.log.info('Reconfiguring %s (monmap changed)...' % dd.name())
-                reconfig = True
-            if reconfig:
+                action = 'reconfig'
+            if action:
+                if self.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) == 'redeploy' \
+                        and action == 'reconfig':
+                    action = 'redeploy'
                 try:
-                    self._create_daemon(
-                        CephadmDaemonSpec(
-                            host=dd.hostname,
-                            daemon_id=dd.daemon_id,
-                            daemon_type=dd.daemon_type),
-                        reconfig=True)
+                    self._daemon_action(
+                        daemon_type=dd.daemon_type,
+                        daemon_id=dd.daemon_id,
+                        host=dd.hostname,
+                        action=action
+                    )
+                    self.cache.rm_scheduled_daemon_action(dd.hostname, dd.name())
                 except OrchestratorError as e:
                     self.events.from_orch_error(e)
                     if dd.daemon_type in daemons_post:
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index 304e4a10a00..52b08a21afa 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -355,17 +355,14 @@ class MgrService(CephadmService):
         return self.mgr._create_daemon(daemon_spec)
 
     def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
-        active_mgr_str = self.mgr.get('mgr_map')['active_name']
         for daemon in daemon_descrs:
-            if daemon.daemon_id == active_mgr_str:
+            if self.mgr.daemon_is_self(daemon.daemon_type, daemon.daemon_id):
                 return daemon
         # if no active mgr found, return empty Daemon Desc
         return DaemonDescription()
 
     def fail_over(self):
-        mgr_map = self.mgr.get('mgr_map')
-        num = len(mgr_map.get('standbys'))
-        if not num:
+        if not self.mgr_map_has_standby():
             raise OrchestratorError('Need standby mgr daemon', event_kind_subject=(
                 'daemon', 'mgr' + self.mgr.get_mgr_id()))
 
@@ -379,6 +376,15 @@ class MgrService(CephadmService):
             'who': self.mgr.get_mgr_id(),
         })
 
+    def mgr_map_has_standby(self) -> bool:
+        """
+        This is a bit safer than asking our inventory. If the mgr joined the mgr map,
+        we know it joined the cluster
+        """
+        mgr_map = self.mgr.get('mgr_map')
+        num = len(mgr_map.get('standbys'))
+        return bool(num)
+
 
 class MdsService(CephadmService):
     TYPE = 'mds'
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index fcff289609b..5d53a5f13c4 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -184,11 +184,13 @@ class TestCephadm(object):
             with with_daemon(cephadm_module, RGWSpec(service_id='myrgw.foobar'), CephadmOrchestrator.add_rgw, 'test') as daemon_id:
 
                 c = cephadm_module.daemon_action('redeploy', 'rgw.' + daemon_id)
-                assert wait(cephadm_module, c) == f"Deployed rgw.{daemon_id} on host 'test'"
+                assert wait(cephadm_module,
+                            c) == f"Scheduled to redeploy rgw.{daemon_id} on host 'test'"
 
                 for what in ('start', 'stop', 'restart'):
                     c = cephadm_module.daemon_action(what, 'rgw.' + daemon_id)
-                    assert wait(cephadm_module, c) == what + f" rgw.{daemon_id} from host 'test'"
+                    assert wait(cephadm_module,
+                                c) == F"Scheduled to {what} rgw.{daemon_id} on host 'test'"
 
                 # Make sure, _check_daemons does a redeploy due to monmap change:
                 cephadm_module._store['_ceph_get/mon_map'] = {
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index 89cc39038d8..eb1929bc961 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -261,8 +261,7 @@ class CephadmUpgrade:
                     daemon_type, d.daemon_id,
                     d.container_image_name, d.container_image_id, d.version))
 
-                if daemon_type == 'mgr' and \
-                   d.daemon_id == self.mgr.get_mgr_id():
+                if self.mgr.daemon_is_self(d.daemon_type, d.daemon_id):
                     logger.info('Upgrade: Need to upgrade myself (mgr.%s)' %
                                 self.mgr.get_mgr_id())
                     need_upgrade_self = True