From: Sebastian Wagner Date: Fri, 21 Aug 2020 14:25:31 +0000 (+0200) Subject: mgr/cephadm: Make daemon actions asynchronous X-Git-Tag: v15.2.9~122^2~117^2~10 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3304bdeb506be1d9ffef0068d5e610ca968aa420;p=ceph.git mgr/cephadm: Make daemon actions asynchronous Fixes broken `ceph orch redeploy ` Signed-off-by: Sebastian Wagner (cherry picked from commit 365f839a79ac5e321dc088aaec07644c7bc554eb) --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index dd01469f578..1c66cdedb6c 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1620,23 +1620,12 @@ To check that the host is reachable: daemon_type=daemon_type, ) - if image is not None: - if action != 'redeploy': - raise OrchestratorError( - f'Cannot execute {action} with new image. `action` needs to be `redeploy`') - if daemon_type not in CEPH_TYPES: - raise OrchestratorError( - f'Cannot redeploy {daemon_type}.{daemon_id} with a new image: Supported ' - f'types are: {", ".join(CEPH_TYPES)}') - - self.check_mon_command({ - 'prefix': 'config set', - 'name': 'container_image', - 'value': image, - 'who': utils.name_to_config_section(daemon_type + '.' + daemon_id), - }) + self._daemon_action_set_image(action, image, daemon_type, daemon_id) if action == 'redeploy': + if self.daemon_is_self(daemon_type, daemon_id): + self.mgr_service.fail_over() + return # unreachable. # stop, recreate the container+unit, then restart return self._create_daemon(daemon_spec) elif action == 'reconfig': @@ -1660,13 +1649,50 @@ To check that the host is reachable: self.events.for_daemon(name, 'INFO', msg) return msg + def _daemon_action_set_image(self, action: str, image: Optional[str], daemon_type: str, daemon_id: str): + if image is not None: + if action != 'redeploy': + raise OrchestratorError( + f'Cannot execute {action} with new image. `action` needs to be `redeploy`') + if daemon_type not in CEPH_TYPES: + raise OrchestratorError( + f'Cannot redeploy {daemon_type}.{daemon_id} with a new image: Supported ' + f'types are: {", ".join(CEPH_TYPES)}') + + self.check_mon_command({ + 'prefix': 'config set', + 'name': 'container_image', + 'value': image, + 'who': utils.name_to_config_section(daemon_type + '.' + daemon_id), + }) + @trivial_completion def daemon_action(self, action: str, daemon_name: str, image: Optional[str]=None) -> str: d = self.cache.get_daemon(daemon_name) - self.log.info(f'{action} daemon {daemon_name}') - return self._daemon_action(d.daemon_type, d.daemon_id, - d.hostname, action, image=image) + if action == 'redeploy' and self.daemon_is_self(d.daemon_type, d.daemon_id) \ + and not self.mgr_service.mgr_map_has_standby(): + raise OrchestratorError( + f'Unable to schedule redeploy for {daemon_name}: No standby MGRs') + + self._daemon_action_set_image(action, image, d.daemon_type, d.daemon_id) + + self.log.info(f'Schedule {action} daemon {daemon_name}') + return self._schedule_daemon_action(daemon_name, action) + + def daemon_is_self(self, daemon_type: str, daemon_id: str) -> bool: + return daemon_type == 'mgr' and daemon_id == self.get_mgr_id() + + def _schedule_daemon_action(self, daemon_name: str, action: str): + dd = self.cache.get_daemon(daemon_name) + if action == 'redeploy' and self.daemon_is_self(dd.daemon_type, dd.daemon_id) \ + and not self.mgr_service.mgr_map_has_standby(): + raise OrchestratorError( + f'Unable to schedule redeploy for {daemon_name}: No standby MGRs') + self.cache.schedule_daemon_action(dd.hostname, dd.name(), action) + msg = "Scheduled to {} {} on host '{}'".format(action, daemon_name, dd.hostname) + self._kick_serve_loop() + return msg @trivial_completion def remove_daemons(self, names): @@ -2155,30 +2181,34 @@ To check that the host is reachable: dd.hostname, dd.name()) if last_deps is None: last_deps = [] - reconfig = False + action = self.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) if not last_config: self.log.info('Reconfiguring %s (unknown last config time)...'% ( dd.name())) - reconfig = True + action = 'reconfig' elif last_deps != deps: self.log.debug('%s deps %s -> %s' % (dd.name(), last_deps, deps)) self.log.info('Reconfiguring %s (dependencies changed)...' % ( dd.name())) - reconfig = True + action = 'reconfig' elif self.last_monmap and \ self.last_monmap > last_config and \ dd.daemon_type in CEPH_TYPES: self.log.info('Reconfiguring %s (monmap changed)...' % dd.name()) - reconfig = True - if reconfig: + action = 'reconfig' + if action: + if self.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) == 'redeploy' \ + and action == 'reconfig': + action = 'redeploy' try: - self._create_daemon( - CephadmDaemonSpec( - host=dd.hostname, - daemon_id=dd.daemon_id, - daemon_type=dd.daemon_type), - reconfig=True) + self._daemon_action( + daemon_type=dd.daemon_type, + daemon_id=dd.daemon_id, + host=dd.hostname, + action=action + ) + self.cache.rm_scheduled_daemon_action(dd.hostname, dd.name()) except OrchestratorError as e: self.events.from_orch_error(e) if dd.daemon_type in daemons_post: diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 304e4a10a00..52b08a21afa 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -355,17 +355,14 @@ class MgrService(CephadmService): return self.mgr._create_daemon(daemon_spec) def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription: - active_mgr_str = self.mgr.get('mgr_map')['active_name'] for daemon in daemon_descrs: - if daemon.daemon_id == active_mgr_str: + if self.mgr.daemon_is_self(daemon.daemon_type, daemon.daemon_id): return daemon # if no active mgr found, return empty Daemon Desc return DaemonDescription() def fail_over(self): - mgr_map = self.mgr.get('mgr_map') - num = len(mgr_map.get('standbys')) - if not num: + if not self.mgr_map_has_standby(): raise OrchestratorError('Need standby mgr daemon', event_kind_subject=( 'daemon', 'mgr' + self.mgr.get_mgr_id())) @@ -379,6 +376,15 @@ class MgrService(CephadmService): 'who': self.mgr.get_mgr_id(), }) + def mgr_map_has_standby(self) -> bool: + """ + This is a bit safer than asking our inventory. If the mgr joined the mgr map, + we know it joined the cluster + """ + mgr_map = self.mgr.get('mgr_map') + num = len(mgr_map.get('standbys')) + return bool(num) + class MdsService(CephadmService): TYPE = 'mds' diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index fcff289609b..5d53a5f13c4 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -184,11 +184,13 @@ class TestCephadm(object): with with_daemon(cephadm_module, RGWSpec(service_id='myrgw.foobar'), CephadmOrchestrator.add_rgw, 'test') as daemon_id: c = cephadm_module.daemon_action('redeploy', 'rgw.' + daemon_id) - assert wait(cephadm_module, c) == f"Deployed rgw.{daemon_id} on host 'test'" + assert wait(cephadm_module, + c) == f"Scheduled to redeploy rgw.{daemon_id} on host 'test'" for what in ('start', 'stop', 'restart'): c = cephadm_module.daemon_action(what, 'rgw.' + daemon_id) - assert wait(cephadm_module, c) == what + f" rgw.{daemon_id} from host 'test'" + assert wait(cephadm_module, + c) == F"Scheduled to {what} rgw.{daemon_id} on host 'test'" # Make sure, _check_daemons does a redeploy due to monmap change: cephadm_module._store['_ceph_get/mon_map'] = { diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index 89cc39038d8..eb1929bc961 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -261,8 +261,7 @@ class CephadmUpgrade: daemon_type, d.daemon_id, d.container_image_name, d.container_image_id, d.version)) - if daemon_type == 'mgr' and \ - d.daemon_id == self.mgr.get_mgr_id(): + if self.mgr.daemon_is_self(d.daemon_type, d.daemon_id): logger.info('Upgrade: Need to upgrade myself (mgr.%s)' % self.mgr.get_mgr_id()) need_upgrade_self = True