From fedcfc0a2d246a153f85f4d96e4f58a80674e82a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 25 Jan 2020 16:09:36 -0600 Subject: [PATCH] mgr/cephadm: upgrade: pull image after upgrade start, and for each host Make 'upgrade start' return quickly, without first pulling the image. Pull the image once to establish the image_id. For each host, before updating the container, ensure the local image is up to date, and if not, pull. If a pull returns a different image_id, restart upgrade process. (This could live-lock if two hosts have different container registries that return different image ids for the same image name. :/) Signed-off-by: Sage Weil --- src/pybind/mgr/cephadm/module.py | 37 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index c5d74910d1d..43e69fd9b0c 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -419,7 +419,14 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): return None target_name = self.upgrade_state.get('target_name') - target_id = self.upgrade_state.get('target_id') + target_id = self.upgrade_state.get('target_id', None) + if not target_id: + # need to learn the container hash + self.log.info('Upgrade: First pull of %s' % target_name) + target_id, target_version = self._get_container_image_id(target_name) + self.upgrade_state['target_id'] = target_id + self.upgrade_state['target_version'] = target_version + self._save_upgrade_state() target_version = self.upgrade_state.get('target_version') self.log.info('Upgrade: Target is %s with id %s' % (target_name, target_id)) @@ -455,6 +462,29 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): need_upgrade_self = True continue + # make sure host has latest container image + out, err, code = self._run_cephadm( + d.nodename, None, 'inspect-image', [], + image=target_name, no_fsid=True) + self.log.debug('out %s code %s' % (out, code)) + if code or json.loads(''.join(out)).get('image_id') != target_id: + self.log.info('Upgrade: Pulling %s on %s' % (target_name, + d.nodename)) + out, err, code = self._run_cephadm( + d.nodename, None, 'pull', [], + image=target_name, no_fsid=True) + if code: + self.log.warning('Upgrade: failed to pull %s on %s' % ( + target_name, d.nodename)) + # FIXME + continue + r = json.loads(''.join(out)) + if r.get('image_id') != target_id: + self.log.info('Upgrade: image %s pull on %s got new image %s (not %s), restarting' % (target_name, d.nodename, r['image_id'], target_id)) + self.upgrade_state['image_id'] = r['image_id'] + self._save_upgrade_state() + return None + if not self._wait_for_ok_to_stop(d): return None self.log.info('Upgrade: Redeploying %s.%s' % @@ -1844,16 +1874,13 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): self.upgrade_state.get('target_name')) return trivial_result('Upgrade to %s in progress' % self.upgrade_state.get('target_name')) - target_id, target_version = self._get_container_image_id(target_name) self.upgrade_state = { 'target_name': target_name, - 'target_id': target_id, - 'target_version': target_version, } self._save_upgrade_state() self._clear_health_checks() self.event.set() - return trivial_result('Initiating upgrade to %s %s' % (image, target_id)) + return trivial_result('Initiating upgrade to %s' % (image)) def upgrade_pause(self): if not self.upgrade_state: -- 2.39.5