From: Adam King Date: Wed, 3 Apr 2024 17:11:08 +0000 (-0400) Subject: mgr/cephadm: pass daemon's current image when reconfiguring X-Git-Tag: testing/wip-yuriw-testing-20240501.200505-squid~4^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=069a25d8584b2dcddb89ba0cc062367130413e01;p=ceph-ci.git mgr/cephadm: pass daemon's current image when reconfiguring Important to note here is that a reconfig will rewrite the config files of the daemon and restart it, but not rewrite the unit files. This led to a bug where the grafana image we used between the quincy and squid releases used different UIDs internally, which caused us to rewrite the config files as owned by a UID that worked with the new image but did not work with the image still specified in the unit.run file. This meant the grafana daemon was down from a bit after the mgr upgrade until the end of the upgrade when we redeploy all the monitoring images. Fixes: https://tracker.ceph.com/issues/65234 Signed-off-by: Adam King (cherry picked from commit 5922d20e05ef60414e72d11d0e947093d92dc288) Conflicts: src/pybind/mgr/cephadm/tests/test_services.py --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 73ccbb19694..c65d34a384e 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1580,9 +1580,27 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self._kick_serve_loop() return HandleCommandResult() - def _get_container_image(self, daemon_name: str) -> Optional[str]: + def _get_container_image(self, daemon_name: str, use_current_daemon_image: bool = False) -> Optional[str]: daemon_type = daemon_name.split('.', 1)[0] # type: ignore image: Optional[str] = None + # Try to use current image if specified. This is necessary + # because, if we're reconfiguring the daemon, we can + # run into issues during upgrade. 
For example if the default grafana + # image is changing and we pass the new image name when doing + # the reconfig, we could end up using the UID required by the + # new image as owner for the config files we write, while the + # unit.run will still reference the old image that requires those + # config files to be owned by a different UID + # Note that "current image" just means the one we picked up + # when we last ran "cephadm ls" on the host + if use_current_daemon_image: + try: + dd = self.cache.get_daemon(daemon_name=daemon_name) + if dd.container_image_name: + return dd.container_image_name + except OrchestratorError: + self.log.debug(f'Could not find daemon {daemon_name} in cache ' + 'while searching for its image') if daemon_type in CEPH_IMAGE_TYPES: # get container image image = str(self.get_foreign_ceph_option( diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 33ec35de70d..559ef7e5be9 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -1373,6 +1373,7 @@ class CephadmServe: ), config_blobs=daemon_spec.final_config, ).dump_json_str(), + use_current_daemon_image=reconfig, ) if daemon_spec.daemon_type == 'agent': @@ -1517,11 +1518,12 @@ class CephadmServe: error_ok: Optional[bool] = False, image: Optional[str] = "", log_output: Optional[bool] = True, + use_current_daemon_image: bool = False, ) -> Any: try: out, err, code = await self._run_cephadm( host, entity, command, args, no_fsid=no_fsid, error_ok=error_ok, - image=image, log_output=log_output) + image=image, log_output=log_output, use_current_daemon_image=use_current_daemon_image) if code: raise OrchestratorError(f'host {host} `cephadm {command}` returned {code}: {err}') except Exception as e: @@ -1546,6 +1548,7 @@ class CephadmServe: env_vars: Optional[List[str]] = None, log_output: Optional[bool] = True, timeout: Optional[int] = None, # timeout in seconds + use_current_daemon_image: bool = False, ) -> Tuple[List[str], List[str], 
int]: """ Run cephadm on the remote host with the given command + args @@ -1566,7 +1569,10 @@ class CephadmServe: # Skip the image check for daemons deployed that are not ceph containers if not str(entity).startswith(bypass_image): if not image and entity is not cephadmNoImage: - image = self.mgr._get_container_image(entity) + image = self.mgr._get_container_image( + entity, + use_current_daemon_image=use_current_daemon_image + ) final_args = [] diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 2477de13e00..4cfebfb37bb 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -127,10 +127,12 @@ def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str [host]).stdout == f"Created osd(s) 1 on host '{host}'" assert _run_cephadm.mock_calls == [ mock.call(host, 'osd', 'ceph-volume', - ['--', 'lvm', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True), - mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY), + ['--', 'lvm', 'list', '--format', 'json'], + no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False), + mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, use_current_daemon_image=False), mock.call(host, 'osd', 'ceph-volume', - ['--', 'raw', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True), + ['--', 'raw', 'list', '--format', 'json'], + no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False), ] dd = cephadm_module.cache.get_daemon(f'osd.{osd_id}', host=host) assert dd.name() == f'osd.{osd_id}' @@ -524,6 +526,7 @@ class TestCephadm(object): }, }, }), + use_current_daemon_image=True, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -578,6 +581,7 @@ class TestCephadm(object): "crush_location": "datacenter=a", }, }), + 
use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -619,6 +623,7 @@ class TestCephadm(object): "keyring": "[client.crash.test]\nkey = None\n", }, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -660,6 +665,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -709,6 +715,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -762,6 +769,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1107,9 +1115,13 @@ class TestCephadm(object): env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=foo'], error_ok=True, stdin='{"config": "", "keyring": ""}') _run_cephadm.assert_any_call( - 'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True) + 'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], + image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False + ) _run_cephadm.assert_any_call( - 'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True) + 'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], + image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False + ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") def test_apply_osd_save_non_collocated(self, _run_cephadm, cephadm_module: CephadmOrchestrator): @@ -1147,11 +1159,16 @@ class TestCephadm(object): '--no-auto', '/dev/sdb', '--db-devices', '/dev/sdc', '--wal-devices', '/dev/sdd', '--yes', '--no-systemd'], env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=noncollocated'], - error_ok=True, stdin='{"config": 
"", "keyring": ""}') + error_ok=True, stdin='{"config": "", "keyring": ""}', + ) _run_cephadm.assert_any_call( - 'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True) + 'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], + image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False + ) _run_cephadm.assert_any_call( - 'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True) + 'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], + image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False + ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @mock.patch("cephadm.module.SpecStore.save") @@ -2283,10 +2300,10 @@ Traceback (most recent call last): assert _run_cephadm.mock_calls == [ mock.call('test', 'osd', 'ceph-volume', ['--', 'inventory', '--format=json-pretty', '--filter-for-batch'], image='', - no_fsid=False, error_ok=False, log_output=False), + no_fsid=False, error_ok=False, log_output=False, use_current_daemon_image=False), mock.call('test', 'osd', 'ceph-volume', ['--', 'inventory', '--format=json-pretty'], image='', - no_fsid=False, error_ok=False, log_output=False), + no_fsid=False, error_ok=False, log_output=False, use_current_daemon_image=False), ] @mock.patch("cephadm.serve.CephadmServe._run_cephadm") diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 215feea3edd..ea99932c297 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -345,6 +345,7 @@ log_to_file = False""" }, } }), + use_current_daemon_image=False, ) @@ -455,6 +456,7 @@ tgt_cmd_extra_args = {tgt_cmd_extra_args}\n""" } } }), + use_current_daemon_image=False, ) @@ -565,6 +567,7 @@ class TestMonitoring: 
"peers": [], } }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -660,6 +663,7 @@ class TestMonitoring: 'web_config': '/etc/alertmanager/web.yml', } }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -762,6 +766,7 @@ class TestMonitoring: 'ip_to_bind_to': '1.2.3.1', }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -938,6 +943,7 @@ class TestMonitoring: 'web_config': '/etc/prometheus/web.yml', }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1006,6 +1012,7 @@ class TestMonitoring: }, }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1064,6 +1071,7 @@ class TestMonitoring: }, }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1175,6 +1183,7 @@ class TestMonitoring: "files": files, }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @@ -1349,6 +1358,7 @@ spec: }, "config_blobs": {}, }), + use_current_daemon_image=True, ) @@ -1455,6 +1465,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1502,6 +1513,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1553,6 +1565,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1609,6 +1622,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @@ -2677,6 +2691,7 @@ class TestJaeger: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -2716,6 +2731,7 @@ class TestJaeger: }, "config_blobs": es_config, }), + 
use_current_daemon_image=False, ) with with_service(cephadm_module, collector_spec): _run_cephadm.assert_called_with( @@ -2743,6 +2759,7 @@ class TestJaeger: }, "config_blobs": collector_config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -2782,6 +2799,7 @@ class TestJaeger: }, "config_blobs": collector_config, }), + use_current_daemon_image=False, ) with with_service(cephadm_module, agent_spec): _run_cephadm.assert_called_with( @@ -2809,6 +2827,7 @@ class TestJaeger: }, "config_blobs": agent_config, }), + use_current_daemon_image=False, ) @@ -2865,6 +2884,7 @@ class TestCustomContainer: }, } ), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -2951,6 +2971,7 @@ class TestCustomContainer: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + use_current_daemon_image=False, ) @@ -3001,6 +3022,7 @@ class TestSMB: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + use_current_daemon_image=False ) @patch("cephadm.module.CephadmOrchestrator.get_unique_name") @@ -3070,4 +3092,5 @@ class TestSMB: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + use_current_daemon_image=False )