From: Adam King Date: Wed, 3 Apr 2024 17:11:08 +0000 (-0400) Subject: mgr/cephadm: pass daemon's current image when reconfiguring X-Git-Tag: testing/wip-batrick-testing-20240411.154038~25^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=5922d20e05ef60414e72d11d0e947093d92dc288;p=ceph-ci.git mgr/cephadm: pass daemon's current image when reconfiguring Important to note here is that a reconfig will rewrite the config files of the daemon and restart it, but not rewrite the unit files. This lead to a bug where the grafana image we used between the quincy and squid release used different UIDs internally, which caused us to rewrite the config files as owned by a UID that worked with the new image but did not work with the image still specified in the unit.run file. This meant the grafana daemon was down from a bit after the mgr upgrade until the end of the upgrade when we redeploy all the monitoring images. Fixes: https://tracker.ceph.com/issues/65234 Signed-off-by: Adam King --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index b93cb7f3096..529e9e4a233 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1582,9 +1582,27 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self._kick_serve_loop() return HandleCommandResult() - def _get_container_image(self, daemon_name: str) -> Optional[str]: + def _get_container_image(self, daemon_name: str, use_current_daemon_image: bool = False) -> Optional[str]: daemon_type = daemon_name.split('.', 1)[0] # type: ignore image: Optional[str] = None + # Try to use current image if specified. This is necessary + # because, if we're reconfiguring the daemon, we can + # run into issues during upgrade. For example if the default grafana + # image is changing and we pass the new image name when doing + # the reconfig, we could end up using the UID required by the + # new image as owner for the config files we write, while the + # unit.run will still reference the old image that requires those + # config files to be owned by a different UID + # Note that "current image" just means the one we picked up + # when we last ran "cephadm ls" on the host + if use_current_daemon_image: + try: + dd = self.cache.get_daemon(daemon_name=daemon_name) + if dd.container_image_name: + return dd.container_image_name + except OrchestratorError: + self.log.debug(f'Could not find daemon {daemon_name} in cache ' + 'while searching for its image') if daemon_type in CEPH_IMAGE_TYPES: # get container image image = str(self.get_foreign_ceph_option( diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 33ec35de70d..559ef7e5be9 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -1373,6 +1373,7 @@ class CephadmServe: ), config_blobs=daemon_spec.final_config, ).dump_json_str(), + use_current_daemon_image=reconfig, ) if daemon_spec.daemon_type == 'agent': @@ -1517,11 +1518,12 @@ class CephadmServe: error_ok: Optional[bool] = False, image: Optional[str] = "", log_output: Optional[bool] = True, + use_current_daemon_image: bool = False, ) -> Any: try: out, err, code = await self._run_cephadm( host, entity, command, args, no_fsid=no_fsid, error_ok=error_ok, - image=image, log_output=log_output) + image=image, log_output=log_output, use_current_daemon_image=use_current_daemon_image) if code: raise OrchestratorError(f'host {host} `cephadm {command}` returned {code}: {err}') except Exception as e: @@ -1546,6 +1548,7 @@ class CephadmServe: env_vars: Optional[List[str]] = None, log_output: Optional[bool] = True, timeout: Optional[int] = None, # timeout in seconds + use_current_daemon_image: bool = False, ) -> Tuple[List[str], List[str], int]: """ Run cephadm on the remote host with the given command + args @@ -1566,7 +1569,10 @@ class CephadmServe: # Skip the image check for daemons deployed that are not ceph containers if not str(entity).startswith(bypass_image): if not image and entity is not cephadmNoImage: - image = self.mgr._get_container_image(entity) + image = self.mgr._get_container_image( + entity, + use_current_daemon_image=use_current_daemon_image + ) final_args = [] diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 2477de13e00..4cfebfb37bb 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -127,10 +127,12 @@ def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str [host]).stdout == f"Created osd(s) 1 on host '{host}'" assert _run_cephadm.mock_calls == [ mock.call(host, 'osd', 'ceph-volume', - ['--', 'lvm', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True), - mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY), + ['--', 'lvm', 'list', '--format', 'json'], + no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False), + mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, use_current_daemon_image=False), mock.call(host, 'osd', 'ceph-volume', - ['--', 'raw', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True), + ['--', 'raw', 'list', '--format', 'json'], + no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False), ] dd = cephadm_module.cache.get_daemon(f'osd.{osd_id}', host=host) assert dd.name() == f'osd.{osd_id}' @@ -524,6 +526,7 @@ class TestCephadm(object): }, }, }), + use_current_daemon_image=True, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -578,6 +581,7 @@ class TestCephadm(object): "crush_location": "datacenter=a", }, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -619,6 +623,7 @@ class TestCephadm(object): "keyring": "[client.crash.test]\nkey = None\n", }, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -660,6 +665,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -709,6 +715,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -762,6 +769,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + use_current_daemon_image=False, ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1107,9 +1115,13 @@ class TestCephadm(object): env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=foo'], error_ok=True, stdin='{"config": "", "keyring": ""}') _run_cephadm.assert_any_call( - 'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True) + 'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], + image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False + ) _run_cephadm.assert_any_call( - 'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True) + 'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], + image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False + ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm") def test_apply_osd_save_non_collocated(self, _run_cephadm, cephadm_module: CephadmOrchestrator): @@ -1147,11 +1159,16 @@ class TestCephadm(object): '--no-auto', '/dev/sdb', '--db-devices', '/dev/sdc', '--wal-devices', '/dev/sdd', '--yes', '--no-systemd'], env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=noncollocated'], - error_ok=True, stdin='{"config": "", "keyring": ""}') + error_ok=True, stdin='{"config": "", "keyring": ""}', + ) _run_cephadm.assert_any_call( - 'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True) + 'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], + image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False + ) _run_cephadm.assert_any_call( - 'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True) + 'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], + image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False + ) @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @mock.patch("cephadm.module.SpecStore.save") @@ -2283,10 +2300,10 @@ Traceback (most recent call last): assert _run_cephadm.mock_calls == [ mock.call('test', 'osd', 'ceph-volume', ['--', 'inventory', '--format=json-pretty', '--filter-for-batch'], image='', - no_fsid=False, error_ok=False, log_output=False), + no_fsid=False, error_ok=False, log_output=False, use_current_daemon_image=False), mock.call('test', 'osd', 'ceph-volume', ['--', 'inventory', '--format=json-pretty'], image='', - no_fsid=False, error_ok=False, log_output=False), + no_fsid=False, error_ok=False, log_output=False, use_current_daemon_image=False), ] @mock.patch("cephadm.serve.CephadmServe._run_cephadm") diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 593cdc85e73..715f7e56117 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -345,6 +345,7 @@ log_to_file = False""" }, } }), + use_current_daemon_image=False, ) @@ -480,6 +481,7 @@ timeout = 1.0\n""" } } }), + use_current_daemon_image=False, ) @@ -590,6 +592,7 @@ class TestMonitoring: "peers": [], } }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -685,6 +688,7 @@ class TestMonitoring: 'web_config': '/etc/alertmanager/web.yml', } }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -819,6 +823,7 @@ class TestMonitoring: 'ip_to_bind_to': '1.2.3.1', }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -997,6 +1002,7 @@ class TestMonitoring: 'web_config': '/etc/prometheus/web.yml', }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1065,6 +1071,7 @@ class TestMonitoring: }, }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1123,6 +1130,7 @@ class TestMonitoring: }, }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1234,6 +1242,7 @@ class TestMonitoring: "files": files, }, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @@ -1408,6 +1417,7 @@ spec: }, "config_blobs": {}, }), + use_current_daemon_image=True, ) @@ -1514,6 +1524,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1561,6 +1572,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1612,6 +1624,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -1668,6 +1681,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @@ -2735,6 +2749,7 @@ class TestJaeger: }, "config_blobs": config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -2774,6 +2789,7 @@ class TestJaeger: }, "config_blobs": es_config, }), + use_current_daemon_image=False, ) with with_service(cephadm_module, collector_spec): _run_cephadm.assert_called_with( @@ -2801,6 +2817,7 @@ class TestJaeger: }, "config_blobs": collector_config, }), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -2840,6 +2857,7 @@ class TestJaeger: }, "config_blobs": collector_config, }), + use_current_daemon_image=False, ) with with_service(cephadm_module, agent_spec): _run_cephadm.assert_called_with( @@ -2867,6 +2885,7 @@ class TestJaeger: }, "config_blobs": agent_config, }), + use_current_daemon_image=False, ) @@ -2923,6 +2942,7 @@ class TestCustomContainer: }, } ), + use_current_daemon_image=False, ) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -3009,6 +3029,7 @@ class TestCustomContainer: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + use_current_daemon_image=False, ) @@ -3059,6 +3080,7 @@ class TestSMB: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + use_current_daemon_image=False, ) @patch("cephadm.module.CephadmOrchestrator.get_unique_name") @@ -3128,4 +3150,5 @@ class TestSMB: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + use_current_daemon_image=False, )