From 9e02dfb26a48a7bdc03a244699adaf2e669b39e6 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Tue, 12 Mar 2024 10:26:18 -0400
Subject: [PATCH] cephadm: fix `cephadm shell --name` for stopped/failed daemon

This previously would always try to use 'podman inspect' on the
daemon's running container, but that doesn't work if the daemon is
stopped or failed. Doing this for stopped/failed daemons is a valid
use case, as we recommend `cephadm shell` with `--name` for running
debugging tools (often for OSDs).

Fixes: https://tracker.ceph.com/issues/64879

Signed-off-by: Adam King
(cherry picked from commit dc5e1ea20c31e1089966ab89370d82e24d138524)

Conflicts:
	src/cephadm/cephadm.py
---
 src/cephadm/cephadm.py            | 76 ++++++++++++++++++++++++++++---
 src/cephadm/tests/test_cephadm.py | 76 +++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+), 7 deletions(-)

diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index 74a3146bba1fa..2c0d65670722a 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -2687,17 +2687,66 @@ def get_container_info(ctx: CephadmContext, daemon_filter: str, by_name: bool) -
     if by_name and '.' not in daemon_filter:
         logger.warning(f'Trying to get container info using invalid daemon name {daemon_filter}')
         return None
-    daemons = list_daemons(ctx, detail=False)
-    matching_daemons = [d for d in daemons if daemon_name_or_type(d) == daemon_filter and d['fsid'] == ctx.fsid]
+    if by_name:
+        matching_daemons = _get_matching_daemons_by_name(ctx, daemon_filter)
+    else:
+        # NOTE: we are passing detail=False here as in this case where we are not
+        # doing it by_name, we really only need the names of the daemons. Additionally,
+        # when not doing it by_name, we are getting the info for all daemons on the
+        # host, and doing this with detail=True tends to be slow.
+        daemons = list_daemons(ctx, detail=False)
+        matching_daemons = [d for d in daemons if daemon_name_or_type(d) == daemon_filter and d['fsid'] == ctx.fsid]
     if matching_daemons:
-        d_type, d_id = matching_daemons[0]['name'].split('.', 1)
-        out, _, code = get_container_stats(ctx, ctx.container_engine.path, ctx.fsid, d_type, d_id)
-        if not code:
-            (container_id, image_name, image_id, start, version) = out.strip().split(',')
-            return ContainerInfo(container_id, image_name, image_id, start, version)
+        if (
+            by_name
+            and 'state' in matching_daemons[0]
+            and matching_daemons[0]['state'] != 'running'
+            and 'container_image_name' in matching_daemons[0]
+            and matching_daemons[0]['container_image_name']
+        ):
+            # this daemon container is not running so the regular `podman/docker inspect` on the
+            # container will not help us. If we have the image name from the list_daemons output
+            # we can try that.
+            image_name = matching_daemons[0]['container_image_name']
+            out, _, code = get_container_stats_by_image_name(ctx, ctx.container_engine.path, image_name)
+            if not code:
+                # keep in mind, the daemon container is not running, so no container id here
+                (image_id, start, version) = out.strip().split(',')
+                return ContainerInfo(
+                    container_id='',
+                    image_name=image_name,
+                    image_id=image_id,
+                    start=start,
+                    version=version)
+        else:
+            d_type, d_id = matching_daemons[0]['name'].split('.', 1)
+            out, _, code = get_container_stats(ctx, ctx.container_engine.path, ctx.fsid, d_type, d_id)
+            if not code:
+                (container_id, image_name, image_id, start, version) = out.strip().split(',')
+                return ContainerInfo(container_id, image_name, image_id, start, version)
     return None
 
 
+def _get_matching_daemons_by_name(ctx: CephadmContext, daemon_filter: str) -> List[Dict[str, str]]:
+    # NOTE: we are not passing detail=False to this list_daemons call
+    # as we want the container_image name in the case where we are
+    # doing this by name and this is skipped when detail=False
+    matching_daemons = list_daemons(ctx, daemon_name=daemon_filter)
+    if len(matching_daemons) > 1:
+        logger.warning(f'Found multiple daemons sharing same name: {daemon_filter}')
+        # Take the first daemon we find that is actually running, or just the
+        # first in the list if none are running
+        matched_daemon = None
+        for d in matching_daemons:
+            if 'state' in d and d['state'] == 'running':
+                matched_daemon = d
+                break
+        if not matched_daemon:
+            matched_daemon = matching_daemons[0]
+        matching_daemons = [matched_daemon]
+    return matching_daemons
+
+
 def infer_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional[str]:
     """
     Infer the local ceph image based on the following priority criteria:
@@ -7594,6 +7643,7 @@ def get_daemon_description(ctx, fsid, name, detail=False, legacy_dir=None):
 
 
 def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, daemon_type: str, daemon_id: str) -> Tuple[str, str, int]:
+    """returns container id, image name, image id, created time, and ceph version if available"""
     c = CephContainer.for_daemon(ctx, fsid, daemon_type, daemon_id, 'bash')
     out, err, code = '', '', -1
     for name in (c.cname, c.old_cname):
@@ -7607,6 +7657,18 @@ def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, dae
             break
     return out, err, code
 
+
+def get_container_stats_by_image_name(ctx: CephadmContext, container_path: str, image_name: str) -> Tuple[str, str, int]:
+    """returns image id, created time, and ceph version if available"""
+    out, err, code = '', '', -1
+    cmd = [
+        container_path, 'image', 'inspect',
+        '--format', '{{.Id}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}',
+        image_name
+    ]
+    out, err, code = call(ctx, cmd, verbosity=CallVerbosity.QUIET)
+    return out, err, code
+
 
 ##################################
diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py
index d310215f6e324..0b705834a5a6b 100644
--- a/src/cephadm/tests/test_cephadm.py
+++ b/src/cephadm/tests/test_cephadm.py
@@ -1,5 +1,6 @@
 # type: ignore
 
+import copy
 import errno
 import json
 import mock
@@ -741,6 +742,81 @@ class TestCephAdm(object):
         with mock.patch('cephadm.get_container_stats', return_value=container_stats):
             assert _cephadm.get_container_info(ctx, daemon_filter, by_name) == output
 
+    @mock.patch('cephadm.logger')
+    @mock.patch('cephadm.list_daemons')
+    @mock.patch('cephadm.get_container_stats')
+    @mock.patch('cephadm.get_container_stats_by_image_name')
+    def test_get_container_info_daemon_down(self, _get_stats_by_name, _get_stats, _list_daemons, _logger):
+        ctx = _cephadm.CephadmContext()
+        ctx.fsid = '5e39c134-dfc5-11ee-a344-5254000ee071'
+        ctx.container_engine = mock_podman()
+
+        # list_daemons output taken from cephadm ls of an
+        # OSD that was stopped, with substitutions
+        # true -> True
+        # null -> None
+        down_osd_json = {
+            "style": "cephadm:v1",
+            "name": "osd.2",
+            "fsid": "5e39c134-dfc5-11ee-a344-5254000ee071",
+            "systemd_unit": "ceph-5e39c134-dfc5-11ee-a344-5254000ee071@osd.2",
+            "enabled": True,
+            "state": "stopped",
+            "service_name": "osd.foo",
+            "ports": [],
+            "ip": None,
+            "deployed_by": [
+                "quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b"
+            ],
+            "rank": None,
+            "rank_generation": None,
+            "extra_container_args": None,
+            "extra_entrypoint_args": None,
+            "memory_request": None,
+            "memory_limit": None,
+            "container_id": None,
+            "container_image_name": "quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b",
+            "container_image_id": None,
+            "container_image_digests": None,
+            "version": None,
+            "started": None,
+            "created": "2024-03-11T17:17:49.533757Z",
+            "deployed": "2024-03-11T17:37:23.520061Z",
+            "configured": "2024-03-11T17:37:28.494075Z"
+        }
+        _list_daemons.return_value = [down_osd_json]
+        _get_stats_by_name.return_value = (('a03c201ff4080204949932f367545cd381c4acee0d48dbc15f2eac1e35f22318,'
+                                            '2023-11-28 21:34:38.045413692 +0000 UTC,'),
+                                           '', 0)
+
+        expected_container_info = _cephadm.ContainerInfo(
+            container_id='',
+            image_name='quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b',
+            image_id='a03c201ff4080204949932f367545cd381c4acee0d48dbc15f2eac1e35f22318',
+            start='2023-11-28 21:34:38.045413692 +0000 UTC',
+            version='')
+
+        assert _cephadm.get_container_info(ctx, 'osd.2', by_name=True) == expected_container_info
+        assert not _get_stats.called, 'only get_container_stats_by_image_name should have been called'
+
+        # If there is one down and one up daemon of the same name, it should use the up one.
+        # In this case, we would be using the running container to get the image, so
+        # all the info will come from the return value of get_container_stats, rather
+        # than it partially being taken from the list_daemons output
+        up_osd_json = copy.deepcopy(down_osd_json)
+        up_osd_json['state'] = 'running'
+        _get_stats.return_value = (('container_id,image_name,image_id,the_past,'), '', 0)
+        _list_daemons.return_value = [down_osd_json, up_osd_json]
+
+        expected_container_info = _cephadm.ContainerInfo(
+            container_id='container_id',
+            image_name='image_name',
+            image_id='image_id',
+            start='the_past',
+            version='')
+
+        assert _cephadm.get_container_info(ctx, 'osd.2', by_name=True) == expected_container_info
+
     def test_should_log_to_journald(self):
         ctx = _cephadm.CephadmContext()
         # explicit
-- 
2.39.5
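Editor's note (illustration only, not part of the patch): the sketch below shows, under a
simplified stand-in for cephadm's ContainerInfo, how the comma-separated output of the new
`podman image inspect --format '{{.Id}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}'`
call is expected to be split into image id, created time, and version, with container_id left
empty because the stopped daemon has no running container. All sample values and the image
name are hypothetical.

    from typing import NamedTuple


    class ContainerInfo(NamedTuple):
        # simplified stand-in for cephadm's ContainerInfo
        container_id: str
        image_name: str
        image_id: str
        start: str
        version: str


    # hypothetical `podman image inspect` output: "<image id>,<created time>,<version label>"
    # (the trailing comma means the io.ceph.version label was empty for this image)
    out = 'abc123,2023-11-28 21:34:38 +0000 UTC,'
    image_id, start, version = out.strip().split(',')

    # no container id is available for a stopped/failed daemon, so it stays empty
    info = ContainerInfo(
        container_id='',
        image_name='quay.io/ceph/ceph:v18',  # hypothetical image name taken from `cephadm ls`
        image_id=image_id,
        start=start,
        version=version)
    print(info)

This mirrors what test_get_container_info_daemon_down above asserts: for a down daemon, only the
image-derived fields are populated and the version falls back to an empty string.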