From: Adam King
Date: Tue, 12 Mar 2024 14:26:18 +0000 (-0400)
Subject: cephadm: fix `cephadm shell --name <daemon-name>` for stopped/failed daemon
X-Git-Tag: v20.0.0~2364^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=dc5e1ea20c31e1089966ab89370d82e24d138524;p=ceph.git

cephadm: fix `cephadm shell --name <daemon-name>` for stopped/failed daemon

This previously would always try to use 'podman inspect' on the
running container of the daemon, but this doesn't work if the daemon
is stopped or failed. Doing this for stopped/failed daemons is a
valid use case, as we recommend `cephadm shell` with --name for
running debugging tools (often for OSDs).

Fixes: https://tracker.ceph.com/issues/64879

Signed-off-by: Adam King
---

diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index b3ff564ec2f4e..3b2157cdc322d 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -483,17 +483,66 @@ def get_container_info(ctx: CephadmContext, daemon_filter: str, by_name: bool) -
     if by_name and '.' not in daemon_filter:
         logger.warning(f'Trying to get container info using invalid daemon name {daemon_filter}')
         return None
-    daemons = list_daemons(ctx, detail=False)
-    matching_daemons = [d for d in daemons if daemon_name_or_type(d) == daemon_filter and d['fsid'] == ctx.fsid]
+    if by_name:
+        matching_daemons = _get_matching_daemons_by_name(ctx, daemon_filter)
+    else:
+        # NOTE: we are passing detail=False here as in this case where we are not
+        # doing it by_name, we really only need the names of the daemons. Additionally,
+        # when not doing it by_name, we are getting the info for all daemons on the
+        # host, and doing this with detail=True tends to be slow.
+        daemons = list_daemons(ctx, detail=False)
+        matching_daemons = [d for d in daemons if daemon_name_or_type(d) == daemon_filter and d['fsid'] == ctx.fsid]
     if matching_daemons:
-        d_type, d_id = matching_daemons[0]['name'].split('.', 1)
-        out, _, code = get_container_stats(ctx, ctx.container_engine.path, ctx.fsid, d_type, d_id)
-        if not code:
-            (container_id, image_name, image_id, start, version) = out.strip().split(',')
-            return ContainerInfo(container_id, image_name, image_id, start, version)
+        if (
+            by_name
+            and 'state' in matching_daemons[0]
+            and matching_daemons[0]['state'] != 'running'
+            and 'container_image_name' in matching_daemons[0]
+            and matching_daemons[0]['container_image_name']
+        ):
+            # this daemon container is not running so the regular `podman/docker inspect` on the
+            # container will not help us. If we have the image name from the list_daemons output
+            # we can try that.
+            image_name = matching_daemons[0]['container_image_name']
+            out, _, code = get_container_stats_by_image_name(ctx, ctx.container_engine.path, image_name)
+            if not code:
+                # keep in mind, the daemon container is not running, so no container id here
+                (image_id, start, version) = out.strip().split(',')
+                return ContainerInfo(
+                    container_id='',
+                    image_name=image_name,
+                    image_id=image_id,
+                    start=start,
+                    version=version)
+        else:
+            d_type, d_id = matching_daemons[0]['name'].split('.', 1)
+            out, _, code = get_container_stats(ctx, ctx.container_engine.path, ctx.fsid, d_type, d_id)
+            if not code:
+                (container_id, image_name, image_id, start, version) = out.strip().split(',')
+                return ContainerInfo(container_id, image_name, image_id, start, version)
     return None
 
 
+def _get_matching_daemons_by_name(ctx: CephadmContext, daemon_filter: str) -> List[Dict[str, str]]:
+    # NOTE: we are not passing detail=False to this list_daemons call
+    # as we want the container_image name in the case where we are
+    # doing this by name and this is skipped when detail=False
+    matching_daemons = list_daemons(ctx, daemon_name=daemon_filter)
+    if len(matching_daemons) > 1:
+        logger.warning(f'Found multiple daemons sharing same name: {daemon_filter}')
+        # Take the first daemon we find that is actually running, or just the
+        # first in the list if none are running
+        matched_daemon = None
+        for d in matching_daemons:
+            if 'state' in d and d['state'] == 'running':
+                matched_daemon = d
+                break
+        if not matched_daemon:
+            matched_daemon = matching_daemons[0]
+        matching_daemons = [matched_daemon]
+    return matching_daemons
+
+
 def infer_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional[str]:
     """
     Infer the local ceph image based on the following priority criteria:
@@ -3656,6 +3705,7 @@ def get_daemon_description(ctx, fsid, name, detail=False, legacy_dir=None):
 
 
 def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, daemon_type: str, daemon_id: str) -> Tuple[str, str, int]:
+    """returns container id, image name, image id, created time, and ceph version if available"""
     c = CephContainer.for_daemon(
         ctx, DaemonIdentity(fsid, daemon_type, daemon_id), 'bash'
     )
@@ -3671,6 +3721,18 @@ def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, dae
             break
     return out, err, code
 
+
+def get_container_stats_by_image_name(ctx: CephadmContext, container_path: str, image_name: str) -> Tuple[str, str, int]:
+    """returns image id, created time, and ceph version if available"""
+    out, err, code = '', '', -1
+    cmd = [
+        container_path, 'image', 'inspect',
+        '--format', '{{.Id}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}',
+        image_name
+    ]
+    out, err, code = call(ctx, cmd, verbosity=CallVerbosity.QUIET)
+    return out, err, code
+
 ##################################
diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py
index 9c6caf0092b74..59d360b15b840 100644
--- a/src/cephadm/tests/test_cephadm.py
+++ b/src/cephadm/tests/test_cephadm.py
@@ -1,5 +1,6 @@
 # type: ignore
 
+import copy
 import errno
 import json
 import mock
@@ -784,6 +785,80 @@ class TestCephAdm(object):
             with mock.patch('cephadm.get_container_stats', return_value=container_stats):
                 assert _cephadm.get_container_info(ctx, daemon_filter, by_name) == output
 
+    @mock.patch('cephadm.list_daemons')
+    @mock.patch('cephadm.get_container_stats')
+    @mock.patch('cephadm.get_container_stats_by_image_name')
+    def test_get_container_info_daemon_down(self, _get_stats_by_name, _get_stats, _list_daemons):
+        ctx = _cephadm.CephadmContext()
+        ctx.fsid = '5e39c134-dfc5-11ee-a344-5254000ee071'
+        ctx.container_engine = mock_podman()
+
+        # list_daemons output taken from cephadm ls of an
+        # OSD that was stopped, with substitutions
+        # true -> True
+        # null -> None
+        down_osd_json = {
+            "style": "cephadm:v1",
+            "name": "osd.2",
+            "fsid": "5e39c134-dfc5-11ee-a344-5254000ee071",
+            "systemd_unit": "ceph-5e39c134-dfc5-11ee-a344-5254000ee071@osd.2",
+            "enabled": True,
+            "state": "stopped",
+            "service_name": "osd.foo",
+            "ports": [],
+            "ip": None,
+            "deployed_by": [
+                "quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b"
+            ],
+            "rank": None,
+            "rank_generation": None,
+            "extra_container_args": None,
+            "extra_entrypoint_args": None,
+            "memory_request": None,
+            "memory_limit": None,
+            "container_id": None,
+            "container_image_name": "quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b",
+            "container_image_id": None,
+            "container_image_digests": None,
+            "version": None,
+            "started": None,
+            "created": "2024-03-11T17:17:49.533757Z",
+            "deployed": "2024-03-11T17:37:23.520061Z",
+            "configured": "2024-03-11T17:37:28.494075Z"
+        }
+        _list_daemons.return_value = [down_osd_json]
+        _get_stats_by_name.return_value = (('a03c201ff4080204949932f367545cd381c4acee0d48dbc15f2eac1e35f22318,'
+                                            '2023-11-28 21:34:38.045413692 +0000 UTC,'),
+                                           '', 0)
+
+        expected_container_info = _cephadm.ContainerInfo(
+            container_id='',
+            image_name='quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b',
+            image_id='a03c201ff4080204949932f367545cd381c4acee0d48dbc15f2eac1e35f22318',
+            start='2023-11-28 21:34:38.045413692 +0000 UTC',
+            version='')
+
+        assert _cephadm.get_container_info(ctx, 'osd.2', by_name=True) == expected_container_info
+        assert not _get_stats.called, 'only get_container_stats_by_image_name should have been called'
+
+        # If there is one down and one up daemon of the same name, it should use the up one.
+        # In this case, we would be using the running container to get the image, so
+        # all the info will come from the return value of get_container_stats, rather
+        # than it partially being taken from the list_daemons output
+        up_osd_json = copy.deepcopy(down_osd_json)
+        up_osd_json['state'] = 'running'
+        _get_stats.return_value = (('container_id,image_name,image_id,the_past,'), '', 0)
+        _list_daemons.return_value = [down_osd_json, up_osd_json]
+
+        expected_container_info = _cephadm.ContainerInfo(
+            container_id='container_id',
+            image_name='image_name',
+            image_id='image_id',
+            start='the_past',
+            version='')
+
+        assert _cephadm.get_container_info(ctx, 'osd.2', by_name=True) == expected_container_info
+
     def test_should_log_to_journald(self):
         from cephadmlib import context_getters
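
Note on usage: with this fix, `cephadm shell --name <daemon-name>` can infer the
daemon's container image even while the daemon is down, e.g. `cephadm shell
--name osd.2` after `ceph orch daemon stop osd.2` (daemon name borrowed from the
test above). Previously the image inference relied on inspecting the daemon's
running container, so it only worked while the daemon was up.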
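
For anyone who wants to try the fallback in isolation: the new
get_container_stats_by_image_name() boils down to a single `podman image
inspect` (or docker) call. A minimal standalone sketch, assuming podman is on
PATH and the image is present locally (the inspect_image_metadata wrapper and
the subprocess usage are illustrative only, not part of the patch):

    import subprocess

    def inspect_image_metadata(image_name: str, engine: str = 'podman'):
        # Same Go-template format string the patch uses: image id, created
        # time, and the "io.ceph.version" label, comma-separated.
        fmt = '{{.Id}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}'
        res = subprocess.run(
            [engine, 'image', 'inspect', '--format', fmt, image_name],
            capture_output=True, text=True,
        )
        if res.returncode != 0:
            # e.g. image not pulled locally; the patch likewise only builds
            # a ContainerInfo when the inspect call succeeds
            return None
        # an unset version label yields a trailing empty field, matching the
        # patched code's out.strip().split(',')
        image_id, created, version = res.stdout.strip().split(',')
        return image_id, created, version

As in the patch, a ContainerInfo built this way carries an empty container_id,
since a stopped daemon has no running container to identify.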