git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
cephadm: fix `cephadm shell --name <daemon-name>` for stopped/failed daemon 56147/head
author Adam King <adking@redhat.com>
Tue, 12 Mar 2024 14:26:18 +0000 (10:26 -0400)
committer Adam King <adking@redhat.com>
Fri, 15 Mar 2024 18:54:23 +0000 (14:54 -0400)
This previously would always try to use 'podman
inspect' on the running container of the daemon,
but that doesn't work if the daemon is stopped
or failed. Supporting stopped/failed daemons is
a valid use case, as we recommend cephadm shell
with --name for running debugging tools (often
for OSDs).

Fixes: https://tracker.ceph.com/issues/64879
Signed-off-by: Adam King <adking@redhat.com>
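
For context, a minimal sketch of the decision this change introduces, simplified from the get_container_info() hunk below ('daemon' stands for one entry of the list_daemons() output; the helper name is hypothetical):

    def use_image_fallback(daemon: dict) -> bool:
        # A stopped/failed daemon has no running container to inspect, so
        # fall back to `podman/docker image inspect` when the image name
        # is known from the list_daemons output.
        return (
            'state' in daemon
            and daemon['state'] != 'running'
            and bool(daemon.get('container_image_name'))
        )
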
src/cephadm/cephadm.py
src/cephadm/tests/test_cephadm.py

index b3ff564ec2f4e7f9e2df75751df94f18c8ef0d1d..3b2157cdc322d4f9b1540f12ed77fefb6d68f068 100755 (executable)
@@ -483,17 +483,66 @@ def get_container_info(ctx: CephadmContext, daemon_filter: str, by_name: bool) -
     if by_name and '.' not in daemon_filter:
         logger.warning(f'Trying to get container info using invalid daemon name {daemon_filter}')
         return None
-    daemons = list_daemons(ctx, detail=False)
-    matching_daemons = [d for d in daemons if daemon_name_or_type(d) == daemon_filter and d['fsid'] == ctx.fsid]
+    if by_name:
+        matching_daemons = _get_matching_daemons_by_name(ctx, daemon_filter)
+    else:
+        # NOTE: we pass detail=False here because, when not matching by_name,
+        # we only need the daemon names. Additionally, in that case we gather
+        # info for all daemons on the host, and doing so with detail=True
+        # tends to be slow.
+        daemons = list_daemons(ctx, detail=False)
+        matching_daemons = [d for d in daemons if daemon_name_or_type(d) == daemon_filter and d['fsid'] == ctx.fsid]
     if matching_daemons:
-        d_type, d_id = matching_daemons[0]['name'].split('.', 1)
-        out, _, code = get_container_stats(ctx, ctx.container_engine.path, ctx.fsid, d_type, d_id)
-        if not code:
-            (container_id, image_name, image_id, start, version) = out.strip().split(',')
-            return ContainerInfo(container_id, image_name, image_id, start, version)
+        if (
+            by_name
+            and 'state' in matching_daemons[0]
+            and matching_daemons[0]['state'] != 'running'
+            and 'container_image_name' in matching_daemons[0]
+            and matching_daemons[0]['container_image_name']
+        ):
+            # this daemon container is not running, so the regular `podman/docker inspect` on the
+            # container will not help us. If we have the image name from the list_daemons output,
+            # we can try that instead.
+            image_name = matching_daemons[0]['container_image_name']
+            out, _, code = get_container_stats_by_image_name(ctx, ctx.container_engine.path, image_name)
+            if not code:
+                # keep in mind, the daemon container is not running, so no container id here
+                (image_id, start, version) = out.strip().split(',')
+                return ContainerInfo(
+                    container_id='',
+                    image_name=image_name,
+                    image_id=image_id,
+                    start=start,
+                    version=version)
+        else:
+            d_type, d_id = matching_daemons[0]['name'].split('.', 1)
+            out, _, code = get_container_stats(ctx, ctx.container_engine.path, ctx.fsid, d_type, d_id)
+            if not code:
+                (container_id, image_name, image_id, start, version) = out.strip().split(',')
+                return ContainerInfo(container_id, image_name, image_id, start, version)
     return None
 
 
+def _get_matching_daemons_by_name(ctx: CephadmContext, daemon_filter: str) -> List[Dict[str, str]]:
+    # NOTE: we do not pass detail=False to this list_daemons call,
+    # as we want the container_image_name field, which is skipped
+    # when detail=False
+    matching_daemons = list_daemons(ctx, daemon_name=daemon_filter)
+    if len(matching_daemons) > 1:
+        logger.warning(f'Found multiple daemons sharing same name: {daemon_filter}')
+        # Take the first daemon we find that is actually running, or just the
+        # first in the list if none are running
+        matched_daemon = None
+        for d in matching_daemons:
+            if 'state' in d and d['state'] == 'running':
+                matched_daemon = d
+                break
+        if not matched_daemon:
+            matched_daemon = matching_daemons[0]
+        matching_daemons = [matched_daemon]
+    return matching_daemons
+
+
 def infer_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional[str]:
     """
      Infer the local ceph image based on the following priority criteria:
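
A condensed view of the name-collision handling added in _get_matching_daemons_by_name() above (a sketch, not the patch's exact code):

    # Prefer a running daemon among same-named matches; otherwise fall
    # back to the first match in the list.
    matched_daemon = next(
        (d for d in matching_daemons if d.get('state') == 'running'),
        matching_daemons[0],
    )
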
@@ -3656,6 +3705,7 @@ def get_daemon_description(ctx, fsid, name, detail=False, legacy_dir=None):
 
 
 def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, daemon_type: str, daemon_id: str) -> Tuple[str, str, int]:
+    """returns container id, image name, image id, created time, and ceph version if available"""
     c = CephContainer.for_daemon(
         ctx, DaemonIdentity(fsid, daemon_type, daemon_id), 'bash'
     )
@@ -3671,6 +3721,18 @@ def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, dae
             break
     return out, err, code
 
+
+def get_container_stats_by_image_name(ctx: CephadmContext, container_path: str, image_name: str) -> Tuple[str, str, int]:
+    """returns image id, created time, and ceph version if available"""
+    out, err, code = '', '', -1
+    cmd = [
+        container_path, 'image', 'inspect',
+        '--format', '{{.Id}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}',
+        image_name
+    ]
+    out, err, code = call(ctx, cmd, verbosity=CallVerbosity.QUIET)
+    return out, err, code
+
 ##################################
 
 
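For reference, the --format string above yields a three-field CSV line (image id, created time, io.ceph.version label). A hedged parsing sketch, with the digest and timestamp borrowed from the unit test below; the trailing field is empty because that image carries no io.ceph.version label:

    out = ('a03c201ff4080204949932f367545cd381c4acee0d48dbc15f2eac1e35f22318,'
           '2023-11-28 21:34:38.045413692 +0000 UTC,')
    image_id, start, version = out.strip().split(',')
    assert version == ''
    # The daemon container is not running, so get_container_info builds
    # ContainerInfo with container_id='' and the image name taken from
    # the matching list_daemons entry.
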
index 9c6caf0092b7455098ba390c23ff26a65ed4c8e5..59d360b15b8405b787a8d464e21b823fcb97a797 100644 (file)
@@ -1,5 +1,6 @@
 # type: ignore
 
+import copy
 import errno
 import json
 import mock
@@ -784,6 +785,80 @@ class TestCephAdm(object):
             with mock.patch('cephadm.get_container_stats', return_value=container_stats):
                 assert _cephadm.get_container_info(ctx, daemon_filter, by_name) == output
 
+    @mock.patch('cephadm.list_daemons')
+    @mock.patch('cephadm.get_container_stats')
+    @mock.patch('cephadm.get_container_stats_by_image_name')
+    def test_get_container_info_daemon_down(self, _get_stats_by_name, _get_stats, _list_daemons):
+        ctx = _cephadm.CephadmContext()
+        ctx.fsid = '5e39c134-dfc5-11ee-a344-5254000ee071'
+        ctx.container_engine = mock_podman()
+
+        # list_daemons output taken from cephadm ls of an
+        # OSD that was stopped, with substitutions
+        # true -> True
+        # null -> None
+        down_osd_json = {
+                "style": "cephadm:v1",
+                "name": "osd.2",
+                "fsid": "5e39c134-dfc5-11ee-a344-5254000ee071",
+                "systemd_unit": "ceph-5e39c134-dfc5-11ee-a344-5254000ee071@osd.2",
+                "enabled": True,
+                "state": "stopped",
+                "service_name": "osd.foo",
+                "ports": [],
+                "ip": None,
+                "deployed_by": [
+                    "quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b"
+                ],
+                "rank": None,
+                "rank_generation": None,
+                "extra_container_args": None,
+                "extra_entrypoint_args": None,
+                "memory_request": None,
+                "memory_limit": None,
+                "container_id": None,
+                "container_image_name": "quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b",
+                "container_image_id": None,
+                "container_image_digests": None,
+                "version": None,
+                "started": None,
+                "created": "2024-03-11T17:17:49.533757Z",
+                "deployed": "2024-03-11T17:37:23.520061Z",
+                "configured": "2024-03-11T17:37:28.494075Z"
+        }
+        _list_daemons.return_value = [down_osd_json]
+        _get_stats_by_name.return_value = (('a03c201ff4080204949932f367545cd381c4acee0d48dbc15f2eac1e35f22318,'
+                                   '2023-11-28 21:34:38.045413692 +0000 UTC,'),
+                                   '', 0)
+
+        expected_container_info = _cephadm.ContainerInfo(
+            container_id='',
+            image_name='quay.io/adk3798/ceph@sha256:7da0af22ce45aac97dff00125af590506d8e36ab97d78e5175149643562bfb0b',
+            image_id='a03c201ff4080204949932f367545cd381c4acee0d48dbc15f2eac1e35f22318',
+            start='2023-11-28 21:34:38.045413692 +0000 UTC',
+            version='')
+
+        assert _cephadm.get_container_info(ctx, 'osd.2', by_name=True) == expected_container_info
+        assert not _get_stats.called, 'only get_container_stats_by_image_name should have been called'
+
+        # If there is one down and one up daemon of the same name, it should use the up one.
+        # In this case, we would be using the running container to get the image, so
+        # all the info will come from the return value of get_container_stats, rather
+        # than partially being taken from the list_daemons output.
+        up_osd_json = copy.deepcopy(down_osd_json)
+        up_osd_json['state'] = 'running'
+        _get_stats.return_value = (('container_id,image_name,image_id,the_past,'), '', 0)
+        _list_daemons.return_value = [down_osd_json, up_osd_json]
+
+        expected_container_info = _cephadm.ContainerInfo(
+            container_id='container_id',
+            image_name='image_name',
+            image_id='image_id',
+            start='the_past',
+            version='')
+
+        assert _cephadm.get_container_info(ctx, 'osd.2', by_name=True) == expected_container_info
+
     def test_should_log_to_journald(self):
         from cephadmlib import context_getters
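
To exercise the new behavior locally, one could run just the added test; a hypothetical invocation, assuming a development checkout with pytest and the test's dependencies installed:

    import pytest
    # Select only the new test from the cephadm test suite.
    pytest.main(['src/cephadm/tests/test_cephadm.py',
                 '-k', 'test_get_container_info_daemon_down'])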