]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: pass daemon's current image when reconfiguring
authorAdam King <adking@redhat.com>
Wed, 3 Apr 2024 17:11:08 +0000 (13:11 -0400)
committerAdam King <adking@redhat.com>
Thu, 4 Apr 2024 20:09:12 +0000 (16:09 -0400)
Important to note here is that a reconfig will rewrite
the config files of the daemon and restart it, but not
rewrite the unit files. This led to a bug where the
grafana image we used between the quincy and squid release
used different UIDs internally, which caused us to rewrite
the config files as owned by a UID that worked with the
new image but did not work with the image still specified
in the unit.run file. This meant the grafana daemon was down
from shortly after the mgr upgrade until the end
of the upgrade when we redeploy all the monitoring images.

Fixes: https://tracker.ceph.com/issues/65234
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/cephadm/tests/test_services.py

index b93cb7f3096fa0a6fea3fc087b2db2faebe1faae..529e9e4a233f1210ff862fe7fb9605fefb46640e 100644 (file)
@@ -1582,9 +1582,27 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         self._kick_serve_loop()
         return HandleCommandResult()
 
-    def _get_container_image(self, daemon_name: str) -> Optional[str]:
+    def _get_container_image(self, daemon_name: str, use_current_daemon_image: bool = False) -> Optional[str]:
         daemon_type = daemon_name.split('.', 1)[0]  # type: ignore
         image: Optional[str] = None
+        # Try to use current image if specified. This is necessary
+        # because, if we're reconfiguring the daemon, we can
+        # run into issues during upgrade. For example if the default grafana
+        # image is changing and we pass the new image name when doing
+        # the reconfig, we could end up using the UID required by the
+        # new image as owner for the config files we write, while the
+        # unit.run will still reference the old image that requires those
+        # config files to be owned by a different UID
+        # Note that "current image" just means the one we picked up
+        # when we last ran "cephadm ls" on the host
+        if use_current_daemon_image:
+            try:
+                dd = self.cache.get_daemon(daemon_name=daemon_name)
+                if dd.container_image_name:
+                    return dd.container_image_name
+            except OrchestratorError:
+                self.log.debug(f'Could not find daemon {daemon_name} in cache '
+                               'while searching for its image')
         if daemon_type in CEPH_IMAGE_TYPES:
             # get container image
             image = str(self.get_foreign_ceph_option(
index 33ec35de70d61c8e38f33887d92f641f1178e157..559ef7e5be977505781304f4f2b7fb50369d255c 100644 (file)
@@ -1373,6 +1373,7 @@ class CephadmServe:
                         ),
                         config_blobs=daemon_spec.final_config,
                     ).dump_json_str(),
+                    use_current_daemon_image=reconfig,
                 )
 
                 if daemon_spec.daemon_type == 'agent':
@@ -1517,11 +1518,12 @@ class CephadmServe:
                                 error_ok: Optional[bool] = False,
                                 image: Optional[str] = "",
                                 log_output: Optional[bool] = True,
+                                use_current_daemon_image: bool = False,
                                 ) -> Any:
         try:
             out, err, code = await self._run_cephadm(
                 host, entity, command, args, no_fsid=no_fsid, error_ok=error_ok,
-                image=image, log_output=log_output)
+                image=image, log_output=log_output, use_current_daemon_image=use_current_daemon_image)
             if code:
                 raise OrchestratorError(f'host {host} `cephadm {command}` returned {code}: {err}')
         except Exception as e:
@@ -1546,6 +1548,7 @@ class CephadmServe:
                            env_vars: Optional[List[str]] = None,
                            log_output: Optional[bool] = True,
                            timeout: Optional[int] = None,  # timeout in seconds
+                           use_current_daemon_image: bool = False,
                            ) -> Tuple[List[str], List[str], int]:
         """
         Run cephadm on the remote host with the given command + args
@@ -1566,7 +1569,10 @@ class CephadmServe:
         # Skip the image check for daemons deployed that are not ceph containers
         if not str(entity).startswith(bypass_image):
             if not image and entity is not cephadmNoImage:
-                image = self.mgr._get_container_image(entity)
+                image = self.mgr._get_container_image(
+                    entity,
+                    use_current_daemon_image=use_current_daemon_image
+                )
 
         final_args = []
 
index 2477de13e00a0902fb28fbdc76f9e1b3194810ab..4cfebfb37bbd849a2fb38b8a045a44d2c4b0689d 100644 (file)
@@ -127,10 +127,12 @@ def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str
         [host]).stdout == f"Created osd(s) 1 on host '{host}'"
     assert _run_cephadm.mock_calls == [
         mock.call(host, 'osd', 'ceph-volume',
-                  ['--', 'lvm', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True),
-        mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY),
+                  ['--', 'lvm', 'list', '--format', 'json'],
+                  no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False),
+        mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, use_current_daemon_image=False),
         mock.call(host, 'osd', 'ceph-volume',
-                  ['--', 'raw', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True),
+                  ['--', 'raw', 'list', '--format', 'json'],
+                  no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False),
     ]
     dd = cephadm_module.cache.get_daemon(f'osd.{osd_id}', host=host)
     assert dd.name() == f'osd.{osd_id}'
@@ -524,6 +526,7 @@ class TestCephadm(object):
                             },
                         },
                     }),
+                    use_current_daemon_image=True,
                 )
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -578,6 +581,7 @@ class TestCephadm(object):
                             "crush_location": "datacenter=a",
                         },
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -619,6 +623,7 @@ class TestCephadm(object):
                             "keyring": "[client.crash.test]\nkey = None\n",
                         },
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -660,6 +665,7 @@ class TestCephadm(object):
                         },
                         "config_blobs": {},
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -709,6 +715,7 @@ class TestCephadm(object):
                         },
                         "config_blobs": {},
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -762,6 +769,7 @@ class TestCephadm(object):
                         },
                         "config_blobs": {},
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -1107,9 +1115,13 @@ class TestCephadm(object):
                 env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=foo'], error_ok=True,
                 stdin='{"config": "", "keyring": ""}')
             _run_cephadm.assert_any_call(
-                'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True)
+                'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'],
+                image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False
+            )
             _run_cephadm.assert_any_call(
-                'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True)
+                'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'],
+                image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False
+            )
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
     def test_apply_osd_save_non_collocated(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
@@ -1147,11 +1159,16 @@ class TestCephadm(object):
                     '--no-auto', '/dev/sdb', '--db-devices', '/dev/sdc',
                     '--wal-devices', '/dev/sdd', '--yes', '--no-systemd'],
                 env_vars=['CEPH_VOLUME_OSDSPEC_AFFINITY=noncollocated'],
-                error_ok=True, stdin='{"config": "", "keyring": ""}')
+                error_ok=True, stdin='{"config": "", "keyring": ""}',
+            )
             _run_cephadm.assert_any_call(
-                'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True)
+                'test', 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'],
+                image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False
+            )
             _run_cephadm.assert_any_call(
-                'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], image='', no_fsid=False, error_ok=False, log_output=True)
+                'test', 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'],
+                image='', no_fsid=False, error_ok=False, log_output=True, use_current_daemon_image=False
+            )
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     @mock.patch("cephadm.module.SpecStore.save")
@@ -2283,10 +2300,10 @@ Traceback (most recent call last):
             assert _run_cephadm.mock_calls == [
                 mock.call('test', 'osd', 'ceph-volume',
                           ['--', 'inventory', '--format=json-pretty', '--filter-for-batch'], image='',
-                          no_fsid=False, error_ok=False, log_output=False),
+                          no_fsid=False, error_ok=False, log_output=False, use_current_daemon_image=False),
                 mock.call('test', 'osd', 'ceph-volume',
                           ['--', 'inventory', '--format=json-pretty'], image='',
-                          no_fsid=False, error_ok=False, log_output=False),
+                          no_fsid=False, error_ok=False, log_output=False, use_current_daemon_image=False),
             ]
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
index 593cdc85e73c5551e84077364982d18ab6ddc963..715f7e56117e882df3afd7e5326d2ece68bcfb99 100644 (file)
@@ -345,6 +345,7 @@ log_to_file = False"""
                             },
                         }
                     }),
+                    use_current_daemon_image=False,
                 )
 
 
@@ -480,6 +481,7 @@ timeout = 1.0\n"""
                             }
                         }
                     }),
+                    use_current_daemon_image=False,
                 )
 
 
@@ -590,6 +592,7 @@ class TestMonitoring:
                             "peers": [],
                         }
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -685,6 +688,7 @@ class TestMonitoring:
                             'web_config': '/etc/alertmanager/web.yml',
                         }
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -819,6 +823,7 @@ class TestMonitoring:
                             'ip_to_bind_to': '1.2.3.1',
                         },
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -997,6 +1002,7 @@ class TestMonitoring:
                             'web_config': '/etc/prometheus/web.yml',
                         },
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -1065,6 +1071,7 @@ class TestMonitoring:
                             },
                         },
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -1123,6 +1130,7 @@ class TestMonitoring:
                             },
                         },
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -1234,6 +1242,7 @@ class TestMonitoring:
                             "files": files,
                         },
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@@ -1408,6 +1417,7 @@ spec:
                             },
                             "config_blobs": {},
                         }),
+                        use_current_daemon_image=True,
                     )
 
 
@@ -1514,6 +1524,7 @@ class TestSNMPGateway:
                         },
                         "config_blobs": config,
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -1561,6 +1572,7 @@ class TestSNMPGateway:
                         },
                         "config_blobs": config,
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -1612,6 +1624,7 @@ class TestSNMPGateway:
                         },
                         "config_blobs": config,
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -1668,6 +1681,7 @@ class TestSNMPGateway:
                         },
                         "config_blobs": config,
                     }),
+                    use_current_daemon_image=False,
                 )
 
 
@@ -2735,6 +2749,7 @@ class TestJaeger:
                         },
                         "config_blobs": config,
                     }),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -2774,6 +2789,7 @@ class TestJaeger:
                         },
                         "config_blobs": es_config,
                     }),
+                    use_current_daemon_image=False,
                 )
                 with with_service(cephadm_module, collector_spec):
                     _run_cephadm.assert_called_with(
@@ -2801,6 +2817,7 @@ class TestJaeger:
                             },
                             "config_blobs": collector_config,
                         }),
+                        use_current_daemon_image=False,
                     )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -2840,6 +2857,7 @@ class TestJaeger:
                         },
                         "config_blobs": collector_config,
                     }),
+                    use_current_daemon_image=False,
                 )
                 with with_service(cephadm_module, agent_spec):
                     _run_cephadm.assert_called_with(
@@ -2867,6 +2885,7 @@ class TestJaeger:
                             },
                             "config_blobs": agent_config,
                         }),
+                        use_current_daemon_image=False,
                     )
 
 
@@ -2923,6 +2942,7 @@ class TestCustomContainer:
                             },
                         }
                     ),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -3009,6 +3029,7 @@ class TestCustomContainer:
                     ['_orch', 'deploy'],
                     [],
                     stdin=json.dumps(expected),
+                    use_current_daemon_image=False,
                 )
 
 
@@ -3059,6 +3080,7 @@ class TestSMB:
                     ['_orch', 'deploy'],
                     [],
                     stdin=json.dumps(expected),
+                    use_current_daemon_image=False,
                 )
 
     @patch("cephadm.module.CephadmOrchestrator.get_unique_name")
@@ -3128,4 +3150,5 @@ class TestSMB:
                     ['_orch', 'deploy'],
                     [],
                     stdin=json.dumps(expected),
+                    use_current_daemon_image=False,
                 )