From 5b5202e04edb1cd13ec906066ebc7913b4b97774 Mon Sep 17 00:00:00 2001 From: Adam King Date: Tue, 13 Sep 2022 13:25:02 -0400 Subject: [PATCH] mgr/cephadm: write client files after applying services Fixes: https://tracker.ceph.com/issues/56696 Fixes: https://tracker.ceph.com/issues/57462 Fixes: https://tracker.ceph.com/issues/57449 There is a bug described in the linked trackers where client files are being removed for seemingly no reason. In the cases of the trackers it's the client keyring or ceph.conf being removed and it's causing failures in the test scenarios. After giving the relevant code a thorough look, I couldn't come up with any potential causes. Since one of the failures was occurring with some regularity (maybe 1/5 times) in a teuthology test, I just added a bunch of extra logging and ran through the tests. That logging included if self.mgr.manage_etc_ceph_ceph_conf or self.mgr.keys.keys: client_files = self._calc_client_files() self.log.error(f'ADMK - all client files {client_files}') else: self.log.error('ADMK - no calcing client files') client_files = {} around the part of the code that actually figured out what the client files should be (that code is being removed in this commit). In theory, this should tell me what client files were calculated for that iteration or that no client files were calculated at all. Additionally, there was logging to say when we were actually writing or removing a client file (all prepended with "ADMK" to help with searching through the logs). There was more logging around scheduling, multiple parts of _calc_client_files etc. You can see an example run with the logging at http://qa-proxy.ceph.com/teuthology/adking-2022-09-13_11:46:25-fs:workload-wip-adk-removal-debugging2-distro-default-smithi/7030534/teuthology.log where searching for "ADMK" will show all the new log statements and can look at the two last commits in https://github.com/adk3798/ceph/tree/wip-adk-removal-debugging2 to see what all the logging statement that were added are. As expected, this logigng eventually gave a message about it removing the relevant files and additionally stated that the client_files calculated were "{}" as in, no files were calculated. Given that, I expected to see "ADMK - all client files {}" or "ADMK - no calcing client files" in the logs not far above that. But they weren't there.I could not find a single instance within 30 minutes of the removal of it actually calculating that there should be no client files (the only instances of that were very early in the test before things had been setup). In fact, directly leading up to when the files were removed, the most recent instance of either of the logging statements in the posted code snippet was about 20 seconds earlier and had correctly calculated that there should be a number of client files. Somehow it is deciding that client_files is an empty dict but I simply cannot find any instance anywhere of it actually doing so (outside of early on before the client files are setup where it is correct to say that). The solution proposed in this commit is simply to calculate the client files right before writing them in the hope that this consolidation of the functionality will maybe prevent whatever was setting client_files to an empty dict from doing so or at least make the debugging process easier. The plan is to basically, unless someone has a good idea, to run this through testing we know causes the issue and see if it still happens. If not, I'll consider it "fixed". Otherwise, hopefully the consolidation makes debugging easier. Keep in mind so far this issue has only been seen in long running tests and, as mentioned earlier when talking about the teuthology a that produces this, doesn't happen every time. Signed-off-by: Adam King (cherry picked from commit 93de6b4db35518e14ff932924ddd5fba4f1a9b85) Conflicts: src/pybind/mgr/cephadm/serve.py src/pybind/mgr/cephadm/tests/test_cephadm.py --- src/pybind/mgr/cephadm/serve.py | 20 +++++++++++++------- src/pybind/mgr/cephadm/tests/test_cephadm.py | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 7ac6fee88e2cd..550925b291efd 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -192,11 +192,6 @@ class CephadmServe: bad_hosts = [] failures = [] - if self.mgr.manage_etc_ceph_ceph_conf or self.mgr.keys.keys: - client_files = self._calc_client_files() - else: - client_files = {} - @forall_hosts def refresh(host: str) -> None: @@ -246,10 +241,9 @@ class CephadmServe: self.log.debug(f"autotuning memory for {host}") self._autotune_host_memory(host) - self._write_client_files(client_files, host) - refresh(self.mgr.cache.get_hosts()) + self._write_all_client_files() self.mgr.config_checker.run_checks() for k in [ @@ -1030,6 +1024,18 @@ class CephadmServe: f'unable to calc client keyring {ks.entity} placement {ks.placement}: {e}') return client_files + def _write_all_client_files(self) -> None: + if self.mgr.manage_etc_ceph_ceph_conf or self.mgr.keys.keys: + client_files = self._calc_client_files() + else: + client_files = {} + + @forall_hosts + def _write_files(host: str) -> None: + self._write_client_files(client_files, host) + + _write_files(self.mgr.cache.get_hosts()) + def _write_client_files(self, client_files: Dict[str, Dict[str, Tuple[int, int, int, bytes, str]]], host: str) -> None: diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index a6850f6cb7893..bbd355b25336d 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -1587,7 +1587,7 @@ class TestCephadm(object): before_digest = cephadm_module.cache.get_host_client_files('test')[ '/etc/ceph/ceph.conf'][0] cephadm_module._set_extra_ceph_conf('[mon]\nk2=v2') - CephadmServe(cephadm_module)._refresh_hosts_and_daemons() + CephadmServe(cephadm_module)._write_all_client_files() after_digest = cephadm_module.cache.get_host_client_files('test')[ '/etc/ceph/ceph.conf'][0] assert before_digest != after_digest -- 2.39.5