From a3f69e45cd802158012693937e65df5936e9f066 Mon Sep 17 00:00:00 2001 From: Adam King Date: Tue, 9 Apr 2024 12:10:14 -0400 Subject: [PATCH] mgr/cephadm: additional debug logging for autotuner This came from trying to debug behavior of the autotuner on the upstream mailing list. The test case being added was what that user was seeing. The debug logging being added was useful in getting a full understanding of how the autotuner got the result it did. Therefore, why not add the logging to the actual codebase so we can make use of it to debug autotuner issues in the future? Signed-off-by: Adam King --- src/pybind/mgr/cephadm/autotune.py | 20 +++++++++++-- src/pybind/mgr/cephadm/serve.py | 6 ++++ src/pybind/mgr/cephadm/tests/test_autotune.py | 28 ++++++++++++++++++- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/src/pybind/mgr/cephadm/autotune.py b/src/pybind/mgr/cephadm/autotune.py index 72ebcd66064..0365c76a868 100644 --- a/src/pybind/mgr/cephadm/autotune.py +++ b/src/pybind/mgr/cephadm/autotune.py @@ -32,24 +32,38 @@ class MemoryAutotuner(object): def tune(self) -> Tuple[Optional[int], List[str]]: tuned_osds: List[str] = [] total = self.total_mem + logger.debug('Autotuning OSD memory with given parameters:\n' + f'Total memory: {total}\nDaemons: {self.daemons}') for d in self.daemons: if d.daemon_type == 'mds': - total -= self.config_get(d.name(), 'mds_cache_memory_limit') + mds_mem = self.config_get(d.name(), 'mds_cache_memory_limit') + logger.debug(f'Subtracting {mds_mem} from total for mds daemon') + total -= mds_mem + logger.debug(f'new total: {total}') continue if d.daemon_type != 'osd': assert d.daemon_type - total -= max( + daemon_mem = max( self.min_size_by_type.get(d.daemon_type, self.default_size), d.memory_usage or 0 ) + logger.debug(f'Subtracting {daemon_mem} from total for {d.daemon_type} daemon') + total -= daemon_mem + logger.debug(f'new total: {total}') continue if not self.config_get(d.name(), 'osd_memory_target_autotune'): - total -= 
self.config_get(d.name(), 'osd_memory_target') + osd_mem = self.config_get(d.name(), 'osd_memory_target') + logger.debug('osd_memory_target_autotune disabled. ' + f'Subtracting {osd_mem} from total for osd daemon') + total -= osd_mem + logger.debug(f'new total: {total}') continue tuned_osds.append(d.name()) if total < 0: return None, [] if not tuned_osds: return None, [] + logger.debug(f'Final total is {total} to be split among {len(tuned_osds)} OSDs') per = total // len(tuned_osds) + logger.debug(f'Result is {per} per OSD') return int(per), tuned_osds diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 4c7889bd18f..b8a8d4e5208 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -195,6 +195,9 @@ class CephadmServe: val = None else: total_mem *= 1024 # kb -> bytes + self.log.debug(f'Autotuning memory for host {host} with ' + f'{total_mem} total bytes of memory and ' + f'{self.mgr.autotune_memory_target_ratio} target ratio') total_mem *= self.mgr.autotune_memory_target_ratio a = MemoryAutotuner( daemons=self.mgr.cache.get_daemons_by_host(host), @@ -231,6 +234,9 @@ class CephadmServe: # options as users may be using them. Since there is no way to set autotuning # on/off at a host level, best we can do is check if it is globally on. 
if self.mgr.get_foreign_ceph_option('osd', 'osd_memory_target_autotune'): + self.mgr.log.debug(f'Removing osd_memory_target for OSDs on {host}' + ' as either there were no OSDs to tune or the ' + ' per OSD memory calculation result was <= 0') self.mgr.check_mon_command({ 'prefix': 'config rm', 'who': f'osd/host:{host.split(".")[0]}', diff --git a/src/pybind/mgr/cephadm/tests/test_autotune.py b/src/pybind/mgr/cephadm/tests/test_autotune.py index 7994c390a7e..bf6f3d5ef59 100644 --- a/src/pybind/mgr/cephadm/tests/test_autotune.py +++ b/src/pybind/mgr/cephadm/tests/test_autotune.py @@ -57,7 +57,31 @@ from orchestrator import DaemonDescription ], {}, 60 * 1024 * 1024 * 1024, - ) + ), + ( # Taken from an actual user case + int(32827840 * 1024 * 0.7), + [ + DaemonDescription('crash', 'a', 'host1'), + DaemonDescription('grafana', 'a', 'host1'), + DaemonDescription('mds', 'a', 'host1'), + DaemonDescription('mds', 'b', 'host1'), + DaemonDescription('mds', 'c', 'host1'), + DaemonDescription('mgr', 'a', 'host1'), + DaemonDescription('mon', 'a', 'host1'), + DaemonDescription('node-exporter', 'a', 'host1'), + DaemonDescription('osd', '1', 'host1'), + DaemonDescription('osd', '2', 'host1'), + DaemonDescription('osd', '3', 'host1'), + DaemonDescription('osd', '4', 'host1'), + DaemonDescription('prometheus', 'a', 'host1'), + ], + { + 'mds.a': 4 * 1024 * 1024 * 1024, # 4294967296 + 'mds.b': 4 * 1024 * 1024 * 1024, + 'mds.c': 4 * 1024 * 1024 * 1024, + }, + 480485376, + ), ]) def test_autotune(total, daemons, config, result): def fake_getter(who, opt): @@ -69,6 +93,8 @@ def test_autotune(total, daemons, config, result): if opt == 'osd_memory_target': return config.get(who, 4 * 1024 * 1024 * 1024) if opt == 'mds_cache_memory_limit': + if who in config: + return config.get(who, 16 * 1024 * 1024 * 1024) return 16 * 1024 * 1024 * 1024 a = MemoryAutotuner( -- 2.39.5