From a3f69e45cd802158012693937e65df5936e9f066 Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Tue, 9 Apr 2024 12:10:14 -0400
Subject: [PATCH] mgr/cephadm: additional debug logging for autotuner

This came out of debugging autotuner behavior that was
reported on the upstream mailing list. The test case being
added reproduces what that user was seeing. The debug
logging being added was useful in getting a full
understanding of how the autotuner arrived at the result it
did, so add it to the actual codebase where we can make use
of it to debug autotuner issues in the future.

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/autotune.py            | 20 +++++++++++--
 src/pybind/mgr/cephadm/serve.py               |  6 ++++
 src/pybind/mgr/cephadm/tests/test_autotune.py | 28 ++++++++++++++++++-
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/src/pybind/mgr/cephadm/autotune.py b/src/pybind/mgr/cephadm/autotune.py
index 72ebcd66064..0365c76a868 100644
--- a/src/pybind/mgr/cephadm/autotune.py
+++ b/src/pybind/mgr/cephadm/autotune.py
@@ -32,24 +32,38 @@ class MemoryAutotuner(object):
     def tune(self) -> Tuple[Optional[int], List[str]]:
         tuned_osds: List[str] = []
         total = self.total_mem
+        logger.debug('Autotuning OSD memory with given parameters:\n'
+                     f'Total memory: {total}\nDaemons: {self.daemons}')
         for d in self.daemons:
             if d.daemon_type == 'mds':
-                total -= self.config_get(d.name(), 'mds_cache_memory_limit')
+                mds_mem = self.config_get(d.name(), 'mds_cache_memory_limit')
+                logger.debug(f'Subtracting {mds_mem} from total for mds daemon')
+                total -= mds_mem  # reserve the mds_cache_memory_limit read from config
+                logger.debug(f'new total: {total}')
                 continue
             if d.daemon_type != 'osd':
                 assert d.daemon_type
-                total -= max(
+                daemon_mem = max(  # larger of the per-type minimum size and d.memory_usage
                     self.min_size_by_type.get(d.daemon_type, self.default_size),
                     d.memory_usage or 0
                 )
+                logger.debug(f'Subtracting {daemon_mem} from total for {d.daemon_type} daemon')
+                total -= daemon_mem
+                logger.debug(f'new total: {total}')
                 continue
             if not self.config_get(d.name(), 'osd_memory_target_autotune'):
-                total -= self.config_get(d.name(), 'osd_memory_target')
+                osd_mem = self.config_get(d.name(), 'osd_memory_target')
+                logger.debug('osd_memory_target_autotune disabled. '
+                             f'Subtracting {osd_mem} from total for osd daemon')
+                total -= osd_mem  # OSD is not autotuned: reserve its osd_memory_target
+                logger.debug(f'new total: {total}')
                 continue
             tuned_osds.append(d.name())
         if total < 0:
             return None, []
         if not tuned_osds:
             return None, []
+        logger.debug(f'Final total is {total} to be split among {len(tuned_osds)} OSDs')
         per = total // len(tuned_osds)
+        logger.debug(f'Result is {per} per OSD')  # floor(total / number of tuned OSDs)
         return int(per), tuned_osds
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 4c7889bd18f..b8a8d4e5208 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -195,6 +195,9 @@ class CephadmServe:
             val = None
         else:
             total_mem *= 1024   # kb -> bytes
+            self.log.debug(f'Autotuning memory for host {host} with '
+                           f'{total_mem} total bytes of memory and '
+                           f'{self.mgr.autotune_memory_target_ratio} target ratio')
             total_mem *= self.mgr.autotune_memory_target_ratio
             a = MemoryAutotuner(
                 daemons=self.mgr.cache.get_daemons_by_host(host),
@@ -231,6 +234,9 @@ class CephadmServe:
             # options as users may be using them. Since there is no way to set autotuning
             # on/off at a host level, best we can do is check if it is globally on.
             if self.mgr.get_foreign_ceph_option('osd', 'osd_memory_target_autotune'):
+                self.mgr.log.debug(f'Removing osd_memory_target for OSDs on {host}'
+                                   ' as either there were no OSDs to tune or the'
+                                   ' per OSD memory calculation result was <= 0')
                 self.mgr.check_mon_command({
                     'prefix': 'config rm',
                     'who': f'osd/host:{host.split(".")[0]}',
diff --git a/src/pybind/mgr/cephadm/tests/test_autotune.py b/src/pybind/mgr/cephadm/tests/test_autotune.py
index 7994c390a7e..bf6f3d5ef59 100644
--- a/src/pybind/mgr/cephadm/tests/test_autotune.py
+++ b/src/pybind/mgr/cephadm/tests/test_autotune.py
@@ -57,7 +57,31 @@ from orchestrator import DaemonDescription
             ],
             {},
             60 * 1024 * 1024 * 1024,
-        )
+        ),
+        (  # Taken from an actual user case
+            int(32827840 * 1024 * 0.7),
+            [
+                DaemonDescription('crash', 'a', 'host1'),
+                DaemonDescription('grafana', 'a', 'host1'),
+                DaemonDescription('mds', 'a', 'host1'),
+                DaemonDescription('mds', 'b', 'host1'),
+                DaemonDescription('mds', 'c', 'host1'),
+                DaemonDescription('mgr', 'a', 'host1'),
+                DaemonDescription('mon', 'a', 'host1'),
+                DaemonDescription('node-exporter', 'a', 'host1'),
+                DaemonDescription('osd', '1', 'host1'),
+                DaemonDescription('osd', '2', 'host1'),
+                DaemonDescription('osd', '3', 'host1'),
+                DaemonDescription('osd', '4', 'host1'),
+                DaemonDescription('prometheus', 'a', 'host1'),
+            ],
+            {
+                'mds.a': 4 * 1024 * 1024 * 1024,  # 4294967296
+                'mds.b': 4 * 1024 * 1024 * 1024,
+                'mds.c': 4 * 1024 * 1024 * 1024,
+            },
+            480485376,
+        ),
     ])
 def test_autotune(total, daemons, config, result):
     def fake_getter(who, opt):
@@ -69,6 +93,8 @@ def test_autotune(total, daemons, config, result):
         if opt == 'osd_memory_target':
             return config.get(who, 4 * 1024 * 1024 * 1024)
         if opt == 'mds_cache_memory_limit':
+            if who in config:
+                return config.get(who, 16 * 1024 * 1024 * 1024)
             return 16 * 1024 * 1024 * 1024
 
     a = MemoryAutotuner(
-- 
2.39.5