]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: autotune osd memory
authorSage Weil <sage@newdream.net>
Tue, 4 May 2021 19:56:21 +0000 (15:56 -0400)
committerSage Weil <sage@newdream.net>
Thu, 20 May 2021 23:02:58 +0000 (18:02 -0500)
- set osd_memory_target_autotune=true to enable
- tuning is periodic (check every 10m by default)
- tuned values are reflected by osd_memory_target config options scoped
  to the host
- only make a change if it appears that we will affect at least 1 of the
  relevant OSDs
- attempt to clean out conflicting options.  (This is imperfect, since any
  manner of weirdly-scoped config options could be responsible; we only
  attempt to clean out one scoped directly to the osd name.)

Signed-off-by: Sage Weil <sage@newdream.net>
(cherry picked from commit 85ea078787d864bd8ae199f9517a9d2dbaf33c7c)

doc/cephadm/osd.rst
src/pybind/mgr/cephadm/autotune.py [new file with mode: 0644]
src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/fixtures.py
src/pybind/mgr/cephadm/tests/test_autotune.py [new file with mode: 0644]

index d61b61d58ab5e0443ac89e53e0412fbaf6467982..55ddd4c66167eeda1b9835eb7f466a7971416f7a 100644 (file)
@@ -363,6 +363,52 @@ Example command:
     device.  To disable this behavior, see :ref:`cephadm-osd-declarative`.
 
 
+.. _osd_autotune:
+
+Automatically tuning OSD memory
+===============================
+
+OSD daemons will adjust their memory consumption based on the
+``osd_memory_target`` config option (several gigabytes, by
+default).  If Ceph is deployed on dedicated nodes that are not sharing
+memory with other services, cephadm can automatically adjust the per-OSD
+memory consumption based on the total amount of RAM and the number of deployed
+OSDs.
+
+This option is enabled globally with::
+
+  ceph config set osd osd_memory_target_autotune true
+
+Cephadm will start with a fraction
+(``mgr/cephadm/autotune_memory_target_ratio``, which defaults to
+``.7``) of the total RAM in the system, subtract off any memory
+consumed by non-autotuned daemons (non-OSDs, or OSDs for which
+``osd_memory_target_autotune`` is false), and then divide by the
+remaining OSDs.
+
+The final targets are reflected in the config database with options like::
+
+  WHO   MASK      LEVEL   OPTION              VALUE
+  osd   host:foo  basic   osd_memory_target   126092301926
+  osd   host:bar  basic   osd_memory_target   6442450944
+
+Both the limits and the current memory consumed by each daemon are visible from
+the ``ceph orch ps`` output in the ``MEM LIMIT`` column::
+
+  NAME        HOST  PORTS  STATUS         REFRESHED  AGE  MEM USED  MEM LIMIT  VERSION                IMAGE ID      CONTAINER ID  
+  osd.1       dael         running (3h)     10s ago   3h    72857k     117.4G  17.0.0-3781-gafaed750  7015fda3cd67  9e183363d39c  
+  osd.2       dael         running (81m)    10s ago  81m    63989k     117.4G  17.0.0-3781-gafaed750  7015fda3cd67  1f0cc479b051  
+  osd.3       dael         running (62m)    10s ago  62m    64071k     117.4G  17.0.0-3781-gafaed750  7015fda3cd67  ac5537492f27  
+
+To exclude an OSD from memory autotuning, disable the autotune option
+for that OSD and also set a specific memory target.  For example,
+
+  .. prompt:: bash #
+
+    ceph config set osd.123 osd_memory_target_autotune false
+    ceph config set osd.123 osd_memory_target 16G
+
+
 .. _drivegroups:
 
 Advanced OSD Service Specifications
diff --git a/src/pybind/mgr/cephadm/autotune.py b/src/pybind/mgr/cephadm/autotune.py
new file mode 100644 (file)
index 0000000..51c931c
--- /dev/null
@@ -0,0 +1,54 @@
+import logging
+from typing import List, Optional, Callable, Any, Tuple
+
+from orchestrator._interface import DaemonDescription
+
+logger = logging.getLogger(__name__)
+
+
class MemoryAutotuner(object):
    """Compute a per-OSD memory target for a single host.

    Starting from a host memory budget, subtract the memory claimed by
    every non-OSD daemon (and by OSDs that have opted out of
    autotuning), then split whatever is left evenly across the
    remaining, autotuned OSDs.
    """

    # Per-daemon-type floor, used when a daemon's observed usage is lower.
    min_size_by_type = {
        'mds': 4096 * 1048576,
        'mgr': 4096 * 1048576,
        'mon': 1024 * 1048576,
        'crash': 128 * 1048576,
        'keepalived': 128 * 1048576,
        'haproxy': 128 * 1048576,
    }
    # Fallback floor for daemon types not listed above.
    default_size = 1024 * 1048576

    def __init__(
            self,
            daemons: 'List[DaemonDescription]',
            config_get: Callable[[str, str], Any],
            total_mem: int,
    ) -> None:
        # daemons: all daemons running on the host under consideration.
        # config_get: returns the config value for (daemon name, option).
        # total_mem: memory budget for this host, in bytes.
        self.daemons = daemons
        self.config_get = config_get
        self.total_mem = total_mem

    def tune(self) -> Tuple[Optional[int], List[str]]:
        """Return (per-OSD target in bytes, names of OSDs to tune).

        Returns (None, []) when no OSD is eligible for autotuning, or
        when the other daemons already consume the whole budget.
        """
        autotuned: List[str] = []
        budget = self.total_mem
        for dd in self.daemons:
            name = dd.name()
            if dd.daemon_type == 'mds':
                # MDS memory is governed by its own cache limit option.
                budget -= self.config_get(name, 'mds_cache_memory_limit')
            elif dd.daemon_type != 'osd':
                assert dd.daemon_type
                floor = self.min_size_by_type.get(
                    dd.daemon_type, self.default_size)
                budget -= max(floor, dd.memory_usage or 0)
            elif not self.config_get(name, 'osd_memory_target_autotune'):
                # OSD opted out: honor its explicitly configured target.
                budget -= self.config_get(name, 'osd_memory_target')
            else:
                autotuned.append(name)
        if budget < 0 or not autotuned:
            return None, []
        return int(budget // len(autotuned)), autotuned
index 81089a26437a94dddaa3a2c085973693d00b426a..f66679a01a8a52a34d02716b616d1978bb9e0270 100644 (file)
@@ -345,6 +345,7 @@ class HostCache():
         self.devices = {}              # type: Dict[str, List[inventory.Device]]
         self.facts = {}                # type: Dict[str, Dict[str, Any]]
         self.last_facts_update = {}    # type: Dict[str, datetime.datetime]
+        self.last_autotune = {}        # type: Dict[str, datetime.datetime]
         self.osdspec_previews = {}     # type: Dict[str, List[Dict[str, Any]]]
         self.osdspec_last_applied = {}  # type: Dict[str, Dict[str, datetime.datetime]]
         self.networks = {}             # type: Dict[str, Dict[str, Dict[str, List[str]]]]
@@ -429,6 +430,9 @@ class HostCache():
         self.facts[host] = facts
         self.last_facts_update[host] = datetime_now()
 
def update_autotune(self, host: str) -> None:
    """Record that *host* has just had its OSD memory autotuned."""
    now = datetime_now()
    self.last_autotune[host] = now
+
     def devices_changed(self, host: str, b: List[inventory.Device]) -> bool:
         a = self.devices[host]
         if len(a) != len(b):
@@ -576,6 +580,8 @@ class HostCache():
             del self.facts[host]
         if host in self.last_facts_update:
             del self.last_facts_update[host]
+        if host in self.last_autotune:
+            del self.last_autotune[host]
         if host in self.osdspec_previews:
             del self.osdspec_previews[host]
         if host in self.osdspec_last_applied:
@@ -605,6 +611,9 @@ class HostCache():
             r.append(host)
         return r
 
def get_facts(self, host: str) -> Dict[str, Any]:
    """Return the cached gather-facts output for *host* ({} if unknown)."""
    try:
        return self.facts[host]
    except KeyError:
        return {}
+
     def get_daemons(self):
         # type: () -> List[orchestrator.DaemonDescription]
         r = []
@@ -613,6 +622,9 @@ class HostCache():
                 r.append(dd)
         return r
 
def get_daemons_by_host(self, host: str) -> 'List[orchestrator.DaemonDescription]':
    """Return every cached daemon description for *host* (possibly empty)."""
    by_name = self.daemons.get(host, {})
    return [dd for dd in by_name.values()]
+
     def get_daemon(self, daemon_name: str) -> orchestrator.DaemonDescription:
         assert not daemon_name.startswith('ha-rgw.')
         for _, dm in self.daemons.items():
@@ -712,6 +724,17 @@ class HostCache():
             return True
         return False
 
def host_needs_autotune_memory(self, host):
    # type: (str) -> bool
    """Return True when *host* is due for another round of OSD memory autotuning.

    Offline hosts are never due; otherwise a host is due if it has
    never been autotuned or its last autotune predates the configured
    interval.
    """
    if host in self.mgr.offline_hosts:
        logger.debug(f'Host "{host}" marked as offline. Skipping autotune')
        return False
    cutoff = datetime_now() - datetime.timedelta(
        seconds=self.mgr.autotune_interval)
    last = self.last_autotune.get(host)
    return last is None or last < cutoff
+
     def host_had_daemon_refresh(self, host: str) -> bool:
         """
         ... at least once.
index e17c875241612586d40c5b0cb9ab1064e1a3f145..a5ab603f0e9ee54933212cd2041e374f1db727ea 100644 (file)
@@ -331,6 +331,18 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             default=10,
             desc='max number of daemons per service per host',
         ),
+        Option(
+            'autotune_memory_target_ratio',
+            type='float',
+            default=.7,
+            desc='ratio of total system memory to divide amongst autotuned daemons'
+        ),
+        Option(
+            'autotune_interval',
+            type='secs',
+            default=10 * 60,
+            desc='how frequently to autotune daemon memory'
+        ),
     ]
 
     def __init__(self, *args: Any, **kwargs: Any):
@@ -378,6 +390,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.registry_password: Optional[str] = None
             self.use_repo_digest = True
             self.default_registry = ''
+            self.autotune_memory_target_ratio = 0.0
+            self.autotune_interval = 0
 
         self._cons: Dict[str, Tuple[remoto.backends.BaseConnection,
                                     remoto.backends.LegacyModuleExecute]] = {}
index 9b0e82919331ed4c328af97c659bafec1041c2ad..d419ff2bb57594ca58d526307d53a7541153bc13 100644 (file)
@@ -23,9 +23,11 @@ from orchestrator import OrchestratorError, set_exception_subject, OrchestratorE
     DaemonDescriptionStatus, daemon_type_to_service
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
 from cephadm.schedule import HostAssignment
+from cephadm.autotune import MemoryAutotuner
 from cephadm.utils import forall_hosts, cephadmNoImage, is_repo_digest, \
     CephadmNoImage, CEPH_TYPES, ContainerInspectInfo
 from mgr_module import MonCommandFailed
+from mgr_util import format_bytes
 
 from . import utils
 
@@ -128,6 +130,51 @@ class CephadmServe:
                 del self.mgr.health_checks['CEPHADM_PAUSED']
                 self.mgr.set_health_checks(self.mgr.health_checks)
 
def _autotune_host_memory(self, host: str) -> None:
    """(Re)compute osd_memory_target for the autotuned OSDs on *host*.

    Takes the host's gathered memory_total_kb fact, scales it by
    mgr/cephadm/autotune_memory_target_ratio, and stores the resulting
    per-OSD target as an osd_memory_target option scoped to the host
    (osd/host:<host>).  Per-OSD options that conflict with the computed
    value are removed; when nothing can be tuned, any stale host-scoped
    option is removed instead.  Always stamps the host's autotune time.
    """
    total_mem = self.mgr.cache.get_facts(host).get('memory_total_kb', 0)
    val = None
    any_changed = False
    if total_mem:
        # kb -> bytes, then take only the configured fraction of RAM.
        # int() keeps the budget an integer byte count after scaling by
        # the float ratio (MemoryAutotuner expects an int).
        budget = int(total_mem * 1024 * self.mgr.autotune_memory_target_ratio)
        a = MemoryAutotuner(
            daemons=self.mgr.cache.get_daemons_by_host(host),
            config_get=self.mgr.get_foreign_ceph_option,
            total_mem=budget,
        )
        val, osds = a.tune()
        for o in osds:
            # Clear any per-daemon target that would shadow the
            # host-scoped option we are about to set.
            if self.mgr.get_foreign_ceph_option(o, 'osd_memory_target') != val:
                self.mgr.check_mon_command({
                    'prefix': 'config rm',
                    'who': o,
                    'name': 'osd_memory_target',
                })
                any_changed = True
    if val is not None:
        # Only touch the config database when at least one OSD's
        # effective target actually changed.
        if any_changed:
            self.mgr.log.info(
                f'Adjusting osd_memory_target on {host} to {format_bytes(val, 6)}'
            )
            ret, out, err = self.mgr.mon_command({
                'prefix': 'config set',
                'who': f'osd/host:{host}',
                'name': 'osd_memory_target',
                'value': str(val),
            })
            if ret:
                self.log.warning(
                    f'Unable to set osd_memory_target on {host} to {val}: {err}'
                )
    else:
        # No tunable OSDs (or no headroom): drop any stale host-scoped
        # target so OSDs fall back to their configured defaults.
        self.mgr.check_mon_command({
            'prefix': 'config rm',
            'who': f'osd/host:{host}',
            'name': 'osd_memory_target',
        })
    self.mgr.cache.update_autotune(host)
+
     def _refresh_hosts_and_daemons(self) -> None:
         bad_hosts = []
         failures = []
@@ -236,6 +283,10 @@ class CephadmServe:
                 if r:
                     failures.append(r)
 
+            if self.mgr.cache.host_needs_autotune_memory(host):
+                self.log.debug(f"autotuning memory for {host}")
+                self._autotune_host_memory(host)
+
             # client files
             updated_files = False
             old_files = self.mgr.cache.get_host_client_files(host).copy()
index 4a8a4e30a9892f93b6f8b1f589effa89d887c9bc..127f14f7729236d731077c476c54f9b7c0ad71e3 100644 (file)
@@ -20,7 +20,9 @@ def get_ceph_option(_, key):
 
 
 def _run_cephadm(ret):
-    def foo(*args, **kwargs):
+    def foo(s, host, entity, cmd, e, **kwargs):
+        if cmd == 'gather-facts':
+            return '{}', '', 0
         return [ret], '', 0
     return foo
 
diff --git a/src/pybind/mgr/cephadm/tests/test_autotune.py b/src/pybind/mgr/cephadm/tests/test_autotune.py
new file mode 100644 (file)
index 0000000..524da9c
--- /dev/null
@@ -0,0 +1,69 @@
+# Disable autopep8 for this file:
+
+# fmt: off
+
+import pytest
+
+from cephadm.autotune import MemoryAutotuner
+from orchestrator import DaemonDescription
+
+
@pytest.mark.parametrize("total,daemons,config,result",
    [   # noqa: E128
        # No daemons at all: nothing to tune.
        (
            128 * 1024 ** 3,
            [],
            {},
            None,
        ),
        # Two autotuned OSDs split the whole budget evenly.
        (
            128 * 1024 ** 3,
            [
                DaemonDescription('osd', '1', 'host1'),
                DaemonDescription('osd', '2', 'host1'),
            ],
            {},
            64 * 1024 ** 3,
        ),
        # osd.3 opted out with an explicit 16G target; the other two
        # OSDs share the remaining (128 - 16) GB.
        (
            128 * 1024 ** 3,
            [
                DaemonDescription('osd', '1', 'host1'),
                DaemonDescription('osd', '2', 'host1'),
                DaemonDescription('osd', '3', 'host1'),
            ],
            {
                'osd.3': 16 * 1024 ** 3,
            },
            56 * 1024 ** 3,
        ),
        # A mgr reserves its 4G minimum before the OSDs split the rest.
        (
            128 * 1024 ** 3,
            [
                DaemonDescription('mgr', 'a', 'host1'),
                DaemonDescription('osd', '1', 'host1'),
                DaemonDescription('osd', '2', 'host1'),
            ],
            {},
            62 * 1024 ** 3,
        )
    ])
def test_autotune(total, daemons, config, result):
    def fake_getter(who, opt):
        # Daemons listed in `config` have autotune disabled and carry
        # an explicit osd_memory_target; everything else is autotuned.
        if opt == 'osd_memory_target_autotune':
            return who not in config
        if opt == 'osd_memory_target':
            return config.get(who, 4 * 1024 ** 3)
        if opt == 'mds_cache_memory_limit':
            return 16 * 1024 ** 3

    tuner = MemoryAutotuner(
        total_mem=total,
        daemons=daemons,
        config_get=fake_getter,
    )
    val, _osds = tuner.tune()
    assert val == result