]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: autotune osd memory
authorSage Weil <sage@newdream.net>
Tue, 4 May 2021 19:56:21 +0000 (15:56 -0400)
committerSage Weil <sage@newdream.net>
Thu, 20 May 2021 23:02:58 +0000 (18:02 -0500)
- set osd_memory_target_autotune=true to enable
- tuning is periodic (check every 10m by default)
- tuned values are reflected by osd_memory_target config options scoped
  to the host
- only make a change if it appears that we will affect at least 1 of the
  relevant OSDs
- attempt to clean out conflicting options.  (This is imperfect, since any
  manner of weirdly-scoped config options could be responsible; we only
  attempt to clean out one scoped directly to the osd name.)

Signed-off-by: Sage Weil <sage@newdream.net>
(cherry picked from commit 85ea078787d864bd8ae199f9517a9d2dbaf33c7c)

doc/cephadm/osd.rst
src/pybind/mgr/cephadm/autotune.py [new file with mode: 0644]
src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/fixtures.py
src/pybind/mgr/cephadm/tests/test_autotune.py [new file with mode: 0644]

index d61b61d58ab5e0443ac89e53e0412fbaf6467982..55ddd4c66167eeda1b9835eb7f466a7971416f7a 100644 (file)
@@ -363,6 +363,52 @@ Example command:
     device.  To disable this behavior, see :ref:`cephadm-osd-declarative`.
 
 
+.. _osd_autotune:
+
+Automatically tuning OSD memory
+===============================
+
+OSD daemons will adjust their memory consumption based on the
+``osd_memory_target`` config option (several gigabytes, by
+default).  If Ceph is deployed on dedicated nodes that are not sharing
+memory with other services, cephadm can automatically adjust the per-OSD
+memory consumption based on the total amount of RAM and the number of deployed
+OSDs.
+
+This option is enabled globally with::
+
+  ceph config set osd osd_memory_target_autotune true
+
+Cephadm will start with a fraction
+(``mgr/cephadm/autotune_memory_target_ratio``, which defaults to
+``.7``) of the total RAM in the system, subtract off any memory
+consumed by non-autotuned daemons (non-OSDs, or OSDs for which
+``osd_memory_target_autotune`` is false), and then divide by the
+remaining OSDs.
+
+The final targets are reflected in the config database with options like::
+
+  WHO   MASK      LEVEL   OPTION              VALUE
+  osd   host:foo  basic   osd_memory_target   126092301926
+  osd   host:bar  basic   osd_memory_target   6442450944
+
+Both the limits and the current memory consumed by each daemon are visible from
+the ``ceph orch ps`` output in the ``MEM LIMIT`` column::
+
+  NAME        HOST  PORTS  STATUS         REFRESHED  AGE  MEM USED  MEM LIMIT  VERSION                IMAGE ID      CONTAINER ID  
+  osd.1       dael         running (3h)     10s ago   3h    72857k     117.4G  17.0.0-3781-gafaed750  7015fda3cd67  9e183363d39c  
+  osd.2       dael         running (81m)    10s ago  81m    63989k     117.4G  17.0.0-3781-gafaed750  7015fda3cd67  1f0cc479b051  
+  osd.3       dael         running (62m)    10s ago  62m    64071k     117.4G  17.0.0-3781-gafaed750  7015fda3cd67  ac5537492f27  
+
+To exclude an OSD from memory autotuning, disable the autotune option
+for that OSD and also set a specific memory target.  For example,
+
+  .. prompt:: bash #
+
+    ceph config set osd.123 osd_memory_target_autotune false
+    ceph config set osd.123 osd_memory_target 16G
+
+
 .. _drivegroups:
 
 Advanced OSD Service Specifications
diff --git a/src/pybind/mgr/cephadm/autotune.py b/src/pybind/mgr/cephadm/autotune.py
new file mode 100644 (file)
index 0000000..51c931c
--- /dev/null
@@ -0,0 +1,54 @@
+import logging
+from typing import List, Optional, Callable, Any, Tuple
+
+from orchestrator._interface import DaemonDescription
+
+logger = logging.getLogger(__name__)
+
+
class MemoryAutotuner(object):
    """Compute a per-OSD memory target for a single host.

    Starting from a host memory budget, subtract the memory claimed by
    every non-OSD daemon (and by OSDs that have opted out of
    autotuning), then split whatever is left evenly across the
    remaining, autotuned OSDs.
    """

    # Per-daemon-type floor, used when a daemon's observed usage is lower.
    min_size_by_type = {
        'mds': 4096 * 1048576,
        'mgr': 4096 * 1048576,
        'mon': 1024 * 1048576,
        'crash': 128 * 1048576,
        'keepalived': 128 * 1048576,
        'haproxy': 128 * 1048576,
    }
    # Fallback floor for daemon types not listed above.
    default_size = 1024 * 1048576

    def __init__(
            self,
            daemons: 'List[DaemonDescription]',
            config_get: Callable[[str, str], Any],
            total_mem: int,
    ) -> None:
        # daemons: all daemons running on the host under consideration.
        # config_get: returns the config value for (daemon name, option).
        # total_mem: memory budget for this host, in bytes.
        self.daemons = daemons
        self.config_get = config_get
        self.total_mem = total_mem

    def tune(self) -> Tuple[Optional[int], List[str]]:
        """Return (per-OSD target in bytes, names of OSDs to tune).

        Returns (None, []) when no OSD is eligible for autotuning, or
        when the other daemons already consume the whole budget.
        """
        autotuned: List[str] = []
        budget = self.total_mem
        for dd in self.daemons:
            name = dd.name()
            if dd.daemon_type == 'mds':
                # MDS memory is governed by its own cache limit option.
                budget -= self.config_get(name, 'mds_cache_memory_limit')
            elif dd.daemon_type != 'osd':
                assert dd.daemon_type
                floor = self.min_size_by_type.get(
                    dd.daemon_type, self.default_size)
                budget -= max(floor, dd.memory_usage or 0)
            elif not self.config_get(name, 'osd_memory_target_autotune'):
                # OSD opted out: honor its explicitly configured target.
                budget -= self.config_get(name, 'osd_memory_target')
            else:
                autotuned.append(name)
        if budget < 0 or not autotuned:
            return None, []
        return int(budget // len(autotuned)), autotuned
index 81089a26437a94dddaa3a2c085973693d00b426a..f66679a01a8a52a34d02716b616d1978bb9e0270 100644 (file)
@@ -345,6 +345,7 @@ class HostCache():
         self.devices = {}              # type: Dict[str, List[inventory.Device]]
         self.facts = {}                # type: Dict[str, Dict[str, Any]]
         self.last_facts_update = {}    # type: Dict[str, datetime.datetime]
+        self.last_autotune = {}        # type: Dict[str, datetime.datetime]
         self.osdspec_previews = {}     # type: Dict[str, List[Dict[str, Any]]]
         self.osdspec_last_applied = {}  # type: Dict[str, Dict[str, datetime.datetime]]
         self.networks = {}             # type: Dict[str, Dict[str, Dict[str, List[str]]]]
@@ -429,6 +430,9 @@ class HostCache():
         self.facts[host] = facts
         self.last_facts_update[host] = datetime_now()
 
def update_autotune(self, host: str) -> None:
    """Record that *host* has just had its OSD memory autotuned."""
    now = datetime_now()
    self.last_autotune[host] = now
+
     def devices_changed(self, host: str, b: List[inventory.Device]) -> bool:
         a = self.devices[host]
         if len(a) != len(b):
@@ -576,6 +580,8 @@ class HostCache():
             del self.facts[host]
         if host in self.last_facts_update:
             del self.last_facts_update[host]
+        if host in self.last_autotune:
+            del self.last_autotune[host]
         if host in self.osdspec_previews:
             del self.osdspec_previews[host]
         if host in self.osdspec_last_applied:
@@ -605,6 +611,9 @@ class HostCache():
             r.append(host)
         return r
 
def get_facts(self, host: str) -> Dict[str, Any]:
    """Return the cached gather-facts output for *host* ({} if unknown)."""
    try:
        return self.facts[host]
    except KeyError:
        return {}
+
     def get_daemons(self):
         # type: () -> List[orchestrator.DaemonDescription]
         r = []
@@ -613,6 +622,9 @@ class HostCache():
                 r.append(dd)
         return r
 
def get_daemons_by_host(self, host: str) -> 'List[orchestrator.DaemonDescription]':
    """Return every cached daemon description for *host* (possibly empty)."""
    by_name = self.daemons.get(host, {})
    return [dd for dd in by_name.values()]
+
     def get_daemon(self, daemon_name: str) -> orchestrator.DaemonDescription:
         assert not daemon_name.startswith('ha-rgw.')
         for _, dm in self.daemons.items():
@@ -712,6 +724,17 @@ class HostCache():
             return True
         return False
 
def host_needs_autotune_memory(self, host):
    # type: (str) -> bool
    """Return True when *host* is due for another round of OSD memory autotuning.

    Offline hosts are never due; otherwise a host is due if it has
    never been autotuned or its last autotune predates the configured
    interval.
    """
    if host in self.mgr.offline_hosts:
        logger.debug(f'Host "{host}" marked as offline. Skipping autotune')
        return False
    cutoff = datetime_now() - datetime.timedelta(
        seconds=self.mgr.autotune_interval)
    last = self.last_autotune.get(host)
    return last is None or last < cutoff
+
     def host_had_daemon_refresh(self, host: str) -> bool:
         """
         ... at least once.
index e17c875241612586d40c5b0cb9ab1064e1a3f145..a5ab603f0e9ee54933212cd2041e374f1db727ea 100644 (file)
@@ -331,6 +331,18 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             default=10,
             desc='max number of daemons per service per host',
         ),
+        Option(
+            'autotune_memory_target_ratio',
+            type='float',
+            default=.7,
+            desc='ratio of total system memory to divide amongst autotuned daemons'
+        ),
+        Option(
+            'autotune_interval',
+            type='secs',
+            default=10 * 60,
+            desc='how frequently to autotune daemon memory'
+        ),
     ]
 
     def __init__(self, *args: Any, **kwargs: Any):
@@ -378,6 +390,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.registry_password: Optional[str] = None
             self.use_repo_digest = True
             self.default_registry = ''
+            self.autotune_memory_target_ratio = 0.0
+            self.autotune_interval = 0
 
         self._cons: Dict[str, Tuple[remoto.backends.BaseConnection,
                                     remoto.backends.LegacyModuleExecute]] = {}
index 9b0e82919331ed4c328af97c659bafec1041c2ad..d419ff2bb57594ca58d526307d53a7541153bc13 100644 (file)
@@ -23,9 +23,11 @@ from orchestrator import OrchestratorError, set_exception_subject, OrchestratorE
     DaemonDescriptionStatus, daemon_type_to_service
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
 from cephadm.schedule import HostAssignment
+from cephadm.autotune import MemoryAutotuner
 from cephadm.utils import forall_hosts, cephadmNoImage, is_repo_digest, \
     CephadmNoImage, CEPH_TYPES, ContainerInspectInfo
 from mgr_module import MonCommandFailed
+from mgr_util import format_bytes
 
 from . import utils
 
@@ -128,6 +130,51 @@ class CephadmServe:
                 del self.mgr.health_checks['CEPHADM_PAUSED']
                 self.mgr.set_health_checks(self.mgr.health_checks)
 
def _autotune_host_memory(self, host: str) -> None:
    """(Re)compute osd_memory_target for the autotuned OSDs on *host*.

    Takes the host's gathered memory_total_kb fact, scales it by
    mgr/cephadm/autotune_memory_target_ratio, and stores the resulting
    per-OSD target as an osd_memory_target option scoped to the host
    (osd/host:<host>).  Per-OSD options that conflict with the computed
    value are removed; when nothing can be tuned, any stale host-scoped
    option is removed instead.  Always stamps the host's autotune time.
    """
    total_mem = self.mgr.cache.get_facts(host).get('memory_total_kb', 0)
    val = None
    any_changed = False
    if total_mem:
        # kb -> bytes, then take only the configured fraction of RAM.
        # int() keeps the budget an integer byte count after scaling by
        # the float ratio (MemoryAutotuner expects an int).
        budget = int(total_mem * 1024 * self.mgr.autotune_memory_target_ratio)
        a = MemoryAutotuner(
            daemons=self.mgr.cache.get_daemons_by_host(host),
            config_get=self.mgr.get_foreign_ceph_option,
            total_mem=budget,
        )
        val, osds = a.tune()
        for o in osds:
            # Clear any per-daemon target that would shadow the
            # host-scoped option we are about to set.
            if self.mgr.get_foreign_ceph_option(o, 'osd_memory_target') != val:
                self.mgr.check_mon_command({
                    'prefix': 'config rm',
                    'who': o,
                    'name': 'osd_memory_target',
                })
                any_changed = True
    if val is not None:
        # Only touch the config database when at least one OSD's
        # effective target actually changed.
        if any_changed:
            self.mgr.log.info(
                f'Adjusting osd_memory_target on {host} to {format_bytes(val, 6)}'
            )
            ret, out, err = self.mgr.mon_command({
                'prefix': 'config set',
                'who': f'osd/host:{host}',
                'name': 'osd_memory_target',
                'value': str(val),
            })
            if ret:
                self.log.warning(
                    f'Unable to set osd_memory_target on {host} to {val}: {err}'
                )
    else:
        # No tunable OSDs (or no headroom): drop any stale host-scoped
        # target so OSDs fall back to their configured defaults.
        self.mgr.check_mon_command({
            'prefix': 'config rm',
            'who': f'osd/host:{host}',
            'name': 'osd_memory_target',
        })
    self.mgr.cache.update_autotune(host)
+
     def _refresh_hosts_and_daemons(self) -> None:
         bad_hosts = []
         failures = []
@@ -236,6 +283,10 @@ class CephadmServe:
                 if r:
                     failures.append(r)
 
+            if self.mgr.cache.host_needs_autotune_memory(host):
+                self.log.debug(f"autotuning memory for {host}")
+                self._autotune_host_memory(host)
+
             # client files
             updated_files = False
             old_files = self.mgr.cache.get_host_client_files(host).copy()
index 4a8a4e30a9892f93b6f8b1f589effa89d887c9bc..127f14f7729236d731077c476c54f9b7c0ad71e3 100644 (file)
@@ -20,7 +20,9 @@ def get_ceph_option(_, key):
 
 
 def _run_cephadm(ret):
-    def foo(*args, **kwargs):
+    def foo(s, host, entity, cmd, e, **kwargs):
+        if cmd == 'gather-facts':
+            return '{}', '', 0
         return [ret], '', 0
     return foo
 
diff --git a/src/pybind/mgr/cephadm/tests/test_autotune.py b/src/pybind/mgr/cephadm/tests/test_autotune.py
new file mode 100644 (file)
index 0000000..524da9c
--- /dev/null
@@ -0,0 +1,69 @@
+# Disable autopep8 for this file:
+
+# fmt: off
+
+import pytest
+
+from cephadm.autotune import MemoryAutotuner
+from orchestrator import DaemonDescription
+
+
@pytest.mark.parametrize("total,daemons,config,result",
    [   # noqa: E128
        # No daemons at all: nothing to tune.
        (
            128 * 1024 ** 3,
            [],
            {},
            None,
        ),
        # Two autotuned OSDs split the whole budget evenly.
        (
            128 * 1024 ** 3,
            [
                DaemonDescription('osd', '1', 'host1'),
                DaemonDescription('osd', '2', 'host1'),
            ],
            {},
            64 * 1024 ** 3,
        ),
        # osd.3 opted out with an explicit 16G target; the other two
        # OSDs share the remaining (128 - 16) GB.
        (
            128 * 1024 ** 3,
            [
                DaemonDescription('osd', '1', 'host1'),
                DaemonDescription('osd', '2', 'host1'),
                DaemonDescription('osd', '3', 'host1'),
            ],
            {
                'osd.3': 16 * 1024 ** 3,
            },
            56 * 1024 ** 3,
        ),
        # A mgr reserves its 4G minimum before the OSDs split the rest.
        (
            128 * 1024 ** 3,
            [
                DaemonDescription('mgr', 'a', 'host1'),
                DaemonDescription('osd', '1', 'host1'),
                DaemonDescription('osd', '2', 'host1'),
            ],
            {},
            62 * 1024 ** 3,
        )
    ])
def test_autotune(total, daemons, config, result):
    def fake_getter(who, opt):
        # Daemons listed in `config` have autotune disabled and carry
        # an explicit osd_memory_target; everything else is autotuned.
        if opt == 'osd_memory_target_autotune':
            return who not in config
        if opt == 'osd_memory_target':
            return config.get(who, 4 * 1024 ** 3)
        if opt == 'mds_cache_memory_limit':
            return 16 * 1024 ** 3

    tuner = MemoryAutotuner(
        total_mem=total,
        daemons=daemons,
        config_get=fake_getter,
    )
    val, _osds = tuner.tune()
    assert val == result