From 85ea078787d864bd8ae199f9517a9d2dbaf33c7c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 4 May 2021 15:56:21 -0400 Subject: [PATCH] mgr/cephadm: autotune osd memory - set osd_memory_target_autotune=true to enable - tuning is periodic (check every 10m by default) - tuned values are reflected by osd_memory_target config options scoped to the host - only make a change if it appears that we will affect at least 1 of the relevant OSDs - attempt to clean out conflicting options. (This is imperfect, since any manner of weirdly-scoped config options could be responsible; we only attempt to clean out one scoped directly to the osd name.) Signed-off-by: Sage Weil --- doc/cephadm/osd.rst | 46 +++++++++++++ src/pybind/mgr/cephadm/autotune.py | 54 +++++++++++++++ src/pybind/mgr/cephadm/inventory.py | 23 +++++++ src/pybind/mgr/cephadm/module.py | 14 ++++ src/pybind/mgr/cephadm/serve.py | 51 ++++++++++++++ src/pybind/mgr/cephadm/tests/fixtures.py | 4 +- src/pybind/mgr/cephadm/tests/test_autotune.py | 69 +++++++++++++++++++ 7 files changed, 260 insertions(+), 1 deletion(-) create mode 100644 src/pybind/mgr/cephadm/autotune.py create mode 100644 src/pybind/mgr/cephadm/tests/test_autotune.py diff --git a/doc/cephadm/osd.rst b/doc/cephadm/osd.rst index d61b61d58ab5e..55ddd4c66167e 100644 --- a/doc/cephadm/osd.rst +++ b/doc/cephadm/osd.rst @@ -363,6 +363,52 @@ Example command: device. To disable this behavior, see :ref:`cephadm-osd-declarative`. +.. _osd_autotune: + +Automatically tuning OSD memory +=============================== + +OSD daemons will adjust their memory consumption based on the +``osd_memory_target`` config option (several gigabytes, by +default). If Ceph is deployed on dedicated nodes that are not sharing +memory with other services, cephadm can automatically adjust the per-OSD +memory consumption based on the total amount of RAM and the number of deployed +OSDs. 
+ +This option is enabled globally with:: + + ceph config set osd osd_memory_target_autotune true + +Cephadm will start with a fraction +(``mgr/cephadm/autotune_memory_target_ratio``, which defaults to +``.7``) of the total RAM in the system, subtract off any memory +consumed by non-autotuned daemons (non-OSDs, or OSDs for which +``osd_memory_target_autotune`` is false), and then divide by the +remaining OSDs. + +The final targets are reflected in the config database with options like:: + + WHO MASK LEVEL OPTION VALUE + osd host:foo basic osd_memory_target 126092301926 + osd host:bar basic osd_memory_target 6442450944 + +Both the limits and the current memory consumed by each daemon are visible from +the ``ceph orch ps`` output in the ``MEM LIMIT`` column:: + + NAME HOST PORTS STATUS REFRESHED AGE MEM USED MEM LIMIT VERSION IMAGE ID CONTAINER ID + osd.1 dael running (3h) 10s ago 3h 72857k 117.4G 17.0.0-3781-gafaed750 7015fda3cd67 9e183363d39c + osd.2 dael running (81m) 10s ago 81m 63989k 117.4G 17.0.0-3781-gafaed750 7015fda3cd67 1f0cc479b051 + osd.3 dael running (62m) 10s ago 62m 64071k 117.4G 17.0.0-3781-gafaed750 7015fda3cd67 ac5537492f27 + +To exclude an OSD from memory autotuning, disable the autotune option +for that OSD and also set a specific memory target. For example, + + .. prompt:: bash # + + ceph config set osd.123 osd_memory_target_autotune false + ceph config set osd.123 osd_memory_target 16G + + .. 
_drivegroups: Advanced OSD Service Specifications diff --git a/src/pybind/mgr/cephadm/autotune.py b/src/pybind/mgr/cephadm/autotune.py new file mode 100644 index 0000000000000..51c931cbad637 --- /dev/null +++ b/src/pybind/mgr/cephadm/autotune.py @@ -0,0 +1,54 @@ +import logging +from typing import List, Optional, Callable, Any, Tuple + +from orchestrator._interface import DaemonDescription + +logger = logging.getLogger(__name__) + + +class MemoryAutotuner(object): + + min_size_by_type = { + 'mds': 4096 * 1048576, + 'mgr': 4096 * 1048576, + 'mon': 1024 * 1048576, + 'crash': 128 * 1048576, + 'keepalived': 128 * 1048576, + 'haproxy': 128 * 1048576, + } + default_size = 1024 * 1048576 + + def __init__( + self, + daemons: List[DaemonDescription], + config_get: Callable[[str, str], Any], + total_mem: int, + ): + self.daemons = daemons + self.config_get = config_get + self.total_mem = total_mem + + def tune(self) -> Tuple[Optional[int], List[str]]: + tuned_osds: List[str] = [] + total = self.total_mem + for d in self.daemons: + if d.daemon_type == 'mds': + total -= self.config_get(d.name(), 'mds_cache_memory_limit') + continue + if d.daemon_type != 'osd': + assert d.daemon_type + total -= max( + self.min_size_by_type.get(d.daemon_type, self.default_size), + d.memory_usage or 0 + ) + continue + if not self.config_get(d.name(), 'osd_memory_target_autotune'): + total -= self.config_get(d.name(), 'osd_memory_target') + continue + tuned_osds.append(d.name()) + if total < 0: + return None, [] + if not tuned_osds: + return None, [] + per = total // len(tuned_osds) + return int(per), tuned_osds diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index d2c6e895a31d1..88bf970b995e9 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -344,6 +344,7 @@ class HostCache(): self.devices = {} # type: Dict[str, List[inventory.Device]] self.facts = {} # type: Dict[str, Dict[str, Any]] self.last_facts_update = {} # 
type: Dict[str, datetime.datetime] + self.last_autotune = {} # type: Dict[str, datetime.datetime] self.osdspec_previews = {} # type: Dict[str, List[Dict[str, Any]]] self.osdspec_last_applied = {} # type: Dict[str, Dict[str, datetime.datetime]] self.networks = {} # type: Dict[str, Dict[str, Dict[str, List[str]]]] @@ -428,6 +429,9 @@ class HostCache(): self.facts[host] = facts self.last_facts_update[host] = datetime_now() + def update_autotune(self, host: str) -> None: + self.last_autotune[host] = datetime_now() + def devices_changed(self, host: str, b: List[inventory.Device]) -> bool: a = self.devices[host] if len(a) != len(b): @@ -575,6 +579,8 @@ class HostCache(): del self.facts[host] if host in self.last_facts_update: del self.last_facts_update[host] + if host in self.last_autotune: + del self.last_autotune[host] if host in self.osdspec_previews: del self.osdspec_previews[host] if host in self.osdspec_last_applied: @@ -604,6 +610,9 @@ class HostCache(): r.append(host) return r + def get_facts(self, host: str) -> Dict[str, Any]: + return self.facts.get(host, {}) + def get_daemons(self): # type: () -> List[orchestrator.DaemonDescription] r = [] @@ -612,6 +621,9 @@ class HostCache(): r.append(dd) return r + def get_daemons_by_host(self, host: str) -> List[orchestrator.DaemonDescription]: + return list(self.daemons.get(host, {}).values()) + def get_daemon(self, daemon_name: str) -> orchestrator.DaemonDescription: assert not daemon_name.startswith('ha-rgw.') for _, dm in self.daemons.items(): @@ -711,6 +723,17 @@ class HostCache(): return True return False + def host_needs_autotune_memory(self, host): + # type: (str) -> bool + if host in self.mgr.offline_hosts: + logger.debug(f'Host "{host}" marked as offline. 
Skipping autotune') + return False + cutoff = datetime_now() - datetime.timedelta( + seconds=self.mgr.autotune_interval) + if host not in self.last_autotune or self.last_autotune[host] < cutoff: + return True + return False + def host_had_daemon_refresh(self, host: str) -> bool: """ ... at least once. diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index e79cdb413e80c..89f1b0e5e4800 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -331,6 +331,18 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, default=10, desc='max number of daemons per service per host', ), + Option( + 'autotune_memory_target_ratio', + type='float', + default=.7, + desc='ratio of total system memory to divide amongst autotuned daemons' + ), + Option( + 'autotune_interval', + type='secs', + default=10 * 60, + desc='how frequently to autotune daemon memory' + ), ] def __init__(self, *args: Any, **kwargs: Any): @@ -378,6 +390,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.registry_password: Optional[str] = None self.use_repo_digest = True self.default_registry = '' + self.autotune_memory_target_ratio = 0.0 + self.autotune_interval = 0 self._cons: Dict[str, Tuple[remoto.backends.BaseConnection, remoto.backends.LegacyModuleExecute]] = {} diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index e92da1f9fbe21..467c1fab32a0b 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -23,9 +23,11 @@ from orchestrator import OrchestratorError, set_exception_subject, OrchestratorE DaemonDescriptionStatus, daemon_type_to_service from cephadm.services.cephadmservice import CephadmDaemonDeploySpec from cephadm.schedule import HostAssignment +from cephadm.autotune import MemoryAutotuner from cephadm.utils import forall_hosts, cephadmNoImage, is_repo_digest, \ CephadmNoImage, CEPH_TYPES, ContainerInspectInfo from mgr_module import 
MonCommandFailed +from mgr_util import format_bytes from . import utils @@ -128,6 +130,51 @@ class CephadmServe: del self.mgr.health_checks['CEPHADM_PAUSED'] self.mgr.set_health_checks(self.mgr.health_checks) + def _autotune_host_memory(self, host: str) -> None: + total_mem = self.mgr.cache.get_facts(host).get('memory_total_kb', 0) + if not total_mem: + val = None + else: + total_mem *= 1024 # kb -> bytes + total_mem *= self.mgr.autotune_memory_target_ratio + a = MemoryAutotuner( + daemons=self.mgr.cache.get_daemons_by_host(host), + config_get=self.mgr.get_foreign_ceph_option, + total_mem=total_mem, + ) + val, osds = a.tune() + any_changed = False + for o in osds: + if self.mgr.get_foreign_ceph_option(o, 'osd_memory_target') != val: + self.mgr.check_mon_command({ + 'prefix': 'config rm', + 'who': o, + 'name': 'osd_memory_target', + }) + any_changed = True + if val is not None: + if any_changed: + self.mgr.log.info( + f'Adjusting osd_memory_target on {host} to {format_bytes(val, 6)}' + ) + ret, out, err = self.mgr.mon_command({ + 'prefix': 'config set', + 'who': f'osd/host:{host}', + 'name': 'osd_memory_target', + 'value': str(val), + }) + if ret: + self.log.warning( + f'Unable to set osd_memory_target on {host} to {val}: {err}' + ) + else: + self.mgr.check_mon_command({ + 'prefix': 'config rm', + 'who': f'osd/host:{host}', + 'name': 'osd_memory_target', + }) + self.mgr.cache.update_autotune(host) + def _refresh_hosts_and_daemons(self) -> None: bad_hosts = [] failures = [] @@ -233,6 +280,10 @@ class CephadmServe: if r: failures.append(r) + if self.mgr.cache.host_needs_autotune_memory(host): + self.log.debug(f"autotuning memory for {host}") + self._autotune_host_memory(host) + # client files updated_files = False old_files = self.mgr.cache.get_host_client_files(host).copy() diff --git a/src/pybind/mgr/cephadm/tests/fixtures.py b/src/pybind/mgr/cephadm/tests/fixtures.py index 4a8a4e30a9892..127f14f772923 100644 --- a/src/pybind/mgr/cephadm/tests/fixtures.py +++ 
b/src/pybind/mgr/cephadm/tests/fixtures.py @@ -20,7 +20,9 @@ def get_ceph_option(_, key): def _run_cephadm(ret): - def foo(*args, **kwargs): + def foo(s, host, entity, cmd, e, **kwargs): + if cmd == 'gather-facts': + return '{}', '', 0 return [ret], '', 0 return foo diff --git a/src/pybind/mgr/cephadm/tests/test_autotune.py b/src/pybind/mgr/cephadm/tests/test_autotune.py new file mode 100644 index 0000000000000..524da9c0008c7 --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_autotune.py @@ -0,0 +1,69 @@ +# Disable autopep8 for this file: + +# fmt: off + +import pytest + +from cephadm.autotune import MemoryAutotuner +from orchestrator import DaemonDescription + + +@pytest.mark.parametrize("total,daemons,config,result", + [ # noqa: E128 + ( + 128 * 1024 * 1024 * 1024, + [], + {}, + None, + ), + ( + 128 * 1024 * 1024 * 1024, + [ + DaemonDescription('osd', '1', 'host1'), + DaemonDescription('osd', '2', 'host1'), + ], + {}, + 64 * 1024 * 1024 * 1024, + ), + ( + 128 * 1024 * 1024 * 1024, + [ + DaemonDescription('osd', '1', 'host1'), + DaemonDescription('osd', '2', 'host1'), + DaemonDescription('osd', '3', 'host1'), + ], + { + 'osd.3': 16 * 1024 * 1024 * 1024, + }, + 56 * 1024 * 1024 * 1024, + ), + ( + 128 * 1024 * 1024 * 1024, + [ + DaemonDescription('mgr', 'a', 'host1'), + DaemonDescription('osd', '1', 'host1'), + DaemonDescription('osd', '2', 'host1'), + ], + {}, + 62 * 1024 * 1024 * 1024, + ) + ]) +def test_autotune(total, daemons, config, result): + def fake_getter(who, opt): + if opt == 'osd_memory_target_autotune': + if who in config: + return False + else: + return True + if opt == 'osd_memory_target': + return config.get(who, 4 * 1024 * 1024 * 1024) + if opt == 'mds_cache_memory_limit': + return 16 * 1024 * 1024 * 1024 + + a = MemoryAutotuner( + total_mem=total, + daemons=daemons, + config_get=fake_getter, + ) + val, osds = a.tune() + assert val == result -- 2.39.5