device. To disable this behavior, see :ref:`cephadm-osd-declarative`.
+.. _osd_autotune:
+
+Automatically tuning OSD memory
+===============================
+
+OSD daemons will adjust their memory consumption based on the
+``osd_memory_target`` config option (4 GiB by default). If Ceph is
+deployed on dedicated nodes that are not sharing
+memory with other services, cephadm can automatically adjust the per-OSD
+memory consumption based on the total amount of RAM and the number of deployed
+OSDs.
+
+This option is enabled globally with:
+
+.. prompt:: bash #
+
+   ceph config set osd osd_memory_target_autotune true
+
+Cephadm will start with a fraction
+(``mgr/cephadm/autotune_memory_target_ratio``, which defaults to
+``.7``) of the total RAM in the system, subtract off any memory
+consumed by non-autotuned daemons (non-OSD daemons, and OSDs for
+which ``osd_memory_target_autotune`` is false), and then divide the
+remainder evenly among the autotuned OSDs.
+
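+For example, on a 128 GiB host running two OSDs and one mgr daemon
+(and nothing else), the result is roughly::
+
+  0.7 * 128 GiB = 89.6 GiB     (apply autotune_memory_target_ratio)
+  89.6 GiB - 4 GiB = 85.6 GiB  (cephadm reserves at least 4 GiB for a mgr)
+  85.6 GiB / 2 = 42.8 GiB      (per-OSD osd_memory_target)
+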
+The final targets are reflected in the config database with options like::
+
+ WHO MASK LEVEL OPTION VALUE
+ osd host:foo basic osd_memory_target 126092301926
+ osd host:bar basic osd_memory_target 6442450944
+
+Both the limits and the current memory consumed by each daemon are
+visible in the ``ceph orch ps`` output, in the ``MEM USED`` and
+``MEM LIMIT`` columns::
+
+ NAME HOST PORTS STATUS REFRESHED AGE MEM USED MEM LIMIT VERSION IMAGE ID CONTAINER ID
+ osd.1 dael running (3h) 10s ago 3h 72857k 117.4G 17.0.0-3781-gafaed750 7015fda3cd67 9e183363d39c
+ osd.2 dael running (81m) 10s ago 81m 63989k 117.4G 17.0.0-3781-gafaed750 7015fda3cd67 1f0cc479b051
+ osd.3 dael running (62m) 10s ago 62m 64071k 117.4G 17.0.0-3781-gafaed750 7015fda3cd67 ac5537492f27
+
+To exclude an OSD from memory autotuning, disable the autotune option
+for that OSD and also set a specific memory target. For example:
+
+ .. prompt:: bash #
+
+ ceph config set osd.123 osd_memory_target_autotune false
+ ceph config set osd.123 osd_memory_target 16G
+
+
.. _drivegroups:
Advanced OSD Service Specifications
--- /dev/null
+import logging
+from typing import List, Optional, Callable, Any, Tuple
+
+from orchestrator._interface import DaemonDescription
+
+logger = logging.getLogger(__name__)
+
+
+class MemoryAutotuner(object):
+
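+    # Minimum memory footprint, in bytes, assumed for each non-OSD
+    # daemon type when computing the autotune budget.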
+ min_size_by_type = {
+ 'mds': 4096 * 1048576,
+ 'mgr': 4096 * 1048576,
+ 'mon': 1024 * 1048576,
+ 'crash': 128 * 1048576,
+ 'keepalived': 128 * 1048576,
+ 'haproxy': 128 * 1048576,
+ }
+ default_size = 1024 * 1048576
+
+ def __init__(
+ self,
+ daemons: List[DaemonDescription],
+ config_get: Callable[[str, str], Any],
+ total_mem: int,
+ ):
+ self.daemons = daemons
+ self.config_get = config_get
+ self.total_mem = total_mem
+
+ def tune(self) -> Tuple[Optional[int], List[str]]:
+ tuned_osds: List[str] = []
+ total = self.total_mem
+ for d in self.daemons:
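+            # MDS daemons reserve their configured cache limit.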
+ if d.daemon_type == 'mds':
+ total -= self.config_get(d.name(), 'mds_cache_memory_limit')
+ continue
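+            # Other non-OSD daemons reserve a per-type minimum or their
+            # observed usage, whichever is larger.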
+ if d.daemon_type != 'osd':
+ assert d.daemon_type
+ total -= max(
+ self.min_size_by_type.get(d.daemon_type, self.default_size),
+ d.memory_usage or 0
+ )
+ continue
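+            # OSDs that opted out of autotuning keep their explicit
+            # target, which still counts against the budget.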
+ if not self.config_get(d.name(), 'osd_memory_target_autotune'):
+ total -= self.config_get(d.name(), 'osd_memory_target')
+ continue
+ tuned_osds.append(d.name())
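+        # Give up if reservations already exceed the budget or there is
+        # nothing left to tune.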
+ if total < 0:
+ return None, []
+ if not tuned_osds:
+ return None, []
+ per = total // len(tuned_osds)
+ return int(per), tuned_osds
self.devices = {} # type: Dict[str, List[inventory.Device]]
self.facts = {} # type: Dict[str, Dict[str, Any]]
self.last_facts_update = {} # type: Dict[str, datetime.datetime]
+ self.last_autotune = {} # type: Dict[str, datetime.datetime]
self.osdspec_previews = {} # type: Dict[str, List[Dict[str, Any]]]
self.osdspec_last_applied = {} # type: Dict[str, Dict[str, datetime.datetime]]
self.networks = {} # type: Dict[str, Dict[str, Dict[str, List[str]]]]
self.facts[host] = facts
self.last_facts_update[host] = datetime_now()
+ def update_autotune(self, host: str) -> None:
+ self.last_autotune[host] = datetime_now()
+
def devices_changed(self, host: str, b: List[inventory.Device]) -> bool:
a = self.devices[host]
if len(a) != len(b):
del self.facts[host]
if host in self.last_facts_update:
del self.last_facts_update[host]
+ if host in self.last_autotune:
+ del self.last_autotune[host]
if host in self.osdspec_previews:
del self.osdspec_previews[host]
if host in self.osdspec_last_applied:
r.append(host)
return r
+ def get_facts(self, host: str) -> Dict[str, Any]:
+ return self.facts.get(host, {})
+
def get_daemons(self):
# type: () -> List[orchestrator.DaemonDescription]
r = []
r.append(dd)
return r
+ def get_daemons_by_host(self, host: str) -> List[orchestrator.DaemonDescription]:
+ return list(self.daemons.get(host, {}).values())
+
def get_daemon(self, daemon_name: str) -> orchestrator.DaemonDescription:
assert not daemon_name.startswith('ha-rgw.')
for _, dm in self.daemons.items():
return True
return False
+ def host_needs_autotune_memory(self, host):
+ # type: (str) -> bool
+ if host in self.mgr.offline_hosts:
+ logger.debug(f'Host "{host}" marked as offline. Skipping autotune')
+ return False
+ cutoff = datetime_now() - datetime.timedelta(
+ seconds=self.mgr.autotune_interval)
+ if host not in self.last_autotune or self.last_autotune[host] < cutoff:
+ return True
+ return False
+
def host_had_daemon_refresh(self, host: str) -> bool:
"""
... at least once.
default=10,
desc='max number of daemons per service per host',
),
+ Option(
+ 'autotune_memory_target_ratio',
+ type='float',
+ default=.7,
+ desc='ratio of total system memory to divide amongst autotuned daemons'
+ ),
+ Option(
+ 'autotune_interval',
+ type='secs',
+ default=10 * 60,
+ desc='how frequently to autotune daemon memory'
+ ),
]
def __init__(self, *args: Any, **kwargs: Any):
self.registry_password: Optional[str] = None
self.use_repo_digest = True
self.default_registry = ''
+ self.autotune_memory_target_ratio = 0.0
+ self.autotune_interval = 0
self._cons: Dict[str, Tuple[remoto.backends.BaseConnection,
remoto.backends.LegacyModuleExecute]] = {}
DaemonDescriptionStatus, daemon_type_to_service
from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
from cephadm.schedule import HostAssignment
+from cephadm.autotune import MemoryAutotuner
from cephadm.utils import forall_hosts, cephadmNoImage, is_repo_digest, \
CephadmNoImage, CEPH_TYPES, ContainerInspectInfo
from mgr_module import MonCommandFailed
+from mgr_util import format_bytes
from . import utils
del self.mgr.health_checks['CEPHADM_PAUSED']
self.mgr.set_health_checks(self.mgr.health_checks)
+ def _autotune_host_memory(self, host: str) -> None:
+ total_mem = self.mgr.cache.get_facts(host).get('memory_total_kb', 0)
+ if not total_mem:
+ val = None
+ else:
+ total_mem *= 1024 # kb -> bytes
+ total_mem *= self.mgr.autotune_memory_target_ratio
+ a = MemoryAutotuner(
+ daemons=self.mgr.cache.get_daemons_by_host(host),
+ config_get=self.mgr.get_foreign_ceph_option,
+ total_mem=total_mem,
+ )
+ val, osds = a.tune()
+ any_changed = False
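+            # Remove any per-daemon override that differs, so the
+            # host-level setting below takes effect.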
+ for o in osds:
+ if self.mgr.get_foreign_ceph_option(o, 'osd_memory_target') != val:
+ self.mgr.check_mon_command({
+ 'prefix': 'config rm',
+ 'who': o,
+ 'name': 'osd_memory_target',
+ })
+ any_changed = True
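+        # Apply the computed host-wide target, or clear it if no value
+        # could be computed.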
+ if val is not None:
+ if any_changed:
+ self.mgr.log.info(
+ f'Adjusting osd_memory_target on {host} to {format_bytes(val, 6)}'
+ )
+ ret, out, err = self.mgr.mon_command({
+ 'prefix': 'config set',
+ 'who': f'osd/host:{host}',
+ 'name': 'osd_memory_target',
+ 'value': str(val),
+ })
+ if ret:
+ self.log.warning(
+ f'Unable to set osd_memory_target on {host} to {val}: {err}'
+ )
+ else:
+ self.mgr.check_mon_command({
+ 'prefix': 'config rm',
+ 'who': f'osd/host:{host}',
+ 'name': 'osd_memory_target',
+ })
+ self.mgr.cache.update_autotune(host)
+
def _refresh_hosts_and_daemons(self) -> None:
bad_hosts = []
failures = []
if r:
failures.append(r)
+ if self.mgr.cache.host_needs_autotune_memory(host):
+ self.log.debug(f"autotuning memory for {host}")
+ self._autotune_host_memory(host)
+
# client files
updated_files = False
old_files = self.mgr.cache.get_host_client_files(host).copy()
def _run_cephadm(ret):
- def foo(*args, **kwargs):
+ def foo(s, host, entity, cmd, e, **kwargs):
+ if cmd == 'gather-facts':
+ return '{}', '', 0
return [ret], '', 0
return foo
--- /dev/null
+# Disable autopep8 for this file:
+
+# fmt: off
+
+import pytest
+
+from cephadm.autotune import MemoryAutotuner
+from orchestrator import DaemonDescription
+
+
+@pytest.mark.parametrize("total,daemons,config,result",
+ [ # noqa: E128
+ (
+ 128 * 1024 * 1024 * 1024,
+ [],
+ {},
+ None,
+ ),
+ (
+ 128 * 1024 * 1024 * 1024,
+ [
+ DaemonDescription('osd', '1', 'host1'),
+ DaemonDescription('osd', '2', 'host1'),
+ ],
+ {},
+ 64 * 1024 * 1024 * 1024,
+ ),
+ (
+ 128 * 1024 * 1024 * 1024,
+ [
+ DaemonDescription('osd', '1', 'host1'),
+ DaemonDescription('osd', '2', 'host1'),
+ DaemonDescription('osd', '3', 'host1'),
+ ],
+ {
+ 'osd.3': 16 * 1024 * 1024 * 1024,
+ },
+ 56 * 1024 * 1024 * 1024,
+ ),
+ (
+ 128 * 1024 * 1024 * 1024,
+ [
+ DaemonDescription('mgr', 'a', 'host1'),
+ DaemonDescription('osd', '1', 'host1'),
+ DaemonDescription('osd', '2', 'host1'),
+ ],
+ {},
+ 62 * 1024 * 1024 * 1024,
+ )
+ ])
+def test_autotune(total, daemons, config, result):
+ def fake_getter(who, opt):
+ if opt == 'osd_memory_target_autotune':
+ if who in config:
+ return False
+ else:
+ return True
+ if opt == 'osd_memory_target':
+ return config.get(who, 4 * 1024 * 1024 * 1024)
+ if opt == 'mds_cache_memory_limit':
+ return 16 * 1024 * 1024 * 1024
+
+ a = MemoryAutotuner(
+ total_mem=total,
+ daemons=daemons,
+ config_get=fake_getter,
+ )
+ val, osds = a.tune()
+ assert val == result