Automatic balancing
-------------------
-When the balancer is in ``upmap`` mode, the automatic balancing feature is
-enabled by default. For more details, see :ref:`upmap`. To disable the
-balancer, run the following command:
+When the balancer is in ``upmap`` mode, which is the default, the automatic
+upmap balancing feature is enabled. For more details, see :ref:`upmap`.
+To disable the balancer, run the following command:
.. prompt:: bash $
``crush-compat`` mode, the balancer automatically makes small changes to the
data distribution in order to ensure that OSDs are utilized equally.
+Additional modes include ``upmap-read`` and ``read``. ``upmap-read`` mode
+combines the upmap balancer with the read balancer so that both writes
+and reads are optimized. ``read`` mode can be used when only read optimization
+is desired. For more details, see :ref:`read_balancer`.
Throttling
----------
Modes
-----
-There are two supported balancer modes:
+There are four supported balancer modes:
#. **crush-compat**. This mode uses the compat weight-set feature (introduced
in Luminous) to manage an alternative set of weights for devices in the
To use ``upmap``, all clients must be Luminous or newer.
-The default mode is ``upmap``. The mode can be changed to ``crush-compat`` by
-running the following command:
+#. **read**. In Reef and later releases, the OSDMap can store explicit
+   mappings for individual primary OSDs as exceptions to the normal CRUSH
+   placement calculation. These ``pg-upmap-primary`` entries provide
+   fine-grained control over primary PG mappings. This mode optimizes the
+   placement of primary PGs in order to achieve balanced reads across the
+   cluster. In ``read`` mode, upmap behavior is not exercised, so this mode
+   is best for use cases in which only read balancing is desired. (Primary
+   mapping exceptions can also be managed by hand; see the example after
+   this list.)
+
+ To use ``pg-upmap-primary``, all clients must be Reef or newer. For more
+ details about client compatibility, see :ref:`read_balancer`.
+
+#. **upmap-read**. This balancer mode combines the optimization benefits of
+   both ``upmap`` and ``read`` modes. As in ``read`` mode, ``upmap-read``
+   makes use of ``pg-upmap-primary``. As such, only Reef and later clients
+ are compatible. For more details about client compatibility, see
+ :ref:`read_balancer`.
+
+   ``upmap-read`` is highly recommended: it provides both the balanced PG
+   distribution of ``upmap`` mode and the balanced reads of ``read`` mode.
+
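+Primary PG mapping exceptions can also be set and removed by hand. The
+following commands are the same commands that the read balancer itself
+issues, shown here with a hypothetical PG ID ``2.3`` and OSD ID ``1``:
+
+   .. prompt:: bash $
+
+      ceph osd pg-upmap-primary 2.3 1
+
+To remove the exception, run the following command:
+
+   .. prompt:: bash $
+
+      ceph osd rm-pg-upmap-primary 2.3
+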
+The default mode is ``upmap``. The mode can be changed to ``crush-compat`` by
+running the following command:
.. prompt:: bash $
ceph balancer mode crush-compat
+The mode can be changed to ``read`` by running the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer mode read
+
+The mode can be changed to ``upmap-read`` by running the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer mode upmap-read
+
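+After changing the mode, the active mode can be confirmed with ``ceph
+balancer status``, and the distribution scores can be inspected with ``ceph
+balancer eval``, which also reports the per-pool read balance scores for
+replicated pools:
+
+   .. prompt:: bash $
+
+      ceph balancer status
+      ceph balancer eval
+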
Supervised optimization
-----------------------
import math
import random
import time
-from mgr_module import CLIReadCommand, CLICommand, CommandResult, MgrModule, Option, OSDMap
+from mgr_module import CLIReadCommand, CLICommand, CommandResult, MgrModule, Option, OSDMap, CephReleases
from threading import Event
from typing import cast, Any, Dict, List, Optional, Sequence, Tuple, Union
from mgr_module import CRUSHMap
none = 'none'
crush_compat = 'crush-compat'
upmap = 'upmap'
+ read = 'read'
+ upmap_read = 'upmap-read'
class Plan(object):
osdlist += [m['from'], m['to']]
ls.append('ceph osd pg-upmap-items %s %s' %
(item['pgid'], ' '.join([str(a) for a in osdlist])))
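+        # Also emit the CLI equivalents for primary mapping exceptions
+        # added or removed by the read balancer.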
+ for item in incdump.get('new_pg_upmap_primaries', []):
+ ls.append('ceph osd pg-upmap-primary %s %s' % (item['pgid'], item['primary_osd']))
+ for item in incdump.get('old_pg_upmap_primaries', []):
+ ls.append('ceph osd rm-pg-upmap-primary %s' % item['pgid'])
return '\n'.join(ls)
self.score = 0.0
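+        # Per-pool read balance scores from the OSDMap; populated only for
+        # replicated pools (other pools carry no read_balance info).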
+ self.read_balance_score_by_pool: Dict[str, Dict[str, float]] = {}
+ self.read_balance_score_acting_by_pool: Dict[str, float] = {}
+
def show(self, verbose: bool = False) -> str:
if verbose:
r = self.ms.desc + '\n'
r += 'stats_by_root %s\n' % self.stats_by_root
r += 'score_by_pool %s\n' % self.score_by_pool
r += 'score_by_root %s\n' % self.score_by_root
+ r += 'score %f (lower is better)\n' % self.score
+ r += 'read_balance_score_by_pool %s\n' % self.read_balance_score_by_pool
else:
r = self.ms.desc + ' '
- r += 'score %f (lower is better)\n' % self.score
+ r += 'score %f (lower is better)\n' % self.score
+ r += 'read_balance_scores (lower is better) %s\n' % self.read_balance_score_acting_by_pool
return r
def calc_stats(self, count, target, total):
Option(name='mode',
desc='Balancer mode',
default='upmap',
- enum_allowed=['none', 'crush-compat', 'upmap'],
+ enum_allowed=['none', 'crush-compat', 'upmap', 'read', 'upmap-read'],
runtime=True),
Option(name='sleep_interval',
type='secs',
self.get("pool_stats"),
'initialize compat weight-set')
self.get_compat_weight_set_weights(ms) # ignore error
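+        # pg-upmap-primary mappings are understood only by Reef and later
+        # clients, so refuse read/upmap-read unless min_compat_client allows it.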
+ elif (mode == Mode.read) or (mode == Mode.upmap_read):
+ try:
+ release = CephReleases[min_compat_client]
+ if release.value < CephReleases.reef.value:
+ warn = ('min_compat_client "%s" '
+ '< "reef", which is required for pg-upmap-primary. '
+ 'Try "ceph osd set-require-min-compat-client reef" '
+ 'before enabling this mode' % min_compat_client)
+ return (-errno.EPERM, '', warn)
+ except KeyError:
+ self.log.error('Unable to apply mode {} due to unknown min_compat_client {}'.format(mode, min_compat_client))
+ warn = ('Unable to apply mode {} due to unknown min_compat_client {}.'.format(mode, min_compat_client))
+ return (-errno.EPERM, '', warn)
self.set_module_option('mode', mode.value)
return (0, '', '')
'objects': objects,
'bytes': bytes,
}
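+            # Record the read balance scores the OSDMap reports for this pool.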
+ try:
+ read_balance_scores = pi['read_balance']
+ pe.read_balance_score_acting_by_pool[pool] = read_balance_scores['score_acting']
+ pe.read_balance_score_by_pool[pool] = {
+ 'score_acting': read_balance_scores['score_acting'],
+ 'score_stable': read_balance_scores['score_stable'],
+ 'optimal_score': read_balance_scores['optimal_score'],
+ 'raw_score_acting': read_balance_scores['raw_score_acting'],
+ 'raw_score_stable': read_balance_scores['raw_score_stable'],
+ 'primary_affinity_weighted': read_balance_scores['primary_affinity_weighted'],
+ 'average_primary_affinity': read_balance_scores['average_primary_affinity'],
+ 'average_primary_affinity_weighted': read_balance_scores['average_primary_affinity_weighted']
+ }
+ except KeyError:
+ self.log.debug("Skipping pool '{}' since it does not have a read_balance_score, "
+ "likely because it is not replicated.".format(pool))
+
for root in pe.total_by_root:
pe.count_by_root[root] = {
'pgs': {
return self.do_upmap(plan)
elif plan.mode == 'crush-compat':
return self.do_crush_compat(cast(MsPlan, plan))
+ elif plan.mode == 'read':
+ return self.do_read_balancing(plan)
+ elif plan.mode == 'upmap-read':
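+            # Run both optimizers; fail only if neither the upmap pass nor
+            # the read pass produced any changes.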
+ r_upmap, detail_upmap = self.do_upmap(plan)
+ r_read, detail_read = self.do_read_balancing(plan)
+ if (r_upmap < 0) and (r_read < 0):
+ return r_upmap, detail_upmap
+ return 0, ''
elif plan.mode == 'none':
detail = 'Please do "ceph balancer mode" to choose a valid mode first'
self.log.info('Idle')
self.log.info(detail)
return -errno.EINVAL, detail
+ def do_read_balancing(self, plan: Plan) -> Tuple[int, str]:
+ self.log.info('do_read_balancing')
+ osdmap_dump = plan.osdmap_dump
+ msg = 'Unable to find further optimization, ' \
+ 'or distribution is already perfect'
+
+ if len(plan.pools):
+ pools = plan.pools
+ else: # all
+ pools = [str(i['pool_name']) for i in osdmap_dump.get('pools', [])]
+ if len(pools) == 0:
+ detail = 'No pools available'
+ self.log.info(detail)
+ return -errno.ENOENT, detail
+ self.log.info('pools %s' % pools)
+
+ adjusted_pools = []
+ inc = plan.inc
+ total_num_changes = 0
+ pools_with_pg_merge = []
+ crush_rule_by_pool_name = {}
+ no_read_balance_info = []
+ replicated_pools_with_optimal_score = []
+ rb_error_message = {}
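+        # Scan every pool once, recording reasons to skip it: pending PG
+        # merges, missing read_balance info, an already-optimal score, or an
+        # error reported by the OSDMap.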
+ for p in osdmap_dump.get('pools', []):
+ for pool_pg_status in plan.pg_status.get('pgs_by_pool_state', []):
+ if pool_pg_status['pool_id'] != p['pool']:
+ continue
+ for state in pool_pg_status['pg_state_counts']:
+ if state['state_name'] != 'active+clean':
+ msg = "Not all PGs are active+clean; try again later."
+ return -errno.EALREADY, msg
+ if p['pg_num'] > p['pg_num_target']:
+ pools_with_pg_merge.append(p['pool_name'])
+ crush_rule_by_pool_name[p['pool_name']] = p['crush_rule']
+ if 'read_balance' not in p:
+ no_read_balance_info.append(p['pool_name'])
+ if 'read_balance' in p:
+ if 'error_message' in p['read_balance']:
+ rb_error_message[p['pool_name']] = p['read_balance']['error_message']
+ elif p['read_balance']['score_acting'] == p['read_balance']['optimal_score']:
+ replicated_pools_with_optimal_score.append(p['pool_name'])
+ for pool in pools:
+ if pool not in crush_rule_by_pool_name:
+ self.log.debug('pool %s does not exist' % pool)
+ continue
+ if pool in pools_with_pg_merge:
+ self.log.debug('pool %s has pending PG(s) for merging, skipping for now' % pool)
+ continue
+ if pool in no_read_balance_info:
+ self.log.debug('pool %s has no read_balance information, skipping' % pool)
+ continue
+ if pool in replicated_pools_with_optimal_score:
+ self.log.debug('pool %s is already balanced, skipping' % pool)
+ continue
+ if pool in rb_error_message:
+ self.log.error(rb_error_message[pool])
+ continue
+ adjusted_pools.append(pool)
+ pool_dump = osdmap_dump.get('pools', [])
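+        # For each remaining pool, have the OSDMap compute pg-upmap-primary
+        # changes that move primaries toward a balanced read distribution.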
+ for pool in adjusted_pools:
+ for p in pool_dump:
+ if p['pool_name'] == pool:
+ pool_id = p['pool']
+ break
+ num_changes = plan.osdmap.balance_primaries(pool_id, inc)
+ total_num_changes += num_changes
+ if total_num_changes < 0:
+ self.no_optimization_needed = True
+ self.log.debug('unable to balance reads.')
+ return -errno.EALREADY, msg
+ self.log.info('prepared {} read changes'.format(total_num_changes))
+ if total_num_changes == 0:
+ self.no_optimization_needed = True
+ return -errno.EALREADY, msg
+ return 0, ''
+
def do_upmap(self, plan: Plan) -> Tuple[int, str]:
self.log.info('do_upmap')
max_optimizations = cast(float, self.get_module_option('upmap_max_optimizations'))
left -= did
if left <= 0:
break
- self.log.info('prepared %d/%d changes' % (total_did, max_optimizations))
+ self.log.info('prepared %d/%d upmap changes' % (total_did, max_optimizations))
if total_did == 0:
self.no_optimization_needed = True
return -errno.EALREADY, 'Unable to find further optimization, ' \
}), 'foo')
commands.append(result)
+ # read
+ for item in incdump.get('new_pg_upmap_primaries', []):
+ self.log.info('ceph osd pg-upmap-primary %s primary_osd %s', item['pgid'],
+ item['primary_osd'])
+ result = CommandResult('foo')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'osd pg-upmap-primary',
+ 'format': 'json',
+ 'pgid': item['pgid'],
+ 'id': item['primary_osd'],
+ }), 'foo')
+ commands.append(result)
+
# wait for commands
self.log.debug('commands %s' % commands)
for result in commands:
'active': self.active,
'mode': self.mode,
}
+
+ def self_test(self) -> None:
+ # turn balancer on
+ self.on()
+
+ # Get min-compat-client
+ min_compat_client = self.get_osdmap().dump().get('require_min_compat_client', '')
+ release = CephReleases[min_compat_client]
+
+ # Check upmap mode warning
+ r, _, warn = self.set_mode(Mode.upmap)
+ if release.value < CephReleases.luminous.value:
+ if r >= 0:
+ raise RuntimeError('upmap mode did not properly warn about min_compat_client')
+ if warn == '':
+ raise RuntimeError('upmap mode warning is empty when it should not be.')
+
+ # Check read mode warning
+ r, _, warn = self.set_mode(Mode.read)
+ if release.value < CephReleases.reef.value:
+ if r >= 0:
+ raise RuntimeError('read mode did not properly warn about min_compat_client')
+ if warn == '':
+ raise RuntimeError('read mode warning is empty when it should not be.')
+
+        # Check upmap-read mode warning
+        r, _, warn = self.set_mode(Mode.upmap_read)
+        if release.value < CephReleases.reef.value:
+ if r >= 0:
+ raise RuntimeError('upmap-read mode did not properly warn about min_compat_client')
+ if warn == '':
+ raise RuntimeError('upmap-read mode warning is empty when it should not be.')
+
+ # Check status
+ r, status, _ = self.show_status()
+ if r < 0:
+ raise RuntimeError('Balancer status was unsuccessful')
+ if status == '':
+ raise RuntimeError('Balancer status was empty')
+
+ # Turn off
+ self.off()