From ae7823187186310bac117fac437d991398cffd5e Mon Sep 17 00:00:00 2001
From: Kefu Chai
Date: Tue, 26 Sep 2017 15:54:14 +0800
Subject: [PATCH] osd: add max-pg-per-osd limit

osd will refuse to create new pgs until its pg number is lower than
the max-pg-per-osd upper bound setting.

Signed-off-by: Kefu Chai
(cherry picked from commit 4c7df944c7f28232873ba681eedce72cdb062ea5)
---
 .../configuration/pool-pg-config-ref.rst      |   9 ++
 .../all/max-pg-per-osd.from-mon.yaml          |  26 ++++
 .../all/max-pg-per-osd.from-primary.yaml      |  31 +++++
 .../all/max-pg-per-osd.from-replica.yaml      |  31 +++++
 qa/tasks/ceph_manager.py                      |  22 +++
 qa/tasks/osd_max_pg_per_osd.py                | 126 ++++++++++++++++++
 src/common/options.cc                         |   7 +
 src/osd/OSD.cc                                | 107 ++++++++++++++-
 src/osd/OSD.h                                 |   8 +-
 9 files changed, 363 insertions(+), 4 deletions(-)
 create mode 100644 qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml
 create mode 100644 qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml
 create mode 100644 qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml
 create mode 100644 qa/tasks/osd_max_pg_per_osd.py

diff --git a/doc/rados/configuration/pool-pg-config-ref.rst b/doc/rados/configuration/pool-pg-config-ref.rst
index dd416edfa3826..9811b3bd3f104 100644
--- a/doc/rados/configuration/pool-pg-config-ref.rst
+++ b/doc/rados/configuration/pool-pg-config-ref.rst
@@ -255,6 +255,15 @@ Ceph configuration file.
 :Type: 32-bit Integer
 :Default: ``45``
 
+``osd max pg per osd hard ratio``
+
+:Description: The ratio of the number of PGs per OSD allowed by the cluster
+              before the OSD refuses to create new PGs. An OSD stops creating
+              new PGs when the number of PGs it serves exceeds
+              ``osd max pg per osd hard ratio`` \* ``mon max pg per osd``.
+
+:Type: Float
+:Default: ``2``
 
 .. _pool: ../../operations/pools
 .. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
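To make the interaction of the two options concrete, here is a minimal Python
sketch of the arithmetic described above (illustrative only, not the OSD's
actual C++ logic; the ``mon max pg per osd`` value of 200 is an assumed
example, not a default documented here)::

    def pg_creation_allowed(num_pgs_served,
                            mon_max_pg_per_osd=200,  # assumed example value
                            hard_ratio=2.0):         # default per the doc above
        # the OSD defers new PG creations once it already serves
        # hard_ratio * mon_max_pg_per_osd PGs
        hard_limit = hard_ratio * mon_max_pg_per_osd
        return num_pgs_served < hard_limit

    assert pg_creation_allowed(399)      # 399 < 400: creation proceeds
    assert not pg_creation_allowed(400)  # at the limit: creation is deferred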
diff --git a/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml b/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml
new file mode 100644
index 0000000000000..accdd964fdc56
--- /dev/null
+++ b/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml
@@ -0,0 +1,26 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+openstack:
+  - volumes: # attached to each instance
+      count: 2
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd : 2
+        osd max pg per osd hard ratio : 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: True
+    pg_num: 2
diff --git a/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml b/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml
new file mode 100644
index 0000000000000..1c48ada75c931
--- /dev/null
+++ b/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml
@@ -0,0 +1,31 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd : 1
+        osd max pg per osd hard ratio : 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+      - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: False
+    pg_num: 1
+    pool_size: 2
+    from_primary: True
diff --git a/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml b/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml
new file mode 100644
index 0000000000000..0cf37fd8ecdfc
--- /dev/null
+++ b/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml
@@ -0,0 +1,31 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd : 1
+        osd max pg per osd hard ratio : 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+      - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: False
+    pg_num: 1
+    pool_size: 2
+    from_primary: False
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 76b1efd61ecc9..9bed608d069ce 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -2320,6 +2320,28 @@ class CephManager:
             time.sleep(3)
         self.log("active!")
 
+    def wait_till_pg_convergence(self, timeout=None):
+        start = time.time()
+        old_stats = None
+        while True:
+            # strictly speaking, no need to wait for mon. but due to the
+            # "ms inject socket failures" setting, the osdmap could be delayed,
+            # so mgr is likely to ignore the pg-stat messages with pgs serving
+            # newly created pools which are not yet known by mgr. so, to make
+            # sure the mgr is updated with the latest pg-stats, waiting for
+            # mon/mgr is necessary.
+            self.flush_all_pg_stats()
+            new_stats = dict((stat['pgid'], stat['state'])
+                             for stat in self.get_pg_stats())
+            if old_stats == new_stats:
+                return old_stats
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'failed to reach convergence before %d secs' % timeout
+            old_stats = new_stats
+            # longer than mgr_stats_period
+            time.sleep(5 + 1)
+
     def mark_out_osd(self, osd):
         """
         Wrapper to mark osd out.
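wait_till_pg_convergence() declares convergence when two consecutive
snapshots of the pg states agree. A self-contained sketch of that polling
pattern (get_stats is a hypothetical callable standing in for
CephManager.get_pg_stats)::

    import time

    def wait_till_convergence(get_stats, timeout=60, interval=1):
        # poll until two consecutive snapshots of pgid -> state agree
        start = time.time()
        old_stats = None
        while True:
            new_stats = dict((stat['pgid'], stat['state'])
                             for stat in get_stats())
            if old_stats == new_stats:
                return old_stats
            if timeout is not None:
                assert time.time() - start < timeout, \
                    'failed to reach convergence before %d secs' % timeout
            old_stats = new_stats
            time.sleep(interval)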
diff --git a/qa/tasks/osd_max_pg_per_osd.py b/qa/tasks/osd_max_pg_per_osd.py
new file mode 100644
index 0000000000000..b4e2aa4deed45
--- /dev/null
+++ b/qa/tasks/osd_max_pg_per_osd.py
@@ -0,0 +1,126 @@
+import logging
+import random
+
+
+log = logging.getLogger(__name__)
+
+
+def pg_num_in_all_states(pgs, *states):
+    return sum(1 for state in pgs.itervalues()
+               if all(s in state for s in states))
+
+
+def pg_num_in_any_state(pgs, *states):
+    return sum(1 for state in pgs.itervalues()
+               if any(s in state for s in states))
+
+
+def test_create_from_mon(ctx, config):
+    """
+    osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops below the setting.
+    How it works::
+    1. set the hard limit of pg-per-osd to "2"
+    2. create pool.a with pg_num=2
+       # all pgs should be active+clean
+    3. create pool.b with pg_num=2
+       # new pgs belonging to this pool should be unknown (the primary osd
+       # reaches the limit) or creating (a replica osd reaches the limit)
+    4. remove pool.a
+    5. all pgs belonging to pool.b should be active+clean
+    """
+    pg_num = config.get('pg_num', 2)
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    manager.wait_for_clean()
+    assert manager.get_num_active_clean() == pg_num
+
+    log.info('2. creating pool.b')
+    pool_b = manager.create_pool_with_unique_name(pg_num)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+    pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+    assert pg_pending == pg_num
+
+    log.info('3. removing pool.a')
+    manager.remove_pool(pool_a)
+    pg_states = manager.wait_till_pg_convergence(300)
+    assert len(pg_states) == pg_num
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+
+    # cleanup
+    manager.remove_pool(pool_b)
+
+
+def test_create_from_peer(ctx, config):
+    """
+    osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops below the setting.
+
+    How it works::
+    0. create 4 OSDs.
+    1. create pool.a with pg_num=1, size=2
+       pg will be mapped to osd.0 and osd.1, and it should be active+clean
+    2. create pool.b with pg_num=1, size=2.
+       if the pgs get stuck in creating, delete the pool and try again;
+       eventually we'll get the pool to land on the other 2 osds that
+       aren't occupied by pool.a. (this will also verify that pgs for deleted
+       pools get cleaned out of the creating wait list.)
+    3. mark an osd out. verify that some pgs get stuck stale or peering.
+    4. delete a pool, verify pgs go active.
+    """
+    pg_num = config.get('pg_num', 1)
+    pool_size = config.get('pool_size', 2)
+    from_primary = config.get('from_primary', True)
+
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    manager.wait_for_clean()
+    assert manager.get_num_active_clean() == pg_num
+
+    log.info('2. creating pool.b')
+    while True:
+        pool_b = manager.create_pool_with_unique_name(pg_num)
+        pg_states = manager.wait_till_pg_convergence(300)
+        pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+        assert pg_created >= pg_num
+        pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+        assert pg_pending == pg_num * 2 - pg_created
+        if pg_created == pg_num * 2:
+            break
+        manager.remove_pool(pool_b)
+
+    log.info('3. mark an osd out')
+    pg_stats = manager.get_pg_stats()
+    pg = random.choice(pg_stats)
+    if from_primary:
+        victim = pg['acting'][-1]
+    else:
+        victim = pg['acting'][0]
+    manager.mark_out_osd(victim)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering')
+    assert pg_stuck > 0
+
+    log.info('4. removing pool.b')
+    manager.remove_pool(pool_b)
+    manager.wait_for_clean(30)
+
+    # cleanup
+    manager.remove_pool(pool_a)
+
+
+def task(ctx, config):
+    assert isinstance(config, dict), \
+        'osd_max_pg_per_osd task only accepts a dict for config'
+    manager = ctx.managers['ceph']
+    if config.get('test_create_from_mon', True):
+        test_create_from_mon(ctx, config)
+    else:
+        test_create_from_peer(ctx, config)
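The two counting helpers in this file test substring membership of each
requested state token against the pg's state string. A quick usage
illustration on a hand-made pgid-to-state map (the values mirror what
wait_till_pg_convergence() returns; itervalues() is Python 2, matching the
file above)::

    pgs = {'1.0': 'active+clean',
           '1.1': 'active+clean',
           '2.0': 'unknown',
           '2.1': 'creating+peering'}

    # a pg counts only if its state contains *every* requested token
    assert pg_num_in_all_states(pgs, 'active', 'clean') == 2
    # a pg counts if its state contains *any* requested token
    assert pg_num_in_any_state(pgs, 'unknown', 'creating') == 2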
diff --git a/src/common/options.cc b/src/common/options.cc
index 71f58a35f1b7d..f0643e100ba86 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -2569,6 +2569,13 @@ std::vector
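For reference, task() above simply dispatches on the config dict that
teuthology assembles from the suite YAML; a hedged sketch of the equivalent
direct call (the ctx object is supplied by teuthology and only hinted at
here)::

    config = {'test_create_from_mon': False,  # as in max-pg-per-osd.from-primary.yaml
              'pg_num': 1,
              'pool_size': 2,
              'from_primary': True}
    task(ctx, config)  # routes to test_create_from_peer()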