osd: add max-pg-per-osd limit
author     Kefu Chai <kchai@redhat.com>
           Tue, 26 Sep 2017 07:54:14 +0000 (15:54 +0800)
committer  Kefu Chai <kchai@redhat.com>
           Tue, 17 Oct 2017 15:08:40 +0000 (23:08 +0800)
osd will refuse to create new pgs until its pg number is lower
than the max-pg-per-osd upper bound setting.

Signed-off-by: Kefu Chai <kchai@redhat.com>
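
A minimal sketch of the gating behaviour described above, for orientation before
reading the diff (the Python below is purely illustrative -- the names, defaults
and structure are assumptions, not the actual OSD code; the real logic is in
OSD::maybe_wait_for_max_pg() and OSD::resume_creating_pg() further down):

    # Illustrative sketch only; not the actual OSD implementation.
    class MiniOSD(object):
        def __init__(self, mon_max_pg_per_osd=200, hard_ratio=2.0):
            # hard limit = mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio
            self.hard_limit = mon_max_pg_per_osd * hard_ratio
            self.pg_map = set()           # PGs this OSD currently serves
            self.pending_creates = set()  # creations withheld by the limit

        def handle_pg_create(self, pgid):
            if len(self.pg_map) >= self.hard_limit:
                # refuse to instantiate the PG for now and remember it
                self.pending_creates.add(pgid)
                return False
            self.pg_map.add(pgid)
            return True

        def tick(self):
            # called periodically: resume withheld creations once enough PGs
            # have been removed to drop back below the hard limit
            while self.pending_creates and len(self.pg_map) < self.hard_limit:
                self.pg_map.add(self.pending_creates.pop())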
doc/rados/configuration/pool-pg-config-ref.rst
qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml [new file with mode: 0644]
qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml [new file with mode: 0644]
qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml [new file with mode: 0644]
qa/tasks/ceph_manager.py
qa/tasks/osd_max_pg_per_osd.py [new file with mode: 0644]
src/common/options.cc
src/osd/OSD.cc
src/osd/OSD.h

diff --git a/doc/rados/configuration/pool-pg-config-ref.rst b/doc/rados/configuration/pool-pg-config-ref.rst
index 1566d7e6365feb8779acea0a30869b24a9399efd..98e59affe4bd46bf7d5345d36f2059a8543d69e6 100644 (file)
@@ -255,6 +255,15 @@ Ceph configuration file.
 :Type: 32-bit Integer
 :Default: ``45``
 
+``osd max pg per osd hard ratio``
+
+:Description: The ratio of the number of PGs per OSD allowed by the cluster
+              before the OSD refuses to create new PGs. The OSD stops creating
+              new PGs when the number of PGs it serves exceeds
+              ``osd max pg per osd hard ratio`` \* ``mon max pg per osd``.
+
+:Type: Float
+:Default: ``2``
 
 .. _pool: ../../operations/pools
 .. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
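
As a worked example of the formula documented in the hunk above (the numbers are
hypothetical, chosen only for illustration):

    # hypothetical values, not the shipped defaults
    mon_max_pg_per_osd = 200
    osd_max_pg_per_osd_hard_ratio = 2.0
    hard_limit = mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio
    print(hard_limit)  # 400.0 -- the OSD withholds new PG creation at this count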
diff --git a/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml b/qa/suites/rados/singleton/all/max-pg-per-osd.from-mon.yaml
new file mode 100644 (file)
index 0000000..accdd96
--- /dev/null
@@ -0,0 +1,26 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+openstack:
+  - volumes: # attached to each instance
+      count: 2
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd: 2
+        osd max pg per osd hard ratio: 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: True
+    pg_num: 2
diff --git a/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml b/qa/suites/rados/singleton/all/max-pg-per-osd.from-primary.yaml
new file mode 100644 (file)
index 0000000..1c48ada
--- /dev/null
@@ -0,0 +1,31 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd: 1
+        osd max pg per osd hard ratio: 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+      - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: False
+    pg_num: 1
+    pool_size: 2
+    from_primary: True
diff --git a/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml b/qa/suites/rados/singleton/all/max-pg-per-osd.from-replica.yaml
new file mode 100644 (file)
index 0000000..0cf37fd
--- /dev/null
@@ -0,0 +1,31 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 10 # GB
+overrides:
+  ceph:
+    create_rbd_pool: False
+    conf:
+      mon:
+        osd pool default size: 2
+      osd:
+        mon max pg per osd: 1
+        osd max pg per osd hard ratio: 1
+    log-whitelist:
+      - \(TOO_FEW_PGS\)
+      - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+    test_create_from_mon: False
+    pg_num: 1
+    pool_size: 2
+    from_primary: False
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 55fcbe8c1487b2e65fe6cfaaa7b4c24f104a2b07..e05ea6e56d732dae2aa78a58d4913a165c4bb38c 100644 (file)
@@ -2271,6 +2271,28 @@ class CephManager:
             time.sleep(3)
         self.log("active!")
 
+    def wait_till_pg_convergence(self, timeout=None):
+        start = time.time()
+        old_stats = None
+        while True:
+            # strictly speaking, there is no need to wait for the mon. but due
+            # to the "ms inject socket failures" setting, the osdmap could be
+            # delayed, so the mgr is likely to ignore the pg-stat messages for
+            # pgs serving newly created pools that are not yet known to it. so,
+            # to make sure the mgr is updated with the latest pg-stats, waiting
+            # for mon/mgr is necessary.
+            self.flush_all_pg_stats()
+            new_stats = dict((stat['pgid'], stat['state'])
+                             for stat in self.get_pg_stats())
+            if old_stats == new_stats:
+                return old_stats
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'failed to reach convergence before %d secs' % timeout
+            old_stats = new_stats
+            # longer than mgr_stats_period
+            time.sleep(5 + 1)
+
     def mark_out_osd(self, osd):
         """
         Wrapper to mark osd out.
diff --git a/qa/tasks/osd_max_pg_per_osd.py b/qa/tasks/osd_max_pg_per_osd.py
new file mode 100644 (file)
index 0000000..b4e2aa4
--- /dev/null
@@ -0,0 +1,126 @@
+import logging
+import random
+
+
+log = logging.getLogger(__name__)
+
+
+def pg_num_in_all_states(pgs, *states):
+    return sum(1 for state in pgs.itervalues()
+               if all(s in state for s in states))
+
+
+def pg_num_in_any_state(pgs, *states):
+    return sum(1 for state in pgs.itervalues()
+               if any(s in state for s in states))
+
+
+def test_create_from_mon(ctx, config):
+    """
+    osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops below the setting.
+
+    How it works::
+    0. set the hard limit of pg-per-osd to "2"
+    1. create pool.a with pg_num=2
+       # all pgs should be active+clean
+    2. create pool.b with pg_num=2
+       # new pgs belonging to this pool should be unknown (the primary osd
+       # reaches the limit) or creating (a replica osd reaches the limit)
+    3. remove pool.a
+    4. all pgs belonging to pool.b should be active+clean
+    """
+    pg_num = config.get('pg_num', 2)
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    manager.wait_for_clean()
+    assert manager.get_num_active_clean() == pg_num
+
+    log.info('2. creating pool.b')
+    pool_b = manager.create_pool_with_unique_name(pg_num)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+    pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+    assert pg_pending == pg_num
+
+    log.info('3. removing pool.a')
+    manager.remove_pool(pool_a)
+    pg_states = manager.wait_till_pg_convergence(300)
+    assert len(pg_states) == pg_num
+    pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+    assert pg_created == pg_num
+
+    # cleanup
+    manager.remove_pool(pool_b)
+
+
+def test_create_from_peer(ctx, config):
+    """
+    osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops below the setting.
+
+    How it works::
+    0. create 4 OSDs.
+    1. create pool.a with pg_num=1, size=2
+       the pg will be mapped to osd.0 and osd.1, and it should be active+clean
+    2. create pool.b with pg_num=1, size=2.
+       if the pgs get stuck in creating, delete the pool and try again;
+       eventually we'll get the pool to land on the other 2 osds that
+       aren't occupied by pool.a. (this will also verify that pgs for deleted
+       pools get cleaned out of the creating wait list.)
+    3. mark an osd out. verify that some pgs get stuck stale or peering.
+    4. delete a pool, verify pgs go active.
+    """
+    pg_num = config.get('pg_num', 1)
+    pool_size = config.get('pool_size', 2)
+    from_primary = config.get('from_primary', True)
+
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    manager.wait_for_clean()
+    assert manager.get_num_active_clean() == pg_num
+
+    log.info('2. creating pool.b')
+    while True:
+        pool_b = manager.create_pool_with_unique_name(pg_num)
+        pg_states = manager.wait_till_pg_convergence(300)
+        pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+        assert pg_created >= pg_num
+        pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+        assert pg_pending == pg_num * 2 - pg_created
+        if pg_created == pg_num * 2:
+            break
+        manager.remove_pool(pool_b)
+
+    log.info('3. mark an osd out')
+    pg_stats = manager.get_pg_stats()
+    pg = random.choice(pg_stats)
+    if from_primary:
+        victim = pg['acting'][-1]
+    else:
+        victim = pg['acting'][0]
+    manager.mark_out_osd(victim)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering')
+    assert pg_stuck > 0
+
+    log.info('4. removing pool.b')
+    manager.remove_pool(pool_b)
+    manager.wait_for_clean(30)
+
+    # cleanup
+    manager.remove_pool(pool_a)
+
+
+def task(ctx, config):
+    assert isinstance(config, dict), \
+        'osd_max_pg_per_osd task only accepts a dict for config'
+    manager = ctx.managers['ceph']
+    if config.get('test_create_from_mon', True):
+        test_create_from_mon(ctx, config)
+    else:
+        test_create_from_peer(ctx, config)
diff --git a/src/common/options.cc b/src/common/options.cc
index 769c150779ec7b4b4089e530bef9b9a764978be9..c0d4b7383ea063a8c42cf381c18d0070a8bd3c00 100644 (file)
@@ -2604,6 +2604,13 @@ std::vector<Option> get_global_options() {
     .set_default(100)
     .set_description(""),
 
+    Option("osd_max_pg_per_osd_hard_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_min(1)
+    .set_description("Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'")
+    .set_long_description("OSD will refuse to instantiate PG if the number of PG it serves exceeds this number.")
+    .add_see_also("mon_max_pg_per_osd"),
+
     Option("osd_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(30)
     .set_description(""),
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 3c028f3f84b303e90fea23c77bea567679438af4..4a485ac74b353fb5c5ebdcded2bc15e9806cdc08 100644 (file)
@@ -4024,6 +4024,11 @@ int OSD::handle_pg_peering_evt(
       ceph_abort();
     }
 
+    const bool is_mon_create =
+      evt->get_event().dynamic_type() == PG::NullEvt::static_type();
+    if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
+      return -EAGAIN;
+    }
     // do we need to resurrect a deleting pg?
     spg_t resurrected;
     PGRef old_pg_state;
@@ -4161,6 +4166,88 @@ int OSD::handle_pg_peering_evt(
   }
 }
 
+bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
+{
+  const auto max_pgs_per_osd =
+    (cct->_conf->get_val<int64_t>("mon_max_pg_per_osd") *
+     cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+
+  RWLock::RLocker pg_map_locker{pg_map_lock};
+  if (pg_map.size() < max_pgs_per_osd) {
+    return false;
+  }
+  auto&& pending_creates_locker = guardedly_lock(pending_creates_lock);
+  if (is_mon_create) {
+    pending_creates_from_mon++;
+  } else {
+    pending_creates_from_osd.emplace(pgid.pgid);
+  }
+  dout(5) << __func__ << " withhold creation of pg " << pgid
+         << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
+  return true;
+}
+
+// to re-trigger peering, we have to twiddle the pg mapping a little bit,
+// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back
+// to the up set if pg_temp is empty, so an empty pg_temp won't work.
+static vector<int32_t> twiddle(const vector<int>& acting) {
+  if (acting.size() > 1) {
+    return {acting[0]};
+  } else {
+    vector<int32_t> twiddled(acting.begin(), acting.end());
+    twiddled.push_back(-1);
+    return twiddled;
+  }
+}
+
+void OSD::resume_creating_pg()
+{
+  bool do_sub_pg_creates = false;
+  MOSDPGTemp *pgtemp = nullptr;
+  {
+    const auto max_pgs_per_osd =
+      (cct->_conf->get_val<int64_t>("mon_max_pg_per_osd") *
+       cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+    RWLock::RLocker l(pg_map_lock);
+    if (max_pgs_per_osd <= pg_map.size()) {
+      // this could happen if admin decreases this setting before a PG is removed
+      return;
+    }
+    unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
+    auto&& locker = guardedly_lock(pending_creates_lock);
+    if (pending_creates_from_mon > 0) {
+      do_sub_pg_creates = true;
+      if (pending_creates_from_mon >= spare_pgs) {
+       spare_pgs = pending_creates_from_mon = 0;
+      } else {
+       spare_pgs -= pending_creates_from_mon;
+       pending_creates_from_mon = 0;
+      }
+    }
+    auto pg = pending_creates_from_osd.cbegin();
+    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
+      if (!pgtemp) {
+       pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
+      }
+      vector<int> acting;
+      osdmap->pg_to_up_acting_osds(*pg, nullptr, nullptr, &acting, nullptr);
+      pgtemp->pg_temp[*pg] = twiddle(acting);
+      pg = pending_creates_from_osd.erase(pg);
+      spare_pgs--;
+    }
+  }
+  if (do_sub_pg_creates) {
+    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
+      dout(4) << __func__ << ": resolicit pg creates from mon since "
+             << last_pg_create_epoch << dendl;
+      monc->renew_subs();
+    }
+  }
+  if (pgtemp) {
+    pgtemp->forced = true;
+    monc->send_mon_message(pgtemp);
+  }
+}
 
 void OSD::build_initial_pg_history(
   spg_t pgid,
@@ -4904,6 +4991,7 @@ void OSD::tick_without_osd_lock()
       sched_scrub();
     }
     service.promote_throttle_recalibrate();
+    resume_creating_pg();
     bool need_send_beacon = false;
     const auto now = ceph::coarse_mono_clock::now();
     {
@@ -7714,6 +7802,16 @@ void OSD::consume_map()
 
       pg->unlock();
     }
+
+    auto&& pending_create_locker = guardedly_lock(pending_creates_lock);
+    for (auto pg = pending_creates_from_osd.cbegin();
+        pg != pending_creates_from_osd.cend();) {
+      if (osdmap->get_pg_acting_rank(*pg, whoami) < 0) {
+       pg = pending_creates_from_osd.erase(pg);
+      } else {
+       ++pg;
+      }
+    }
   }
 
   for (list<PGRef>::iterator i = to_remove.begin();
@@ -8038,7 +8136,6 @@ void OSD::handle_pg_create(OpRequestRef op)
               << dendl;
       continue;
     }
-
     if (handle_pg_peering_evt(
           pgid,
           history,
@@ -8053,7 +8150,12 @@ void OSD::handle_pg_create(OpRequestRef op)
       service.send_pg_created(pgid.pgid);
     }
   }
-  last_pg_create_epoch = m->epoch;
+
+  with_unique_lock(pending_creates_lock, [=]() {
+      if (pending_creates_from_mon == 0) {
+       last_pg_create_epoch = m->epoch;
+      }
+    });
 
   maybe_update_heartbeat_peers();
 }
@@ -8721,7 +8823,6 @@ void OSD::_remove_pg(PG *pg)
   pg->put("PGMap"); // since we've taken it out of map
 }
 
-
 // =========================================================
 // RECOVERY
 
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 4895bcbdab47fd7d3c9fb873098a2afd43c5f28b..62e2f0149e607bdefde5a83f972e126c5e8b19f0 100644 (file)
@@ -1910,6 +1910,10 @@ protected:
   RWLock pg_map_lock; // this lock orders *above* individual PG _locks
   ceph::unordered_map<spg_t, PG*> pg_map; // protected by pg_map lock
 
+  std::mutex pending_creates_lock;
+  std::set<pg_t> pending_creates_from_osd;
+  unsigned pending_creates_from_mon = 0;
+
   map<spg_t, list<PG::CephPeeringEvtRef> > peering_wait_for_split;
   PGRecoveryStats pg_recovery_stats;
 
@@ -1959,7 +1963,9 @@ protected:
     const PastIntervals& pi,
     epoch_t epoch,
     PG::CephPeeringEvtRef evt);
-  
+  bool maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create);
+  void resume_creating_pg();
+
   void load_pgs();
 
   /// build initial pg history and intervals on create