:Type: 32-bit Integer
:Default: ``45``
+``osd max pg per osd hard ratio``
+
+:Description: The ratio of the number of PGs per OSD allowed by the cluster
+              before the OSD refuses to create new PGs. An OSD stops creating
+              new PGs once the number of PGs it serves exceeds
+              ``osd max pg per osd hard ratio`` \* ``mon max pg per osd``.
+
+:Type: Float
+:Default: ``2``
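+
+For example, assuming ``mon max pg per osd`` is set to ``200`` and this
+ratio is left at its default of ``2``, an OSD refuses to create new PGs once
+it already serves ``200 * 2 = 400`` PGs.
+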
.. _pool: ../../operations/pools
.. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+openstack:
+ - volumes: # attached to each instance
+ count: 2
+ size: 10 # GB
+overrides:
+ ceph:
+ create_rbd_pool: False
+ conf:
+ mon:
+ osd pool default size: 2
+ osd:
+ mon max pg per osd : 2
+ osd max pg per osd hard ratio : 1
+ log-whitelist:
+ - \(TOO_FEW_PGS\)
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+ test_create_from_mon: True
+ pg_num: 2
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
+overrides:
+ ceph:
+ create_rbd_pool: False
+ conf:
+ mon:
+ osd pool default size: 2
+ osd:
+ mon max pg per osd : 1
+ osd max pg per osd hard ratio : 1
+ log-whitelist:
+ - \(TOO_FEW_PGS\)
+ - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+ test_create_from_mon: False
+ pg_num: 1
+ pool_size: 2
+ from_primary: True
--- /dev/null
+roles:
+- - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
+overrides:
+ ceph:
+ create_rbd_pool: False
+ conf:
+ mon:
+ osd pool default size: 2
+ osd:
+ mon max pg per osd : 1
+ osd max pg per osd hard ratio : 1
+ log-whitelist:
+ - \(TOO_FEW_PGS\)
+ - \(PG_
+tasks:
+- install:
+- ceph:
+- osd_max_pg_per_osd:
+ test_create_from_mon: False
+ pg_num: 1
+ pool_size: 2
+ from_primary: False
time.sleep(3)
self.log("active!")
+ def wait_till_pg_convergence(self, timeout=None):
+ start = time.time()
+ old_stats = None
+ while True:
+            # strictly speaking, there is no need to wait for the mon. but
+            # due to the "ms inject socket failures" setting, the osdmap
+            # could be delayed, so the mgr is likely to ignore pg-stat
+            # messages for pgs serving newly created pools it does not yet
+            # know about. so, to make sure the mgr is updated with the
+            # latest pg-stats, waiting for the mon/mgr is necessary.
+ self.flush_all_pg_stats()
+ new_stats = dict((stat['pgid'], stat['state'])
+ for stat in self.get_pg_stats())
+ if old_stats == new_stats:
+ return old_stats
+ if timeout is not None:
+ assert time.time() - start < timeout, \
+ 'failed to reach convergence before %d secs' % timeout
+ old_stats = new_stats
+ # longer than mgr_stats_period
+ time.sleep(5 + 1)
+
def mark_out_osd(self, osd):
"""
Wrapper to mark osd out.
--- /dev/null
+import logging
+import random
+
+
+log = logging.getLogger(__name__)
+
+
+def pg_num_in_all_states(pgs, *states):
+ return sum(1 for state in pgs.itervalues()
+ if all(s in state for s in states))
+
+
+def pg_num_in_any_state(pgs, *states):
+ return sum(1 for state in pgs.itervalues()
+ if any(s in state for s in states))
+
+
+def test_create_from_mon(ctx, config):
+ """
+    the osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops below the setting
+
+    How it works::
+
+    1. set the hard limit of pg-per-osd to "2"
+    2. create pool.a with pg_num=2
+       # all pgs should be active+clean
+    3. create pool.b with pg_num=2
+       # new pgs belonging to this pool should be unknown (the primary osd
+       # reaches the limit) or creating (a replica osd reaches the limit)
+    4. remove pool.a
+    5. all pgs belonging to pool.b should be active+clean
+ """
+ pg_num = config.get('pg_num', 2)
+ manager = ctx.managers['ceph']
+ log.info('1. creating pool.a')
+ pool_a = manager.create_pool_with_unique_name(pg_num)
+ manager.wait_for_clean()
+ assert manager.get_num_active_clean() == pg_num
+
+ log.info('2. creating pool.b')
+ pool_b = manager.create_pool_with_unique_name(pg_num)
+ pg_states = manager.wait_till_pg_convergence(300)
+ pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+ assert pg_created == pg_num
+ pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+ assert pg_pending == pg_num
+
+ log.info('3. removing pool.a')
+ manager.remove_pool(pool_a)
+ pg_states = manager.wait_till_pg_convergence(300)
+ assert len(pg_states) == pg_num
+ pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+ assert pg_created == pg_num
+
+ # cleanup
+ manager.remove_pool(pool_b)
+
+
+def test_create_from_peer(ctx, config):
+ """
+    the osd should stop creating new pgs if the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops below the setting
+
+ How it works::
+ 0. create 4 OSDs.
+ 1. create pool.a with pg_num=1, size=2
+ pg will be mapped to osd.0, and osd.1, and it should be active+clean
+ 2. create pool.b with pg_num=1, size=2.
+       if the pgs are stuck in creating, delete pool.b and try again;
+       eventually we'll get the pool to land on the other 2 osds that
+       aren't occupied by pool.a. (this will also verify that pgs for deleted
+       pools get cleaned out of the creating wait list.)
+ 3. mark an osd out. verify that some pgs get stuck stale or peering.
+ 4. delete a pool, verify pgs go active.
+ """
+ pg_num = config.get('pg_num', 1)
+ pool_size = config.get('pool_size', 2)
+ from_primary = config.get('from_primary', True)
+
+ manager = ctx.managers['ceph']
+ log.info('1. creating pool.a')
+ pool_a = manager.create_pool_with_unique_name(pg_num)
+ manager.wait_for_clean()
+ assert manager.get_num_active_clean() == pg_num
+
+ log.info('2. creating pool.b')
+ while True:
+ pool_b = manager.create_pool_with_unique_name(pg_num)
+ pg_states = manager.wait_till_pg_convergence(300)
+ pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+ assert pg_created >= pg_num
+ pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+ assert pg_pending == pg_num * 2 - pg_created
+ if pg_created == pg_num * 2:
+ break
+ manager.remove_pool(pool_b)
+
+ log.info('3. mark an osd out')
+ pg_stats = manager.get_pg_stats()
+ pg = random.choice(pg_stats)
+ if from_primary:
+ victim = pg['acting'][-1]
+ else:
+ victim = pg['acting'][0]
+ manager.mark_out_osd(victim)
+ pg_states = manager.wait_till_pg_convergence(300)
+ pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering')
+ assert pg_stuck > 0
+
+ log.info('4. removing pool.b')
+ manager.remove_pool(pool_b)
+ manager.wait_for_clean(30)
+
+ # cleanup
+ manager.remove_pool(pool_a)
+
+
+def task(ctx, config):
+ assert isinstance(config, dict), \
+ 'osd_max_pg_per_osd task only accepts a dict for config'
+ manager = ctx.managers['ceph']
+ if config.get('test_create_from_mon', True):
+ test_create_from_mon(ctx, config)
+ else:
+ test_create_from_peer(ctx, config)
.set_default(100)
.set_description(""),
+ Option("osd_max_pg_per_osd_hard_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2)
+ .set_min(1)
+    .set_description("Maximum number of PGs per OSD, as a multiple of 'mon_max_pg_per_osd'")
+    .set_long_description("The OSD will refuse to instantiate new PGs if the number of PGs it serves exceeds mon_max_pg_per_osd multiplied by this ratio.")
+ .add_see_also("mon_max_pg_per_osd"),
+
Option("osd_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(30)
.set_description(""),
ceph_abort();
}
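+  // the mon asks us to instantiate a pg with a NullEvt; any other event type
+  // means the creation was triggered by a peer osd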
+ const bool is_mon_create =
+ evt->get_event().dynamic_type() == PG::NullEvt::static_type();
+ if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
+ return -EAGAIN;
+ }
// do we need to resurrect a deleting pg?
spg_t resurrected;
PGRef old_pg_state;
}
}
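+
+// Returns true if this OSD already serves at least
+// mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio PGs. In that case the
+// requested creation is recorded (per requester) so that resume_creating_pg()
+// can retry it once the PG count drops below the limit again.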
+bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
+{
+ const auto max_pgs_per_osd =
+ (cct->_conf->get_val<int64_t>("mon_max_pg_per_osd") *
+ cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+
+ RWLock::RLocker pg_map_locker{pg_map_lock};
+ if (pg_map.size() < max_pgs_per_osd) {
+ return false;
+ }
+ auto&& pending_creates_locker = guardedly_lock(pending_creates_lock);
+ if (is_mon_create) {
+ pending_creates_from_mon++;
+ } else {
+ pending_creates_from_osd.emplace(pgid.pgid);
+ }
+ dout(5) << __func__ << " withhold creation of pg " << pgid
+          << ": " << pg_map.size() << " >= " << max_pgs_per_osd << dendl;
+ return true;
+}
+
+// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
+// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
+// to up set if pg_temp is empty. so an empty pg_temp won't work.
+static vector<int32_t> twiddle(const vector<int>& acting) {
+ if (acting.size() > 1) {
+ return {acting[0]};
+ } else {
+ vector<int32_t> twiddled(acting.begin(), acting.end());
+ twiddled.push_back(-1);
+ return twiddled;
+ }
+}
+
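+// Called from the tick path: if the number of PGs we serve has dropped back
+// below the hard limit, resume the withheld creations by re-subscribing to
+// the mon's pg creates and by forcing a pg_temp change so that peers
+// re-trigger peering for the PGs they asked us to create.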
+void OSD::resume_creating_pg()
+{
+ bool do_sub_pg_creates = false;
+ MOSDPGTemp *pgtemp = nullptr;
+ {
+ const auto max_pgs_per_osd =
+ (cct->_conf->get_val<int64_t>("mon_max_pg_per_osd") *
+ cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+ RWLock::RLocker l(pg_map_lock);
+ if (max_pgs_per_osd <= pg_map.size()) {
+ // this could happen if admin decreases this setting before a PG is removed
+ return;
+ }
+ unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
+ auto&& locker = guardedly_lock(pending_creates_lock);
+ if (pending_creates_from_mon > 0) {
+ do_sub_pg_creates = true;
+ if (pending_creates_from_mon >= spare_pgs) {
+ spare_pgs = pending_creates_from_mon = 0;
+ } else {
+ spare_pgs -= pending_creates_from_mon;
+ pending_creates_from_mon = 0;
+ }
+ }
+ auto pg = pending_creates_from_osd.cbegin();
+ while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
+ if (!pgtemp) {
+ pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
+ }
+ vector<int> acting;
+ osdmap->pg_to_up_acting_osds(*pg, nullptr, nullptr, &acting, nullptr);
+ pgtemp->pg_temp[*pg] = twiddle(acting);
+ pg = pending_creates_from_osd.erase(pg);
+ spare_pgs--;
+ }
+ }
+ if (do_sub_pg_creates) {
+ if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
+ dout(4) << __func__ << ": resolicit pg creates from mon since "
+ << last_pg_create_epoch << dendl;
+ monc->renew_subs();
+ }
+ }
+ if (pgtemp) {
+ pgtemp->forced = true;
+ monc->send_mon_message(pgtemp);
+ }
+}
void OSD::build_initial_pg_history(
spg_t pgid,
sched_scrub();
}
service.promote_throttle_recalibrate();
+ resume_creating_pg();
bool need_send_beacon = false;
const auto now = ceph::coarse_mono_clock::now();
{
pg->unlock();
}
+
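+  // drop withheld peer-requested creations for pgs that no longer map to us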
+ auto&& pending_create_locker = guardedly_lock(pending_creates_lock);
+ for (auto pg = pending_creates_from_osd.cbegin();
+ pg != pending_creates_from_osd.cend();) {
+ if (osdmap->get_pg_acting_rank(*pg, whoami) < 0) {
+ pg = pending_creates_from_osd.erase(pg);
+ } else {
+ ++pg;
+ }
+ }
}
for (list<PGRef>::iterator i = to_remove.begin();
<< dendl;
continue;
}
-
if (handle_pg_peering_evt(
pgid,
history,
service.send_pg_created(pgid.pgid);
}
}
- last_pg_create_epoch = m->epoch;
+
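+  // only advance the pg-create subscription epoch if no creations from the
+  // mon are still withheld; otherwise resume_creating_pg() must be able to
+  // re-solicit them from the older epoch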
+ with_unique_lock(pending_creates_lock, [=]() {
+ if (pending_creates_from_mon == 0) {
+ last_pg_create_epoch = m->epoch;
+ }
+ });
maybe_update_heartbeat_peers();
}
pg->put("PGMap"); // since we've taken it out of map
}
-
// =========================================================
// RECOVERY
RWLock pg_map_lock; // this lock orders *above* individual PG _locks
ceph::unordered_map<spg_t, PG*> pg_map; // protected by pg_map lock
+ std::mutex pending_creates_lock;
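+  // pg creations withheld by maybe_wait_for_max_pg(), to be retried by
+  // resume_creating_pg(): pgids requested by peer osds, and the number of
+  // creations requested by the mon (both protected by pending_creates_lock)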
+ std::set<pg_t> pending_creates_from_osd;
+ unsigned pending_creates_from_mon = 0;
+
map<spg_t, list<PG::CephPeeringEvtRef> > peering_wait_for_split;
PGRecoveryStats pg_recovery_stats;
const PastIntervals& pi,
epoch_t epoch,
PG::CephPeeringEvtRef evt);
-
+ bool maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create);
+ void resume_creating_pg();
+
void load_pgs();
/// build initial pg history and intervals on create