From: Sage Weil Date: Thu, 18 May 2017 19:41:39 +0000 (-0400) Subject: mon/OSDMonitor: limit number of concurrently creating pgs X-Git-Tag: ses5-milestone6~8^2~19^2~73 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1e9c95ac9782d73a194ddde405a4bca1e8fbceb8;p=ceph.git mon/OSDMonitor: limit number of concurrently creating pgs There is overhead for PGs we are creating because the mon has to track which OSD each one current maps to. This can be problematic on a very large cluster. Limit the overhead by setting a cap on the number of PGs we are creating at once; leave the rest in a persistent queue. Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 7bfe8c0864af..85552efba638 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -271,6 +271,7 @@ OPTION(mon_osd_cache_size, OPT_INT, 10) // the size of osdmaps cache, not to re OPTION(mon_cpu_threads, OPT_INT, 4) OPTION(mon_osd_mapping_pgs_per_chunk, OPT_INT, 4096) +OPTION(mon_osd_max_creating_pgs, OPT_INT, 512) OPTION(mon_tick_interval, OPT_INT, 5) OPTION(mon_session_timeout, OPT_INT, 300) // must send keepalive or subscribe OPTION(mon_subscribe_interval, OPT_DOUBLE, 24*3600) // for legacy clients only diff --git a/src/mon/CreatingPGs.h b/src/mon/CreatingPGs.h index 46aaa7040772..f2d8ed286974 100644 --- a/src/mon/CreatingPGs.h +++ b/src/mon/CreatingPGs.h @@ -9,25 +9,48 @@ struct creating_pgs_t { epoch_t last_scan_epoch = 0; + + /// pgs we are currently creating std::map > pgs; - std::set created_pools; - unsigned create_pool(int64_t poolid, uint32_t pg_num, - epoch_t created, utime_t modified) { - if (created_pools.count(poolid)) { - return 0; + struct create_info { + epoch_t created; + utime_t modified; + uint64_t start = 0; + uint64_t end = 0; + bool done() const { + return start >= end; } - const unsigned total = pgs.size(); - for (ps_t ps = 0; ps < pg_num; ps++) { - const pg_t pgid{ps, static_cast(poolid)}; - if (pgs.count(pgid)) { - continue; - } - pgs.emplace(pgid, make_pair(created, modified)); + void encode(bufferlist& bl) const { + ::encode(created, bl); + ::encode(modified, bl); + ::encode(start, bl); + ::encode(end, bl); } - return pgs.size() - total; - } + void decode(bufferlist::iterator& p) { + ::decode(created, p); + ::decode(modified, p); + ::decode(start, p); + ::decode(end, p); + } + }; + + /// queue of pgs we still need to create (poolid -> ) + map queue; + /// pools that exist in the osdmap for which at least one pg has been created + std::set created_pools; + + void create_pool(int64_t poolid, uint32_t pg_num, + epoch_t created, utime_t modified) { + if (created_pools.count(poolid) == 0) { + auto& c = queue[poolid]; + c.created = created; + c.modified = modified; + c.end = pg_num; + created_pools.insert(poolid); + } + } unsigned remove_pool(int64_t removed_pool) { const unsigned total = pgs.size(); auto first = pgs.lower_bound(pg_t{0, (uint64_t)removed_pool}); @@ -37,22 +60,25 @@ struct creating_pgs_t { return total - pgs.size(); } void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(last_scan_epoch, bl); ::encode(pgs, bl); ::encode(created_pools, bl); + ::encode(queue, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(last_scan_epoch, bl); ::decode(pgs, bl); ::decode(created_pools, bl); + if (struct_v >= 2) + ::decode(queue, bl); DECODE_FINISH(bl); } void dump(ceph::Formatter *f) const { - f->open_object_section("creating_pgs"); f->dump_unsigned("last_scan_epoch", last_scan_epoch); + f->open_array_section("creating_pgs"); for (auto& pg : pgs) { f->open_object_section("pg"); f->dump_stream("pgid") << pg.first; @@ -61,6 +87,17 @@ struct creating_pgs_t { f->close_section(); } f->close_section(); + f->open_array_section("queue"); + for (auto& p : queue) { + f->open_object_section("pool"); + f->dump_unsigned("pool", p.first); + f->dump_unsigned("created", p.second.created); + f->dump_stream("modified") << p.second.modified; + f->dump_unsigned("ps_start", p.second.start); + f->dump_unsigned("ps_end", p.second.end); + f->close_section(); + } + f->close_section(); f->open_array_section("created_pools"); for (auto pool : created_pools) { f->dump_unsigned("pool", pool); @@ -81,4 +118,5 @@ struct creating_pgs_t { o.push_back(c); } }; +WRITE_CLASS_ENCODER(creating_pgs_t::create_info); WRITE_CLASS_ENCODER(creating_pgs_t); diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 9aa7ea6df5e7..4fc513a31979 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -942,51 +942,86 @@ void OSDMonitor::create_pending() creating_pgs_t OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc) { + dout(10) << __func__ << dendl; creating_pgs_t pending_creatings; { std::lock_guard l(creating_pgs_lock); pending_creatings = creating_pgs; } - if (pending_creatings.last_scan_epoch > inc.epoch) { - return pending_creatings; - } - if (osdmap.get_epoch() && - !osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { - const unsigned total = pending_creatings.pgs.size(); - mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch, - &pending_creatings); - dout(7) << __func__ << total - pending_creatings.pgs.size() - << " pgs added from pgmap" << dendl; - } - unsigned added = 0; - added += scan_for_creating_pgs(osdmap.get_pools(), - inc.old_pools, - inc.modified, - &pending_creatings); - added += scan_for_creating_pgs(inc.new_pools, - inc.old_pools, - inc.modified, - &pending_creatings); - dout(10) << __func__ << added << " pg added for new pools" << dendl; - for (auto deleted_pool : inc.old_pools) { - auto removed = pending_creatings.remove_pool(deleted_pool); - dout(10) << __func__ << removed - << " pg removed because containing pool deleted: " - << deleted_pool << dendl; - } - // pgmon updates its creating_pgs in check_osd_map() which is called by - // on_active() and check_osd_map() could be delayed if lease expires, so its - // creating_pgs could be stale in comparison with the one of osdmon. let's - // trim them here. otherwise, they will be added back after being erased. - unsigned removed = 0; - for (auto& pg : pending_created_pgs) { - pending_creatings.created_pools.insert(pg.pool()); - removed += pending_creatings.pgs.erase(pg); - } - pending_created_pgs.clear(); - dout(10) << __func__ << removed << " pgs removed because they're created" - << dendl; - pending_creatings.last_scan_epoch = osdmap.get_epoch(); + // check for new or old pools + if (pending_creatings.last_scan_epoch < inc.epoch) { + if (osdmap.get_epoch() && + osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) { + const unsigned total = pending_creatings.pgs.size(); + mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch, + &pending_creatings); + dout(7) << __func__ << total - pending_creatings.pgs.size() + << " pgs added from pgmap" << dendl; + } + scan_for_creating_pgs(osdmap.get_pools(), + inc.old_pools, + inc.modified, + &pending_creatings); + scan_for_creating_pgs(inc.new_pools, + inc.old_pools, + inc.modified, + &pending_creatings); + for (auto deleted_pool : inc.old_pools) { + auto removed = pending_creatings.remove_pool(deleted_pool); + dout(10) << __func__ << removed + << " pg removed because containing pool deleted: " + << deleted_pool << dendl; + last_epoch_clean.remove_pool(deleted_pool); + } + // pgmon updates its creating_pgs in check_osd_map() which is called by + // on_active() and check_osd_map() could be delayed if lease expires, so its + // creating_pgs could be stale in comparison with the one of osdmon. let's + // trim them here. otherwise, they will be added back after being erased. + unsigned removed = 0; + for (auto& pg : pending_created_pgs) { + pending_creatings.created_pools.insert(pg.pool()); + removed += pending_creatings.pgs.erase(pg); + } + pending_created_pgs.clear(); + dout(10) << __func__ << removed << " pgs removed because they're created" + << dendl; + pending_creatings.last_scan_epoch = osdmap.get_epoch(); + } + + // process queue + unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs); + while (pending_creatings.pgs.size() < max && + !pending_creatings.queue.empty()) { + auto p = pending_creatings.queue.begin(); + int64_t poolid = p->first; + dout(10) << __func__ << " pool " << poolid + << " created " << p->second.created + << " modified " << p->second.modified + << " [" << p->second.start << "-" << p->second.end << ")" + << dendl; + int n = MIN(max - pending_creatings.pgs.size(), + p->second.end - p->second.start); + ps_t first = p->second.start; + ps_t end = first + n; + for (ps_t ps = first; ps < end; ++ps) { + const pg_t pgid{ps, static_cast(poolid)}; + pending_creatings.pgs.emplace(pgid, make_pair(p->second.created, + p->second.modified)); + dout(10) << __func__ << " adding " << pgid << dendl; + } + p->second.start = end; + if (p->second.done()) { + dout(10) << __func__ << " done with queue for " << poolid << dendl; + pending_creatings.queue.erase(p); + } else { + dout(10) << __func__ << " pool " << poolid + << " now [" << p->second.start << "-" << p->second.end << ")" + << dendl; + } + } + dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size() + << " pools" << dendl; + return pending_creatings; } @@ -1337,7 +1372,6 @@ void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs, dout(20) << __func__ << " pgmap shows " << p->first << " is created" << dendl; p = creating_pgs->pgs.erase(p); - creating_pgs->created_pools.insert(q->first.pool()); } else { ++p; } @@ -3203,13 +3237,12 @@ void OSDMonitor::check_pg_creates_sub(Subscription *sub) } } -unsigned OSDMonitor::scan_for_creating_pgs( +void OSDMonitor::scan_for_creating_pgs( const mempool::osdmap::map& pools, const mempool::osdmap::set& removed_pools, utime_t modified, creating_pgs_t* creating_pgs) const { - unsigned total = 0; for (auto& p : pools) { int64_t poolid = p.first; const pg_pool_t& pool = p.second; @@ -3230,17 +3263,16 @@ unsigned OSDMonitor::scan_for_creating_pgs( << " " << pool << dendl; continue; } - unsigned added = creating_pgs->create_pool(poolid, pool.get_pg_num(), - created, modified); - dout(10) << __func__ << added << " pgs added for pool "<< poolid + dout(10) << __func__ << " queueing pool create for " << poolid << " " << pool << dendl; - total += added; + creating_pgs->create_pool(poolid, pool.get_pg_num(), created, modified); } - return total; } void OSDMonitor::update_creating_pgs() { + dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, " + << creating_pgs.queue.size() << " pools in queue" << dendl; decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch; std::lock_guard l(creating_pgs_lock); for (auto& pg : creating_pgs.pgs) { @@ -3264,7 +3296,10 @@ void OSDMonitor::update_creating_pgs() mapped = mapping.get_epoch(); } break; - } + } else { + // newly creating + mapped = mapping.get_epoch(); + } } } dout(10) << __func__ << " will instruct osd." << acting_primary