From: John Spray Date: Mon, 5 Dec 2016 15:40:00 +0000 (+0000) Subject: mds: move throttling code out of StrayManager X-Git-Tag: v12.0.1~140^2~21 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8a1b3e1b2dab14a6b5a51cb159bce0ffec584d6c;p=ceph.git mds: move throttling code out of StrayManager This will belong in PurgeQueue from now on. We assume that there is no need to throttle the rate of insertions into purge queue as it is an efficient sequentially-written journal. Signed-off-by: John Spray --- diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 26f7e6a506c9..1ac8e55b1d50 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1680,10 +1680,6 @@ CDentry *CDir::_load_dentry( if (in->inode.is_dirty_rstat()) in->mark_dirty_rstat(); - if (inode->is_stray()) { - cache->notify_stray_loaded(dn); - } - //in->hack_accessed = false; //in->hack_load_stamp = ceph_clock_now(); //num_new_inodes_loaded++; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index c905d9c4c7eb..1ae4beaab53a 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6587,11 +6587,7 @@ bool MDCache::trim_dentry(CDentry *dn, map& expiremap assert(dnl->is_null()); } - if (dn->is_auth()) { - if (dn->state_test(CDentry::STATE_PURGING)) { - stray_manager.notify_stray_trimmed(dn); - } - } else { + if (!dn->is_auth()) { // notify dentry authority. mds_authority_t auth = dn->authority(); @@ -7629,8 +7625,6 @@ bool MDCache::shutdown_export_strays() strays[i]->get_dirfrags(dfs); } - stray_manager.abort_queue(); - for (std::list::iterator dfs_i = dfs.begin(); dfs_i != dfs.end(); ++dfs_i) { @@ -12308,12 +12302,15 @@ void MDCache::register_perfcounters() /* Stray/purge statistics */ pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry"); - pcb.add_u64(l_mdc_num_strays_purging, "num_strays_purging", "Stray dentries purging"); pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed"); + pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge"); + + pcb.add_u64(l_mdc_num_strays_purging, "num_strays_purging", "Stray dentries purging"); pcb.add_u64(l_mdc_num_purge_ops, "num_purge_ops", "Purge operations"); + pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created"); - pcb.add_u64_counter(l_mdc_strays_purged, "strays_purged", - "Stray dentries purged", "purg"); + pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued", + "Stray dentries enqueued for purge", "purg"); pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated"); pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated"); @@ -12362,23 +12359,3 @@ void MDCache::maybe_eval_stray(CInode *in, bool delay) { } } -void MDCache::notify_mdsmap_changed() -{ - stray_manager.update_op_limit(); -} - -void MDCache::notify_osdmap_changed() -{ - stray_manager.update_op_limit(); -} - -void MDCache::handle_conf_change(const struct md_config_t *conf, - const std::set &changed) -{ - assert(mds->mds_lock.is_locked_by_me()); - - if (changed.count("mds_max_purge_ops") - || changed.count("mds_max_purge_ops_per_pg")) { - stray_manager.update_op_limit(); - } -} diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 87da4416c850..720f931476ed 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -73,16 +73,22 @@ enum { l_mdc_first = 3000, // How many inodes currently in stray dentries l_mdc_num_strays, - // How many stray dentries are currently enqueued for purge - l_mdc_num_strays_purging, // How many stray dentries are currently delayed for purge due to refs l_mdc_num_strays_delayed, + // How many stray dentries are currently being enqueued for purge + l_mdc_num_strays_enqueuing, + + // >> TODO populate from PurgeQueue + // How many stray dentries are currently enqueued for purge + l_mdc_num_strays_purging, // How many purge RADOS ops might currently be in flight? l_mdc_num_purge_ops, + // << TODO + // How many dentries have ever been added to stray dir l_mdc_strays_created, - // How many dentries have ever finished purging from stray dir - l_mdc_strays_purged, + // How many dentries have been passed on to PurgeQueue + l_mdc_strays_enqueued, // How many strays have been reintegrated? l_mdc_strays_reintegrated, // How many strays have been migrated? @@ -156,13 +162,6 @@ public: stray_manager.eval_stray(dn); } - void notify_stray_loaded(CDentry *dn) { - stray_manager.notify_stray_loaded(dn); - } - - void handle_conf_change(const struct md_config_t *conf, - const std::set &changed); - void maybe_eval_stray(CInode *in, bool delay=false); bool is_readonly() { return readonly; } void force_readonly(); @@ -1129,9 +1128,6 @@ public: void process_delayed_expire(CDir *dir); void discard_delayed_expire(CDir *dir); - void notify_mdsmap_changed(); - void notify_osdmap_changed(); - protected: void dump_cache(const char *fn, Formatter *f, const std::string& dump_root = "", diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index 40e16e861b5b..69b76f3e6f6b 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -415,7 +415,7 @@ void MDSDaemon::handle_conf_change(const struct md_config_t *conf, } if (mds_rank) { - mds_rank->mdcache->handle_conf_change(conf, changed); + mds_rank->handle_conf_change(conf, changed); } if (!initially_locked) { diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 79bd88b7ff8f..26bbf2a59b66 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -80,6 +80,8 @@ MDSRank::MDSRank( { hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self()); + purge_queue.update_op_limit(*mdsmap); + objecter->unset_honor_osdmap_full(); finisher = new Finisher(msgr->cct); @@ -1635,7 +1637,9 @@ void MDSRankDispatcher::handle_mds_map( } } - mdcache->notify_mdsmap_changed(); + if (oldmap->get_max_mds() != mdsmap->get_max_mds()) { + purge_queue.update_op_limit(*mdsmap); + } } void MDSRank::handle_mds_recovery(mds_rank_t who) @@ -2435,7 +2439,7 @@ void MDSRankDispatcher::handle_osd_map() server->handle_osd_map(); - mdcache->notify_osdmap_changed(); + purge_queue.update_op_limit(*mdsmap); // By default the objecter only requests OSDMap updates on use, // we would like to always receive the latest maps in order to diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h index 13e905773018..07e34869a83d 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -201,6 +201,12 @@ class MDSRank { void handle_write_error(int err); + void handle_conf_change(const struct md_config_t *conf, + const std::set &changed) + { + purge_queue.handle_conf_change(conf, changed, *mdsmap); + } + protected: // Flag to indicate we entered shutdown: anyone seeing this to be true // after taking mds_lock must drop out. diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc index 56f5e20fe296..bcf1e9990653 100644 --- a/src/mds/PurgeQueue.cc +++ b/src/mds/PurgeQueue.cc @@ -64,6 +64,8 @@ void PurgeItem::decode(bufferlist::iterator &p) // for this. Shoudl just give objects a string name with a rank // suffix, like we do for MDSTables. Requires a little refactor // of Journaler. +// TODO: if Objecter has any slow requests, take that as a hint and +// slow down our rate of purging (keep accepting pushes though) PurgeQueue::PurgeQueue( CephContext *cct_, mds_rank_t rank_, @@ -159,10 +161,59 @@ void PurgeQueue::push(const PurgeItem &pi, Context *completion) // and passing the PurgeItem straight into _execute_item } +#if 0 +uint32_t StrayManager::_calculate_ops_required(CInode *in, bool trunc) +{ + uint32_t ops_required = 0; + if (in->is_dir()) { + // Directory, count dirfrags to be deleted + std::list ls; + if (!in->dirfragtree.is_leaf(frag_t())) { + in->dirfragtree.get_leaves(ls); + } + // One for the root, plus any leaves + ops_required = 1 + ls.size(); + } else { + // File, work out concurrent Filer::purge deletes + const uint64_t to = MAX(in->inode.max_size_ever, + MAX(in->inode.size, in->inode.get_max_size())); + + const uint64_t num = (to > 0) ? Striper::get_num_objects(in->inode.layout, to) : 1; + ops_required = MIN(num, g_conf->filer_max_purge_ops); + + // Account for removing (or zeroing) backtrace + ops_required += 1; + + // Account for deletions for old pools + if (!trunc) { + ops_required += in->get_projected_inode()->old_pools.size(); + } + } + + return ops_required; +} +#endif + bool PurgeQueue::can_consume() { +#if 0 + // Calculate how much of the ops allowance is available, allowing + // for the case where the limit is currently being exceeded. + uint32_t ops_avail; + if (ops_in_flight <= max_purge_ops) { + ops_avail = max_purge_ops - ops_in_flight; + } else { + ops_avail = 0; + } + + dout(10) << __func__ << ": allocating allowance " + << ops_required << " to " << ops_in_flight << " in flight" << dendl; + + logger->set(l_mdc_num_purge_ops, ops_in_flight); +#endif + // TODO: enforce limits (currently just allowing one in flight) - if (in_flight.size() > 0) { + if (in_flight.size() > cct->_conf->mds_max_purge_files) { return false; } else { return true; @@ -279,6 +330,57 @@ void PurgeQueue::execute_item_complete( in_flight.erase(iter); +#if 0 + // Release resources + dout(10) << __func__ << ": decrementing op allowance " + << ops_allowance << " from " << ops_in_flight << " in flight" << dendl; + assert(ops_in_flight >= ops_allowance); + ops_in_flight -= ops_allowance; + logger->set(l_mdc_num_purge_ops, ops_in_flight); + files_purging -= 1; +#endif + _consume(); } +void PurgeQueue::update_op_limit(const MDSMap &mds_map) +{ + Mutex::Locker l(lock); + + uint64_t pg_count = 0; + objecter->with_osdmap([&](const OSDMap& o) { + // Number of PGs across all data pools + const std::set &data_pools = mds_map.get_data_pools(); + for (const auto dp : data_pools) { + if (o.get_pg_pool(dp) == NULL) { + // It is possible that we have an older OSDMap than MDSMap, + // because we don't start watching every OSDMap until after + // MDSRank is initialized + dout(4) << " data pool " << dp << " not found in OSDMap" << dendl; + continue; + } + pg_count += o.get_pg_num(dp); + } + }); + + // Work out a limit based on n_pgs / n_mdss, multiplied by the user's + // preference for how many ops per PG + max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) * + cct->_conf->mds_max_purge_ops_per_pg); + + // User may also specify a hard limit, apply this if so. + if (cct->_conf->mds_max_purge_ops) { + max_purge_ops = MIN(max_purge_ops, cct->_conf->mds_max_purge_ops); + } +} + +void PurgeQueue::handle_conf_change(const struct md_config_t *conf, + const std::set &changed, + const MDSMap &mds_map) +{ + if (changed.count("mds_max_purge_ops") + || changed.count("mds_max_purge_ops_per_pg")) { + update_op_limit(mds_map); + } +} + diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h index e94759a87284..b7ed4345f732 100644 --- a/src/mds/PurgeQueue.h +++ b/src/mds/PurgeQueue.h @@ -16,9 +16,16 @@ #define PURGE_QUEUE_H_ #include "include/compact_set.h" +#include "mds/MDSMap.h" #include "osdc/Journaler.h" +/** + * Descriptor of the work associated with purging a file. We record + * the minimal amount of information from the inode such as the size + * and layout: all other un-needed inode metadata (times, permissions, etc) + * has been discarded. + */ class PurgeItem { public: @@ -38,6 +45,9 @@ public: WRITE_CLASS_ENCODER(PurgeItem) /** + * A persistent queue of PurgeItems. This class both writes and reads + * to the queue. There is one of these per MDS rank. + * * Note that this class does not take a reference to MDSRank: we are * independent of all the metadata structures and do not need to * take mds_lock for anything. @@ -59,8 +69,16 @@ protected: Objecter *objecter; Journaler journaler; + // Map of Journaler offset to PurgeItem std::map in_flight; + // Throttled allowances + uint64_t ops_in_flight; + uint64_t files_purging; + + // Dynamic op limit per MDS based on PG count + uint64_t max_purge_ops; + //PerfCounters *logger; bool can_consume(); @@ -87,6 +105,12 @@ public: // to the queue (there is no callback for when it is executed) void push(const PurgeItem &pi, Context *completion); + void update_op_limit(const MDSMap &mds_map); + + void handle_conf_change(const struct md_config_t *conf, + const std::set &changed, + const MDSMap &mds_map); + PurgeQueue( CephContext *cct_, mds_rank_t rank_, diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc index eb293c80cce3..0b793447633b 100644 --- a/src/mds/StrayManager.cc +++ b/src/mds/StrayManager.cc @@ -78,25 +78,22 @@ class C_IO_PurgeStrayPurged : public StrayManagerIOContext { // How many ops_in_flight were allocated to this purge? uint32_t ops_allowance; public: - C_IO_PurgeStrayPurged(StrayManager *sm_, CDentry *d, bool oh, uint32_t ops) : - StrayManagerIOContext(sm_), dn(d), only_head(oh), ops_allowance(ops) { } + C_IO_PurgeStrayPurged(StrayManager *sm_, CDentry *d, bool oh) : + StrayManagerIOContext(sm_), dn(d), only_head(oh) { } void finish(int r) override { assert(r == 0 || r == -ENOENT); sm->_purge_stray_purged(dn, ops_allowance, only_head); } }; -void StrayManager::purge(CDentry *dn, uint32_t op_allowance) + +void StrayManager::purge(CDentry *dn) { CDentry::linkage_t *dnl = dn->get_projected_linkage(); CInode *in = dnl->get_inode(); dout(10) << __func__ << " " << *dn << " " << *in << dendl; assert(!dn->is_replicated()); - num_strays_purging++; - logger->set(l_mdc_num_strays_purging, num_strays_purging); - - // CHEAT. there's no real need to journal our intent to purge, since // that is implicit in the dentry's presence and non-use in the stray // dir. on recovery, we'll need to re-eval all strays anyway. @@ -105,7 +102,7 @@ void StrayManager::purge(CDentry *dn, uint32_t op_allowance) C_GatherBuilder gather( g_ceph_context, new C_OnFinisher(new C_IO_PurgeStrayPurged( - this, dn, false, op_allowance), mds->finisher)); + this, dn, false), mds->finisher)); if (in->is_dir()) { object_locator_t oloc(mds->mdsmap->get_metadata_pool()); @@ -192,6 +189,10 @@ void StrayManager::_purge_stray_purged( CInode *in = dn->get_projected_linkage()->get_inode(); dout(10) << "_purge_stray_purged " << *dn << " " << *in << dendl; + logger->inc(l_mdc_strays_enqueued); + num_strays_enqueuing--; + logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing); + if (only_head) { /* This was a ::truncate */ EUpdate *le = new EUpdate(mds->mdlog, "purge_stray truncate"); @@ -248,20 +249,7 @@ void StrayManager::_purge_stray_purged( num_strays--; logger->set(l_mdc_num_strays, num_strays); - logger->inc(l_mdc_strays_purged); } - - num_strays_purging--; - logger->set(l_mdc_num_strays_purging, num_strays_purging); - - // Release resources - dout(10) << __func__ << ": decrementing op allowance " - << ops_allowance << " from " << ops_in_flight << " in flight" << dendl; - assert(ops_in_flight >= ops_allowance); - ops_in_flight -= ops_allowance; - logger->set(l_mdc_num_purge_ops, ops_in_flight); - files_purging -= 1; - _advance(); } void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls) @@ -300,11 +288,6 @@ void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *l void StrayManager::enqueue(CDentry *dn, bool trunc) { - if (aborted) { - dout(10) << __func__ << ": aborted, skip purging: " << *dn << dendl; - return; - } - CDentry::linkage_t *dnl = dn->get_projected_linkage(); assert(dnl); CInode *in = dnl->get_inode(); @@ -323,213 +306,68 @@ void StrayManager::enqueue(CDentry *dn, bool trunc) } } - const uint32_t ops_required = _calculate_ops_required(in, trunc); - - // Try to purge immediately if there is nothing in the queue, otherwise - // we will go to the back of the queue (even if there is allowance available - // to run us immediately) in order to be fair to others. - bool consumed = false; - if (ready_for_purge.empty()) { - consumed = _consume(dn, trunc, ops_required); - } - - if (consumed) { - dout(10) << __func__ << ": purging this dentry immediately: " - << *dn << dendl; - } else { - dout(10) << __func__ << ": enqueuing this dentry for later purge: " - << *dn << dendl; - if (!dn->state_test(CDentry::STATE_PURGINGPINNED) && - ready_for_purge.size() < g_conf->mds_max_purge_files) { - dn->get(CDentry::PIN_PURGING); - dn->state_set(CDentry::STATE_PURGINGPINNED); - } - ready_for_purge.push_back(QueuedStray(dn, trunc, ops_required)); - } -} - -class C_StraysFetched : public StrayManagerContext { -public: - C_StraysFetched(StrayManager *sm_) : - StrayManagerContext(sm_) { } - void finish(int r) override { - sm->_advance(); - } -}; - -void StrayManager::_advance() -{ - if (aborted) - return; - - std::map > to_fetch; - - for (auto p = ready_for_purge.begin(); - p != ready_for_purge.end();) { - const QueuedStray &qs = *p; - auto q = p++; - CDentry *dn = qs.dir->lookup_exact_snap(qs.name, CEPH_NOSNAP); - if (!dn) { - assert(trimmed_strays.count(qs.name) > 0); - if (fetching_strays.size() >= g_conf->mds_max_purge_files) { - break; - } - - dout(10) << __func__ << ": fetching stray dentry " << qs.name << dendl; - - auto it = fetching_strays.insert(qs); - assert(it.second); - to_fetch[qs.dir].insert(dentry_key_t(CEPH_NOSNAP, (it.first)->name.c_str())); - ready_for_purge.erase(q); - continue; - } - - const bool consumed = _consume(dn, qs.trunc, qs.ops_required); - if (!consumed) { - break; - } - ready_for_purge.erase(q); - } - - MDSGatherBuilder gather(g_ceph_context); - for (auto p = to_fetch.begin(); p != to_fetch.end(); ++p) - p->first->fetch(gather.new_sub(), p->second); - - if (gather.has_subs()) { - gather.set_finisher(new C_StraysFetched(this)); - gather.activate(); - } -} - -/* - * Note that there are compromises to how throttling - * is implemented here, in the interests of simplicity: - * * If insufficient ops are available to execute - * the next item on the queue, we block even if - * there are items further down the queue requiring - * fewer ops which might be executable - * * The ops considered "in use" by a purge will be - * an overestimate over the period of execution, as - * we count filer_max_purge_ops and ops for old backtraces - * as in use throughout, even though towards the end - * of the purge the actual ops in flight will be - * lower. - * * The ops limit may be exceeded if the number of ops - * required by a single inode is greater than the - * limit, for example directories with very many - * fragments. - */ -bool StrayManager::_consume(CDentry *dn, bool trunc, uint32_t ops_required) -{ - const int files_avail = g_conf->mds_max_purge_files - files_purging; - - if (!started) { - dout(20) << __func__ << ": haven't started purging yet" << dendl; - return false; - } - - if (files_avail <= 0) { - dout(20) << __func__ << ": throttling on max files" << dendl; - return false; - } else { - dout(20) << __func__ << ": purging dn: " << *dn << dendl; - } - - // Calculate how much of the ops allowance is available, allowing - // for the case where the limit is currently being exceeded. - uint32_t ops_avail; - if (ops_in_flight <= max_purge_ops) { - ops_avail = max_purge_ops - ops_in_flight; - } else { - ops_avail = 0; - } - - /* The ops_in_flight > 0 condition here handles the case where the - * ops required by this inode would never fit in the limit: we wait - * instead until nothing else is running */ - if (ops_in_flight > 0 && ops_avail < ops_required) { - dout(20) << __func__ << ": throttling on max ops (require " - << ops_required << ", " << ops_in_flight << " in flight" << dendl; - return false; - } + dout(20) << __func__ << ": purging dn: " << *dn << dendl; if (!dn->state_test(CDentry::STATE_PURGINGPINNED)) { dn->get(CDentry::PIN_PURGING); dn->state_set(CDentry::STATE_PURGINGPINNED); } + ++num_strays_enqueuing; + logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing); + // Resources are available, acquire them and execute the purge - files_purging += 1; - dout(10) << __func__ << ": allocating allowance " - << ops_required << " to " << ops_in_flight << " in flight" << dendl; - ops_in_flight += ops_required; - logger->set(l_mdc_num_purge_ops, ops_in_flight); - - _process(dn, trunc, ops_required); - return true; + _enqueue(dn, trunc); + + dout(10) << __func__ << ": purging this dentry immediately: " + << *dn << dendl; } class C_OpenSnapParents : public StrayManagerContext { CDentry *dn; bool trunc; - uint32_t ops_required; public: - C_OpenSnapParents(StrayManager *sm_, CDentry *dn_, bool t, uint32_t ops) : - StrayManagerContext(sm_), dn(dn_), trunc(t), ops_required(ops) { } + C_OpenSnapParents(StrayManager *sm_, CDentry *dn_, bool t) : + StrayManagerContext(sm_), dn(dn_), trunc(t) { } void finish(int r) override { - sm->_process(dn, trunc, ops_required); + sm->_enqueue(dn, trunc); } }; -void StrayManager::_process(CDentry *dn, bool trunc, uint32_t ops_required) +void StrayManager::_enqueue(CDentry *dn, bool trunc) { CInode *in = dn->get_linkage()->get_inode(); if (in->snaprealm && !in->snaprealm->have_past_parents_open() && - !in->snaprealm->open_parents(new C_OpenSnapParents(this, dn, trunc, - ops_required))) { + !in->snaprealm->open_parents(new C_OpenSnapParents(this, dn, trunc))) { // this can happen if the dentry had been trimmed from cache. return; } - if (trunc) { - truncate(dn, ops_required); - } else { - purge(dn, ops_required); + if (!started) { + // If the MDS is not yet active, defer executing this purge + // in order to avoid the mdlog writes we do on purge completion. + mds->wait_for_active( + new MDSInternalContextWrapper(mds, + new FunctionContext([this, dn, trunc](int r){ + // It is safe to hold on to this CDentry* pointer + // because the dentry is pinned with PIN_PURGING + _enqueue(dn, trunc); + }) + ) + ); + + return; } -} -uint32_t StrayManager::_calculate_ops_required(CInode *in, bool trunc) -{ - uint32_t ops_required = 0; - if (in->is_dir()) { - // Directory, count dirfrags to be deleted - std::list ls; - if (!in->dirfragtree.is_leaf(frag_t())) { - in->dirfragtree.get_leaves(ls); - } - // One for the root, plus any leaves - ops_required = 1 + ls.size(); + if (trunc) { + truncate(dn); } else { - // File, work out concurrent Filer::purge deletes - const uint64_t to = MAX(in->inode.max_size_ever, - MAX(in->inode.size, in->inode.get_max_size())); - - const uint64_t num = (to > 0) ? Striper::get_num_objects(in->inode.layout, to) : 1; - ops_required = MIN(num, g_conf->filer_max_purge_ops); - - // Account for removing (or zeroing) backtrace - ops_required += 1; - - // Account for deletions for old pools - if (!trunc) { - ops_required += in->get_projected_inode()->old_pools.size(); - } + purge(dn); } - - return ops_required; } + void StrayManager::advance_delayed() { for (elist::iterator p = delayed_eval_stray.begin(); !p.end(); ) { @@ -589,7 +427,7 @@ struct C_MDC_EvalStray : public StrayManagerContext { } }; -bool StrayManager::__eval_stray(CDentry *dn, bool delay) +bool StrayManager::_eval_stray(CDentry *dn, bool delay) { dout(10) << "eval_stray " << *dn << dendl; CDentry::linkage_t *dnl = dn->get_projected_linkage(); @@ -731,7 +569,6 @@ void StrayManager::activate() { dout(10) << __func__ << dendl; started = true; - _advance(); } bool StrayManager::eval_stray(CDentry *dn, bool delay) @@ -741,7 +578,7 @@ bool StrayManager::eval_stray(CDentry *dn, bool delay) return false; dn->state_set(CDentry::STATE_EVALUATINGSTRAY); - bool ret = __eval_stray(dn, delay); + bool ret = _eval_stray(dn, delay); dn->state_clear(CDentry::STATE_EVALUATINGSTRAY); return ret; } @@ -840,48 +677,14 @@ void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to) StrayManager::StrayManager(MDSRank *mds, PurgeQueue &purge_queue_) : delayed_eval_stray(member_offset(CDentry, item_stray)), - mds(mds), purge_queue(purge_queue_), logger(NULL), started(false), - aborted(false), - ops_in_flight(0), files_purging(0), - max_purge_ops(0), - num_strays(0), num_strays_purging(0), num_strays_delayed(0), - filer(mds->objecter, mds->finisher) + mds(mds), logger(NULL), started(false), num_strays(0), + num_strays_delayed(0), num_strays_enqueuing(0), + filer(mds->objecter, mds->finisher), purge_queue(purge_queue_) { assert(mds != NULL); } -void StrayManager::abort_queue() -{ - for (std::list::iterator i = ready_for_purge.begin(); - i != ready_for_purge.end(); ++i) - { - const QueuedStray &qs = *i; - CDentry *dn = qs.dir->lookup_exact_snap(qs.name, CEPH_NOSNAP); - if (!dn) - continue; - - dout(10) << __func__ << ": aborting enqueued purge " << *dn << dendl; - - CDentry::linkage_t *dnl = dn->get_projected_linkage(); - assert(dnl); - CInode *in = dnl->get_inode(); - assert(in); - - // Clear flags set in enqueue - if (dn->state_test(CDentry::STATE_PURGINGPINNED)) - dn->put(CDentry::PIN_PURGING); - dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED); - in->state_clear(CInode::STATE_PURGING); - } - ready_for_purge.clear(); - - trimmed_strays.clear(); - fetching_strays.clear(); - - aborted = true; -} - -void StrayManager::truncate(CDentry *dn, uint32_t op_allowance) +void StrayManager::truncate(CDentry *dn) { CDentry::linkage_t *dnl = dn->get_projected_linkage(); CInode *in = dnl->get_inode(); @@ -889,12 +692,9 @@ void StrayManager::truncate(CDentry *dn, uint32_t op_allowance) dout(10) << __func__ << ": " << *dn << " " << *in << dendl; assert(!dn->is_replicated()); - num_strays_purging++; - logger->set(l_mdc_num_strays_purging, num_strays_purging); - C_GatherBuilder gather( g_ceph_context, - new C_OnFinisher(new C_IO_PurgeStrayPurged(this, dn, true, 0), + new C_OnFinisher(new C_IO_PurgeStrayPurged(this, dn, true), mds->finisher)); SnapRealm *realm = in->find_snaprealm(); @@ -943,72 +743,3 @@ void StrayManager::_truncate_stray_logged(CDentry *dn, LogSegment *ls) eval_stray(dn); } - -void StrayManager::update_op_limit() -{ - uint64_t pg_count = 0; - mds->objecter->with_osdmap([&](const OSDMap& o) { - // Number of PGs across all data pools - const std::set &data_pools = mds->mdsmap->get_data_pools(); - for (const auto dp : data_pools) { - if (o.get_pg_pool(dp) == NULL) { - // It is possible that we have an older OSDMap than MDSMap, - // because we don't start watching every OSDMap until after - // MDSRank is initialized - dout(4) << __func__ << " data pool " << dp - << " not found in OSDMap" << dendl; - continue; - } - pg_count += o.get_pg_num(dp); - } - }); - - uint64_t mds_count = mds->mdsmap->get_max_mds(); - - // Work out a limit based on n_pgs / n_mdss, multiplied by the user's - // preference for how many ops per PG - max_purge_ops = uint64_t(((double)pg_count / (double)mds_count) * - g_conf->mds_max_purge_ops_per_pg); - - // User may also specify a hard limit, apply this if so. - if (g_conf->mds_max_purge_ops) { - max_purge_ops = MIN(max_purge_ops, g_conf->mds_max_purge_ops); - } -} - -void StrayManager::notify_stray_loaded(CDentry *dn) -{ - dout(10) << __func__ << ": " << *dn << dendl; - - dn->state_set(CDentry::STATE_STRAY); - CInode *in = dn->get_linkage()->get_inode(); - if (in->inode.nlink == 0) - in->state_set(CInode::STATE_ORPHAN); - - if (aborted) - return; - - auto p = trimmed_strays.find(dn->name); - if (p != trimmed_strays.end()) { - dn->state_set(CDentry::STATE_PURGING); - in->state_set(CInode::STATE_PURGING); - trimmed_strays.erase(p); - - QueuedStray key(dn, false, 0); - auto q = fetching_strays.find(key); - if (q != fetching_strays.end()) { - ready_for_purge.push_front(*q); - fetching_strays.erase(q); - } - } -} - -void StrayManager::notify_stray_trimmed(CDentry *dn) -{ - dout(10) << __func__ << ": " << *dn << dendl; - - if (aborted) - return; - - trimmed_strays.insert(dn->name); -} diff --git a/src/mds/StrayManager.h b/src/mds/StrayManager.h index 0a9b61862ba1..3124478bf63d 100644 --- a/src/mds/StrayManager.h +++ b/src/mds/StrayManager.h @@ -27,55 +27,33 @@ class CDentry; class StrayManager { protected: - class QueuedStray { - public: - CDir *dir; - std::string name; - bool trunc; - uint32_t ops_required; - QueuedStray(CDentry *dn, bool t, uint32_t ops) - : dir(dn->get_dir()), name(dn->name), - trunc(t), ops_required(ops) {} - bool operator<(const QueuedStray& o) const { - return (name < o.name); - } - }; - // Has passed through eval_stray and still has refs elist delayed_eval_stray; - // No more refs, can purge these - std::list ready_for_purge; - // strays that have been trimmed from cache std::set trimmed_strays; - // strays that are being fetching - std::set fetching_strays; // Global references for doing I/O MDSRank *mds; PerfCounters *logger; bool started; - bool aborted; - - // Throttled allowances - uint64_t ops_in_flight; - uint64_t files_purging; - - // Dynamic op limit per MDS based on PG count - uint64_t max_purge_ops; - // Statistics + // Stray dentries for this rank (including those not in cache) uint64_t num_strays; - uint64_t num_strays_purging; + + // Stray dentries uint64_t num_strays_delayed; + // Entries that have entered enqueue() but not been persistently + // recorded by PurgeQueue yet + uint64_t num_strays_enqueuing; + Filer filer; PurgeQueue &purge_queue; - void truncate(CDentry *dn, uint32_t op_allowance); + void truncate(CDentry *dn); /** * Purge a dentry from a stray directory. This function @@ -83,7 +61,7 @@ class StrayManager * throttling is also satisfied. There is no going back * at this stage! */ - void purge(CDentry *dn, uint32_t op_allowance); + void purge(CDentry *dn); /** * Completion handler for a Filer::purge on a stray inode. @@ -108,41 +86,15 @@ class StrayManager friend class C_TruncateStrayLogged; friend class C_IO_PurgeStrayPurged; - /** - * Enqueue a purge operation on a dentry that has passed the tests - * in eval_stray. This may start the operation inline if the throttle - * allowances are already available. - * - * @param trunc false to purge dentry (normal), true to just truncate - * inode (snapshots) - */ - void enqueue(CDentry *dn, bool trunc); - - /** - * Iteratively call _consume on items from the ready_for_purge - * list until it returns false (throttle limit reached) - */ - void _advance(); - - /** - * Attempt to purge an inode, if throttling permits - * its. - * - * Return true if we successfully consumed resource, - * false if insufficient resource was available. - */ - bool _consume(CDentry *dn, bool trunc, uint32_t ops_required); - void _process(CDentry *dn, bool trunc, uint32_t ops_required); + // Call this on a dentry that has been identified as + // elegible for purging. It will be passed on to PurgeQueue. + void enqueue(CDentry *dn, bool trunc); + // Final part of enqueue() which we may have to retry + // after opening snap parents. + void _enqueue(CDentry *dn, bool trunc); - /** - * Return the maximum number of concurrent RADOS ops that - * may be executed while purging this inode. - * - * @param trunc true if it's a truncate, false if it's a purge - */ - uint32_t _calculate_ops_required(CInode *in, bool trunc); /** * When hard links exist to an inode whose primary dentry @@ -168,7 +120,7 @@ class StrayManager * @returns true if the dentry will be purged (caller should never * take more refs after this happens), else false. */ - bool __eval_stray(CDentry *dn, bool delay=false); + bool _eval_stray(CDentry *dn, bool delay=false); // My public interface is for consumption by MDCache public: @@ -243,33 +195,6 @@ class StrayManager * this MDS to another MDS. */ void notify_stray_removed(); - - /** - * For any strays that are enqueued for purge, but - * currently blocked on throttling, clear their - * purging status. Used during MDS rank shutdown - * so that it can migrate these strays instead - * of waiting for them to trickle through the - * queue. - */ - void abort_queue(); - - /* - * Calculate our local RADOS op throttle limit based on - * (mds_max_purge_ops_per_pg / number_of_mds) * number_of_pg - * - * Call this whenever one of those operands changes. - */ - void update_op_limit(); - - /* - * track stray dentries that have been trimmed from cache - */ - void notify_stray_trimmed(CDentry *dn); - /* - * restore stray dentry's previous stats - */ - void notify_stray_loaded(CDentry *dn); }; #endif // STRAY_MANAGER_H