From: John Spray Date: Wed, 16 Nov 2016 14:08:36 +0000 (+0000) Subject: mds: more deterministic timing on frag split/join X-Git-Tag: v11.1.0~137^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=819d34e9da64aa9e48b0f3790f3796ee73ab9c5f;p=ceph.git mds: more deterministic timing on frag split/join ...by using timer instead of tick() Fixes: http://tracker.ceph.com/issues/17853 Signed-off-by: John Spray --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 22948076bee8..108ffd2ed502 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -517,6 +517,7 @@ OPTION(mds_bal_merge_size, OPT_INT, 50) OPTION(mds_bal_interval, OPT_INT, 10) // seconds OPTION(mds_bal_fragment_interval, OPT_INT, 5) // seconds OPTION(mds_bal_fragment_size_max, OPT_INT, 10000*10) // order of magnitude higher than split size +OPTION(mds_bal_fragment_fast_factor, OPT_FLOAT, 1.5) // multiple of size_max that triggers immediate split OPTION(mds_bal_idle_threshold, OPT_FLOAT, 0) OPTION(mds_bal_max, OPT_INT, -1) OPTION(mds_bal_max_until, OPT_INT, -1) diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 4c35af3f2d0a..d280eece5d5c 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -194,6 +194,13 @@ public: return &projected.back(); return &linkage; } + + const linkage_t *get_projected_linkage() const { + if (!projected.empty()) + return &projected.back(); + return &linkage; + } + CInode *get_projected_inode() { return get_projected_linkage()->inode; } diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index ad161ce1b4f7..078a1ae7bcec 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -3118,3 +3118,32 @@ std::string CDir::get_path() const return path; } +bool CDir::should_split_fast() const +{ + // Max size a fragment can be before trigger fast splitting + int fast_limit = g_conf->mds_bal_split_size * g_conf->mds_bal_fragment_fast_factor; + + // Fast path: the sum of accounted size and null dentries does not + // exceed threshold: we definitely are not over it. + if (get_frag_size() + get_num_head_null() <= fast_limit) { + return false; + } + + // Fast path: the accounted size of the frag exceeds threshold: we + // definitely are over it + if (get_frag_size() > fast_limit) { + return true; + } + + int64_t effective_size = 0; + + for (const auto &p : items) { + const CDentry *dn = p.second; + if (!dn->get_projected_linkage()->is_null()) { + effective_size++; + } + } + + return effective_size > fast_limit; +} + diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 82fecded6c58..9313e3d26843 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -444,7 +444,9 @@ protected: return num_dirty; } - int64_t get_frag_size() { return get_projected_fnode()->fragstat.size(); } + int64_t get_frag_size() const { + return get_projected_fnode()->fragstat.size(); + } // -- dentries and inodes -- public: @@ -488,10 +490,11 @@ public: void split(int bits, list& subs, list& waiters, bool replay); void merge(list& subs, list& waiters, bool replay); - bool should_split() { + bool should_split() const { return (int)get_frag_size() > g_conf->mds_bal_split_size; } - bool should_merge() { + bool should_split_fast() const; + bool should_merge() const { return (int)get_frag_size() < g_conf->mds_bal_merge_size; } diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index 5cad10bc5399..334c11e94b53 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -98,14 +98,6 @@ void MDBalancer::tick() send_heartbeat(); num_bal_times--; } - - // hash? - if ((g_conf->mds_bal_frag || g_conf->mds_thrash_fragments) && - g_conf->mds_bal_fragment_interval > 0 && - now.sec() - last_fragment.sec() > g_conf->mds_bal_fragment_interval) { - last_fragment = now; - do_fragmenting(); - } } @@ -385,95 +377,121 @@ double MDBalancer::try_match(mds_rank_t ex, double& maxex, return howmuch; } -void MDBalancer::queue_split(CDir *dir) +void MDBalancer::queue_split(const CDir *dir, bool fast) { + dout(10) << __func__ << " enqueuing " << *dir + << " (fast=" << fast << ")" << dendl; + assert(mds->mdsmap->allows_dirfrags()); - split_queue.insert(dir->dirfrag()); -} + const dirfrag_t frag = dir->dirfrag(); + + auto callback = [this, frag](int r) { + if (split_pending.erase(frag) == 0) { + // Someone beat me to it. This can happen in the fast splitting + // path, because we spawn two contexts, one with mds->timer and + // one with mds->queue_waiter. The loser can safely just drop + // out. + return; + } -void MDBalancer::queue_merge(CDir *dir) -{ - merge_queue.insert(dir->dirfrag()); + CDir *split_dir = mds->mdcache->get_dirfrag(frag); + if (!split_dir) { + dout(10) << "drop split on " << frag << " because not in cache" << dendl; + return; + } + + // Pass on to MDCache: note that the split might still not + // happen if the checks in MDCache::can_fragment fail. + dout(10) << __func__ << " splitting " << *split_dir << dendl; + mds->mdcache->split_dir(split_dir, g_conf->mds_bal_split_bits); + }; + + bool is_new = false; + if (split_pending.count(frag) == 0) { + split_pending.insert(frag); + is_new = true; + } + + if (fast) { + // Do the split ASAP: enqueue it in the MDSRank waiters which are + // run at the end of dispatching the current request + mds->queue_waiter(new MDSInternalContextWrapper(mds, + new FunctionContext(callback))); + } else if (is_new) { + // Set a timer to really do the split: we don't do it immediately + // so that bursts of ops on a directory have a chance to go through + // before we freeze it. + mds->timer.add_event_after(g_conf->mds_bal_fragment_interval, + new FunctionContext(callback)); + } } -void MDBalancer::do_fragmenting() +void MDBalancer::queue_merge(CDir *dir) { - if (split_queue.empty() && merge_queue.empty()) { - dout(20) << "do_fragmenting has nothing to do" << dendl; - return; - } + const auto frag = dir->dirfrag(); + auto callback = [this, frag](int r) { + assert(frag.frag != frag_t()); + + // frag must be in this set because only one context is in flight + // for a given frag at a time (because merge_pending is checked before + // starting one), and this context is the only one that erases it. + merge_pending.erase(frag); + + CDir *dir = mds->mdcache->get_dirfrag(frag); + if (!dir) { + dout(10) << "drop merge on " << frag << " because not in cache" << dendl; + return; + } + assert(dir->dirfrag() == frag); - if (!split_queue.empty()) { - dout(10) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << dendl; + if(!dir->is_auth()) { + dout(10) << "drop merge on " << *dir << " because lost auth" << dendl; + return; + } - set q; - q.swap(split_queue); + dout(10) << "merging " << *dir << dendl; - for (set::iterator i = q.begin(); - i != q.end(); - ++i) { - CDir *dir = mds->mdcache->get_dirfrag(*i); - if (!dir || - !dir->is_auth()) - continue; + CInode *diri = dir->get_inode(); - dout(10) << "do_fragmenting splitting " << *dir << dendl; - mds->mdcache->split_dir(dir, g_conf->mds_bal_split_bits); + frag_t fg = dir->get_frag(); + while (fg != frag_t()) { + frag_t sibfg = fg.get_sibling(); + list sibs; + bool complete = diri->get_dirfrags_under(sibfg, sibs); + if (!complete) { + dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl; + break; + } + bool all = true; + for (list::iterator p = sibs.begin(); p != sibs.end(); ++p) { + CDir *sib = *p; + if (!sib->is_auth() || !sib->should_merge()) { + all = false; + break; + } + } + if (!all) { + dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl; + break; + } + dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl; + fg = fg.parent(); } - } - if (!merge_queue.empty()) { - dout(10) << "do_fragmenting " << merge_queue.size() << " dirs marked for possible merging" << dendl; - - set q; - q.swap(merge_queue); - - for (set::iterator i = q.begin(); - i != q.end(); - ++i) { - CDir *dir = mds->mdcache->get_dirfrag(*i); - if (!dir || - !dir->is_auth() || - dir->get_frag() == frag_t()) // ok who's the joker? - continue; - - dout(10) << "do_fragmenting merging " << *dir << dendl; - - CInode *diri = dir->get_inode(); - - frag_t fg = dir->get_frag(); - while (fg != frag_t()) { - frag_t sibfg = fg.get_sibling(); - list sibs; - bool complete = diri->get_dirfrags_under(sibfg, sibs); - if (!complete) { - dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl; - break; - } - bool all = true; - for (list::iterator p = sibs.begin(); p != sibs.end(); ++p) { - CDir *sib = *p; - if (!sib->is_auth() || !sib->should_merge()) { - all = false; - break; - } - } - if (!all) { - dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl; - break; - } - dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl; - fg = fg.parent(); - } + if (fg != dir->get_frag()) + mds->mdcache->merge_dir(diri, fg); + }; - if (fg != dir->get_frag()) - mds->mdcache->merge_dir(diri, fg); - } + if (merge_pending.count(frag) == 0) { + dout(20) << __func__ << " enqueued dir " << *dir << dendl; + merge_pending.insert(frag); + mds->timer.add_event_after(g_conf->mds_bal_fragment_interval, + new FunctionContext(callback)); + } else { + dout(20) << __func__ << " dir already in queue " << *dir << dendl; } } - - void MDBalancer::prep_rebalance(int beat) { if (g_conf->mds_thrash_exports) { @@ -1061,17 +1079,28 @@ void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amoun mds->mdsmap->allows_dirfrags() && (dir->should_split() || (v > g_conf->mds_bal_split_rd && type == META_POP_IRD) || - (v > g_conf->mds_bal_split_wr && type == META_POP_IWR)) && - split_queue.count(dir->dirfrag()) == 0) { - dout(10) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << dendl; - split_queue.insert(dir->dirfrag()); + (v > g_conf->mds_bal_split_wr && type == META_POP_IWR))) { + dout(10) << "hit_dir " << type << " pop is " << v + << ", putting in split_pending: " << *dir << dendl; + if (split_pending.count(dir->dirfrag()) == 0) { + queue_split(dir, false); + } else { + if (dir->should_split_fast()) { + dout(4) << "hit_dir: fragment hit hard limit, splitting immediately (" + << *dir << ")" << dendl; + queue_split(dir, true); + } else { + dout(10) << "hit_dir: fragment already enqueued to split: " + << *dir << dendl; + } + } } // merge? if (dir->get_frag() != frag_t() && dir->should_merge() && - merge_queue.count(dir->dirfrag()) == 0) { - dout(10) << "hit_dir " << type << " pop is " << v << ", putting in merge_queue: " << *dir << dendl; - merge_queue.insert(dir->dirfrag()); + merge_pending.count(dir->dirfrag()) == 0) { + dout(10) << "hit_dir " << type << " pop is " << v << ", putting in merge_pending: " << *dir << dendl; + queue_merge(dir); } } diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h index 7c64e750c8c2..3a53db2a13e9 100644 --- a/src/mds/MDBalancer.h +++ b/src/mds/MDBalancer.h @@ -49,12 +49,14 @@ class MDBalancer { string bal_version; utime_t last_heartbeat; - utime_t last_fragment; utime_t last_sample; utime_t rebalance_time; //ensure a consistent view of load for rebalance - // todo - set split_queue, merge_queue; + // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir + // just as soon as a delayed context comes back and triggers it. + // These sets just prevent us from spawning extra timer contexts for + // dirfrags that already have one in flight. + set split_pending, merge_pending; // per-epoch scatter/gathered info map mds_load; @@ -97,8 +99,6 @@ public: void tick(); - void do_fragmenting(); - void export_empties(); //set up the rebalancing targets for export and do one if the //MDSMap is up to date @@ -122,9 +122,8 @@ public: void hit_dir(utime_t now, class CDir *dir, int type, int who=-1, double amount=1.0); void hit_recursive(utime_t now, class CDir *dir, int type, double amount, double rd_adj); - void queue_split(CDir *dir); + void queue_split(const CDir *dir, bool fast); void queue_merge(CDir *dir); - }; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index ca03c6969e01..eeb5d3a8534c 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -10729,15 +10729,17 @@ bool MDCache::can_fragment(CInode *diri, list& dirs) void MDCache::split_dir(CDir *dir, int bits) { - dout(7) << "split_dir " << *dir << " bits " << bits << dendl; + dout(7) << __func__ << " " << *dir << " bits " << bits << dendl; assert(dir->is_auth()); CInode *diri = dir->inode; list dirs; dirs.push_back(dir); - if (!can_fragment(diri, dirs)) + if (!can_fragment(diri, dirs)) { + dout(7) << __func__ << " cannot fragment right now, dropping" << dendl; return; + } MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR); mdr->more()->fragment_base = dir->dirfrag(); @@ -11092,7 +11094,7 @@ void MDCache::dispatch_fragment_dir(MDRequestRef& mdr) dout(10) << " can't auth_pin " << *diri << ", requeuing dir " << info.dirs.front()->dirfrag() << dendl; if (info.bits > 0) - mds->balancer->queue_split(info.dirs.front()); + mds->balancer->queue_split(info.dirs.front(), false); else mds->balancer->queue_merge(info.dirs.front()); fragment_unmark_unfreeze_dirs(info.dirs); @@ -11310,9 +11312,9 @@ void MDCache::_fragment_finish(dirfrag_t basedirfrag, list& resultfrags) ufragment &uf = it->second; // unmark & auth_unpin - for (list::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) { - (*p)->state_clear(CDir::STATE_FRAGMENTING); - (*p)->auth_unpin(this); + for (const auto &dir : resultfrags) { + dir->state_clear(CDir::STATE_FRAGMENTING); + dir->auth_unpin(this); } if (mds->logger) {