From 24eea7e11d014fbb7cb468b66e84d0b747cd9359 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 13 Aug 2018 16:47:53 +0800 Subject: [PATCH] mds: migrate strays part by part when shutdown mds migrating all strays at once may require lots of memory and cpu time. Fixes: http://tracker.ceph.com/issues/26926 Signed-off-by: "Yan, Zheng" --- src/mds/MDCache.cc | 133 +++++++++++++++++++++++++++++----------- src/mds/MDCache.h | 9 ++- src/mds/Server.cc | 25 ++++---- src/mds/StrayManager.cc | 16 +++-- 4 files changed, 129 insertions(+), 54 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index dd57feeed80f0..41dd8095081c4 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -3111,7 +3111,7 @@ void MDCache::handle_mds_failure(mds_rank_t who) // MDCache::shutdown_export_strays() always exports strays to mds.0 if (who == mds_rank_t(0)) - shutdown_exported_strays.clear(); + shutdown_exporting_strays.clear(); show_subtrees(); } @@ -7903,60 +7903,119 @@ bool MDCache::shutdown_pass() bool MDCache::shutdown_export_strays() { + static const unsigned MAX_EXPORTING = 100; + if (mds->get_nodeid() == 0) return true; - - dout(10) << "shutdown_export_strays" << dendl; + + if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2) + return false; + + dout(10) << "shutdown_export_strays " << shutdown_export_next.first + << " '" << shutdown_export_next.second << "'" << dendl; bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0)); + bool all_exported = false; - bool done = true; +again: + auto next = shutdown_export_next; - list dfs; for (int i = 0; i < NUM_STRAY; ++i) { - if (!strays[i] || - !strays[i]->state_test(CInode::STATE_STRAYPINNED)) + CInode *strayi = strays[i]; + if (!strayi || + !strayi->state_test(CInode::STATE_STRAYPINNED)) + continue; + if (strayi->ino() < next.first.ino) continue; - strays[i]->get_dirfrags(dfs); - } - for (std::list::iterator dfs_i = dfs.begin(); - dfs_i != dfs.end(); ++dfs_i) - { - CDir *dir = *dfs_i; + deque dfls; + strayi->get_dirfrags(dfls); - if (!dir->is_complete()) { - dir->fetch(0); - done = false; - if (!mds0_active) - break; - } - - for (auto &p : dir->items) { - CDentry *dn = p.second; - CDentry::linkage_t *dnl = dn->get_projected_linkage(); - if (dnl->is_null()) + while (!dfls.empty()) { + CDir *dir = dfls.front(); + dfls.pop_front(); + + if (dir->dirfrag() < next.first) continue; - done = false; - if (!mds0_active) - break; - - if (dn->state_test(CDentry::STATE_PURGING)) { - // Don't try to migrate anything that is actually - // being purged right now - continue; + if (next.first < dir->dirfrag()) { + next.first = dir->dirfrag(); + next.second.clear(); + } + + if (!dir->is_complete()) { + MDSInternalContextBase *fin = nullptr; + if (shutdown_exporting_strays.empty()) { + fin = new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + shutdown_export_strays(); + }) + ); + } + dir->fetch(fin); + goto done; } - if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) { - shutdown_exported_strays.insert(dnl->get_inode()->ino()); - stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root! + CDir::dentry_key_map::iterator it; + if (next.second.empty()) { + it = dir->begin(); } else { - dout(10) << "already exporting " << *dn << dendl; + auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second)); + it = dir->lower_bound(dentry_key_t(0, next.second, hash)); + } + + for (; it != dir->end(); ++it) { + CDentry *dn = it->second; + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_null()) + continue; + + if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) { + next.second = it->first.name; + goto done; + } + + auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino()); + if (!ret.second) { + dout(10) << "already exporting/purging " << *dn << dendl; + continue; + } + + // Don't try to migrate anything that is actually + // being purged right now + if (!dn->state_test(CDentry::STATE_PURGING)) + stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root! + + if (shutdown_exporting_strays.size() >= MAX_EXPORTING) { + ++it; + if (it != dir->end()) { + next.second = it->first.name; + } else { + if (dfls.empty()) + next.first.ino.val++; + else + next.first = dfls.front()->dirfrag(); + next.second.clear(); + } + goto done; + } } } } - return done; + if (shutdown_exporting_strays.empty()) { + dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0); + if (first_df < shutdown_export_next.first || + !shutdown_export_next.second.empty()) { + shutdown_export_next.first = first_df; + shutdown_export_next.second.clear(); + goto again; + } + all_exported = true; + } + +done: + shutdown_export_next = next; + return all_exported; } // ========= messaging ============== diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 8e323dbade421..15697b92e6077 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -775,13 +775,18 @@ public: // shutdown private: - set shutdown_exported_strays; + set shutdown_exporting_strays; + pair shutdown_export_next; public: void shutdown_start(); void shutdown_check(); bool shutdown_pass(); - bool shutdown_export_strays(); bool shutdown(); // clear cache (ie at shutodwn) + bool shutdown_export_strays(); + void shutdown_export_stray_finish(inodeno_t ino) { + if (shutdown_exporting_strays.erase(ino)) + shutdown_export_strays(); + } bool did_shutdown_log_cap; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 722ce887011d4..5862de3edcc84 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -8469,16 +8469,17 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r, { dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl; - CDentry::linkage_t *destdnl = destdn->get_linkage(); + CInode *in = destdn->get_linkage()->get_inode(); + + inodeno_t migrated_stray; + if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray()) + migrated_stray = in->ino(); MDSInternalContextBase::vec finished; if (r == 0) { // unfreeze+singleauth inode // hmm, do i really need to delay this? if (mdr->more()->is_inode_exporter) { - - CInode *in = destdnl->get_inode(); - // drop our pins // we exported, clear out any xlocks that we moved to another MDS set::iterator i = mdr->xlocks.begin(); @@ -8495,14 +8496,13 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r, auto bp = mdr->more()->inode_import.cbegin(); decode(peer_imported, bp); - dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl; - mdcache->migrator->finish_export_inode(destdnl->get_inode(), - mdr->slave_to_mds, peer_imported, finished); + dout(10) << " finishing inode export on " << *in << dendl; + mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished); mds->queue_waiters(finished); // this includes SINGLEAUTH waiters. // unfreeze - assert(destdnl->get_inode()->is_frozen_inode()); - destdnl->get_inode()->unfreeze_inode(finished); + assert(in->is_frozen_inode()); + in->unfreeze_inode(finished); } // singleauth @@ -8538,8 +8538,8 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r, // witness list from the master, and they failed before we tried prep again. if (mdr->more()->rollback_bl.length()) { if (mdr->more()->is_inode_exporter) { - dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl; - destdnl->get_inode()->abort_export(); + dout(10) << " reversing inode export of " << *in << dendl; + in->abort_export(); } if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) { mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds); @@ -8562,6 +8562,9 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r, mdcache->request_finish(mdr); } } + + if (migrated_stray && mds->is_stopping()) + mdcache->shutdown_export_stray_finish(migrated_stray); } void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime, diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc index 08c64f61c2042..ffc96aa084c95 100644 --- a/src/mds/StrayManager.cc +++ b/src/mds/StrayManager.cc @@ -263,9 +263,13 @@ void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *l } // drop inode + inodeno_t ino = in->ino(); if (in->is_dirty()) in->mark_clean(); - in->mdcache->remove_inode(in); + mds->mdcache->remove_inode(in); + + if (mds->is_stopping()) + mds->mdcache->shutdown_export_stray_finish(ino); } void StrayManager::enqueue(CDentry *dn, bool trunc) @@ -465,7 +469,7 @@ bool StrayManager::_eval_stray(CDentry *dn, bool delay) return false; // not until some snaps are deleted. } - in->mdcache->clear_dirty_bits_for_stray(in); + mds->mdcache->clear_dirty_bits_for_stray(in); if (!in->remote_parents.empty()) { // unlink any stale remote snap dentry. @@ -748,11 +752,15 @@ void StrayManager::_truncate_stray_logged(CDentry *dn, LogSegment *ls) dout(10) << __func__ << ": " << *dn << " " << *in << dendl; + in->pop_and_dirty_projected_inode(ls); + + in->state_clear(CInode::STATE_PURGING); dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED); dn->put(CDentry::PIN_PURGING); - in->pop_and_dirty_projected_inode(ls); - eval_stray(dn); + + if (!dn->state_test(CDentry::STATE_PURGING) && mds->is_stopping()) + mds->mdcache->shutdown_export_stray_finish(in->ino()); } -- 2.39.5