From: Patrick Donnelly Date: Sat, 7 Mar 2020 03:20:58 +0000 (-0800) Subject: mds: finish implementation of ephemeral pins X-Git-Tag: wip-pdonnell-testing-20200918.022351~870^2~10 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=703e4b01fa83f81cda4fdc46b91f24a5e92f95f0;p=ceph-ci.git mds: finish implementation of ephemeral pins Signed-off-by: Patrick Donnelly --- diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 17f05514dd0..8cf2cb2cdf8 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1857,7 +1857,7 @@ CDentry *CDir::_load_dentry( if (in->inode.is_dirty_rstat()) in->mark_dirty_rstat(); - in->maybe_export_ephemeral_random_pin(true); + in->maybe_ephemeral_rand(); //in->hack_accessed = false; //in->hack_load_stamp = ceph_clock_now(); //num_new_inodes_loaded++; diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 7b37e56c7ee..57d56490786 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -273,6 +273,12 @@ ostream& operator<<(ostream& out, const CInode& in) if (in.inode.export_pin != MDS_RANK_NONE) { out << " export_pin=" << in.inode.export_pin; } + if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) { + out << " distepin"; + } + if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) { + out << " randepin"; + } out << " " << &in; out << "]"; @@ -443,19 +449,25 @@ CInode::projected_inode &CInode::project_inode(bool xattr, bool snap) void CInode::pop_and_dirty_projected_inode(LogSegment *ls) { ceph_assert(!projected_nodes.empty()); - auto &front = projected_nodes.front(); + auto& front = projected_nodes.front(); + dout(15) << __func__ << " " << front.inode.ino << " v" << front.inode.version << dendl; + int64_t old_pool = inode.layout.pool_id; + bool pin_update = inode.export_pin != front.inode.export_pin; + bool dist_update = inode.export_ephemeral_distributed_pin + != front.inode.export_ephemeral_distributed_pin; mark_dirty(front.inode.version, ls); - bool new_export_pin = inode.export_pin != front.inode.export_pin; - inode = front.inode; - if
(new_export_pin) + + inode = std::move(front.inode); + + if (pin_update) maybe_export_pin(true); + if (dist_update) + maybe_ephemeral_dist_children(true); - if (front.inode.version == 1) - maybe_export_ephemeral_random_pin(); if (inode.is_backtrace_updated()) mark_dirty_parent(ls, old_pool != inode.layout.pool_id); @@ -824,6 +836,7 @@ CDir *CInode::add_dirfrag(CDir *dir) } maybe_export_pin(); + maybe_ephemeral_dist(); return dir; } @@ -2009,7 +2022,7 @@ void CInode::decode_lock_iflock(bufferlist::const_iterator& p) void CInode::encode_lock_ipolicy(bufferlist& bl) { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); if (inode.is_dir()) { encode(inode.version, bl); encode(inode.ctime, bl); @@ -2024,7 +2037,7 @@ void CInode::encode_lock_ipolicy(bufferlist& bl) void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); if (inode.is_dir()) { decode(inode.version, p); utime_t tm; @@ -2032,13 +2045,19 @@ void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p) if (inode.ctime < tm) inode.ctime = tm; decode(inode.layout, p); decode(inode.quota, p); - mds_rank_t old_pin = inode.export_pin; - decode(inode.export_pin, p); - maybe_export_pin(old_pin != inode.export_pin); - bool old_ephemeral_pin = inode.export_ephemeral_distributed_pin; - decode(inode.export_ephemeral_distributed_pin, p); - maybe_export_ephemeral_distributed_pin(old_ephemeral_pin != inode.export_ephemeral_distributed_pin); - decode(inode.export_ephemeral_random_pin, p); + { + mds_rank_t old_pin = inode.export_pin; + decode(inode.export_pin, p); + maybe_export_pin(old_pin != inode.export_pin); + } + if (struct_v >= 2) { + { + bool old_ephemeral_pin = inode.export_ephemeral_distributed_pin; + decode(inode.export_ephemeral_distributed_pin, p); + maybe_ephemeral_dist_children(old_ephemeral_pin != inode.export_ephemeral_distributed_pin); + } + decode(inode.export_ephemeral_random_pin, p); + } } DECODE_FINISH(p); } @@ -5198,32 +5217,21 @@ int64_t 
CInode::get_backtrace_pool() const } } -void CInode::maybe_export_pin(bool update) +void CInode::queue_export_pin(mds_rank_t target) { - if (!g_conf()->mds_bal_export_pin) - return; - if (!is_dir() || !is_normal()) - return; - - mds_rank_t export_pin = get_export_pin(false); - if (export_pin == MDS_RANK_NONE && !update) { - maybe_export_ephemeral_distributed_pin(); - return; - } - if (state_test(CInode::STATE_QUEUEDEXPORTPIN)) return; bool queue = false; - for (auto p = dirfrags.begin(); p != dirfrags.end(); p++) { - CDir *dir = p->second; + for (auto& p : dirfrags) { + CDir *dir = p.second; if (!dir->is_auth()) continue; - if (export_pin != MDS_RANK_NONE) { + if (target != MDS_RANK_NONE) { if (dir->is_subtree_root()) { // set auxsubtree bit or export it if (!dir->state_test(CDir::STATE_AUXSUBTREE) || - export_pin != dir->get_dir_auth().first) + target != dir->get_dir_auth().first) queue = true; } else { // create aux subtree or export it @@ -5241,123 +5249,176 @@ void CInode::maybe_export_pin(bool update) } } -void CInode::maybe_export_ephemeral_random_pin(bool update) +void CInode::maybe_export_pin(bool update) { - bool export_ephemeral_random_config = mdcache->get_export_ephemeral_random_config(); + if (!g_conf()->mds_bal_export_pin) + return; + if (!is_dir() || !is_normal()) + return; - //If the config isn't set then return - if (!export_ephemeral_random_config) + dout(15) << __func__ << " update=" << update << " " << *this << dendl; + + mds_rank_t export_pin = get_export_pin(false, false); + if (export_pin == MDS_RANK_NONE && !update) { return; + } - //Check if it's already ephemerally pinned - if (is_export_ephemeral_random_pinned && !update) - return; + /* disable ephemeral pins */ + set_ephemeral_dist(false); + set_ephemeral_rand(false); + queue_export_pin(export_pin); +} - if (export_ephemeral_random_config) { - double export_ephemeral_random_pin = get_export_ephemeral_random_pin(false); - if ((update || export_ephemeral_random_pin >= - 
ceph::util::generate_random_number(0.0, 1.0)) - && is_export_ephemeral_distributed_pinned == false) { - - dout(10) << "I'm here under ephemeral random because is_export_ephemeral_distributed is" << is_export_ephemeral_distributed_pinned << dendl; - - is_export_ephemeral_random_migrating = true; - - bool queue = false; - for (auto& p : dirfrags) { - CDir *dir = p.second; - if (!dir->is_auth()) - continue; - if (dir->is_subtree_root()) { - // set auxsubtree bit or export it - if (!dir->state_test(CDir::STATE_AUXSUBTREE) || - mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) != dir->get_dir_auth().first) - queue = true; - } else { - // create aux subtree or export it - queue = true; - } - if (queue) { - if (mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) == mdcache->mds->get_nodeid()) - mdcache->ephemeral_pin(ephemeral_pin_inode); - state_set(CInode::STATE_QUEUEDEXPORTPIN); - mdcache->export_pin_queue.insert(this); - break; - } - } - return; +void CInode::set_ephemeral_dist(bool yes) +{ + if (yes) { + if (!state_test(CInode::STATE_DISTEPHEMERALPIN)) { + state_set(CInode::STATE_DISTEPHEMERALPIN); + auto p = mdcache->dist_ephemeral_pins.insert(this); + ceph_assert(p.second); + } + } else { + /* avoid std::set::erase if unnecessary */ + if (state_test(CInode::STATE_DISTEPHEMERALPIN)) { + dout(10) << "clearing ephemeral distributed pin on " << *this << dendl; + state_clear(CInode::STATE_DISTEPHEMERALPIN); + auto count = mdcache->dist_ephemeral_pins.erase(this); + ceph_assert(count == 1); + queue_export_pin(MDS_RANK_NONE); } } } -void CInode::maybe_export_ephemeral_distributed_pin(bool update) +void CInode::maybe_ephemeral_dist(bool update) { - bool export_ephemeral_distributed_config = mdcache->get_export_ephemeral_distributed_config(); - - //If both the configs aren't set then return - if (!export_ephemeral_distributed_config) + if (!mdcache->get_export_ephemeral_distributed_config()) { + dout(15) << __func__ << " 
config false: cannot ephemeral distributed pin " << *this << dendl; + set_ephemeral_dist(false); + return; + } else if (!is_dir() || !is_normal()) { + dout(15) << __func__ << " !dir or !normal: cannot ephemeral distributed pin " << *this << dendl; + set_ephemeral_dist(false); + return; + } else if (get_inode().nlink == 0) { + dout(15) << __func__ << " unlinked directory: cannot ephemeral distributed pin " << *this << dendl; + set_ephemeral_dist(false); + return; + } else if (!update && state_test(CInode::STATE_DISTEPHEMERALPIN)) { + dout(15) << __func__ << " requeueing already pinned " << *this << dendl; + queue_export_pin(mdcache->hash_into_rank_bucket(ino())); return; + } - //Check if it's already ephemerally pinned - if (is_export_ephemeral_distributed_pinned && !update) - return; + dout(15) << __func__ << " update=" << update << " " << *this << dendl; - if (export_ephemeral_distributed_config) { - CDentry *pdn = get_parent_dn(); + auto dir = get_parent_dir(); + if (!dir) { + return; + } - if (!pdn) { - return; - } + bool pin = dir->get_inode()->get_inode().export_ephemeral_distributed_pin; + if (pin) { + dout(10) << __func__ << " ephemeral distributed pinning " << *this << dendl; + set_ephemeral_dist(true); + queue_export_pin(mdcache->hash_into_rank_bucket(ino())); + } else if (update) { + set_ephemeral_dist(false); + queue_export_pin(MDS_RANK_NONE); + } +} + +void CInode::maybe_ephemeral_dist_children(bool update) +{ + if (!mdcache->get_export_ephemeral_distributed_config()) { + dout(15) << __func__ << " config false: cannot ephemeral distributed pin " << *this << dendl; + return; + } else if (!is_dir() || !is_normal()) { + dout(15) << __func__ << " !dir or !normal: cannot ephemeral distributed pin " << *this << dendl; + return; + } else if (get_inode().nlink == 0) { + dout(15) << __func__ << " unlinked directory: cannot ephemeral distributed pin " << *this << dendl; + return; + } - auto dir = pdn->get_dir(); + bool pin = 
get_inode().export_ephemeral_distributed_pin; + /* FIXME: expensive to iterate children when not updating */ + if (!pin && !update) { + return; + } - if (get_export_ephemeral_distributed_pin() && dir->get_num_head_items()) { - for (auto& bound : bounds) { - bound->maybe_export_ephemeral_distributed_pin(); + dout(10) << __func__ << " maybe ephemerally pinning children of " << *this << dendl; + for (auto& p : dirfrags) { + auto& dir = p.second; + for (auto& q : *dir) { + auto& dn = q.second; + auto&& in = dn->get_linkage()->get_inode(); + if (in && in->is_dir()) { + in->maybe_ephemeral_dist(update); } } + } +} - else if (update || (dir->get_inode()->get_export_ephemeral_distributed_pin())) { - is_export_ephemeral_distributed_migrating = true; - - bool queue = false; - for (auto& p : dirfrags) { - CDir *dir = p.second; - if (!dir->is_auth()) - continue; - if (dir->is_subtree_root()) { - // set auxsubtree bit or export it - if (!dir->state_test(CDir::STATE_AUXSUBTREE) || - mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) != dir->get_dir_auth().first) - queue = true; - } else { - // create aux subtree or export it - queue = true; - } - if (queue) { - dout(10) << "max_mds is" << mdcache->mds->mdsmap->get_max_mds() << "and target mds is:" << mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) << dendl; - if (mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) == mdcache->mds->get_nodeid()) { - mdcache->ephemeral_pin(ephemeral_pin_inode); - dout(10) << "Inside if inside the else" << dendl; - } - state_set(CInode::STATE_QUEUEDEXPORTPIN); - mdcache->export_pin_queue.insert(this); - break; - } - } - return; +void CInode::set_ephemeral_rand(bool yes) +{ + if (yes) { + if (!state_test(CInode::STATE_RANDEPHEMERALPIN)) { + state_set(CInode::STATE_RANDEPHEMERALPIN); + auto p = mdcache->rand_ephemeral_pins.insert(this); + ceph_assert(p.second); } + } else { + if (state_test(CInode::STATE_RANDEPHEMERALPIN)) { + 
dout(10) << "clearing ephemeral random pin on " << *this << dendl; + state_clear(CInode::STATE_RANDEPHEMERALPIN); + auto count = mdcache->rand_ephemeral_pins.erase(this); + ceph_assert(count == 1); + queue_export_pin(MDS_RANK_NONE); + } + } +} + +void CInode::maybe_ephemeral_rand() +{ + if (!mdcache->get_export_ephemeral_random_config()) { + dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl; + set_ephemeral_rand(false); + return; + } else if (!is_dir() || !is_normal()) { + dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl; + set_ephemeral_rand(false); + return; + } else if (get_inode().nlink == 0) { + dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl; + set_ephemeral_rand(false); + return; + } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) { + dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl; + queue_export_pin(mdcache->hash_into_rank_bucket(ino())); + return; + } + + double threshold = get_ephemeral_rand(); + double n = ceph::util::generate_random_number(0.0, 1.0); + + dout(15) << __func__ << " rand " << n << " <?= " << threshold + << " " << *this << dendl; + + if (n <= threshold) { + dout(10) << __func__ << " randomly export pinning " << *this << dendl; + set_ephemeral_rand(true); + queue_export_pin(mdcache->hash_into_rank_bucket(ino())); } } -void CInode::set_export_ephemeral_random_pin(double probability) +void CInode::setxattr_ephemeral_rand(double probability) { ceph_assert(is_dir()); ceph_assert(is_projected()); get_projected_inode()->export_ephemeral_random_pin = probability; } -void CInode::set_export_ephemeral_distributed_pin(bool val) +void CInode::setxattr_ephemeral_dist(bool val) { ceph_assert(is_dir()); ceph_assert(is_projected()); @@ -5371,7 +5432,7 @@ void CInode::set_export_pin(mds_rank_t rank) get_projected_inode()->export_pin = rank; } -mds_rank_t CInode::get_export_pin(bool inherit) const +mds_rank_t CInode::get_export_pin(bool inherit, bool ephemeral) const { /* An inode that is export pinned may not necessarily be a subtree root, we * need to traverse the parents. 
A base or system inode cannot be pinned. @@ -5379,30 +5440,37 @@ mds_rank_t CInode::get_export_pin(bool inherit) const * have a parent yet. */ const CInode *in = this; + mds_rank_t etarget = MDS_RANK_NONE; while (true) { if (in->is_system()) break; const CDentry *pdn = in->get_parent_dn(); if (!pdn) break; - // ignore export pin for unlinked directory - if (in->get_inode().nlink == 0) - break; - if (in->get_inode().export_pin >= 0) + if (in->get_inode().nlink == 0) { + // ignore export pin for unlinked directory + return MDS_RANK_NONE; + } else if (etarget != MDS_RANK_NONE && (in->get_inode().export_ephemeral_random_pin > 0.0 || in->get_inode().export_ephemeral_distributed_pin)) { + return etarget; + } else if (in->get_inode().export_pin >= 0) { return in->get_inode().export_pin; + } else if (etarget == MDS_RANK_NONE && ephemeral && in->is_ephemerally_pinned()) { + /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */ + etarget = mdcache->hash_into_rank_bucket(in->ino()); + if (!inherit) return etarget; + } - if (!inherit) + if (!inherit) { break; + } in = pdn->get_dir()->inode; } return MDS_RANK_NONE; } -double CInode::get_export_ephemeral_random_pin(bool inherit) const +double CInode::get_ephemeral_rand(bool inherit) const { - /* An inode that is export pinned may not necessarily be a subtree root, we - * need to traverse the parents. A base or system inode cannot be pinned. - * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not + /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not * have a parent yet. 
*/ const CInode *in = this; @@ -5415,22 +5483,21 @@ double CInode::get_export_ephemeral_random_pin(bool inherit) const // ignore export pin for unlinked directory if (in->get_inode().nlink == 0) break; - if (in->get_inode().export_ephemeral_random_pin >= 0) + + if (in->get_inode().export_ephemeral_random_pin > 0.0) return in->get_inode().export_ephemeral_random_pin; + /* An export_pin overrides only if no closer parent (incl. this one) has a + * random pin set. + */ + if (in->get_inode().export_pin >= 0) + return 0.0; + if (!inherit) break; in = pdn->get_dir()->inode; } - return 0; -} - -bool CInode::get_export_ephemeral_distributed_pin() const -{ - if (get_inode().export_ephemeral_distributed_pin) - return get_inode().export_ephemeral_distributed_pin; - else - return false; + return 0.0; } bool CInode::is_exportable(mds_rank_t dest) const diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 6d052a38139..0c0d97ff087 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -325,6 +325,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter> batch_ops; - bool is_export_ephemeral_distributed_pinned = false; - bool is_export_ephemeral_random_pinned = false; - - bool is_export_ephemeral_distributed_migrating = false; - bool is_export_ephemeral_random_migrating = false; - - void finish_export_ephemeral_distributed_migration() { - is_export_ephemeral_distributed_migrating = false; - is_export_ephemeral_distributed_pinned = true; - } - - void finish_export_ephemeral_random_migration() { - is_export_ephemeral_random_migrating = false; - is_export_ephemeral_random_pinned = true; - } - std::string_view pin_name(int p) const override; std::ostream& print_db_line_prefix(std::ostream& out) override; @@ -925,15 +912,31 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter::item dirty_rstat_item; - elist::item ephemeral_pin_inode; - mempool::mds_co::set client_snap_caps; mempool::mds_co::compact_map > client_need_snapflush; diff 
--git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index 537d52af9b6..e4de95ec0f4 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -99,14 +99,7 @@ void MDBalancer::handle_export_pins(void) auto cur = it++; CInode *in = *cur; ceph_assert(in->is_dir()); - mds_rank_t export_pin = MDS_RANK_NONE; - // Making sure the ephemeral pin does not override export pin - if (in->get_export_pin(false) != MDS_RANK_NONE) - export_pin = in->get_export_pin(false); - else if (in->is_export_ephemeral_distributed_migrating || in->is_export_ephemeral_random_migrating) { - export_pin = mds->mdcache->hash_into_rank_bucket(in->ino(), mds->mdsmap->get_max_mds()); - dout(10) << "Ephemeral export pin set on" << *in << dendl; - } + mds_rank_t export_pin = in->get_export_pin(false); if (export_pin >= mds->mdsmap->get_max_mds()) { dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl; in->state_clear(CInode::STATE_QUEUEDEXPORTPIN); diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h index 2e8fef16779..d8834fce43c 100644 --- a/src/mds/MDBalancer.h +++ b/src/mds/MDBalancer.h @@ -49,6 +49,8 @@ public: */ void tick(); + void handle_export_pins(void); + void subtract_export(CDir *ex); void add_import(CDir *im); void adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc); @@ -86,8 +88,6 @@ private: void prep_rebalance(int beat); int mantle_prep_rebalance(); - void handle_export_pins(void); - mds_load_t get_load(); int localize_balancer(); void send_heartbeat(); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index b9de75ff1ea..b06d4cc2f92 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -140,7 +140,6 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) : filer(m->objecter, m->finisher), stray_manager(m, purge_queue_), recovery_queue(m), - ephemeral_pins(member_offset(CInode, ephemeral_pin_inode)), trim_counter(g_conf().get_val("mds_cache_trim_decay_rate")) { migrator.reset(new Migrator(mds, this)); @@ -217,14 +216,33 @@ MDCache::~MDCache() 
void MDCache::handle_conf_change(const std::set& changed, const MDSMap& mdsmap) { + dout(20) << "config changes: " << changed << dendl; if (changed.count("mds_cache_memory_limit")) cache_memory_limit = g_conf().get_val("mds_cache_memory_limit"); if (changed.count("mds_cache_reservation")) cache_reservation = g_conf().get_val("mds_cache_reservation"); - if (changed.count("mds_export_ephemeral_distributed")) + if (changed.count("mds_export_ephemeral_distributed")) { export_ephemeral_distributed_config = g_conf().get_val("mds_export_ephemeral_distributed"); - if (changed.count("mds_export_ephemeral_random")) + dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl; + /* copy to vector to avoid removals during iteration */ + std::vector migrate; + migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end()); + for (auto& in : migrate) { + in->maybe_ephemeral_dist(); + } + mds->balancer->handle_export_pins(); + } + if (changed.count("mds_export_ephemeral_random")) { export_ephemeral_random_config = g_conf().get_val("mds_export_ephemeral_random"); + dout(10) << "Migrating any ephemeral random pinned inodes" << dendl; + /* copy to vector to avoid removals during iteration */ + std::vector migrate; + migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end()); + for (auto& in : migrate) { + in->maybe_ephemeral_rand(); + } + mds->balancer->handle_export_pins(); + } if (changed.count("mds_health_cache_threshold")) cache_health_threshold = g_conf().get_val("mds_health_cache_threshold"); if (changed.count("mds_cache_mid")) @@ -309,6 +327,8 @@ void MDCache::add_inode(CInode *in) if (cache_toofull()) { exceeded_size_limit = true; } + + in->maybe_ephemeral_dist(false); } void MDCache::remove_inode(CInode *o) @@ -331,14 +351,15 @@ void MDCache::remove_inode(CInode *o) o->item_open_file.remove_myself(); - o->ephemeral_pin_inode.remove_myself(); - if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN)) export_pin_queue.erase(o); if 
(o->state_test(CInode::STATE_DELAYEDEXPORTPIN)) export_pin_delayed_queue.erase(o); + o->set_ephemeral_dist(false); + o->set_ephemeral_rand(false); + // remove from inode map if (o->last == CEPH_NOSNAP) { inode_map.erase(o->ino()); @@ -872,8 +893,9 @@ MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info) /* * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf */ -mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, mds_rank_t max_mds) +mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino) { + const mds_rank_t max_mds = mds->mdsmap->get_max_mds(); uint64_t hash = rjhash64(ino); int64_t b = -1, j = 0; while (j < max_mds) { @@ -7837,47 +7859,44 @@ bool MDCache::shutdown_pass() trim(UINT64_MAX); dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; + + { + dout(10) << "Migrating any ephemerally pinned inodes" << dendl; + /* copy to vector to avoid removals during iteration */ + std::vector migrate; + migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end()); + for (auto& in : migrate) { + in->maybe_ephemeral_rand(); + } + migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end()); + for (auto& in : migrate) { + in->maybe_ephemeral_dist(); + } + mds->balancer->handle_export_pins(); + } + // Export all subtrees to another active (usually rank 0) if not rank 0 int num_auth_subtree = 0; - if (!subtrees.empty() && - mds->get_nodeid() != 0) { - dout(7) << "looking for subtrees to export to mds0" << dendl; + if (!subtrees.empty() && mds->get_nodeid() != 0) { + dout(7) << "looking for subtrees to export" << dendl; std::vector ls; - for (map >::iterator it = subtrees.begin(); - it != subtrees.end(); - ++it) { - CDir *dir = it->first; - if (dir->get_inode()->is_mdsdir()) + for (auto& [dir, bounds] : subtrees) { + dout(10) << " examining " << *dir << " bounds " << bounds << dendl; + if (dir->get_inode()->is_mdsdir() || !dir->is_auth()) 
continue; - if (dir->is_auth()) { - num_auth_subtree++; - if (dir->is_frozen() || - dir->is_freezing() || - dir->is_ambiguous_dir_auth() || - dir->state_test(CDir::STATE_EXPORTING)) - continue; - ls.push_back(dir); + num_auth_subtree++; + if (dir->is_frozen() || + dir->is_freezing() || + dir->is_ambiguous_dir_auth() || + dir->state_test(CDir::STATE_EXPORTING) || + dir->get_inode()->is_ephemerally_pinned()) { + continue; } + ls.push_back(dir); } migrator->clear_export_queue(); - if (export_ephemeral_random_config || - export_ephemeral_distributed_config) { - dout(10) << "Migrating ephemerally pinned inodes due to shutdown" << dendl; - elist::iterator it = ephemeral_pins.begin(member_offset(CInode, ephemeral_pin_inode)); - while (!it.end()) { - if ((*it) == NULL || !((*it)->is_auth())) - dout(10) << "Inode is not auth to this rank" << dendl; - else { - dout(10) << "adding inode to export queue" << dendl; - (*it)->maybe_export_ephemeral_distributed_pin(true); - (*it)->maybe_export_ephemeral_random_pin(true); - } - ++it; - } - } - for (const auto& dir : ls) { mds_rank_t dest = dir->get_inode()->authority().first; if (dest > 0 && !mds->mdsmap->is_active(dest)) @@ -13360,6 +13379,9 @@ void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) { for (auto it = q.begin(); it != q.end(); ) { auto *in = *it; mds_rank_t export_pin = in->get_export_pin(false); + if (in->is_ephemerally_pinned()) { + dout(10) << "ephemeral export pin to " << export_pin << " for " << *in << dendl; + } dout(10) << " delayed export_pin=" << export_pin << " on " << *in << " max_mds=" << mdsmap.get_max_mds() << dendl; if (export_pin >= mdsmap.get_max_mds()) { @@ -13369,28 +13391,20 @@ void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) { in->state_clear(CInode::STATE_DELAYEDEXPORTPIN); it = q.erase(it); - in->maybe_export_pin(); + in->queue_export_pin(export_pin); } - /* Handle consistent hash ring during cluster resizes */ if (mdsmap.get_max_mds() != 
oldmap.get_max_mds()) { - dout(10) << "Checking ephemerally pinned directories for re-export due to max_mds change." << dendl; - auto it = ephemeral_pins.begin(member_offset(CInode, ephemeral_pin_inode)); - while (!it.end()) { - auto in = *it; - ++it; - // Migrate if the inodes hash elsewhere - if (hash_into_rank_bucket(in->ino(), mdsmap.get_max_mds()) != mds->get_nodeid()) { - if (in == NULL || !in->is_auth()) { - dout(10) << "Inode is not auth to this rank" << dendl; - // ++it; ??? - batrick - } - } else { - dout(10) << "adding inode to export queue" << dendl; - in->maybe_export_ephemeral_distributed_pin(true); - in->maybe_export_ephemeral_random_pin(true); - in->ephemeral_pin_inode.remove_myself(); - } + dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl; + /* copy to vector to avoid removals during iteration */ + std::vector migrate; + migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end()); + for (auto& in : migrate) { + in->maybe_ephemeral_rand(); + } + migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end()); + for (auto& in : migrate) { + in->maybe_ephemeral_dist(); } } } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 983e87e939b..1cce2ac33b5 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -235,7 +235,7 @@ class MDCache { stray_manager.eval_stray(dn); } - mds_rank_t hash_into_rank_bucket(inodeno_t ino, mds_rank_t max_mds); + mds_rank_t hash_into_rank_bucket(inodeno_t ino); void maybe_eval_stray(CInode *in, bool delay=false); void clear_dirty_bits_for_stray(CInode* diri); @@ -996,6 +996,8 @@ class MDCache { /* Because exports may fail, this set lets us keep track of inodes that need exporting. 
*/ std::set export_pin_queue; std::set export_pin_delayed_queue; + std::set rand_ephemeral_pins; + std::set dist_ephemeral_pins; OpenFileTable open_file_table; @@ -1329,8 +1331,6 @@ class MDCache { map fragments; - elist ephemeral_pins; - DecayCounter trim_counter; std::thread upkeeper; diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index a247c7134c0..fed587d5d33 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -2939,7 +2939,10 @@ void MDSRank::command_get_subtrees(Formatter *f) f->dump_bool("is_auth", dir->is_auth()); f->dump_int("auth_first", dir->get_dir_auth().first); f->dump_int("auth_second", dir->get_dir_auth().second); - f->dump_int("export_pin", dir->inode->get_export_pin()); + f->dump_int("export_pin", dir->inode->get_export_pin(false, false)); + f->dump_bool("distributed_ephemeral_pin", dir->inode->is_ephemeral_dist()); + f->dump_bool("random_ephemeral_pin", dir->inode->is_ephemeral_rand()); + f->dump_int("ephemeral_pin", mdcache->hash_into_rank_bucket(dir->inode->ino())); f->open_object_section("dir"); dir->dump(f); f->close_section(); @@ -3596,8 +3599,8 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_dump_cache_threshold_file", "mds_dump_cache_threshold_formatter", "mds_enable_op_tracker", - "mds_export_ephemeral_random" - "mds_export_ephemeral_distributed" + "mds_export_ephemeral_random", + "mds_export_ephemeral_distributed", "mds_health_cache_threshold", "mds_inject_migrator_session_race", "mds_log_pause", @@ -3649,6 +3652,8 @@ void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::s finisher->queue(new LambdaContext([this, changed](int) { std::scoped_lock lock(mds_lock); + dout(10) << "flushing conf change to components: " << changed << dendl; + if (changed.count("mds_log_pause") && !g_conf()->mds_log_pause) { mdlog->kick_submitter(); } diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index f5600031e79..61ffabde21e 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ 
-2236,11 +2236,6 @@ void Migrator::export_finish(CDir *dir) mut->cleanup(); } - if (dir->get_inode()->is_export_ephemeral_distributed_migrating) - dir->get_inode()->finish_export_ephemeral_distributed_migration(); - else if (dir->get_inode()->is_export_ephemeral_random_migrating) - dir->get_inode()->finish_export_ephemeral_random_migration(); - if (parent) child_export_finish(parent, true); @@ -3140,29 +3135,7 @@ void Migrator::import_finish(CDir *dir, bool notify, bool last) MutationRef mut = it->second.mut; import_state.erase(it); - // start the journal entry - EImportFinish *le = new EImportFinish(dir, true); - mds->mdlog->start_entry(le); - - CInode *in = dir->get_inode(); - - CDentry *pdn = in->get_parent_dn(); - - if (in->get_export_ephemeral_random_pin(false)) { // Lazy checks. FIXME - le->metablob.add_primary_dentry(pdn, in, false, false, false, false, - false, true); - in->is_export_ephemeral_random_pinned = true; - cache->ephemeral_pins.push_back(&in->ephemeral_pin_inode); - } else if (pdn->get_dir()->get_inode() - && pdn->get_dir()->get_inode()->get_export_ephemeral_distributed_pin()) { - le->metablob.add_primary_dentry(pdn, in, false, false, false, false, - true, false); - in->is_export_ephemeral_distributed_pinned = true; - cache->ephemeral_pins.push_back(&in->ephemeral_pin_inode); - } - - // log it - mds->mdlog->submit_entry(le); + mds->mdlog->start_submit_entry(new EImportFinish(dir, true)); // process delayed expires cache->process_delayed_expire(dir); @@ -3467,6 +3440,7 @@ void Migrator::decode_import_dir(bufferlist::const_iterator& blp, dir->verify_fragstat(); #endif + dir->inode->maybe_ephemeral_dist(); dir->inode->maybe_export_pin(); dout(7) << " done " << *dir << dendl; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 3d0e64f5018..58ff0fd0b05 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -5679,7 +5679,7 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur) return; auto &pi = cur->project_inode(); - 
cur->set_export_ephemeral_random_pin(val); + cur->setxattr_ephemeral_rand(val); pip = &pi.inode; } else if (name == "ceph.dir.pin.distributed"sv) { if (!cur->is_dir() || cur->is_root()) { @@ -5700,9 +5700,8 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur) return; auto &pi = cur->project_inode(); - cur->set_export_ephemeral_distributed_pin(val); + cur->setxattr_ephemeral_dist(val); pip = &pi.inode; - dout(10) << "Here is the distrib pin value" << pip->export_ephemeral_distributed_pin << dendl; } else { dout(10) << " unknown vxattr " << name << dendl; respond_to_request(mdr, -EINVAL); @@ -6007,8 +6006,13 @@ public: MDRequestRef null_ref; get_mds()->mdcache->send_dentry_link(dn, null_ref); - if (newi->inode.is_file()) + if (newi->inode.is_file()) { get_mds()->locker->share_inode_max_size(newi); + } else if (newi->inode.is_dir()) { + // We do this now so that the linkages on the new directory are stable. + newi->maybe_ephemeral_dist(); + newi->maybe_ephemeral_rand(); + } // hit pop get_mds()->balancer->hit_inode(newi, META_POP_IWR); diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 9052260c6d7..52bcce20e19 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -65,8 +65,7 @@ public: static const int STATE_DIRTYPARENT = (1<<1); static const int STATE_DIRTYPOOL = (1<<2); static const int STATE_NEED_SNAPFLUSH = (1<<3); - static const int STATE_EPHEMERAL_DISTRIBUTED = (1<<4); - static const int STATE_EPHEMERAL_RANDOM = (1<<5); + static const int STATE_EPHEMERAL_RANDOM = (1<<4); std::string dn; // dentry snapid_t dnfirst, dnlast; version_t dnv{0}; @@ -113,7 +112,6 @@ public: bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); } bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); } bool need_snapflush() const { return (state & STATE_NEED_SNAPFLUSH); } - bool is_export_ephemeral_distributed() const { return (state & STATE_EPHEMERAL_DISTRIBUTED); } bool is_export_ephemeral_random() const { 
return (state & STATE_EPHEMERAL_RANDOM); } void print(ostream& out) const { @@ -438,21 +436,22 @@ private: // return remote pointer to to-be-journaled inode void add_primary_dentry(CDentry *dn, CInode *in, bool dirty, bool dirty_parent=false, bool dirty_pool=false, - bool need_snapflush=false, bool export_ephemeral_distributed=false, - bool export_ephemeral_random=false) { + bool need_snapflush=false) { __u8 state = 0; if (dirty) state |= fullbit::STATE_DIRTY; if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT; if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL; if (need_snapflush) state |= fullbit::STATE_NEED_SNAPFLUSH; - if (export_ephemeral_distributed) state |= fullbit::STATE_EPHEMERAL_DISTRIBUTED; - if (export_ephemeral_random) state |= fullbit::STATE_EPHEMERAL_RANDOM; add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state); } void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) { if (!in) in = dn->get_projected_linkage()->get_inode(); + if (in->is_ephemeral_rand()) { + state |= fullbit::STATE_EPHEMERAL_RANDOM; + } + // make note of where this inode was last journaled in->last_journaled = event_seq; //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 49d8702a4a6..68b58495ae5 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -527,12 +527,13 @@ void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in) in->inode = inode; in->xattrs = xattrs; if (in->inode.is_dir()) { - in->is_export_ephemeral_distributed_pinned = is_export_ephemeral_distributed(); - in->is_export_ephemeral_random_pinned = is_export_ephemeral_random(); - dout(10) << "I'm in update_inode inside journal.cc and is_export_ephemeral_distrib for inode " << *in << "is" << in->is_export_ephemeral_distributed_pinned << dendl; - } - in->maybe_export_pin(); - if (in->inode.is_dir()) { + if (is_export_ephemeral_random()) { + dout(15) << "random ephemeral pin on " << *in << 
dendl; + in->set_ephemeral_rand(true); + in->maybe_ephemeral_rand(); + } + in->maybe_ephemeral_dist(); + in->maybe_export_pin(); if (!(in->dirfragtree == dirfragtree)) { dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> " << dirfragtree << " on " << *in << dendl; diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 9f3d3d73476..a0b689c37a5 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -818,7 +818,7 @@ void inode_t::dump(ceph::Formatter *f) const f->dump_unsigned("change_attr", change_attr); f->dump_int("export_pin", export_pin); f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin); - f->dump_int("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin); + f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin); f->open_array_section("client_ranges"); for (const auto &p : client_ranges) {