From c18a3631d7add157961ddc0c5e7e0e2d5d788460 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 9 Jun 2020 15:15:00 -0700 Subject: [PATCH] mds: replicate random pin state This is slightly evil in its current form. The MDS should use locks to transmit state changes but right now the state is just set when the CInode is replicated. Replication of this state marker is necessary for failover situations where we want the randomly pinned subtree to remain pinned across failovers. Note: this problem does not exist for the ephemeral distributed pins because simple knowledge of the immediate parent's setting (which is replicated normally) is sufficient to determine if the CInode is ephemerally distributed. Ditto for regular export pins. Signed-off-by: Patrick Donnelly (cherry picked from commit 306003c51b1070a10c8e3818f67117d987640ba0) Conflicts: src/mds/CInode.h --- src/mds/CDir.cc | 2 +- src/mds/CInode.cc | 54 +++++++++++++++++++++++++++++++++++++------ src/mds/CInode.h | 29 +++++++++++++++++++++-- src/mds/MDBalancer.cc | 11 ++++++--- src/mds/MDCache.cc | 24 +++++++++++++++++-- src/mds/Server.cc | 2 +- src/mds/journal.cc | 2 +- 7 files changed, 107 insertions(+), 17 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index eb525cab75650..67f46df98a51b 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1857,7 +1857,7 @@ CDentry *CDir::_load_dentry( if (in->inode.is_dirty_rstat()) in->mark_dirty_rstat(); - in->maybe_ephemeral_rand(); + in->maybe_ephemeral_rand(true); //in->hack_accessed = false; //in->hack_load_stamp = ceph_clock_now(); //num_new_inodes_loaded++; diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index d1cef4101f183..e8b0b69e46f0b 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -835,8 +835,7 @@ CDir *CInode::add_dirfrag(CDir *dir) dir->get(CDir::PIN_STICKY); } - maybe_export_pin(); - maybe_ephemeral_dist(); + maybe_pin(); return dir; } @@ -4273,9 +4272,20 @@ void CInode::decode_import(bufferlist::const_iterator& p, 
_decode_base(p); - unsigned s; - decode(s, p); - state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED)); + { + unsigned s; + decode(s, p); + s &= MASK_STATE_EXPORTED; + + if (s & STATE_RANDEPHEMERALPIN) { + set_ephemeral_rand(true); + } + if (s & STATE_DISTEPHEMERALPIN) { + set_ephemeral_dist(true); + } + + state_set(STATE_AUTH | s); + } if (is_dirty()) { get(PIN_DIRTY); @@ -5338,7 +5348,7 @@ void CInode::set_ephemeral_rand(bool yes) } } -void CInode::maybe_ephemeral_rand() +void CInode::maybe_ephemeral_rand(bool fresh) { if (!mdcache->get_export_ephemeral_random_config()) { dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl; @@ -5356,6 +5366,8 @@ void CInode::maybe_ephemeral_rand() dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl; queue_export_pin(mdcache->hash_into_rank_bucket(ino())); return; + } else if (!fresh) { + return; } double threshold = get_ephemeral_rand(); @@ -5392,6 +5404,34 @@ void CInode::set_export_pin(mds_rank_t rank) get_projected_inode()->export_pin = rank; } +void CInode::check_pin_policy() +{ + const CInode *in = this; + mds_rank_t etarget = MDS_RANK_NONE; + while (true) { + if (in->is_system()) + break; + const CDentry *pdn = in->get_parent_dn(); + if (!pdn) + break; + if (in->get_inode().nlink == 0) { + // ignore export pin for unlinked directory + return; + } else if (etarget != MDS_RANK_NONE && in->has_ephemeral_policy()) { + return; + } else if (in->get_inode().export_pin >= 0) { + /* clear any epin policy */ + set_ephemeral_dist(false); + set_ephemeral_rand(false); + return; + } else if (etarget == MDS_RANK_NONE && in->is_ephemerally_pinned()) { + /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. 
*/ + etarget = mdcache->hash_into_rank_bucket(in->ino()); + } + in = pdn->get_dir()->inode; + } +} + mds_rank_t CInode::get_export_pin(bool inherit, bool ephemeral) const { /* An inode that is export pinned may not necessarily be a subtree root, we @@ -5410,7 +5450,7 @@ mds_rank_t CInode::get_export_pin(bool inherit, bool ephemeral) const if (in->get_inode().nlink == 0) { // ignore export pin for unlinked directory return MDS_RANK_NONE; - } else if (etarget != MDS_RANK_NONE && (in->get_inode().export_ephemeral_random_pin > 0.0 || in->get_inode().export_ephemeral_distributed_pin)) { + } else if (etarget != MDS_RANK_NONE && in->has_ephemeral_policy()) { return etarget; } else if (in->get_inode().export_pin >= 0) { return in->get_inode().export_pin; diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 5b297a5d3678e..22cdc6e7bef32 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -327,12 +327,25 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter 0.0 || + get_inode().export_ephemeral_distributed_pin; + } bool is_ephemerally_pinned() const { return state_test(STATE_DISTEPHEMERALPIN) || state_test(STATE_RANDEPHEMERALPIN); } bool is_exportable(mds_rank_t dest) const; + void maybe_pin() { + maybe_export_pin(); + maybe_ephemeral_dist(); + maybe_ephemeral_rand(); + } + void print(ostream& out) override; void dump(Formatter *f, int flags = DUMP_DEFAULT) const; diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index c3bac659ecf1d..76241bcb4bc37 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -99,6 +99,8 @@ void MDBalancer::handle_export_pins(void) auto cur = it++; CInode *in = *cur; ceph_assert(in->is_dir()); + + in->check_pin_policy(); mds_rank_t export_pin = in->get_export_pin(false); if (export_pin >= mds->mdsmap->get_max_mds()) { dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl; @@ -179,9 +181,12 @@ void MDBalancer::handle_export_pins(void) dendl; } - if (export_pin >= 0 && export_pin < 
mds->mdsmap->get_max_mds() - && export_pin != mds->get_nodeid()) { - mds->mdcache->migrator->export_dir(cd, export_pin); + if (export_pin >= 0 && export_pin < mds->mdsmap->get_max_mds()) { + if (export_pin == mds->get_nodeid()) { + cd->get_inode()->check_pin_policy(); + } else { + mds->mdcache->migrator->export_dir(cd, export_pin); + } } } } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 1241f1d4aecbe..6a9b314c92b4d 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -957,6 +957,11 @@ void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_p } } + if (dir->is_auth()) { + /* do this now that we are auth for the CDir */ + dir->inode->maybe_pin(); + } + show_subtrees(); } @@ -10759,7 +10764,7 @@ void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl, uint64_t features) { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ceph_assert(in->is_auth()); encode(in->inode.ino, bl); // bleh, minor assymetry here encode(in->last, bl); @@ -10769,6 +10774,10 @@ void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl, in->_encode_base(bl, features); in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE); + + __u32 state = in->state; + encode(state, bl); + ENCODE_FINISH(bl); } @@ -10865,7 +10874,7 @@ void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished) { - DECODE_START(1, p); + DECODE_START(2, p); inodeno_t ino; snapid_t last; __u32 nonce; @@ -10899,6 +10908,17 @@ void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, C if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in) dout(10) << __func__ << " different linkage in dentry " << *dn << dendl; } + + if (struct_v >= 2) { + __u32 s; + decode(s, p); + 
s &= CInode::MASK_STATE_REPLICATED; + if (s & CInode::STATE_RANDEPHEMERALPIN) { + dout(10) << "replica inode is random ephemeral pinned" << dendl; + in->set_ephemeral_rand(true); + } + } + DECODE_FINISH(p); } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index d9d30656fefe5..2899031a880e9 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -5976,7 +5976,7 @@ public: } else if (newi->inode.is_dir()) { // We do this now so that the linkages on the new directory are stable. newi->maybe_ephemeral_dist(); - newi->maybe_ephemeral_rand(); + newi->maybe_ephemeral_rand(true); } // hit pop diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 7468134baffe0..ac905009295ad 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -530,7 +530,7 @@ void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in) if (is_export_ephemeral_random()) { dout(15) << "random ephemeral pin on " << *in << dendl; in->set_ephemeral_rand(true); - in->maybe_ephemeral_rand(); + in->maybe_ephemeral_rand(true); } in->maybe_ephemeral_dist(); in->maybe_export_pin(); -- 2.39.5