if (in->inode.is_dirty_rstat())
in->mark_dirty_rstat();
- in->maybe_ephemeral_rand();
+ in->maybe_ephemeral_rand(true);
//in->hack_accessed = false;
//in->hack_load_stamp = ceph_clock_now();
//num_new_inodes_loaded++;
dir->get(CDir::PIN_STICKY);
}
- maybe_export_pin();
- maybe_ephemeral_dist();
+ maybe_pin();
return dir;
}
_decode_base(p);
- unsigned s;
- decode(s, p);
- state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED));
+ {
+ unsigned s;
+ decode(s, p);
+ s &= MASK_STATE_EXPORTED;
+
+ if (s & STATE_RANDEPHEMERALPIN) {
+ set_ephemeral_rand(true);
+ }
+ if (s & STATE_DISTEPHEMERALPIN) {
+ set_ephemeral_dist(true);
+ }
+
+ state_set(STATE_AUTH | s);
+ }
if (is_dirty()) {
get(PIN_DIRTY);
}
}
-void CInode::maybe_ephemeral_rand()
+void CInode::maybe_ephemeral_rand(bool fresh)
{
if (!mdcache->get_export_ephemeral_random_config()) {
dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
return;
+ } else if (!fresh) {
+ return;
}
double threshold = get_ephemeral_rand();
get_projected_inode()->export_pin = rank;
}
+void CInode::check_pin_policy()
+{
+  /* Walk from this inode up through its ancestors (via each inode's parent
+   * dentry) and decide whether this inode's ephemeral pin markers should be
+   * cleared.  The walk ends at a system inode or at an inode that has no
+   * parent dentry.  Nothing is returned; the only visible effect is the
+   * pair of set_ephemeral_*(false) calls on this inode below.
+   */
+  mds_rank_t seen_epin_target = MDS_RANK_NONE;
+  for (const CInode *cur = this; !cur->is_system(); ) {
+    const CDentry *parent_dn = cur->get_parent_dn();
+    if (!parent_dn) {
+      break;
+    }
+    const bool seen_epin = (seen_epin_target != MDS_RANK_NONE);
+    if (cur->get_inode().nlink == 0) {
+      // ignore export pin for unlinked directory
+      return;
+    }
+    if (seen_epin && cur->has_ephemeral_policy()) {
+      // An ephemerally pinned inode was already seen lower in the walk and
+      // this ancestor carries an ephemeral policy: leave the state alone.
+      return;
+    }
+    if (cur->get_inode().export_pin >= 0) {
+      /* an export pin on this inode or an ancestor: clear any epin policy */
+      set_ephemeral_dist(false);
+      set_ephemeral_rand(false);
+      return;
+    }
+    if (!seen_epin && cur->is_ephemerally_pinned()) {
+      // Record the first ephemerally pinned inode met on the way up.  The
+      // hashed rank is only compared against MDS_RANK_NONE in this function.
+      seen_epin_target = mdcache->hash_into_rank_bucket(cur->ino());
+    }
+    cur = parent_dn->get_dir()->inode;
+  }
+}
+
mds_rank_t CInode::get_export_pin(bool inherit, bool ephemeral) const
{
/* An inode that is export pinned may not necessarily be a subtree root, we
if (in->get_inode().nlink == 0) {
// ignore export pin for unlinked directory
return MDS_RANK_NONE;
- } else if (etarget != MDS_RANK_NONE && (in->get_inode().export_ephemeral_random_pin > 0.0 || in->get_inode().export_ephemeral_distributed_pin)) {
+ } else if (etarget != MDS_RANK_NONE && in->has_ephemeral_policy()) {
return etarget;
} else if (in->get_inode().export_pin >= 0) {
return in->get_inode().export_pin;
static const int STATE_ORPHAN = STATE_NOTIFYREF;
static const int MASK_STATE_EXPORTED =
- (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
+ (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL|
+ STATE_DISTEPHEMERALPIN|STATE_RANDEPHEMERALPIN); /* the two ephemeral pin bits
+  * now pass this mask; the decode path in this change tests them after
+  * masking and calls set_ephemeral_dist(true) / set_ephemeral_rand(true).
+  */
static const int MASK_STATE_EXPORT_KEPT =
(STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|
STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT|STATE_DELAYEDEXPORTPIN|
STATE_DISTEPHEMERALPIN|STATE_RANDEPHEMERALPIN);
+ /* State bits that act as "permanent" markers and are passed between MDS
+ * ranks. Nothing protects or updates these bits the way a typical MDS lock
+ * would.
+ *
+ * Currently this mask is only used for REPLICATED inodes. The random
+ * ephemeral pin state has to be replicated because the directory inode
+ * remains under the authority of the parent subtree: it is not exported in
+ * the normal way, so the state cannot be passed along with an export. The
+ * importer of the dirfrags still needs to know that the inode is random
+ * pinned; otherwise it cannot tell that the dirfrags are pinned.
+ */
+ static const int MASK_STATE_REPLICATED = STATE_RANDEPHEMERALPIN;
+
// -- waiters --
static const uint64_t WAIT_DIR = (1<<0);
static const uint64_t WAIT_FROZEN = (1<<1);
void queue_export_pin(mds_rank_t target);
void maybe_export_pin(bool update=false);
+ void check_pin_policy();
+
void set_ephemeral_dist(bool yes);
void maybe_ephemeral_dist(bool update=false);
void maybe_ephemeral_dist_children(bool update=false);
double get_ephemeral_rand(bool inherit=true) const;
void set_ephemeral_rand(bool yes);
- void maybe_ephemeral_rand();
+ void maybe_ephemeral_rand(bool fresh=false);
void setxattr_ephemeral_rand(double prob=0.0);
bool is_ephemeral_rand() const {
return state_test(STATE_RANDEPHEMERALPIN);
}
+ bool has_ephemeral_policy() const {
+   // True when this inode itself carries an ephemeral pin setting: either a
+   // positive export_ephemeral_random_pin or a truthy
+   // export_ephemeral_distributed_pin.
+   if (get_inode().export_ephemeral_random_pin > 0.0) {
+     return true;
+   }
+   return get_inode().export_ephemeral_distributed_pin;
+ }
bool is_ephemerally_pinned() const {
return state_test(STATE_DISTEPHEMERALPIN) ||
state_test(STATE_RANDEPHEMERALPIN);
}
bool is_exportable(mds_rank_t dest) const;
+ void maybe_pin() { /* Calls the three maybe_* pin helpers on this inode in a
+  * fixed order: export pin, then distributed ephemeral pin, then random
+  * ephemeral pin. Each helper is invoked with its declared default argument.
+  */
+ maybe_export_pin();
+ maybe_ephemeral_dist();
+ maybe_ephemeral_rand(); // default fresh=false
+ }
+
void print(std::ostream& out) override;
void dump(ceph::Formatter *f, int flags = DUMP_DEFAULT) const;
auto cur = it++;
CInode *in = *cur;
ceph_assert(in->is_dir());
+
+ in->check_pin_policy();
mds_rank_t export_pin = in->get_export_pin(false);
if (export_pin >= mds->mdsmap->get_max_mds()) {
dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
dendl;
}
- if (export_pin >= 0 && export_pin < mds->mdsmap->get_max_mds()
- && export_pin != mds->get_nodeid()) {
- mds->mdcache->migrator->export_dir(cd, export_pin);
+ if (export_pin >= 0 && export_pin < mds->mdsmap->get_max_mds()) {
+ if (export_pin == mds->get_nodeid()) {
+ cd->get_inode()->check_pin_policy();
+ } else {
+ mds->mdcache->migrator->export_dir(cd, export_pin);
+ }
}
}
}
}
}
+ if (dir->is_auth()) {
+ /* do this now that we are auth for the CDir */
+ dir->inode->maybe_pin();
+ }
+
show_subtrees();
}
void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
uint64_t features)
{
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl); // first argument bumped to 2: the decoder reads the trailing state word only when struct_v >= 2
ceph_assert(in->is_auth());
encode(in->inode.ino, bl); // bleh, minor assymetry here
encode(in->last, bl);
in->_encode_base(bl, features);
in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);
+
+ __u32 state = in->state; // the whole state word is encoded; decode_replica_inode masks it with CInode::MASK_STATE_REPLICATED
+ encode(state, bl);
+
ENCODE_FINISH(bl);
}
void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
{
- DECODE_START(1, p);
+ DECODE_START(2, p);
inodeno_t ino;
snapid_t last;
__u32 nonce;
if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
}
+
+ if (struct_v >= 2) {
+ __u32 s;
+ decode(s, p);
+ s &= CInode::MASK_STATE_REPLICATED;
+ if (s & CInode::STATE_RANDEPHEMERALPIN) {
+ dout(10) << "replica inode is random ephemeral pinned" << dendl;
+ in->set_ephemeral_rand(true);
+ }
+ }
+
DECODE_FINISH(p);
}
} else if (newi->inode.is_dir()) {
// We do this now so that the linkages on the new directory are stable.
newi->maybe_ephemeral_dist();
- newi->maybe_ephemeral_rand();
+ newi->maybe_ephemeral_rand(true);
}
// hit pop
if (is_export_ephemeral_random()) {
dout(15) << "random ephemeral pin on " << *in << dendl;
in->set_ephemeral_rand(true);
- in->maybe_ephemeral_rand();
+ in->maybe_ephemeral_rand(true);
}
in->maybe_ephemeral_dist();
in->maybe_export_pin();