if (in->inode.is_dirty_rstat())
in->mark_dirty_rstat();
- in->maybe_export_ephemeral_random_pin(true);
+ in->maybe_ephemeral_rand();
//in->hack_accessed = false;
//in->hack_load_stamp = ceph_clock_now();
//num_new_inodes_loaded++;
if (in.inode.export_pin != MDS_RANK_NONE) {
out << " export_pin=" << in.inode.export_pin;
}
+ if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) { // inode carries the distributed ephemeral pin state bit
+ out << " distepin";
+ }
+ if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) { // inode carries the random ephemeral pin state bit
+ out << " randepin";
+ }
out << " " << &in;
out << "]";
void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
{
ceph_assert(!projected_nodes.empty());
- auto &front = projected_nodes.front();
+ auto& front = projected_nodes.front();
+
dout(15) << __func__ << " " << front.inode.ino
<< " v" << front.inode.version << dendl;
+
int64_t old_pool = inode.layout.pool_id;
+ bool pin_update = inode.export_pin != front.inode.export_pin; // compared before `inode` is overwritten below
+ bool dist_update = inode.export_ephemeral_distributed_pin // likewise for the distributed ephemeral pin flag
+ != front.inode.export_ephemeral_distributed_pin;
mark_dirty(front.inode.version, ls);
- bool new_export_pin = inode.export_pin != front.inode.export_pin;
- inode = front.inode;
- if (new_export_pin)
+
+ inode = std::move(front.inode); // front.inode is moved-from after this line; do not read it again
+
+ if (pin_update)
maybe_export_pin(true);
+ if (dist_update)
+ maybe_ephemeral_dist_children(true); // the flag is consulted by child directories, so re-evaluate them
- if (front.inode.version == 1)
- maybe_export_ephemeral_random_pin();
if (inode.is_backtrace_updated())
mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
}
maybe_export_pin();
+ maybe_ephemeral_dist();
return dir;
}
void CInode::encode_lock_ipolicy(bufferlist& bl)
{
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
if (inode.is_dir()) {
encode(inode.version, bl);
encode(inode.ctime, bl);
void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p)
{
- DECODE_START(1, p);
+ DECODE_START(2, p); // bumped to 2 together with the struct_v >= 2 branch below
if (inode.is_dir()) {
decode(inode.version, p);
utime_t tm;
if (inode.ctime < tm) inode.ctime = tm;
decode(inode.layout, p);
decode(inode.quota, p);
- mds_rank_t old_pin = inode.export_pin;
- decode(inode.export_pin, p);
- maybe_export_pin(old_pin != inode.export_pin);
- bool old_ephemeral_pin = inode.export_ephemeral_distributed_pin;
- decode(inode.export_ephemeral_distributed_pin, p);
- maybe_export_ephemeral_distributed_pin(old_ephemeral_pin != inode.export_ephemeral_distributed_pin);
- decode(inode.export_ephemeral_random_pin, p);
+ {
+ mds_rank_t old_pin = inode.export_pin;
+ decode(inode.export_pin, p);
+ maybe_export_pin(old_pin != inode.export_pin); // update argument is true only if the decoded pin differs
+ }
+ if (struct_v >= 2) { // ephemeral pin fields are decoded only for encoding version 2 or later
+ {
+ bool old_ephemeral_pin = inode.export_ephemeral_distributed_pin;
+ decode(inode.export_ephemeral_distributed_pin, p);
+ maybe_ephemeral_dist_children(old_ephemeral_pin != inode.export_ephemeral_distributed_pin); // update argument is true only if the flag changed
+ }
+ decode(inode.export_ephemeral_random_pin, p); // no re-evaluation is triggered here for the random pin value
+ }
}
DECODE_FINISH(p);
}
}
}
-void CInode::maybe_export_pin(bool update)
+void CInode::queue_export_pin(mds_rank_t target)
{
- if (!g_conf()->mds_bal_export_pin)
- return;
- if (!is_dir() || !is_normal())
- return;
-
- mds_rank_t export_pin = get_export_pin(false);
- if (export_pin == MDS_RANK_NONE && !update) {
- maybe_export_ephemeral_distributed_pin();
- return;
- }
-
if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
return;
bool queue = false;
- for (auto p = dirfrags.begin(); p != dirfrags.end(); p++) {
- CDir *dir = p->second;
+ for (auto& p : dirfrags) {
+ CDir *dir = p.second;
if (!dir->is_auth())
continue;
- if (export_pin != MDS_RANK_NONE) {
+ if (target != MDS_RANK_NONE) {
if (dir->is_subtree_root()) {
// set auxsubtree bit or export it
if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
- export_pin != dir->get_dir_auth().first)
+ target != dir->get_dir_auth().first)
queue = true;
} else {
// create aux subtree or export it
}
}
-void CInode::maybe_export_ephemeral_random_pin(bool update)
+void CInode::maybe_export_pin(bool update) // queue this directory for its own export pin; `update` forces processing even when no pin is set
{
- bool export_ephemeral_random_config = mdcache->get_export_ephemeral_random_config();
+ if (!g_conf()->mds_bal_export_pin)
+ return;
+ if (!is_dir() || !is_normal())
+ return;
- //If the config isn't set then return
- if (!export_ephemeral_random_config)
+ dout(15) << __func__ << " update=" << update << " " << *this << dendl;
+
+ mds_rank_t export_pin = get_export_pin(false, false); // inherit=false, ephemeral=false: only this inode's own export_pin is consulted
+ if (export_pin == MDS_RANK_NONE && !update) { // no pin set and no change signalled: nothing to do
return;
+ }
- //Check if it's already ephemerally pinned
- if (is_export_ephemeral_random_pinned && !update)
- return;
+ /* disable ephemeral pins */
+ set_ephemeral_dist(false);
+ set_ephemeral_rand(false);
+ queue_export_pin(export_pin); // export_pin may be MDS_RANK_NONE here when update is true
+}
- if (export_ephemeral_random_config) {
- double export_ephemeral_random_pin = get_export_ephemeral_random_pin(false);
- if ((update || export_ephemeral_random_pin >=
- ceph::util::generate_random_number(0.0, 1.0))
- && is_export_ephemeral_distributed_pinned == false) {
-
- dout(10) << "I'm here under ephemeral random because is_export_ephemeral_distributed is" << is_export_ephemeral_distributed_pinned << dendl;
-
- is_export_ephemeral_random_migrating = true;
-
- bool queue = false;
- for (auto& p : dirfrags) {
- CDir *dir = p.second;
- if (!dir->is_auth())
- continue;
- if (dir->is_subtree_root()) {
- // set auxsubtree bit or export it
- if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
- mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) != dir->get_dir_auth().first)
- queue = true;
- } else {
- // create aux subtree or export it
- queue = true;
- }
- if (queue) {
- if (mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) == mdcache->mds->get_nodeid())
- mdcache->ephemeral_pin(ephemeral_pin_inode);
- state_set(CInode::STATE_QUEUEDEXPORTPIN);
- mdcache->export_pin_queue.insert(this);
- break;
- }
- }
- return;
+void CInode::set_ephemeral_dist(bool yes) // set or clear STATE_DISTEPHEMERALPIN, keeping mdcache->dist_ephemeral_pins in step
+{
+ if (yes) {
+ if (!state_test(CInode::STATE_DISTEPHEMERALPIN)) { // already-set state is left untouched
+ state_set(CInode::STATE_DISTEPHEMERALPIN);
+ auto p = mdcache->dist_ephemeral_pins.insert(this);
+ ceph_assert(p.second); // state bit was clear, so the inode must not already be in the set
+ }
+ } else {
+ /* avoid std::set::erase if unnecessary */
+ if (state_test(CInode::STATE_DISTEPHEMERALPIN)) {
+ dout(10) << "clearing ephemeral distributed pin on " << *this << dendl;
+ state_clear(CInode::STATE_DISTEPHEMERALPIN);
+ auto count = mdcache->dist_ephemeral_pins.erase(this);
+ ceph_assert(count == 1); // state bit was set, so exactly one entry must have been removed
+ queue_export_pin(MDS_RANK_NONE); // queue with no target rank now that the pin is gone
}
}
}
-void CInode::maybe_export_ephemeral_distributed_pin(bool update)
+void CInode::maybe_ephemeral_dist(bool update) // set or clear this directory's distributed ephemeral pin according to its parent's flag
{
- bool export_ephemeral_distributed_config = mdcache->get_export_ephemeral_distributed_config();
-
- //If both the configs aren't set then return
- if (!export_ephemeral_distributed_config)
+ if (!mdcache->get_export_ephemeral_distributed_config()) {
+ dout(15) << __func__ << " config false: cannot ephemeral distributed pin " << *this << dendl;
+ set_ephemeral_dist(false); // drop any existing pin when the feature is disabled
+ return;
+ } else if (!is_dir() || !is_normal()) {
+ dout(15) << __func__ << " !dir or !normal: cannot ephemeral distributed pin " << *this << dendl;
+ set_ephemeral_dist(false);
+ return;
+ } else if (get_inode().nlink == 0) {
+ dout(15) << __func__ << " unlinked directory: cannot ephemeral distributed pin " << *this << dendl;
+ set_ephemeral_dist(false);
+ return;
+ } else if (!update && state_test(CInode::STATE_DISTEPHEMERALPIN)) { // already pinned and no change signalled
+ dout(15) << __func__ << " requeueing already pinned " << *this << dendl;
+ queue_export_pin(mdcache->hash_into_rank_bucket(ino())); // target rank is derived from the inode number
return;
+ }
- //Check if it's already ephemerally pinned
- if (is_export_ephemeral_distributed_pinned && !update)
- return;
+ dout(15) << __func__ << " update=" << update << " " << *this << dendl;
- if (export_ephemeral_distributed_config) {
- CDentry *pdn = get_parent_dn();
+ auto dir = get_parent_dir();
+ if (!dir) { // no parent dir to read the policy from; pin state is left as it is
+ return;
+ }
- if (!pdn) {
- return;
- }
+ bool pin = dir->get_inode()->get_inode().export_ephemeral_distributed_pin; // the policy flag is read from the parent directory's inode
+ if (pin) {
+ dout(10) << __func__ << " ephemeral distributed pinning " << *this << dendl;
+ set_ephemeral_dist(true);
+ queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
+ } else if (update) { // parent flag is off and a change was signalled: remove the pin
+ set_ephemeral_dist(false);
+ queue_export_pin(MDS_RANK_NONE);
+ }
+}
+
+void CInode::maybe_ephemeral_dist_children(bool update) // re-evaluate the distributed ephemeral pin of each child directory of this inode
+{
+ if (!mdcache->get_export_ephemeral_distributed_config()) {
+ dout(15) << __func__ << " config false: cannot ephemeral distributed pin " << *this << dendl;
+ return;
+ } else if (!is_dir() || !is_normal()) {
+ dout(15) << __func__ << " !dir or !normal: cannot ephemeral distributed pin " << *this << dendl;
+ return;
+ } else if (get_inode().nlink == 0) {
+ dout(15) << __func__ << " unlinked directory: cannot ephemeral distributed pin " << *this << dendl;
+ return;
+ }
- auto dir = pdn->get_dir();
+ bool pin = get_inode().export_ephemeral_distributed_pin; // this inode's own flag, which its children consult
+ /* FIXME: expensive to iterate children when not updating */
+ if (!pin && !update) { // flag unset and no change signalled: children are not visited
+ return;
+ }
- if (get_export_ephemeral_distributed_pin() && dir->get_num_head_items()) {
- for (auto& bound : bounds) {
- bound->maybe_export_ephemeral_distributed_pin();
+ dout(10) << __func__ << " maybe ephemerally pinning children of " << *this << dendl;
+ for (auto& p : dirfrags) {
+ auto& dir = p.second;
+ for (auto& q : *dir) { // every dentry in this dirfrag
+ auto& dn = q.second;
+ auto&& in = dn->get_linkage()->get_inode();
+ if (in && in->is_dir()) { // skip dentries without an inode and non-directories
+ in->maybe_ephemeral_dist(update);
}
}
+ }
+}
- else if (update || (dir->get_inode()->get_export_ephemeral_distributed_pin())) {
- is_export_ephemeral_distributed_migrating = true;
-
- bool queue = false;
- for (auto& p : dirfrags) {
- CDir *dir = p.second;
- if (!dir->is_auth())
- continue;
- if (dir->is_subtree_root()) {
- // set auxsubtree bit or export it
- if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
- mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) != dir->get_dir_auth().first)
- queue = true;
- } else {
- // create aux subtree or export it
- queue = true;
- }
- if (queue) {
- dout(10) << "max_mds is" << mdcache->mds->mdsmap->get_max_mds() << "and target mds is:" << mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) << dendl;
- if (mdcache->hash_into_rank_bucket(ino(), mdcache->mds->mdsmap->get_max_mds()) == mdcache->mds->get_nodeid()) {
- mdcache->ephemeral_pin(ephemeral_pin_inode);
- dout(10) << "Inside if inside the else" << dendl;
- }
- state_set(CInode::STATE_QUEUEDEXPORTPIN);
- mdcache->export_pin_queue.insert(this);
- break;
- }
- }
- return;
+void CInode::set_ephemeral_rand(bool yes) // set or clear STATE_RANDEPHEMERALPIN, keeping mdcache->rand_ephemeral_pins in step
+{
+ if (yes) {
+ if (!state_test(CInode::STATE_RANDEPHEMERALPIN)) { // already-set state is left untouched
+ state_set(CInode::STATE_RANDEPHEMERALPIN);
+ auto p = mdcache->rand_ephemeral_pins.insert(this);
+ ceph_assert(p.second); // state bit was clear, so the inode must not already be in the set
}
+ } else {
+ if (state_test(CInode::STATE_RANDEPHEMERALPIN)) { // only touch the set when the bit is actually set
+ dout(10) << "clearing ephemeral random pin on " << *this << dendl;
+ state_clear(CInode::STATE_RANDEPHEMERALPIN);
+ auto count = mdcache->rand_ephemeral_pins.erase(this);
+ ceph_assert(count == 1); // state bit was set, so exactly one entry must have been removed
+ queue_export_pin(MDS_RANK_NONE); // queue with no target rank now that the pin is gone
+ }
+ }
+}
+
+void CInode::maybe_ephemeral_rand() // requeue an existing random ephemeral pin, or decide by a random draw whether to create one
+{
+ if (!mdcache->get_export_ephemeral_random_config()) {
+ dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
+ set_ephemeral_rand(false); // drop any existing pin when the feature is disabled
+ return;
+ } else if (!is_dir() || !is_normal()) {
+ dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
+ set_ephemeral_rand(false);
+ return;
+ } else if (get_inode().nlink == 0) {
+ dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
+ set_ephemeral_rand(false);
+ return;
+ } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) { // already pinned: no new draw is made
+ dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
+ queue_export_pin(mdcache->hash_into_rank_bucket(ino())); // target rank is derived from the inode number
+ return;
+ }
+
+ double threshold = get_ephemeral_rand(); // called with the default inherit argument
+ double n = ceph::util::generate_random_number(0.0, 1.0); // random number requested with bounds 0.0 and 1.0
+
+ dout(15) << __func__ << " rand " << n << " <?= " << threshold
+ << " " << *this << dendl;
+
+ if (n <= threshold) { // pin when the draw does not exceed the threshold
+ dout(10) << __func__ << " randomly export pinning " << *this << dendl;
+ set_ephemeral_rand(true);
+ queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
}
}
-void CInode::set_export_ephemeral_random_pin(double probability)
+void CInode::setxattr_ephemeral_rand(double probability) // store the random ephemeral pin probability on the projected inode; requires a projected directory
{
ceph_assert(is_dir());
ceph_assert(is_projected());
get_projected_inode()->export_ephemeral_random_pin = probability;
}
-void CInode::set_export_ephemeral_distributed_pin(bool val)
+void CInode::setxattr_ephemeral_dist(bool val)
{
ceph_assert(is_dir());
ceph_assert(is_projected());
get_projected_inode()->export_pin = rank;
}
-mds_rank_t CInode::get_export_pin(bool inherit) const
+mds_rank_t CInode::get_export_pin(bool inherit, bool ephemeral) const // inherit: walk up through parents; ephemeral: also consider ephemeral pins
{
/* An inode that is export pinned may not necessarily be a subtree root, we
* need to traverse the parents. A base or system inode cannot be pinned.
* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
* have a parent yet.
*/
const CInode *in = this;
+ mds_rank_t etarget = MDS_RANK_NONE; // hash-derived rank of the first ephemerally pinned inode met on the walk; only ever returned from inside the loop
while (true) {
if (in->is_system())
break;
const CDentry *pdn = in->get_parent_dn();
if (!pdn)
break;
- // ignore export pin for unlinked directory
- if (in->get_inode().nlink == 0)
- break;
- if (in->get_inode().export_pin >= 0)
+ if (in->get_inode().nlink == 0) {
+ // ignore export pin for unlinked directory
+ return MDS_RANK_NONE;
+ } else if (etarget != MDS_RANK_NONE && (in->get_inode().export_ephemeral_random_pin > 0.0 || in->get_inode().export_ephemeral_distributed_pin)) { // a target is already recorded and this inode sets an ephemeral policy
+ return etarget;
+ } else if (in->get_inode().export_pin >= 0) { // an explicit export pin on this inode wins at this level
return in->get_inode().export_pin;
+ } else if (etarget == MDS_RANK_NONE && ephemeral && in->is_ephemerally_pinned()) {
+ /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
+ etarget = mdcache->hash_into_rank_bucket(in->ino());
+ if (!inherit) return etarget; // without inheritance there is no ancestor to consult
+ }
- if (!inherit)
+ if (!inherit) {
break;
+ }
in = pdn->get_dir()->inode;
}
return MDS_RANK_NONE;
}
-double CInode::get_export_ephemeral_random_pin(bool inherit) const
+double CInode::get_ephemeral_rand(bool inherit) const
{
- /* An inode that is export pinned may not necessarily be a subtree root, we
- * need to traverse the parents. A base or system inode cannot be pinned.
- * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
+ /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
* have a parent yet.
*/
const CInode *in = this;
// ignore export pin for unlinked directory
if (in->get_inode().nlink == 0)
break;
- if (in->get_inode().export_ephemeral_random_pin >= 0)
+
+ if (in->get_inode().export_ephemeral_random_pin > 0.0)
return in->get_inode().export_ephemeral_random_pin;
+ /* An export_pin overrides only if no closer parent (incl. this one) has a
+ * random pin set.
+ */
+ if (in->get_inode().export_pin >= 0)
+ return 0.0;
+
if (!inherit)
break;
in = pdn->get_dir()->inode;
}
- return 0;
-}
-
-bool CInode::get_export_ephemeral_distributed_pin() const
-{
- if (get_inode().export_ephemeral_distributed_pin)
- return get_inode().export_ephemeral_distributed_pin;
- else
- return false;
+ return 0.0;
}
bool CInode::is_exportable(mds_rank_t dest) const
static const int STATE_QUEUEDEXPORTPIN = (1<<17);
static const int STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table
static const int STATE_DELAYEDEXPORTPIN = (1<<19);
+ static const int STATE_DISTEPHEMERALPIN = (1<<20);
+ static const int STATE_RANDEPHEMERALPIN = (1<<21);
// orphan inode needs notification of releasing reference
static const int STATE_ORPHAN = STATE_NOTIFYREF;
(STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
static const int MASK_STATE_EXPORT_KEPT =
(STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|
- STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT|STATE_DELAYEDEXPORTPIN);
+ STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT|STATE_DELAYEDEXPORTPIN|
+ STATE_DISTEPHEMERALPIN|STATE_RANDEPHEMERALPIN);
// -- waiters --
static const uint64_t WAIT_DIR = (1<<0);
std::map<int, std::unique_ptr<BatchOp>> batch_ops;
- bool is_export_ephemeral_distributed_pinned = false;
- bool is_export_ephemeral_random_pinned = false;
-
- bool is_export_ephemeral_distributed_migrating = false;
- bool is_export_ephemeral_random_migrating = false;
-
- void finish_export_ephemeral_distributed_migration() {
- is_export_ephemeral_distributed_migrating = false;
- is_export_ephemeral_distributed_pinned = true;
- }
-
- void finish_export_ephemeral_random_migration() {
- is_export_ephemeral_random_migrating = false;
- is_export_ephemeral_random_pinned = true;
- }
-
std::string_view pin_name(int p) const override;
std::ostream& print_db_line_prefix(std::ostream& out) override;
return !projected_parent.empty();
}
- void maybe_export_pin(bool update=false);
- void maybe_export_ephemeral_random_pin(bool update=false);
- void maybe_export_ephemeral_distributed_pin(bool update=false);
+ mds_rank_t get_export_pin(bool inherit=true, bool ephemeral=true) const;
void set_export_pin(mds_rank_t rank);
- void set_export_ephemeral_random_pin(double probablitiy=0);
- void set_export_ephemeral_distributed_pin(bool val=false);
- mds_rank_t get_export_pin(bool inherit=true) const;
- double get_export_ephemeral_random_pin(bool inherit=true) const;
- bool get_export_ephemeral_distributed_pin() const;
+ void queue_export_pin(mds_rank_t target);
+ void maybe_export_pin(bool update=false);
+
+ void set_ephemeral_dist(bool yes);
+ void maybe_ephemeral_dist(bool update=false);
+ void maybe_ephemeral_dist_children(bool update=false);
+ void setxattr_ephemeral_dist(bool val=false);
+ bool is_ephemeral_dist() const {
+ return state_test(STATE_DISTEPHEMERALPIN);
+ }
+
+ double get_ephemeral_rand(bool inherit=true) const;
+ void set_ephemeral_rand(bool yes);
+ void maybe_ephemeral_rand();
+ void setxattr_ephemeral_rand(double prob=0.0);
+ bool is_ephemeral_rand() const {
+ return state_test(STATE_RANDEPHEMERALPIN);
+ }
+
+ bool is_ephemerally_pinned() const {
+ return state_test(STATE_DISTEPHEMERALPIN) ||
+ state_test(STATE_RANDEPHEMERALPIN);
+ }
bool is_exportable(mds_rank_t dest) const;
void print(std::ostream& out) override;
// list item node for when we have unpropagated rstat data
elist<CInode*>::item dirty_rstat_item;
- elist<CInode*>::item ephemeral_pin_inode;
-
mempool::mds_co::set<client_t> client_snap_caps;
mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush;
auto cur = it++;
CInode *in = *cur;
ceph_assert(in->is_dir());
- mds_rank_t export_pin = MDS_RANK_NONE;
- // Making sure the ephemeral pin does not override export pin
- if (in->get_export_pin(false) != MDS_RANK_NONE)
- export_pin = in->get_export_pin(false);
- else if (in->is_export_ephemeral_distributed_migrating || in->is_export_ephemeral_random_migrating) {
- export_pin = mds->mdcache->hash_into_rank_bucket(in->ino(), mds->mdsmap->get_max_mds());
- dout(10) << "Ephemeral export pin set on" << *in << dendl;
- }
+ mds_rank_t export_pin = in->get_export_pin(false);
if (export_pin >= mds->mdsmap->get_max_mds()) {
dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
*/
void tick();
+ void handle_export_pins(void);
+
void subtract_export(CDir *ex);
void add_import(CDir *im);
void adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc);
void prep_rebalance(int beat);
int mantle_prep_rebalance();
- void handle_export_pins(void);
-
mds_load_t get_load();
int localize_balancer();
void send_heartbeat();
filer(m->objecter, m->finisher),
stray_manager(m, purge_queue_),
recovery_queue(m),
- ephemeral_pins(member_offset(CInode, ephemeral_pin_inode)),
trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
{
migrator.reset(new Migrator(mds, this));
void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
{
+ dout(20) << "config changes: " << changed << dendl;
if (changed.count("mds_cache_memory_limit"))
cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
if (changed.count("mds_cache_reservation"))
cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
- if (changed.count("mds_export_ephemeral_distributed"))
+ if (changed.count("mds_export_ephemeral_distributed")) {
export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
- if (changed.count("mds_export_ephemeral_random"))
+ dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
+ /* copy to vector to avoid removals during iteration */
+ std::vector<CInode*> migrate;
+ migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
+ for (auto& in : migrate) {
+ in->maybe_ephemeral_dist();
+ }
+ mds->balancer->handle_export_pins();
+ }
+ if (changed.count("mds_export_ephemeral_random")) {
export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
+ dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
+ /* copy to vector to avoid removals during iteration */
+ std::vector<CInode*> migrate;
+ migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
+ for (auto& in : migrate) {
+ in->maybe_ephemeral_rand();
+ }
+ mds->balancer->handle_export_pins();
+ }
if (changed.count("mds_health_cache_threshold"))
cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
if (changed.count("mds_cache_mid"))
if (cache_toofull()) {
exceeded_size_limit = true;
}
+
+ in->maybe_ephemeral_dist(false);
}
void MDCache::remove_inode(CInode *o)
o->item_open_file.remove_myself();
- o->ephemeral_pin_inode.remove_myself();
-
if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
export_pin_queue.erase(o);
if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
export_pin_delayed_queue.erase(o);
+ o->set_ephemeral_dist(false);
+ o->set_ephemeral_rand(false);
+
// remove from inode map
if (o->last == CEPH_NOSNAP) {
inode_map.erase(o->ino());
/*
* hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
*/
-mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, mds_rank_t max_mds)
+mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino)
{
+ const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
uint64_t hash = rjhash64(ino);
int64_t b = -1, j = 0;
while (j < max_mds) {
trim(UINT64_MAX);
dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
+
+ {
+ dout(10) << "Migrating any ephemerally pinned inodes" << dendl;
+ /* copy to vector to avoid removals during iteration */
+ std::vector<CInode*> migrate;
+ migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
+ for (auto& in : migrate) {
+ in->maybe_ephemeral_rand();
+ }
+ migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
+ for (auto& in : migrate) {
+ in->maybe_ephemeral_dist();
+ }
+ mds->balancer->handle_export_pins();
+ }
+
// Export all subtrees to another active (usually rank 0) if not rank 0
int num_auth_subtree = 0;
- if (!subtrees.empty() &&
- mds->get_nodeid() != 0) {
- dout(7) << "looking for subtrees to export to mds0" << dendl;
+ if (!subtrees.empty() && mds->get_nodeid() != 0) {
+ dout(7) << "looking for subtrees to export" << dendl;
std::vector<CDir*> ls;
- for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
- it != subtrees.end();
- ++it) {
- CDir *dir = it->first;
- if (dir->get_inode()->is_mdsdir())
+ for (auto& [dir, bounds] : subtrees) {
+ dout(10) << " examining " << *dir << " bounds " << bounds << dendl;
+ if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
continue;
- if (dir->is_auth()) {
- num_auth_subtree++;
- if (dir->is_frozen() ||
- dir->is_freezing() ||
- dir->is_ambiguous_dir_auth() ||
- dir->state_test(CDir::STATE_EXPORTING))
- continue;
- ls.push_back(dir);
+ num_auth_subtree++;
+ if (dir->is_frozen() ||
+ dir->is_freezing() ||
+ dir->is_ambiguous_dir_auth() ||
+ dir->state_test(CDir::STATE_EXPORTING) ||
+ dir->get_inode()->is_ephemerally_pinned()) {
+ continue;
}
+ ls.push_back(dir);
}
migrator->clear_export_queue();
- if (export_ephemeral_random_config ||
- export_ephemeral_distributed_config) {
- dout(10) << "Migrating ephemerally pinned inodes due to shutdown" << dendl;
- elist<CInode*>::iterator it = ephemeral_pins.begin(member_offset(CInode, ephemeral_pin_inode));
- while (!it.end()) {
- if ((*it) == NULL || !((*it)->is_auth()))
- dout(10) << "Inode is not auth to this rank" << dendl;
- else {
- dout(10) << "adding inode to export queue" << dendl;
- (*it)->maybe_export_ephemeral_distributed_pin(true);
- (*it)->maybe_export_ephemeral_random_pin(true);
- }
- ++it;
- }
- }
-
for (const auto& dir : ls) {
mds_rank_t dest = dir->get_inode()->authority().first;
if (dest > 0 && !mds->mdsmap->is_active(dest))
for (auto it = q.begin(); it != q.end(); ) {
auto *in = *it;
mds_rank_t export_pin = in->get_export_pin(false);
+ if (in->is_ephemerally_pinned()) {
+ dout(10) << "ephemeral export pin to " << export_pin << " for " << *in << dendl;
+ }
dout(10) << " delayed export_pin=" << export_pin << " on " << *in
<< " max_mds=" << mdsmap.get_max_mds() << dendl;
if (export_pin >= mdsmap.get_max_mds()) {
in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
it = q.erase(it);
- in->maybe_export_pin();
+ in->queue_export_pin(export_pin);
}
- /* Handle consistent hash ring during cluster resizes */
if (mdsmap.get_max_mds() != oldmap.get_max_mds()) {
- dout(10) << "Checking ephemerally pinned directories for re-export due to max_mds change." << dendl;
- auto it = ephemeral_pins.begin(member_offset(CInode, ephemeral_pin_inode));
- while (!it.end()) {
- auto in = *it;
- ++it;
- // Migrate if the inodes hash elsewhere
- if (hash_into_rank_bucket(in->ino(), mdsmap.get_max_mds()) != mds->get_nodeid()) {
- if (in == NULL || !in->is_auth()) {
- dout(10) << "Inode is not auth to this rank" << dendl;
- // ++it; ??? - batrick
- }
- } else {
- dout(10) << "adding inode to export queue" << dendl;
- in->maybe_export_ephemeral_distributed_pin(true);
- in->maybe_export_ephemeral_random_pin(true);
- in->ephemeral_pin_inode.remove_myself();
- }
+ dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl;
+ /* copy to vector to avoid removals during iteration */
+ std::vector<CInode*> migrate;
+ migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
+ for (auto& in : migrate) {
+ in->maybe_ephemeral_rand();
+ }
+ migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
+ for (auto& in : migrate) {
+ in->maybe_ephemeral_dist();
}
}
}
stray_manager.eval_stray(dn);
}
- mds_rank_t hash_into_rank_bucket(inodeno_t ino, mds_rank_t max_mds);
+ mds_rank_t hash_into_rank_bucket(inodeno_t ino);
void maybe_eval_stray(CInode *in, bool delay=false);
void clear_dirty_bits_for_stray(CInode* diri);
/* Because exports may fail, this set lets us keep track of inodes that need exporting. */
std::set<CInode *> export_pin_queue;
std::set<CInode *> export_pin_delayed_queue;
+ std::set<CInode *> rand_ephemeral_pins;
+ std::set<CInode *> dist_ephemeral_pins;
OpenFileTable open_file_table;
map<dirfrag_t,fragment_info_t> fragments;
- elist<CInode*> ephemeral_pins;
-
DecayCounter trim_counter;
std::thread upkeeper;
f->dump_bool("is_auth", dir->is_auth());
f->dump_int("auth_first", dir->get_dir_auth().first);
f->dump_int("auth_second", dir->get_dir_auth().second);
- f->dump_int("export_pin", dir->inode->get_export_pin());
+ f->dump_int("export_pin", dir->inode->get_export_pin(false, false));
+ f->dump_bool("distributed_ephemeral_pin", dir->inode->is_ephemeral_dist());
+ f->dump_bool("random_ephemeral_pin", dir->inode->is_ephemeral_rand());
+ f->dump_int("ephemeral_pin", mdcache->hash_into_rank_bucket(dir->inode->ino()));
f->open_object_section("dir");
dir->dump(f);
f->close_section();
"mds_dump_cache_threshold_file",
"mds_dump_cache_threshold_formatter",
"mds_enable_op_tracker",
- "mds_export_ephemeral_random"
- "mds_export_ephemeral_distributed"
+ "mds_export_ephemeral_random",
+ "mds_export_ephemeral_distributed",
"mds_health_cache_threshold",
"mds_inject_migrator_session_race",
"mds_log_pause",
finisher->queue(new LambdaContext([this, changed](int) {
std::scoped_lock lock(mds_lock);
+ dout(10) << "flushing conf change to components: " << changed << dendl;
+
if (changed.count("mds_log_pause") && !g_conf()->mds_log_pause) {
mdlog->kick_submitter();
}
mut->cleanup();
}
- if (dir->get_inode()->is_export_ephemeral_distributed_migrating)
- dir->get_inode()->finish_export_ephemeral_distributed_migration();
- else if (dir->get_inode()->is_export_ephemeral_random_migrating)
- dir->get_inode()->finish_export_ephemeral_random_migration();
-
if (parent)
child_export_finish(parent, true);
MutationRef mut = it->second.mut;
import_state.erase(it);
- // start the journal entry
- EImportFinish *le = new EImportFinish(dir, true);
- mds->mdlog->start_entry(le);
-
- CInode *in = dir->get_inode();
-
- CDentry *pdn = in->get_parent_dn();
-
- if (in->get_export_ephemeral_random_pin(false)) { // Lazy checks. FIXME
- le->metablob.add_primary_dentry(pdn, in, false, false, false, false,
- false, true);
- in->is_export_ephemeral_random_pinned = true;
- cache->ephemeral_pins.push_back(&in->ephemeral_pin_inode);
- } else if (pdn->get_dir()->get_inode()
- && pdn->get_dir()->get_inode()->get_export_ephemeral_distributed_pin()) {
- le->metablob.add_primary_dentry(pdn, in, false, false, false, false,
- true, false);
- in->is_export_ephemeral_distributed_pinned = true;
- cache->ephemeral_pins.push_back(&in->ephemeral_pin_inode);
- }
-
- // log it
- mds->mdlog->submit_entry(le);
+ mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
// process delayed expires
cache->process_delayed_expire(dir);
dir->verify_fragstat();
#endif
+ dir->inode->maybe_ephemeral_dist();
dir->inode->maybe_export_pin();
dout(7) << " done " << *dir << dendl;
return;
auto &pi = cur->project_inode();
- cur->set_export_ephemeral_random_pin(val);
+ cur->setxattr_ephemeral_rand(val);
pip = &pi.inode;
} else if (name == "ceph.dir.pin.distributed"sv) {
if (!cur->is_dir() || cur->is_root()) {
return;
auto &pi = cur->project_inode();
- cur->set_export_ephemeral_distributed_pin(val);
+ cur->setxattr_ephemeral_dist(val);
pip = &pi.inode;
- dout(10) << "Here is the distrib pin value" << pip->export_ephemeral_distributed_pin << dendl;
} else {
dout(10) << " unknown vxattr " << name << dendl;
respond_to_request(mdr, -EINVAL);
MDRequestRef null_ref;
get_mds()->mdcache->send_dentry_link(dn, null_ref);
- if (newi->inode.is_file())
+ if (newi->inode.is_file()) {
get_mds()->locker->share_inode_max_size(newi);
+ } else if (newi->inode.is_dir()) {
+ // We do this now so that the linkages on the new directory are stable.
+ newi->maybe_ephemeral_dist();
+ newi->maybe_ephemeral_rand();
+ }
// hit pop
get_mds()->balancer->hit_inode(newi, META_POP_IWR);
static const int STATE_DIRTYPARENT = (1<<1);
static const int STATE_DIRTYPOOL = (1<<2);
static const int STATE_NEED_SNAPFLUSH = (1<<3);
- static const int STATE_EPHEMERAL_DISTRIBUTED = (1<<4);
- static const int STATE_EPHEMERAL_RANDOM = (1<<5);
+ static const int STATE_EPHEMERAL_RANDOM = (1<<4);
std::string dn; // dentry
snapid_t dnfirst, dnlast;
version_t dnv{0};
bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); }
bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); }
bool need_snapflush() const { return (state & STATE_NEED_SNAPFLUSH); }
- bool is_export_ephemeral_distributed() const { return (state & STATE_EPHEMERAL_DISTRIBUTED); }
bool is_export_ephemeral_random() const { return (state & STATE_EPHEMERAL_RANDOM); }
void print(ostream& out) const {
// return remote pointer to to-be-journaled inode
void add_primary_dentry(CDentry *dn, CInode *in, bool dirty,
bool dirty_parent=false, bool dirty_pool=false,
- bool need_snapflush=false, bool export_ephemeral_distributed=false,
- bool export_ephemeral_random=false) {
+ bool need_snapflush=false) { // builds the fullbit state mask from the flags, then forwards to the dirlump overload
__u8 state = 0;
if (dirty) state |= fullbit::STATE_DIRTY;
if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT;
if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL;
if (need_snapflush) state |= fullbit::STATE_NEED_SNAPFLUSH;
- if (export_ephemeral_distributed) state |= fullbit::STATE_EPHEMERAL_DISTRIBUTED;
- if (export_ephemeral_random) state |= fullbit::STATE_EPHEMERAL_RANDOM;
add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state);
}
void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) {
if (!in)
in = dn->get_projected_linkage()->get_inode();
+ if (in->is_ephemeral_rand()) {
+ state |= fullbit::STATE_EPHEMERAL_RANDOM;
+ }
+
// make note of where this inode was last journaled
in->last_journaled = event_seq;
//cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
in->inode = inode;
in->xattrs = xattrs;
if (in->inode.is_dir()) {
- in->is_export_ephemeral_distributed_pinned = is_export_ephemeral_distributed();
- in->is_export_ephemeral_random_pinned = is_export_ephemeral_random();
- dout(10) << "I'm in update_inode inside journal.cc and is_export_ephemeral_distrib for inode " << *in << "is" << in->is_export_ephemeral_distributed_pinned << dendl;
- }
- in->maybe_export_pin();
- if (in->inode.is_dir()) {
+ if (is_export_ephemeral_random()) {
+ dout(15) << "random ephemeral pin on " << *in << dendl;
+ in->set_ephemeral_rand(true);
+ in->maybe_ephemeral_rand();
+ }
+ in->maybe_ephemeral_dist();
+ in->maybe_export_pin();
if (!(in->dirfragtree == dirfragtree)) {
dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
<< dirfragtree << " on " << *in << dendl;
f->dump_unsigned("change_attr", change_attr);
f->dump_int("export_pin", export_pin);
- f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin);
- f->dump_int("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin);
+ f->dump_float("export_ephemeral_random_pin", export_ephemeral_random_pin); // the value is a probability stored as a double; dump_int would truncate it
+ f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin);
f->open_array_section("client_ranges");
for (const auto &p : client_ranges) {