.set_description("allow ephemeral distributed pinning of the loaded subtrees")
.set_long_description("pin the immediate child directories of the loaded directory inode based on the consistent hash of the child's inode number. "),
+ Option("mds_export_ephemeral_distributed_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2.0)
+ .set_min_max(1.0, 100.0)
+ .set_flag(Option::FLAG_RUNTIME)
+ .set_description("multiple of max_mds for splitting and distributing directory"),
+
Option("mds_bal_sample_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(3.0)
.set_description("interval in seconds between balancer ticks"),
if (in->get_inode()->is_dirty_rstat())
in->mark_dirty_rstat();
- in->maybe_ephemeral_rand(true, rand_threshold);
+ in->maybe_ephemeral_rand(rand_threshold);
//in->hack_accessed = false;
//in->hack_load_stamp = ceph_clock_now();
//num_new_inodes_loaded++;
// IMPORT/EXPORT
+/* Resolve the export pin that applies to this dirfrag.
+ *
+ * The inode is asked for its pin first. If the answer is one of the
+ * ephemeral sentinels it is converted into a rank through
+ * MDCache::hash_into_rank_bucket(): a distributed pin hashes the inode
+ * number together with this fragment, a random pin hashes the inode
+ * number alone. Every other value is returned unchanged.
+ */
+mds_rank_t CDir::get_export_pin(bool inherit) const
+{
+  const mds_rank_t pin = inode->get_export_pin(inherit);
+  switch (pin) {
+  case MDS_RANK_EPHEMERAL_DIST:
+    return mdcache->hash_into_rank_bucket(ino(), get_frag());
+  case MDS_RANK_EPHEMERAL_RAND:
+    return mdcache->hash_into_rank_bucket(ino());
+  default:
+    return pin;
+  }
+}
+
+/* Tell whether this dirfrag may be exported to rank `dest`.
+ *
+ * A non-negative result from get_export_pin() restricts the dirfrag to
+ * exactly that rank; a negative result places no restriction.
+ */
+bool CDir::is_exportable(mds_rank_t dest) const
+{
+  const mds_rank_t pin = get_export_pin();
+  return pin < 0 || pin == dest;
+}
+
void CDir::encode_export(bufferlist& bl)
{
ENCODE_START(1, 1, bl);
return effective_size > fast_limit;
}
+/* Decide whether this dirfrag is a candidate for merging.
+ *
+ * Returns false for a dirfrag whose frag equals the default frag_t(),
+ * and for a dirfrag of an ephemeral-distributed inode whose bit count is
+ * not above the cache's minimum fragment bits (when that minimum is
+ * non-zero). Otherwise the fragment size is compared against the
+ * mds_bal_merge_size setting.
+ */
+bool CDir::should_merge() const
+{
+  const bool is_default_frag = (get_frag() == frag_t());
+  if (is_default_frag)
+    return false;
+
+  if (inode->is_ephemeral_dist()) {
+    const unsigned floor_bits = mdcache->get_ephemeral_dist_frag_bits();
+    // a floor of zero means there is no minimum to protect
+    const bool at_or_below_floor =
+      floor_bits > 0 && get_frag().bits() < floor_bits + 1;
+    if (at_or_below_floor)
+      return false;
+  }
+
+  const int merge_limit = g_conf()->mds_bal_merge_size;
+  return (int)get_frag_size() < merge_limit;
+}
+
MEMPOOL_DEFINE_OBJECT_FACTORY(CDir, co_dir, mds_co);
MEMPOOL_DEFINE_OBJECT_FACTORY(CDir::scrub_info_t, scrub_info_t, mds_co)
(int)get_frag_size() > g_conf()->mds_bal_split_size;
}
bool should_split_fast() const;
- bool should_merge() const {
- return get_frag() != frag_t() &&
- (int)get_frag_size() < g_conf()->mds_bal_merge_size;
- }
+ bool should_merge() const;
mds_authority_t authority() const override;
mds_authority_t get_dir_auth() const { return dir_auth; }
void finish_waiting(uint64_t mask, int result = 0); // ditto
// -- import/export --
+ mds_rank_t get_export_pin(bool inherit=true) const;
+ bool is_exportable(mds_rank_t dest) const;
+
void encode_export(ceph::buffer::list& bl);
void finish_export();
void abort_export() {
out << " snaprealm=" << in.snaprealm;
if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
- if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover";
- if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering";
- if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent";
- if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " missingobjs";
+ if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER";
+ if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING";
+ if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT";
+ if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS";
+ if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN";
+ if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN";
if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
if (in.is_frozen_inode()) out << " FROZEN";
if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
if (mut)
mut->remove_projected_node(this);
- bool pool_update = get_inode()->layout.pool_id != front.inode->layout.pool_id;
- bool pin_update = get_inode()->export_pin != front.inode->export_pin;
- bool dist_update = get_inode()->export_ephemeral_distributed_pin !=
- front.inode->export_ephemeral_distributed_pin;
+ bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id;
+ bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) ||
+ (get_inode()->export_ephemeral_distributed_pin !=
+ front.inode->export_ephemeral_distributed_pin);
reset_inode(std::move(front.inode));
if (front.xattrs != get_xattrs())
mark_dirty(ls);
if (get_inode()->is_backtrace_updated())
- mark_dirty_parent(ls, pool_update);
+ mark_dirty_parent(ls, pool_updated);
- if (pin_update)
+ if (pin_updated)
maybe_export_pin(true);
- if (dist_update)
- maybe_ephemeral_dist_children(true);
}
sr_t *CInode::prepare_new_srnode(snapid_t snapid)
dir->get(CDir::PIN_STICKY);
}
- maybe_pin();
+ maybe_export_pin();
return dir;
}
}
}
DECODE_FINISH(p);
- mds_rank_t old_export_pin = get_inode()->export_pin;
- bool old_ephemeral_pin = get_inode()->export_ephemeral_distributed_pin;
+
+ bool pin_updated = (get_inode()->export_pin != _inode->export_pin) ||
+ (get_inode()->export_ephemeral_distributed_pin !=
+ _inode->export_ephemeral_distributed_pin);
reset_inode(std::move(_inode));
- maybe_export_pin(old_export_pin != get_inode()->export_pin);
- maybe_ephemeral_dist_children(old_ephemeral_pin != get_inode()->export_ephemeral_distributed_pin);
+ maybe_export_pin(pin_updated);
}
void CInode::encode_lock_state(int type, bufferlist& bl)
decode(s, p);
s &= MASK_STATE_EXPORTED;
- if (s & STATE_RANDEPHEMERALPIN) {
- set_ephemeral_rand(true);
- }
- if (s & STATE_DISTEPHEMERALPIN) {
- set_ephemeral_dist(true);
- }
-
+ set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN),
+ (s & STATE_RANDEPHEMERALPIN));
state_set(STATE_AUTH | s);
}
}
}
-void CInode::queue_export_pin(mds_rank_t target)
+void CInode::queue_export_pin(mds_rank_t export_pin)
{
if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
return;
+ mds_rank_t target;
+ if (export_pin >= 0)
+ target = export_pin;
+ else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
+ target = mdcache->hash_into_rank_bucket(ino());
+ else
+ target = MDS_RANK_NONE;
+
+ unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
bool queue = false;
for (auto& p : dirfrags) {
CDir *dir = p.second;
if (!dir->is_auth())
continue;
+
+ if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+ if (dir->get_frag().bits() < min_frag_bits) {
+ // needs split
+ queue = true;
+ break;
+ }
+ target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());
+ }
+
if (target != MDS_RANK_NONE) {
if (dir->is_subtree_root()) {
// set auxsubtree bit or export it
// clear aux subtrees ?
queue = dir->state_test(CDir::STATE_AUXSUBTREE);
}
- if (queue) {
- state_set(CInode::STATE_QUEUEDEXPORTPIN);
- mdcache->export_pin_queue.insert(this);
+
+ if (queue)
break;
- }
+ }
+ if (queue) {
+ state_set(CInode::STATE_QUEUEDEXPORTPIN);
+ mdcache->export_pin_queue.insert(this);
}
}
dout(15) << __func__ << " update=" << update << " " << *this << dendl;
- mds_rank_t export_pin = get_export_pin(false, false);
- if (export_pin == MDS_RANK_NONE && !update) {
+ mds_rank_t export_pin = get_export_pin(false);
+ if (export_pin == MDS_RANK_NONE && !update)
return;
- }
- /* disable ephemeral pins */
- set_ephemeral_dist(false);
- set_ephemeral_rand(false);
+ check_pin_policy(export_pin);
queue_export_pin(export_pin);
}
-void CInode::set_ephemeral_dist(bool yes)
-{
- if (yes) {
- if (!state_test(CInode::STATE_DISTEPHEMERALPIN)) {
- state_set(CInode::STATE_DISTEPHEMERALPIN);
- auto p = mdcache->dist_ephemeral_pins.insert(this);
- ceph_assert(p.second);
- }
- } else {
- /* avoid std::set::erase if unnecessary */
- if (state_test(CInode::STATE_DISTEPHEMERALPIN)) {
- dout(10) << "clearing ephemeral distributed pin on " << *this << dendl;
- state_clear(CInode::STATE_DISTEPHEMERALPIN);
- auto count = mdcache->dist_ephemeral_pins.erase(this);
- ceph_assert(count == 1);
- queue_export_pin(MDS_RANK_NONE);
- }
- }
-}
-
-void CInode::maybe_ephemeral_dist(bool update)
+void CInode::set_ephemeral_pin(bool dist, bool rand)
{
- if (!mdcache->get_export_ephemeral_distributed_config()) {
- dout(15) << __func__ << " config false: cannot ephemeral distributed pin " << *this << dendl;
- set_ephemeral_dist(false);
- return;
- } else if (!is_dir() || !is_normal()) {
- dout(15) << __func__ << " !dir or !normal: cannot ephemeral distributed pin " << *this << dendl;
- set_ephemeral_dist(false);
- return;
- } else if (get_inode()->nlink == 0) {
- dout(15) << __func__ << " unlinked directory: cannot ephemeral distributed pin " << *this << dendl;
- set_ephemeral_dist(false);
- return;
- } else if (!update && state_test(CInode::STATE_DISTEPHEMERALPIN)) {
- dout(15) << __func__ << " requeueing already pinned " << *this << dendl;
- queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
- return;
- }
-
- dout(15) << __func__ << " update=" << update << " " << *this << dendl;
-
- auto dir = get_parent_dir();
- if (!dir) {
- return;
- }
-
- bool pin = dir->get_inode()->get_inode()->export_ephemeral_distributed_pin;
- if (pin) {
- dout(10) << __func__ << " ephemeral distributed pinning " << *this << dendl;
- set_ephemeral_dist(true);
- queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
- } else if (update) {
- set_ephemeral_dist(false);
- queue_export_pin(MDS_RANK_NONE);
- }
-}
-
-void CInode::maybe_ephemeral_dist_children(bool update)
-{
- if (!mdcache->get_export_ephemeral_distributed_config()) {
- dout(15) << __func__ << " config false: cannot ephemeral distributed pin " << *this << dendl;
- return;
- } else if (!is_dir() || !is_normal()) {
- dout(15) << __func__ << " !dir or !normal: cannot ephemeral distributed pin " << *this << dendl;
- return;
- } else if (get_inode()->nlink == 0) {
- dout(15) << __func__ << " unlinked directory: cannot ephemeral distributed pin " << *this << dendl;
- return;
- }
-
- bool pin = get_inode()->export_ephemeral_distributed_pin;
- /* FIXME: expensive to iterate children when not updating */
- if (!pin && !update) {
+ unsigned state = 0;
+ if (dist)
+ state |= STATE_DISTEPHEMERALPIN;
+ if (rand)
+ state |= STATE_RANDEPHEMERALPIN;
+ if (!state)
return;
- }
- dout(10) << __func__ << " maybe ephemerally pinning children of " << *this << dendl;
- for (auto& p : dirfrags) {
- auto& dir = p.second;
- for (auto& q : *dir) {
- auto& dn = q.second;
- auto&& in = dn->get_linkage()->get_inode();
- if (in && in->is_dir()) {
- in->maybe_ephemeral_dist(update);
- }
+ if (state_test(state) != state) {
+ dout(10) << "set ephemeral (" << (dist ? "dist" : "")
+ << (rand ? " rand" : "") << ") pin on " << *this << dendl;
+ if (!is_ephemerally_pinned()) {
+ auto p = mdcache->export_ephemeral_pins.insert(this);
+ ceph_assert(p.second);
}
+ state_set(state);
}
}
-void CInode::set_ephemeral_rand(bool yes)
+void CInode::clear_ephemeral_pin(bool dist, bool rand)
{
- if (yes) {
- if (!state_test(CInode::STATE_RANDEPHEMERALPIN)) {
- state_set(CInode::STATE_RANDEPHEMERALPIN);
- auto p = mdcache->rand_ephemeral_pins.insert(this);
- ceph_assert(p.second);
- }
- } else {
- if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
- dout(10) << "clearing ephemeral random pin on " << *this << dendl;
- state_clear(CInode::STATE_RANDEPHEMERALPIN);
- auto count = mdcache->rand_ephemeral_pins.erase(this);
+ unsigned state = 0;
+ if (dist)
+ state |= STATE_DISTEPHEMERALPIN;
+ if (rand)
+ state |= STATE_RANDEPHEMERALPIN;
+
+ if (state_test(state)) {
+ dout(10) << "clear ephemeral (" << (dist ? "dist" : "")
+ << (rand ? " rand" : "") << ") pin on " << *this << dendl;
+ state_clear(state);
+ if (!is_ephemerally_pinned()) {
+ auto count = mdcache->export_ephemeral_pins.erase(this);
ceph_assert(count == 1);
- queue_export_pin(MDS_RANK_NONE);
}
}
}
-void CInode::maybe_ephemeral_rand(bool fresh, double threshold)
+void CInode::maybe_ephemeral_rand(double threshold)
{
if (!mdcache->get_export_ephemeral_random_config()) {
dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
- set_ephemeral_rand(false);
+ clear_ephemeral_pin(false, true);
return;
} else if (!is_dir() || !is_normal()) {
dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
- set_ephemeral_rand(false);
+ clear_ephemeral_pin(false, true);
return;
} else if (get_inode()->nlink == 0) {
dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
- set_ephemeral_rand(false);
+ clear_ephemeral_pin(false, true);
return;
} else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
- queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
- return;
- } else if (!fresh) {
+ queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
return;
}
if (n <= threshold) {
dout(10) << __func__ << " randomly export pinning " << *this << dendl;
- set_ephemeral_rand(true);
- queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
+ set_ephemeral_pin(false, true);
+ queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
}
}
maybe_export_pin(true);
}
-void CInode::check_pin_policy()
+mds_rank_t CInode::get_export_pin(bool inherit) const
{
- const CInode *in = this;
- mds_rank_t etarget = MDS_RANK_NONE;
- while (true) {
- if (in->is_system())
- break;
- const CDentry *pdn = in->get_parent_dn();
- if (!pdn)
- break;
- if (in->get_inode()->nlink == 0) {
- // ignore export pin for unlinked directory
- return;
- } else if (etarget != MDS_RANK_NONE && in->has_ephemeral_policy()) {
- return;
- } else if (in->get_inode()->export_pin >= 0) {
- /* clear any epin policy */
- set_ephemeral_dist(false);
- set_ephemeral_rand(false);
- return;
- } else if (etarget == MDS_RANK_NONE && in->is_ephemerally_pinned()) {
- /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
- etarget = mdcache->hash_into_rank_bucket(in->ino());
- }
- in = pdn->get_dir()->inode;
- }
-}
+ if (!g_conf()->mds_bal_export_pin)
+ return MDS_RANK_NONE;
-mds_rank_t CInode::get_export_pin(bool inherit, bool ephemeral) const
-{
/* An inode that is export pinned may not necessarily be a subtree root, we
* need to traverse the parents. A base or system inode cannot be pinned.
* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
* have a parent yet.
*/
+ mds_rank_t r_target = MDS_RANK_NONE;
const CInode *in = this;
- mds_rank_t etarget = MDS_RANK_NONE;
+ const CDir *dir = nullptr;
while (true) {
if (in->is_system())
break;
break;
if (in->get_inode()->nlink == 0) {
// ignore export pin for unlinked directory
- return MDS_RANK_NONE;
- } else if (etarget != MDS_RANK_NONE && in->has_ephemeral_policy()) {
- return etarget;
- } else if (in->get_inode()->export_pin >= 0) {
+ break;
+ }
+
+ if (in->get_inode()->export_pin >= 0) {
return in->get_inode()->export_pin;
- } else if (etarget == MDS_RANK_NONE && ephemeral && in->is_ephemerally_pinned()) {
+ } else if (in->get_inode()->export_ephemeral_distributed_pin &&
+ mdcache->get_export_ephemeral_distributed_config()) {
+ if (in != this)
+ return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
+ return MDS_RANK_EPHEMERAL_DIST;
+ } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) {
+ return r_target;
+ } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() &&
+ mdcache->get_export_ephemeral_random_config()) {
/* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
- etarget = mdcache->hash_into_rank_bucket(in->ino());
- if (!inherit) return etarget;
+ if (!inherit)
+ return MDS_RANK_EPHEMERAL_RAND;
+ if (in == this)
+ r_target = MDS_RANK_EPHEMERAL_RAND;
+ else
+ r_target = mdcache->hash_into_rank_bucket(in->ino());
}
- if (!inherit) {
+ if (!inherit)
break;
- }
- in = pdn->get_dir()->inode;
+ dir = pdn->get_dir();
+ in = dir->inode;
}
return MDS_RANK_NONE;
}
-double CInode::get_ephemeral_rand(bool inherit) const
+/* Bring this inode's ephemeral pin state in line with `export_pin`.
+ *
+ * MDS_RANK_EPHEMERAL_DIST / MDS_RANK_EPHEMERAL_RAND set the matching
+ * ephemeral state and clear the other one. For any other value, an inode
+ * that is currently ephemerally pinned has both states cleared, and an
+ * export pin of MDS_RANK_NONE is queued when `export_pin` differs from the
+ * inode's own export_pin field.
+ */
+void CInode::check_pin_policy(mds_rank_t export_pin)
+{
+  switch (export_pin) {
+  case MDS_RANK_EPHEMERAL_DIST:
+    // set before clear, same order as for the random case below
+    set_ephemeral_pin(true, false);
+    clear_ephemeral_pin(false, true);
+    return;
+  case MDS_RANK_EPHEMERAL_RAND:
+    set_ephemeral_pin(false, true);
+    clear_ephemeral_pin(true, false);
+    return;
+  default:
+    break;
+  }
+
+  // export_pin >= 0 || export_pin == MDS_RANK_NONE
+  if (!is_ephemerally_pinned())
+    return;
+
+  clear_ephemeral_pin(true, true);
+  const bool inherited = (export_pin != get_inode()->export_pin);
+  if (inherited) // inherited export_pin
+    queue_export_pin(MDS_RANK_NONE);
+}
+
+double CInode::get_ephemeral_rand() const
{
/* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
* have a parent yet.
/* An export_pin overrides only if no closer parent (incl. this one) has a
* random pin set.
*/
- if (in->get_inode()->export_pin >= 0)
+ if (in->get_inode()->export_pin >= 0 ||
+ in->get_inode()->export_ephemeral_distributed_pin)
return 0.0;
- if (!inherit)
- break;
in = pdn->get_dir()->inode;
}
return 0.0;
}
-bool CInode::is_exportable(mds_rank_t dest) const
-{
- mds_rank_t pin = get_export_pin();
- if (pin == dest) {
- return true;
- } else if (pin >= 0) {
- return false;
- } else {
- return true;
- }
-}
-
void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const
{
for (const auto &p : dirfrags) {
return !projected_parent.empty();
}
- mds_rank_t get_export_pin(bool inherit=true, bool ephemeral=true) const;
+ mds_rank_t get_export_pin(bool inherit=true) const;
+ void check_pin_policy(mds_rank_t target);
void set_export_pin(mds_rank_t rank);
void queue_export_pin(mds_rank_t target);
void maybe_export_pin(bool update=false);
- void check_pin_policy();
+ void set_ephemeral_pin(bool dist, bool rand);
+ void clear_ephemeral_pin(bool dist, bool rand);
- void set_ephemeral_dist(bool yes);
- void maybe_ephemeral_dist(bool update=false);
- void maybe_ephemeral_dist_children(bool update=false);
void setxattr_ephemeral_dist(bool val=false);
bool is_ephemeral_dist() const {
return state_test(STATE_DISTEPHEMERALPIN);
}
- double get_ephemeral_rand(bool inherit=true) const;
- void set_ephemeral_rand(bool yes);
- void maybe_ephemeral_rand(bool fresh=false, double threshold=-1.0);
+ double get_ephemeral_rand() const;
+ void maybe_ephemeral_rand(double threshold=-1.0);
void setxattr_ephemeral_rand(double prob=0.0);
bool is_ephemeral_rand() const {
return state_test(STATE_RANDEPHEMERALPIN);
return state_test(STATE_DISTEPHEMERALPIN) ||
state_test(STATE_RANDEPHEMERALPIN);
}
- bool is_exportable(mds_rank_t dest) const;
-
- void maybe_pin() {
- maybe_export_pin();
- maybe_ephemeral_dist();
- maybe_ephemeral_rand();
- }
void print(std::ostream& out) override;
void dump(ceph::Formatter *f, int flags = DUMP_DEFAULT) const;
void MDBalancer::handle_export_pins(void)
{
- auto &q = mds->mdcache->export_pin_queue;
+ const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
+ auto mdcache = mds->mdcache;
+
+ auto &q = mdcache->export_pin_queue;
auto it = q.begin();
dout(20) << "export_pin_queue size=" << q.size() << dendl;
while (it != q.end()) {
CInode *in = *cur;
ceph_assert(in->is_dir());
- in->check_pin_policy();
mds_rank_t export_pin = in->get_export_pin(false);
- if (export_pin >= mds->mdsmap->get_max_mds()) {
+ in->check_pin_policy(export_pin);
+
+ if (export_pin >= max_mds) {
dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
q.erase(cur);
in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
- mds->mdcache->export_pin_delayed_queue.insert(in);
+ mdcache->export_pin_delayed_queue.insert(in);
continue;
- } else {
- dout(20) << " executing export_pin=" << export_pin << " on " << *in << dendl;
}
+ dout(20) << " executing export_pin=" << export_pin << " on " << *in << dendl;
+ unsigned min_frag_bits = 0;
+ mds_rank_t target = MDS_RANK_NONE;
+ if (export_pin >= 0)
+ target = export_pin;
+ else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
+ target = mdcache->hash_into_rank_bucket(in->ino());
+ else if (export_pin == MDS_RANK_EPHEMERAL_DIST)
+ min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+
bool remove = true;
for (auto&& dir : in->get_dirfrags()) {
if (!dir->is_auth())
continue;
- if (export_pin == MDS_RANK_NONE) {
+ if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+ if (dir->get_frag().bits() < min_frag_bits) {
+ if (!dir->state_test(CDir::STATE_CREATING) &&
+ !dir->is_frozen() && !dir->is_freezing()) {
+ queue_split(dir, true);
+ }
+ remove = false;
+ continue;
+ }
+ target = mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
+ }
+
+ if (target == MDS_RANK_NONE) {
if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
if (dir->is_frozen() || dir->is_freezing()) {
// try again later
dir->state_clear(CDir::STATE_AUXSUBTREE);
mds->mdcache->try_subtree_merge(dir);
}
- } else if (export_pin == mds->get_nodeid()) {
+ } else if (target == mds->get_nodeid()) {
if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
ceph_assert(dir->is_subtree_root());
} else if (dir->state_test(CDir::STATE_CREATING) ||
* be sent back by the importer.
*/
if (dir->get_num_head_items() > 0) {
- mds->mdcache->migrator->export_dir(dir, export_pin);
+ mds->mdcache->migrator->export_dir(dir, target);
}
remove = false;
}
}
}
- std::vector<CDir *> authsubs = mds->mdcache->get_auth_subtrees();
+ std::vector<CDir*> authsubs = mdcache->get_auth_subtrees();
bool print_auth_subtrees = true;
if (authsubs.size() > AUTH_TREES_THRESHOLD &&
for (auto &cd : authsubs) {
mds_rank_t export_pin = cd->inode->get_export_pin();
+ cd->inode->check_pin_policy(export_pin);
- if (print_auth_subtrees) {
- dout(25) << "auth tree " << *cd << " export_pin=" << export_pin <<
- dendl;
+ if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+ export_pin = mdcache->hash_into_rank_bucket(cd->ino(), cd->get_frag());
+ } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
+ export_pin = mdcache->hash_into_rank_bucket(cd->ino());
}
- if (export_pin >= 0 && export_pin < mds->mdsmap->get_max_mds()) {
- if (export_pin == mds->get_nodeid()) {
- cd->get_inode()->check_pin_policy();
- } else {
- mds->mdcache->migrator->export_dir(cd, export_pin);
- }
+ if (print_auth_subtrees)
+ dout(25) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
+
+ if (export_pin >= 0 && export_pin != mds->get_nodeid() &&
+ export_pin < mds->mdsmap->get_max_mds()) {
+ mdcache->migrator->export_dir(cd, export_pin);
}
}
}
dout(10) << __func__ << " enqueuing " << *dir
<< " (fast=" << fast << ")" << dendl;
- const dirfrag_t frag = dir->dirfrag();
+ const dirfrag_t df = dir->dirfrag();
- auto callback = [this, frag](int r) {
- if (split_pending.erase(frag) == 0) {
+ auto callback = [this, df](int r) {
+ if (split_pending.erase(df) == 0) {
// Someone beat me to it. This can happen in the fast splitting
// path, because we spawn two contexts, one with mds->timer and
// one with mds->queue_waiter. The loser can safely just drop
return;
}
- CDir *split_dir = mds->mdcache->get_dirfrag(frag);
- if (!split_dir) {
- dout(10) << "drop split on " << frag << " because not in cache" << dendl;
+ auto mdcache = mds->mdcache;
+
+ CDir *dir = mdcache->get_dirfrag(df);
+ if (!dir) {
+ dout(10) << "drop split on " << df << " because not in cache" << dendl;
return;
}
- if (!split_dir->is_auth()) {
- dout(10) << "drop split on " << frag << " because non-auth" << dendl;
+ if (!dir->is_auth()) {
+ dout(10) << "drop split on " << df << " because non-auth" << dendl;
return;
}
// Pass on to MDCache: note that the split might still not
// happen if the checks in MDCache::can_fragment fail.
- dout(10) << __func__ << " splitting " << *split_dir << dendl;
- mds->mdcache->split_dir(split_dir, g_conf()->mds_bal_split_bits);
+ dout(10) << __func__ << " splitting " << *dir << dendl;
+ int bits = g_conf()->mds_bal_split_bits;
+ if (dir->inode->is_ephemeral_dist()) {
+ unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+ if (df.frag.bits() + bits < min_frag_bits)
+ bits = min_frag_bits - df.frag.bits();
+ }
+ mdcache->split_dir(dir, bits);
};
- bool is_new = false;
- if (split_pending.count(frag) == 0) {
- split_pending.insert(frag);
- is_new = true;
- }
+ auto ret = split_pending.insert(df);
+ bool is_new = ret.second;
if (fast) {
// Do the split ASAP: enqueue it in the MDSRank waiters which are
// starting one), and this context is the only one that erases it.
merge_pending.erase(frag);
- CDir *dir = mds->mdcache->get_dirfrag(frag);
+ auto mdcache = mds->mdcache;
+ CDir *dir = mdcache->get_dirfrag(frag);
if (!dir) {
dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
return;
CInode *diri = dir->get_inode();
+ unsigned min_frag_bits = 0;
+ if (diri->is_ephemeral_dist())
+ min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+
frag_t fg = dir->get_frag();
- while (fg != frag_t()) {
+ while (fg.bits() > min_frag_bits) {
frag_t sibfg = fg.get_sibling();
auto&& [complete, sibs] = diri->get_dirfrags_under(sibfg);
if (!complete) {
}
if (fg != dir->get_frag())
- mds->mdcache->merge_dir(diri, fg);
+ mdcache->merge_dir(diri, fg);
};
if (merge_pending.count(frag) == 0) {
cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
if (changed.count("mds_cache_reservation"))
cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
+
+ bool ephemeral_pin_config_changed = false;
if (changed.count("mds_export_ephemeral_distributed")) {
export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
/* copy to vector to avoid removals during iteration */
- std::vector<CInode*> migrate;
- migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
- for (auto& in : migrate) {
- in->maybe_ephemeral_dist();
- }
- mds->balancer->handle_export_pins();
+ ephemeral_pin_config_changed = true;
}
if (changed.count("mds_export_ephemeral_random")) {
export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
/* copy to vector to avoid removals during iteration */
+ ephemeral_pin_config_changed = true;
+ }
+ if (ephemeral_pin_config_changed) {
std::vector<CInode*> migrate;
- migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
+ migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
for (auto& in : migrate) {
- in->maybe_ephemeral_rand();
+ in->maybe_export_pin(true);
}
- mds->balancer->handle_export_pins();
}
if (changed.count("mds_export_ephemeral_random_max")) {
export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
if (cache_toofull()) {
exceeded_size_limit = true;
}
-
- in->maybe_ephemeral_dist(false);
}
void MDCache::remove_inode(CInode *o)
if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
export_pin_delayed_queue.erase(o);
- o->set_ephemeral_dist(false);
- o->set_ephemeral_rand(false);
+ o->clear_ephemeral_pin(true, true);
// remove from inode map
if (o->last == CEPH_NOSNAP) {
/*
* hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
*/
-mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino)
+mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, frag_t fg)
{
const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
uint64_t hash = rjhash64(ino);
+ if (fg)
+ hash = rjhash64(hash + rjhash64(fg.value()));
+
int64_t b = -1, j = 0;
while (j < max_mds) {
b = j;
}
}
- if (dir->is_auth()) {
- /* do this now that we are auth for the CDir */
- dir->inode->maybe_pin();
- }
-
show_subtrees();
}
trim(UINT64_MAX);
dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
-
- {
- dout(10) << "Migrating any ephemerally pinned inodes" << dendl;
- /* copy to vector to avoid removals during iteration */
- std::vector<CInode*> migrate;
- migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
- for (auto& in : migrate) {
- in->maybe_ephemeral_rand();
- }
- migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
- for (auto& in : migrate) {
- in->maybe_ephemeral_dist();
- }
- mds->balancer->handle_export_pins();
- }
-
// Export all subtrees to another active (usually rank 0) if not rank 0
int num_auth_subtree = 0;
if (!subtrees.empty() && mds->get_nodeid() != 0) {
}
migrator->clear_export_queue();
-
+ // stopping mds does not call MDBalancer::tick()
+ mds->balancer->handle_export_pins();
for (const auto& dir : ls) {
mds_rank_t dest = dir->get_inode()->authority().first;
if (dest > 0 && !mds->mdsmap->is_active(dest))
s &= CInode::MASK_STATE_REPLICATED;
if (s & CInode::STATE_RANDEPHEMERALPIN) {
dout(10) << "replica inode is random ephemeral pinned" << dendl;
- in->set_ephemeral_rand(true);
+ in->set_ephemeral_pin(false, true);
}
}
}
void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
+ const mds_rank_t max_mds = mdsmap.get_max_mds();
+
// process export_pin_delayed_queue whenever a new MDSMap received
auto &q = export_pin_delayed_queue;
for (auto it = q.begin(); it != q.end(); ) {
auto *in = *it;
mds_rank_t export_pin = in->get_export_pin(false);
- if (in->is_ephemerally_pinned()) {
- dout(10) << "ephemeral export pin to " << export_pin << " for " << *in << dendl;
- }
dout(10) << " delayed export_pin=" << export_pin << " on " << *in
- << " max_mds=" << mdsmap.get_max_mds() << dendl;
+ << " max_mds=" << max_mds << dendl;
if (export_pin >= mdsmap.get_max_mds()) {
it++;
continue;
dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl;
/* copy to vector to avoid removals during iteration */
std::vector<CInode*> migrate;
- migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
- for (auto& in : migrate) {
- in->maybe_ephemeral_rand();
- }
- migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
+ migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
for (auto& in : migrate) {
- in->maybe_ephemeral_dist();
+ in->maybe_export_pin();
}
}
+
+ if (max_mds <= 1) {
+ export_ephemeral_dist_frag_bits = 0;
+ } else {
+ double want = g_conf().get_val<double>("mds_export_ephemeral_distributed_factor");
+ want *= max_mds;
+ unsigned n = 0;
+ while ((1U << n) < (unsigned)want)
+ ++n;
+ export_ephemeral_dist_frag_bits = n;
+ }
}
void advance_stray();
+  /* Accessor for export_ephemeral_dist_frag_bits. */
+  unsigned get_ephemeral_dist_frag_bits() const { return export_ephemeral_dist_frag_bits; }
bool get_export_ephemeral_distributed_config(void) const {
return export_ephemeral_distributed_config;
}
stray_manager.eval_stray(dn);
}
- mds_rank_t hash_into_rank_bucket(inodeno_t ino);
+ mds_rank_t hash_into_rank_bucket(inodeno_t ino, frag_t fg=0);
void maybe_eval_stray(CInode *in, bool delay=false);
void clear_dirty_bits_for_stray(CInode* diri);
/* Because exports may fail, this set lets us keep track of inodes that need exporting. */
std::set<CInode *> export_pin_queue;
std::set<CInode *> export_pin_delayed_queue;
- std::set<CInode *> rand_ephemeral_pins;
- std::set<CInode *> dist_ephemeral_pins;
+ std::set<CInode *> export_ephemeral_pins;
OpenFileTable open_file_table;
bool export_ephemeral_distributed_config;
bool export_ephemeral_random_config;
+ unsigned export_ephemeral_dist_frag_bits;
// File size recovery
RecoveryQueue recovery_queue;
{
f->dump_bool("is_auth", dir->is_auth());
f->dump_int("auth_first", dir->get_dir_auth().first);
- f->dump_int("auth_second", dir->get_dir_auth().second);
- f->dump_int("export_pin", dir->inode->get_export_pin(false, false));
- f->dump_bool("distributed_ephemeral_pin", dir->inode->is_ephemeral_dist());
- f->dump_bool("random_ephemeral_pin", dir->inode->is_ephemeral_rand());
- f->dump_int("ephemeral_pin", mdcache->hash_into_rank_bucket(dir->inode->ino()));
+ f->dump_int("auth_second", dir->get_dir_auth().second); {
+ mds_rank_t export_pin = dir->inode->get_export_pin(false);
+ f->dump_int("export_pin", export_pin >= 0 ? export_pin : -1);
+ f->dump_bool("distributed_ephemeral_pin", export_pin == MDS_RANK_EPHEMERAL_DIST);
+ f->dump_bool("random_ephemeral_pin", export_pin == MDS_RANK_EPHEMERAL_RAND);
+ }
+ f->dump_int("export_pin_target", dir->get_export_pin(false));
f->open_object_section("dir");
dir->dump(f);
f->close_section();
ceph_assert(dest != mds->get_nodeid());
CDir* parent = dir->inode->get_projected_parent_dir();
- if (!mds->is_stopping() && !dir->inode->is_exportable(dest) && dir->get_num_head_items() > 0) {
+ if (!mds->is_stopping() && !dir->is_exportable(dest) && dir->get_num_head_items() > 0) {
dout(7) << "Cannot export to mds." << dest << " " << *dir << ": dir is export pinned" << dendl;
return;
} else if (!(mds->is_active() || mds->is_stopping())) {
dir->verify_fragstat();
#endif
- dir->inode->maybe_ephemeral_dist();
dir->inode->maybe_export_pin();
dout(7) << " done " << *dir << dendl;
get_mds()->locker->share_inode_max_size(newi);
} else if (newi->is_dir()) {
// We do this now so that the linkages on the new directory are stable.
- newi->maybe_ephemeral_dist();
- newi->maybe_ephemeral_rand(true);
+ newi->maybe_ephemeral_rand();
}
// hit pop
if (in->is_dir()) {
if (is_export_ephemeral_random()) {
dout(15) << "random ephemeral pin on " << *in << dendl;
- in->set_ephemeral_rand(true);
- in->maybe_ephemeral_rand(true);
+ in->set_ephemeral_pin(false, true);
}
- in->maybe_ephemeral_dist();
in->maybe_export_pin();
if (!(in->dirfragtree == dirfragtree)) {
dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)
typedef int32_t mds_rank_t;
-constexpr mds_rank_t MDS_RANK_NONE = -1;
+constexpr mds_rank_t MDS_RANK_NONE = -1;
+constexpr mds_rank_t MDS_RANK_EPHEMERAL_DIST = -2;
+constexpr mds_rank_t MDS_RANK_EPHEMERAL_RAND = -3;
BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
extern const mds_gid_t MDS_GID_NONE;