back to being a standby.
+Manually pinning directory trees to a particular rank
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In configurations with multiple active metadata servers, a balancer runs which
+works to spread metadata load evenly across the cluster. This usually works
+well enough for most users, but sometimes it is desirable to override the
+dynamic balancer with explicit mappings of metadata to particular ranks. This
+allows the administrator or users to evenly spread application load or to
+limit the impact of users' metadata requests on the entire cluster.
+
+The mechanism provided for this purpose is called an ``export pin``, an
+extended attribute of directories. The name of this extended attribute is
+``ceph.dir.pin``. Users can set this attribute using standard commands:
+
+::
+ setfattr -n ceph.dir.pin -v 2 path/to/dir
+
+The value of the extended attribute is the rank to which the directory subtree
+should be assigned. The default value of ``-1`` indicates that the directory is
+not pinned.
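+
+To remove an export pin, set the attribute back to its default value of
+``-1``, for example:
+
+::
+
+    setfattr -n ceph.dir.pin -v -1 path/to/dir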
+
+A directory's export pin is inherited from its closest parent with a set export
+pin. In this way, setting the export pin on a directory affects all of its
+children. However, the parent's pin can be overridden by setting the child
+directory's export pin. For example:
+
+::
+ mkdir -p a/b
+ # "a" and "a/b" both start without an export pin set
+ setfattr -n ceph.dir.pin -v 1 a/
+ # a and b are now pinned to rank 1
+ setfattr -n ceph.dir.pin -v 0 a/b
+ # a/b is now pinned to rank 0 and a/ and the rest of its children are still pinned to rank 1
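+
+Whether a pinned subtree has actually migrated can be checked through the MDS
+admin socket. As a rough sketch (assuming an active daemon named ``mds.a``),
+the ``get subtrees`` command lists each subtree path along with the rank that
+is authoritative for it (``auth_first``):
+
+::
+
+    ceph daemon mds.a get subtrees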
--- /dev/null
+tasks:
+- cephfs_test_runner:
+    modules:
+      - tasks.cephfs.test_exports
--- /dev/null
+import logging
+import time
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestExports(CephFSTestCase):
+    def test_export_pin(self):
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("FUSE needed for measuring op counts")
+
+        self.fs.set_allow_multimds(True)
+        self.fs.set_max_mds(2)
+
+        status = self.fs.status()
+
+        self.mount_a.run_shell(["mkdir", "-p", "1/2/3"])
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        self.assertTrue(len(subtrees) == 0)
+
+        # NOP
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-1", "1"])
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        self.assertTrue(len(subtrees) == 0)
+
+        # NOP (rank < -1)
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-2341", "1"])
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        self.assertTrue(len(subtrees) == 0)
+
+        # pin /1 to rank 1
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1"])
+        time.sleep(10)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        log.info(subtrees)
+        self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1)
+
+        # Check export_targets is set properly
+        status = self.fs.status()
+        log.info(status)
+        r0 = status.get_rank(self.fs.id, 0)
+        self.assertTrue(sorted(r0['export_targets']) == [1])
+
+        # redundant pin /1/2 to rank 1
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/2"])
+        time.sleep(10)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1)
+
+        # change pin /1/2 to rank 0
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "0", "1/2"])
+        time.sleep(10)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        self.assertTrue(len(subtrees) == 2)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1/2'), subtrees)
+        self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 0)
+
+        # change pin /1/2/3 to (presently) non-existent rank 2
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "2", "1/2/3"])
+        time.sleep(10)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        self.assertTrue(len(subtrees) == 2)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1/2'), subtrees)
+        self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 0)
+
+        # change pin /1/2 back to rank 1
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/2"])
+        time.sleep(10)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1)
+
+        # add another directory pinned to 1
+        self.mount_a.run_shell(["mkdir", "-p", "1/4/5"])
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/4/5"])
+
+        # change pin /1 to 0
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "0", "1"])
+        time.sleep(20)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'] == '/1', subtrees)
+        self.assertTrue(len(subtrees) == 0) # /1 is merged into root
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1/'), subtrees)
+        self.assertTrue(len(subtrees) == 2 and subtrees[0]['auth_first'] == 1 and subtrees[1]['auth_first'] == 1) # /1/2 and /1/4/5
+
+        # change pin /1/2 to default (-1); does the subtree root properly respect its parent's pin?
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-1", "1/2"])
+        time.sleep(10)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees)
+        self.assertTrue(len(subtrees) == 1) # /1/4 still here!
+
+        if len(list(status.get_standbys())):
+            self.fs.set_max_mds(3)
+            time.sleep(10)
+            status = self.fs.status()
+            subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 2)['name'])
+            log.info(subtrees)
+            subtrees = filter(lambda s: s['dir']['path'].startswith('/1/2/3'), subtrees)
+            self.assertTrue(len(subtrees) == 1)
+
+            # Check export_targets is set properly
+            status = self.fs.status()
+            log.info(status)
+            r0 = status.get_rank(self.fs.id, 0)
+            self.assertTrue(sorted(r0['export_targets']) == [1,2])
+            r1 = status.get_rank(self.fs.id, 1)
+            self.assertTrue(sorted(r1['export_targets']) == [0])
+            r2 = status.get_rank(self.fs.id, 2)
+            self.assertTrue(sorted(r2['export_targets']) == [])
+
+        # Test rename
+        self.mount_a.run_shell(["mkdir", "-p", "a/b", "aa/bb"])
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "a"])
+        time.sleep(10)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/a'), subtrees)
+        self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1)
+        self.mount_a.run_shell(["mv", "aa", "a/b/"])
+        time.sleep(10)
+        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name'])
+        log.info(subtrees)
+        subtrees = filter(lambda s: s['dir']['path'].startswith('/a'), subtrees)
+        self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1)
OPTION(mds_log_segment_size, OPT_INT, 0) // segment size for mds log, default to default file_layout_t
OPTION(mds_log_max_segments, OPT_U32, 30)
OPTION(mds_log_max_expiring, OPT_INT, 20)
+OPTION(mds_bal_export_pin, OPT_BOOL, true) // allow clients to pin directory trees to ranks
OPTION(mds_bal_sample_interval, OPT_DOUBLE, 3.0) // every 3 seconds
OPTION(mds_bal_replicate_threshold, OPT_FLOAT, 8000)
OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT, 0)
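The new ``mds_bal_export_pin`` option is an ordinary MDS configuration knob, so
the mechanism can in principle be switched off at runtime as well as in
``ceph.conf``. A minimal sketch, assuming an MDS daemon named ``mds.a``:

    ceph daemon mds.a config set mds_bal_export_pin false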
in.print_pin_set(out);
}
+  if (in.inode.export_pin != MDS_RANK_NONE) {
+    out << " export_pin=" << in.inode.export_pin;
+  }
+
out << " " << &in;
out << "]";
return out;
::encode(inode.ctime, bl);
::encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features());
::encode(inode.quota, bl);
+ ::encode(inode.export_pin, bl);
}
break;
if (inode.ctime < tm) inode.ctime = tm;
::decode(inode.layout, p);
::decode(inode.quota, p);
+ ::decode(inode.export_pin, p);
+ maybe_export_pin();
}
break;
return inode.layout.pool_id;
}
}
+
+void CInode::maybe_export_pin()
+{
+  if (g_conf->mds_bal_export_pin && is_dir() && !mdcache->export_pin_queue.count(this)) {
+    dout(20) << "maybe_export_pin " << *this << dendl;
+    mds_rank_t pin = get_projected_inode()->export_pin;
+    if (pin == MDS_RANK_NONE) {
+      /* Try to find farthest full auth parent fragment which is pinned
+       * elsewhere. There cannot be a break in the authority chain of
+       * directories, otherwise this inode itself will not be exported.
+       */
+      CInode *auth_last = this; /* N.B. we may not be full auth for any fragments of this inode, but adding it to the queue is harmless. */
+      bool auth_barrier = false;
+      for (CDir *cd = get_projected_parent_dir(); cd && !cd->inode->is_base() && !cd->inode->is_system(); cd = cd->inode->get_projected_parent_dir()) {
+        if (cd->is_full_dir_auth() && !auth_barrier) {
+          auth_last = cd->inode;
+        } else {
+          auth_barrier = true;
+        }
+        pin = cd->inode->get_projected_inode()->export_pin;
+        if (pin != MDS_RANK_NONE) {
+          if (pin != mdcache->mds->get_nodeid()) {
+            dout(20) << "adding ancestor to export_pin_queue " << *auth_last << dendl;
+            mdcache->export_pin_queue.insert(auth_last);
+          } else {
+            break; /* it is correctly pinned here! */
+          }
+        }
+      }
+    } else {
+      if (pin != mdcache->mds->get_nodeid()) {
+        dout(20) << "adding to export_pin_queue " << *this << dendl;
+        mdcache->export_pin_queue.insert(this);
+      }
+    }
+  }
+}
+
+void CInode::set_export_pin(mds_rank_t rank)
+{
+  assert(is_dir());
+  assert(is_projected());
+  get_projected_inode()->export_pin = rank;
+  maybe_export_pin();
+}
+
+mds_rank_t CInode::get_export_pin(void) const
+{
+  /* An inode that is export pinned may not necessarily be a subtree root, we
+   * need to traverse the parents. A base or system inode cannot be pinned.
+   * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
+   * have a parent yet.
+   */
+  for (const CInode *in = this; !in->is_base() && !in->is_system() && in->get_projected_parent_dn(); in = in->get_projected_parent_dn()->dir->inode) {
+    mds_rank_t pin = in->get_projected_inode()->export_pin;
+    if (pin >= 0) {
+      return pin;
+    }
+  }
+  return MDS_RANK_NONE;
+}
+
+bool CInode::is_exportable(mds_rank_t dest) const
+{
+  mds_rank_t pin = get_export_pin();
+  if (pin == dest) {
+    return true;
+  } else if (pin >= 0) {
+    return false;
+  } else {
+    return true;
+  }
+}
projected_parent.pop_front();
}
+private:
+ void maybe_export_pin();
+public:
+ void set_export_pin(mds_rank_t rank);
+ mds_rank_t get_export_pin(void) const;
+ bool is_exportable(mds_rank_t dest) const;
+
void print(ostream& out) override;
void dump(Formatter *f) const;
return 0;
}
+void MDBalancer::handle_export_pins(void)
+{
+  auto &q = mds->mdcache->export_pin_queue;
+  auto it = q.begin();
+  dout(20) << "export_pin_queue size=" << q.size() << dendl;
+  while (it != q.end()) {
+    auto current = it++;
+    CInode *in = *current;
+    assert(in->is_dir());
+    mds_rank_t export_pin = in->get_export_pin();
+    if (!in->is_exportable(export_pin)) {
+      dout(10) << "can no longer export " << *in << " because export pins have since changed" << dendl;
+      q.erase(current);
+      continue;
+    }
+    dout(10) << "exporting dirfrags of " << *in << " to " << export_pin << dendl;
+    bool has_auth = false;
+    list<frag_t> ls;
+    in->dirfragtree.get_leaves(ls);
+    for (const auto &fg : ls) {
+      CDir *cd = in->get_dirfrag(fg);
+      if (cd && cd->is_auth()) {
+        /* N.B. when we are no longer auth after exporting, this function will remove the inode from the queue */
+        mds->mdcache->migrator->export_dir(cd, export_pin);
+        has_auth = true;
+      }
+    }
+    if (!has_auth) {
+      dout(10) << "can no longer export " << *in << " because I am not auth for any dirfrags" << dendl;
+      q.erase(current);
+      continue;
+    }
+  }
-
+  set<CDir *> authsubs;
+  mds->mdcache->get_auth_subtrees(authsubs);
+  for (auto &cd : authsubs) {
+    mds_rank_t export_pin = cd->inode->get_export_pin();
+    dout(10) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
+    if (export_pin >= 0 && export_pin != mds->get_nodeid()) {
+      dout(10) << "exporting auth subtree " << *cd->inode << " to " << export_pin << dendl;
+      mds->mdcache->migrator->export_dir(cd, export_pin);
+    }
+  }
+}
void MDBalancer::tick()
{
utime_t elapsed = now;
elapsed -= first;
+  if (g_conf->mds_bal_export_pin) {
+    handle_export_pins();
+  }
+
// sample?
if ((double)now - (double)last_sample > g_conf->mds_bal_sample_interval) {
dout(15) << "tick last_sample now " << now << dendl;
void prep_rebalance(int beat);
int mantle_prep_rebalance();
+ void handle_export_pins(void);
+
void export_empties();
int localize_balancer();
bool check_targets(const balance_state_t& state);
g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
exceeded_size_limit = true;
}
+
+ /* This is the place where inodes are thrown in the cache, so check if the inode should be somewhere else: */
+ in->maybe_export_pin();
}
void MDCache::remove_inode(CInode *o)
o->item_open_file.remove_myself();
+ export_pin_queue.erase(o);
+
// remove from inode map
inode_map.erase(o->vino());
in->inode.nlink = 1;
in->inode.truncate_size = -1ull;
in->inode.change_attr = 0;
+ in->inode.export_pin = MDS_RANK_NONE;
memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
if (in->inode.is_dir()) {
Formatter *f, Context *fin);
void repair_inode_stats(CInode *diri);
void repair_dirfrag_stats(CDir *dir);
+
+public:
+ /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
+ std::set<CInode *> export_pin_queue;
};
class C_MDS_RetryRequest : public MDSInternalContext {
#include "Beacon.h"
-#define CEPH_MDS_PROTOCOL 28 /* cluster internal */
+#define CEPH_MDS_PROTOCOL 29 /* cluster internal */
class MonClient;
return;
}
+  if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
+    dout(7) << "dir is export pinned" << dendl;
+    return;
+  }
+
+  if (dest == mds->get_nodeid() || !mds->mdsmap->is_up(dest)) {
+    dout(7) << "cannot export: dest " << dest << " is me or is not active" << dendl;
+    return;
+  }
+
mds->hit_export_target(ceph_clock_now(), dest, -1);
dir->auth_pin(this);
::encode(d_type, exportbl);
continue;
}
-
+
+    /* XXX The inode may be pinned to me (in->get_inode().export_pin) but it is
+     * not a subtree by the time I've found it here. So, keeping it is
+     * difficult as we've already notified the importer of the subtree bounds
+     * (MExportDirPrep). Creating a new subtree for this pinned inode would
+     * probably require widespread changes and is not worth the effort since
+     * the importer will simply export this inode and its subtrees back to us
+     * (Migrator::decode_import_inode). This should be rare enough to not
+     * justify mucking with things here.
+     */
+
// primary link
// -- inode
exportbl.append("I", 1); // inode dentry
in->add_replica(oldauth, CInode::EXPORT_NONCE);
if (in->is_replica(mds->get_nodeid()))
in->remove_replica(mds->get_nodeid());
-
+
+ in->maybe_export_pin();
}
void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
pi = cur->project_inode();
pi->quota = quota;
+  } else if (name.find("ceph.dir.pin") == 0) {
+    if (!cur->is_dir() || cur->is_root()) {
+      respond_to_request(mdr, -EINVAL);
+      return;
+    }
+
+    mds_rank_t rank;
+    try {
+      rank = boost::lexical_cast<mds_rank_t>(value);
+      if (rank < 0) rank = MDS_RANK_NONE;
+    } catch (boost::bad_lexical_cast const&) {
+      dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+      respond_to_request(mdr, -EINVAL);
+      return;
+    }
+
+    xlocks.insert(&cur->policylock);
+    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+      return;
+
+    pi = cur->project_inode();
+    cur->set_export_pin(rank);
} else {
dout(10) << " unknown vxattr " << name << dendl;
respond_to_request(mdr, -EINVAL);
*/
void inode_t::encode(bufferlist &bl, uint64_t features) const
{
- ENCODE_START(14, 6, bl);
+ ENCODE_START(15, 6, bl);
::encode(ino, bl);
::encode(rdev, bl);
::encode(btime, bl);
::encode(change_attr, bl);
+ ::encode(export_pin, bl);
+
ENCODE_FINISH(bl);
}
void inode_t::decode(bufferlist::iterator &p)
{
- DECODE_START_LEGACY_COMPAT_LEN(14, 6, 6, p);
+ DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
::decode(ino, p);
::decode(rdev, p);
change_attr = 0;
}
+  if (struct_v >= 15) {
+    ::decode(export_pin, p);
+  } else {
+    export_pin = MDS_RANK_NONE;
+  }
+
DECODE_FINISH(p);
}
f->dump_stream("atime") << atime;
f->dump_unsigned("time_warp_seq", time_warp_seq);
f->dump_unsigned("change_attr", change_attr);
+ f->dump_int("export_pin", export_pin);
f->open_array_section("client_ranges");
for (map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin(); p != client_ranges.end(); ++p) {
typedef int32_t mds_rank_t;
typedef int32_t fs_cluster_id_t;
-
-
BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
extern const mds_gid_t MDS_GID_NONE;
constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = {-1};
nest_info_t accounted_rstat; // protected by parent's nestlock
quota_info_t quota;
+
+ mds_rank_t export_pin;
// special stuff
version_t version; // auth only
truncate_seq(0), truncate_size(0), truncate_from(0),
truncate_pending(0),
time_warp_seq(0), change_attr(0),
+ export_pin(MDS_RANK_NONE),
version(0), file_data_version(0), xattr_version(0),
last_scrub_version(0), backtrace_version(0) {
clear_layout();