From 0b420be7e9f5b325315db101ea65258a9043468a Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 11 Apr 2017 15:16:02 -0400 Subject: [PATCH] mds: add export_pin feature This allows the client/admin to pin a directory tree to a particular rank, preventing its export by the dynamic balancer. Fixes: http://tracker.ceph.com/issues/17834 Signed-off-by: Patrick Donnelly --- doc/cephfs/multimds.rst | 32 ++++ .../basic/tasks/cephfs_test_exports.yaml | 4 + qa/tasks/cephfs/test_exports.py | 146 ++++++++++++++++++ src/common/config_opts.h | 1 + src/mds/CInode.cc | 80 ++++++++++ src/mds/CInode.h | 7 + src/mds/MDBalancer.cc | 49 +++++- src/mds/MDBalancer.h | 2 + src/mds/MDCache.cc | 6 + src/mds/MDCache.h | 4 + src/mds/MDSDaemon.h | 2 +- src/mds/Migrator.cc | 25 ++- src/mds/Server.cc | 22 +++ src/mds/mdstypes.cc | 13 +- src/mds/mdstypes.h | 5 +- 15 files changed, 390 insertions(+), 8 deletions(-) create mode 100644 qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml create mode 100644 qa/tasks/cephfs/test_exports.py diff --git a/doc/cephfs/multimds.rst b/doc/cephfs/multimds.rst index a1832bb9382..942a786d623 100644 --- a/doc/cephfs/multimds.rst +++ b/doc/cephfs/multimds.rst @@ -107,3 +107,35 @@ When a daemon finishes stopping, it will respawn itself and go back to being a standby. +Manually pinning directory trees to a particular rank +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In multiple active metadata server configurations, a balancer runs which works +to spread metadata load evenly across the cluster. This usually works well +enough for most users, but sometimes it is desirable to override the dynamic +balancer with explicit mappings of metadata to particular ranks. This can allow +the administrator or users to evenly spread application load or to limit the +impact of users' metadata requests on the rest of the cluster. + +The mechanism provided for this purpose is called an ``export pin``, an +extended attribute of directories. The name of this extended attribute is +``ceph.dir.pin``. Users can set this attribute using standard commands: + +:: + setfattr -n ceph.dir.pin -v 2 path/to/dir + +The value of the extended attribute is the rank to assign the directory subtree +to. A default value of ``-1`` indicates the directory is not pinned. + +A directory's export pin is inherited from its closest parent with a set export +pin. In this way, setting the export pin on a directory affects all of its +children. However, the parent's pin can be overridden by setting the child +directory's own export pin.
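+The current pin can be read back with ``getfattr`` (an illustrative check; +whether this virtual attribute is readable depends on the client and version, +and an unpinned directory reports ``-1``): + +:: + getfattr -n ceph.dir.pin path/to/dir +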
For example, inheritance and overriding work as follows: + +:: + mkdir -p a/b + # "a" and "a/b" both start without an export pin set + setfattr -n ceph.dir.pin -v 1 a/ + # a and b are now pinned to rank 1 + setfattr -n ceph.dir.pin -v 0 a/b + # a/b is now pinned to rank 0, while a/ and the rest of its children are still pinned to rank 1 diff --git a/qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml b/qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml new file mode 100644 index 00000000000..b5842b3ef1b --- /dev/null +++ b/qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml @@ -0,0 +1,4 @@ +tasks: +- cephfs_test_runner: + modules: + - tasks.cephfs.test_exports diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py new file mode 100644 index 00000000000..21e2168262e --- /dev/null +++ b/qa/tasks/cephfs/test_exports.py @@ -0,0 +1,146 @@ +import logging +import time +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + + +class TestExports(CephFSTestCase): + def test_export_pin(self): + if not isinstance(self.mount_a, FuseMount): + self.skipTest("FUSE needed for setting vxattrs") + + self.fs.set_allow_multimds(True) + self.fs.set_max_mds(2) + + status = self.fs.status() + + self.mount_a.run_shell(["mkdir", "-p", "1/2/3"]) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + self.assertTrue(len(subtrees) == 0) + + # NOP + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-1", "1"]) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + self.assertTrue(len(subtrees) == 0) + + # NOP (rank < -1) + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-2341", "1"]) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + self.assertTrue(len(subtrees) == 0) + + # pin /1 to rank 1 + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1"]) + time.sleep(10) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + log.info(subtrees) + self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1) + + # Check export_targets is set properly + status = self.fs.status() + log.info(status) + r0 = status.get_rank(self.fs.id, 0) + self.assertTrue(sorted(r0['export_targets']) == [1]) + + # redundant pin /1/2 to rank 1 + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/2"]) + time.sleep(10) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1) + + # change pin /1/2 to rank 0 + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "0", "1/2"]) + time.sleep(10) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + self.assertTrue(len(subtrees) == 2)
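+ # Rank 0's cache should now show two subtree roots under /1: /1 itself (auth rank 1) and the newly pinned /1/2 (auth rank 0); the next filter narrows the check to /1/2.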
+ subtrees = filter(lambda s: s['dir']['path'].startswith('/1/2'), subtrees) + self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 0) + + # change pin /1/2/3 to (presently) non-existent rank 2 + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "2", "1/2/3"]) + time.sleep(10) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + self.assertTrue(len(subtrees) == 2) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1/2'), subtrees) + self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 0) + + # change pin /1/2 back to rank 1 + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/2"]) + time.sleep(10) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1) + + # add another directory pinned to 1 + self.mount_a.run_shell(["mkdir", "-p", "1/4/5"]) + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/4/5"]) + + # change pin /1 to 0 + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "0", "1"]) + time.sleep(20) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'] == '/1', subtrees) + self.assertTrue(len(subtrees) == 0) # /1 is merged into root + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1/'), subtrees) + self.assertTrue(len(subtrees) == 2 and subtrees[0]['auth_first'] == 1 and subtrees[1]['auth_first'] == 1) # /1/2 and /1/4/5 + + # change pin /1/2 to default (-1); does the subtree root properly respect its parent's pin? + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-1", "1/2"]) + time.sleep(10) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) + self.assertTrue(len(subtrees) == 1) # /1/4 still here!
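+ # Clearing a pin (-1) makes the directory inherit the pin of its nearest pinned ancestor: /1 is now pinned to rank 0, so /1/2 merges back into rank 0's tree, while /1/4/5 (explicitly pinned to rank 1) is unaffected.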
+ + if len(list(status.get_standbys())): + self.fs.set_max_mds(3) + time.sleep(10) + status = self.fs.status() + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 2)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/1/2/3'), subtrees) + self.assertTrue(len(subtrees) == 1) + + # Check export_targets is set properly + status = self.fs.status() + log.info(status) + r0 = status.get_rank(self.fs.id, 0) + self.assertTrue(sorted(r0['export_targets']) == [1,2]) + r1 = status.get_rank(self.fs.id, 1) + self.assertTrue(sorted(r1['export_targets']) == [0]) + r2 = status.get_rank(self.fs.id, 2) + self.assertTrue(sorted(r2['export_targets']) == []) + + # Test rename + self.mount_a.run_shell(["mkdir", "-p", "a/b", "aa/bb"]) + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "a"]) + time.sleep(10) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/a'), subtrees) + self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1) + self.mount_a.run_shell(["mv", "aa", "a/b/"]) + time.sleep(10) + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) + log.info(subtrees) + subtrees = filter(lambda s: s['dir']['path'].startswith('/a'), subtrees) + self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 4f0632db778..9b815a86272 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -523,6 +523,7 @@ OPTION(mds_log_events_per_segment, OPT_INT, 1024) OPTION(mds_log_segment_size, OPT_INT, 0) // segment size for mds log, default to default file_layout_t OPTION(mds_log_max_segments, OPT_U32, 30) OPTION(mds_log_max_expiring, OPT_INT, 20) +OPTION(mds_bal_export_pin, OPT_BOOL, true) // allow clients to pin directory trees to ranks OPTION(mds_bal_sample_interval, OPT_DOUBLE, 3.0) // every 3 seconds OPTION(mds_bal_replicate_threshold, OPT_FLOAT, 8000) OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT, 0) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 836362eaad9..92c2735d2c7 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -247,6 +247,10 @@ ostream& operator<<(ostream& out, const CInode& in) in.print_pin_set(out); } + if (in.inode.export_pin != MDS_RANK_NONE) { + out << " export_pin=" << in.inode.export_pin; + } + out << " " << &in; out << "]"; return out; @@ -1526,6 +1530,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl) ::encode(inode.ctime, bl); ::encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features()); ::encode(inode.quota, bl); + ::encode(inode.export_pin, bl); } break; @@ -1788,6 +1793,8 @@ void CInode::decode_lock_state(int type, bufferlist& bl) if (inode.ctime < tm) inode.ctime = tm; ::decode(inode.layout, p); ::decode(inode.quota, p); + ::decode(inode.export_pin, p); + maybe_export_pin(); } break; @@ -4383,3 +4390,76 @@ int64_t CInode::get_backtrace_pool() const return inode.layout.pool_id; } } + +void CInode::maybe_export_pin() +{ + if (g_conf->mds_bal_export_pin && is_dir() && !mdcache->export_pin_queue.count(this)) { + dout(20) << "maybe_export_pin " << *this << dendl; + mds_rank_t pin = get_projected_inode()->export_pin; + if (pin == MDS_RANK_NONE) { + /* Try to find farthest full auth parent fragment which is pinned + * elsewhere.
There cannot be a break in the authority chain of + * directories, otherwise this inode itself will not be exported. + */ + CInode *auth_last = this; /* N.B. we may not be full auth for any fragments of this inode, but adding it to the queue is harmless. */ + bool auth_barrier = false; + for (CDir *cd = get_projected_parent_dir(); cd && !cd->inode->is_base() && !cd->inode->is_system(); cd = cd->inode->get_projected_parent_dir()) { + if (cd->is_full_dir_auth() && !auth_barrier) { + auth_last = cd->inode; + } else { + auth_barrier = true; + } + pin = cd->inode->get_projected_inode()->export_pin; + if (pin != MDS_RANK_NONE) { + if (pin != mdcache->mds->get_nodeid()) { + dout(20) << "adding ancestor to export_pin_queue " << *auth_last << dendl; + mdcache->export_pin_queue.insert(auth_last); + } else { + break; /* it is correctly pinned here! */ + } + } + } + } else { + if (pin != mdcache->mds->get_nodeid()) { + dout(20) << "adding to export_pin_queue " << *this << dendl; + mdcache->export_pin_queue.insert(this); + } + } + } +} + +void CInode::set_export_pin(mds_rank_t rank) +{ + assert(is_dir()); + assert(is_projected()); + get_projected_inode()->export_pin = rank; + maybe_export_pin(); +} + +mds_rank_t CInode::get_export_pin(void) const +{ + /* An inode that is export pinned may not necessarily be a subtree root, we + * need to traverse the parents. A base or system inode cannot be pinned. + * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not + * have a parent yet. + */ + for (const CInode *in = this; !in->is_base() && !in->is_system() && in->get_projected_parent_dn(); in = in->get_projected_parent_dn()->dir->inode) { + mds_rank_t pin = in->get_projected_inode()->export_pin; + if (pin >= 0) { + return pin; + } + } + return MDS_RANK_NONE; +} + +bool CInode::is_exportable(mds_rank_t dest) const +{ + mds_rank_t pin = get_export_pin(); + if (pin == dest) { + return true; + } else if (pin >= 0) { + return false; + } else { + return true; + } +} diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 1d6b210e0b4..9ca709652f9 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -1087,6 +1087,13 @@ public: projected_parent.pop_front(); } +private: + void maybe_export_pin(); +public: + void set_export_pin(mds_rank_t rank); + mds_rank_t get_export_pin(void) const; + bool is_exportable(mds_rank_t dest) const; + void print(ostream& out) override; void dump(Formatter *f) const; diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index 2f9b26fa78c..cb2ae76149a 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -67,8 +67,51 @@ int MDBalancer::proc_message(Message *m) return 0; } +void MDBalancer::handle_export_pins(void) +{ + auto &q = mds->mdcache->export_pin_queue; + auto it = q.begin(); + dout(20) << "export_pin_queue size=" << q.size() << dendl; + while (it != q.end()) { + auto current = it++; + CInode *in = *current; + assert(in->is_dir()); + mds_rank_t export_pin = in->get_export_pin(); + if (!in->is_exportable(export_pin)) { + dout(10) << "can no longer export " << *in << " because export pins have since changed" << dendl; + q.erase(current); + continue; + } + dout(10) << "exporting dirfrags of " << *in << " to " << export_pin << dendl; + bool has_auth = false; + list<frag_t> ls; + in->dirfragtree.get_leaves(ls); + for (const auto &fg : ls) { + CDir *cd = in->get_dirfrag(fg); + if (cd && cd->is_auth()) { + /* N.B.
when we are no longer auth after exporting, this function will remove the inode from the queue */ + mds->mdcache->migrator->export_dir(cd, export_pin); + has_auth = true; + } + } + if (!has_auth) { + dout(10) << "can no longer export " << *in << " because I am not auth for any dirfrags" << dendl; + q.erase(current); + continue; + } + } - + set<CDir *> authsubs; + mds->mdcache->get_auth_subtrees(authsubs); + for (auto &cd : authsubs) { + mds_rank_t export_pin = cd->inode->get_export_pin(); + dout(10) << "auth tree " << *cd << " export_pin=" << export_pin << dendl; + if (export_pin >= 0 && export_pin != mds->get_nodeid()) { + dout(10) << "exporting auth subtree " << *cd->inode << " to " << export_pin << dendl; + mds->mdcache->migrator->export_dir(cd, export_pin); + } + } +} void MDBalancer::tick() { @@ -78,6 +121,10 @@ void MDBalancer::tick() utime_t elapsed = now; elapsed -= first; + if (g_conf->mds_bal_export_pin) { + handle_export_pins(); + } + // sample? if ((double)now - (double)last_sample > g_conf->mds_bal_sample_interval) { dout(15) << "tick last_sample now " << now << dendl; diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h index ca5f290657e..cbfc1c479ff 100644 --- a/src/mds/MDBalancer.h +++ b/src/mds/MDBalancer.h @@ -85,6 +85,8 @@ private: void prep_rebalance(int beat); int mantle_prep_rebalance(); + void handle_export_pins(void); + void export_empties(); int localize_balancer(); bool check_targets(const balance_state_t& state); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index faaaef32110..56a208853e9 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -271,6 +271,9 @@ void MDCache::add_inode(CInode *in) g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) { exceeded_size_limit = true; } + + /* This is the place where inodes are added to the cache, so check whether the inode should live somewhere else: */ + in->maybe_export_pin(); } void MDCache::remove_inode(CInode *o) @@ -293,6 +296,8 @@ void MDCache::remove_inode(CInode *o) o->item_open_file.remove_myself(); + export_pin_queue.erase(o); + // remove from inode map inode_map.erase(o->vino()); @@ -351,6 +356,7 @@ void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino, in->inode.nlink = 1; in->inode.truncate_size = -1ull; in->inode.change_attr = 0; + in->inode.export_pin = MDS_RANK_NONE; memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout)); if (in->inode.is_dir()) { diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index ea83169c7e8..558ab0c5aff 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -1173,6 +1173,10 @@ public: Formatter *f, Context *fin); void repair_inode_stats(CInode *diri); void repair_dirfrag_stats(CDir *dir); + +public: + /* Because exports may fail, this set lets us keep track of inodes that need exporting.
*/ + std::set<CInode *> export_pin_queue; }; class C_MDS_RetryRequest : public MDSInternalContext { diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h index 835fce67d01..2abcd87fe06 100644 --- a/src/mds/MDSDaemon.h +++ b/src/mds/MDSDaemon.h @@ -40,7 +40,7 @@ #include "Beacon.h" -#define CEPH_MDS_PROTOCOL 28 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 29 /* cluster internal */ class MonClient; diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 0d9a5c15235..b0273ccd142 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -809,6 +809,16 @@ void Migrator::export_dir(CDir *dir, mds_rank_t dest) return; } + if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) { + dout(7) << "dir is export pinned" << dendl; + return; + } + + if (dest == mds->get_nodeid() || !mds->mdsmap->is_up(dest)) { + dout(7) << "cannot export: dest " << dest << " is me or is not active" << dendl; + return; + } + mds->hit_export_target(ceph_clock_now(), dest, -1); dir->auth_pin(this); @@ -1531,7 +1541,17 @@ uint64_t Migrator::encode_export_dir(bufferlist& exportbl, ::encode(d_type, exportbl); continue; } - + + /* XXX The inode may be pinned to me (in->get_inode().export_pin) but it is + * not a subtree by the time I've found it here. So, keeping it is + * difficult as we've already notified the importer of the subtree bounds + * (MExportDirPrep). Creating a new subtree for this pinned inode would + * probably require widespread changes and is not worth the effort since + * the importer will simply export this inode and its subtrees back to us + * (Migrator::decode_import_inode). This should be rare enough to not + * justify mucking with things here. + */ + // primary link // -- inode exportbl.append("I", 1); // inode dentry @@ -2904,7 +2924,8 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp, in->add_replica(oldauth, CInode::EXPORT_NONCE); if (in->is_replica(mds->get_nodeid())) in->remove_replica(mds->get_nodeid()); - + + in->maybe_export_pin(); } void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap, diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 9cf926bce4b..c037c144a9e 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4406,6 +4406,28 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur, pi = cur->project_inode(); pi->quota = quota; + } else if (name.find("ceph.dir.pin") == 0) { + if (!cur->is_dir() || cur->is_root()) { + respond_to_request(mdr, -EINVAL); + return; + } + + mds_rank_t rank; + try { + rank = boost::lexical_cast<mds_rank_t>(value); + if (rank < 0) rank = MDS_RANK_NONE; + } catch (boost::bad_lexical_cast const&) { + dout(10) << "bad vxattr value, unable to parse int for " << name << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + + xlocks.insert(&cur->policylock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + pi = cur->project_inode(); + cur->set_export_pin(rank); } else { dout(10) << " unknown vxattr " << name << dendl; respond_to_request(mdr, -EINVAL); diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 5a300d2fc03..71c5f11d3c2 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -244,7 +244,7 @@ void inline_data_t::decode(bufferlist::iterator &p) */ void inode_t::encode(bufferlist &bl, uint64_t features) const { - ENCODE_START(14, 6, bl); + ENCODE_START(15, 6, bl); ::encode(ino, bl); ::encode(rdev, bl); @@ -294,12 +294,14 @@ void inode_t::encode(bufferlist &bl, uint64_t features) const ::encode(btime, bl); ::encode(change_attr, bl); + ::encode(export_pin, bl); +
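+ // Note: the new member is appended at the end of the encoding and the struct version at ENCODE_START is bumped (14 -> 15), so older decoders can skip the unknown trailing bytes via the versioned ENCODE_FINISH/DECODE_FINISH framing.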
ENCODE_FINISH(bl); } void inode_t::decode(bufferlist::iterator &p) { - DECODE_START_LEGACY_COMPAT_LEN(14, 6, 6, p); + DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p); ::decode(ino, p); ::decode(rdev, p); @@ -380,6 +382,12 @@ void inode_t::decode(bufferlist::iterator &p) change_attr = 0; } + if (struct_v >= 15) { + ::decode(export_pin, p); + } else { + export_pin = MDS_RANK_NONE; + } + DECODE_FINISH(p); } @@ -416,6 +424,7 @@ void inode_t::dump(Formatter *f) const f->dump_stream("atime") << atime; f->dump_unsigned("time_warp_seq", time_warp_seq); f->dump_unsigned("change_attr", change_attr); + f->dump_int("export_pin", export_pin); f->open_array_section("client_ranges"); for (map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin(); p != client_ranges.end(); ++p) { diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index ed7067fc25b..59f53520e0a 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -74,8 +74,6 @@ typedef int32_t mds_rank_t; typedef int32_t fs_cluster_id_t; - - BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t) extern const mds_gid_t MDS_GID_NONE; constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = {-1}; @@ -508,6 +506,8 @@ struct inode_t { nest_info_t accounted_rstat; // protected by parent's nestlock quota_info_t quota; + + mds_rank_t export_pin; // special stuff version_t version; // auth only @@ -529,6 +529,7 @@ struct inode_t { truncate_seq(0), truncate_size(0), truncate_from(0), truncate_pending(0), time_warp_seq(0), change_attr(0), + export_pin(MDS_RANK_NONE), version(0), file_data_version(0), xattr_version(0), last_scrub_version(0), backtrace_version(0) { clear_layout(); -- 2.39.5
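For reference, a client can also set the pin programmatically through the standard xattr syscall interface, equivalent to the setfattr commands in the documentation above. A minimal sketch, assuming a CephFS mount at /mnt/cephfs (the path and rank are illustrative, and vxattr support varies by client and version):

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
    /* Pin the subtree rooted at "projects" to MDS rank 1; the value is
     * passed as a decimal string, exactly as with setfattr(1). */
    const char *rank = "1";
    if (setxattr("/mnt/cephfs/projects", "ceph.dir.pin", rank, strlen(rank), 0) < 0) {
        perror("setxattr");
        return 1;
    }
    return 0;
}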