From: Patrick Donnelly Date: Tue, 2 May 2017 04:28:15 +0000 (-0400) Subject: mds: use aux subtrees for export pinned inodes X-Git-Tag: v12.0.3~38^2~16 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6bd58fefb7ca76e4e313a2309cd70833d75f657a;p=ceph.git mds: use aux subtrees for export pinned inodes Idea here is that a pinned inode should not be exported when its parent is. Setting the pinned inode's dirfrags to aux subtrees prevents them from being merged with a parent subtree. Signed-off-by: Patrick Donnelly --- diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py index 21e2168262e0..e07964046f6f 100644 --- a/qa/tasks/cephfs/test_exports.py +++ b/qa/tasks/cephfs/test_exports.py @@ -57,7 +57,7 @@ class TestExports(CephFSTestCase): subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name']) log.info(subtrees) subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) - self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1) + self.assertTrue(len(subtrees) == 2 and subtrees[0]['auth_first'] == 1 and subtrees[1]['auth_first'] == 1) # change pin /1/2 to rank 0 self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "0", "1/2"]) @@ -85,7 +85,7 @@ class TestExports(CephFSTestCase): subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name']) log.info(subtrees) subtrees = filter(lambda s: s['dir']['path'].startswith('/1'), subtrees) - self.assertTrue(len(subtrees) == 1 and subtrees[0]['auth_first'] == 1) + self.assertTrue(len(subtrees) == 2 and subtrees[0]['auth_first'] == 1 and subtrees[1]['auth_first'] == 1) # add another directory pinned to 1 self.mount_a.run_shell(["mkdir", "-p", "1/4/5"]) @@ -97,7 +97,7 @@ class TestExports(CephFSTestCase): subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 0)['name']) log.info(subtrees) subtrees = filter(lambda s: s['dir']['path'] == '/1', subtrees) - self.assertTrue(len(subtrees) == 0) # /1 is merged into root + self.assertTrue(len(subtrees) == 1) subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, 1)['name']) log.info(subtrees) subtrees = filter(lambda s: s['dir']['path'].startswith('/1/'), subtrees) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 6d067985bfba..bbe8b7f99676 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -4391,35 +4391,87 @@ int64_t CInode::get_backtrace_pool() const } } +class C_CInode_AuxSubtree : public MDSInternalContext { +public: + explicit C_CInode_AuxSubtree(CInode *in) : MDSInternalContext(in->mdcache->mds), in(in) { + in->get(MDSCacheObject::PIN_PTRWAITER); + } + ~C_CInode_AuxSubtree() { + in->put(MDSCacheObject::PIN_PTRWAITER); + } + + void finish(int r) override { + in->maybe_export_pin(); + } +private: + CInode *in; +}; + void CInode::maybe_export_pin() { - if (g_conf->mds_bal_export_pin && is_dir() && !mdcache->export_pin_queue.count(this)) { + if (g_conf->mds_bal_export_pin && is_dir()) { dout(20) << "maybe_export_pin " << *this << dendl; mds_rank_t pin = get_projected_inode()->export_pin; if (pin == MDS_RANK_NONE) { - /* Try to find farthest full auth parent fragment which is pinned - * elsewhere. There cannot be a break in the authority chain of - * directories, otherwise this inode itself will not be exported. - */ - CInode *auth_last = this; /* N.B. we may not be full auth for any fragments of this inode, but adding it to the queue is harmless. */ - bool auth_barrier = false; - for (CDir *cd = get_projected_parent_dir(); cd && !cd->inode->is_base() && !cd->inode->is_system(); cd = cd->inode->get_projected_parent_dir()) { - if (cd->is_full_dir_auth() && !auth_barrier) { - auth_last = cd->inode; - } else { - auth_barrier = true; - } - pin = cd->inode->get_projected_inode()->export_pin; - if (pin != MDS_RANK_NONE) { - if (pin != mdcache->mds->get_nodeid()) { - dout(20) << "adding ancestor to export_pin_queue " << *auth_last << dendl; - mdcache->export_pin_queue.insert(auth_last); + if (!mdcache->export_pin_queue.count(this)) { + /* Try to find farthest full auth parent fragment which is pinned + * elsewhere. There cannot be a break in the authority chain of + * directories, otherwise this inode itself will not be exported. + */ + CInode *auth_last = this; /* N.B. we may not be full auth for any fragments of this inode, but adding it to the queue is harmless. */ + bool auth_barrier = false; + for (CDir *cd = get_projected_parent_dir(); cd && !cd->inode->is_base() && !cd->inode->is_system(); cd = cd->inode->get_projected_parent_dir()) { + if (cd->is_full_dir_auth() && !auth_barrier) { + auth_last = cd->inode; + } else { + auth_barrier = true; + } + pin = cd->inode->get_projected_inode()->export_pin; + if (pin != MDS_RANK_NONE) { + if (pin != mdcache->mds->get_nodeid()) { + dout(20) << "adding ancestor to export_pin_queue " << *auth_last << dendl; + mdcache->export_pin_queue.insert(auth_last); + } + break; } - break; } } } else { - if (pin != mdcache->mds->get_nodeid()) { + if (pin == mdcache->mds->get_nodeid()) { + for (auto it = dirfrags.begin(); it != dirfrags.end(); it++) { + CDir *cd = it->second; + if (cd->state_test(CDir::STATE_AUXSUBTREE)) continue; + dout(15) << "aux subtree pinning " << *cd << dendl; + CDir *subtree = mdcache->get_subtree_root(cd); + if (!subtree) continue; + if (subtree->is_ambiguous_auth()) { + subtree->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_CInode_AuxSubtree(this)); + dout(15) << "delaying pinning for single auth on subtree " << *subtree << dendl; + } else if (subtree->is_auth()) { + assert(cd->is_auth()); + if (subtree->is_frozen() || subtree->is_freezing()) { + subtree->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_CInode_AuxSubtree(this)); + dout(15) << "delaying pinning for thaw on subtree " << *subtree << dendl; + } else { + cd->state_set(CDir::STATE_AUXSUBTREE); + mdcache->adjust_subtree_auth(cd, mdcache->mds->get_nodeid()); + dout(15) << "set aux subtree " << *cd << dendl; + } + } else { + assert(!cd->is_auth()); + dout(15) << "not auth for fragment so not setting aux subtree for " << *cd << dendl; + } + } + } else { + for (auto it = dirfrags.begin(); it != dirfrags.end(); it++) { + CDir *cd = it->second; + if (cd->is_auth() && cd->state_test(CDir::STATE_AUXSUBTREE)) { + assert(!(cd->is_frozen() || cd->is_freezing())); + assert(!cd->state_test(CDir::STATE_EXPORTBOUND)); + cd->state_clear(CDir::STATE_AUXSUBTREE); /* merge will happen eventually */ + dout(15) << "cleared aux subtree " << *cd << dendl; + } + } dout(20) << "adding to export_pin_queue " << *this << dendl; mdcache->export_pin_queue.insert(this); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 9ca709652f9b..b05d843a87bc 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -648,6 +648,7 @@ public: friend class StrayManager; friend class CDir; friend class CInodeExport; + friend class C_CInode_AuxSubtree; // --------------------------- CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP) : diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 527f6c67baa6..1122022cb92e 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1568,16 +1568,6 @@ uint64_t Migrator::encode_export_dir(bufferlist& exportbl, continue; } - /* XXX The inode may be pinned to me (in->get_inode().export_pin) but it is - * not a subtree by the time I've found it here. So, keeping it is - * difficult as we've already notified the importer of the subtree bounds - * (MExportDirPrep). Creating a new subtree for this pinned inode would - * probably require widespread changes and is not worth the effort since - * the importer will simply export this inode and its subtrees back to us - * (Migrator::decode_import_inode). This should be rare enough to not - * justify mucking with things here. - */ - // primary link // -- inode exportbl.append("I", 1); // inode dentry