From 59e01da10695c2ccff2623c09e78a9d572e64235 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 16 Mar 2022 02:08:55 -0400 Subject: [PATCH] mds: include encoded stray inode when sending dentry unlink message to replicas The series of events that lead to unaccessible dentries is as follows (requires some hardlinked files, directory pinning and path restricted caps). Assume 3 hardlinked files: d0/f0 <-- primary link d1/h1 d1/h2 with multiple active MDSs -- d0 pinned to rank-0 and d1 to rank-1. Reproducing this requires deleting a non-primary link first followed by deleting the primary link and lastly the other non-primary link. If one if unlucky, the last delete fails with "Permission denied" error. On the MDS side, this is what happens: Unlinking the first non-primary link would discover the remote inode and link it in dentry as part of lookup. Unlink would send a remote unlink operation (_link_remote(op-unlink)) to the auth mds of the remote inode. Now the other unlinks: rank0 | rank1 | x-----<-----unlink: #unlink(d0/f0) | | v | _unlink_local() | | | v | | | x ---send_dentry_unlink() | | reply_client_request() | | | | unlink: #lookup(d1/h2)----x | | | | | v | | MDCache::path_traverse() v | (uses linked remote inode, | | which is not a stray inode)-----x | | | x-------------->----------------|---------->------x v (dentry_unlink msg) | | | | v | | handle_dentry_unlink() | | (relinks inode to stray dentry) | | | | x---------------------x | | | v | dispatch_client_request() | (uses linked inode under | stray dentry, no reintegration check) | | | v | SessionMap::check_access() | (parent is a stray dir, | but stray_prior_path is empty) One possible fix could be to fix SessionMap::check_access() to build the inode path for a inode under stray directory, however, the inode path would be on the form "#" and it does not look like there is a good way to figure out what it was (as seen in inode dump). Sending the (encoded) inode as part of `dentry_unlink' message seems to get handled well -- decode_replica_inode() handles the case where the inode already existed (in MDCache::inode_map) and the auth mds sending the stray inode to its replicas doesn't seem to cause trouble. (I think it didn't do this since the replicas have the inode anyway, but in this case this inode is a _bit_ out of sync with what exists in the auth). Fixes: http://tracker.ceph.com/issues/54046 Signed-off-by: Venky Shankar --- src/mds/MDCache.cc | 15 +++++++++++---- src/mds/MDCache.h | 2 +- src/mds/Server.cc | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index c4d5901e80017..c0163c671e543 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -10940,7 +10940,7 @@ void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, C void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl) { ceph_assert(straydn->get_num_auth_pins()); - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); uint64_t features = mds->mdsmap->get_up_features(); encode_replica_inode(get_myin(), who, bl, features); encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl); @@ -10948,15 +10948,18 @@ void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& encode_replica_inode(straydn->get_dir()->inode, who, bl, features); encode_replica_dir(straydn->get_dir(), who, bl); encode_replica_dentry(straydn, who, bl); + if (!straydn->get_projected_linkage()->is_null()) { + encode_replica_inode(straydn->get_projected_linkage()->get_inode(), who, bl, features); + } ENCODE_FINISH(bl); } -void MDCache::decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from) +void MDCache::decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from) { MDSContext::vec finished; auto p = bl.cbegin(); - DECODE_START(1, p); + DECODE_START(2, p); CInode *mdsin = nullptr; decode_replica_inode(mdsin, p, NULL, finished); CDir *mdsdir = nullptr; @@ -10969,6 +10972,9 @@ void MDCache::decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_ decode_replica_dir(straydir, p, strayin, from, finished); decode_replica_dentry(straydn, p, straydir, finished); + if (struct_v >= 2 && in) { + decode_replica_inode(*in, p, straydn, finished); + } if (!finished.empty()) mds->queue_waiters(finished); DECODE_FINISH(p); @@ -11200,8 +11206,9 @@ void MDCache::handle_dentry_unlink(const cref_t &m) { // straydn CDentry *straydn = nullptr; + CInode *strayin = nullptr; if (m->straybl.length()) - decode_replica_stray(straydn, m->straybl, mds_rank_t(m->get_source().num())); + decode_replica_stray(straydn, &strayin, m->straybl, mds_rank_t(m->get_source().num())); CDir *dir = get_dirfrag(m->get_dirfrag()); if (!dir) { diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 94eedb836d8b4..445b5edcbcfd8 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -877,7 +877,7 @@ class MDCache { void decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished); void encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl); - void decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from); + void decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from); // -- namespace -- void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 5b54e8112a41a..41537a1e97200 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2702,7 +2702,7 @@ void Server::handle_peer_request(const cref_t &m) CDentry *straydn = NULL; if (m->straybl.length() > 0) { - mdcache->decode_replica_stray(straydn, m->straybl, from); + mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from); ceph_assert(straydn); m->straybl.clear(); } -- 2.39.5