From: Xiubo Li Date: Wed, 18 May 2022 04:59:38 +0000 (+0800) Subject: mds: wait unlink to finish to avoid conflict when creating same dentries X-Git-Tag: v17.2.6~78^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7005de0f6ff9bb1c154b3d28e7dfaad9abcf7030;p=ceph.git mds: wait unlink to finish to avoid conflict when creating same dentries If the previous unlink request has been delayed due to some reasons, and the new creating for the same dentry may fail or new open will succeeds but new contents wrote to it will be lost. The kernel client will make sure before the unlink getting the first reply it won't send the followed create requests for the same dentry. Here we need to make sure that before the first reply has been sent out the dentry must be marked as unlinking. Fixes: https://tracker.ceph.com/issues/55332 Signed-off-by: Xiubo Li (cherry picked from commit d4b9431dfe23a97075766fa2a2b76d7554543b0c) --- diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index dff250bcac3..efd27719283 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -29,6 +29,7 @@ #include "BatchOp.h" #include "MDSCacheObject.h" #include "MDSContext.h" +#include "Mutation.h" #include "SimpleLock.h" #include "LocalLockC.h" #include "ScrubHeader.h" @@ -86,18 +87,23 @@ public: static const int STATE_EVALUATINGSTRAY = (1<<4); static const int STATE_PURGINGPINNED = (1<<5); static const int STATE_BOTTOMLRU = (1<<6); + static const int STATE_UNLINKING = (1<<7); // stray dentry needs notification of releasing reference static const int STATE_STRAY = STATE_NOTIFYREF; static const int MASK_STATE_IMPORT_KEPT = STATE_BOTTOMLRU; // -- pins -- - static const int PIN_INODEPIN = 1; // linked inode is pinned - static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting - static const int PIN_PURGING = 3; - static const int PIN_SCRUBPARENT = 4; + static const int PIN_INODEPIN = 1; // linked inode is pinned + static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting + static const int PIN_PURGING = 3; + static const int PIN_SCRUBPARENT = 4; + static const int PIN_WAITUNLINKSTATE = 5; static const unsigned EXPORT_NONCE = 1; + const static uint64_t WAIT_UNLINK_STATE = (1<<0); + const static uint64_t WAIT_UNLINK_FINISH = (1<<1); + uint32_t replica_unlinking_ref = 0; CDentry(std::string_view n, __u32 h, mempool::mds_co::string alternate_name, @@ -136,6 +142,7 @@ public: case PIN_FRAGMENTING: return "fragmenting"; case PIN_PURGING: return "purging"; case PIN_SCRUBPARENT: return "scrubparent"; + case PIN_WAITUNLINKSTATE: return "waitunlinkstate"; default: return generic_pin_name(p); } } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 387cd9b478b..59643fd776f 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8116,6 +8116,10 @@ void MDCache::dispatch(const cref_t &m) case MSG_MDS_DENTRYUNLINK: handle_dentry_unlink(ref_cast(m)); break; + case MSG_MDS_DENTRYUNLINK_ACK: + handle_dentry_unlink_ack(ref_cast(m)); + break; + case MSG_MDS_FRAGMENTNOTIFY: handle_fragment_notify(ref_cast(m)); @@ -11034,7 +11038,8 @@ void MDCache::handle_dentry_link(const cref_t &m) // UNLINK -void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr) +void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, + MDRequestRef& mdr, bool unlinking) { dout(10) << __func__ << " " << *dn << dendl; // share unlink news with replicas @@ -11046,6 +11051,11 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& md CInode *strayin = straydn->get_linkage()->get_inode(); strayin->encode_snap_blob(snapbl); } + + if (unlinking) { + ceph_assert(!straydn); + dn->replica_unlinking_ref = 0; + } for (set::iterator it = replicas.begin(); it != replicas.end(); ++it) { @@ -11058,12 +11068,21 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& md rejoin_gather.count(*it))) continue; - auto unlink = make_message(dn->get_dir()->dirfrag(), dn->get_name()); + auto unlink = make_message(dn->get_dir()->dirfrag(), + dn->get_name(), unlinking); if (straydn) { encode_replica_stray(straydn, *it, unlink->straybl); unlink->snapbl = snapbl; } mds->send_message_mds(unlink, *it); + if (unlinking) { + dn->replica_unlinking_ref++; + dn->get(CDentry::PIN_WAITUNLINKSTATE); + } + } + + if (unlinking && dn->replica_unlinking_ref) { + dn->add_waiter(CDentry::WAIT_UNLINK_STATE, new C_MDS_RetryRequest(this, mdr)); } } @@ -11072,23 +11091,40 @@ void MDCache::handle_dentry_unlink(const cref_t &m) // straydn CDentry *straydn = nullptr; CInode *strayin = nullptr; + if (m->straybl.length()) decode_replica_stray(straydn, &strayin, m->straybl, mds_rank_t(m->get_source().num())); + boost::intrusive_ptr ack; + CDentry::linkage_t *dnl; + CDentry *dn; + CInode *in; + bool hadrealm; + CDir *dir = get_dirfrag(m->get_dirfrag()); if (!dir) { dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl; + if (m->is_unlinking()) + goto ack; } else { - CDentry *dn = dir->lookup(m->get_dn()); + dn = dir->lookup(m->get_dn()); if (!dn) { dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl; + if (m->is_unlinking()) + goto ack; } else { dout(7) << __func__ << " on " << *dn << dendl; - CDentry::linkage_t *dnl = dn->get_linkage(); + + if (m->is_unlinking()) { + dn->state_set(CDentry::STATE_UNLINKING); + goto ack; + } + + dnl = dn->get_linkage(); // open inode? if (dnl->is_primary()) { - CInode *in = dnl->get_inode(); + in = dnl->get_inode(); dn->dir->unlink_inode(dn); ceph_assert(straydn); straydn->dir->link_primary_inode(straydn, in); @@ -11099,11 +11135,12 @@ void MDCache::handle_dentry_unlink(const cref_t &m) in->first = straydn->first; // update subtree map? - if (in->is_dir()) + if (in->is_dir()) { adjust_subtree_after_rename(in, dir, false); + } if (m->snapbl.length()) { - bool hadrealm = (in->snaprealm ? true : false); + hadrealm = (in->snaprealm ? true : false); in->decode_snap_blob(m->snapbl); ceph_assert(in->snaprealm); if (!hadrealm) @@ -11114,7 +11151,7 @@ void MDCache::handle_dentry_unlink(const cref_t &m) if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS)) migrator->export_caps(in); - + straydn = NULL; } else { ceph_assert(!straydn); @@ -11122,6 +11159,7 @@ void MDCache::handle_dentry_unlink(const cref_t &m) dn->dir->unlink_inode(dn); } ceph_assert(dnl->is_null()); + dn->state_clear(CDentry::STATE_UNLINKING); } } @@ -11133,8 +11171,36 @@ void MDCache::handle_dentry_unlink(const cref_t &m) trim_dentry(straydn, ex); send_expire_messages(ex); } + return; + +ack: + ack = make_message(m->get_dirfrag(), m->get_dn()); + mds->send_message(ack, m->get_connection()); } +void MDCache::handle_dentry_unlink_ack(const cref_t &m) +{ + CDir *dir = get_dirfrag(m->get_dirfrag()); + if (!dir) { + dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl; + } else { + CDentry *dn = dir->lookup(m->get_dn()); + if (!dn) { + dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl; + } else { + dout(7) << __func__ << " on " << *dn << " ref " + << dn->replica_unlinking_ref << " -> " + << dn->replica_unlinking_ref - 1 << dendl; + dn->replica_unlinking_ref--; + if (!dn->replica_unlinking_ref) { + MDSContext::vec finished; + dn->take_waiting(CDentry::WAIT_UNLINK_STATE, finished); + mds->queue_waiters(finished); + } + dn->put(CDentry::PIN_WAITUNLINKSTATE); + } + } +} diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index f5b4e2a0612..66eae6c2933 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -879,7 +879,7 @@ class MDCache { void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl); void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p); void send_dentry_link(CDentry *dn, MDRequestRef& mdr); - void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr); + void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr, bool unlinking=false); void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) { uncommitted_fragments.at(dirfrag).waiters.push_back(c); @@ -1115,6 +1115,7 @@ class MDCache { void handle_discover_reply(const cref_t &m); void handle_dentry_link(const cref_t &m); void handle_dentry_unlink(const cref_t &m); + void handle_dentry_unlink_ack(const cref_t &m); int dump_cache(std::string_view fn, Formatter *f, double timeout); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 6d04da9cbf2..3e3e8f371c6 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1936,20 +1936,23 @@ void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEv mdr->pin(dn); early_reply(mdr, in, dn); - + mdr->committing = true; submit_mdlog_entry(le, fin, mdr, __func__); - + if (mdr->client_request && mdr->client_request->is_queued_for_replay()) { if (mds->queue_one_replay()) { dout(10) << " queued next replay op" << dendl; } else { dout(10) << " journaled last replay op" << dendl; } - } else if (mdr->did_early_reply) + } else if (mdr->did_early_reply) { mds->locker->drop_rdlocks_for_early_reply(mdr.get()); - else + if (dn && dn->is_waiter_for(CDentry::WAIT_UNLINK_FINISH)) + mdlog->flush(); + } else { mdlog->flush(); + } } void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr, @@ -4448,6 +4451,11 @@ void Server::handle_client_openc(MDRequestRef& mdr) if (!dn) return; + if (is_unlink_pending(dn)) { + wait_for_pending_unlink(dn, mdr); + return; + } + CDentry::linkage_t *dnl = dn->get_projected_linkage(); if (!excl && !dnl->is_null()) { // it existed. @@ -6618,6 +6626,44 @@ void Server::handle_client_getvxattr(MDRequestRef& mdr) // ------------------------------------------------ +struct C_WaitUnlinkToFinish : public MDSContext { +protected: + MDCache *mdcache; + CDentry *dn; + MDSContext *fin; + + MDSRank *get_mds() override + { + ceph_assert(mdcache != NULL); + return mdcache->mds; + } + +public: + C_WaitUnlinkToFinish(MDCache *m, CDentry *d, MDSContext *f) : + mdcache(m), dn(d), fin(f) {} + void finish(int r) override { + fin->complete(r); + dn->put(CDentry::PIN_PURGING); + } +}; + +bool Server::is_unlink_pending(CDentry *dn) +{ + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (!dnl->is_null() && dn->state_test(CDentry::STATE_UNLINKING)) { + return true; + } + return false; +} + +void Server::wait_for_pending_unlink(CDentry *dn, MDRequestRef& mdr) +{ + mds->locker->drop_locks(mdr.get()); + auto fin = new C_MDS_RetryRequest(mdcache, mdr); + dn->get(CDentry::PIN_PURGING); + dn->add_waiter(CDentry::WAIT_UNLINK_FINISH, new C_WaitUnlinkToFinish(mdcache, dn, fin)); +} + // MKNOD class C_MDS_mknod_finish : public ServerLogContext { @@ -6681,6 +6727,11 @@ void Server::handle_client_mknod(MDRequestRef& mdr) if (!dn) return; + if (is_unlink_pending(dn)) { + wait_for_pending_unlink(dn, mdr); + return; + } + CDir *dir = dn->get_dir(); CInode *diri = dir->get_inode(); if (!check_access(mdr, diri, MAY_WRITE)) @@ -6779,6 +6830,11 @@ void Server::handle_client_mkdir(MDRequestRef& mdr) if (!dn) return; + if (is_unlink_pending(dn)) { + wait_for_pending_unlink(dn, mdr); + return; + } + CDir *dir = dn->get_dir(); CInode *diri = dir->get_inode(); @@ -6874,6 +6930,11 @@ void Server::handle_client_symlink(MDRequestRef& mdr) if (!dn) return; + if (is_unlink_pending(dn)) { + wait_for_pending_unlink(dn, mdr); + return; + } + CDir *dir = dn->get_dir(); CInode *diri = dir->get_inode(); @@ -6980,6 +7041,11 @@ void Server::handle_client_link(MDRequestRef& mdr) targeti = ret.second->get_projected_linkage()->get_inode(); } + if (is_unlink_pending(destdn)) { + wait_for_pending_unlink(destdn, mdr); + return; + } + ceph_assert(destdn->get_projected_linkage()->is_null()); if (req->get_alternate_name().size() > alternate_name_max) { dout(10) << " alternate_name longer than " << alternate_name_max << dendl; @@ -7264,11 +7330,17 @@ void Server::_link_remote_finish(MDRequestRef& mdr, bool inc, mdr->apply(); MDRequestRef null_ref; - if (inc) + if (inc) { mdcache->send_dentry_link(dn, null_ref); - else + } else { + dn->state_clear(CDentry::STATE_UNLINKING); mdcache->send_dentry_unlink(dn, NULL, null_ref); - + + MDSContext::vec finished; + dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished); + mdcache->mds->queue_waiters(finished); + } + // bump target popularity mds->balancer->hit_inode(targeti, META_POP_IWR); mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR); @@ -7642,10 +7714,20 @@ void Server::handle_client_unlink(MDRequestRef& mdr) if (rmdir) mdr->disable_lock_cache(); + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true); if (!dn) return; + // notify replica MDSes the dentry is under unlink + if (!dn->state_test(CDentry::STATE_UNLINKING)) { + dn->state_set(CDentry::STATE_UNLINKING); + mdcache->send_dentry_unlink(dn, nullptr, mdr, true); + if (dn->replica_unlinking_ref) { + return; + } + } + CDentry::linkage_t *dnl = dn->get_linkage(client, mdr); ceph_assert(!dnl->is_null()); CInode *in = dnl->get_inode(); @@ -7920,9 +8002,14 @@ void Server::_unlink_local_finish(MDRequestRef& mdr, } mdr->apply(); - + + dn->state_clear(CDentry::STATE_UNLINKING); mdcache->send_dentry_unlink(dn, straydn, mdr); - + + MDSContext::vec finished; + dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished); + mdcache->mds->queue_waiters(finished); + if (straydn) { // update subtree map? if (strayin->is_dir()) @@ -7937,7 +8024,7 @@ void Server::_unlink_local_finish(MDRequestRef& mdr, // reply respond_to_request(mdr, 0); - + // removing a new dn? dn->get_dir()->try_remove_unlinked_dn(dn); @@ -8396,6 +8483,16 @@ void Server::handle_client_rename(MDRequestRef& mdr) if (!destdn) return; + if (is_unlink_pending(destdn)) { + wait_for_pending_unlink(destdn, mdr); + return; + } + + if (is_unlink_pending(srcdn)) { + wait_for_pending_unlink(srcdn, mdr); + return; + } + dout(10) << " destdn " << *destdn << dendl; CDir *destdir = destdn->get_dir(); ceph_assert(destdir->is_auth()); diff --git a/src/mds/Server.h b/src/mds/Server.h index 33054bd067e..4217252ed7b 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -234,6 +234,9 @@ public: void handle_client_fsync(MDRequestRef& mdr); + bool is_unlink_pending(CDentry *dn); + void wait_for_pending_unlink(CDentry *dn, MDRequestRef& mdr); + // open void handle_client_open(MDRequestRef& mdr); void handle_client_openc(MDRequestRef& mdr); // O_CREAT variant. diff --git a/src/messages/MDentryUnlink.h b/src/messages/MDentryUnlink.h index 210fa033c52..fc522844166 100644 --- a/src/messages/MDentryUnlink.h +++ b/src/messages/MDentryUnlink.h @@ -22,15 +22,17 @@ class MDentryUnlink final : public MMDSOp { private: - static constexpr int HEAD_VERSION = 1; + static constexpr int HEAD_VERSION = 2; static constexpr int COMPAT_VERSION = 1; - + dirfrag_t dirfrag; std::string dn; + bool unlinking = false; public: dirfrag_t get_dirfrag() const { return dirfrag; } const std::string& get_dn() const { return dn; } + bool is_unlinking() const { return unlinking; } ceph::buffer::list straybl; ceph::buffer::list snapbl; @@ -38,10 +40,9 @@ private: protected: MDentryUnlink() : MMDSOp(MSG_MDS_DENTRYUNLINK, HEAD_VERSION, COMPAT_VERSION) { } - MDentryUnlink(dirfrag_t df, std::string_view n) : + MDentryUnlink(dirfrag_t df, std::string_view n, bool u=false) : MMDSOp(MSG_MDS_DENTRYUNLINK, HEAD_VERSION, COMPAT_VERSION), - dirfrag(df), - dn(n) {} + dirfrag(df), dn(n), unlinking(u) {} ~MDentryUnlink() final {} public: @@ -49,19 +50,66 @@ public: void print(std::ostream& o) const override { o << "dentry_unlink(" << dirfrag << " " << dn << ")"; } - + void decode_payload() override { using ceph::decode; auto p = payload.cbegin(); decode(dirfrag, p); decode(dn, p); decode(straybl, p); + if (header.version >= 2) + decode(unlinking, p); } void encode_payload(uint64_t features) override { using ceph::encode; encode(dirfrag, payload); encode(dn, payload); encode(straybl, payload); + encode(unlinking, payload); + } +private: + template + friend boost::intrusive_ptr ceph::make_message(Args&&... args); + template + friend MURef crimson::make_message(Args&&... args); +}; + +class MDentryUnlinkAck final : public MMDSOp { +private: + static constexpr int HEAD_VERSION = 1; + static constexpr int COMPAT_VERSION = 1; + + dirfrag_t dirfrag; + std::string dn; + + public: + dirfrag_t get_dirfrag() const { return dirfrag; } + const std::string& get_dn() const { return dn; } + +protected: + MDentryUnlinkAck() : + MMDSOp(MSG_MDS_DENTRYUNLINK_ACK, HEAD_VERSION, COMPAT_VERSION) { } + MDentryUnlinkAck(dirfrag_t df, std::string_view n) : + MMDSOp(MSG_MDS_DENTRYUNLINK_ACK, HEAD_VERSION, COMPAT_VERSION), + dirfrag(df), dn(n) {} + ~MDentryUnlinkAck() final {} + +public: + std::string_view get_type_name() const override { return "dentry_unlink_ack";} + void print(std::ostream& o) const override { + o << "dentry_unlink_ack(" << dirfrag << " " << dn << ")"; + } + + void decode_payload() override { + using ceph::decode; + auto p = payload.cbegin(); + decode(dirfrag, p); + decode(dn, p); + } + void encode_payload(uint64_t features) override { + using ceph::encode; + encode(dirfrag, payload); + encode(dn, payload); } private: template diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 266eb767678..3f4405a5091 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -822,6 +822,9 @@ Message *decode_message(CephContext *cct, break; + case MSG_MDS_DENTRYUNLINK_ACK: + m = make_message(); + break; case MSG_MDS_DENTRYUNLINK: m = make_message(); break; diff --git a/src/msg/Message.h b/src/msg/Message.h index a7aff1e27a6..49ace3850c2 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -173,6 +173,7 @@ #define MSG_MDS_OPENINOREPLY 0x210 #define MSG_MDS_SNAPUPDATE 0x211 #define MSG_MDS_FRAGMENTNOTIFYACK 0x212 +#define MSG_MDS_DENTRYUNLINK_ACK 0x213 #define MSG_MDS_LOCK 0x300 // 0x3xx are for locker of mds #define MSG_MDS_INODEFILECAPS 0x301