From dbe3dd2dc78ef69e6fe4fed2382b60a96d3cf6b1 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 15 Jul 2022 16:39:00 -0400 Subject: [PATCH] mds: ensure next replay is queued on req drop Not all client replay requests are queued at once since [1]. We require the next request by queued when completed (unsafely) or during cleanup. Not all code paths seem to handle this [2] so move it to a generic location, MDCache::request_cleanup. Even so, this doesn't handle all errors (so we must still be careful) as sometimes we must queue the next replay request before an MDRequest is constructed [3] during some error conditions. Additionally, preserve the behavior of Server::journal_and_reply queueing the next replay op. Otherwise, must wait for the request to be durable before moving onto the next one, unnecessarily. For reproducing, two specific cases are highlighted (thanks to @Mer1997 on Github for locating these): - The request is killed by a session close / eviction while a replayed request is queued and waiting for a journal flush (e.g. dirty inest locks). - The request construction fails because the request is already in the active_requests. This could happen theoretically if a client resends the same request (same reqid) twice. The first case is most probable but very difficult to reproduce for testing purposes. The replayed op would need to wait on a journal flush (to be restarted by C_MDS_RetryRequest). Then, the request would need killed by a session close. [1] ed6a18d90fdd1dc869369fb92c2aad43bc5c9a34 [2] https://github.com/ceph/ceph/blob/a6f1a1c6c09d74f5918c715b05789f34f2ea0e90/src/mds/Server.cc#L2253-L2262 [3] https://github.com/ceph/ceph/blob/a6f1a1c6c09d74f5918c715b05789f34f2ea0e90/src/mds/Server.cc#L2380 Fixes: https://tracker.ceph.com/issues/56577 Signed-off-by: Patrick Donnelly (cherry picked from commit 078ecaa42b98f9858d2e3a045aedb51153b39e34) Conflicts: src/mds/Server.cc: minor code diff conflict --- src/mds/MDCache.cc | 6 ++++++ src/mds/MDSRank.cc | 2 ++ src/mds/Mutation.h | 7 +++++++ src/mds/Server.cc | 44 ++++++++++++++++++++++++++++---------------- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 2ea13155ed4..90f3dfca992 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -9897,6 +9897,12 @@ void MDCache::request_cleanup(MDRequestRef& mdr) // remove from map active_requests.erase(mdr->reqid); + // queue next replay op? + if (mdr->is_queued_for_replay() && !mdr->get_queued_next_replay_op()) { + mdr->set_queued_next_replay_op(); + mds->queue_one_replay(); + } + if (mds->logger) log_stat(); diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 9a80534a4d5..92de09ca1ca 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -2060,6 +2060,7 @@ bool MDSRank::queue_one_replay() if (!replay_queue.empty()) { queue_waiter(replay_queue.front()); replay_queue.pop_front(); + dout(10) << " queued next replay op" << dendl; return true; } if (!replaying_requests_done) { @@ -2067,6 +2068,7 @@ bool MDSRank::queue_one_replay() mdlog->flush(); } maybe_clientreplay_done(); + dout(10) << " journaled last replay op" << dendl; return false; } diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index b963dee0842..bc83f219151 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -387,6 +387,12 @@ struct MDRequestImpl : public MutationImpl { void set_filepath(const filepath& fp); void set_filepath2(const filepath& fp); bool is_queued_for_replay() const; + bool get_queued_next_replay_op() const { + return queued_next_replay_op; + } + void set_queued_next_replay_op() { + queued_next_replay_op = true; + } int compare_paths(); bool can_batch(); @@ -460,6 +466,7 @@ protected: } void _dump(ceph::Formatter *f, bool has_mds_lock) const; void _dump_op_descriptor(std::ostream& stream) const override; + bool queued_next_replay_op = false; }; struct MDPeerUpdate { diff --git a/src/mds/Server.cc b/src/mds/Server.cc index ced4ecffae1..d77ad6f3788 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -305,6 +305,7 @@ void Server::dispatch(const cref_t &m) return; } bool queue_replay = false; + dout(5) << "dispatch request in up:reconnect: " << *req << dendl; if (req->is_replay() || req->is_async()) { dout(3) << "queuing replayed op" << dendl; queue_replay = true; @@ -323,10 +324,13 @@ void Server::dispatch(const cref_t &m) // process completed request in clientreplay stage. The completed request // might have created new file/directorie. This guarantees MDS sends a reply // to client before other request modifies the new file/directorie. - if (session->have_completed_request(req->get_reqid().tid, NULL)) { - dout(3) << "queuing completed op" << dendl; + bool r = session->have_completed_request(req->get_reqid().tid, NULL); + if (r) { + dout(3) << __func__ << ": queuing completed op" << dendl; queue_replay = true; - } + } else { + dout(20) << __func__ << ": request not complete" << dendl; + } // this request was created before the cap reconnect message, drop any embedded // cap releases. req->releases.clear(); @@ -1984,12 +1988,15 @@ void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEv mdr->committing = true; submit_mdlog_entry(le, fin, mdr, __func__); - if (mdr->client_request && mdr->client_request->is_queued_for_replay()) { - if (mds->queue_one_replay()) { - dout(10) << " queued next replay op" << dendl; - } else { - dout(10) << " journaled last replay op" << dendl; - } + if (mdr->is_queued_for_replay()) { + + /* We want to queue the next replay op while waiting for the journaling, so + * do it now when the early (unsafe) replay is dispatched. Don't wait until + * this request is cleaned up in MDCache.cc. + */ + + mdr->set_queued_next_replay_op(); + mds->queue_one_replay(); } else if (mdr->did_early_reply) mds->locker->drop_rdlocks_for_early_reply(mdr.get()); else @@ -2293,15 +2300,16 @@ void Server::reply_client_request(MDRequestRef& mdr, const ref_t & mds->send_message_client(reply, session); } - if (req->is_queued_for_replay() && - (mdr->has_completed || reply->get_result() < 0)) { - if (reply->get_result() < 0) { - int r = reply->get_result(); + if (client_inst.name.is_mds() && reply->get_op() == CEPH_MDS_OP_RENAME) { + mds->send_message(reply, mdr->client_request->get_connection()); + } + + if (req->is_queued_for_replay()) { + if (int r = reply->get_result(); r < 0) { derr << "reply_client_request: failed to replay " << *req - << " error " << r << " (" << cpp_strerror(r) << ")" << dendl; + << " error " << r << " (" << cpp_strerror(r) << ")" << dendl; mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r; } - mds->queue_one_replay(); } // clean up request @@ -2500,8 +2508,12 @@ void Server::handle_client_request(const cref_t &req) // register + dispatch MDRequestRef mdr = mdcache->request_start(req); - if (!mdr.get()) + if (!mdr.get()) { + dout(5) << __func__ << ": possibly duplicate op " << *req << dendl; + if (req->is_queued_for_replay()) + mds->queue_one_replay(); return; + } if (session) { mdr->session = session; -- 2.39.5