From 79abdcb31af04f7ec48dcf4cf2f058b2e93c9c80 Mon Sep 17 00:00:00 2001 From: xuxuehan Date: Thu, 27 Sep 2018 16:42:43 +0800 Subject: [PATCH] mds: answering all pending getattr/lookups targeting the same inode in one go As for now, all getattr/lookup requests get processes one by one, which is kind of wasting CPU resources. Actually, for those getattr/lookup requests for the same inode, only one of them needs to be processes, the results applies all others. Resolves: http://tracker.ceph.com/issues/36608 Signed-off-by: Xuehan Xu --- src/mds/CDentry.h | 4 +- src/mds/CInode.h | 22 +++++++ src/mds/MDCache.cc | 39 ++++++++++++- src/mds/Migrator.cc | 2 +- src/mds/Mutation.cc | 10 +++- src/mds/Mutation.h | 5 ++ src/mds/Server.cc | 137 +++++++++++++++++++++++++++++++++++++++++++- src/mds/Server.h | 2 + 8 files changed, 214 insertions(+), 7 deletions(-) diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 56aa58c561a..f8e408ef468 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -41,8 +41,7 @@ class CDentry; class LogSegment; class Session; - - +class BatchOp; // define an ordering bool operator<(const CDentry& l, const CDentry& r); @@ -354,6 +353,7 @@ public: LocalLock versionlock; // FIXME referenced containers not in mempool mempool::mds_co::map client_lease_map; + std::map batch_ops; protected: diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 4fba93d466b..e8c4b8a1542 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -58,6 +58,26 @@ struct cinode_lock_info_t { int wr_caps; }; +class BatchOp { +public: + virtual void add_request(const MDRequestRef& mdr) = 0; + virtual void set_request(const MDRequestRef& mdr) = 0; + virtual void forward_all(mds_rank_t target) = 0; + virtual void respond_all(int r) = 0; + + void finish(int r) { + respond_all(r); + delete this; + } + + void forward_requests(mds_rank_t target) { + forward_all(target); + delete this; + } + + virtual ~BatchOp() {} +}; + /** * Base class for CInode, containing the backing store data and * serialization methods. This exists so that we can read and @@ -348,6 +368,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter batch_ops; + std::string_view pin_name(int p) const override; ostream& print_db_line_prefix(ostream& out) override; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index de4bef951b1..e703a1eeb6f 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -9312,7 +9312,44 @@ void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port) if (mdr->client_request && mdr->client_request->get_source().is_client()) { dout(7) << "request_forward " << *mdr << " to mds." << who << " req " << *mdr->client_request << dendl; - mds->forward_message_mds(mdr->release_client_request(), who); + if (mdr->is_batch_head) { + BatchOp* bop = nullptr; + int mask = mdr->client_request->head.args.getattr.mask; + + switch (mdr->client_request->get_op()) { + case CEPH_MDS_OP_GETATTR: + { + CInode* in = mdr->in[0]; + if (in) { + auto it = in->batch_ops.find(mask); + if (it != in->batch_ops.end()) { + bop = it->second; + } + } + break; + } + case CEPH_MDS_OP_LOOKUP: + { + CDentry* dn = mdr->dn[0].back(); + if (dn) { + auto it = dn->batch_ops.find(mask); + if (it != dn->batch_ops.end()) { + bop = it->second; + } + } + break; + } + default: + ceph_abort(); + } + if (bop) { + dout(20) << __func__ << ": forward other batch ops(GETATTR/LOOKUP) to " + << who << ":" << port <<", too. " << *mdr << dendl; + bop->forward_requests(who); + } + } else { + mds->forward_message_mds(mdr->release_client_request(), who); + } if (mds->logger) mds->logger->inc(l_mds_forward); } else if (mdr->internal_op >= 0) { dout(10) << "request_forward on internal op; cancelling" << dendl; diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 0551f37b134..ace31a30946 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1776,7 +1776,7 @@ uint64_t Migrator::encode_export_dir(bufferlist& exportbl, exportbl.append("I", 1); // inode dentry encode_export_inode(in, exportbl, exported_client_map, exported_client_metadata_map); // encode, and (update state for) export - + // directory? auto&& dfs = in->get_dirfrags(); for (const auto& t : dfs) { diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc index 1cfaeadab6a..ec26e4df1cd 100644 --- a/src/mds/Mutation.cc +++ b/src/mds/Mutation.cc @@ -357,6 +357,14 @@ bool MDRequestImpl::is_queued_for_replay() const return client_request ? client_request->is_queued_for_replay() : false; } +bool MDRequestImpl::is_batch_op() +{ + return (client_request->get_op() == CEPH_MDS_OP_LOOKUP && + client_request->get_filepath().depth() == 1) || + (client_request->get_op() == CEPH_MDS_OP_GETATTR && + client_request->get_filepath().depth() == 0); +} + cref_t MDRequestImpl::release_client_request() { msg_lock.lock(); @@ -378,7 +386,7 @@ void MDRequestImpl::reset_slave_request(const cref_t& req) void MDRequestImpl::print(ostream &out) const { - out << "request(" << reqid; + out << "request(" << reqid << " nref=" << nref; //if (request) out << " " << *request; if (is_slave()) out << " slave_to mds." << slave_to_mds; if (client_request) out << " cr=" << client_request; diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index 076be8b0b5f..b1b22470b83 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -28,6 +28,7 @@ #include "common/TrackedOp.h" #include "messages/MClientRequest.h" #include "messages/MMDSSlaveRequest.h" +#include "messages/MClientReply.h" class LogSegment; class Capability; @@ -281,6 +282,8 @@ struct MDRequestImpl : public MutationImpl { // indicates how may retries of request have been made int retry; + bool is_batch_head = false; + // indicator for vxattr osdmap update bool waited_for_osdmap; @@ -398,6 +401,7 @@ struct MDRequestImpl : public MutationImpl { void set_filepath(const filepath& fp); void set_filepath2(const filepath& fp); bool is_queued_for_replay() const; + bool is_batch_op(); void print(ostream &out) const override; void dump(Formatter *f) const override; @@ -407,6 +411,7 @@ struct MDRequestImpl : public MutationImpl { // TrackedOp stuff typedef boost::intrusive_ptr Ref; + std::vector batch_reqs; protected: void _dump(Formatter *f) const override; void _dump_op_descriptor_unlocked(ostream& stream) const override; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index ebaa1fa6aa0..df03426f1d0 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -77,6 +77,43 @@ class ServerContext : public MDSContext { } }; +class Batch_Getattr_Lookup : public BatchOp { +protected: + Server* server; + MDRequestRef mdr; + MDCache* mdcache; + int res = 0; +public: + Batch_Getattr_Lookup(Server* s, MDRequestRef& r, MDCache* mdc) : server(s), mdr(r), mdcache(mdc) {} + void add_request(const MDRequestRef& m) override { + mdr->batch_reqs.push_back(m); + } + void set_request(const MDRequestRef& m) override { + mdr = m; + } + void forward_all(mds_rank_t t) override { + mdcache->mds->forward_message_mds(mdr->release_client_request(), t); + mdr->set_mds_stamp(ceph_clock_now()); + for (auto m : mdr->batch_reqs) { + if (!m->killed) + mdcache->request_forward(m, t); + } + mdr->batch_reqs.clear(); + } + void respond_all(int r) { + mdr->set_mds_stamp(ceph_clock_now()); + for (auto m : mdr->batch_reqs) { + if (!m->killed) { + m->tracei = mdr->tracei; + m->tracedn = mdr->tracedn; + server->respond_to_request(m, r); + } + } + mdr->batch_reqs.clear(); + server->reply_client_request(mdr, make_message(*mdr->client_request, r)); + } +}; + class ServerLogContext : public MDSLogContextBase { protected: Server *server; @@ -1750,7 +1787,28 @@ void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestR void Server::respond_to_request(MDRequestRef& mdr, int r) { if (mdr->client_request) { - reply_client_request(mdr, make_message(*mdr->client_request, r)); + if (mdr->is_batch_op() && mdr->is_batch_head) { + int mask = mdr->client_request->head.args.getattr.mask; + + BatchOp *fin; + if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) { + dout(20) << __func__ << ": respond other getattr ops. " << *mdr << dendl; + fin = mdr->in[0]->batch_ops[mask]; + } else { + dout(20) << __func__ << ": respond other lookup ops. " << *mdr << dendl; + fin = mdr->dn[0].back()->batch_ops[mask]; + } + + fin->finish(r); + + if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) { + mdr->in[0]->batch_ops.erase(mask); + } else { + mdr->dn[0].back()->batch_ops.erase(mask); + } + } else { + reply_client_request(mdr, make_message(*mdr->client_request, r)); + } } else if (mdr->internal_op > -1) { dout(10) << "respond_to_request on internal request " << mdr << dendl; if (!mdr->internal_op_finish) @@ -2267,6 +2325,22 @@ void Server::handle_osd_map() }); } +void Server::clear_batch_ops(const MDRequestRef& mdr) +{ + int mask = mdr->client_request->head.args.getattr.mask; + if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR && mdr->in[0]) { + auto it = mdr->in[0]->batch_ops.find(mask); + auto bocp = it->second; + mdr->in[0]->batch_ops.erase(it); + delete bocp; + } else if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP && mdr->dn[0].size()) { + auto it = mdr->dn[0].back()->batch_ops.find(mask); + auto bocp = it->second; + mdr->dn[0].back()->batch_ops.erase(it); + delete bocp; + } +} + void Server::dispatch_client_request(MDRequestRef& mdr) { // we shouldn't be waiting on anyone. @@ -2274,7 +2348,44 @@ void Server::dispatch_client_request(MDRequestRef& mdr) if (mdr->killed) { dout(10) << "request " << *mdr << " was killed" << dendl; - return; + //if the mdr is a "batch_op" and it has followers, pick a follower as + //the new "head of the batch ops" and go on processing the new one. + if (mdr->is_batch_op() && mdr->is_batch_head ) { + if (!mdr->batch_reqs.empty()) { + MDRequestRef new_batch_head; + for (auto itr = mdr->batch_reqs.cbegin(); itr != mdr->batch_reqs.cend();) { + auto req = *itr; + itr = mdr->batch_reqs.erase(itr); + if (!req->killed) { + new_batch_head = req; + break; + } + } + + if (!new_batch_head) { + clear_batch_ops(mdr); + return; + } + + new_batch_head->batch_reqs = std::move(mdr->batch_reqs); + + mdr = new_batch_head; + mdr->is_batch_head = true; + int mask = mdr->client_request->head.args.getattr.mask; + if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) { + auto fin = mdr->in[0]->batch_ops[mask]; + fin->set_request(new_batch_head); + } else if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP) { + auto fin = mdr->dn[0].back()->batch_ops[mask]; + fin->set_request(new_batch_head); + } + } else { + clear_batch_ops(mdr); + return; + } + } else { + return; + } } else if (mdr->aborted) { mdr->aborted = false; mdcache->request_kill(mdr); @@ -3545,6 +3656,28 @@ void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup) !is_lookup); if (!ref) return; + mdr->getattr_caps = mask; + + if (!mdr->is_batch_head && mdr->is_batch_op()) { + if (!is_lookup) { + if (ref->batch_ops.count(mask)) { + dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl; + ref->batch_ops[mask]->add_request(mdr); + return; + } else + ref->batch_ops[mask] = new Batch_Getattr_Lookup(this, mdr, mdcache); + } else { + CDentry* dn = mdr->dn[0].back(); + if (dn->batch_ops.count(mask)) { + dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl; + dn->batch_ops[mask]->add_request(mdr); + return; + } else + dn->batch_ops[mask] = new Batch_Getattr_Lookup(this, mdr, mdcache); + } + } + mdr->is_batch_head = true; + /* * if client currently holds the EXCL cap on a field, do not rdlock * it; client's stat() will result in valid info if _either_ EXCL diff --git a/src/mds/Server.h b/src/mds/Server.h index a5296a0a430..3b72d97caf5 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -110,6 +110,7 @@ private: friend class MDSContinuation; friend class ServerContext; friend class ServerLogContext; + friend class Batch_Getattr_Lookup; public: bool terminating_sessions; @@ -348,6 +349,7 @@ public: private: void reply_client_request(MDRequestRef& mdr, const ref_t &reply); void flush_session(Session *session, MDSGatherBuilder *gather); + void clear_batch_ops(const MDRequestRef& mdr); DecayCounter recall_throttle; time last_recall_state; -- 2.39.5