From 98edb020538e383258edf0fae3bea4969d6152a0 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Thu, 20 Feb 2025 13:07:17 +0000 Subject: [PATCH] mds: add blockdiff operation support Fixes: http://tracker.ceph.com/issues/69791 Signed-off-by: Venky Shankar (cherry picked from commit 8cc05b172551826eac5b03f086cc872df9584837) Conflicts: src/common/ceph_strings.cc src/common/options/mds.yaml.in src/mds/mdstypes.cc src/mds/mdstypes.h Missing EstimatedReplayTime structure in squid and some features. --- src/common/ceph_strings.cc | 1 + src/common/options/mds.yaml.in | 9 ++ src/include/ceph_fs.h | 7 ++ src/mds/MDCache.cc | 199 +++++++++++++++++++++++++++++++++ src/mds/MDCache.h | 12 ++ src/mds/Server.cc | 101 ++++++++++++++++- src/mds/Server.h | 6 + src/mds/mdstypes.cc | 41 +++++++ src/mds/mdstypes.h | 13 +++ 9 files changed, 388 insertions(+), 1 deletion(-) diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc index 752b1d59f1272..c51f5ea48ccfd 100644 --- a/src/common/ceph_strings.cc +++ b/src/common/ceph_strings.cc @@ -322,6 +322,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_REPAIR_INODESTATS: return "repair_inodestats"; case CEPH_MDS_OP_QUIESCE_PATH: return "quiesce_path"; case CEPH_MDS_OP_QUIESCE_INODE: return "quiesce_inode"; + case CEPH_MDS_OP_FILE_BLOCKDIFF: return "blockdiff"; } return "???"; } diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 05fa4fca9a695..aacdadd858fb8 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1686,3 +1686,12 @@ options: services: - mds min: 8 +- name: mds_file_blockdiff_max_concurrent_object_scans + type: uint + level: advanced + desc: maximum number of concurrent object scans + long_desc: Maximum number of concurrent listsnaps operations sent to RADOS. + default: 16 + services: + - mds + min: 1 diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index e595d4628e024..d042213a6d12d 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -428,6 +428,7 @@ enum { CEPH_MDS_OP_LSSNAP = 0x00402, CEPH_MDS_OP_RENAMESNAP = 0x01403, CEPH_MDS_OP_READDIR_SNAPDIFF = 0x01404, + CEPH_MDS_OP_FILE_BLOCKDIFF = 0x01405, // internal op CEPH_MDS_OP_FRAGMENTDIR= 0x01500, @@ -649,6 +650,12 @@ union ceph_mds_request_args { __le32 offset_hash; __le64 snap_other; } __attribute__ ((packed)) snapdiff; + struct { + // latest scan "pointer" + __le64 scan_idx; + // how many data objects to scan in one invocation (capped by the mds). + __le64 max_objects; + } __attribute__ ((packed)) blockdiff; } __attribute__ ((packed)); #define CEPH_MDS_REQUEST_HEAD_VERSION 3 diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 0b9e07c4b3fb3..98beb5272df1c 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -14218,3 +14218,202 @@ void MDCache::upkeep_main(void) upkeep_cvar.wait_for(lock, interval); } } + +struct C_ListSnapsAggregator : public MDSIOContext { + C_ListSnapsAggregator(MDSRank *mds, CInode *in1, CInode *in2, BlockDiff *block_diff, + Context *on_finish) + : MDSIOContext(mds), + in1(in1), + in2(in2), + block_diff(block_diff), + on_finish(on_finish) { + } + + void finish(int r) override { + mds->mdcache->aggregate_snap_sets(snap_set_context, in1, in2, + block_diff, on_finish); + } + + virtual void print(std::ostream& os) const { + os << "listsnaps"; + } + + void add_snap_set_context(std::unique_ptr ssc) { + snap_set_context.push_back(std::move(ssc)); + } + + CInode *in1; + CInode *in2; + BlockDiff *block_diff; + Context *on_finish; + std::vector> snap_set_context; +}; + +void MDCache::file_blockdiff(CInode *in1, CInode *in2, BlockDiff *block_diff, uint64_t max_objects, + MDSContext *ctx) { + ceph_assert(in1->last <= in2->last); + + // I think this is not required since the MDS disallows setting + // layout when truncate_seq > 1. + if (in1->get_inode()->layout != in2->get_inode()->layout) { + dout(20) << __func__ << ": snaps have different layout: " << in1->get_inode()->layout + << " vs " << in2->get_inode()->layout << dendl; + block_diff->blocks.union_insert(0, in2->get_inode()->size); + ctx->complete(0); + return; + } + + uint64_t scan_idx = block_diff->scan_idx; + uint64_t num_objects1 = Striper::get_num_objects(in1->get_inode()->layout, + in1->get_inode()->size); + uint64_t num_objects2 = Striper::get_num_objects(in2->get_inode()->layout, + in2->get_inode()->size); + uint64_t num_objects_pending1 = num_objects1 - scan_idx; + uint64_t num_objects_pending2 = num_objects2 - scan_idx; + + uint64_t scans = std::min( + std::min(num_objects_pending1, num_objects_pending2), + std::min((uint64_t)(g_conf().get_val("mds_file_blockdiff_max_concurrent_object_scans")), + max_objects)); + + dout(20) << __func__ << ": scanning " << scans << " objects" << dendl; + if (scans == 0) { + // we ran out of objects to scan - figure which ones + if (num_objects_pending1 == 0 && num_objects_pending2 == 0) { + // easy - both snaps have same number of objects + dout(20) << __func__ << ": equal extent" << dendl; + ctx->complete(0); + } else { + if (num_objects_pending1 == 0) { + // first snapshot has lesser number of objects - return + // an extent covering EOF. + dout(20) << __func__ << ": EOF extent" << dendl; + uint64_t offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout), + scan_idx, 0); + block_diff->blocks.union_insert(offset, in2->get_inode()->size - offset); + ctx->complete(0); + } else { + // num_objects_pending2 == 0 + dout(20) << __func__ << ": truncated extent" << dendl; + ctx->complete(0); + } + } + + return; + } + + C_ListSnapsAggregator *on_finish = new C_ListSnapsAggregator(mds, in1, in2, block_diff, ctx); + MDSGatherBuilder gather_ctx(g_ceph_context, on_finish); + + while (scans > 0) { + ObjectOperation op; + std::unique_ptr ssc(new SnapSetContext()); + op.list_snaps(&ssc->snaps, &ssc->r); + ssc->objectid = scan_idx; + + mds->objecter->read(file_object_t(in1->ino(), scan_idx), + OSDMap::file_to_object_locator(in2->get_inode()->layout), + op, LIBRADOS_SNAP_DIR, NULL, 0, gather_ctx.new_sub()); + on_finish->add_snap_set_context(std::move(ssc)); + ++scan_idx; + --scans; + } + + gather_ctx.activate(); +} + +void MDCache::aggregate_snap_sets(const std::vector> &snap_set_ctx, + CInode *in1, CInode *in2, BlockDiff *block_diff, Context *on_finish) { + dout(20) << __func__ << dendl; + + // always signal to the client to request again since request + // completion is signalled in file_blockdiff(). + int r = 1; + snapid_t snapid1 = in1->last; + snapid_t snapid2 = in2->last; + uint64_t scans = snap_set_ctx.size(); + + interval_set extents; + for (auto &snap_set : snap_set_ctx) { + dout(20) << __func__ << ": objectid=" << snap_set->objectid << ", r=" << snap_set->r + << dendl; + if (snap_set->r != 0 && snap_set->r != -ENOENT) { + derr << ": failed to get snap set for objectid=" << snap_set->objectid + << ", r=" << snap_set->r << dendl; + r = snap_set->r; + break; + } + + if (snap_set->r == 0) { + auto &clones = snap_set->snaps.clones; + auto it1 = std::find_if(clones.begin(), clones.end(), + [snapid1](const librados::clone_info_t &clone) + { + return snapid1 == clone.cloneid || + (std::find(clone.snaps.begin(), clone.snaps.end(), snapid1) != clone.snaps.end()); + }); + // point to "head" if not found + if (it1 == clones.end()) { + it1 = std::prev(it1); + } + auto it2 = std::find_if(clones.begin(), clones.end(), + [snapid2](const librados::clone_info_t &clone) + { + return snapid2 == clone.cloneid || + (std::find(clone.snaps.begin(), clone.snaps.end(), snapid2) != clone.snaps.end()); + }); + // point to "head" if not found + if (it2 == clones.end()) { + it2 = std::prev(it2); + } + + if (it1 == it2) { + dout(10) << __func__ << ": both snaps in same clone" << dendl; + continue; + } + + interval_set extent; + uint64_t offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout), + snap_set->objectid, 0); + + for (auto hops = std::distance(it1, it2); hops > 0; --hops) { + dout(20) << __func__ << ": [cloneid: " << it1->cloneid << " snaps: " << it1->snaps + << " overlap: " << it1->overlap << "]" << dendl; + auto next_it = it1 + 1; + dout(20) << __func__ << ": [next cloneid: " << next_it->cloneid << " snaps: " << next_it->snaps + << " overlap: " << next_it->overlap << "]" << dendl; + auto sz = next_it->size; + if (sz == 0) { + // this object is a hole in the file. + // TODO: report holes in blockdiff strucuter. that way, + // caller can optimize and punch holes rather than writing + // zeros. + dout(10) << __func__ << ": hole: [" << offset << "~" << it1->size << "]" << dendl; + dout(10) << __func__ << ": adding whole extent - reader will read zeros" << dendl; + sz = it1->size; + } + + extent.clear(); + extent.union_insert(offset, sz); + for (auto &overlap_region : it1->overlap) { + uint64_t overlap_offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout), + snap_set->objectid, overlap_region.first); + extent.erase(overlap_offset, overlap_region.second); + } + + dout(20) << __func__ << ": (non overlapping) extent=" << extent << dendl; + extents.union_of(extent); + dout(20) << __func__ << ": (modified) extents=" << extents << dendl; + ++it1; + } + } + } + + block_diff->rval = r; + if (r >= 0) { + r = 0; + block_diff->scan_idx += scans; + block_diff->blocks = extents; + } + on_finish->complete(r); +} diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index f2d4f6a020723..5f8463fbb8615 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -24,6 +24,7 @@ #include "include/types.h" #include "include/filepath.h" #include "include/elist.h" +#include "include/rados/rados_types.hpp" #include "messages/MCacheExpire.h" #include "messages/MClientQuota.h" @@ -1152,6 +1153,17 @@ private: double export_ephemeral_random_max = 0.0; + struct SnapSetContext { /* not to be confused with SnapContext */ + int r; + uint64_t objectid; + librados::snap_set_t snaps; + }; + + void file_blockdiff(CInode *in1, CInode *in2, BlockDiff *block_diff, uint64_t max_objects, + MDSContext *ctx); + void aggregate_snap_sets(const std::vector> &snap_set_ctx, + CInode *in1, CInode *in2, BlockDiff *block_diff, Context *on_finish); + protected: // track leader requests whose peers haven't acknowledged commit struct uleader { diff --git a/src/mds/Server.cc b/src/mds/Server.cc index f019b7b9ea063..9902fee593242 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -244,6 +244,8 @@ void Server::create_logger() "Request type rename snapshot latency"); plb.add_time_avg(l_mdss_req_snapdiff_latency, "req_snapdiff_latency", "Request type snapshot difference latency"); + plb.add_time_avg(l_mdss_req_file_blockdiff_latency, "req_blockdiff_latency", + "Request type file blockdiff latency"); plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", @@ -2182,6 +2184,9 @@ void Server::perf_gather_op_latency(const cref_t &req, utime_t l case CEPH_MDS_OP_READDIR_SNAPDIFF: code = l_mdss_req_snapdiff_latency; break; + case CEPH_MDS_OP_FILE_BLOCKDIFF: + code = l_mdss_req_file_blockdiff_latency; + break; default: dout(1) << ": unknown client op" << dendl; return; @@ -2830,6 +2835,9 @@ void Server::dispatch_client_request(const MDRequestRef& mdr) case CEPH_MDS_OP_READDIR_SNAPDIFF: handle_client_readdir_snapdiff(mdr); break; + case CEPH_MDS_OP_FILE_BLOCKDIFF: + handle_client_file_blockdiff(mdr); + break; default: dout(1) << " unknown client op " << req->get_op() << dendl; @@ -3698,13 +3706,21 @@ public: } }; +CInode* Server::rdlock_path_pin_ref(const MDRequestRef& mdr, + bool want_auth, + bool no_want_auth) +{ + const filepath& refpath = mdr->get_filepath(); + return rdlock_path_pin_ref(mdr, refpath, want_auth, no_want_auth); +} + /* If this returns null, the request has been handled * as appropriate: forwarded on, or the client's been replied to */ CInode* Server::rdlock_path_pin_ref(const MDRequestRef& mdr, + const filepath& refpath, bool want_auth, bool no_want_auth) { - const filepath& refpath = mdr->get_filepath(); dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl; if (mdr->locking_state & MutationImpl::PATH_LOCKED) @@ -11667,6 +11683,89 @@ void Server::_renamesnap_finish(const MDRequestRef& mdr, CInode *diri, snapid_t respond_to_request(mdr, 0); } +class C_MDS_file_blockdiff_finish : public ServerContext { +public: + C_MDS_file_blockdiff_finish(Server *server, const MDRequestRef& mdr, CInode *in, uint64_t scan_idx) + : ServerContext(server), + mdr(mdr), + in(in) { + block_diff.rval = 0; + block_diff.scan_idx = scan_idx; + } + + void finish(int r) override { + server->handle_file_blockdiff_finish(mdr, in, block_diff, r); + } + +private: + MDRequestRef mdr; + CInode *in; + +public: + BlockDiff block_diff; +}; + +void Server::handle_client_file_blockdiff(const MDRequestRef& mdr) +{ + const cref_t& req = mdr->client_request; + + dout(10) << __func__ << dendl; + + // no real need to rdlock_path_pin_ref() since the caller holds an + // open ref, but we need the snapped inode. + const filepath& refpath1 = mdr->get_filepath(); + CInode* in1 = rdlock_path_pin_ref(mdr, refpath1, false, true); + if (!in1) { + return; + } + const filepath& refpath2 = mdr->get_filepath2(); + CInode* in2 = rdlock_path_pin_ref(mdr, refpath2, false, true); + if (!in2) { + return; + } + + if (!in1->is_file() || !in2->is_file()) { + dout(10) << __func__ << ": not a regular file req=" << req << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + + dout(20) << __func__ << ": in1=" << *in1 << dendl; + dout(20) << __func__ << ": in2=" << *in2 << dendl; + + auto scan_idx = (uint64_t)req->head.args.blockdiff.scan_idx; + auto max_objects = (uint32_t)req->head.args.blockdiff.max_objects; + + C_MDS_file_blockdiff_finish *ctx = new C_MDS_file_blockdiff_finish(this, mdr, in2, scan_idx); + + if (in1 == in2) { + // does not matter if the inodes are snapped or refer to the head + // version -- both snaps are same. + dout(10) << __func__ << ": no diffs between snaps" << dendl; + handle_file_blockdiff_finish(mdr, in2, ctx->block_diff, 0); + delete ctx; + return; + } + + mdcache->file_blockdiff(in1, in2, &(ctx->block_diff), max_objects, ctx); +} + +void Server::handle_file_blockdiff_finish(const MDRequestRef& mdr, CInode *in, const BlockDiff &block_diff, + int r) { + dout(10) << __func__ << ": in=" << *in << ", r=" << r << dendl; + if (r == 0) { + dout(10) << __func__ << ": blockdiff=" << block_diff << dendl; + } + + ceph::bufferlist bl; + if (r == 0) { + encode(block_diff, bl); + mdr->reply_extra_bl = bl; + } + + respond_to_request(mdr, r); +} + void Server::handle_client_readdir_snapdiff(const MDRequestRef& mdr) { const cref_t& req = mdr->client_request; diff --git a/src/mds/Server.h b/src/mds/Server.h index e7aca9e18e160..c7d64efff5b3d 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -84,6 +84,7 @@ enum { l_mdss_cap_revoke_eviction, l_mdss_cap_acquisition_throttle, l_mdss_req_getvxattr_latency, + l_mdss_req_file_blockdiff_latency, l_mdss_last, }; @@ -196,6 +197,8 @@ public: void _try_open_ino(const MDRequestRef& mdr, int r, inodeno_t ino); CInode* rdlock_path_pin_ref(const MDRequestRef& mdr, bool want_auth, bool no_want_auth=false); + CInode* rdlock_path_pin_ref(const MDRequestRef& mdr, const filepath& refpath, bool want_auth, + bool no_want_auth=false); CDentry* rdlock_path_xlock_dentry(const MDRequestRef& mdr, bool create, bool okexist=false, bool authexist=false, bool want_layout=false); @@ -304,6 +307,9 @@ public: void handle_client_renamesnap(const MDRequestRef& mdr); void _renamesnap_finish(const MDRequestRef& mdr, CInode *diri, snapid_t snapid); void handle_client_readdir_snapdiff(const MDRequestRef& mdr); + void handle_client_file_blockdiff(const MDRequestRef& mdr); + void handle_file_blockdiff_finish(const MDRequestRef& mdr, CInode *in, const BlockDiff &block_diff, + int r); // helpers bool _rename_prepare_witness(const MDRequestRef& mdr, mds_rank_t who, std::set &witnesse, diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index fce09baef8111..63a879fa47a09 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -1016,3 +1016,44 @@ void snaprealm_reconnect_t::generate_test_instances(std::listrealm.seq = 2; ls.back()->realm.parent = 1; } + +/* + * file block diffs + */ +void BlockDiff::encode(bufferlist& bl) const { + using ceph::encode; + ENCODE_START(1, 1, bl); + encode(rval, bl); + encode(scan_idx, bl); + encode(blocks, bl); + ENCODE_FINISH(bl); +} + +void BlockDiff::decode(bufferlist::const_iterator &p) { + using ceph::decode; + DECODE_START(1, p); + decode(rval, p); + decode(scan_idx, p); + decode(blocks, p); + DECODE_FINISH(p); +} + +void BlockDiff::dump(Formatter *f) const { + f->dump_int("rval", rval); + f->dump_unsigned("scan_idx", scan_idx); + f->dump_stream("blocks") << blocks; +} + +void BlockDiff::generate_test_instances(std::list& ls) +{ + ls.push_back(new BlockDiff()); + ls.push_back(new BlockDiff()); + ls.back()->rval = 0; + ls.back()->scan_idx = 1; + ls.back()->blocks.union_insert(0, 200); +} + +void BlockDiff::print(ostream& out) const +{ + out << "{rval: " << rval << ", scan_idx=" << scan_idx << ", blocks=" << blocks << "}"; +} diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 694808890bcf0..649f13b7df0b8 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -1029,4 +1029,17 @@ inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) } WRITE_CLASS_ENCODER(MDSCacheObjectInfo) +struct BlockDiff { + int rval; + uint64_t scan_idx; + interval_set blocks; + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); + void print(std::ostream& out) const; +}; +WRITE_CLASS_ENCODER(BlockDiff); + #endif -- 2.39.5