]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: add blockdiff operation support
authorVenky Shankar <vshankar@redhat.com>
Thu, 20 Feb 2025 13:07:17 +0000 (13:07 +0000)
committerVenky Shankar <vshankar@redhat.com>
Wed, 4 Jun 2025 14:52:56 +0000 (20:22 +0530)
Fixes: http://tracker.ceph.com/issues/69791
Signed-off-by: Venky Shankar <vshankar@redhat.com>
(cherry picked from commit 8cc05b172551826eac5b03f086cc872df9584837)

 Conflicts:
src/common/ceph_strings.cc
src/common/options/mds.yaml.in
src/mds/mdstypes.cc
src/mds/mdstypes.h

Missing EstimatedReplayTime structure in squid and some features.

src/common/ceph_strings.cc
src/common/options/mds.yaml.in
src/include/ceph_fs.h
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/Server.cc
src/mds/Server.h
src/mds/mdstypes.cc
src/mds/mdstypes.h

index 752b1d59f1272351e3c6d82fd612615d4fad83fd..c51f5ea48ccfda282225cbc290b49a8489aab9d3 100644 (file)
@@ -322,6 +322,7 @@ const char *ceph_mds_op_name(int op)
        case CEPH_MDS_OP_REPAIR_INODESTATS: return "repair_inodestats";
        case CEPH_MDS_OP_QUIESCE_PATH: return "quiesce_path";
        case CEPH_MDS_OP_QUIESCE_INODE: return "quiesce_inode";
+        case CEPH_MDS_OP_FILE_BLOCKDIFF: return "blockdiff";
        }
        return "???";
 }
index 05fa4fca9a695f2d2231c2f3319b0414fadf7950..aacdadd858fb8b275a8da720884e8c8300b302f0 100644 (file)
@@ -1686,3 +1686,12 @@ options:
   services:
   - mds
   min: 8
+- name: mds_file_blockdiff_max_concurrent_object_scans
+  type: uint
+  level: advanced
+  desc: maximum number of concurrent object scans
+  long_desc: Maximum number of concurrent listsnaps operations sent to RADOS.
+  default: 16
+  services:
+  - mds
+  min: 1
index e595d4628e024a91b87c66f998a76a5c3145c276..d042213a6d12d8b298da2684b1a0f6dacaa98546 100644 (file)
@@ -428,6 +428,7 @@ enum {
        CEPH_MDS_OP_LSSNAP     = 0x00402,
        CEPH_MDS_OP_RENAMESNAP = 0x01403,
        CEPH_MDS_OP_READDIR_SNAPDIFF   = 0x01404,
+       CEPH_MDS_OP_FILE_BLOCKDIFF = 0x01405,
 
        // internal op
        CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
@@ -649,6 +650,12 @@ union ceph_mds_request_args {
                 __le32 offset_hash;
                __le64 snap_other;
        } __attribute__ ((packed)) snapdiff;
+        struct {
+                // latest scan "pointer"
+                __le64 scan_idx;
+                // how many data objects to scan in one invocation (capped by the mds).
+                __le64 max_objects;
+        } __attribute__ ((packed)) blockdiff;
 } __attribute__ ((packed));
 
 #define CEPH_MDS_REQUEST_HEAD_VERSION  3
index 0b9e07c4b3fb3b8bab5087bb92fe27f16fa6001c..98beb5272df1cec215d93ae44a4a25c4706cf1ea 100644 (file)
@@ -14218,3 +14218,202 @@ void MDCache::upkeep_main(void)
     upkeep_cvar.wait_for(lock, interval);
   }
 }
+
+struct C_ListSnapsAggregator : public MDSIOContext {
+  C_ListSnapsAggregator(MDSRank *mds, CInode *in1, CInode *in2, BlockDiff *block_diff,
+                       Context *on_finish)
+    : MDSIOContext(mds),
+      in1(in1),
+      in2(in2),
+      block_diff(block_diff),
+      on_finish(on_finish) {
+  }
+
+  void finish(int r) override {
+    mds->mdcache->aggregate_snap_sets(snap_set_context, in1, in2,
+                                      block_diff, on_finish);
+  }
+
+  virtual void print(std::ostream& os) const {
+    os << "listsnaps";
+  }
+
+  void add_snap_set_context(std::unique_ptr<MDCache::SnapSetContext> ssc) {
+    snap_set_context.push_back(std::move(ssc));
+  }
+
+  CInode *in1;
+  CInode *in2;
+  BlockDiff *block_diff;
+  Context *on_finish;
+  std::vector<std::unique_ptr<MDCache::SnapSetContext>> snap_set_context;
+};
+
+void MDCache::file_blockdiff(CInode *in1, CInode *in2, BlockDiff *block_diff, uint64_t max_objects,
+                            MDSContext *ctx) {
+  ceph_assert(in1->last <= in2->last);
+
+  // I think this is not required since the MDS disallows setting
+  // layout when truncate_seq > 1.
+  if (in1->get_inode()->layout != in2->get_inode()->layout) {
+    dout(20) << __func__ << ": snaps have different layout: " << in1->get_inode()->layout
+            << " vs " << in2->get_inode()->layout << dendl;
+    block_diff->blocks.union_insert(0, in2->get_inode()->size);
+    ctx->complete(0);
+    return;
+  }
+
+  uint64_t scan_idx = block_diff->scan_idx;
+  uint64_t num_objects1 = Striper::get_num_objects(in1->get_inode()->layout,
+                                                  in1->get_inode()->size);
+  uint64_t num_objects2 = Striper::get_num_objects(in2->get_inode()->layout,
+                                                  in2->get_inode()->size);
+  uint64_t num_objects_pending1 = num_objects1 - scan_idx;
+  uint64_t num_objects_pending2 = num_objects2 - scan_idx;
+
+  uint64_t scans = std::min(
+    std::min(num_objects_pending1, num_objects_pending2),
+    std::min((uint64_t)(g_conf().get_val<uint64_t>("mds_file_blockdiff_max_concurrent_object_scans")),
+            max_objects));
+
+  dout(20) << __func__ << ": scanning " << scans << " objects" << dendl;
+  if (scans == 0) {
+    // we ran out of objects to scan - figure which ones
+    if (num_objects_pending1 == 0 && num_objects_pending2 == 0) {
+      // easy - both snaps have same number of objects
+      dout(20) << __func__ << ": equal extent" << dendl;
+      ctx->complete(0);
+    } else {
+      if (num_objects_pending1 == 0) {
+       // first snapshot has lesser number of objects - return
+       // an extent covering EOF.
+       dout(20) << __func__ << ": EOF extent" << dendl;
+       uint64_t offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
+                                                  scan_idx, 0);
+       block_diff->blocks.union_insert(offset, in2->get_inode()->size - offset);
+       ctx->complete(0);
+      } else {
+       // num_objects_pending2 == 0
+       dout(20) << __func__ << ": truncated extent" << dendl;
+       ctx->complete(0);
+      }
+    }
+
+    return;
+  }
+
+  C_ListSnapsAggregator *on_finish = new C_ListSnapsAggregator(mds, in1, in2, block_diff, ctx);
+  MDSGatherBuilder gather_ctx(g_ceph_context, on_finish);
+
+  while (scans > 0) {
+    ObjectOperation op;
+    std::unique_ptr<SnapSetContext> ssc(new SnapSetContext());
+    op.list_snaps(&ssc->snaps, &ssc->r);
+    ssc->objectid = scan_idx;
+
+    mds->objecter->read(file_object_t(in1->ino(), scan_idx),
+                       OSDMap::file_to_object_locator(in2->get_inode()->layout),
+                       op, LIBRADOS_SNAP_DIR, NULL, 0, gather_ctx.new_sub());
+    on_finish->add_snap_set_context(std::move(ssc));
+    ++scan_idx;
+    --scans;
+  }
+
+  gather_ctx.activate();
+}
+
+void MDCache::aggregate_snap_sets(const std::vector<std::unique_ptr<SnapSetContext>> &snap_set_ctx,
+                                  CInode *in1, CInode *in2, BlockDiff *block_diff, Context *on_finish) {
+  dout(20) << __func__ << dendl;
+
+  // always signal to the client to request again since request
+  // completion is signalled in file_blockdiff().
+  int r = 1;
+  snapid_t snapid1 = in1->last;
+  snapid_t snapid2 = in2->last;
+  uint64_t scans = snap_set_ctx.size();
+
+  interval_set<uint64_t> extents;
+  for (auto &snap_set : snap_set_ctx) {
+    dout(20) << __func__ << ": objectid=" << snap_set->objectid << ", r=" << snap_set->r
+            << dendl;
+    if (snap_set->r != 0 && snap_set->r != -ENOENT) {
+      derr << ": failed to get snap set for objectid=" << snap_set->objectid
+          << ", r=" << snap_set->r << dendl;
+      r = snap_set->r;
+      break;
+    }
+
+    if (snap_set->r == 0) {
+      auto &clones = snap_set->snaps.clones;
+      auto it1 = std::find_if(clones.begin(), clones.end(),
+                             [snapid1](const librados::clone_info_t &clone)
+                             {
+                               return snapid1 == clone.cloneid ||
+                                 (std::find(clone.snaps.begin(), clone.snaps.end(), snapid1) != clone.snaps.end());
+                             });
+      // point to "head" if not found
+      if (it1 == clones.end()) {
+       it1 = std::prev(it1);
+      }
+      auto it2 = std::find_if(clones.begin(), clones.end(),
+                             [snapid2](const librados::clone_info_t &clone)
+                             {
+                               return snapid2 == clone.cloneid ||
+                                 (std::find(clone.snaps.begin(), clone.snaps.end(), snapid2) != clone.snaps.end());
+                             });
+      // point to "head" if not found
+      if (it2 == clones.end()) {
+       it2 = std::prev(it2);
+      }
+
+      if (it1 == it2) {
+       dout(10) << __func__ << ": both snaps in same clone" << dendl;
+       continue;
+      }
+
+      interval_set<uint64_t> extent;
+      uint64_t offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
+                                                snap_set->objectid, 0);
+
+      for (auto hops = std::distance(it1, it2); hops > 0; --hops) {
+       dout(20) << __func__ << ": [cloneid: " << it1->cloneid << " snaps: " << it1->snaps
+                << " overlap: " << it1->overlap << "]" << dendl;
+       auto next_it = it1 + 1;
+       dout(20) << __func__ << ": [next cloneid: " << next_it->cloneid << " snaps: " << next_it->snaps
+                << " overlap: " << next_it->overlap << "]" << dendl;
+       auto sz = next_it->size;
+       if (sz == 0) {
+         // this object is a hole in the file.
+         // TODO: report holes in blockdiff strucuter. that way,
+         // caller can optimize and punch holes rather than writing
+         // zeros.
+         dout(10) << __func__ << ": hole: [" << offset << "~" << it1->size << "]" << dendl;
+         dout(10) << __func__ << ": adding whole extent - reader will read zeros" << dendl;
+         sz = it1->size;
+       }
+
+       extent.clear();
+       extent.union_insert(offset, sz);
+       for (auto &overlap_region : it1->overlap) {
+         uint64_t overlap_offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
+                                                            snap_set->objectid, overlap_region.first);
+         extent.erase(overlap_offset, overlap_region.second);
+       }
+
+       dout(20) << __func__ << ": (non overlapping) extent=" << extent << dendl;
+       extents.union_of(extent);
+       dout(20) << __func__ << ": (modified) extents=" << extents << dendl;
+       ++it1;
+      }
+    }
+  }
+
+  block_diff->rval = r;
+  if (r >= 0) {
+    r = 0;
+    block_diff->scan_idx += scans;
+    block_diff->blocks = extents;
+  }
+  on_finish->complete(r);
+}
index f2d4f6a0207235215bb4d9d94c01c3c64a0b536c..5f8463fbb8615192f3f3c67e53695c2674445293 100644 (file)
@@ -24,6 +24,7 @@
 #include "include/types.h"
 #include "include/filepath.h"
 #include "include/elist.h"
+#include "include/rados/rados_types.hpp"
 
 #include "messages/MCacheExpire.h"
 #include "messages/MClientQuota.h"
@@ -1152,6 +1153,17 @@ private:
 
   double export_ephemeral_random_max = 0.0;
 
+  struct SnapSetContext { /* not to be confused with SnapContext */
+    int r;
+    uint64_t objectid;
+    librados::snap_set_t snaps;
+  };
+
+  void file_blockdiff(CInode *in1, CInode *in2, BlockDiff *block_diff, uint64_t max_objects,
+                      MDSContext *ctx);
+  void aggregate_snap_sets(const std::vector<std::unique_ptr<SnapSetContext>> &snap_set_ctx,
+                           CInode *in1, CInode *in2, BlockDiff *block_diff, Context *on_finish);
+
  protected:
   // track leader requests whose peers haven't acknowledged commit
   struct uleader {
index f019b7b9ea063004db46a57d84155eaaf3abbc14..9902fee593242abb3cc86e648b691720828fec42 100644 (file)
@@ -244,6 +244,8 @@ void Server::create_logger()
                    "Request type rename snapshot latency");
   plb.add_time_avg(l_mdss_req_snapdiff_latency, "req_snapdiff_latency",
                   "Request type snapshot difference latency");
+    plb.add_time_avg(l_mdss_req_file_blockdiff_latency, "req_blockdiff_latency",
+                  "Request type file blockdiff latency");
 
   plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
   plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
@@ -2182,6 +2184,9 @@ void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t l
   case CEPH_MDS_OP_READDIR_SNAPDIFF:
     code = l_mdss_req_snapdiff_latency;
     break;
+  case CEPH_MDS_OP_FILE_BLOCKDIFF:
+    code = l_mdss_req_file_blockdiff_latency;
+    break;
   default:
     dout(1) << ": unknown client op" << dendl;
     return;
@@ -2830,6 +2835,9 @@ void Server::dispatch_client_request(const MDRequestRef& mdr)
   case CEPH_MDS_OP_READDIR_SNAPDIFF:
     handle_client_readdir_snapdiff(mdr);
     break;
+  case CEPH_MDS_OP_FILE_BLOCKDIFF:
+    handle_client_file_blockdiff(mdr);
+    break;
 
   default:
     dout(1) << " unknown client op " << req->get_op() << dendl;
@@ -3698,13 +3706,21 @@ public:
   }
 };
 
+CInode* Server::rdlock_path_pin_ref(const MDRequestRef& mdr,
+                                   bool want_auth,
+                                   bool no_want_auth)
+{
+  const filepath& refpath = mdr->get_filepath();
+  return rdlock_path_pin_ref(mdr, refpath, want_auth, no_want_auth);
+}
+
 /* If this returns null, the request has been handled
  * as appropriate: forwarded on, or the client's been replied to */
 CInode* Server::rdlock_path_pin_ref(const MDRequestRef& mdr,
+                                   const filepath& refpath,
                                    bool want_auth,
                                    bool no_want_auth)
 {
-  const filepath& refpath = mdr->get_filepath();
   dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
 
   if (mdr->locking_state & MutationImpl::PATH_LOCKED)
@@ -11667,6 +11683,89 @@ void Server::_renamesnap_finish(const MDRequestRef& mdr, CInode *diri, snapid_t
   respond_to_request(mdr, 0);
 }
 
+class C_MDS_file_blockdiff_finish : public ServerContext {
+public:
+  C_MDS_file_blockdiff_finish(Server *server, const MDRequestRef& mdr, CInode *in, uint64_t scan_idx)
+    : ServerContext(server),
+      mdr(mdr),
+      in(in) {
+    block_diff.rval = 0;
+    block_diff.scan_idx = scan_idx;
+  }
+
+  void finish(int r) override {
+    server->handle_file_blockdiff_finish(mdr, in, block_diff, r);
+  }
+
+private:
+  MDRequestRef mdr;
+  CInode *in;
+
+public:
+  BlockDiff block_diff;
+};
+
+void Server::handle_client_file_blockdiff(const MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest>& req = mdr->client_request;
+
+  dout(10) << __func__ << dendl;
+
+  // no real need to rdlock_path_pin_ref() since the caller holds an
+  // open ref, but we need the snapped inode.
+  const filepath& refpath1 = mdr->get_filepath();
+  CInode* in1 = rdlock_path_pin_ref(mdr, refpath1, false, true);
+  if (!in1) {
+    return;
+  }
+  const filepath& refpath2 = mdr->get_filepath2();
+  CInode* in2 = rdlock_path_pin_ref(mdr, refpath2, false, true);
+  if (!in2) {
+    return;
+  }
+
+  if (!in1->is_file() || !in2->is_file()) {
+    dout(10) << __func__ << ": not a regular file req=" << req << dendl;
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+
+  dout(20) << __func__ << ": in1=" << *in1 << dendl;
+  dout(20) << __func__ << ": in2=" << *in2 << dendl;
+
+  auto scan_idx = (uint64_t)req->head.args.blockdiff.scan_idx;
+  auto max_objects = (uint32_t)req->head.args.blockdiff.max_objects;
+
+  C_MDS_file_blockdiff_finish *ctx = new C_MDS_file_blockdiff_finish(this, mdr, in2, scan_idx);
+
+  if (in1 == in2) {
+    // does not matter if the inodes are snapped or refer to the head
+    // version -- both snaps are same.
+    dout(10) << __func__ << ": no diffs between snaps" << dendl;
+    handle_file_blockdiff_finish(mdr, in2, ctx->block_diff, 0);
+    delete ctx;
+    return;
+  }
+
+  mdcache->file_blockdiff(in1, in2, &(ctx->block_diff), max_objects, ctx);
+}
+
+void Server::handle_file_blockdiff_finish(const MDRequestRef& mdr, CInode *in, const BlockDiff &block_diff,
+                                         int r) {
+  dout(10) << __func__ << ": in=" << *in << ", r=" << r << dendl;
+  if (r == 0) {
+    dout(10) << __func__ << ": blockdiff=" << block_diff << dendl;
+  }
+
+  ceph::bufferlist bl;
+  if (r == 0) {
+    encode(block_diff, bl);
+    mdr->reply_extra_bl = bl;
+  }
+
+  respond_to_request(mdr, r);
+}
+
 void Server::handle_client_readdir_snapdiff(const MDRequestRef& mdr)
 {
   const cref_t<MClientRequest>& req = mdr->client_request;
index e7aca9e18e1606bf9cf10ae51ba9d5f36bbe52d4..c7d64efff5b3d0cb877be2f45a3146f5d248e724 100644 (file)
@@ -84,6 +84,7 @@ enum {
   l_mdss_cap_revoke_eviction,
   l_mdss_cap_acquisition_throttle,
   l_mdss_req_getvxattr_latency,
+  l_mdss_req_file_blockdiff_latency,
   l_mdss_last,
 };
 
@@ -196,6 +197,8 @@ public:
   void _try_open_ino(const MDRequestRef& mdr, int r, inodeno_t ino);
   CInode* rdlock_path_pin_ref(const MDRequestRef& mdr, bool want_auth,
                              bool no_want_auth=false);
+  CInode* rdlock_path_pin_ref(const MDRequestRef& mdr, const filepath& refpath, bool want_auth,
+                             bool no_want_auth=false);
   CDentry* rdlock_path_xlock_dentry(const MDRequestRef& mdr, bool create,
                                    bool okexist=false, bool authexist=false,
                                    bool want_layout=false);
@@ -304,6 +307,9 @@ public:
   void handle_client_renamesnap(const MDRequestRef& mdr);
   void _renamesnap_finish(const MDRequestRef& mdr, CInode *diri, snapid_t snapid);
   void handle_client_readdir_snapdiff(const MDRequestRef& mdr);
+  void handle_client_file_blockdiff(const MDRequestRef& mdr);
+  void handle_file_blockdiff_finish(const MDRequestRef& mdr, CInode *in, const BlockDiff &block_diff,
+                                   int r);
 
   // helpers
   bool _rename_prepare_witness(const MDRequestRef& mdr, mds_rank_t who, std::set<mds_rank_t> &witnesse,
index fce09baef8111d9e55fd29bf3e082bbb21d1230d..63a879fa47a099fc3fb40bc58447de5726c68e76 100644 (file)
@@ -1016,3 +1016,44 @@ void snaprealm_reconnect_t::generate_test_instances(std::list<snaprealm_reconnec
   ls.back()->realm.seq = 2;
   ls.back()->realm.parent = 1;
 }
+
+/*
+ * file block diffs
+ */
+void BlockDiff::encode(bufferlist& bl) const {
+  using ceph::encode;
+  ENCODE_START(1, 1, bl);
+  encode(rval, bl);
+  encode(scan_idx, bl);
+  encode(blocks, bl);
+  ENCODE_FINISH(bl);
+}
+
+void BlockDiff::decode(bufferlist::const_iterator &p) {
+  using ceph::decode;
+  DECODE_START(1, p);
+  decode(rval, p);
+  decode(scan_idx, p);
+  decode(blocks, p);
+  DECODE_FINISH(p);
+}
+
+void BlockDiff::dump(Formatter *f) const {
+  f->dump_int("rval", rval);
+  f->dump_unsigned("scan_idx", scan_idx);
+  f->dump_stream("blocks") << blocks;
+}
+
+void BlockDiff::generate_test_instances(std::list<BlockDiff*>& ls)
+{
+  ls.push_back(new BlockDiff());
+  ls.push_back(new BlockDiff());
+  ls.back()->rval = 0;
+  ls.back()->scan_idx = 1;
+  ls.back()->blocks.union_insert(0, 200);
+}
+
+void BlockDiff::print(ostream& out) const
+{
+  out << "{rval: " << rval << ", scan_idx=" << scan_idx << ", blocks=" << blocks << "}";
+}
index 694808890bcf0020fdfcc9726329192354756303..649f13b7df0b83ca2121ad61996bd776faae1a1c 100644 (file)
@@ -1029,4 +1029,17 @@ inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r)
 }
 WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
 
+struct BlockDiff {
+  int rval;
+  uint64_t scan_idx;
+  interval_set<uint64_t> blocks;
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<BlockDiff*>& ls);
+  void print(std::ostream& out) const;
+};
+WRITE_CLASS_ENCODER(BlockDiff);
+
 #endif