case CEPH_MDS_OP_REPAIR_INODESTATS: return "repair_inodestats";
case CEPH_MDS_OP_QUIESCE_PATH: return "quiesce_path";
case CEPH_MDS_OP_QUIESCE_INODE: return "quiesce_inode";
+ case CEPH_MDS_OP_FILE_BLOCKDIFF: return "blockdiff";
}
return "???";
}
services:
- mds
min: 8
+- name: mds_file_blockdiff_max_concurrent_object_scans
+ type: uint
+ level: advanced
+ desc: maximum number of concurrent object scans
+ long_desc: Maximum number of concurrent listsnaps operations sent to RADOS.
+ default: 16
+ services:
+ - mds
+ min: 1
CEPH_MDS_OP_LSSNAP = 0x00402,
CEPH_MDS_OP_RENAMESNAP = 0x01403,
CEPH_MDS_OP_READDIR_SNAPDIFF = 0x01404,
+ CEPH_MDS_OP_FILE_BLOCKDIFF = 0x01405,
// internal op
CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
__le32 offset_hash;
__le64 snap_other;
} __attribute__ ((packed)) snapdiff;
+ struct {
+ // latest scan "pointer"
+ __le64 scan_idx;
+ // how many data objects to scan in one invocation (capped by the mds).
+ __le64 max_objects;
+ } __attribute__ ((packed)) blockdiff;
} __attribute__ ((packed));
#define CEPH_MDS_REQUEST_HEAD_VERSION 3
upkeep_cvar.wait_for(lock, interval);
}
}
+
+struct C_ListSnapsAggregator : public MDSIOContext {
+ C_ListSnapsAggregator(MDSRank *mds, CInode *in1, CInode *in2, BlockDiff *block_diff,
+ Context *on_finish)
+ : MDSIOContext(mds),
+ in1(in1),
+ in2(in2),
+ block_diff(block_diff),
+ on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ mds->mdcache->aggregate_snap_sets(snap_set_context, in1, in2,
+ block_diff, on_finish);
+ }
+
+ virtual void print(std::ostream& os) const {
+ os << "listsnaps";
+ }
+
+ void add_snap_set_context(std::unique_ptr<MDCache::SnapSetContext> ssc) {
+ snap_set_context.push_back(std::move(ssc));
+ }
+
+ CInode *in1;
+ CInode *in2;
+ BlockDiff *block_diff;
+ Context *on_finish;
+ std::vector<std::unique_ptr<MDCache::SnapSetContext>> snap_set_context;
+};
+
+void MDCache::file_blockdiff(CInode *in1, CInode *in2, BlockDiff *block_diff, uint64_t max_objects,
+ MDSContext *ctx) {
+ ceph_assert(in1->last <= in2->last);
+
+ // I think this is not required since the MDS disallows setting
+ // layout when truncate_seq > 1.
+ if (in1->get_inode()->layout != in2->get_inode()->layout) {
+ dout(20) << __func__ << ": snaps have different layout: " << in1->get_inode()->layout
+ << " vs " << in2->get_inode()->layout << dendl;
+ block_diff->blocks.union_insert(0, in2->get_inode()->size);
+ ctx->complete(0);
+ return;
+ }
+
+ uint64_t scan_idx = block_diff->scan_idx;
+ uint64_t num_objects1 = Striper::get_num_objects(in1->get_inode()->layout,
+ in1->get_inode()->size);
+ uint64_t num_objects2 = Striper::get_num_objects(in2->get_inode()->layout,
+ in2->get_inode()->size);
+ uint64_t num_objects_pending1 = num_objects1 - scan_idx;
+ uint64_t num_objects_pending2 = num_objects2 - scan_idx;
+
+ uint64_t scans = std::min(
+ std::min(num_objects_pending1, num_objects_pending2),
+ std::min((uint64_t)(g_conf().get_val<uint64_t>("mds_file_blockdiff_max_concurrent_object_scans")),
+ max_objects));
+
+ dout(20) << __func__ << ": scanning " << scans << " objects" << dendl;
+ if (scans == 0) {
+ // we ran out of objects to scan - figure which ones
+ if (num_objects_pending1 == 0 && num_objects_pending2 == 0) {
+ // easy - both snaps have same number of objects
+ dout(20) << __func__ << ": equal extent" << dendl;
+ ctx->complete(0);
+ } else {
+ if (num_objects_pending1 == 0) {
+ // first snapshot has lesser number of objects - return
+ // an extent covering EOF.
+ dout(20) << __func__ << ": EOF extent" << dendl;
+ uint64_t offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
+ scan_idx, 0);
+ block_diff->blocks.union_insert(offset, in2->get_inode()->size - offset);
+ ctx->complete(0);
+ } else {
+ // num_objects_pending2 == 0
+ dout(20) << __func__ << ": truncated extent" << dendl;
+ ctx->complete(0);
+ }
+ }
+
+ return;
+ }
+
+ C_ListSnapsAggregator *on_finish = new C_ListSnapsAggregator(mds, in1, in2, block_diff, ctx);
+ MDSGatherBuilder gather_ctx(g_ceph_context, on_finish);
+
+ while (scans > 0) {
+ ObjectOperation op;
+ std::unique_ptr<SnapSetContext> ssc(new SnapSetContext());
+ op.list_snaps(&ssc->snaps, &ssc->r);
+ ssc->objectid = scan_idx;
+
+ mds->objecter->read(file_object_t(in1->ino(), scan_idx),
+ OSDMap::file_to_object_locator(in2->get_inode()->layout),
+ op, LIBRADOS_SNAP_DIR, NULL, 0, gather_ctx.new_sub());
+ on_finish->add_snap_set_context(std::move(ssc));
+ ++scan_idx;
+ --scans;
+ }
+
+ gather_ctx.activate();
+}
+
+void MDCache::aggregate_snap_sets(const std::vector<std::unique_ptr<SnapSetContext>> &snap_set_ctx,
+ CInode *in1, CInode *in2, BlockDiff *block_diff, Context *on_finish) {
+ dout(20) << __func__ << dendl;
+
+ // always signal to the client to request again since request
+ // completion is signalled in file_blockdiff().
+ int r = 1;
+ snapid_t snapid1 = in1->last;
+ snapid_t snapid2 = in2->last;
+ uint64_t scans = snap_set_ctx.size();
+
+ interval_set<uint64_t> extents;
+ for (auto &snap_set : snap_set_ctx) {
+ dout(20) << __func__ << ": objectid=" << snap_set->objectid << ", r=" << snap_set->r
+ << dendl;
+ if (snap_set->r != 0 && snap_set->r != -ENOENT) {
+ derr << ": failed to get snap set for objectid=" << snap_set->objectid
+ << ", r=" << snap_set->r << dendl;
+ r = snap_set->r;
+ break;
+ }
+
+ if (snap_set->r == 0) {
+ auto &clones = snap_set->snaps.clones;
+ auto it1 = std::find_if(clones.begin(), clones.end(),
+ [snapid1](const librados::clone_info_t &clone)
+ {
+ return snapid1 == clone.cloneid ||
+ (std::find(clone.snaps.begin(), clone.snaps.end(), snapid1) != clone.snaps.end());
+ });
+ // point to "head" if not found
+ if (it1 == clones.end()) {
+ it1 = std::prev(it1);
+ }
+ auto it2 = std::find_if(clones.begin(), clones.end(),
+ [snapid2](const librados::clone_info_t &clone)
+ {
+ return snapid2 == clone.cloneid ||
+ (std::find(clone.snaps.begin(), clone.snaps.end(), snapid2) != clone.snaps.end());
+ });
+ // point to "head" if not found
+ if (it2 == clones.end()) {
+ it2 = std::prev(it2);
+ }
+
+ if (it1 == it2) {
+ dout(10) << __func__ << ": both snaps in same clone" << dendl;
+ continue;
+ }
+
+ interval_set<uint64_t> extent;
+ uint64_t offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
+ snap_set->objectid, 0);
+
+ for (auto hops = std::distance(it1, it2); hops > 0; --hops) {
+ dout(20) << __func__ << ": [cloneid: " << it1->cloneid << " snaps: " << it1->snaps
+ << " overlap: " << it1->overlap << "]" << dendl;
+ auto next_it = it1 + 1;
+ dout(20) << __func__ << ": [next cloneid: " << next_it->cloneid << " snaps: " << next_it->snaps
+ << " overlap: " << next_it->overlap << "]" << dendl;
+ auto sz = next_it->size;
+ if (sz == 0) {
+ // this object is a hole in the file.
+ // TODO: report holes in blockdiff strucuter. that way,
+ // caller can optimize and punch holes rather than writing
+ // zeros.
+ dout(10) << __func__ << ": hole: [" << offset << "~" << it1->size << "]" << dendl;
+ dout(10) << __func__ << ": adding whole extent - reader will read zeros" << dendl;
+ sz = it1->size;
+ }
+
+ extent.clear();
+ extent.union_insert(offset, sz);
+ for (auto &overlap_region : it1->overlap) {
+ uint64_t overlap_offset = Striper::get_file_offset(g_ceph_context, &(in2->get_inode()->layout),
+ snap_set->objectid, overlap_region.first);
+ extent.erase(overlap_offset, overlap_region.second);
+ }
+
+ dout(20) << __func__ << ": (non overlapping) extent=" << extent << dendl;
+ extents.union_of(extent);
+ dout(20) << __func__ << ": (modified) extents=" << extents << dendl;
+ ++it1;
+ }
+ }
+ }
+
+ block_diff->rval = r;
+ if (r >= 0) {
+ r = 0;
+ block_diff->scan_idx += scans;
+ block_diff->blocks = extents;
+ }
+ on_finish->complete(r);
+}
#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"
+#include "include/rados/rados_types.hpp"
#include "messages/MCacheExpire.h"
#include "messages/MClientQuota.h"
double export_ephemeral_random_max = 0.0;
+ struct SnapSetContext { /* not to be confused with SnapContext */
+ int r;
+ uint64_t objectid;
+ librados::snap_set_t snaps;
+ };
+
+ void file_blockdiff(CInode *in1, CInode *in2, BlockDiff *block_diff, uint64_t max_objects,
+ MDSContext *ctx);
+ void aggregate_snap_sets(const std::vector<std::unique_ptr<SnapSetContext>> &snap_set_ctx,
+ CInode *in1, CInode *in2, BlockDiff *block_diff, Context *on_finish);
+
protected:
// track leader requests whose peers haven't acknowledged commit
struct uleader {
"Request type rename snapshot latency");
plb.add_time_avg(l_mdss_req_snapdiff_latency, "req_snapdiff_latency",
"Request type snapshot difference latency");
+ plb.add_time_avg(l_mdss_req_file_blockdiff_latency, "req_blockdiff_latency",
+ "Request type file blockdiff latency");
plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
case CEPH_MDS_OP_READDIR_SNAPDIFF:
code = l_mdss_req_snapdiff_latency;
break;
+ case CEPH_MDS_OP_FILE_BLOCKDIFF:
+ code = l_mdss_req_file_blockdiff_latency;
+ break;
default:
dout(1) << ": unknown client op" << dendl;
return;
case CEPH_MDS_OP_READDIR_SNAPDIFF:
handle_client_readdir_snapdiff(mdr);
break;
+ case CEPH_MDS_OP_FILE_BLOCKDIFF:
+ handle_client_file_blockdiff(mdr);
+ break;
default:
dout(1) << " unknown client op " << req->get_op() << dendl;
}
};
+CInode* Server::rdlock_path_pin_ref(const MDRequestRef& mdr,
+ bool want_auth,
+ bool no_want_auth)
+{
+ const filepath& refpath = mdr->get_filepath();
+ return rdlock_path_pin_ref(mdr, refpath, want_auth, no_want_auth);
+}
+
/* If this returns null, the request has been handled
* as appropriate: forwarded on, or the client's been replied to */
CInode* Server::rdlock_path_pin_ref(const MDRequestRef& mdr,
+ const filepath& refpath,
bool want_auth,
bool no_want_auth)
{
- const filepath& refpath = mdr->get_filepath();
dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
if (mdr->locking_state & MutationImpl::PATH_LOCKED)
respond_to_request(mdr, 0);
}
+class C_MDS_file_blockdiff_finish : public ServerContext {
+public:
+ C_MDS_file_blockdiff_finish(Server *server, const MDRequestRef& mdr, CInode *in, uint64_t scan_idx)
+ : ServerContext(server),
+ mdr(mdr),
+ in(in) {
+ block_diff.rval = 0;
+ block_diff.scan_idx = scan_idx;
+ }
+
+ void finish(int r) override {
+ server->handle_file_blockdiff_finish(mdr, in, block_diff, r);
+ }
+
+private:
+ MDRequestRef mdr;
+ CInode *in;
+
+public:
+ BlockDiff block_diff;
+};
+
+void Server::handle_client_file_blockdiff(const MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest>& req = mdr->client_request;
+
+ dout(10) << __func__ << dendl;
+
+ // no real need to rdlock_path_pin_ref() since the caller holds an
+ // open ref, but we need the snapped inode.
+ const filepath& refpath1 = mdr->get_filepath();
+ CInode* in1 = rdlock_path_pin_ref(mdr, refpath1, false, true);
+ if (!in1) {
+ return;
+ }
+ const filepath& refpath2 = mdr->get_filepath2();
+ CInode* in2 = rdlock_path_pin_ref(mdr, refpath2, false, true);
+ if (!in2) {
+ return;
+ }
+
+ if (!in1->is_file() || !in2->is_file()) {
+ dout(10) << __func__ << ": not a regular file req=" << req << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ dout(20) << __func__ << ": in1=" << *in1 << dendl;
+ dout(20) << __func__ << ": in2=" << *in2 << dendl;
+
+ auto scan_idx = (uint64_t)req->head.args.blockdiff.scan_idx;
+ auto max_objects = (uint32_t)req->head.args.blockdiff.max_objects;
+
+ C_MDS_file_blockdiff_finish *ctx = new C_MDS_file_blockdiff_finish(this, mdr, in2, scan_idx);
+
+ if (in1 == in2) {
+ // does not matter if the inodes are snapped or refer to the head
+ // version -- both snaps are same.
+ dout(10) << __func__ << ": no diffs between snaps" << dendl;
+ handle_file_blockdiff_finish(mdr, in2, ctx->block_diff, 0);
+ delete ctx;
+ return;
+ }
+
+ mdcache->file_blockdiff(in1, in2, &(ctx->block_diff), max_objects, ctx);
+}
+
+void Server::handle_file_blockdiff_finish(const MDRequestRef& mdr, CInode *in, const BlockDiff &block_diff,
+ int r) {
+ dout(10) << __func__ << ": in=" << *in << ", r=" << r << dendl;
+ if (r == 0) {
+ dout(10) << __func__ << ": blockdiff=" << block_diff << dendl;
+ }
+
+ ceph::bufferlist bl;
+ if (r == 0) {
+ encode(block_diff, bl);
+ mdr->reply_extra_bl = bl;
+ }
+
+ respond_to_request(mdr, r);
+}
+
void Server::handle_client_readdir_snapdiff(const MDRequestRef& mdr)
{
const cref_t<MClientRequest>& req = mdr->client_request;
l_mdss_cap_revoke_eviction,
l_mdss_cap_acquisition_throttle,
l_mdss_req_getvxattr_latency,
+ l_mdss_req_file_blockdiff_latency,
l_mdss_last,
};
void _try_open_ino(const MDRequestRef& mdr, int r, inodeno_t ino);
CInode* rdlock_path_pin_ref(const MDRequestRef& mdr, bool want_auth,
bool no_want_auth=false);
+ CInode* rdlock_path_pin_ref(const MDRequestRef& mdr, const filepath& refpath, bool want_auth,
+ bool no_want_auth=false);
CDentry* rdlock_path_xlock_dentry(const MDRequestRef& mdr, bool create,
bool okexist=false, bool authexist=false,
bool want_layout=false);
void handle_client_renamesnap(const MDRequestRef& mdr);
void _renamesnap_finish(const MDRequestRef& mdr, CInode *diri, snapid_t snapid);
void handle_client_readdir_snapdiff(const MDRequestRef& mdr);
+ void handle_client_file_blockdiff(const MDRequestRef& mdr);
+ void handle_file_blockdiff_finish(const MDRequestRef& mdr, CInode *in, const BlockDiff &block_diff,
+ int r);
// helpers
bool _rename_prepare_witness(const MDRequestRef& mdr, mds_rank_t who, std::set<mds_rank_t> &witnesse,
ls.back()->realm.seq = 2;
ls.back()->realm.parent = 1;
}
+
+/*
+ * file block diffs
+ */
+void BlockDiff::encode(bufferlist& bl) const {
+ using ceph::encode;
+ ENCODE_START(1, 1, bl);
+ encode(rval, bl);
+ encode(scan_idx, bl);
+ encode(blocks, bl);
+ ENCODE_FINISH(bl);
+}
+
+void BlockDiff::decode(bufferlist::const_iterator &p) {
+ using ceph::decode;
+ DECODE_START(1, p);
+ decode(rval, p);
+ decode(scan_idx, p);
+ decode(blocks, p);
+ DECODE_FINISH(p);
+}
+
+void BlockDiff::dump(Formatter *f) const {
+ f->dump_int("rval", rval);
+ f->dump_unsigned("scan_idx", scan_idx);
+ f->dump_stream("blocks") << blocks;
+}
+
+void BlockDiff::generate_test_instances(std::list<BlockDiff*>& ls)
+{
+ ls.push_back(new BlockDiff());
+ ls.push_back(new BlockDiff());
+ ls.back()->rval = 0;
+ ls.back()->scan_idx = 1;
+ ls.back()->blocks.union_insert(0, 200);
+}
+
+void BlockDiff::print(ostream& out) const
+{
+ out << "{rval: " << rval << ", scan_idx=" << scan_idx << ", blocks=" << blocks << "}";
+}
}
WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
+struct BlockDiff {
+ int rval;
+ uint64_t scan_idx;
+ interval_set<uint64_t> blocks;
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<BlockDiff*>& ls);
+ void print(std::ostream& out) const;
+};
+WRITE_CLASS_ENCODER(BlockDiff);
+
#endif