From: Igor Fedotov Date: Thu, 24 Jun 2021 11:27:53 +0000 (+0300) Subject: cephfs: imlplement readdir_snapdiff API X-Git-Tag: v18.2.1~138^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2ed401bbafb183924b28ea8fec275a8677a83426;p=ceph.git cephfs: imlplement readdir_snapdiff API Signed-off-by: Denis Barahtanov Signed-off-by: Igor Fedotov (cherry picked from commit e04b10f3f282a3eebc7223e76249fe2d1224eabc) --- diff --git a/src/client/Client.cc b/src/client/Client.cc index 8a0033441eae..3e385a0e7a2a 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -72,6 +72,7 @@ #include "mds/flock.h" #include "mds/cephfs_features.h" +#include "mds/snap.h" #include "osd/OSDMap.h" #include "osdc/Filer.h" @@ -1297,7 +1298,8 @@ void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete) /* * insert results from readdir or lssnap into the metadata cache. */ -void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) { +void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, + Inode *diri, Inode *diri_other) { auto& reply = request->reply; ConnectionRef con = request->reply->get_connection(); @@ -1312,7 +1314,8 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, dir_result_t *dirp = request->dirp; ceph_assert(dirp); - // the extra buffer list is only set for readdir and lssnap replies + // the extra buffer list is only set for readdir, lssnap and + // readdir_snapdiff replies auto p = reply->get_extra_bl().cbegin(); if (!p.end()) { // snapdir? 
@@ -1320,10 +1323,27 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, ceph_assert(diri); diri = open_snapdir(diri); } + bool snapdiff_req = request->head.op == CEPH_MDS_OP_READDIR_SNAPDIFF; + frag_t fg; + unsigned offset_hash; + if (snapdiff_req) { + fg = (unsigned)request->head.args.snapdiff.frag; + offset_hash = (unsigned)request->head.args.snapdiff.offset_hash; + } else { + fg = (unsigned)request->head.args.readdir.frag; + offset_hash = (unsigned)request->head.args.readdir.offset_hash; + } // only open dir if we're actually adding stuff to it! Dir *dir = diri->open_dir(); ceph_assert(dir); + //open opponent dir for snapdiff if any + Dir *dir_other = nullptr; + if (snapdiff_req) { + ceph_assert(diri_other); + dir_other = diri_other->open_dir(); + ceph_assert(dir_other); + } // dirstat DirStat dst(p, features); @@ -1335,7 +1355,6 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END); bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER); - frag_t fg = (unsigned)request->head.args.readdir.frag; unsigned readdir_offset = dirp->next_offset; string readdir_start = dirp->last_name; ceph_assert(!readdir_start.empty() || readdir_offset == 2); @@ -1346,7 +1365,7 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start)); } else if (flags & CEPH_READDIR_OFFSET_HASH) { /* mds understands offset_hash */ - last_hash = (unsigned)request->head.args.readdir.offset_hash; + last_hash = offset_hash; } } @@ -1391,13 +1410,22 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *in = add_update_inode(&ist, request->sent_stamp, session, request->perms); + auto *effective_dir = dir; + auto *effective_diri = diri; + + if (snapdiff_req && in->snapid != diri->snapid) { + ceph_assert(diri_other); + ceph_assert(dir_other); + effective_diri = 
diri_other; + effective_dir = dir_other; + } Dentry *dn; - if (diri->dir->dentries.count(dname)) { - Dentry *olddn = diri->dir->dentries[dname]; + if (effective_dir->dentries.count(dname)) { + Dentry *olddn = effective_dir->dentries[dname]; if (olddn->inode != in) { // replace incorrect dentry unlink(olddn, true, true); // keep dir, dentry - dn = link(dir, dname, in, olddn); + dn = link(effective_dir, dname, in, olddn); ceph_assert(dn == olddn); } else { // keep existing dn @@ -1406,13 +1434,13 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, } } else { // new dn - dn = link(dir, dname, in, NULL); + dn = link(effective_dir, dname, in, NULL); } dn->alternate_name = std::move(dlease.alternate_name); update_dentry_lease(dn, &dlease, request->sent_stamp, session); if (hash_order) { - unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname)); + unsigned hash = ceph_frag_value(effective_diri->hash_dentry_name(dname)); if (hash != last_hash) readdir_offset = 2; last_hash = hash; @@ -1421,20 +1449,21 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false); } // add to readdir cache - if (dirp->release_count == diri->dir_release_count && - dirp->ordered_count == diri->dir_ordered_count && - dirp->start_shared_gen == diri->shared_gen) { - if (dirp->cache_index == dir->readdir_cache.size()) { + if (!snapdiff_req && + dirp->release_count == effective_diri->dir_release_count && + dirp->ordered_count == effective_diri->dir_ordered_count && + dirp->start_shared_gen == effective_diri->shared_gen) { + if (dirp->cache_index == effective_dir->readdir_cache.size()) { if (i == 0) { ceph_assert(!dirp->inode->is_complete_and_ordered()); dir->readdir_cache.reserve(dirp->cache_index + numdn); } - dir->readdir_cache.push_back(dn); - } else if (dirp->cache_index < dir->readdir_cache.size()) { + effective_dir->readdir_cache.push_back(dn); + } else if 
(dirp->cache_index < effective_dir->readdir_cache.size()) { if (dirp->inode->is_complete_and_ordered()) - ceph_assert(dir->readdir_cache[dirp->cache_index] == dn); + ceph_assert(effective_dir->readdir_cache[dirp->cache_index] == dn); else - dir->readdir_cache[dirp->cache_index] = dn; + effective_dir->readdir_cache[dirp->cache_index] = dn; } else { ceph_abort_msg("unexpected readdir buffer idx"); } @@ -1454,6 +1483,8 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, if (dir->is_empty()) close_dir(dir); + if (dir_other && dir_other->is_empty()) + close_dir(dir_other); } } @@ -1613,10 +1644,20 @@ Inode* Client::insert_trace(MetaRequest *request, MetaSession *session) if (in) { if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) { - insert_readdir_results(request, session, in); + insert_readdir_results(request, + session, + in, + nullptr); } else if (op == CEPH_MDS_OP_LOOKUPNAME) { // hack: return parent inode instead in = diri; + } else if (op == CEPH_MDS_OP_READDIR_SNAPDIFF) { + // provide both request's inode (aka snapA) and traced one (snapB) + // to properly match snapdiff results + insert_readdir_results(request, + session, + request->inode(), + in); } if (request->dentry() == NULL && in != request->inode()) { @@ -1690,7 +1731,7 @@ mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri) * I think the MDS should be able to redirect as needed*/ in = in->get_first_parent()->dir->parent_inode; else { - ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl; + ldout(cct, 10) << __func__ << "got unlinked inode, can't look at parent" << dendl; break; } } @@ -3426,12 +3467,17 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn) lru.lru_insert_mid(dn); // mid or top? 
- ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in - << " dn " << dn << " (new dn)" << dendl; + if(in) { + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << *in + << " dn " << *dn << " (new dn)" << dendl; + } else { + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' " + << " dn " << *dn << " (new dn)" << dendl; + } } else { ceph_assert(!dn->inode); - ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in - << " dn " << dn << " (old dn)" << dendl; + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << in + << " dn " << *dn << " (old dn)" << dendl; } if (in) { // link to inode @@ -9092,7 +9138,8 @@ void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp) dirp->buffer.clear(); } -int Client::_readdir_get_frag(dir_result_t *dirp) +int Client::_readdir_get_frag(int op, dir_result_t* dirp, + fill_readdir_args_cb_t fill_req_cb) { ceph_assert(dirp); ceph_assert(dirp->inode); @@ -9107,33 +9154,18 @@ int Client::_readdir_get_frag(dir_result_t *dirp) ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg << " offset " << hex << dirp->offset << dec << dendl; - int op = CEPH_MDS_OP_READDIR; - if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR) - op = CEPH_MDS_OP_LSSNAP; - InodeRef& diri = dirp->inode; MetaRequest *req = new MetaRequest(op); - filepath path; - diri->make_nosnap_relative_path(path); - req->set_filepath(path); - req->set_inode(diri.get()); - req->head.args.readdir.frag = fg; - req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS; - if (dirp->last_name.length()) { - req->path2.set_path(dirp->last_name); - } else if (dirp->hash_order()) { - req->head.args.readdir.offset_hash = dirp->offset_high(); - } - req->dirp = dirp; - + fill_req_cb(dirp, req, diri, fg); + bufferlist dirbl; int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl); if (res 
== -CEPHFS_EAGAIN) { ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl; _readdir_rechoose_frag(dirp); - return _readdir_get_frag(dirp); + return _readdir_get_frag(op, dirp, fill_req_cb); } if (res == 0) { @@ -9158,7 +9190,8 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p, { ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino - << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec + << " last_name " << dirp->last_name + << " offset " << hex << dirp->offset << dec << dendl; Dir *dir = dirp->inode->dir; @@ -9247,8 +9280,57 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p, return 0; } -int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, - unsigned want, unsigned flags, bool getref) +int Client::readdir_r_cb(dir_result_t* d, + add_dirent_cb_t cb, + void* p, + unsigned want, + unsigned flags, + bool getref) +{ + auto fill_readdir_cb = [](dir_result_t* dirp, + MetaRequest* req, + InodeRef& diri, + frag_t fg) { + filepath path; + diri->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(diri.get()); + req->head.args.readdir.frag = fg; + req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS; + if (dirp->last_name.length()) { + req->path2.set_path(dirp->last_name); + } else if (dirp->hash_order()) { + req->head.args.readdir.offset_hash = dirp->offset_high(); + } + req->dirp = dirp; + }; + int op = CEPH_MDS_OP_READDIR; + if (d->inode && d->inode->snapid == CEPH_SNAPDIR) + op = CEPH_MDS_OP_LSSNAP; + return _readdir_r_cb(op, + d, + cb, + fill_readdir_cb, + p, + want, + flags, + getref, + false); +} + +// +// NB: this is used for both readdir and readdir_snapdiff results processing +// hence it should be request type agnostic +// +int Client::_readdir_r_cb(int op, + dir_result_t *d, + add_dirent_cb_t cb, + fill_readdir_args_cb_t fill_cb, + void *p, + 
unsigned want, + unsigned flags, + bool getref, + bool bypass_cache) { int caps = statx_to_mask(flags, want); @@ -9338,12 +9420,14 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, } // can we read from our cache? - ldout(cct, 10) << "offset " << hex << dirp->offset << dec + ldout(cct, 10) << __func__ + << " offset " << hex << dirp->offset << dec << " snapid " << dirp->inode->snapid << " (complete && ordered) " << dirp->inode->is_complete_and_ordered() << " issued " << ccap_string(dirp->inode->caps_issued()) << dendl; - if (dirp->inode->snapid != CEPH_SNAPDIR && + if (!bypass_cache && + dirp->inode->snapid != CEPH_SNAPDIR && dirp->inode->is_complete_and_ordered() && dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) { int err = _readdir_cache_cb(dirp, cb, p, caps, getref); @@ -9357,7 +9441,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, bool check_caps = true; if (!dirp->is_cached()) { - int r = _readdir_get_frag(dirp); + int r = _readdir_get_frag(op, dirp, fill_cb); if (r) return r; // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is @@ -9366,7 +9450,8 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, } frag_t fg = dirp->buffer_frag; - ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size() + ldout(cct, 10) << __func__ + << " frag " << fg << " buffer size " << dirp->buffer.size() << " offset " << hex << dirp->offset << dendl; for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(), @@ -9401,7 +9486,9 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, r = cb(p, &de, &stx, next_off, inode); // _next_ offset cl.lock(); - ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec + ldout(cct, 15) << __func__ + << " de " << de.d_name << " off " << hex << next_off - 1 << dec + << " snap " << entry.inode->snapid << " = " << r << dendl; if (r < 0) return r; @@ -9423,7 +9510,8 @@ int 
Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, continue; } - if (diri->shared_gen == dirp->start_shared_gen && + if (!bypass_cache && + diri->shared_gen == dirp->start_shared_gen && diri->dir_release_count == dirp->release_count) { if (diri->dir_ordered_count == dirp->ordered_count) { ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl; @@ -9529,6 +9617,81 @@ int Client::readdirplus_r(dir_result_t *d, struct dirent *de, return 0; } +int Client::readdir_snapdiff(dir_result_t* d1, snapid_t snap2, + struct dirent* out_de, + snapid_t* out_snap) +{ + if (!d1 || !d1->inode || d1->inode->snapid == snap2) { + lderr(cct) << __func__ << " invalid parameters: " + << " d1:" << d1 + << " d1->inode:" << (d1 ? d1->inode : nullptr) + << " snap2 id :" << snap2 + << dendl; + errno = EINVAL; + return -errno; + } + + auto& de = d1->de; + ceph_statx stx; + single_readdir sr; + sr.de = &de; + sr.stx = &stx; + sr.inode = NULL; + sr.full = false; + + auto fill_snapdiff_cb = [&](dir_result_t* dirp, + MetaRequest* req, + InodeRef& diri, + frag_t fg) { + filepath path; + diri->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(diri.get()); + req->head.args.snapdiff.snap_other = snap2; + req->head.args.snapdiff.frag = fg; + req->head.args.snapdiff.flags = CEPH_READDIR_REPLY_BITFLAGS; + if (dirp->last_name.length()) { + req->path2.set_path(dirp->last_name); + } else if (dirp->hash_order()) { + req->head.args.snapdiff.offset_hash = dirp->offset_high(); + } + req->dirp = dirp; + }; + + // our callback fills the dirent and sets sr.full=true on first + // call, and returns -1 the second time around. + int ret = _readdir_r_cb(CEPH_MDS_OP_READDIR_SNAPDIFF, + d1, + _readdir_single_dirent_cb, + fill_snapdiff_cb, + (void*)&sr, + 0, + AT_STATX_DONT_SYNC, + false, + true); + if (ret < -1) { + lderr(cct) << __func__ << " error: " + << cpp_strerror(ret) + << dendl; + errno = -ret; // this sucks. 
+ return ret; + } + + ldout(cct, 15) << __func__ << " " << ret + << " " << sr.de->d_name + << " " << stx.stx_dev + << dendl; + if (sr.full) { + if (out_de) { + *out_de = de; + } + if (out_snap) { + *out_snap = stx.stx_dev; + } + return 1; + } + return 0; +} /* getdents */ struct getdents_result { diff --git a/src/client/Client.h b/src/client/Client.h index f71aab33f940..911a8b460dfa 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -362,6 +362,13 @@ public: int readdir_r(dir_result_t *dirp, struct dirent *de); int readdirplus_r(dir_result_t *dirp, struct dirent *de, struct ceph_statx *stx, unsigned want, unsigned flags, Inode **out); + /* + * Get the next snapshot delta entry. + * + */ + int readdir_snapdiff(dir_result_t* dir1, snapid_t snap2, + struct dirent* out_de, snapid_t* out_snap); + int getdir(const char *relpath, std::list& names, const UserPerm& perms); // get the whole dir at once. @@ -791,7 +798,8 @@ public: void update_dir_dist(Inode *in, DirStat *st, mds_rank_t from); void clear_dir_complete_and_ordered(Inode *diri, bool complete); - void insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri); + void insert_readdir_results(MetaRequest *request, MetaSession *session, + Inode *diri, Inode *diri_other); Inode* insert_trace(MetaRequest *request, MetaSession *session); void update_inode_file_size(Inode *in, int issued, uint64_t size, uint64_t truncate_seq, uint64_t truncate_size); @@ -1271,6 +1279,8 @@ private: MAY_READ = 4, }; + typedef std::function fill_readdir_args_cb_t; + std::unique_ptr> cct_deleter; /* Flags for VXattr */ @@ -1291,8 +1301,19 @@ private: bool _readdir_have_frag(dir_result_t *dirp); void _readdir_next_frag(dir_result_t *dirp); void _readdir_rechoose_frag(dir_result_t *dirp); - int _readdir_get_frag(dir_result_t *dirp); + int _readdir_get_frag(int op, dir_result_t *dirp, + fill_readdir_args_cb_t fill_req_cb); int _readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p, int caps, bool 
getref); + int _readdir_r_cb(int op, + dir_result_t* d, + add_dirent_cb_t cb, + fill_readdir_args_cb_t fill_cb, + void* p, + unsigned want, + unsigned flags, + bool getref, + bool bypass_cache); + void _closedir(dir_result_t *dirp); // other helpers diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc index ca044cc85d0a..18dcc701b31d 100644 --- a/src/common/ceph_strings.cc +++ b/src/common/ceph_strings.cc @@ -300,6 +300,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return "rmsnap"; case CEPH_MDS_OP_RENAMESNAP: return "renamesnap"; + case CEPH_MDS_OP_READDIR_SNAPDIFF: return "readdir_snapdiff"; case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; case CEPH_MDS_OP_FRAGMENTDIR: return "fragmentdir"; diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 1a75a5193336..42e5e53b438a 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -418,6 +418,7 @@ enum { CEPH_MDS_OP_RMSNAP = 0x01401, CEPH_MDS_OP_LSSNAP = 0x00402, CEPH_MDS_OP_RENAMESNAP = 0x01403, + CEPH_MDS_OP_READDIR_SNAPDIFF = 0x01404, // internal op CEPH_MDS_OP_FRAGMENTDIR= 0x01500, @@ -473,12 +474,12 @@ int ceph_flags_sys2wire(int flags); #define CEPH_XATTR_REMOVE (1 << 31) /* - * readdir request flags; + * readdir/readdir_snapdiff request flags; */ #define CEPH_READDIR_REPLY_BITFLAGS (1<<0) /* - * readdir reply flags. + * readdir/readdir_snapdiff reply flags. 
*/ #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) @@ -622,6 +623,14 @@ union ceph_mds_request_args { __le64 parent; __le32 hash; } __attribute__ ((packed)) lookupino; + struct { + __le32 frag; /* which dir fragment */ + __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; + __le16 flags; + __le32 offset_hash; + __le64 snap_other; + } __attribute__ ((packed)) snapdiff; } __attribute__ ((packed)); #define CEPH_MDS_REQUEST_HEAD_VERSION 2 diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h index 62e0b51c2d31..dc62698fa482 100644 --- a/src/include/cephfs/libcephfs.h +++ b/src/include/cephfs/libcephfs.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "ceph_ll_client.h" @@ -112,6 +113,11 @@ struct snap_info { struct snap_metadata *snap_metadata; }; +struct ceph_snapdiff_entry_t { + struct dirent dir_entry; + uint64_t snapid; //should be snapid_t but prefer not to expose it +}; + /* setattr mask bits (up to an int in size) */ #ifndef CEPH_SETATTR_MODE #define CEPH_SETATTR_MODE (1 << 0) @@ -609,6 +615,53 @@ int ceph_readdir_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, int ceph_readdirplus_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de, struct ceph_statx *stx, unsigned want, unsigned flags, struct Inode **out); +struct ceph_snapdiff_info +{ + struct ceph_mount_info* cmount; + struct ceph_dir_result* dir1; // primary dir entry to build snapdiff for. + struct ceph_dir_result* dir_aux; // aux dir entry to identify the second snapshot. + // Can point to the parent dir entry if entry-in-question + // doesn't exist in the second snapshot +}; + +/** + * Opens snapdiff stream to get snapshots delta (aka snapdiff). + * + * @param cmount the ceph mount handle to use for snapdiff retrieval.
+ * @param root_path root path for snapshots-in-question + * @param rel_path subpath under the root to build delta for + * @param snap1 the first snapshot name + * @param snap2 the second snapshot name + * @param out resulting snapdiff stream handle to be used for snapdiff results + * retrieval via ceph_readdir_snapdiff() + * @returns 0 on success and negative error code otherwise + */ +int ceph_open_snapdiff(struct ceph_mount_info* cmount, + const char* root_path, + const char* rel_path, + const char* snap1, + const char* snap2, + struct ceph_snapdiff_info* out); +/** + * Get the next snapshot delta entry. + * + * @param snapdiff snapdiff stream handle opened via ceph_open_snapdiff() + * @param out the next snapdiff entry which includes directory entry and the + * entry's snapshot id - the later snapshot's id for an emerged/existing + * entry or the former snapshot's id for a removed entry. + * @returns >0 on success, 0 if no more entries in the stream and negative + * error code otherwise + */ +int ceph_readdir_snapdiff(struct ceph_snapdiff_info* snapdiff, + struct ceph_snapdiff_entry_t* out); +/** + * Close snapdiff stream. + * + * @param snapdiff snapdiff stream handle opened via ceph_open_snapdiff() + * @returns 0 on success and negative error code otherwise + */ +int ceph_close_snapdiff(struct ceph_snapdiff_info* snapdiff); + /** * Gets multiple directory entries.
* diff --git a/src/libcephfs.cc b/src/libcephfs.cc index 99da0c5c5c8d..51e73efdb65e 100644 --- a/src/libcephfs.cc +++ b/src/libcephfs.cc @@ -19,6 +19,7 @@ #include "auth/Crypto.h" #include "client/Client.h" +#include "client/Inode.h" #include "librados/RadosClient.h" #include "common/async/context_pool.h" #include "common/ceph_argparse.h" @@ -28,6 +29,7 @@ #include "mon/MonClient.h" #include "include/str_list.h" #include "include/stringify.h" +#include "include/object.h" #include "messages/MMonMap.h" #include "msg/Messenger.h" #include "include/ceph_assert.h" @@ -687,6 +689,124 @@ extern "C" int ceph_readdirplus_r(struct ceph_mount_info *cmount, struct ceph_di return cmount->get_client()->readdirplus_r(reinterpret_cast(dirp), de, stx, want, flags, out); } +extern "C" int ceph_open_snapdiff(struct ceph_mount_info* cmount, + const char* root_path, + const char* rel_path, + const char* snap1, + const char* snap2, + struct ceph_snapdiff_info* out) +{ + if (!cmount->is_mounted()) { + /* we set errno to signal errors. 
*/ + errno = ENOTCONN; + return -errno; + } + if (!out || !root_path || !rel_path || + !snap1 || !*snap1 || !snap2 || !*snap2) { + errno = EINVAL; + return -errno; + } + out->cmount = cmount; + out->dir1 = out->dir_aux = nullptr; + + char full_path1[PATH_MAX]; + char snapdir[PATH_MAX]; + cmount->conf_get("client_snapdir", snapdir, sizeof(snapdir) - 1); + int n = snprintf(full_path1, PATH_MAX, + "%s/%s/%s/%s", root_path, snapdir, snap1, rel_path); + if (n < 0 || n == PATH_MAX) { + errno = ENAMETOOLONG; + return -errno; + } + char full_path2[PATH_MAX]; + n = snprintf(full_path2, PATH_MAX, + "%s/%s/%s/%s", root_path, snapdir, snap2, rel_path); + if (n < 0 || n == PATH_MAX) { + errno = ENAMETOOLONG; + return -errno; + } + + int r = ceph_opendir(cmount, full_path1, &(out->dir1)); + if (r != 0) { + //it's OK to have one of the snap paths absent - attempting another one + r = ceph_opendir(cmount, full_path2, &(out->dir1)); + if (r != 0) { + // both snaps are absent, giving up + errno = ENOENT; + return -errno; + } + std::swap(snap1, snap2); // after the swap snap2 names the original snap1; it is used to learn snap_other below + } else { + // trying to open second snapshot to learn snapid and + // get the entry loaded into the client cache if any. + r = ceph_opendir(cmount, full_path2, &(out->dir_aux)); + //paranoid: the check below relies on dir_aux being null on failure + out->dir_aux = r == 0 ? out->dir_aux : nullptr; + } + if (!out->dir_aux) { + // now trying to learn the second snapshot's id by using snapshot's root + n = snprintf(full_path2, PATH_MAX, + "%s/%s/%s", root_path, snapdir, snap2); + ceph_assert(n > 0 && n < PATH_MAX); //we've already checked above + //that longer string fits.
+ // Hence unlikely to assert + r = ceph_opendir(cmount, full_path2, &(out->dir_aux)); + if (r != 0) { + goto close_err; + } + } + return 0; + +close_err: + ceph_close_snapdiff(out); + return r; +} + +extern "C" int ceph_readdir_snapdiff(struct ceph_snapdiff_info* snapdiff, + struct ceph_snapdiff_entry_t* out) +{ + if (!snapdiff->cmount->is_mounted()) { + /* also sets errno to signal errors. */ + errno = ENOTCONN; + return -errno; + } + dir_result_t* d1 = reinterpret_cast(snapdiff->dir1); + dir_result_t* d2 = reinterpret_cast(snapdiff->dir_aux); + if (!d1 || !d2 || !d1->inode || !d2->inode) { + errno = EINVAL; + return -errno; + } + snapid_t snapid; + int r = snapdiff->cmount->get_client()->readdir_snapdiff( + d1, + d2->inode->snapid, + &(out->dir_entry), + &snapid); + if (r >= 0) { + // converting snapid_t to uint64_t to avoid snapid_t exposure + out->snapid = snapid; + } + return r; +} + +extern "C" int ceph_close_snapdiff(struct ceph_snapdiff_info* snapdiff) +{ + if (!snapdiff->cmount || !snapdiff->cmount->is_mounted()) { + /* also sets errno to signal errors. */ + errno = ENOTCONN; + return -errno; + } + if (snapdiff->dir_aux) { + ceph_closedir(snapdiff->cmount, snapdiff->dir_aux); + } + if (snapdiff->dir1) { + ceph_closedir(snapdiff->cmount, snapdiff->dir1); + } + snapdiff->cmount = nullptr; + snapdiff->dir1 = snapdiff->dir_aux = nullptr; + return 0; +} + extern "C" int ceph_getdents(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *buf, int buflen) { diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 61058ec17fcd..0a43b09f6a06 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8674,7 +8674,7 @@ int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, // success. 
if (mds->logger) mds->logger->inc(l_mds_traverse_hit); dout(10) << "path_traverse finish on snapid " << snapid << dendl; - if (mdr) + if (mdr) ceph_assert(mdr->snapid == snapid); if (flags & MDS_TRAVERSE_RDLOCK_SNAP) diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index 09ebe7052a90..092651a7c1f7 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -416,6 +416,7 @@ struct MDRequestImpl : public MutationImpl { CInode *in[2] = {}; CDentry *straydn = nullptr; snapid_t snapid = CEPH_NOSNAP; + snapid_t snapid_diff_other = CEPH_NOSNAP; CInode *tracei = nullptr; CDentry *tracedn = nullptr; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index bf12cb7e2c8f..69045ea04473 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -242,6 +242,8 @@ void Server::create_logger() "Request type remove snapshot latency"); plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency", "Request type rename snapshot latency"); + plb.add_time_avg(l_mdss_req_snapdiff_latency, "req_snapdiff_latency", + "Request type snapshot difference latency"); plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", @@ -2125,6 +2127,9 @@ void Server::perf_gather_op_latency(const cref_t &req, utime_t l case CEPH_MDS_OP_RENAMESNAP: code = l_mdss_req_renamesnap_latency; break; + case CEPH_MDS_OP_READDIR_SNAPDIFF: + code = l_mdss_req_snapdiff_latency; + break; default: dout(1) << ": unknown client op" << dendl; return; @@ -2388,7 +2393,8 @@ void Server::set_trace_dist(const ref_t &reply, // inode if (in) { in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps); - dout(20) << "set_trace_dist added in " << *in << dendl; + dout(20) << "set_trace_dist added snap " << snapid << " in " << *in + << dendl; reply->head.is_target = 1; } else reply->head.is_target = 0; @@ -2752,6 +2758,9 @@ void Server::dispatch_client_request(MDRequestRef& mdr) case CEPH_MDS_OP_RENAMESNAP: 
handle_client_renamesnap(mdr); break; + case CEPH_MDS_OP_READDIR_SNAPDIFF: + handle_client_readdir_snapdiff(mdr); + break; default: dout(1) << " unknown client op " << req->get_op() << dendl; @@ -4732,6 +4741,47 @@ void Server::handle_client_openc(MDRequestRef& mdr) } +void Server::_finalize_readdir(MDRequestRef& mdr, + CInode *diri, + CDir* dir, + bool start, + bool end, + __u16 flags, + __u32 numfiles, + bufferlist& dirbl, + bufferlist& dnbl) +{ + const cref_t &req = mdr->client_request; + Session *session = mds->get_session(req); + + session->touch_readdir_cap(numfiles); + + if (end) { + flags |= CEPH_READDIR_FRAG_END; + if (start) + flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve + } + + // finish final blob + encode(numfiles, dirbl); + encode(flags, dirbl); + dirbl.claim_append(dnbl); + + // yay, reply + dout(10) << "reply to " << *req << " readdir num=" << numfiles + << " bytes=" << dirbl.length() + << " start=" << (int)start + << " end=" << (int)end + << dendl; + mdr->reply_extra_bl = dirbl; + + // bump popularity. NOTE: this doesn't quite capture it. 
+  mds->balancer->hit_dir(dir, META_POP_READDIR, numfiles);
+
+  // reply
+  mdr->tracei = diri;
+  respond_to_request(mdr, 0);
+}
 
 void Server::handle_client_readdir(MDRequestRef& mdr)
 {
@@ -4937,7 +4987,7 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
       dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
       break;
     }
-    
+
     unsigned start_len = dnbl.length();
 
     // dentry
@@ -4946,7 +4996,7 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
     mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
 
     // inode
-    dout(12) << "including inode " << *in << dendl;
+    dout(12) << "including inode in " << *in << " snap " << snapid << dendl;
     int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
     if (r < 0) {
       // chop off dn->name, lease
@@ -4962,39 +5012,12 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
     // touch dn
     mdcache->lru.lru_touch(dn);
   }
-
-  session->touch_readdir_cap(numfiles);
-
   __u16 flags = 0;
-  if (end) {
-    flags = CEPH_READDIR_FRAG_END;
-    if (start)
-      flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
-  }
   // client only understand END and COMPLETE flags ?
   if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
     flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
   }
-
-  // finish final blob
-  encode(numfiles, dirbl);
-  encode(flags, dirbl);
-  dirbl.claim_append(dnbl);
-
-  // yay, reply
-  dout(10) << "reply to " << *req << " readdir num=" << numfiles
-           << " bytes=" << dirbl.length()
-           << " start=" << (int)start
-           << " end=" << (int)end
-           << dendl;
-  mdr->reply_extra_bl = dirbl;
-
-  // bump popularity. NOTE: this doesn't quite capture it.
-  mds->balancer->hit_dir(dir, META_POP_READDIR, numfiles);
-
-  // reply
-  mdr->tracei = diri;
-  respond_to_request(mdr, 0);
+  _finalize_readdir(mdr, diri, dir, start, end, flags, numfiles, dirbl, dnbl);
 }
 
 
@@ -11372,7 +11395,8 @@ void Server::handle_client_renamesnap(MDRequestRef& mdr)
     return;
   }
 
-  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());  
+  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
+
   dout(10) << " snapname " << srcname << " is " << snapid << dendl;
 
   // lock snap
@@ -11457,6 +11481,171 @@ void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid
   respond_to_request(mdr, 0);
 }
 
+/**
+ * Handle a client readdir_snapdiff request: list the entries of one
+ * dirfrag that differ between mdr->snapid and the snapshot given in
+ * req->head.args.snapdiff.snap_other.
+ *
+ * Replies -CEPHFS_ENOTDIR if the target is not a directory and
+ * -CEPHFS_EINVAL if the two snapids are equal or either is CEPH_NOSNAP.
+ * The request is re-queued when the session is cap-throttled, the dirfrag
+ * is frozen or its contents still have to be fetched.
+ */
+void Server::handle_client_readdir_snapdiff(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest>& req = mdr->client_request;
+  Session* session = mds->get_session(req);
+  MutationImpl::LockOpVec lov;
+  CInode* diri = rdlock_path_pin_ref(mdr, false, true);
+  if (!diri) return;
+
+  // it's a directory, right?
+  if (!diri->is_dir()) {
+    // not a dir
+    dout(10) << "reply to " << *req << " snapdiff -CEPHFS_ENOTDIR" << dendl;
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+
+  auto num_caps = session->get_num_caps();
+  auto session_cap_acquisition = session->get_cap_acquisition();
+
+  if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
+    dout(20) << "snapdiff throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
+             << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
+    if (logger)
+      logger->inc(l_mdss_cap_acquisition_throttle);
+
+    mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
+    return;
+  }
+
+  lov.add_rdlock(&diri->filelock);
+  lov.add_rdlock(&diri->dirfragtreelock);
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  if (!check_access(mdr, diri, MAY_READ))
+    return;
+
+  // which frag?
+  frag_t fg = (__u32)req->head.args.snapdiff.frag;
+  unsigned req_flags = (__u32)req->head.args.snapdiff.flags;
+  string offset_str = req->get_path2();
+
+  __u32 offset_hash = 0;
+  if (!offset_str.empty()) {
+    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
+  } else {
+    offset_hash = (__u32)req->head.args.snapdiff.offset_hash;
+  }
+
+  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
+           << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
+
+  // does the frag exist?
+  if (diri->dirfragtree[fg.value()] != fg) {
+    frag_t newfg;
+    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+      if (fg.contains((unsigned)offset_hash)) {
+        newfg = diri->dirfragtree[offset_hash];
+      } else {
+        // client actually wants next frag
+        newfg = diri->dirfragtree[fg.value()];
+      }
+    } else {
+      offset_str.clear();
+      newfg = diri->dirfragtree[fg.value()];
+    }
+    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
+    fg = newfg;
+  }
+
+  CDir* dir = try_open_auth_dirfrag(diri, fg, mdr);
+  if (!dir) return;
+
+  // ok!
+  dout(10) << __func__<< " on " << *dir << dendl;
+  ceph_assert(dir->is_auth());
+
+  if (!dir->is_complete()) {
+    if (dir->is_frozen()) {
+      dout(7) << "dir is frozen " << *dir << dendl;
+      mds->locker->drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+    // fetch
+    dout(10) << " incomplete dir contents for snapdiff on " << *dir << ", fetching" << dendl;
+    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+    return;
+  }
+
+#ifdef MDS_VERIFY_FRAGSTAT
+  dir->verify_fragstat();
+#endif
+
+  utime_t now = ceph_clock_now();
+  mdr->set_mds_stamp(now);
+
+  mdr->snapid_diff_other = (uint64_t)req->head.args.snapdiff.snap_other;
+  if (mdr->snapid_diff_other == mdr->snapid ||
+      mdr->snapid == CEPH_NOSNAP ||
+      mdr->snapid_diff_other == CEPH_NOSNAP) {
+    dout(10) << "reply to " << *req << " snapdiff -CEPHFS_EINVAL" << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    // the request has been answered; falling through would reply a second time
+    return;
+  }
+
+  dout(10) << __func__
+           << " snap " << mdr->snapid
+           << " vs. snap " << mdr->snapid_diff_other
+           << dendl;
+
+  SnapRealm* realm = diri->find_snaprealm();
+
+  unsigned max = req->head.args.snapdiff.max_entries;
+  if (!max)
+    max = dir->get_num_any(); // whatever, something big.
+  unsigned max_bytes = req->head.args.snapdiff.max_bytes;
+  if (!max_bytes)
+    // make sure at least one item can be encoded
+    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+
+  // start final blob
+  bufferlist dirbl;
+  DirStat ds;
+  ds.frag = dir->get_frag();
+  ds.auth = dir->get_dir_auth().first;
+  if (dir->is_auth() && !forward_all_requests_to_auth)
+    dir->get_dist_spec(ds.dist, mds->get_nodeid());
+
+  dir->encode_dirstat(dirbl, mdr->session->info, ds);
+
+  // count bytes available.
+  // this isn't perfect, but we should capture the main variable/unbounded size items!
+  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8) * 2;
+  int bytes_left = max_bytes - front_bytes;
+  bytes_left -= get_snap_trace(session, realm).length();
+
+  _readdir_diff(
+    now,
+    mdr,
+    diri,
+    dir,
+    realm,
+    max,
+    bytes_left,
+    offset_str,
+    offset_hash,
+    req_flags,
+    dirbl);
+}
+
+
 /**
  * Return true if server is in state RECONNECT and this
  * client has not yet reconnected.
@@ -11487,3 +11676,291 @@ const bufferlist& Server::get_snap_trace(client_t client, SnapRealm *realm) cons
   Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
   return get_snap_trace(session, realm);
 }
+
+/**
+ * Build the snapshot-diff listing of one dirfrag and send it to the client.
+ *
+ * mdr->snapid and mdr->snapid_diff_other are ordered so that snapid_prev is
+ * the smaller one. Every entry reported by build_snap_diff() is encoded into
+ * the dentry blob as name, lease and inodestat; entries that still exist are
+ * encoded at the later snapid, removed ones at the earlier snapid. The reply
+ * is completed by _finalize_readdir().
+ *
+ * NOTE(review): max_entries is accepted but not used to limit the listing.
+ */
+void Server::_readdir_diff(
+  utime_t now,
+  MDRequestRef& mdr,
+  CInode* diri,
+  CDir* dir,
+  SnapRealm* realm,
+  unsigned max_entries,
+  int bytes_left,
+  const string& offset_str,
+  uint32_t offset_hash,
+  unsigned req_flags,
+  bufferlist& dirbl)
+{
+  // build dir contents
+  bufferlist dnbl;
+  __u32 numfiles = 0;
+
+  snapid_t snapid = mdr->snapid;
+  snapid_t snapid_prev = mdr->snapid_diff_other;
+  if (snapid < snapid_prev) {
+    std::swap(snapid, snapid_prev);
+  }
+  bool from_the_beginning = !offset_hash && offset_str.empty();
+  // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
+  dentry_key_t skip_key(snapid_prev, offset_str.c_str(), offset_hash);
+
+  bool end = build_snap_diff(
+    mdr,
+    dir,
+    bytes_left,
+    from_the_beginning ? nullptr : & skip_key,
+    snapid_prev,
+    snapid,
+    dnbl,
+    [&](CDentry* dn, CInode* in, bool exists) {
+      string name;
+      snapid_t effective_snapid;
+      const auto& dn_name = dn->get_name();
+      // provide the first snapid for removed entries and
+      // the last one for existent ones
+      effective_snapid = exists ? snapid : snapid_prev;
+      name.append(dn_name);
+      if ((int)(dnbl.length() + name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
+        dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
+        return false;
+      }
+
+      auto diri = dir->get_inode();
+      auto hash = ceph_frag_value(diri->hash_dentry_name(dn_name));
+      unsigned start_len = dnbl.length();
+      dout(10) << "inc dn " << *dn << " as " << name
+               << std::hex << " hash 0x" << hash << std::dec
+               << dendl;
+      encode(name, dnbl);
+      mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
+
+      // inode
+      dout(10) << "inc inode " << *in << " snap " << effective_snapid << dendl;
+      int r = in->encode_inodestat(dnbl, mdr->session, realm, effective_snapid, bytes_left - (int)dnbl.length());
+      if (r < 0) {
+        // chop off dn->name, lease
+        dout(10) << " ran out of room, stopping at "
+                 << start_len << " < " << bytes_left << dendl;
+        bufferlist keep;
+        keep.substr_of(dnbl, 0, start_len);
+        dnbl.swap(keep);
+        return false;
+      }
+
+      // touch dn
+      mdcache->lru.lru_touch(dn);
+      ++numfiles;
+      return true;
+    });
+
+  __u16 flags = 0;
+  if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+    flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
+  }
+
+  std::swap(mdr->snapid, mdr->snapid_diff_other); // we want opponent snapid to be used for tracei
+
+  _finalize_readdir(mdr, diri, dir, from_the_beginning, end, flags, numfiles,
+                    dirbl, dnbl);
+}
+
+/**
+ * Walk the dentries of a dirfrag and hand those that differ between
+ * snapid_prev and snapid to add_result_cb(dn, in, exists).
+ *
+ * If skip_key is non-null the walk starts at its lower bound and dentries
+ * whose key is not greater than it are skipped. A non-directory dentry that
+ * covers snapid_prev is held back in 'before': it is reported as removed
+ * (exists == false) unless the following dentry carries the same name, in
+ * which case the newer dentry is reported (exists == true) only when its
+ * mtime differs.
+ *
+ * Returns true only if every dentry was processed and every callback
+ * succeeded; false if a callback refused an entry (out of room) or a remote
+ * inode had to be opened first.
+ */
+bool Server::build_snap_diff(
+  MDRequestRef& mdr,
+  CDir* dir,
+  int bytes_left,
+  dentry_key_t* skip_key,
+  snapid_t snapid_prev,
+  snapid_t snapid,
+  const bufferlist& dnbl,
+  std::function<bool(CDentry*, CInode*, bool)> add_result_cb)
+{
+  client_t client = mdr->client_request->get_source().num();
+
+  struct EntryInfo {
+    CDentry* dn = nullptr;
+    CInode* in = nullptr;
+    utime_t mtime;
+
+    void reset() {
+      *this = EntryInfo();
+    }
+  } before;
+
+  auto insert_deleted = [&](EntryInfo& ei) {
+    dout(20) << "build_snap_diff deleted file " << ei.dn->get_name() << " "
+             << ei.dn->first << "/" << ei.dn->last << dendl;
+    bool r = add_result_cb(ei.dn, ei.in, false);
+    ei.reset();
+    return r;
+  };
+
+  auto it = !skip_key ? dir->begin() : dir->lower_bound(*skip_key);
+
+  while(it != dir->end()) {
+    CDentry* dn = it->second;
+    dout(20) << __func__ << " " << it->first << "->" << *dn << dendl;
+    ++it;
+    if (dn->state_test(CDentry::STATE_PURGING))
+      continue;
+
+    bool dnp = dn->use_projected(client, mdr);
+    CDentry::linkage_t* dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
+
+    if (dnl->is_null()) {
+      dout(20) << __func__ << " linkage is null, skipping" << dendl;
+      continue;
+    }
+
+    if (dn->last < snapid_prev || dn->first > snapid) {
+      dout(20) << __func__ << " not in range, skipping" << dendl;
+      continue;
+    }
+    if (skip_key) {
+      skip_key->snapid = dn->last;
+      if (!(*skip_key < dn->key()))
+        continue;
+    }
+
+    CInode* in = dnl->get_inode();
+    if (in && in->ino() == CEPH_INO_CEPH)
+      continue;
+
+    // remote link?
+    // better for the MDS to do the work, if we think the client will stat any of these files.
+    if (dnl->is_remote() && !in) {
+      in = mdcache->get_inode(dnl->get_remote_ino());
+      if (in) {
+        // log only after the lookup succeeded: 'in' may be null here
+        dout(20) << __func__ << " remote in: " << *in << " ino " << std::hex << dnl->get_remote_ino() << std::dec << dendl;
+        dn->link_remote(dnl, in);
+      } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
+        dout(10) << "skipping bad remote ino on " << *dn << dendl;
+        continue;
+      } else {
+        // touch everything i _do_ have
+        for (auto& p : *dir) {
+          if (!p.second->get_linkage()->is_null())
+            mdcache->lru.lru_touch(p.second);
+        }
+
+        // already issued caps and leases, reply immediately.
+        if (dnbl.length() > 0) {
+          mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
+          dout(10) << " open remote dentry after caps were issued, stopping at "
+                   << dnbl.length() << " < " << bytes_left << dendl;
+        } else {
+          mds->locker->drop_locks(mdr.get());
+          mdr->drop_local_auth_pins();
+          mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
+        }
+        return false;
+      }
+    }
+    ceph_assert(in);
+
+    utime_t mtime = in->get_inode()->mtime;
+
+    if (in->is_dir()) {
+
+      // we need to maintain the order of entries (determined by their name hashes)
+      // hence need to insert the previous entry if any immediately.
+      if (before.dn) {
+        if (!insert_deleted(before)) {
+          return false;
+        }
+      }
+
+      bool exists = true;
+      if (snapid_prev < dn->first && dn->last < snapid) {
+        dout(20) << __func__ << " skipping inner " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        continue;
+      } else if (dn->first <= snapid_prev && dn->last < snapid) {
+        // dir deleted
+        dout(20) << __func__ << " deleted dir " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        exists = false;
+      }
+      bool r = add_result_cb(dn, in, exists);
+      if (!r) {
+        return false;
+      }
+    } else {
+      if (snapid_prev >= dn->first && snapid <= dn->last) {
+        dout(20) << __func__ << " skipping unchanged " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        continue;
+      } else if (snapid_prev < dn->first && snapid > dn->last) {
+        dout(20) << __func__ << " skipping inner modification " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        continue;
+      }
+      string_view name_before =
+        before.dn ? string_view(before.dn->get_name()) : string_view();
+      if (before.dn && dn->get_name() != name_before) {
+        if (!insert_deleted(before)) {
+          return false;
+        }
+        before.reset();
+      }
+      if (snapid_prev >= dn->first && snapid_prev <= dn->last) {
+        dout(30) << __func__ << " dn_before " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        before = EntryInfo {dn, in, mtime};
+        continue;
+      } else {
+        if (before.dn && dn->get_name() == name_before) {
+          if (mtime == before.mtime) {
+            dout(30) << __func__ << " timestamp not changed " << dn->get_name() << " "
+                     << dn->first << "/" << dn->last
+                     << " " << mtime
+                     << dendl;
+            before.reset();
+            continue;
+          } else {
+            dout(30) << __func__ << " timestamp changed " << dn->get_name() << " "
+                     << dn->first << "/" << dn->last
+                     << " " << before.mtime << " vs. " << mtime
+                     << dendl;
+            before.reset();
+          }
+        }
+        dout(20) << __func__ << " new file " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last
+                 << dendl;
+        ceph_assert(snapid >= dn->first && snapid <= dn->last);
+      }
+      if (!add_result_cb(dn, in, true)) {
+        return false;
+      }
+    }
+  }
+  if (before.dn && !insert_deleted(before)) {
+    return false;
+  }
+  return true;
+}
diff --git a/src/mds/Server.h b/src/mds/Server.h
index a269d6cb4790..ef24b27fcfb9 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -68,6 +68,7 @@ enum {
   l_mdss_req_readdir_latency,
   l_mdss_req_rename_latency,
   l_mdss_req_renamesnap_latency,
+  l_mdss_req_snapdiff_latency,
   l_mdss_req_rmdir_latency,
   l_mdss_req_rmsnap_latency,
   l_mdss_req_rmxattr_latency,
@@ -299,6 +300,7 @@ public:
   void _rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
   void handle_client_renamesnap(MDRequestRef& mdr);
   void _renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
+  void handle_client_readdir_snapdiff(MDRequestRef& mdr);
 
   // helpers
   bool _rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, std::set<mds_rank_t> &witnesse,
@@ -480,6 +482,37 @@ private:
   void reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply);
   void flush_session(Session *session, MDSGatherBuilder& gather);
 
+  void _finalize_readdir(MDRequestRef& mdr,
+                         CInode *diri,
+                         CDir* dir,
+                         bool start,
+                         bool end,
+                         __u16 flags,
+                         __u32 numfiles,
+                         bufferlist& dirbl,
+                         bufferlist& dnbl);
+  void _readdir_diff(
+    utime_t now,
+    MDRequestRef& mdr,
+    CInode* diri,
+    CDir* dir,
+    SnapRealm* realm,
+    unsigned max_entries,
+    int bytes_left,
+    const std::string& offset_str,
+    uint32_t offset_hash,
+    unsigned req_flags,
+    bufferlist& dirbl);
+  bool build_snap_diff(
+    MDRequestRef& mdr,
+    CDir* dir,
+    int bytes_left,
+    dentry_key_t* skip_key,
+    snapid_t snapid_before,
+    snapid_t snapid,
+    const bufferlist& dnbl,
+    std::function<bool(CDentry*, CInode*, bool)> add_result_cb);
+
   MDSRank *mds;
   MDCache *mdcache;
   MDLog *mdlog;
diff --git a/src/mds/SnapRealm.cc b/src/mds/SnapRealm.cc
index c7a7d75bc242..9d303bcb03ae 100644
--- a/src/mds/SnapRealm.cc
+++ b/src/mds/SnapRealm.cc
@@ -255,7 +255,7 @@ snapid_t SnapRealm::resolve_snapname(std::string_view n, inodeno_t atino, snapid
     //if (num && p->second.snapid == num)
     //return p->first;
     if (actual && p->second.name == n)
-	return p->first;
+      return p->first;
     if (!actual && p->second.name == pname && p->second.ino == pino)
       return p->first;
   }