From 1cad96b43de669254b87589b5ad7ae8c20490b13 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Tue, 12 Aug 2025 16:17:49 +0300 Subject: [PATCH] mds: rollback the snapdiff fragment entries with the same name if needed. This is required when more entries with the same name don't fit into the fragment. With the existing means for fragment offset specification such a splitting to be prohibited. Fixes: https://tracker.ceph.com/issues/72518 Signed-off-by: Igor Fedotov (cherry picked from commit 24955e66f4826f8623d2bec1dbfc580f0e4c39ae) --- src/mds/CDir.h | 1 + src/mds/Server.cc | 51 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/mds/CDir.h b/src/mds/CDir.h index fb5d71bb792..ac684940bb9 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -350,6 +350,7 @@ public: dentry_key_map::iterator begin() { return items.begin(); } dentry_key_map::iterator end() { return items.end(); } dentry_key_map::iterator lower_bound(dentry_key_t key) { return items.lower_bound(key); } + dentry_key_map::iterator upper_bound(dentry_key_t key) { return items.upper_bound(key); } unsigned get_num_head_items() const { return num_head_items; } unsigned get_num_head_null() const { return num_head_null; } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index effa7f4f341..45422694c09 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -11897,7 +11897,7 @@ void Server::handle_client_readdir_snapdiff(const MDRequestRef& mdr) offset_hash = (__u32)req->head.args.snapdiff.offset_hash; } - dout(10) << " frag " << fg << " offset '" << offset_str << "'" + dout(10) << __func__ << " frag " << fg << " offset '" << offset_str << "'" << " offset_hash " << offset_hash << " flags " << req_flags << dendl; // does the frag exist? @@ -12054,8 +12054,18 @@ void Server::_readdir_diff( std::swap(snapid, snapid_prev); } bool from_the_beginning = !offset_hash && offset_str.empty(); - // skip all dns < dentry_key_t(snapid, offset_str, offset_hash) - dentry_key_t skip_key(snapid_prev, offset_str.c_str(), offset_hash); + // skip all dns <= dentry_key_t(*, offset_str, offset_hash) + dentry_key_t skip_key(CEPH_NOSNAP, offset_str.c_str(), offset_hash); + + // We need to rollback all the entries with the same name + // when some entries with this name don't fit into the same fragment. + // This is caused by the limited ability for offset provisioning between + // fragments - there is no way to identify specific snapshot for the last entry. + // The following vars denote the potential rollback position for such a case. + // Fixes: https://tracker.ceph.com/issues/72518 + string last_name; + size_t rollback_pos = 0; + size_t rollback_num = 0; bool end = build_snap_diff( mdr, @@ -12074,7 +12084,16 @@ void Server::_readdir_diff( effective_snapid = exists ? snapid : snapid_prev; name.append(dn_name); if ((int)(dnbl.length() + name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) { - dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl; + dout(10) << " ran out of room for name, stopping at " << dnbl.length() << " < " << bytes_left << dendl; + if (name == last_name) { + bufferlist keep; + keep.substr_of(dnbl, 0, rollback_pos); + dnbl.swap(keep); + last_name.clear(); + rollback_pos = 0; + numfiles = rollback_num; + rollback_num = 0; + } return false; } @@ -12083,6 +12102,7 @@ void Server::_readdir_diff( unsigned start_len = dnbl.length(); dout(10) << "inc dn " << *dn << " as " << name << std::hex << " hash 0x" << hash << std::dec + << " " << effective_snapid << dendl; encode(name, dnbl); mds->locker->issue_client_lease(dn, in, mdr, now, dnbl); @@ -12095,11 +12115,24 @@ void Server::_readdir_diff( dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl; bufferlist keep; - keep.substr_of(dnbl, 0, start_len); + + keep.substr_of(dnbl, 0, + name == last_name ? rollback_pos : start_len); dnbl.swap(keep); + + last_name.clear(); + rollback_pos = 0; + numfiles = rollback_num; + rollback_num = 0; return false; } + // set rollback position + if (name != last_name) { + last_name = name; + rollback_pos = start_len; + rollback_num = numfiles; + } // touch dn mdcache->lru.lru_touch(dn); ++numfiles; @@ -12147,7 +12180,7 @@ bool Server::build_snap_diff( return r; }; - auto it = !skip_key ? dir->begin() : dir->lower_bound(*skip_key); + auto it = !skip_key ? dir->begin() : dir->upper_bound(*skip_key); while(it != dir->end()) { CDentry* dn = it->second; @@ -12168,11 +12201,6 @@ bool Server::build_snap_diff( dout(20) << __func__ << " not in range, skipping" << dendl; continue; } - if (skip_key) { - skip_key->snapid = dn->last; - if (!(*skip_key < dn->key())) - continue; - } CInode* in = dnl->get_inode(); if (in && in->ino() == CEPH_INO_CEPH) @@ -12211,7 +12239,6 @@ bool Server::build_snap_diff( ceph_assert(in); utime_t mtime = in->get_inode()->mtime; - if (in->is_dir()) { // we need to maintain the order of entries (determined by their name hashes) -- 2.39.5