From 887dbc2933c4ffa4b10fd0ea1a96514919930e8f Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Tue, 12 Aug 2025 16:17:49 +0300 Subject: [PATCH] mds: rollback the snapdiff fragment entries with the same name if needed. This is required when more entries with the same name don't fit into the fragment. With the existing means for fragment offset specification such a splitting to be prohibited. Fixes: https://tracker.ceph.com/issues/72518 Signed-off-by: Igor Fedotov (cherry picked from commit 24955e66f4826f8623d2bec1dbfc580f0e4c39ae) --- src/mds/CDir.h | 1 + src/mds/Server.cc | 51 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/mds/CDir.h b/src/mds/CDir.h index a5cd3101744..44f1c12699d 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -347,6 +347,7 @@ public: dentry_key_map::iterator begin() { return items.begin(); } dentry_key_map::iterator end() { return items.end(); } dentry_key_map::iterator lower_bound(dentry_key_t key) { return items.lower_bound(key); } + dentry_key_map::iterator upper_bound(dentry_key_t key) { return items.upper_bound(key); } unsigned get_num_head_items() const { return num_head_items; } unsigned get_num_head_null() const { return num_head_null; } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 49d5fdd7efa..ca8b204b052 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -11817,7 +11817,7 @@ void Server::handle_client_readdir_snapdiff(const MDRequestRef& mdr) offset_hash = (__u32)req->head.args.snapdiff.offset_hash; } - dout(10) << " frag " << fg << " offset '" << offset_str << "'" + dout(10) << __func__ << " frag " << fg << " offset '" << offset_str << "'" << " offset_hash " << offset_hash << " flags " << req_flags << dendl; // does the frag exist? @@ -11974,8 +11974,18 @@ void Server::_readdir_diff( std::swap(snapid, snapid_prev); } bool from_the_beginning = !offset_hash && offset_str.empty(); - // skip all dns < dentry_key_t(snapid, offset_str, offset_hash) - dentry_key_t skip_key(snapid_prev, offset_str.c_str(), offset_hash); + // skip all dns <= dentry_key_t(*, offset_str, offset_hash) + dentry_key_t skip_key(CEPH_NOSNAP, offset_str.c_str(), offset_hash); + + // We need to rollback all the entries with the same name + // when some entries with this name don't fit into the same fragment. + // This is caused by the limited ability for offset provisioning between + // fragments - there is no way to identify specific snapshot for the last entry. + // The following vars denote the potential rollback position for such a case. + // Fixes: https://tracker.ceph.com/issues/72518 + string last_name; + size_t rollback_pos = 0; + size_t rollback_num = 0; bool end = build_snap_diff( mdr, @@ -11994,7 +12004,16 @@ void Server::_readdir_diff( effective_snapid = exists ? snapid : snapid_prev; name.append(dn_name); if ((int)(dnbl.length() + name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) { - dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl; + dout(10) << " ran out of room for name, stopping at " << dnbl.length() << " < " << bytes_left << dendl; + if (name == last_name) { + bufferlist keep; + keep.substr_of(dnbl, 0, rollback_pos); + dnbl.swap(keep); + last_name.clear(); + rollback_pos = 0; + numfiles = rollback_num; + rollback_num = 0; + } return false; } @@ -12003,6 +12022,7 @@ void Server::_readdir_diff( unsigned start_len = dnbl.length(); dout(10) << "inc dn " << *dn << " as " << name << std::hex << " hash 0x" << hash << std::dec + << " " << effective_snapid << dendl; encode(name, dnbl); mds->locker->issue_client_lease(dn, in, mdr, now, dnbl); @@ -12015,11 +12035,24 @@ void Server::_readdir_diff( dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl; bufferlist keep; - keep.substr_of(dnbl, 0, start_len); + + keep.substr_of(dnbl, 0, + name == last_name ? rollback_pos : start_len); dnbl.swap(keep); + + last_name.clear(); + rollback_pos = 0; + numfiles = rollback_num; + rollback_num = 0; return false; } + // set rollback position + if (name != last_name) { + last_name = name; + rollback_pos = start_len; + rollback_num = numfiles; + } // touch dn mdcache->lru.lru_touch(dn); ++numfiles; @@ -12067,7 +12100,7 @@ bool Server::build_snap_diff( return r; }; - auto it = !skip_key ? dir->begin() : dir->lower_bound(*skip_key); + auto it = !skip_key ? dir->begin() : dir->upper_bound(*skip_key); while(it != dir->end()) { CDentry* dn = it->second; @@ -12088,11 +12121,6 @@ bool Server::build_snap_diff( dout(20) << __func__ << " not in range, skipping" << dendl; continue; } - if (skip_key) { - skip_key->snapid = dn->last; - if (!(*skip_key < dn->key())) - continue; - } CInode* in = dnl->get_inode(); if (in && in->ino() == CEPH_INO_CEPH) @@ -12131,7 +12159,6 @@ bool Server::build_snap_diff( ceph_assert(in); utime_t mtime = in->get_inode()->mtime; - if (in->is_dir()) { // we need to maintain the order of entries (determined by their name hashes) -- 2.39.5