From: Kotresh HR Date: Fri, 4 Apr 2025 13:23:14 +0000 (+0530) Subject: mds: Don't use global snaprealm seq for subvolumes X-Git-Tag: v21.0.0~256^2~151^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1cebd41369b71fef9e8727aa55c9670e828330aa;p=ceph.git mds: Don't use global snaprealm seq for subvolumes Don't use global snaprealm seq number while doing cow on old inodes for subvolume inode and inodes under it i.e., for directories marked with 'ceph.dir.subvolume' vxattr. This is safe because all the hardlink/renames are contained within the same subvolume snaprealm and do not cross the subvolume snaprealms. For the directories between / and subvolume snapshot directory, use the global snaprealm seq to cow the old inodes only if there is at least one snapshot taken. The above behavior is made optional with the mds config 'mds_use_global_snaprealm_seq_for_subvol'. The option is enabled by default which means the above behaviour is disabled by default. The option is suggested to be disabled only on cephfs volumes used for pure subvolume usecase. Fixes: https://tracker.ceph.com/issues/70794 Signed-off-by: Kotresh HR --- diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 06c12ea671db..844ed2be4ada 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1563,6 +1563,28 @@ options: max: 4_K flags: - runtime +- name: mds_use_global_snaprealm_seq_for_subvol + type: bool + level: advanced + default: true + services: + - mds + fmt_desc: MDS will use the global snaprealm's seq to cow old inodes + for the directory. If the option is disabled, MDS will use the + respective snaprealm seq number for the subvolume snapshot directory + i.e., the directory marked with the 'ceph.dir.subvolume'. This is + safe because the hardlinks/renames are contained within the subvolume + snaprealm with the marking of 'ceph.dir.subvolume'. 
For the directories + above subvolume, it will use the global snaprealm's seq only if there + is at least one snapshot of the directory above the subvolume. + If the option is enabled, which is the default, it uses the global + snaprealm's seq to cow old inodes across the filesystem. Hence, + disabling this option is only suitable for the cephfs volumes used + purely for subvolume usecase where there are no snapshots in the + filesystem apart from the subvolume snapshots. So it's a great + optimization for subvolumes. + flags: + - runtime - name: mds_asio_thread_count type: uint level: advanced diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 01fd41fa6587..cdd136dd4290 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -3183,8 +3183,38 @@ const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool co void CInode::pre_cow_old_inode() { - snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq(); - dout(20) << __func__ << " follows " << follows << " on " << *this << dendl; + snapid_t follows; + bool using_global_snaprealm_seq = true; + SnapRealm *realm = find_snaprealm(); + //bool use_global_snaprealm_seq = mdcache->use_global_snaprealm_seq; + + if (mdcache->get_use_global_snaprealm_seq()) { + follows = mdcache->get_global_snaprealm()->get_newest_seq(); + } else if (realm->get_subvolume_ino() || realm->get_newest_seq() <= 1 ) { + /* Config is disabled : + 1. If it's a subvolume realm, obviously use realm's seq number. + 2. If there are no snaps on that directory, use realm's seq number. + a. In a pure subvolume use case, updates outside the subvolume directory + from group directory (/volumes/<group>/) to root would use realm's + seq number to avoid unnecessary cow of old inodes. + b. In a non subvolume use case, use realm's seq number only if there are + no snaps. If there are snaps, always use global snaprealm's seq as there + could be hardlinks/renames. 
+ */ + follows = realm->get_newest_seq(); + using_global_snaprealm_seq = false; + } else { + /* Config is disabled: + * 1. In a pure subvolume use case, if there is at least one snap in the realm (between + * root and subvolume snap path), use global snaprealm's seq number. + * 2. In a non subvolume use case, if there is at least one snap in the realm, + * use global snaprealm's seq number. + */ + follows = mdcache->get_global_snaprealm()->get_newest_seq(); + } + + dout(20) << __func__ << " using_global_snaprealm_seq:" << (using_global_snaprealm_seq ? "yes ":"no ") + << " follows " << follows << " on " << *this << " snaprealm=" << *realm << dendl; if (first <= follows) cow_old_inode(follows, true); } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 5e7bcd2a376a..b96d9146ac5f 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -202,6 +202,7 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) : kill_dirfrag_at = static_cast(g_conf().get_val("mds_kill_dirfrag_at")); kill_shutdown_at = g_conf().get_val("mds_kill_shutdown_at"); + use_global_snaprealm_seq = g_conf().get_val("mds_use_global_snaprealm_seq_for_subvol"); lru.lru_set_midpoint(g_conf().get_val("mds_cache_mid")); @@ -280,6 +281,10 @@ void MDCache::handle_conf_change(const std::set& changed, const MDS if (changed.count("mds_kill_shutdown_at")) { kill_shutdown_at = g_conf().get_val("mds_kill_shutdown_at"); } + if (changed.count("mds_use_global_snaprealm_seq_for_subvol")) { + use_global_snaprealm_seq = g_conf().get_val("mds_use_global_snaprealm_seq_for_subvol"); + dout(20) << __func__ << " mds_use_global_snaprealm_seq_for_subvol now " << use_global_snaprealm_seq << dendl; + } migrator->handle_conf_change(changed, mdsmap); mds->balancer->handle_conf_change(changed, mdsmap); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index dc655903e18f..f78840df5e74 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -255,6 +255,10 @@ class MDCache { return symlink_recovery; } + bool 
get_use_global_snaprealm_seq(void) const { + return use_global_snaprealm_seq; + } + /** * Call this when you know that a CDentry is ready to be passed * on to StrayManager (i.e. this is a stray you've just created) @@ -1578,6 +1582,7 @@ private: // -- snaprealms -- SnapRealm *global_snaprealm = nullptr; + bool use_global_snaprealm_seq = true; std::map uncommitted_fragments; diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 6f4adb81be49..c1ab2aa55fcc 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -4145,7 +4145,8 @@ std::vector MDSRankDispatcher::get_tracked_keys() "mds_session_cap_acquisition_throttle", "mds_session_max_caps_throttle_ratio", "mds_session_metadata_threshold", - "mds_symlink_recovery" + "mds_symlink_recovery", + "mds_use_global_snaprealm_seq_for_subvol" }); static_assert(std::is_sorted(as_sv.begin(), as_sv.end()), "keys are not sorted!");