From 9648d4610c35ccc79cce7e715230b273c2f5107e Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Sat, 21 Feb 2026 21:25:30 +0530 Subject: [PATCH] tools/cephfs_mirror: Do remote fs sync once instead of fsync on each fd Do remote fs sync once just before taking snapshot as it's faster than doing fsync on each fd after file copy. Moreover, all the datasync threads use the same single libcephfs connection and doing ceph_fsync concurrently on different fds on a single libcephfs connection could cause a hang, as observed in testing below. This issue is tracked at https://tracker.ceph.com/issues/75070 ----- Thread 2 (Thread 0xffff644cc400 (LWP 74020) "d_replayer-0"): 0 0x0000ffff8e82656c in __futex_abstimed_wait_cancelable64 () from /lib64/libc.so.6 1 0x0000ffff8e828ff0 [PAC] in pthread_cond_wait@@GLIBC_2.17 () from /lib64/libc.so.6 2 0x0000ffff8fc90fd4 [PAC] in ceph::condition_variable_debug::wait ... 3 0x0000ffff9080fc9c in ceph::condition_variable_debug::wait --- src/tools/cephfs_mirror/PeerReplayer.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc index 424026f61f4..eee7fb14f3c 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.cc +++ b/src/tools/cephfs_mirror/PeerReplayer.cc @@ -806,11 +806,6 @@ int PeerReplayer::copy_to_remote(const std::string &dir_root, const std::string << cpp_strerror(r) << dendl; goto freeptr; } - r = ceph_fsync(m_remote_mount, r_fd, 0); - if (r < 0) { - derr << ": failed to sync data for file path=" << epath << ": " - << cpp_strerror(r) << dendl; - } } freeptr: @@ -2008,7 +2003,14 @@ int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &cu bool datasync_err = syncm->wait_for_sync(); if (r == 0 && !datasync_err) { - // All good, take the snapshot + // All good, fsync remote fs and take the snapshot + dout(20) << ": syncing remote filesystem, dir_root=" << dir_root << dendl; + r = ceph_sync_fs(m_remote_mount); 
if (r < 0) { + derr << ": failed to sync remote filesystem, dir_root=" << dir_root + << ": " << cpp_strerror(r) << dendl; + return r; + } auto cur_snap_id_str{stringify(current.second)}; snap_metadata snap_meta[] = {{PRIMARY_SNAP_ID_KEY.c_str(), cur_snap_id_str.c_str()}}; r = ceph_mksnap(m_remote_mount, dir_root.c_str(), current.first.c_str(), 0755, @@ -2016,6 +2018,7 @@ int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &cu if (r < 0) { derr << ": failed to snap remote directory dir_root=" << dir_root << ": " << cpp_strerror(r) << dendl; + return r; } } else if (datasync_err) { r = syncm->get_datasync_errno(); -- 2.47.3