From 931226016f1efefcf1354c7cda3e6b1628bbb4bd Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Sat, 21 Feb 2026 13:58:47 +0530 Subject: [PATCH] tools/cephfs_mirror: Synchronize taking snapshot The crawler/entry creation thread needs to wait until all the data is synced by datasync threads to take the snapshot. This patch adds the necessary conditions for the same. It is important for the conditional flag to be part of SyncMechanism and not part of PeerReplayer class. The following bug would be hit if it were part of PeerReplayer class. When multiple directories are confiugred for mirroring as below /d0 /d1 /d2 Crawler1 Crawler2 Crawler3 DoneEntryOps DoneEntryOps DoneEntryOps WaitForSafeSnap WaitForSafeSnap WaitForSafeSnap When all crawler threads are waiting at above, the data sync threads which is done processing /d1, would notify, waking up all the crawlers causing spurious/unwanted wake up and half baked snapshots. Fixes: https://tracker.ceph.com/issues/73452 Signed-off-by: Kotresh HR --- src/tools/cephfs_mirror/PeerReplayer.cc | 38 ++++++++++++++++++------- src/tools/cephfs_mirror/PeerReplayer.h | 9 ++++++ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc index ff4f3aa3088..cfcd35b7787 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.cc +++ b/src/tools/cephfs_mirror/PeerReplayer.cc @@ -1348,6 +1348,18 @@ void PeerReplayer::SyncMechanism::mark_crawl_finished() { sdq_cv.notify_all(); } +void PeerReplayer::SyncMechanism::wait_for_sync() { + std::unique_lock lock(sdq_lock); + dout(20) << ": Waiting for data sync to be done to take snapshot - dir_root=" << m_dir_root + << " current=" << m_current << " prev=" << (m_prev ? stringify(m_prev) : "") + << " syncm=" << this << dendl; + sdq_cv.wait(lock, [this]{return m_sync_done;}); + dout(20) << ": Woke up to take snapshot - dir_root=" << m_dir_root + << " current=" << m_current << " prev=" << (m_prev ? stringify(m_prev) : "") + << " syncm=" << this << dendl; + m_sync_done = false; +} + int PeerReplayer::SyncMechanism::get_changed_blocks(const std::string &epath, const struct ceph_statx &stx, bool sync_check, const std::function &callback) { @@ -1899,7 +1911,6 @@ int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &cu dout(10) << ": tree traversal done for dir_root=" << dir_root << dendl; break; } - } syncm->finish_crawl(); @@ -1916,6 +1927,20 @@ int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &cu // there is no need to close this fd manually. ceph_close(fh.p_mnt, fh.p_fd); + if (r == 0 ) { //Bail out without taking snap if r < 0 + syncm->wait_for_sync(); + + // All good, take the snapshot + auto cur_snap_id_str{stringify(current.second)}; + snap_metadata snap_meta[] = {{PRIMARY_SNAP_ID_KEY.c_str(), cur_snap_id_str.c_str()}}; + r = ceph_mksnap(m_remote_mount, dir_root.c_str(), current.first.c_str(), 0755, + snap_meta, sizeof(snap_meta)/sizeof(snap_metadata)); + if (r < 0) { + derr << ": failed to snap remote directory dir_root=" << dir_root + << ": " << cpp_strerror(r) << dendl; + } + } + return r; } @@ -1961,21 +1986,11 @@ int PeerReplayer::synchronize(const std::string &dir_root, const Snapshot &curre r = do_synchronize(dir_root, current); } } - // snap sync failed -- bail out! if (r < 0) { return r; } - auto cur_snap_id_str{stringify(current.second)}; - snap_metadata snap_meta[] = {{PRIMARY_SNAP_ID_KEY.c_str(), cur_snap_id_str.c_str()}}; - r = ceph_mksnap(m_remote_mount, dir_root.c_str(), current.first.c_str(), 0755, - snap_meta, sizeof(snap_meta)/sizeof(snap_metadata)); - if (r < 0) { - derr << ": failed to snap remote directory dir_root=" << dir_root - << ": " << cpp_strerror(r) << dendl; - } - return r; } @@ -2244,6 +2259,7 @@ void PeerReplayer::run_datasync(SnapshotDataSyncThread *data_replayer) { && syncm->get_in_flight_unlocked() == 0 && syncm->get_crawl_finished_unlocked() == true) { dout(20) << ": Dequeue syncm object=" << syncm << dendl; + syncm->set_sync_finished_and_notify_unlocked(); // To wake up crawler thread waiting to take snapshot syncm_q.pop(); smq_cv.notify_all(); } diff --git a/src/tools/cephfs_mirror/PeerReplayer.h b/src/tools/cephfs_mirror/PeerReplayer.h index 067baab5f97..b6fe1b8a74d 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.h +++ b/src/tools/cephfs_mirror/PeerReplayer.h @@ -237,6 +237,14 @@ private: boost::optional get_m_prev() const { return m_prev; } + void set_sync_finished_and_notify_unlocked() { + m_sync_done = true; + sdq_cv.notify_all(); + } + void sdq_cv_notify_all_unlocked() { + sdq_cv.notify_all(); + } + void wait_for_sync(); int remote_mkdir(const std::string &epath, const struct ceph_statx &stx); protected: @@ -253,6 +261,7 @@ private: std::queue m_sync_dataq; int m_in_flight = 0; bool m_crawl_finished = false; + bool m_sync_done = false; // It's not used in RemoteSync but required to be accessed in datasync threads std::string m_dir_root; }; -- 2.47.3