From 4baa1c9f3372ef942b6d452ec6bec428cfbdd360 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Wed, 14 Jan 2026 18:00:43 +0530 Subject: [PATCH] tools/cephfs_mirror: Synchronize taking snapshot The crawler/entry creation thread needs to wait until all the data is synced by datasync threads to take the snapshot. This patch adds the necessary conditions for the same. It is important for the conditional flag to be part of SyncMechanism and not part of PeerReplayer class. The following bug would be hit if it were part of PeerReplayer class. When multiple directories are confiugred for mirroring as below /d0 /d1 /d2 Crawler1 Crawler2 Crawler3 DoneEntryOps DoneEntryOps DoneEntryOps WaitForSafeSnap WaitForSafeSnap WaitForSafeSnap When all crawler threads are waiting at above, the data sync threads which is done processing /d1, would notify, waking up all the crawlers causing spurious/unwanted wake up and half baked snapshots. Fixes: https://tracker.ceph.com/issues/73452 Signed-off-by: Kotresh HR --- src/tools/cephfs_mirror/PeerReplayer.cc | 39 ++++++++++++++++++------- src/tools/cephfs_mirror/PeerReplayer.h | 8 +++++ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc index 3c8bc1509c6..3b701836a89 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.cc +++ b/src/tools/cephfs_mirror/PeerReplayer.cc @@ -1348,6 +1348,18 @@ void PeerReplayer::SyncMechanism::mark_crawl_finished() { sdq_cv.notify_all(); } +void PeerReplayer::SyncMechanism::wait_until_safe_to_snapshot() { + std::unique_lock lock(sdq_lock); + dout(20) << ": Waiting for data sync to be done to take snapshot - dir_root=" << m_dir_root + << " current=" << m_current << " prev=" << (m_prev ? stringify(m_prev) : "") + << " syncm=" << this << dendl; + sdq_cv.wait(lock, [this]{return m_take_snapshot;}); + dout(20) << ": Woke up to take snapshot - dir_root=" << m_dir_root + << " current=" << m_current << " prev=" << (m_prev ? stringify(m_prev) : "") + << " syncm=" << this << dendl; + m_take_snapshot = false; +} + int PeerReplayer::SyncMechanism::get_changed_blocks(const std::string &epath, const struct ceph_statx &stx, bool sync_check, const std::function &callback) { @@ -1899,7 +1911,6 @@ int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &cu dout(10) << ": tree traversal done for dir_root=" << dir_root << dendl; break; } - } syncm->finish_sync(); @@ -1916,6 +1927,20 @@ int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &cu // there is no need to close this fd manually. ceph_close(fh.p_mnt, fh.p_fd); + if (r == 0 ) { //Bail out without taking snap if r < 0 + syncm->wait_until_safe_to_snapshot(); + + // All good, take the snapshot + auto cur_snap_id_str{stringify(current.second)}; + snap_metadata snap_meta[] = {{PRIMARY_SNAP_ID_KEY.c_str(), cur_snap_id_str.c_str()}}; + r = ceph_mksnap(m_remote_mount, dir_root.c_str(), current.first.c_str(), 0755, + snap_meta, sizeof(snap_meta)/sizeof(snap_metadata)); + if (r < 0) { + derr << ": failed to snap remote directory dir_root=" << dir_root + << ": " << cpp_strerror(r) << dendl; + } + } + return r; } @@ -1961,21 +1986,11 @@ int PeerReplayer::synchronize(const std::string &dir_root, const Snapshot &curre r = do_synchronize(dir_root, current); } } - // snap sync failed -- bail out! if (r < 0) { return r; } - auto cur_snap_id_str{stringify(current.second)}; - snap_metadata snap_meta[] = {{PRIMARY_SNAP_ID_KEY.c_str(), cur_snap_id_str.c_str()}}; - r = ceph_mksnap(m_remote_mount, dir_root.c_str(), current.first.c_str(), 0755, - snap_meta, sizeof(snap_meta)/sizeof(snap_metadata)); - if (r < 0) { - derr << ": failed to snap remote directory dir_root=" << dir_root - << ": " << cpp_strerror(r) << dendl; - } - return r; } @@ -2244,6 +2259,8 @@ void PeerReplayer::run_datasync(SnapshotDataSyncThread *data_replayer) { && syncm->get_in_flight_unlocked() == 0 && syncm->get_crawl_finished_unlocked() == true) { dout(20) << ": Dequeue syncm object=" << syncm << dendl; + syncm->set_snapshot_unlocked(); + syncm->sdq_cv_notify_all_unlocked(); // To wake up crawler thread waiting to take snapshot syncm_q.pop(); smq_cv.notify_all(); } diff --git a/src/tools/cephfs_mirror/PeerReplayer.h b/src/tools/cephfs_mirror/PeerReplayer.h index ae83cf00410..c91c010e8e7 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.h +++ b/src/tools/cephfs_mirror/PeerReplayer.h @@ -237,6 +237,13 @@ private: boost::optional get_m_prev() const { return m_prev; } + void set_snapshot_unlocked() { + m_take_snapshot = true; + } + void sdq_cv_notify_all_unlocked() { + sdq_cv.notify_all(); + } + void wait_until_safe_to_snapshot(); int remote_mkdir(const std::string &epath, const struct ceph_statx &stx); protected: @@ -253,6 +260,7 @@ private: std::queue m_sync_dataq; int m_in_flight = 0; bool m_sync_crawl_finished = false; + bool m_take_snapshot = false; // It's not used in RemoteSync but required to be accessed in datasync threads std::string m_dir_root; }; -- 2.47.3