From 66827323fcb4e5cf419afa03dbaefcd198774058 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Tue, 31 Mar 2026 12:34:27 +0530 Subject: [PATCH] tools/cephfs_mirror: Fix sync hang The snapshot mirror sync can hang if all of the following are true. 1. The snapshot being synced contains only directories and no files. 2. The crawler finishes syncing dirs and completes crawling before a datasync thread picks it up from the syncm queue. The above scenario can be reproduced as follows. 1. Configure, say, /d0 and /d1 for mirroring. 2. Create around 10k files in /d0 3. Create a single dir, say /d1/dir0 4. Snapshot /d0 and wait for the status to change to 'syncing' 5. Now, snapshot /d1. The /d1 snapshot will be stuck in syncing, because the datasync thread's has_pending_work logic would never pick it up, as there are no files to be synced (i.e. dataq is empty and crawl is finished, which is essentially job done), and so it never notifies the crawler thread to proceed with taking the snapshot. The fix is to pick up the syncm if the crawler is finished and dataq is empty, to avoid missing the notification to the crawler thread. Fixes: https://tracker.ceph.com/issues/75804 Signed-off-by: Kotresh HR --- src/tools/cephfs_mirror/PeerReplayer.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc index 2b5dff98ef81..d584b419790c 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.cc +++ b/src/tools/cephfs_mirror/PeerReplayer.cc @@ -1390,11 +1390,11 @@ bool PeerReplayer::SyncMechanism::has_pending_work() const { const bool job_done = m_sync_dataq.empty() && m_crawl_finished; - /* On crawl error, return true even if the queue is empty to + /* On crawl error or crawl finished (job_done), return true even if the queue is empty to * - Dequeue the syncm object * - Notify the crawler as it waits after the error for pending jobs to finish.
*/ - if (m_crawl_error) { + if (m_crawl_error || job_done) { // If m_in_flight > 0, those threads will take care of dequeue/notify, you just consume next job if (m_in_flight > 0) return false; @@ -1402,8 +1402,9 @@ bool PeerReplayer::SyncMechanism::has_pending_work() const { return true; } - // No more work if datasync failed or everything is done - if (m_datasync_error || job_done) + // No more work if datasync failed. Since datasync error is set by datasync thread, dequeue + // and notify will be taken care of by it. + if (m_datasync_error) return false; // Distribute threads fairly if enabled -- 2.47.3