]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
tools/cephfs_mirror: Add datasync_queue_wait_duration metric
authorKotresh HR <khiremat@redhat.com>
Fri, 8 May 2026 00:22:59 +0000 (05:52 +0530)
committerKotresh HR <khiremat@redhat.com>
Fri, 12 Jun 2026 19:12:17 +0000 (00:42 +0530)
Add the metric which measures the time spent by the snapshot
in the data queue waiting for the datasync threads.

Sample output:
When still 'waiting' in queue
{
    "/d1": {
        "state": "syncing",
        "current_syncing_snap": {
            "id": 18,
            "name": "d1_snap5",
            "sync-mode": "delta",
            "avg_read_throughput_bytes": "0.00 B/s",
            "avg_write_throughput_bytes": "0.00 B/s",
            "crawl": {
                "state": "in-progress",
                "duration": "13s"
            },
            "datasync_queue_wait": {
                "state": "waiting",
                "duration": "12s"
            },
            "bytes": {
                "sync_bytes": "0.00 B",
                "total_bytes": "110.99 MiB",
                "sync_percent": "0.00%"
            },
            "files": {
                "sync_files": 0,
                "total_files": 3719,
                "sync_percent": "0.00%"
            },
            "eta": "calculating..."
        },
        "last_synced_snap": {
            "id": 15,
            "name": "d1_snap4"
        },
        "snaps_synced": 0,
        "snaps_deleted": 0,
        "snaps_renamed": 0
    },
}
---------------
After 'complete'
{
    "/d1": {
        "state": "syncing",
        "current_syncing_snap": {
            "id": 18,
            "name": "d1_snap5",
            "sync-mode": "delta",
            "avg_read_throughput_bytes": "11.66 MiB/s",
            "avg_write_throughput_bytes": "34.55 MiB/s",
            "crawl": {
                "state": "completed",
                "duration": "17s"
            },
            "datasync_queue_wait": {
                "state": "completed",
                "duration": "19s"
            },
            "bytes": {
                "sync_bytes": "149.94 MiB",
                "total_bytes": "149.94 MiB",
                "sync_percent": "100.00%"
            },
            "files": {
                "sync_files": 5000,
                "total_files": 5000,
                "sync_percent": "100.00%"
            },
            "eta": "0s"
        },
        "last_synced_snap": {
            "id": 15,
            "name": "d1_snap4"
        },
        "snaps_synced": 0,
        "snaps_deleted": 0,
        "snaps_renamed": 0
    }
}
-----
Also stored in last_sync_snap section
{
    "/d1": {
        "state": "idle",
        "last_synced_snap": {
            "id": 18,
            "name": "d1_snap5",
            "crawl_duration": "17s",
            "datasync_queue_wait_duration": "19s",
            "sync_duration": "44s",
            "sync_time_stamp": "8172.009480s",
            "sync_bytes": "149.94 MiB",
            "sync_files": 5000
        },
        "snaps_synced": 1,
        "snaps_deleted": 0,
        "snaps_renamed": 0
    }
}

Fixes: https://tracker.ceph.com/issues/73453
Signed-off-by: Kotresh HR <khiremat@redhat.com>
src/tools/cephfs_mirror/PeerReplayer.cc
src/tools/cephfs_mirror/PeerReplayer.h

index 334567b0c9d86acd480ee78c7c3c064debfa2197..11d4c7f67fae1acb00f4812367d77d316e29227c 100644 (file)
@@ -1353,10 +1353,17 @@ PeerReplayer::SyncMechanism::~SyncMechanism() {
 void PeerReplayer::SyncMechanism::push_dataq_entry(SyncEntry e) {
   dout(10) << ": snapshot data replayer dataq pushed" << " syncm=" << this
           << " epath=" << e.epath << dendl;
-  m_peer_replayer.inc_total_bytes_files(std::string(m_dir_root), e.stx.stx_size);
+  const uint64_t entry_bytes = e.stx.stx_size;
   std::unique_lock lock(sdq_lock);
+  if (!m_first_dataq_push_time) {
+    m_first_dataq_push_time = clock::now();
+    m_peer_replayer.set_datasync_queue_wait_start_time(std::string(m_dir_root),
+                                                       *m_first_dataq_push_time);
+  }
   m_sync_dataq.push(std::move(e));
   sdq_cv.notify_all();
+  lock.unlock();
+  m_peer_replayer.inc_total_bytes_files(std::string(m_dir_root), entry_bytes);
 }
 
 bool PeerReplayer::SyncMechanism::pop_dataq_entry(SyncEntry &out_entry) {
@@ -1402,6 +1409,12 @@ bool PeerReplayer::SyncMechanism::pop_dataq_entry(SyncEntry &out_entry) {
     return false; // no more work
   }
 
+  if (!m_datasync_queue_wait_reported && m_first_dataq_push_time) {
+    sec_duration dq_wait{0};
+    dq_wait = sec_duration(clock::now() - *m_first_dataq_push_time);
+    m_peer_replayer.set_datasync_queue_wait_duration(std::string(m_dir_root), dq_wait.count());
+    m_datasync_queue_wait_reported = true;
+  }
   out_entry = std::move(m_sync_dataq.front());
   m_sync_dataq.pop();
   dout(10) << ": snapshot data replayer dataq popped" << " syncm=" << this
@@ -2749,6 +2762,22 @@ void PeerReplayer::peer_status(Formatter *f) {
         f->dump_string("duration", format_time(crawl_duration_till_now.count()));
       }
       f->close_section(); //crawl
+      if (sync_stat.datasync_queue_wait_duration ||
+          sync_stat.datasync_queue_wait_start_time) {
+        f->open_object_section("datasync_queue_wait");
+        if (sync_stat.datasync_queue_wait_duration) {
+          f->dump_string("state", "completed");
+          f->dump_string("duration",
+                         format_time(*sync_stat.datasync_queue_wait_duration));
+        } else {
+          f->dump_string("state", "waiting");
+          auto cur_time = clock::now();
+          sec_duration dq_wait{0};
+          dq_wait = sec_duration(cur_time - *sync_stat.datasync_queue_wait_start_time);
+          f->dump_string("duration", format_time(dq_wait.count()));
+        }
+        f->close_section();
+      }
       f->open_object_section("bytes");
       f->dump_string("sync_bytes", format_bytes(sync_stat.sync_bytes));
       f->dump_string("total_bytes", format_bytes(sync_stat.total_bytes));
@@ -2783,6 +2812,10 @@ void PeerReplayer::peer_status(Formatter *f) {
       if (sync_stat.last_sync_crawl_duration) {
         f->dump_string("crawl_duration", format_time(*sync_stat.last_sync_crawl_duration));
       }
+      if (sync_stat.last_sync_datasync_queue_wait_duration) {
+        f->dump_string("datasync_queue_wait_duration",
+                       format_time(*sync_stat.last_sync_datasync_queue_wait_duration));
+      }
       if (sync_stat.last_sync_duration) {
         f->dump_string("sync_duration", format_time(*sync_stat.last_sync_duration));
         f->dump_stream("sync_time_stamp") << sync_stat.last_synced;
index 9cf3df0715048660e86919655dde1c11df361702..6bfc84a5835e47bda855f50be796061cb6bf7a8c 100644 (file)
@@ -319,6 +319,8 @@ private:
     bool m_datasync_error = false;
     int m_datasync_errno = 0;
     bool m_backoff = false;
+    boost::optional<monotime> m_first_dataq_push_time;
+    bool m_datasync_queue_wait_reported = false;
   };
 
   class RemoteSync : public SyncMechanism {
@@ -385,6 +387,7 @@ private:
     monotime last_synced = clock::zero();
     boost::optional<double> last_sync_duration;
     boost::optional<double> last_sync_crawl_duration;
+    boost::optional<double> last_sync_datasync_queue_wait_duration;
     boost::optional<uint64_t> last_sync_bytes; //last sync bytes for display in status
     boost::optional<uint64_t> last_sync_files; //last num of sync files for display in status
     uint64_t sync_bytes = 0; //sync bytes counter, independently for each directory sync.
@@ -407,6 +410,8 @@ private:
     uint64_t discovered_delta_bytes = 0; //discovered delta bytes counter, independently for each directory sync.
     uint64_t blockdiff_sync_bytes = 0; //actual bytes synced using SnapDiff/blockdiff counter
     double blockdiff_time_sec = 0.0; //actual sync time using SnapDiff/blockdiff counter
+    boost::optional<double> datasync_queue_wait_duration; // first data_q push to first pop (final)
+    boost::optional<monotime> datasync_queue_wait_start_time; // until first pop; for in-progress display
   };
 
   void _inc_failed_count(const std::string &dir_root) {
@@ -455,6 +460,8 @@ private:
     sync_stat.discovered_delta_bytes = 0;
     sync_stat.blockdiff_sync_bytes = 0;
     sync_stat.blockdiff_time_sec = 0.0;
+    sync_stat.datasync_queue_wait_duration = boost::none;
+    sync_stat.datasync_queue_wait_start_time = boost::none;
   }
   void _set_last_synced_snap(const std::string &dir_root, uint64_t snap_id,
                             const std::string &snap_name) {
@@ -497,6 +504,9 @@ private:
     sync_stat.last_synced = clock::now();
     sync_stat.last_sync_duration = duration;
     sync_stat.last_sync_crawl_duration = sync_stat.crawl_duration;
+    //For empty snapshot sync, datasync_queue_wait_duration is 0
+    sync_stat.last_sync_datasync_queue_wait_duration =
+      sync_stat.datasync_queue_wait_duration.value_or(0.0);
     sync_stat.last_sync_bytes = sync_stat.sync_bytes;
     sync_stat.last_sync_files = sync_stat.sync_files;
     ++sync_stat.synced_snap_count;
@@ -513,6 +523,21 @@ private:
     sync_stat.crawl_finished = state;
     sync_stat.crawl_duration = seconds;
   }
+  void set_datasync_queue_wait_start_time(const std::string &dir_root, monotime t) {
+    std::scoped_lock locker(m_lock);
+    auto &sync_stat = m_snap_sync_stats.at(dir_root);
+    if (!sync_stat.datasync_queue_wait_start_time && !sync_stat.datasync_queue_wait_duration) {
+      sync_stat.datasync_queue_wait_start_time = t;
+    }
+  }
+  void set_datasync_queue_wait_duration(const std::string &dir_root, double seconds) {
+    std::scoped_lock locker(m_lock);
+    auto &sync_stat = m_snap_sync_stats.at(dir_root);
+    if (!sync_stat.datasync_queue_wait_duration) {
+      sync_stat.datasync_queue_wait_duration = seconds;
+      sync_stat.datasync_queue_wait_start_time = boost::none;
+    }
+  }
   void set_blockdiff_metrics(const std::string &dir_root, const uint64_t bd_syncbytes, const double bd_time) {
     std::scoped_lock locker(m_lock);
     auto &sync_stat = m_snap_sync_stats.at(dir_root);