]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
tools/cephfs_mirror: Expose per-directory snap metrics via perf counters
author Kotresh HR <khiremat@redhat.com>
Tue, 16 Jun 2026 17:22:21 +0000 (22:52 +0530)
committer Kotresh HR <khiremat@redhat.com>
Tue, 16 Jun 2026 16:47:23 +0000 (22:17 +0530)
Introduce a new labeled perf counter group, cephfs_mirror_directory, so
per-directory snapshot mirror progress can be scraped via "counter dump" and
exported to Prometheus by ceph-exporter (e.g.
ceph_cephfs_mirror_directory_current_sync_bytes).

Design
------

* One PerfCounters instance per mirrored directory on a peer, keyed in
  m_directory_perf_counters and registered on the daemon-wide
  PerfCountersCollection.
* Labels on each instance (flat counter dump array entries):
    - source_fscid, source_filesystem
    - peer_uuid, peer_cluster_name, peer_cluster_filesystem
    - directory (dir_root, e.g. "/parent/d1")
  The peer_uuid label disambiguates the same directory path mirrored to
  different peers.
* Counters are created in init() and add_directory(), removed in
  remove_directory() and the PeerReplayer destructor.
* Priority follows cephfs_mirror_perf_stats_prio (same as
  cephfs_mirror_peers).

Update path
-----------

Live / current_syncing_snap gauges are refreshed from
update_directory_current_sync_perf_counters(), called by
refresh_directory_current_sync_perf_counters() from the per-peer tick
thread (run_tick()). Each cephfs_mirror_tick_interval seconds (default 5)
the tick thread updates counters for each registered (actively syncing)
directory.

Counters registered (schema)
----------------------------

All of the following are added to the builder in this commit. Only dir_state
and the "current sync" counters in the section tagged "[Updated in this
commit]" are written here; the last_synced_snap and per-directory snap summary
counters are registered now so that a follow-up commit can update them when
stats change.

Directory state
  dir_state (gauge u64)
    0 = idle, 1 = syncing, 2 = failed
    Maps peer_status top-level "state" (numeric; no string values).

Current syncing snapshot (peer_status "current_syncing_snap")
  [Updated in this commit]
  current_snap_id          - snapshot id being synchronized
  current_sync_mode        - 0 = full, 1 = delta (snapdiff)
  current_read_bps         - bytes/sec read (raw, not formatted)
  current_write_bps        - bytes/sec written
  crawl_state              - 0 = N/A, 1 = in-progress, 2 = completed
  crawl_duration_seconds   - crawl duration; in-progress uses now - start
  datasync_wait_state      - 0 = none, 1 = waiting, 2 = complete
  datasync_wait_duration_seconds
  current_sync_bytes       - bytes synced so far for this snap
  current_total_bytes      - total bytes for this snap
  current_sync_bytes_percent - basis points (1745 = 17.45%)
  current_sync_files
  current_total_files
  current_sync_files_percent - basis points
  current_eta_valid        - 0 = calculating, 1 = ETA available
  current_eta_seconds      - ETA in seconds when valid

Per-directory snapshot summary (peer_status snaps_*)
  [Registered only; not updated in this commit]
  snaps_synced, snaps_deleted, snaps_renamed

Last synced snapshot (peer_status "last_synced_snap")
  [Registered only; not updated in this commit]
  last_snap_id
  last_crawl_duration_seconds
  last_datasync_wait_duration_seconds
  last_sync_duration_seconds
  last_sync_timestamp      - utime_t / seconds since epoch
  last_sync_bytes
  last_sync_files

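When a directory is idle or failed, every counter in the "Current syncing
snapshot" group (the current_* counters as well as crawl_* and
datasync_wait_*) is reset to zero and dir_state is set to 0 or 2 respectively.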

Fixes: https://tracker.ceph.com/issues/73457
Signed-off-by: Kotresh HR <khiremat@redhat.com>
src/tools/cephfs_mirror/PeerReplayer.cc
src/tools/cephfs_mirror/PeerReplayer.h

index 4bd4681f0e06e098ba4d5004db0b22d9b0343c3a..d532b6a3adfca4833c9699273d7f4761e7213dde 100644 (file)
@@ -48,6 +48,38 @@ enum {
   l_cephfs_mirror_peer_replayer_last,
 };
 
+// Perf counter indices of the per-directory "cephfs_mirror_directory"
+// labeled counter group. The _first/_last sentinels bound the index range
+// handed to PerfCountersBuilder; every index in between is registered by
+// PeerReplayer::create_directory_perf_counters().
+enum {
+  l_cephfs_mirror_directory_first = 7000,
+  l_cephfs_mirror_directory_dir_state,
+  l_cephfs_mirror_directory_current_snap_id,
+  l_cephfs_mirror_directory_current_sync_mode,
+  l_cephfs_mirror_directory_current_read_bps,
+  l_cephfs_mirror_directory_current_write_bps,
+  l_cephfs_mirror_directory_crawl_state,
+  l_cephfs_mirror_directory_crawl_duration_seconds,
+  l_cephfs_mirror_directory_datasync_wait_state,
+  l_cephfs_mirror_directory_datasync_wait_duration_seconds,
+  l_cephfs_mirror_directory_current_sync_bytes,
+  l_cephfs_mirror_directory_current_total_bytes,
+  l_cephfs_mirror_directory_current_sync_bytes_percent,
+  l_cephfs_mirror_directory_current_sync_files,
+  l_cephfs_mirror_directory_current_total_files,
+  l_cephfs_mirror_directory_current_sync_files_percent,
+  l_cephfs_mirror_directory_current_eta_valid,
+  l_cephfs_mirror_directory_current_eta_seconds,
+  l_cephfs_mirror_directory_snaps_synced,
+  l_cephfs_mirror_directory_snaps_deleted,
+  l_cephfs_mirror_directory_snaps_renamed,
+  l_cephfs_mirror_directory_last_snap_id,
+  l_cephfs_mirror_directory_last_crawl_duration_seconds,
+  l_cephfs_mirror_directory_last_datasync_wait_duration_seconds,
+  l_cephfs_mirror_directory_last_sync_duration_seconds,
+  l_cephfs_mirror_directory_last_sync_timestamp,
+  l_cephfs_mirror_directory_last_sync_bytes,
+  l_cephfs_mirror_directory_last_sync_files,
+  l_cephfs_mirror_directory_last,
+};
+
 namespace cephfs {
 namespace mirror {
 
@@ -221,6 +253,11 @@ PeerReplayer::PeerReplayer(CephContext *cct, FSMirror *fs_mirror,
 
 PeerReplayer::~PeerReplayer() {
   delete m_asok_hook;
+  for (auto &[dir_root, perf] : m_directory_perf_counters) {
+    m_cct->get_perfcounters_collection()->remove(perf);
+    delete perf;
+  }
+  m_directory_perf_counters.clear();
   PerfCounters *perf_counters = nullptr;
   std::swap(perf_counters, m_perf_counters);
   if (perf_counters != nullptr) {
@@ -229,11 +266,234 @@ PeerReplayer::~PeerReplayer() {
   }
 }
 
+// Express num/den in basis points (1/100 of a percent, so 1745 == 17.45%).
+//
+// @param num  numerator, e.g. bytes or files synced so far
+// @param den  denominator, e.g. total bytes or files
+// @return     num * 10000 / den truncated towards zero; 0 when den is 0;
+//             UINT64_MAX when the quotient does not fit in a uint64_t
+//
+// File-local helper: static keeps it out of the namespace's external symbols.
+static uint64_t percent_basis_points(uint64_t num, uint64_t den) {
+  if (den == 0) {
+    return 0;
+  }
+  const double basis_points =
+    (static_cast<double>(num) * 10000.0) / static_cast<double>(den);
+  // 2^64 as a double. Converting a double at or above this value to uint64_t
+  // is undefined behaviour, so saturate instead.
+  constexpr double uint64_range = 18446744073709551616.0;
+  if (basis_points >= uint64_range) {
+    return UINT64_MAX;
+  }
+  return static_cast<uint64_t>(basis_points);
+}
+
+// Build the labeled "cephfs_mirror_directory" perf counters for dir_root,
+// add them to the CephContext's perf counter collection and remember them in
+// m_directory_perf_counters. Asserts that dir_root has no counters yet.
+void PeerReplayer::create_directory_perf_counters(const std::string &dir_root) {
+  ceph_assert(m_directory_perf_counters.count(dir_root) == 0);
+
+  const std::string counter_key = ceph::perf_counters::key_create(
+    "cephfs_mirror_directory", {
+      {"source_fscid", stringify(m_filesystem.fscid)},
+      {"source_filesystem", m_filesystem.fs_name},
+      {"peer_uuid", m_peer.uuid},
+      {"peer_cluster_name", m_peer.remote.cluster_name},
+      {"peer_cluster_filesystem", m_peer.remote.fs_name},
+      {"directory", dir_root},
+    });
+  PerfCountersBuilder builder(m_cct, counter_key,
+                              l_cephfs_mirror_directory_first,
+                              l_cephfs_mirror_directory_last);
+  const auto prio =
+    m_cct->_conf.get_val<int64_t>("cephfs_mirror_perf_stats_prio");
+
+  // every u64 counter shares the same priority, so register through a helper
+  auto add_gauge = [&builder, prio](int idx, const char *name,
+                                    const char *description,
+                                    const char *nick) {
+    builder.add_u64(idx, name, description, nick, prio);
+  };
+
+  // directory state
+  add_gauge(l_cephfs_mirror_directory_dir_state,
+            "dir_state", "Directory mirror state", "dste");
+
+  // snapshot currently being synchronized
+  add_gauge(l_cephfs_mirror_directory_current_snap_id,
+            "current_snap_id", "Current syncing snapshot id", "csid");
+  add_gauge(l_cephfs_mirror_directory_current_sync_mode,
+            "current_sync_mode", "Current sync mode", "csmd");
+  add_gauge(l_cephfs_mirror_directory_current_read_bps,
+            "current_read_bps", "Current read throughput bytes per second",
+            "crbp");
+  add_gauge(l_cephfs_mirror_directory_current_write_bps,
+            "current_write_bps", "Current write throughput bytes per second",
+            "cwbp");
+  add_gauge(l_cephfs_mirror_directory_crawl_state,
+            "crawl_state", "Current crawl state", "crst");
+  add_gauge(l_cephfs_mirror_directory_crawl_duration_seconds,
+            "crawl_duration_seconds", "Current crawl duration seconds",
+            "crdn");
+  add_gauge(l_cephfs_mirror_directory_datasync_wait_state,
+            "datasync_wait_state", "Current datasync queue wait state",
+            "dwst");
+  add_gauge(l_cephfs_mirror_directory_datasync_wait_duration_seconds,
+            "datasync_wait_duration_seconds",
+            "Current datasync queue wait duration seconds", "dwdn");
+  add_gauge(l_cephfs_mirror_directory_current_sync_bytes,
+            "current_sync_bytes", "Current sync bytes", "csby");
+  add_gauge(l_cephfs_mirror_directory_current_total_bytes,
+            "current_total_bytes", "Current total bytes", "ctby");
+  add_gauge(l_cephfs_mirror_directory_current_sync_bytes_percent,
+            "current_sync_bytes_percent",
+            "Current sync bytes percent in basis points", "csbp");
+  add_gauge(l_cephfs_mirror_directory_current_sync_files,
+            "current_sync_files", "Current sync files", "csfl");
+  add_gauge(l_cephfs_mirror_directory_current_total_files,
+            "current_total_files", "Current total files", "ctfl");
+  add_gauge(l_cephfs_mirror_directory_current_sync_files_percent,
+            "current_sync_files_percent",
+            "Current sync files percent in basis points", "csfp");
+  add_gauge(l_cephfs_mirror_directory_current_eta_valid,
+            "current_eta_valid", "Current sync ETA validity", "cetv");
+  add_gauge(l_cephfs_mirror_directory_current_eta_seconds,
+            "current_eta_seconds", "Current sync ETA seconds", "cets");
+
+  // per-directory snapshot summary
+  add_gauge(l_cephfs_mirror_directory_snaps_synced,
+            "snaps_synced", "Snapshots synchronized", "ssnc");
+  add_gauge(l_cephfs_mirror_directory_snaps_deleted,
+            "snaps_deleted", "Snapshots deleted", "sdel");
+  add_gauge(l_cephfs_mirror_directory_snaps_renamed,
+            "snaps_renamed", "Snapshots renamed", "sren");
+
+  // last synced snapshot
+  add_gauge(l_cephfs_mirror_directory_last_snap_id,
+            "last_snap_id", "Last synced snapshot id", "lsid");
+  add_gauge(l_cephfs_mirror_directory_last_crawl_duration_seconds,
+            "last_crawl_duration_seconds",
+            "Last synced snapshot crawl duration seconds", "lcdn");
+  add_gauge(l_cephfs_mirror_directory_last_datasync_wait_duration_seconds,
+            "last_datasync_wait_duration_seconds",
+            "Last synced snapshot datasync queue wait duration seconds",
+            "ldwd");
+  add_gauge(l_cephfs_mirror_directory_last_sync_duration_seconds,
+            "last_sync_duration_seconds",
+            "Last synced snapshot duration seconds", "lsdn");
+  // the only time-typed counter, hence not routed through add_gauge
+  builder.add_time(l_cephfs_mirror_directory_last_sync_timestamp,
+                   "last_sync_timestamp", "Last synced snapshot timestamp",
+                   "lsts", prio);
+  add_gauge(l_cephfs_mirror_directory_last_sync_bytes,
+            "last_sync_bytes", "Last synced snapshot bytes", "lsby");
+  add_gauge(l_cephfs_mirror_directory_last_sync_files,
+            "last_sync_files", "Last synced snapshot files", "lsfl");
+
+  PerfCounters *dir_perf = builder.create_perf_counters();
+  m_cct->get_perfcounters_collection()->add(dir_perf);
+  m_directory_perf_counters.emplace(dir_root, dir_perf);
+}
+
+// Drop the perf counters of dir_root: take them out of the CephContext's
+// collection, free them and forget the map entry. Does nothing when dir_root
+// has no counters.
+void PeerReplayer::remove_directory_perf_counters(const std::string &dir_root) {
+  if (auto entry = m_directory_perf_counters.find(dir_root);
+      entry != m_directory_perf_counters.end()) {
+    PerfCounters *dir_perf = entry->second;
+    m_cct->get_perfcounters_collection()->remove(dir_perf);
+    delete dir_perf;
+    m_directory_perf_counters.erase(entry);
+  }
+}
+
+// Look up the perf counters registered for dir_root; nullptr when there are
+// none.
+PerfCounters *PeerReplayer::find_directory_perf_counters(const std::string &dir_root) {
+  const auto entry = m_directory_perf_counters.find(dir_root);
+  return entry != m_directory_perf_counters.end() ? entry->second : nullptr;
+}
+
+// Publish dir_state and the "current sync" gauges of one directory from its
+// SnapSyncStat.
+//
+// @param perf       counters of the directory; nullptr makes this a no-op
+// @param sync_stat  sync statistics the gauges are derived from
+//
+// dir_state is set to 2 when sync_stat.failed is set, to 0 when no snapshot
+// is currently being synced, and to 1 otherwise. In the first two cases all
+// "current sync" gauges are reset to zero.
+void PeerReplayer::update_directory_current_sync_perf_counters(
+    PerfCounters *perf, const SnapSyncStat &sync_stat) {
+  if (!perf) {
+    return;
+  }
+
+  // the pointer is captured by value; nothing here needs the parameter itself
+  auto clear_current = [perf]() {
+    perf->set(l_cephfs_mirror_directory_current_snap_id, 0);
+    perf->set(l_cephfs_mirror_directory_current_sync_mode, 0);
+    perf->set(l_cephfs_mirror_directory_current_read_bps, 0);
+    perf->set(l_cephfs_mirror_directory_current_write_bps, 0);
+    perf->set(l_cephfs_mirror_directory_crawl_state, 0);
+    perf->set(l_cephfs_mirror_directory_crawl_duration_seconds, 0);
+    perf->set(l_cephfs_mirror_directory_datasync_wait_state, 0);
+    perf->set(l_cephfs_mirror_directory_datasync_wait_duration_seconds, 0);
+    perf->set(l_cephfs_mirror_directory_current_sync_bytes, 0);
+    perf->set(l_cephfs_mirror_directory_current_total_bytes, 0);
+    perf->set(l_cephfs_mirror_directory_current_sync_bytes_percent, 0);
+    perf->set(l_cephfs_mirror_directory_current_sync_files, 0);
+    perf->set(l_cephfs_mirror_directory_current_total_files, 0);
+    perf->set(l_cephfs_mirror_directory_current_sync_files_percent, 0);
+    perf->set(l_cephfs_mirror_directory_current_eta_valid, 0);
+    perf->set(l_cephfs_mirror_directory_current_eta_seconds, 0);
+  };
+
+  // failed (2) takes precedence over idle (0); neither has a snapshot in
+  // flight, so the "current sync" gauges are cleared
+  if (sync_stat.failed || !sync_stat.current_syncing_snap) {
+    perf->set(l_cephfs_mirror_directory_dir_state, sync_stat.failed ? 2 : 0);
+    clear_current();
+    return;
+  }
+
+  // syncing
+  perf->set(l_cephfs_mirror_directory_dir_state, 1);
+  perf->set(l_cephfs_mirror_directory_current_snap_id,
+            sync_stat.current_syncing_snap->first);
+  // 0 = full, 1 = delta (snapdiff)
+  perf->set(l_cephfs_mirror_directory_current_sync_mode,
+            sync_stat.snapdiff ? 1 : 0);
+
+  // throughput is reported as 0 until some read/write time has accumulated,
+  // which also avoids dividing by zero
+  double read_bps = sync_stat.read_time_sec > 0 ?
+      sync_stat.bytes_read / sync_stat.read_time_sec : 0;
+  double write_bps = sync_stat.write_time_sec > 0 ?
+      sync_stat.bytes_written / sync_stat.write_time_sec : 0;
+  perf->set(l_cephfs_mirror_directory_current_read_bps,
+            static_cast<uint64_t>(read_bps));
+  perf->set(l_cephfs_mirror_directory_current_write_bps,
+            static_cast<uint64_t>(write_bps));
+
+  if (sync_stat.crawl_finished) {
+    // 2 = completed: report the recorded duration
+    perf->set(l_cephfs_mirror_directory_crawl_state, 2);
+    perf->set(l_cephfs_mirror_directory_crawl_duration_seconds,
+              static_cast<uint64_t>(sync_stat.crawl_duration));
+  } else {
+    // 1 = in progress: report the time elapsed since the crawl started
+    perf->set(l_cephfs_mirror_directory_crawl_state, 1);
+    auto cur_time = clock::now();
+    sec_duration crawl_duration =
+      sec_duration(cur_time - sync_stat.crawl_start_time);
+    perf->set(l_cephfs_mirror_directory_crawl_duration_seconds,
+              static_cast<uint64_t>(crawl_duration.count()));
+  }
+
+  if (sync_stat.datasync_queue_wait_duration) {
+    // 2 = wait complete: a final duration has been recorded
+    perf->set(l_cephfs_mirror_directory_datasync_wait_state, 2);
+    perf->set(l_cephfs_mirror_directory_datasync_wait_duration_seconds,
+              static_cast<uint64_t>(*sync_stat.datasync_queue_wait_duration));
+  } else if (sync_stat.datasync_queue_wait_start_time) {
+    // 1 = still waiting: report the time elapsed since the wait started
+    perf->set(l_cephfs_mirror_directory_datasync_wait_state, 1);
+    auto cur_time = clock::now();
+    sec_duration dq_wait =
+      sec_duration(cur_time - *sync_stat.datasync_queue_wait_start_time);
+    perf->set(l_cephfs_mirror_directory_datasync_wait_duration_seconds,
+              static_cast<uint64_t>(dq_wait.count()));
+  } else {
+    // 0 = no wait recorded
+    perf->set(l_cephfs_mirror_directory_datasync_wait_state, 0);
+    perf->set(l_cephfs_mirror_directory_datasync_wait_duration_seconds, 0);
+  }
+
+  perf->set(l_cephfs_mirror_directory_current_sync_bytes, sync_stat.sync_bytes);
+  perf->set(l_cephfs_mirror_directory_current_total_bytes, sync_stat.total_bytes);
+  perf->set(l_cephfs_mirror_directory_current_sync_bytes_percent,
+            percent_basis_points(sync_stat.sync_bytes, sync_stat.total_bytes));
+  perf->set(l_cephfs_mirror_directory_current_sync_files, sync_stat.sync_files);
+  perf->set(l_cephfs_mirror_directory_current_total_files, sync_stat.total_files);
+  perf->set(l_cephfs_mirror_directory_current_sync_files_percent,
+            percent_basis_points(sync_stat.sync_files, sync_stat.total_files));
+
+  // compute_eta() is handed a copy because sync_stat is const here
+  // NOTE(review): presumably compute_eta() takes a non-const reference - confirm
+  SnapSyncStat stat_for_eta = sync_stat;
+  const double eta = compute_eta(stat_for_eta);
+  // Anything that is not >= 0 means "no ETA yet". This covers the -1.0
+  // sentinel without an exact floating-point comparison and keeps negative or
+  // NaN values away from the double -> uint64_t cast below, which is
+  // undefined for them.
+  if (!(eta >= 0.0)) {
+    perf->set(l_cephfs_mirror_directory_current_eta_valid, 0);
+    perf->set(l_cephfs_mirror_directory_current_eta_seconds, 0);
+  } else {
+    perf->set(l_cephfs_mirror_directory_current_eta_valid, 1);
+    perf->set(l_cephfs_mirror_directory_current_eta_seconds,
+              static_cast<uint64_t>(eta));
+  }
+}
+
+// Re-publish the "current sync" gauges of dir_root from its entry in
+// m_snap_sync_stats. Does nothing when dir_root has no sync stats.
+void PeerReplayer::refresh_directory_current_sync_perf_counters(
+    const std::string &dir_root) {
+  // caller must hold m_lock
+  if (auto stat_entry = m_snap_sync_stats.find(dir_root);
+      stat_entry != m_snap_sync_stats.end()) {
+    // a missing counter set yields nullptr, which the update treats as a no-op
+    update_directory_current_sync_perf_counters(
+      find_directory_perf_counters(dir_root), stat_entry->second);
+  }
+}
+
 int PeerReplayer::init() {
   dout(20) << ": initial dir list=[" << m_directories << "]" << dendl;
   for (auto &dir_root : m_directories) {
     m_snap_sync_stats.emplace(dir_root, SnapSyncStat());
   }
+  for (auto &dir_root : m_directories) {
+    create_directory_perf_counters(dir_root);
+  }
 
   auto &remote_client = m_peer.remote.client_name;
   auto &remote_cluster = m_peer.remote.cluster_name;
@@ -378,6 +638,10 @@ void PeerReplayer::add_directory(string_view dir_root) {
   }
   m_directories.emplace_back(_dir_root);
   m_snap_sync_stats.emplace(_dir_root, SnapSyncStat());
+  if (m_directory_perf_counters.find(_dir_root) ==
+      m_directory_perf_counters.end()) {
+    create_directory_perf_counters(_dir_root);
+  }
   m_cond.notify_all();
 }
 
@@ -393,6 +657,7 @@ void PeerReplayer::remove_directory(string_view dir_root) {
 
   auto it1 = m_registered.find(_dir_root);
   if (it1 == m_registered.end()) {
+    remove_directory_perf_counters(_dir_root);
     m_snap_sync_stats.erase(_dir_root);
   } else {
     it1->second.canceled = true;
@@ -460,6 +725,7 @@ void PeerReplayer::unregister_directory(const std::string &dir_root) {
   unlock_directory(it->first, it->second);
   m_registered.erase(it);
   if (std::find(m_directories.begin(), m_directories.end(), dir_root) == m_directories.end()) {
+    remove_directory_perf_counters(dir_root);
     m_snap_sync_stats.erase(dir_root);
   }
 }
@@ -2359,6 +2625,11 @@ void PeerReplayer::run_tick() {
       dout(5) << ": shutting down exiting" << dendl;
       break;
     }
+
+    // refresh current-sync perf counters for registered directories
+    for (const auto &kv : m_registered) {
+      refresh_directory_current_sync_perf_counters(kv.first);
+    }
   }
 }
 
index e91c5b495b291ce1b41b73f12ce2a9f49af18968..38e54516bdd55a18463464c56496c28877561ba1 100644 (file)
@@ -665,6 +665,14 @@ private:
   ServiceDaemonStats m_service_daemon_stats;
 
   PerfCounters *m_perf_counters;
+  // Per-directory labeled perf counters, keyed by dir_root. Entries are
+  // created by create_directory_perf_counters() and freed by
+  // remove_directory_perf_counters() or the destructor.
+  std::map<std::string, PerfCounters *> m_directory_perf_counters;
+
+  // Build, register and track the perf counters of dir_root; asserts that
+  // none exist for it yet.
+  void create_directory_perf_counters(const std::string &dir_root);
+  // Unregister and free the perf counters of dir_root; no-op if absent.
+  void remove_directory_perf_counters(const std::string &dir_root);
+  // Return the perf counters of dir_root, or nullptr if absent.
+  PerfCounters *find_directory_perf_counters(const std::string &dir_root);
+  // Write dir_state and the "current sync" gauges of perf from sync_stat;
+  // no-op when perf is nullptr.
+  void update_directory_current_sync_perf_counters(PerfCounters *perf,
+                                                   const SnapSyncStat &sync_stat);
+  // Refresh the "current sync" gauges of dir_root from m_snap_sync_stats;
+  // the caller must hold m_lock.
+  void refresh_directory_current_sync_perf_counters(const std::string &dir_root);
 
   void run(SnapshotReplayerThread *replayer);
   void run_datasync(SnapshotDataSyncThread *data_replayer);