From 658ee6c40116e07f45085ad0df7ef33d3c8ed78e Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Feb 2024 07:28:58 -0500 Subject: [PATCH] cephfs_mirror: add labeled replication performance metrics Fixes: http://tracker.ceph.com/issues/63945 Signed-off-by: Jos Collin Signed-off-by: Venky Shankar --- src/common/options/cephfs-mirror.yaml.in | 13 ++++- src/pybind/mgr/mgr_module.py | 2 +- src/tools/cephfs_mirror/FSMirror.cc | 40 +++++++++++++++ src/tools/cephfs_mirror/FSMirror.h | 2 + src/tools/cephfs_mirror/Mirror.cc | 44 ++++++++++++++++ src/tools/cephfs_mirror/Mirror.h | 2 + src/tools/cephfs_mirror/PeerReplayer.cc | 65 ++++++++++++++++++++++++ src/tools/cephfs_mirror/PeerReplayer.h | 2 + 8 files changed, 168 insertions(+), 2 deletions(-) diff --git a/src/common/options/cephfs-mirror.yaml.in b/src/common/options/cephfs-mirror.yaml.in index 78f86dfb1a762..f826161872b88 100644 --- a/src/common/options/cephfs-mirror.yaml.in +++ b/src/common/options/cephfs-mirror.yaml.in @@ -91,4 +91,15 @@ options: default: 10 services: - cephfs-mirror - min: 0 \ No newline at end of file + min: 0 +- name: cephfs_mirror_perf_stats_prio + type: int + level: advanced + desc: Priority level for mirror daemon replication perf counters + long_desc: The daemon will send perf counter data to the manager daemon if the priority + is not lower than mgr_stats_threshold. 
+ default: 5 + services: + - cephfs-mirror + min: 0 + max: 11 diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 28fd69738eb9a..13cf9386bcdb5 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -2074,7 +2074,7 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin): @profile_method() def get_unlabeled_perf_counters(self, prio_limit: int = PRIO_USEFUL, services: Sequence[str] = ("mds", "mon", "osd", - "rbd-mirror", "rgw", + "rbd-mirror", "cephfs-mirror", "rgw", "tcmu-runner")) -> Dict[str, dict]: """ Return the perf counters currently known to this ceph-mgr diff --git a/src/tools/cephfs_mirror/FSMirror.cc b/src/tools/cephfs_mirror/FSMirror.cc index 7ea798e6bec2b..3d5bf2d1c7242 100644 --- a/src/tools/cephfs_mirror/FSMirror.cc +++ b/src/tools/cephfs_mirror/FSMirror.cc @@ -8,6 +8,8 @@ #include "common/debug.h" #include "common/errno.h" #include "common/WorkQueue.h" +#include "common/perf_counters.h" +#include "common/perf_counters_key.h" #include "include/stringify.h" #include "msg/Messenger.h" #include "FSMirror.h" @@ -25,6 +27,14 @@ using namespace std; +// Performance Counters +enum { + l_cephfs_mirror_fs_mirror_first = 5000, + l_cephfs_mirror_fs_mirror_peers, + l_cephfs_mirror_fs_mirror_dir_count, + l_cephfs_mirror_fs_mirror_last, +}; + namespace cephfs { namespace mirror { @@ -107,6 +117,18 @@ FSMirror::FSMirror(CephContext *cct, const Filesystem &filesystem, uint64_t pool m_asok_hook(new MirrorAdminSocketHook(cct, filesystem, this)) { m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY, (uint64_t)0); + + std::string labels = ceph::perf_counters::key_create("cephfs_mirror_mirrored_filesystems", + {{"filesystem", m_filesystem.fs_name}}); + PerfCountersBuilder plb(m_cct, labels, l_cephfs_mirror_fs_mirror_first, + l_cephfs_mirror_fs_mirror_last); + auto prio = m_cct->_conf.get_val<int64_t>("cephfs_mirror_perf_stats_prio");
plb.add_u64(l_cephfs_mirror_fs_mirror_peers, + "mirroring_peers", "Mirroring Peers", "mpee", prio); + plb.add_u64(l_cephfs_mirror_fs_mirror_dir_count, + "directory_count", "Directory Count", "dirc", prio); + m_perf_counters = plb.create_perf_counters(); + m_cct->get_perfcounters_collection()->add(m_perf_counters); } FSMirror::~FSMirror() { @@ -120,6 +142,12 @@ FSMirror::~FSMirror() { // outside the lock so that in-progress commands can acquire // lock and finish executing. delete m_asok_hook; + PerfCounters *perf_counters = nullptr; + std::swap(perf_counters, m_perf_counters); + if (perf_counters != nullptr) { + m_cct->get_perfcounters_collection()->remove(perf_counters); + delete perf_counters; + } } int FSMirror::init_replayer(PeerReplayer *peer_replayer) { @@ -355,6 +383,9 @@ void FSMirror::handle_acquire_directory(string_view dir_path) { peer_replayer->add_directory(dir_path); } } + if (m_perf_counters) { + m_perf_counters->set(l_cephfs_mirror_fs_mirror_dir_count, m_directories.size()); + } } void FSMirror::handle_release_directory(string_view dir_path) { @@ -372,6 +403,9 @@ void FSMirror::handle_release_directory(string_view dir_path) { peer_replayer->remove_directory(dir_path); } } + if (m_perf_counters) { + m_perf_counters->set(l_cephfs_mirror_fs_mirror_dir_count, m_directories.size()); + } } } @@ -395,6 +429,9 @@ void FSMirror::add_peer(const Peer &peer) { } m_peer_replayers.emplace(peer, std::move(replayer)); ceph_assert(m_peer_replayers.size() == 1); // support only a single peer + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_fs_mirror_peers); + } } void FSMirror::remove_peer(const Peer &peer) { @@ -415,6 +452,9 @@ void FSMirror::remove_peer(const Peer &peer) { dout(5) << ": shutting down replayers for peer=" << peer << dendl; shutdown_replayer(replayer.get()); } + if (m_perf_counters) { + m_perf_counters->dec(l_cephfs_mirror_fs_mirror_peers); + } } void FSMirror::mirror_status(Formatter *f) { diff --git 
a/src/tools/cephfs_mirror/FSMirror.h b/src/tools/cephfs_mirror/FSMirror.h index a9c1fab1025d9..75fca758520d8 100644 --- a/src/tools/cephfs_mirror/FSMirror.h +++ b/src/tools/cephfs_mirror/FSMirror.h @@ -154,6 +154,8 @@ private: MountRef m_mount; + PerfCounters *m_perf_counters = nullptr; + int init_replayer(PeerReplayer *peer_replayer); void shutdown_replayer(PeerReplayer *peer_replayer); void cleanup(); diff --git a/src/tools/cephfs_mirror/Mirror.cc b/src/tools/cephfs_mirror/Mirror.cc index cc811f0a1c7fb..0ad9f101e3710 100644 --- a/src/tools/cephfs_mirror/Mirror.cc +++ b/src/tools/cephfs_mirror/Mirror.cc @@ -9,6 +9,8 @@ #include "common/errno.h" #include "common/Timer.h" #include "common/WorkQueue.h" +#include "common/perf_counters.h" +#include "common/perf_counters_key.h" #include "include/types.h" #include "mon/MonClient.h" #include "msg/Messenger.h" @@ -20,6 +22,14 @@ #undef dout_prefix #define dout_prefix *_dout << "cephfs::mirror::Mirror " << __func__ +// Performance Counters +enum { + l_cephfs_mirror_first = 4000, + l_cephfs_mirror_file_systems_mirrorred, + l_cephfs_mirror_file_systems_mirror_enable_failures, + l_cephfs_mirror_last, +}; + namespace cephfs { namespace mirror { @@ -277,6 +287,17 @@ int Mirror::init(std::string &reason) { return r; } + std::string labels = ceph::perf_counters::key_create("cephfs_mirror"); + PerfCountersBuilder plb(m_cct, labels, l_cephfs_mirror_first, l_cephfs_mirror_last); + + auto prio = m_cct->_conf.get_val<int64_t>("cephfs_mirror_perf_stats_prio"); + plb.add_u64(l_cephfs_mirror_file_systems_mirrorred, + "mirrored_filesystems", "Filesystems mirrored", "mir", prio); + plb.add_u64_counter(l_cephfs_mirror_file_systems_mirror_enable_failures, + "mirror_enable_failures", "Mirroring enable failures", "mirf", prio); + m_perf_counters = plb.create_perf_counters(); + m_cct->get_perfcounters_collection()->add(m_perf_counters); + return 0; } @@ -285,6 +306,13 @@ void Mirror::shutdown() { m_stopping = true; m_cluster_watcher->shutdown();
m_cond.notify_all(); + + PerfCounters *perf_counters = nullptr; + std::swap(perf_counters, m_perf_counters); + if (perf_counters != nullptr) { + m_cct->get_perfcounters_collection()->remove(perf_counters); + delete perf_counters; + } } void Mirror::reopen_logs() { @@ -328,6 +356,9 @@ void Mirror::handle_enable_mirroring(const Filesystem &filesystem, m_service_daemon->add_or_update_fs_attribute(filesystem.fscid, SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY, true); + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_file_systems_mirror_enable_failures); + } return; } @@ -341,6 +372,9 @@ void Mirror::handle_enable_mirroring(const Filesystem &filesystem, } dout(10) << ": Initialized FSMirror for filesystem=" << filesystem << dendl; + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_file_systems_mirrorred); + } } void Mirror::handle_enable_mirroring(const Filesystem &filesystem, int r) { @@ -358,6 +392,9 @@ void Mirror::handle_enable_mirroring(const Filesystem &filesystem, int r) { m_service_daemon->add_or_update_fs_attribute(filesystem.fscid, SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY, true); + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_file_systems_mirror_enable_failures); + } return; } @@ -367,6 +404,9 @@ void Mirror::handle_enable_mirroring(const Filesystem &filesystem, int r) { m_cond.notify_all(); dout(10) << ": Initialized FSMirror for filesystem=" << filesystem << dendl; + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_file_systems_mirrorred); + } } void Mirror::enable_mirroring(const Filesystem &filesystem, uint64_t local_pool_id, @@ -422,6 +462,10 @@ void Mirror::handle_disable_mirroring(const Filesystem &filesystem, int r) { m_mirror_actions.erase(filesystem); } } + + if (m_perf_counters) { + m_perf_counters->dec(l_cephfs_mirror_file_systems_mirrorred); + } } void Mirror::disable_mirroring(const Filesystem &filesystem, Context *on_finish) { diff --git a/src/tools/cephfs_mirror/Mirror.h 
b/src/tools/cephfs_mirror/Mirror.h index 2081b5b53050f..5e37b5df4580b 100644 --- a/src/tools/cephfs_mirror/Mirror.h +++ b/src/tools/cephfs_mirror/Mirror.h @@ -104,6 +104,8 @@ private: RadosRef m_local; std::unique_ptr m_service_daemon; + PerfCounters *m_perf_counters = nullptr; + int init_mon_client(); // called via listener diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc index bd47046bb1218..6f3fb724a659e 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.cc +++ b/src/tools/cephfs_mirror/PeerReplayer.cc @@ -12,6 +12,8 @@ #include "common/ceph_context.h" #include "common/debug.h" #include "common/errno.h" +#include "common/perf_counters.h" +#include "common/perf_counters_key.h" #include "FSMirror.h" #include "PeerReplayer.h" #include "Utils.h" @@ -26,6 +28,18 @@ using namespace std; +// Performance Counters +enum { + l_cephfs_mirror_peer_replayer_first = 6000, + l_cephfs_mirror_peer_replayer_snaps_synced, + l_cephfs_mirror_peer_replayer_snaps_deleted, + l_cephfs_mirror_peer_replayer_snaps_renamed, + l_cephfs_mirror_peer_replayer_snap_sync_failures, + l_cephfs_mirror_peer_replayer_avg_sync_time, + l_cephfs_mirror_peer_replayer_sync_bytes, + l_cephfs_mirror_peer_replayer_last, +}; + namespace cephfs { namespace mirror { @@ -161,10 +175,39 @@ PeerReplayer::PeerReplayer(CephContext *cct, FSMirror *fs_mirror, SERVICE_DAEMON_FAILED_DIR_COUNT_KEY, (uint64_t)0); m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer, SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY, (uint64_t)0); + + std::string labels = ceph::perf_counters::key_create("cephfs_mirror_peers", + {{"source_fscid", stringify(m_filesystem.fscid)}, + {"source_filesystem", m_filesystem.fs_name}, + {"peer_cluster_name", m_peer.remote.cluster_name}, + {"peer_cluster_filesystem", m_peer.remote.fs_name}}); + PerfCountersBuilder plb(m_cct, labels, l_cephfs_mirror_peer_replayer_first, + l_cephfs_mirror_peer_replayer_last); + auto prio =
m_cct->_conf.get_val<int64_t>("cephfs_mirror_perf_stats_prio"); + plb.add_u64_counter(l_cephfs_mirror_peer_replayer_snaps_synced, + "snaps_synced", "Snapshots Synchronized", "sync", prio); + plb.add_u64_counter(l_cephfs_mirror_peer_replayer_snaps_deleted, + "snaps_deleted", "Snapshots Deleted", "del", prio); + plb.add_u64_counter(l_cephfs_mirror_peer_replayer_snaps_renamed, + "snaps_renamed", "Snapshots Renamed", "ren", prio); + plb.add_u64_counter(l_cephfs_mirror_peer_replayer_snap_sync_failures, + "sync_failures", "Snapshot Sync Failures", "fail", prio); + plb.add_time_avg(l_cephfs_mirror_peer_replayer_avg_sync_time, + "avg_sync_time", "Average Sync Time", "asyn", prio); + plb.add_u64_counter(l_cephfs_mirror_peer_replayer_sync_bytes, + "sync_bytes", "Sync Bytes", "sbye", prio); + m_perf_counters = plb.create_perf_counters(); + m_cct->get_perfcounters_collection()->add(m_perf_counters); } PeerReplayer::~PeerReplayer() { delete m_asok_hook; + PerfCounters *perf_counters = nullptr; + std::swap(perf_counters, m_perf_counters); + if (perf_counters != nullptr) { + m_cct->get_perfcounters_collection()->remove(perf_counters); + delete perf_counters; + } } int PeerReplayer::init() { @@ -516,6 +559,9 @@ int PeerReplayer::propagate_snap_deletes(const std::string &dir_root, return r; } inc_deleted_snap(dir_root); + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_peer_replayer_snaps_deleted); + } } return 0; @@ -539,6 +585,9 @@ int PeerReplayer::propagate_snap_renames( return r; } inc_renamed_snap(dir_root); + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_peer_replayer_snaps_renamed); + } } return 0; @@ -694,6 +743,9 @@ int PeerReplayer::remote_file_op(const std::string &dir_root, const std::string derr << ": failed to copy path=" << epath << ": " << cpp_strerror(r) << dendl; return r; } + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_peer_replayer_sync_bytes, stx.stx_size); + } } else if (S_ISLNK(stx.stx_mode)) { // free the remote link
before relinking r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), 0); @@ -1457,7 +1509,17 @@ int PeerReplayer::do_sync_snaps(const std::string &dir_root) { clear_current_syncing_snap(dir_root); return r; } + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_peer_replayer_snaps_synced); + } std::chrono::duration duration = clock::now() - start; + + utime_t d; + d.set_from_double(duration.count()); + if (m_perf_counters) { + m_perf_counters->tinc(l_cephfs_mirror_peer_replayer_avg_sync_time, d); + } + set_last_synced_stat(dir_root, it->first, it->second, duration.count()); if (--snaps_per_cycle == 0) { break; @@ -1481,6 +1543,9 @@ void PeerReplayer::sync_snaps(const std::string &dir_root, locker.lock(); if (r < 0) { _inc_failed_count(dir_root); + if (m_perf_counters) { + m_perf_counters->inc(l_cephfs_mirror_peer_replayer_snap_sync_failures); + } } else { _reset_failed_count(dir_root); } diff --git a/src/tools/cephfs_mirror/PeerReplayer.h b/src/tools/cephfs_mirror/PeerReplayer.h index 0511d154a759d..63e1bd9e8a777 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.h +++ b/src/tools/cephfs_mirror/PeerReplayer.h @@ -269,6 +269,8 @@ private: ServiceDaemonStats m_service_daemon_stats; + PerfCounters *m_perf_counters = nullptr; + void run(SnapshotReplayerThread *replayer); boost::optional pick_directory(); -- 2.39.5