From 3c9b07eb87e67027e9988c1587c07e27ed168657 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Juan=20Miguel=20Olmo=20Mart=C3=ADnez?=
Date: Mon, 7 Oct 2024 16:55:51 +0200
Subject: [PATCH] exporter: New metric to report Ceph daemon health
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The Ceph exporter now provides a metric that reports Ceph daemon
communication health using the admin socket.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2146728
https://tracker.ceph.com/issues/68428

Signed-off-by: Juan Miguel Olmo Martínez
---
 doc/monitoring/index.rst              |  24 ++++++
 src/exporter/DaemonMetricCollector.cc |   9 ++-
 src/exporter/DaemonMetricCollector.h  |   2 +-
 src/test/exporter/test_exporter.cc    | 110 +++++++++++++++++++++++++-
 4 files changed, 141 insertions(+), 4 deletions(-)

diff --git a/doc/monitoring/index.rst b/doc/monitoring/index.rst
index 794fdf8419505..afccd9ab16ac3 100644
--- a/doc/monitoring/index.rst
+++ b/doc/monitoring/index.rst
@@ -64,6 +64,30 @@ in:
 It is good to outline that the main tool allowing users to observe and monitor a Ceph cluster is the **Ceph dashboard**. It provides graphics where the most important cluster and service metrics are represented. Most of the examples in this document are extracted from the dashboard graphics or extrapolated from the metrics exposed by the Ceph dashboard.
 
+Ceph daemon health metrics
+==========================
+
+The Ceph exporter provides a metric called ``ceph_daemon_socket_up`` that reports the liveness status of each Ceph daemon that exposes an admin socket.
+
+The ``ceph_daemon_socket_up`` metric indicates the health status of a Ceph daemon based on its ability to respond via the admin socket, where a value of ``1`` means healthy and ``0`` means unhealthy. Although a Ceph daemon might still be "alive" when it reports ``ceph_daemon_socket_up=0``, this situation highlights a significant issue in its functionality. As such, this metric serves as an excellent tool for detecting problems in any of the main Ceph daemons.
+
+Labels:
+
+- ``ceph_daemon``: Identifier of the Ceph daemon exposing an admin socket on the host.
+- ``hostname``: Name of the host where the Ceph daemon is running.
+
+Example:
+
+.. code-block:: bash
+
+   ceph_daemon_socket_up{ceph_daemon="mds.a",hostname="testhost"} 1
+   ceph_daemon_socket_up{ceph_daemon="osd.1",hostname="testhost"} 0
+
+To identify any Ceph daemons that were not responsive at any point in the last 12 hours, you can use the following PromQL expression:
+
+.. code-block:: bash
+
+   ceph_daemon_socket_up == 0 or min_over_time(ceph_daemon_socket_up[12h]) == 0
+
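+As an illustrative sketch only (this rule is not shipped with Ceph; the group
+and alert names below are invented for this example), the same expression can
+be wired into a Prometheus alerting rule:
+
+.. code-block:: yaml
+
+   groups:
+     - name: ceph-daemon-health            # hypothetical group name
+       rules:
+         - alert: CephDaemonSocketDown     # hypothetical alert name
+           expr: ceph_daemon_socket_up == 0
+           for: 5m
+           labels:
+             severity: warning
+           annotations:
+             summary: "Ceph daemon {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is not answering on its admin socket"
+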
 Performance metrics
 ===================
 
diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc
index d4930ea35c0d2..4b8a8131bcfd3 100644
--- a/src/exporter/DaemonMetricCollector.cc
+++ b/src/exporter/DaemonMetricCollector.cc
@@ -168,10 +168,17 @@ void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter
     if (sockClientsPing) {
       bool ok;
       sock_client.ping(&ok);
+      std::string ceph_daemon_socket_up_desc(
+          "Reports the health status of a Ceph daemon, as determined by whether it is able to respond via its admin socket (1 = healthy, 0 = unhealthy).");
+      labels_t ceph_daemon_socket_up_labels;
+      ceph_daemon_socket_up_labels["hostname"] = quote(ceph_get_hostname());
+      ceph_daemon_socket_up_labels["ceph_daemon"] = quote(daemon_name);
+      add_metric(builder, static_cast<int>(ok), "ceph_daemon_socket_up", ceph_daemon_socket_up_desc,
+                 "gauge", ceph_daemon_socket_up_labels);
       if (!ok) {
         failures++;
         continue;
-      }
+      }
     }
     std::string counter_dump_response = dump_response.size() > 0 ? dump_response : asok_request(sock_client, "counter dump", daemon_name);

diff --git a/src/exporter/DaemonMetricCollector.h b/src/exporter/DaemonMetricCollector.h
index d2e929b4d670f..3302e95df916c 100644
--- a/src/exporter/DaemonMetricCollector.h
+++ b/src/exporter/DaemonMetricCollector.h
@@ -42,11 +42,11 @@ public:
   std::map<std::string, AdminSocketClient> clients;
   std::string metrics;
   std::pair<labels_t, std::string> add_fixed_name_metrics(std::string metric_name);
+  void update_sockets();

 private:
   std::mutex metrics_mutex;
   std::unique_ptr<MetricsBuilder> builder;
-  void update_sockets();

   void request_loop(boost::asio::steady_timer &timer);

   void dump_asok_metric(boost::json::object perf_info,

diff --git a/src/test/exporter/test_exporter.cc b/src/test/exporter/test_exporter.cc
index 907884fe35d60..e24773886bcb3 100644
--- a/src/test/exporter/test_exporter.cc
+++ b/src/test/exporter/test_exporter.cc
@@ -1,6 +1,8 @@
 #include "common/ceph_argparse.h"
 #include "common/config.h"
 #include "common/config_proxy.h"
+#include "common/admin_socket.h"
+#include "common/admin_socket_client.h"
 #include 
 #include "gtest/gtest.h"
 #include "common/ceph_context.h"
@@ -8,6 +10,7 @@
 #include "global/global_init.h"
 #include "exporter/util.h"
 #include "exporter/DaemonMetricCollector.h"
+#include 
 #include 
 #include 
@@ -674,6 +677,27 @@ static std::vector<std::pair<std::string, std::string>> promethize_data = {
   {"rocksdb.submit_sync_latency_sum", "ceph_rocksdb_submit_sync_latency_sum"}
 };
 
+
+class AdminSocketTest
+{
+public:
+  explicit AdminSocketTest(AdminSocket *asokc)
+    : m_asokc(asokc)
+  {
+  }
+  bool init(const std::string &uri) {
+    return m_asokc->init(uri);
+  }
+  std::string bind_and_listen(const std::string &sock_path, int *fd) {
+    return m_asokc->bind_and_listen(sock_path, fd);
+  }
+  bool shutdown() {
+    m_asokc->shutdown();
+    return true;
+  }
+  AdminSocket *m_asokc;
+};
+
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
@@ -1289,8 +1313,11 @@ ceph_mon_session_rm{ceph_daemon="mon.a"} 577
 # TYPE ceph_mon_session_trim counter
 ceph_mon_session_trim{ceph_daemon="mon.a"} 9
 )";
-
-  ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos);
+
+  std::string actualMetrics = collector.metrics;
+  std::cout << "Actual MON Metrics: " << actualMetrics << std::endl;
+  ASSERT_TRUE(actualMetrics.find(expectedMetrics) != std::string::npos);

   // Test for labeled metrics - RGW
"ceph-client.rgw.foo.ceph-node-00.aayrrj.2.93993527376064"; @@ -1452,3 +1479,82 @@ TEST(Exporter, add_fixed_name_metrics) { EXPECT_EQ(new_metric.first, expected_labels); ASSERT_TRUE(new_metric.second == expected_metric_name); } + +TEST(Exporter, UpdateSockets) { + const std::string mock_dir = "/tmp/fake_sock_dir"; + + // Create the mock directory + std::filesystem::create_directories(mock_dir); + + // Create a mix of vstart and real cluster mock .asok files + std::ofstream(mock_dir + "/ceph-osd.0.asok").close(); + std::ofstream(mock_dir + "/ceph-mds.a.asok").close(); + std::ofstream(mock_dir + "/ceph-mgr.chatest-node-00.ijzynn.asok").close(); + std::ofstream(mock_dir + "/ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952.asok").close(); + std::ofstream(mock_dir + "/ceph-client.ceph-exporter.chatest-node-00.asok").close(); + std::ofstream(mock_dir + "/ceph-mon.chatest-node-00.asok").close(); + + g_conf().set_val("exporter_sock_dir", mock_dir); + + DaemonMetricCollector collector; + + // Run the function that interacts with the mock directory + collector.update_sockets(); + + // Verify the expected results + ASSERT_EQ(collector.clients.size(), 4); + ASSERT_TRUE(collector.clients.find("ceph-osd.0") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mds.a") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mon.chatest-node-00") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952") != collector.clients.end()); + + + // Remove the mock directory and files + std::filesystem::remove_all(mock_dir); +} + + +TEST(Exporter, HealthMetrics) { + std::map clients; + DaemonMetricCollector &collector = collector_instance(); + std::string daemon = "test_daemon"; + std::string expectedCounterDump = ""; + std::string expectedCounterSchema = ""; + std::string metricName = "ceph_daemon_socket_up"; + + // Fake admin socket + std::string asok_path = "/tmp/" + daemon + ".asok"; + std::unique_ptr asokc = std::make_unique(g_ceph_context); + AdminSocketClient client(asok_path); + + // Add the daemon clients to the collector + clients.insert({daemon, std::move(client)}); + collector.clients = clients; + + auto verifyMetricValue = [&](const std::string &metricValue, bool shouldInitializeSocket) { + collector.metrics = ""; + + if (shouldInitializeSocket) { + AdminSocketTest asoct(asokc.get()); + ASSERT_TRUE(asoct.init(asok_path)); + } + + collector.dump_asok_metrics(true, 5, true, expectedCounterDump, expectedCounterSchema, false); + + if (shouldInitializeSocket) { + AdminSocketTest asoct(asokc.get()); + ASSERT_TRUE(asoct.shutdown()); + } + + std::string retrievedMetrics = collector.metrics; + std::string pattern = metricName + R"(\{[^}]*ceph_daemon=\")" + daemon + R"(\"[^}]*\}\s+)" + metricValue + R"(\b)"; + std::regex regexPattern(pattern); + ASSERT_TRUE(std::regex_search(retrievedMetrics, regexPattern)); + }; + + // Test an admin socket not answering: metric value should be "0" + verifyMetricValue("0", false); + + // Test an admin socket answering: metric value should be "1" + verifyMetricValue("1", true); +} -- 2.39.5