From: Juan Miguel Olmo Martínez Date: Mon, 7 Oct 2024 14:55:51 +0000 (+0200) Subject: exporter: New metric for report ceph daemons health X-Git-Tag: testing/wip-pdonnell-testing-20241019.005706-debug~23^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=3c9b07eb87e67027e9988c1587c07e27ed168657;p=ceph-ci.git exporter: New metric for report ceph daemons health Ceph exporter provide metrics to report ceph daemons communication health using the admin socket Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2146728 https://tracker.ceph.com/issues/68428 Signed-off-by: Juan Miguel Olmo Martínez --- diff --git a/doc/monitoring/index.rst b/doc/monitoring/index.rst index 794fdf84195..afccd9ab16a 100644 --- a/doc/monitoring/index.rst +++ b/doc/monitoring/index.rst @@ -64,6 +64,30 @@ in: It is good to outline that the main tool allowing users to observe and monitor a Ceph cluster is the **Ceph dashboard**. It provides graphics where the most important cluster and service metrics are represented. Most of the examples in this document are extracted from the dashboard graphics or extrapolated from the metrics exposed by the Ceph dashboard. +Ceph daemon health metrics +========================== + +The Ceph exporter provides a metric called ``ceph_daemon_socket_up`` that reports the liveness status of each Ceph daemon that exposes an admin socket. + +The ``ceph_daemon_socket_up`` metric indicates the health status of a Ceph daemon based on its ability to respond via the admin socket, where a value of ``1`` means healthy, and ``0`` means unhealthy. Although a Ceph daemon might still be "alive" when it reports ``ceph_daemon_socket_up=0``, this situation highlights a significant issue in its functionality. As such, this metric serves as an excellent tool for detecting problems in any of the main Ceph daemons. + +Labels: +- **``ceph_daemon``**: Identifier of the Ceph daemon exposing an admin socket on the host. 
+- **``hostname``**: Name of the host where the Ceph daemon is running. + +Example: + +.. code-block:: bash + + ceph_daemon_socket_up{ceph_daemon="mds.a",hostname="testhost"} 1 + ceph_daemon_socket_up{ceph_daemon="osd.1",hostname="testhost"} 0 + +To identify any Ceph daemons that were not responsive at any point in the last 12 hours, you can use the following PromQL expression: + +.. code-block:: bash + + ceph_daemon_socket_up == 0 or min_over_time(ceph_daemon_socket_up[12h]) == 0 + Performance metrics =================== diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc index d4930ea35c0..4b8a8131bcf 100644 --- a/src/exporter/DaemonMetricCollector.cc +++ b/src/exporter/DaemonMetricCollector.cc @@ -168,10 +168,17 @@ void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter if (sockClientsPing) { bool ok; sock_client.ping(&ok); + std::string ceph_daemon_socket_up_desc( + "Reports the health status of a Ceph daemon, as determined by whether it is able to respond via its admin socket (1 = healthy, 0 = unhealthy)."); + labels_t ceph_daemon_socket_up_labels; + ceph_daemon_socket_up_labels["hostname"] = quote(ceph_get_hostname()); + ceph_daemon_socket_up_labels["ceph_daemon"] = quote(daemon_name); + add_metric(builder, static_cast(ok), "ceph_daemon_socket_up", ceph_daemon_socket_up_desc, + "gauge", ceph_daemon_socket_up_labels); if (!ok) { failures++; continue; - } + } } std::string counter_dump_response = dump_response.size() > 0 ? 
dump_response : asok_request(sock_client, "counter dump", daemon_name); diff --git a/src/exporter/DaemonMetricCollector.h b/src/exporter/DaemonMetricCollector.h index d2e929b4d67..3302e95df91 100644 --- a/src/exporter/DaemonMetricCollector.h +++ b/src/exporter/DaemonMetricCollector.h @@ -42,11 +42,11 @@ public: std::map clients; std::string metrics; std::pair add_fixed_name_metrics(std::string metric_name); + void update_sockets(); private: std::mutex metrics_mutex; std::unique_ptr builder; - void update_sockets(); void request_loop(boost::asio::steady_timer &timer); void dump_asok_metric(boost::json::object perf_info, diff --git a/src/test/exporter/test_exporter.cc b/src/test/exporter/test_exporter.cc index 907884fe35d..e24773886bc 100644 --- a/src/test/exporter/test_exporter.cc +++ b/src/test/exporter/test_exporter.cc @@ -1,6 +1,8 @@ #include "common/ceph_argparse.h" #include "common/config.h" #include "common/config_proxy.h" +#include "common/admin_socket.h" +#include "common/admin_socket_client.h" #include #include "gtest/gtest.h" #include "common/ceph_context.h" @@ -8,6 +10,7 @@ #include "global/global_init.h" #include "exporter/util.h" #include "exporter/DaemonMetricCollector.h" +#include #include #include @@ -674,6 +677,27 @@ static std::vector> promethize_data = { {"rocksdb.submit_sync_latency_sum", "ceph_rocksdb_submit_sync_latency_sum"} }; + +class AdminSocketTest +{ +public: + explicit AdminSocketTest(AdminSocket *asokc) + : m_asokc(asokc) + { + } + bool init(const std::string &uri) { + return m_asokc->init(uri); + } + std::string bind_and_listen(const std::string &sock_path, int *fd) { + return m_asokc->bind_and_listen(sock_path, fd); + } + bool shutdown() { + m_asokc->shutdown(); + return true; + } + AdminSocket *m_asokc; +}; + int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); @@ -1289,8 +1313,11 @@ ceph_mon_session_rm{ceph_daemon="mon.a"} 577 # TYPE ceph_mon_session_trim counter ceph_mon_session_trim{ceph_daemon="mon.a"} 9 )"; 
- - ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos); + + std::string actualMetrics = collector.metrics; + std::cout << "Actual MON Metrics: " << actualMetrics << std::endl; + ASSERT_TRUE(actualMetrics.find(expectedMetrics) != std::string::npos); + //ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos); // Test for labeled metrics - RGW daemon = "ceph-client.rgw.foo.ceph-node-00.aayrrj.2.93993527376064"; @@ -1452,3 +1479,82 @@ TEST(Exporter, add_fixed_name_metrics) { EXPECT_EQ(new_metric.first, expected_labels); ASSERT_TRUE(new_metric.second == expected_metric_name); } + +TEST(Exporter, UpdateSockets) { + const std::string mock_dir = "/tmp/fake_sock_dir"; + + // Create the mock directory + std::filesystem::create_directories(mock_dir); + + // Create a mix of vstart and real cluster mock .asok files + std::ofstream(mock_dir + "/ceph-osd.0.asok").close(); + std::ofstream(mock_dir + "/ceph-mds.a.asok").close(); + std::ofstream(mock_dir + "/ceph-mgr.chatest-node-00.ijzynn.asok").close(); + std::ofstream(mock_dir + "/ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952.asok").close(); + std::ofstream(mock_dir + "/ceph-client.ceph-exporter.chatest-node-00.asok").close(); + std::ofstream(mock_dir + "/ceph-mon.chatest-node-00.asok").close(); + + g_conf().set_val("exporter_sock_dir", mock_dir); + + DaemonMetricCollector collector; + + // Run the function that interacts with the mock directory + collector.update_sockets(); + + // Verify the expected results + ASSERT_EQ(collector.clients.size(), 4); + ASSERT_TRUE(collector.clients.find("ceph-osd.0") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mds.a") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mon.chatest-node-00") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952") != collector.clients.end()); + + + // Remove the mock directory and 
files
+  std::filesystem::remove_all(mock_dir);
+}
+
+
+TEST(Exporter, HealthMetrics) {
+    std::map<std::string, AdminSocketClient> clients;
+    DaemonMetricCollector &collector = collector_instance();
+    std::string daemon = "test_daemon";
+    std::string expectedCounterDump = "";
+    std::string expectedCounterSchema = "";
+    std::string metricName = "ceph_daemon_socket_up";
+
+    // Fake admin socket
+    std::string asok_path = "/tmp/" + daemon + ".asok";
+    std::unique_ptr<AdminSocket> asokc = std::make_unique<AdminSocket>(g_ceph_context);
+    AdminSocketClient client(asok_path);
+
+    // Add the daemon clients to the collector
+    clients.insert({daemon, std::move(client)});
+    collector.clients = clients;
+
+    auto verifyMetricValue = [&](const std::string &metricValue, bool shouldInitializeSocket) {
+        collector.metrics = "";
+
+        if (shouldInitializeSocket) {
+            AdminSocketTest asoct(asokc.get());
+            ASSERT_TRUE(asoct.init(asok_path));
+        }
+
+        collector.dump_asok_metrics(true, 5, true, expectedCounterDump, expectedCounterSchema, false);
+
+        if (shouldInitializeSocket) {
+            AdminSocketTest asoct(asokc.get());
+            ASSERT_TRUE(asoct.shutdown());
+        }
+
+        std::string retrievedMetrics = collector.metrics;
+        std::string pattern = metricName + R"(\{[^}]*ceph_daemon=\")" + daemon + R"(\"[^}]*\}\s+)" + metricValue + R"(\b)";
+        std::regex regexPattern(pattern);
+        ASSERT_TRUE(std::regex_search(retrievedMetrics, regexPattern));
+    };
+
+    // Test an admin socket not answering: metric value should be "0"
+    verifyMetricValue("0", false);
+
+    // Test an admin socket answering: metric value should be "1"
+    verifyMetricValue("1", true);
+}