From: Oguzhan Ozmen Date: Tue, 3 Mar 2026 16:49:21 +0000 (+0000) Subject: rgw/zone: increase visibility into zone connections via admin socket X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=11aa2c86cd341c9a73978458426f4ebf093798e4;p=ceph.git rgw/zone: increase visibility into zone connections via admin socket Adds a new admin socket command to dump zone connection details including endpoints, resolved IPs, and health status. Useful for debugging multisite connectivity issues. Usage: ceph daemon zone connections Signed-off-by: Oguzhan Ozmen --- diff --git a/src/rgw/services/svc_zone.cc b/src/rgw/services/svc_zone.cc index 055a5f6fe69..88102761333 100644 --- a/src/rgw/services/svc_zone.cc +++ b/src/rgw/services/svc_zone.cc @@ -2,6 +2,7 @@ // vim: ts=8 sw=2 sts=2 expandtab ft=cpp #include "svc_zone.h" +#include "common/admin_socket.h" #include "svc_sys_obj.h" #include "svc_sync_modules.h" @@ -20,6 +21,188 @@ using namespace std; using namespace rgw_zone_defaults; + +// Admin socket hook for zone connection information +class RGWSI_Zone_ASocketHook : public AdminSocketHook { + RGWSI_Zone *svc; + + static constexpr std::string_view admin_commands[][2] = { + { + "zone connections", + "zone connections: list zone connections with endpoints and IP health status" + } + }; + +public: + RGWSI_Zone_ASocketHook(RGWSI_Zone *_svc) : svc(_svc) {} + + int start(); + void shutdown(); + + int call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& ss, + bufferlist& out) override; +}; + +int RGWSI_Zone_ASocketHook::start() +{ + auto admin_socket = svc->ctx()->get_admin_socket(); + for (auto cmd : admin_commands) { + int r = admin_socket->register_command(cmd[0], this, cmd[1]); + if (r < 0) { + ldout(svc->ctx(), 0) << "ERROR: fail to register admin socket command (r=" + << r << ")" << dendl; + return r; + } + } + return 0; +} + +void RGWSI_Zone_ASocketHook::shutdown() +{ + auto admin_socket = svc->ctx()->get_admin_socket(); + admin_socket->unregister_commands(this); +} + +static void dump_resolved_ip(Formatter *f, const ResolvedIP& ip, + double timeout_secs) { + f->open_object_section("ip"); + f->dump_string("connect_to", ip.connect_to); + auto last_fail = ip.last_failure.load(); + if (ceph::real_clock::is_zero(last_fail)) { + f->dump_string("status", "up"); + f->dump_string("last_failure", ""); + } else { + // Check if failure has expired based on timeout + auto now = ceph::real_clock::now(); + auto diff = ceph::to_seconds(now - last_fail); + if (diff >= timeout_secs) { + f->dump_string("status", "retry-ready"); // Timeout expired, eligible for retry but unknown if actually up + } else { + f->dump_string("status", "down"); + } + f->dump_string("last_failure", ceph::to_iso_8601(last_fail)); + } + f->close_section(); +} + +static void dump_resolved_endpoint(Formatter *f, const ResolvedEndpoint& ep, + double timeout_secs) { + f->open_object_section("endpoint"); + f->dump_string("url", ep.url); + f->dump_string("scheme", ep.scheme); + f->dump_string("host", ep.host); + f->dump_int("port", ep.port); + f->dump_string("last_failure_time", + ceph::real_clock::is_zero(ep.last_failure_time.load()) + ? "" : ceph::to_iso_8601(ep.last_failure_time.load())); + + f->open_array_section("resolved_ips"); + for (const auto& ip : ep.resolved_ips) { + dump_resolved_ip(f, ip, timeout_secs); + } + f->close_section(); + + f->close_section(); +} + +static void dump_rest_conn(Formatter *f, const std::string& name, + RGWRESTConn* conn) { + if (!conn) return; + + f->open_object_section(name); + f->dump_string("remote_id", conn->get_remote_id()); + f->dump_unsigned("endpoint_count", conn->get_endpoint_count()); + + double timeout_secs = conn->get_ctx()->_conf->rgw_rest_conn_ip_fail_timeout_secs; + + const auto& endpoints = conn->get_resolved_endpoints(); + f->open_array_section("endpoints"); + for (const auto& ep : endpoints) { + dump_resolved_endpoint(f, ep, timeout_secs); + } + f->close_section(); + + f->close_section(); +} + +int RGWSI_Zone_ASocketHook::call( + std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& ss, + bufferlist& out) +{ + if (command == "zone connections"sv) { + f->open_object_section("zone_connections"); + + f->dump_string("current_time", ceph::to_iso_8601(ceph::real_clock::now())); + f->dump_string("current_zone_id", svc->zone_id().id); + f->dump_string("current_zone_name", svc->zone_name()); + + auto* master_conn = svc->get_master_conn(); + if (master_conn) { + dump_rest_conn(f, "master_conn", master_conn); + } + + // Zone connections map + auto& zone_conn_map = svc->get_zone_conn_map(); + f->open_object_section("zone_conn_map"); + for (auto& [zone_id, conn] : zone_conn_map) { + dump_rest_conn(f, zone_id.id, conn); + } + f->close_section(); + + // Only show zonegroup_conn_map if it has connections beyond the master + auto& zonegroup_conn_map = svc->get_zonegroup_conn_map(); + bool zg_differs_from_master = false; + if (!master_conn) { + zg_differs_from_master = !zonegroup_conn_map.empty(); + } else if (zonegroup_conn_map.size() > 1) { + zg_differs_from_master = true; + } else if (zonegroup_conn_map.size() == 1) { + auto& [zg_name, conn] = *zonegroup_conn_map.begin(); + zg_differs_from_master = (conn->get_remote_id() != master_conn->get_remote_id()); + } + if (zg_differs_from_master) { + f->open_object_section("zonegroup_conn_map"); + for (auto& [zg_name, conn] : zonegroup_conn_map) { + dump_rest_conn(f, zg_name, conn); + } + f->close_section(); + } + + // Only show zone_data_notify_to_map if it differs from zone_conn_map + auto& notify_map = svc->get_zone_data_notify_to_map(); + bool notify_differs = false; + if (notify_map.size() != zone_conn_map.size()) { + notify_differs = true; + } else { + for (auto& [zone_id, conn] : notify_map) { + auto it = zone_conn_map.find(zone_id); + if (it == zone_conn_map.end() || it->second != conn) { + notify_differs = true; + break; + } + } + } + if (notify_differs) { + f->open_object_section("zone_data_notify_to_map"); + for (auto& [zone_id, conn] : notify_map) { + dump_rest_conn(f, zone_id.id, conn); + } + f->close_section(); + } + + f->close_section(); + return 0; + } + + return -ENOSYS; +} + RGWSI_Zone::RGWSI_Zone(CephContext *cct, rgw::sal::ConfigStore* _cfgstore, const rgw::SiteConfig* _site) : RGWServiceInstance(cct), cfgstore(_cfgstore), site(_site) { @@ -212,11 +395,20 @@ int RGWSI_Zone::do_start(optional_yield y, const DoutPrefixProvider *dpp) ldpp_dout(dpp, 20) << "started zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << ") with tier type = " << zone_public_config->tier_type << dendl; + + // Initialize admin socket hook + asocket_hook = std::make_unique(this); + asocket_hook->start(); return 0; } void RGWSI_Zone::shutdown() { + // Shutdown admin socket hook + if (asocket_hook) { + asocket_hook->shutdown(); + } + delete rest_master_conn; for (auto& item : zone_conn_map) { diff --git a/src/rgw/services/svc_zone.h b/src/rgw/services/svc_zone.h index 451faac8bef..1c702ec40c8 100644 --- a/src/rgw/services/svc_zone.h +++ b/src/rgw/services/svc_zone.h @@ -4,6 +4,7 @@ #pragma once #include "driver/rados/rgw_service.h" +#include class RGWSI_SysObj; @@ -22,6 +23,7 @@ class RGWBucketSyncPolicyHandler; class RGWRESTConn; struct rgw_sync_policy_info; +class RGWSI_Zone_ASocketHook; class RGWSI_Zone : public RGWServiceInstance { @@ -57,6 +59,7 @@ class RGWSI_Zone : public RGWServiceInstance std::unique_ptr sync_policy; rgw::sal::ConfigStore *cfgstore{nullptr}; const rgw::SiteConfig* site{nullptr}; + std::unique_ptr asocket_hook; void init(RGWSI_SysObj *_sysobj_svc, librados::Rados* rados_,