From 98571db537cea1f0a8cccfd92c21568d9206185a Mon Sep 17 00:00:00 2001 From: Alexander Indenbaum Date: Mon, 7 Jul 2025 10:42:37 +0300 Subject: [PATCH] src/nvmeof/NVMeofGwMonitorClient: connect panic - add NVMe-oF Monitor Client configuration reference Tracker: https://tracker.ceph.com/issues/72509 Signed-off-by: Alexander Indenbaum (cherry picked from commit babd3c00341c59cb82e66f93f41c687ab4fc6928) --- doc/rados/configuration/mon-config-ref.rst | 7 +++++++ src/common/options/mon.yaml.in | 9 ++++++++- src/nvmeof/NVMeofGwMonitorClient.cc | 20 +++++++++++++++++++- src/nvmeof/NVMeofGwMonitorClient.h | 6 ++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/doc/rados/configuration/mon-config-ref.rst b/doc/rados/configuration/mon-config-ref.rst index 880fb608090..9f4cf6ea1f2 100644 --- a/doc/rados/configuration/mon-config-ref.rst +++ b/doc/rados/configuration/mon-config-ref.rst @@ -629,6 +629,13 @@ Miscellaneous .. confval:: mon_memory_autotune .. confval:: enable_availability_tracking +NVMe-oF Monitor Client +====================== + +.. confval:: nvmeof_mon_client_disconnect_panic +.. confval:: nvmeof_mon_client_connect_panic +.. confval:: nvmeof_mon_client_tick_period + .. _Paxos: https://en.wikipedia.org/wiki/Paxos_(computer_science) .. _Monitor Keyrings: ../../../dev/mon-bootstrap#secret-keys .. 
_Ceph configuration file: ../ceph-conf/#monitors diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index 2c337fcea70..4ac1b2f05a4 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -1412,4 +1412,11 @@ options: - mon flags: - runtime - \ No newline at end of file +- name: nvmeof_mon_client_connect_panic + type: secs + level: advanced + desc: The duration, expressed in seconds, after which the nvmeof gateway + should trigger a panic if it does not receive the initial map from the monitor + default: 30 + services: + - mon diff --git a/src/nvmeof/NVMeofGwMonitorClient.cc b/src/nvmeof/NVMeofGwMonitorClient.cc index 0b798c370a2..2000f718a7d 100644 --- a/src/nvmeof/NVMeofGwMonitorClient.cc +++ b/src/nvmeof/NVMeofGwMonitorClient.cc @@ -40,6 +40,7 @@ NVMeofGwMonitorClient::NVMeofGwMonitorClient(int argc, const char **argv) : gwmap_epoch(0), last_map_time(std::chrono::steady_clock::now()), reset_timestamp(std::chrono::steady_clock::now()), + start_time(last_map_time), monc{g_ceph_context, poolctx}, client_messenger(Messenger::create(g_ceph_context, "async", entity_name_t::CLIENT(-1), "client", getpid())), objecter{g_ceph_context, client_messenger.get(), &monc, poolctx}, @@ -263,10 +264,27 @@ void NVMeofGwMonitorClient::disconnect_panic() } } +void NVMeofGwMonitorClient::connect_panic() +{ + // Return immediately if the gateway was assigned group ID by the monitor + if (set_group_id) { + return; + } + // If the gateway has not been assigned a group ID, panic after timeout + auto connect_panic_duration = g_conf().get_val<std::chrono::seconds>("nvmeof_mon_client_connect_panic").count(); + auto now = std::chrono::steady_clock::now(); + auto elapsed_seconds = std::chrono::duration_cast<std::chrono::seconds>(now - start_time).count(); + if (elapsed_seconds > connect_panic_duration) { + dout(4) << "Triggering a panic: did not receive initial map from monitor, elapsed " << elapsed_seconds << ", configured connect panic duration " << connect_panic_duration << " 
seconds." << dendl; + throw std::runtime_error("Did not receive initial map from monitor (connect panic)."); + } +} + void NVMeofGwMonitorClient::tick() { dout(10) << dendl; + connect_panic(); disconnect_panic(); send_beacon(); first_beacon = false; @@ -339,7 +357,7 @@ void NVMeofGwMonitorClient::handle_nvmeof_gw_map(ceph::ref_t nmap) dout(10) << "Can not find new gw state" << dendl; return; } - bool set_group_id = false; + ceph_assert(!set_group_id); while (!set_group_id) { NVMeofGwMonitorGroupClient monitor_group_client( grpc::CreateChannel(monitor_address, gw_creds())); diff --git a/src/nvmeof/NVMeofGwMonitorClient.h b/src/nvmeof/NVMeofGwMonitorClient.h index 546fff27db7..f187a47b481 100644 --- a/src/nvmeof/NVMeofGwMonitorClient.h +++ b/src/nvmeof/NVMeofGwMonitorClient.h @@ -47,8 +47,12 @@ private: last_map_time; // used to panic on disconnect std::chrono::time_point<std::chrono::steady_clock> reset_timestamp; // used to bypass some validations + std::chrono::time_point<std::chrono::steady_clock> + start_time; // used to panic on connect bool first_beacon = true; + bool set_group_id = false; + // init gw ssl opts void init_gw_ssl_opts(); @@ -96,6 +100,8 @@ public: void disconnect_panic(); void handle_nvmeof_gw_map(ceph::ref_t m); + + void connect_panic(); }; #endif -- 2.39.5