git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
src/nvmeof/NVMeofGwMonitorClient: connect panic
author Alexander Indenbaum <aindenba@redhat.com>
Mon, 7 Jul 2025 07:42:37 +0000 (10:42 +0300)
committer Alexander Indenbaum <aindenba@redhat.com>
Sun, 17 Aug 2025 11:44:10 +0000 (14:44 +0300)
- add NVMe-oF Monitor Client configuration reference

Tracker: https://tracker.ceph.com/issues/72509

Signed-off-by: Alexander Indenbaum <aindenba@redhat.com>
(cherry picked from commit babd3c00341c59cb82e66f93f41c687ab4fc6928)

doc/rados/configuration/mon-config-ref.rst
src/common/options/mon.yaml.in
src/nvmeof/NVMeofGwMonitorClient.cc
src/nvmeof/NVMeofGwMonitorClient.h

index 880fb608090f683de18dd443c73b42c95576ab8d..9f4cf6ea1f2c76035ba35bdc5c1ddb38262b5035 100644 (file)
@@ -629,6 +629,13 @@ Miscellaneous
 .. confval:: mon_memory_autotune
 .. confval:: enable_availability_tracking
 
+NVMe-oF Monitor Client
+======================
+
+.. confval:: nvmeof_mon_client_disconnect_panic
+.. confval:: nvmeof_mon_client_connect_panic
+.. confval:: nvmeof_mon_client_tick_period
+
 .. _Paxos: https://en.wikipedia.org/wiki/Paxos_(computer_science)
 .. _Monitor Keyrings: ../../../dev/mon-bootstrap#secret-keys
 .. _Ceph configuration file: ../ceph-conf/#monitors
index 2c337fcea70721b73abd548bb9657fcd15300bb1..4ac1b2f05a45bd31ebb9522a2b5e12b32ee5463d 100644 (file)
@@ -1412,4 +1412,11 @@ options:
   - mon
   flags:
   - runtime
\ No newline at end of file
+- name: nvmeof_mon_client_connect_panic
+  type: secs
+  level: advanced
+  desc: The duration, expressed in seconds, after which the nvmeof gateway
+    should trigger a panic if it does not receive the initial map from the monitor
+  default: 30
+  services:
+  - mon
index 0b798c370a2e764f68008fff03c04bdd6a8d847e..2000f718a7da8f681d430b0849a6b72646aefc43 100644 (file)
@@ -40,6 +40,7 @@ NVMeofGwMonitorClient::NVMeofGwMonitorClient(int argc, const char **argv) :
   gwmap_epoch(0),
   last_map_time(std::chrono::steady_clock::now()),
   reset_timestamp(std::chrono::steady_clock::now()),
+  start_time(last_map_time),
   monc{g_ceph_context, poolctx},
   client_messenger(Messenger::create(g_ceph_context, "async", entity_name_t::CLIENT(-1), "client", getpid())),
   objecter{g_ceph_context, client_messenger.get(), &monc, poolctx},
@@ -263,10 +264,27 @@ void NVMeofGwMonitorClient::disconnect_panic()
   }
 }
 
+void NVMeofGwMonitorClient::connect_panic() // connect watchdog; tick() calls this before disconnect_panic()
+{
+  // Return immediately if the gateway was already assigned a group ID by the monitor (set_group_id is true)
+  if (set_group_id) {
+    return;
+  }
+  // No group ID yet: panic once more than "nvmeof_mon_client_connect_panic" seconds have passed since start_time (set in the constructor)
+  auto connect_panic_duration = g_conf().get_val<std::chrono::seconds>("nvmeof_mon_client_connect_panic").count();
+  auto now = std::chrono::steady_clock::now();
+  auto elapsed_seconds = std::chrono::duration_cast<std::chrono::seconds>(now - start_time).count();
+  if (elapsed_seconds > connect_panic_duration) { // strictly greater: elapsed == configured duration does not panic yet
+    dout(4) << "Triggering a panic: did not receive initial map from monitor, elapsed " << elapsed_seconds << ", configured connect panic duration " << connect_panic_duration << " seconds." << dendl;
+    throw std::runtime_error("Did not receive initial map from monitor (connect panic).");
+  }
+}
+
 void NVMeofGwMonitorClient::tick()
 {
   dout(10) << dendl;
 
+  connect_panic();
   disconnect_panic();
   send_beacon();
   first_beacon = false;
@@ -339,7 +357,7 @@ void NVMeofGwMonitorClient::handle_nvmeof_gw_map(ceph::ref_t<MNVMeofGwMap> nmap)
       dout(10) << "Can not find new gw state" << dendl;
       return;
     }
-    bool set_group_id = false;
+    ceph_assert(!set_group_id);
     while (!set_group_id) {
       NVMeofGwMonitorGroupClient monitor_group_client(
           grpc::CreateChannel(monitor_address, gw_creds()));
index 546fff27db7eb21c1593c5f763bc2eb957963e0a..f187a47b481e3d2a4e276b2e82d7ba525c1c51cd 100644 (file)
@@ -47,8 +47,12 @@ private:
               last_map_time; // used to panic on disconnect
   std::chrono::time_point<std::chrono::steady_clock>
                 reset_timestamp; // used to bypass some validations
+  std::chrono::time_point<std::chrono::steady_clock>
+                start_time; // used to panic on connect
 
   bool first_beacon = true;
+  bool set_group_id = false;
+
   // init gw ssl opts
   void init_gw_ssl_opts();
 
@@ -96,6 +100,8 @@ public:
   void disconnect_panic();
 
   void handle_nvmeof_gw_map(ceph::ref_t<MNVMeofGwMap> m);
+
+  void connect_panic();
 };
 
 #endif