git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/NVMeofGwMap: add healthcheck warning NVMEOF_GATEWAY_DELETING
author Vallari Agrawal <vallari.agrawal@ibm.com>
Wed, 18 Dec 2024 07:59:47 +0000 (13:29 +0530)
committer Vallari Agrawal <vallari.agrawal@ibm.com>
Fri, 3 Jan 2025 10:39:17 +0000 (16:09 +0530)
Add a warning when NVMeoF gateways are in DELETING state.
This happens when there are namespaces under the deleted gateway's
ANA group ID.

The gateways are removed completely after users manually move these
namespaces to another load balancing group, or when a new gateway is
deployed on that host.

Signed-off-by: Vallari Agrawal <vallari.agrawal@ibm.com>
doc/rados/operations/health-checks.rst
src/mon/NVMeofGwMap.cc

index f5d38948150d757fe56e73735525a37f9e9f90aa..a1498a09fd07d514f5782a6b85595a6dc08a0b1e 100644 (file)
@@ -1665,6 +1665,14 @@ Some of the gateways are in the GW_UNAVAILABLE state. If a NVMeoF daemon has
 crashed, the daemon log file (found at ``/var/log/ceph/``) may contain
 troubleshooting information.
 
+NVMEOF_GATEWAY_DELETING
+_______________________
+
+Some of the gateways are in the GW_DELETING state. They will stay in this
+state until all the namespaces under the gateway's load balancing group are 
+moved to another load balancing group ID. This is done automatically by the 
+load balancing process. If this alert persists for a long time, there might
+be an issue with that process.
 
 Miscellaneous
 -------------
index 719403925adba33737b8cf298d026daa4bfedf16..fb5e5a4a91079c886c45461d39c00ed5eda7583f 100755 (executable)
@@ -899,6 +899,7 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
 {
   list<string> singleGatewayDetail;
   list<string> gatewayDownDetail;
+  list<string> gatewayInDeletingDetail;
   for (const auto& created_map_pair: created_gws) {
     const auto& group_key = created_map_pair.first;
     auto& group = group_key.second;
@@ -915,6 +916,10 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
         ostringstream ss;
         ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ;
         gatewayDownDetail.push_back(ss.str());
+      } else if (gw_created.availability == gw_availability_t::GW_DELETING) {
+        ostringstream ss;
+        ss << "NVMeoF Gateway '" << gw_id << "' is in deleting state." ;
+        gatewayInDeletingDetail.push_back(ss.str());
       }
     }
   }
@@ -934,6 +939,15 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
         ss.str(), gatewayDownDetail.size());
     d.detail.swap(gatewayDownDetail);
   }
+  if (!gatewayInDeletingDetail.empty()) {
+    ostringstream ss;
+    ss << gatewayInDeletingDetail.size() << " gateway(s) are in deleting state"
+      << "; namespaces are automatically balanced across remaining gateways, "
+      << "this should take a few minutes.";
+    auto& d = checks->add("NVMEOF_GATEWAY_DELETING", HEALTH_WARN,
+        ss.str(), gatewayInDeletingDetail.size());
+    d.detail.swap(gatewayInDeletingDetail);
+  }
 }
 
 int NVMeofGwMap::blocklist_gw(