From: Vallari Agrawal Date: Wed, 18 Dec 2024 07:59:47 +0000 (+0530) Subject: mon/NVMeofGwMap: add healthcheck warning NVMEOF_GATEWAY_DELETING X-Git-Tag: testing/wip-pdonnell-testing-20250108.221010-debug~4^2~2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=571dd531663aa94460d1f4194e70ac25538c7b64;p=ceph-ci.git mon/NVMeofGwMap: add healthcheck warning NVMEOF_GATEWAY_DELETING Add a warning when NVMeoF gateways are in DELETING state. This happens when there are namespaces under the deleted gateway's ANA group ID. The gateways are removed completely after users manually move these namespaces to another load balancing group, or if a new gateway is deployed on that host. Signed-off-by: Vallari Agrawal --- diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index f5d38948150..a1498a09fd0 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -1665,6 +1665,14 @@ Some of the gateways are in the GW_UNAVAILABLE state. If a NVMeoF daemon has crashed, the daemon log file (found at ``/var/log/ceph/``) may contain troubleshooting information. +NVMEOF_GATEWAY_DELETING +_______________________ + +Some of the gateways are in the GW_DELETING state. They will stay in this +state until all the namespaces under the gateway's load balancing group are +moved to another load balancing group ID. This is done automatically by the +load balancing process. If this alert persists for a long time, there might +be an issue with that process. 
Miscellaneous ------------- diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 719403925ad..fb5e5a4a910 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -899,6 +899,7 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const { list singleGatewayDetail; list gatewayDownDetail; + list gatewayInDeletingDetail; for (const auto& created_map_pair: created_gws) { const auto& group_key = created_map_pair.first; auto& group = group_key.second; @@ -915,6 +916,10 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const ostringstream ss; ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ; gatewayDownDetail.push_back(ss.str()); + } else if (gw_created.availability == gw_availability_t::GW_DELETING) { + ostringstream ss; + ss << "NVMeoF Gateway '" << gw_id << "' is in deleting state." ; + gatewayInDeletingDetail.push_back(ss.str()); } } } @@ -934,6 +939,15 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const ss.str(), gatewayDownDetail.size()); d.detail.swap(gatewayDownDetail); } + if (!gatewayInDeletingDetail.empty()) { + ostringstream ss; + ss << gatewayInDeletingDetail.size() << " gateway(s) are in deleting state" + << "; namespaces are automatically balanced across remaining gateways, " + << "this should take a few minutes."; + auto& d = checks->add("NVMEOF_GATEWAY_DELETING", HEALTH_WARN, + ss.str(), gatewayInDeletingDetail.size()); + d.detail.swap(gatewayInDeletingDetail); + } } int NVMeofGwMap::blocklist_gw(