From 571dd531663aa94460d1f4194e70ac25538c7b64 Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Wed, 18 Dec 2024 13:29:47 +0530 Subject: [PATCH] mon/NVMeofGwMap: add healthcheck warning NVMEOF_GATEWAY_DELETING Add a warning when NVMeoF gateways are in DELETING state. This happens when there are namespaces under the deleted gateway's ANA group ID. The gateways are removed completely after users manually move these namespaces to another load balancing group. Or if a new gateway is deployed on that host. Signed-off-by: Vallari Agrawal --- doc/rados/operations/health-checks.rst | 8 ++++++++ src/mon/NVMeofGwMap.cc | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index f5d38948150d7..a1498a09fd07d 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -1665,6 +1665,14 @@ Some of the gateways are in the GW_UNAVAILABLE state. If a NVMeoF daemon has crashed, the daemon log file (found at ``/var/log/ceph/``) may contain troubleshooting information. +NVMEOF_GATEWAY_DELETING +_______________________ + +Some of the gateways are in the GW_DELETING state. They will stay in this +state until all the namespaces under the gateway's load balancing group are +moved to another load balancing group ID. This is done automatically by the +load balancing process. If this alert persists for a long time, there might +be an issue with that process. 
Miscellaneous ------------- diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 719403925adba..fb5e5a4a91079 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -899,6 +899,7 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const { list singleGatewayDetail; list gatewayDownDetail; + list gatewayInDeletingDetail; for (const auto& created_map_pair: created_gws) { const auto& group_key = created_map_pair.first; auto& group = group_key.second; @@ -915,6 +916,10 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const ostringstream ss; ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ; gatewayDownDetail.push_back(ss.str()); + } else if (gw_created.availability == gw_availability_t::GW_DELETING) { + ostringstream ss; + ss << "NVMeoF Gateway '" << gw_id << "' is in deleting state." ; + gatewayInDeletingDetail.push_back(ss.str()); } } } @@ -934,6 +939,15 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const ss.str(), gatewayDownDetail.size()); d.detail.swap(gatewayDownDetail); } + if (!gatewayInDeletingDetail.empty()) { + ostringstream ss; + ss << gatewayInDeletingDetail.size() << " gateway(s) are in deleting state" + << "; namespaces are automatically balanced across remaining gateways, " + << "this should take a few minutes."; + auto& d = checks->add("NVMEOF_GATEWAY_DELETING", HEALTH_WARN, + ss.str(), gatewayInDeletingDetail.size()); + d.detail.swap(gatewayInDeletingDetail); + } } int NVMeofGwMap::blocklist_gw( -- 2.39.5