From: Vallari Agrawal Date: Tue, 1 Oct 2024 06:10:23 +0000 (+0530) Subject: mon: add warning NVMEOF_GATEWAY_DOWN X-Git-Tag: v20.0.0~665^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0006599c9e1a6941e0a347f2c86e795f1fda56c7;p=ceph.git mon: add warning NVMEOF_GATEWAY_DOWN In src/mon/NVMeofGwMap.cc, add warning NVMEOF_GATEWAY_DOWN when any gateway is in GW_UNAVAILABLE state. Signed-off-by: Vallari Agrawal --- diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index bbce958bebe7..4ee02c04e1eb 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -1636,8 +1636,8 @@ bucket are the same. NVMeoF Gateway -------------- -NVMOEF_SINGLE_GATEWAY -__________________________________ +NVMEOF_SINGLE_GATEWAY +_____________________ One of the gateway group has only one gateway. This is not ideal because it makes high availability (HA) impossible with a single gatway in a group. This can lead to @@ -1645,6 +1645,13 @@ problems with failover and failback operations for the NVMeoF gateway. It's recommended to have multiple NVMeoF gateways in a group. +NVMEOF_GATEWAY_DOWN +___________________ + +Some of the gateways are in the GW_UNAVAILABLE state. If a NVMeoF daemon has crashed, +the daemon log file (found at ``/var/log/ceph/``) may contain troubleshooting information. + + Miscellaneous ------------- diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index c350622b7db6..b29606db998e 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -883,7 +883,8 @@ struct CMonRequestProposal : public Context { void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const { - list detail; + list singleGatewayDetail; + list gatewayDownDetail; for (const auto& created_map_pair: created_gws) { const auto& group_key = created_map_pair.first; auto& group = group_key.second; @@ -891,16 +892,33 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const if ( gw_created_map.size() == 1) { ostringstream ss; ss << "NVMeoF Gateway Group '" << group << "' has 1 gateway." ; - detail.push_back(ss.str()); + singleGatewayDetail.push_back(ss.str()); + } + for (const auto& gw_created_pair: gw_created_map) { + const auto& gw_id = gw_created_pair.first; + const auto& gw_created = gw_created_pair.second; + if (gw_created.availability == gw_availability_t::GW_UNAVAILABLE) { + ostringstream ss; + ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ; + gatewayDownDetail.push_back(ss.str()); + } } } - if (!detail.empty()) { + if (!singleGatewayDetail.empty()) { ostringstream ss; - ss << detail.size() << " group(s) have only 1 nvmeof gateway" + ss << singleGatewayDetail.size() << " group(s) have only 1 nvmeof gateway" << "; HA is not possible with single gateway."; auto& d = checks->add("NVMEOF_SINGLE_GATEWAY", HEALTH_WARN, - ss.str(), detail.size()); - d.detail.swap(detail); + ss.str(), singleGatewayDetail.size()); + d.detail.swap(singleGatewayDetail); + } + if (!gatewayDownDetail.empty()) { + ostringstream ss; + ss << gatewayDownDetail.size() << " gateway(s) are in unavailable state" + << "; gateway might be down, try to redeploy."; + auto& d = checks->add("NVMEOF_GATEWAY_DOWN", HEALTH_WARN, + ss.str(), gatewayDownDetail.size()); + d.detail.swap(gatewayDownDetail); } }