]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
src/mon/HealthMonitor.cc: Add NONEXISTENT_MON_CRUSH_LOC_STRETCH_MODE
authorKamoltat Sirivadhna <ksirivad@redhat.com>
Thu, 8 Aug 2024 20:18:27 +0000 (20:18 +0000)
committerKamoltat Sirivadhna <ksirivad@redhat.com>
Mon, 25 Nov 2024 21:54:02 +0000 (21:54 +0000)
In stretch mode, warn the user when
we encounter a MON that
has a nonexistent CRUSH location, with
the tiebreaker MON being the only exception to
this.

Fixes: https://tracker.ceph.com/issues/63861
Signed-off-by: Kamoltat Sirivadhna <ksirivad@redhat.com>
src/mon/HealthMonitor.cc
src/mon/HealthMonitor.h

index 45563f87d3d0403e9a04458fefe05fb09c3a0fe4..2a78c46af829e76e51a2808df77b840b007e8956 100644 (file)
@@ -25,6 +25,7 @@
 
 #include "mon/Monitor.h"
 #include "mon/HealthMonitor.h"
+#include "mon/OSDMonitor.h"
 
 #include "messages/MMonHealthChecks.h"
 
@@ -740,6 +741,8 @@ bool HealthMonitor::check_leader_health()
   if (g_conf().get_val<bool>("mon_warn_on_msgr2_not_enabled")) {
     check_if_msgr2_enabled(&next);
   }
+  // STRETCH MODE
+  check_mon_crush_loc_stretch_mode(&next);
 
   if (next != leader_checks) {
     changed = true;
@@ -885,3 +888,33 @@ void HealthMonitor::check_if_msgr2_enabled(health_check_map_t *checks)
     }
   }
 }
+
+void HealthMonitor::check_mon_crush_loc_stretch_mode(health_check_map_t *checks)
+{
+  // Check if the CRUSH location exists for all MONs
+  if (!mon.monmap->stretch_mode_enabled){
+    return;
+  }
+  list<string> details;
+  for (auto& i : mon.monmap->mon_info) {
+    // Skip the tiebreaker monitor
+    if (i.second.name == mon.monmap->tiebreaker_mon) {
+      continue;
+    }
+    for (auto& pair : i.second.crush_loc){
+      if (!mon.osdmon()->osdmap.crush->name_exists(pair.second)) {
+        ostringstream ds;
+        ds << "CRUSH location " << pair.second << " does not exist";
+        details.push_back(ds.str());
+      }
+    }
+  }
+  // WARN in ceph -s if the CRUSH location does not exist
+  if (!details.empty()) {
+    ostringstream ss;
+    ss << details.size() << " monitor(s) have nonexistent CRUSH location";
+    auto &d = checks->add("NONEXISTENT_MON_CRUSH_LOC_STRETCH_MODE", HEALTH_WARN, ss.str(),
+                details.size());
+    d.detail.swap(details);
+  }
+}
index c0e79d03375d51b161327ba27ede4f7c94cc7adb..2182b6bbfce5833a127760bfc9e31b42a4708555 100644 (file)
@@ -66,6 +66,7 @@ private:
   void check_for_older_version(health_check_map_t *checks);
   void check_for_mon_down(health_check_map_t *checks);
   void check_for_clock_skew(health_check_map_t *checks);
+  void check_mon_crush_loc_stretch_mode(health_check_map_t *checks);
   void check_if_msgr2_enabled(health_check_map_t *checks);
   bool check_leader_health();
   bool check_member_health();