From: Eric
Date: Tue, 24 Feb 2026 18:39:58 +0000 (-0800)
Subject: mon: Health warning for colocated monitors
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=44e62f85d400c68031c507a14a26b3758928c29c;p=ceph-ci.git

mon: Health warning for colocated monitors

In HealthMonitor::tick(), check ip addresses of all monitors to detect if any
are on the same node. Create a HEALTH_WARNING if this occurs.

Added standalone test.

Fixes: https://tracker.ceph.com/issues/74955
Signed-off-by: Eric Zhang
---

diff --git a/qa/standalone/mon/mon-colocated.sh b/qa/standalone/mon/mon-colocated.sh
new file mode 100755
index 00000000000..efd54f836c0
--- /dev/null
+++ b/qa/standalone/mon/mon-colocated.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+
+# Can be executed using ../qa/run-standalone.sh
+# The goal of this script is to test the "MON_COLOCATED" HEALTH_WARN under
+# different monitor IP configurations:
+# - All monitors have different IPs
+# - One pair of monitors share the same IP
+# - Two pairs of monitors share the same IP
+
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+# Export the monitor addresses and the base CEPH_ARGS, then run the requested
+# TEST_* functions (all of them when none is named), each between setup and
+# teardown.
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
+    export CEPH_MON_B="127.0.0.2:7141" # git grep '\<7141\>' : there must be only one
+    export CEPH_MON_C="127.0.0.3:7142" # git grep '\<7142\>' : there must be only one
+    export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
+    export CEPH_MON_E="127.0.0.2:7144" # git grep '\<7144\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+
+    export BASE_CEPH_ARGS=$CEPH_ARGS
+    CEPH_ARGS+="--mon-host=$CEPH_MON_A"
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+# Start monitors a-e one at a time (d reuses a's IP, e reuses b's IP) and check
+# that MON_COLOCATED is raised only once the option is enabled, names every
+# pair sharing an IP, and clears after the duplicate monitors are removed.
+TEST_mon_colocated() {
+    local dir=$1
+    setup $dir || return 1
+
+    run_mon $dir a --public-addr $CEPH_MON_A || return 1
+
+    run_mon $dir b --public-addr $CEPH_MON_B || return 1
+    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
+
+    run_mon $dir c --public-addr $CEPH_MON_C || return 1
+    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
+
+    wait_for_health_ok || return 1
+
+    run_mon $dir d --public-addr $CEPH_MON_D || return 1
+    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
+
+    # The option defaults to false, so the shared IP is not reported yet.
+    wait_for_health_ok || return 1
+
+    ceph config set mon mon_warn_on_colocated_monitors true || return 1
+
+    wait_for_health "MON_COLOCATED" || return 1
+    wait_for_health "2 monitors (a,d) share the same ip = 127.0.0.1" || return 1
+
+    run_mon $dir e --public-addr $CEPH_MON_E || return 1
+    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
+
+    wait_for_health "MON_COLOCATED" || return 1
+    wait_for_health "2 monitors (a,d) share the same ip = 127.0.0.1" || return 1
+    wait_for_health "2 monitors (b,e) share the same ip = 127.0.0.2" || return 1
+
+    ceph mon remove e || return 1
+
+    wait_for_health "MON_COLOCATED" || return 1
+    wait_for_health "2 monitors (a,d) share the same ip = 127.0.0.1" || return 1
+
+    ceph mon remove d || return 1
+
+    wait_for_health_ok || return 1
+
+    teardown $dir || return 1
+}
+
+main mon-colocated "$@"
diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in
index 9e9554b60df..9c252017883 100644
--- a/src/common/options/mon.yaml.in
+++ b/src/common/options/mon.yaml.in
@@ -634,6 +634,14 @@ options:
   default: true
   services:
   - mon
+- name: mon_warn_on_colocated_monitors
+  type: bool
+  level: advanced
+  desc: issue MON_COLOCATED health warning if two or
+    more monitors have the same IP address
+  default: false
+  services:
+  - mon
 - name: mon_stretch_cluster_recovery_ratio
   type: float
   level: advanced
diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc
index 000aafb2511..36ffc903487 100644
--- a/src/mon/HealthMonitor.cc
+++ b/src/mon/HealthMonitor.cc
@@ 
-755,6 +755,11 @@ bool HealthMonitor::check_leader_health()
   //CHECK_ERASURE_CODE_PROFILE
   check_erasure_code_profiles(&next);
 
+  // MON_COLOCATED
+  if (g_conf().get_val<bool>("mon_warn_on_colocated_monitors")) {
+    check_for_colocated_monitors(&next);
+  }
+
   if (next != leader_checks) {
     changed = true;
     leader_checks = next;
@@ -762,6 +767,59 @@ bool HealthMonitor::check_leader_health()
   return changed;
 }
 
+/**
+ * Raise MON_COLOCATED when two or more monitors in the monmap share an IP.
+ *
+ * Monitors are grouped by the IP part of their msgr2 public address. Each
+ * group with more than one member contributes a clause to the check summary
+ * and one detail entry per affected monitor.
+ *
+ * @param checks health check map that receives the MON_COLOCATED warning
+ */
+void HealthMonitor::check_for_colocated_monitors(health_check_map_t *checks)
+{
+  // Ordered map: summary and detail text come out in the same order on every
+  // call, so the comparison against leader_checks does not hinge on hash order.
+  std::map<std::string, std::vector<std::string>> mons_by_ip;
+  for (const auto& [mon_id, mon_info] : mon.monmap->mon_info) {
+    const auto addr = mon_info.public_addrs.msgr2_addr();
+    if (addr.is_blank_ip()) {
+      // No usable msgr2 address: keying these on one blank string would
+      // report unrelated monitors as colocated.
+      continue;
+    }
+    mons_by_ip[addr.ip_only_to_str()].push_back(mon_id);
+  }
+
+  std::ostringstream summary;
+  std::vector<std::string> detail;
+  for (const auto& [ip, mon_ids] : mons_by_ip) {
+    if (mon_ids.size() < 2) {
+      continue;
+    }
+    if (!detail.empty()) {
+      summary << "; ";  // keep the summary on one line
+    }
+    summary << mon_ids.size() << " monitors (";
+    for (size_t i = 0; i < mon_ids.size(); ++i) {
+      summary << (i ? "," : "") << mon_ids[i];
+      detail.push_back("mon." + mon_ids[i] +
+                       " is on the same node as another monitor");
+    }
+    summary << ") share the same ip = " << ip;
+  }
+
+  if (!detail.empty()) {
+    // One detail entry per colocated monitor; the count is the number of
+    // monitors affected.
+    auto& d = checks->add("MON_COLOCATED", HEALTH_WARN, summary.str(),
+                          detail.size());
+    for (const auto& line : detail) {
+      d.detail.push_back(line);
+    }
+  }
+}
+
 void HealthMonitor::check_for_older_version(health_check_map_t *checks)
 {
   static ceph::coarse_mono_time old_version_first_time =
diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h
index b26a40d7c9d..31bc3cd231e 100644
--- a/src/mon/HealthMonitor.h
+++ b/src/mon/HealthMonitor.h
@@ -75,6 +75,7 @@ private:
   bool prepare_command(MonOpRequestRef op);
   bool prepare_health_checks(MonOpRequestRef op);
 
+  void check_for_colocated_monitors(health_check_map_t *checks);
   void check_for_older_version(health_check_map_t *checks);
   void check_for_mon_down(health_check_map_t *checks, std::set<std::string> &mons_down);
   void check_for_clock_skew(health_check_map_t *checks);