From: Kamoltat (Junior) Sirivadhna Date: Wed, 15 Apr 2026 22:34:08 +0000 (+0000) Subject: mon: make tiebreaker mon optional in stretch-mode X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=aa9e1dad056fa5cb611310a8c0ed9d1e890b3f2d;p=ceph.git mon: make tiebreaker mon optional in stretch-mode Motivation: To future support EC stretch feature, we need to simplify how we enable stretch-mode Solution: Make tiebreaker argument optional. old: ``` ceph mon enable_stretch_mode ``` new: ceph mon enable_stretch_mode Ceph will try to select a tiebreaker mon that resides in the crush type but doesn't belong to any of the data sites which the OSDs resides in. Also created a helper function `MonmapMonitor::validate_and_enable_stretch_mode` inside `MonmapMonitor::try_enable_stretch_mode` making the logic unittestable Moreover, ceph mon enable_stretch_mode will automatically set monitor election strategy to Connectivity. We now also enforce that at least 1 monitor exists for each data zone. Signed-off-by: Kamoltat (Junior) Sirivadhna --- diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index f03ada651ebd..fcc30f840f78 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -1729,12 +1729,12 @@ int CrushWrapper::get_parent_of_type(int item, int type, int rule) const return 0; // not found } -void CrushWrapper::get_subtree_of_type(int type, vector *subtrees) +void CrushWrapper::get_subtree_of_type(int type, vector *subtrees) const { set roots; find_roots(&roots); for (auto r: roots) { - crush_bucket *b = get_bucket(r); + const crush_bucket *b = get_bucket(r); if (IS_ERR(b)) continue; get_children_of_type(b->id, type, subtrees); diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index a8bf035d4ff0..064a966a26c9 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -787,7 +787,7 @@ public: /** * enumerate all subtrees by type */ - void get_subtree_of_type(int type, std::vector *subtrees); + void get_subtree_of_type(int type, std::vector *subtrees) const; /** diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index a29fa23cad9b..825005e087d5 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -551,7 +551,7 @@ COMMAND("mon set_location " \ "specify location for the monitor , using CRUSH bucket names", \ "mon", "rw") COMMAND("mon enable_stretch_mode " \ - "name=tiebreaker_mon,type=CephString, " + "name=tiebreaker_mon,type=CephString,req=false, " "name=new_crush_rule,type=CephString, " "name=dividing_bucket,type=CephString, ", "enable stretch mode, changing the peering rules and " diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc index 23e72f32e7b5..a476ac678b0e 100644 --- a/src/mon/MonmapMonitor.cc +++ b/src/mon/MonmapMonitor.cc @@ -1141,76 +1141,80 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op) mon.osdmon()->wait_for_writeable(op, new Monitor::C_RetryMessage(&mon, op)); return false; /* do not propose, yet */ } - { - if (monmap.stretch_mode_enabled) { - ss << "stretch mode is already engaged"; - err = -EINVAL; - goto reply_no_propose; - } - if (pending_map.stretch_mode_enabled) { - ss << "stretch mode currently committing"; - err = 0; - goto reply_no_propose; - } - string tiebreaker_mon; - if (!cmd_getval(cmdmap, "tiebreaker_mon", tiebreaker_mon)) { - ss << "must specify a tiebreaker monitor"; - err = -EINVAL; - goto reply_no_propose; - } - string new_crush_rule; - if (!cmd_getval(cmdmap, "new_crush_rule", new_crush_rule)) { - ss << "must specify a new crush rule that spreads out copies over multiple sites"; - err = -EINVAL; - goto reply_no_propose; - } - string dividing_bucket; - if (!cmd_getval(cmdmap, "dividing_bucket", dividing_bucket)) { - ss << "must specify a dividing bucket"; - err = -EINVAL; - goto reply_no_propose; - } - //okay, initial arguments make sense, check pools and cluster state - err = mon.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE, ss); - if (err) - goto reply_no_propose; - struct Plugger { - Paxos &p; - Plugger(Paxos &p) : p(p) { p.plug(); } - ~Plugger() { p.unplug(); } - } plugger(paxos); - - set pools; - bool okay = false; - int errcode = 0; - - mon.osdmon()->try_enable_stretch_mode_pools(ss, &okay, &errcode, - &pools, new_crush_rule); - if (!okay) { - err = errcode; - goto reply_no_propose; - } - try_enable_stretch_mode(ss, &okay, &errcode, false, - tiebreaker_mon, dividing_bucket); - if (!okay) { - err = errcode; - goto reply_no_propose; - } - mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, false, - dividing_bucket, 2, pools, new_crush_rule); - if (!okay) { - err = errcode; - goto reply_no_propose; - } - // everything looks good, actually commit the changes! - try_enable_stretch_mode(ss, &okay, &errcode, true, - tiebreaker_mon, dividing_bucket); - mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, true, - dividing_bucket, - 2, // right now we only support 2 sites - pools, new_crush_rule); - ceph_assert(okay == true); + if (monmap.stretch_mode_enabled) { + ss << "stretch mode is already engaged"; + err = -EINVAL; + goto reply_no_propose; + } + if (pending_map.stretch_mode_enabled) { + ss << "stretch mode currently committing"; + err = 0; + goto reply_no_propose; + } + string new_crush_rule; + if (!cmd_getval(cmdmap, "new_crush_rule", new_crush_rule)) { + ss << "must specify a new crush rule that spreads out copies over multiple sites"; + err = -EINVAL; + goto reply_no_propose; + } + string dividing_bucket; + if (!cmd_getval(cmdmap, "dividing_bucket", dividing_bucket)) { + ss << "must specify a dividing bucket"; + err = -EINVAL; + goto reply_no_propose; + } + string tiebreaker_mon; + if (!cmd_getval(cmdmap, "tiebreaker_mon", tiebreaker_mon)) { + tiebreaker_mon = ""; + } + //okay, initial arguments make sense, check pools and cluster state + err = mon.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE, ss); + if (err) goto reply_no_propose; + + // Get pending CRUSH once for validation + CrushWrapper crush = mon.osdmon()->_get_pending_crush(); + + set pools; + bool okay = false; + int errcode = 0; + struct Plugger { + // RAII helper to make sure no paxos commits during critical section. + // Auto unplug upon destruction. + public: + explicit Plugger(Paxos &p) : paxos(p) { paxos.plug(); } + ~Plugger() { paxos.unplug(); } + Plugger(const Plugger&) = delete; // prevent copying + Plugger& operator=(const Plugger&) = delete; // prevent object assignment + private: + Paxos &paxos; + }; + Plugger plugger(paxos); + mon.osdmon()->try_enable_stretch_mode_pools(ss, &okay, &errcode, + &pools, new_crush_rule); + if (!okay) { + err = errcode; + goto reply_no_propose; + } + try_enable_stretch_mode(ss, &okay, &errcode, false, + tiebreaker_mon, dividing_bucket, crush); + if (!okay) { + err = errcode; + goto reply_no_propose; } + mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, false, + dividing_bucket, 2, pools, new_crush_rule, crush); + if (!okay) { + err = errcode; + goto reply_no_propose; + } + // everything looks good, actually commit the changes! + try_enable_stretch_mode(ss, &okay, &errcode, true, + tiebreaker_mon, dividing_bucket, crush); + mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, true, + dividing_bucket, + 2, // right now we only support 2 sites + pools, new_crush_rule, crush); + ceph_assert(okay == true); request_proposal(mon.osdmon()); } else if (prefix == "mon disable_stretch_mode") { if (!mon.osdmon()->is_writeable()) { @@ -1276,78 +1280,168 @@ reply_propose: } } -void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay, - int *errcode, bool commit, - const string& tiebreaker_mon, - const string& dividing_bucket) +void MonmapMonitor::validate_and_enable_stretch_mode( + const MonMap& monmap, // passed as arg for unittest + MonMap& pending_map, // passed as arg for unittest + stringstream& ss, bool *okay, + int *errcode, bool commit, + string tiebreaker_mon, + const string& dividing_bucket, + const CrushWrapper& crush) { - dout(20) << __func__ << dendl; + /* A helper function so that we can unittest + * the logic of going into stretch mode. + * Validates against the current committed monmap state, + * but modifies pending_map only when commit=true. + */ *okay = false; - if (pending_map.strategy != MonMap::CONNECTIVITY) { - ss << "Monitors must use the connectivity strategy to enable stretch mode"; + MonMap::election_strategy strategy = pending_map.strategy; + if (strategy != MonMap::CONNECTIVITY) { + strategy = MonMap::CONNECTIVITY; + } + // Validate tiebreaker_mon only if explicitly specified + if (!tiebreaker_mon.empty() && !monmap.contains(tiebreaker_mon)) { + ss << "mon " << tiebreaker_mon << " does not seem to exist"; + *errcode = -ENOENT; + return; + } + + // Get CRUSH subtrees and dividing_id + int dividing_id = *crush.get_validated_type_id(dividing_bucket); + vector subtrees; + crush.get_subtree_of_type(dividing_id, &subtrees); + if (subtrees.size() != 2) { + ss << "there are " << subtrees.size() << " " << dividing_bucket + << "'s in the cluster but stretch mode currently only works with 2!"; *errcode = -EINVAL; - ceph_assert(!commit); return; } - if (!pending_map.contains(tiebreaker_mon)) { - ss << "mon " << tiebreaker_mon << "does not seem to exist"; - *errcode = -ENOENT; - ceph_assert(!commit); + + // Get subtree names from CRUSH + set subtree_names; + for (int subtree_id : subtrees) { + const char *subtree_name = crush.get_item_name(subtree_id); + if (subtree_name) { + subtree_names.insert(subtree_name); + } + } + + if (subtree_names.size() != 2) { + ss << "Could not get names for both CRUSH subtrees"; + *errcode = -EINVAL; return; } - map buckets; - for (const auto&mii : mon.monmap->mon_info) { - const auto& mi = mii.second; - const auto& bi = mi.crush_loc.find(dividing_bucket); - if (bi == mi.crush_loc.end()) { + + // Build map of monitor names to their location at dividing_bucket level + map mon_name_to_crush_loc; + for (const auto& [mon_name, mon_info] : monmap.mon_info) { + auto loc_it = mon_info.crush_loc.find(dividing_bucket); + if (loc_it == mon_info.crush_loc.end()) { ss << "Could not find location entry for " << dividing_bucket - << " on monitor " << mi.name; + << " on monitor " << mon_name; *errcode = -EINVAL; - ceph_assert(!commit); return; } - buckets[mii.first] = bi->second; + mon_name_to_crush_loc.emplace(mon_name, loc_it->second); } - string bucket1, bucket2, tiebreaker_bucket; - for (auto& i : buckets) { - if (i.first == tiebreaker_mon) { - tiebreaker_bucket = i.second; - continue; - } - if (bucket1.empty()) { - bucket1 = i.second; - } - if (bucket1 != i.second && - bucket2.empty()) { - bucket2 = i.second; - } - if (bucket1 != i.second && - bucket2 != i.second) { - ss << "There are too many monitor buckets for stretch mode, found " - << bucket1 << "," << bucket2 << "," << i.second; - *errcode = -EINVAL; - ceph_assert(!commit); - return; + + // Identify the two data zones - they must match the CRUSH subtree names + map> location_to_mons; + for (const auto& [mon_name, loc_name] : mon_name_to_crush_loc) { + location_to_mons[loc_name].insert(mon_name); + } + + vector data_zones; + for (const string& subtree_name : subtree_names) { + if (location_to_mons.count(subtree_name)) { + data_zones.push_back(subtree_name); } } - if (bucket1.empty() || bucket2.empty()) { - ss << "There are not enough monitor buckets for stretch mode;" - << " must have at least 2 plus the tiebreaker but only found " - << (bucket1.empty() ? bucket1 : bucket2); + + if (data_zones.size() != 2) { + ss << "Could not find monitors in both CRUSH subtrees ("; + for (auto it = subtree_names.begin(); it != subtree_names.end(); ++it) { + if (it != subtree_names.begin()) ss << ", "; + ss << *it; + } + ss << "); found monitors only in: "; + for (size_t i = 0; i < data_zones.size(); ++i) { + if (i > 0) ss << ", "; + ss << data_zones[i]; + } *errcode = -EINVAL; - ceph_assert(!commit); return; } - if (tiebreaker_bucket == bucket1 || - tiebreaker_bucket == bucket2) { - ss << "The named tiebreaker monitor " << tiebreaker_mon - << " is in the same CRUSH bucket " << tiebreaker_bucket - << " as other monitors"; + + string data_zone1 = data_zones[0]; + string data_zone2 = data_zones[1]; + + // Verify each data zone has at least 1 monitor + size_t zone1_mon_count = location_to_mons[data_zone1].size(); + size_t zone2_mon_count = location_to_mons[data_zone2].size(); + + if (zone1_mon_count == 0 || zone2_mon_count == 0) { + ss << "Each data zone must have at least 1 monitor; " + << data_zone1 << " has " << zone1_mon_count << ", " + << data_zone2 << " has " << zone2_mon_count; *errcode = -EINVAL; - ceph_assert(!commit); return; } + + // Auto-select tiebreaker monitor if not specified + if (tiebreaker_mon.empty()) { + // Find tiebreaker monitor(s) - must be exactly one monitor not in data zones + vector tiebreaker_mons; + for (const auto& [mon_name, loc_name] : mon_name_to_crush_loc) { + if (loc_name != data_zone1 && loc_name != data_zone2) { + tiebreaker_mons.push_back(mon_name); + } + } + + if (tiebreaker_mons.empty()) { + ss << "Could not auto-select a tiebreaker monitor; " + << "must have a monitor in a third " << dividing_bucket + << " location (found only " << data_zone1 << " and " << data_zone2 << ")"; + *errcode = -EINVAL; + return; + } + + if (tiebreaker_mons.size() > 1) { + ss << "Could not auto-select a tiebreaker monitor; " + << "found " << tiebreaker_mons.size() << " monitors (" << tiebreaker_mons + << ") not in the 2 data zones " + << data_zone1 << " and " << data_zone2 << ", need exactly 1"; + *errcode = -EINVAL; + return; + } + + // Exactly one tiebreaker monitor found + tiebreaker_mon = tiebreaker_mons[0]; + } else { + // Validate explicitly specified tiebreaker monitor + auto tiebreaker_loc_it = mon_name_to_crush_loc.find(tiebreaker_mon); + if (tiebreaker_loc_it == mon_name_to_crush_loc.end()) { + // tiebreaker doesn't belong to any of the dividing_bucket locations + ss << "tiebreaker monitor " << tiebreaker_mon + << " does not have a location specified for " << dividing_bucket; + *errcode = -EINVAL; + return; + } + + const string& tiebreaker_location = tiebreaker_loc_it->second; + if (tiebreaker_location == data_zone1 || tiebreaker_location == data_zone2) { + ss << "tiebreaker monitor " << tiebreaker_mon + << " is in " << tiebreaker_location + << ", which is one of the two data zones (" + << data_zone1 << ", " << data_zone2 + << "); tiebreaker must be in a third location"; + *errcode = -EINVAL; + return; + } + } + if (commit) { + pending_map.strategy = strategy; pending_map.disallowed_leaders.insert(tiebreaker_mon); pending_map.tiebreaker_mon = tiebreaker_mon; pending_map.stretch_mode_enabled = true; @@ -1355,6 +1449,40 @@ void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay, *okay = true; } +void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay, + int *errcode, bool commit, + string tiebreaker_mon, + const string& dividing_bucket, + const CrushWrapper& crush) +{ + dout(20) << __func__ << dendl; + + // Check if monitors support connectivity election strategy if not already using it + if (pending_map.strategy != MonMap::CONNECTIVITY) { + if (!mon.get_quorum_mon_features().contains_all( + ceph::features::mon::FEATURE_PINGING)) { + *okay = false; + *errcode = -ENOTSUP; + ss << "Not all monitors support changing election strategies; please upgrade first!"; + return; + } + } + + validate_and_enable_stretch_mode(*mon.monmap, pending_map, ss, okay, errcode, commit, + tiebreaker_mon, dividing_bucket, crush); + + // Add debug logging if successful + if (*okay && !tiebreaker_mon.empty()) { + // Note: tiebreaker_mon may have been auto-selected, but we can't easily retrieve it here + // The validate_and_enable_stretch_mode function modifies it internally + dout(20) << __func__ << " stretch mode validation succeeded" << dendl; + } + + if (!*okay) { + ceph_assert(!commit); + } +} + void MonmapMonitor::trigger_degraded_stretch_mode(const set& dead_mons) { dout(20) << __func__ << dendl; diff --git a/src/mon/MonmapMonitor.h b/src/mon/MonmapMonitor.h index 612b967b65ac..f6b3e272d8bd 100644 --- a/src/mon/MonmapMonitor.h +++ b/src/mon/MonmapMonitor.h @@ -89,13 +89,35 @@ private: * @param okay: Filled to true if okay, false if validation fails * @param errcode: filled with -errno if there's a problem * @param commit: true if we should commit the change, false if just testing - * @param tiebreaker_mon: the name of the monitor to declare tiebreaker + * @param tiebreaker_mon: the name of the monitor to declare tiebreaker (empty for auto-select) * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster + * @param crush: the pending CrushWrapper for validating monitor locations against subtrees + * + * Note: CRUSH bucket type and subtree count validation is performed by + * OSDMonitor::try_enable_stretch_mode() to avoid redundancy. */ void try_enable_stretch_mode(std::stringstream& ss, bool *okay, int *errcode, bool commit, - const std::string& tiebreaker_mon, - const std::string& dividing_bucket); + std::string tiebreaker_mon, + const std::string& dividing_bucket, + const CrushWrapper& crush); + +public: + /** + * Static helper for validating and enabling stretch mode on a MonMap. + * Extracted for testability - can be called from unit tests. + * Parameters same as try_enable_stretch_mode above. + * @param monmap: current committed monmap to validate against + * @param pending_map: pending monmap to modify when commit=true + */ + static void validate_and_enable_stretch_mode( + const MonMap& monmap, + MonMap& pending_map, + std::stringstream& ss, bool *okay, + int *errcode, bool commit, + std::string tiebreaker_mon, + const std::string& dividing_bucket, + const CrushWrapper& crush); public: /** diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 856fce6ff18a..0413dc1fa442 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -15582,6 +15582,10 @@ void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay, set* pools, const string& new_crush_rule) { + /* Validate pool configurations for stretch mode enablement. + * This checks that the specified crush rule exists and that all pools + * are replicated pools with default size/min_size. + */ dout(20) << __func__ << dendl; *okay = false; int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule); @@ -15623,11 +15627,11 @@ void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay, const string& dividing_bucket, uint32_t bucket_count, const set& pools, - const string& new_crush_rule) + const string& new_crush_rule, + CrushWrapper& crush) { dout(20) << __func__ << dendl; *okay = false; - CrushWrapper crush = _get_pending_crush(); int dividing_id = -1; if (auto type_id = crush.get_validated_type_id(dividing_bucket); !type_id.has_value()) { diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index b37681067fa2..5b3b633cca1b 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -851,7 +851,8 @@ public: const std::string& dividing_bucket, uint32_t bucket_count, const std::set& pools, - const std::string& new_crush_rule); + const std::string& new_crush_rule, + CrushWrapper& crush); /** * * Set all stretch mode values of all pools back to pre-stretch mode values.