From: Tom Sollers Date: Thu, 2 Oct 2025 13:36:16 +0000 (+0100) Subject: mon: Add a new health warning for blaum-roth erasure code profiles with a w+1 that... X-Git-Tag: v21.0.0~209^2~11^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b9aca15605aa810c9e84e58e9696ddaeaeeda271;p=ceph.git mon: Add a new health warning for blaum-roth erasure code profiles with a w+1 that is not prime This commit adds a new health warning for when a user has an erasure code profile using the blaum-roth technique which has a w+1 value that is not prime. Fixes: http://tracker.ceph.com/issues/64419 Signed-off-by: Tom Sollers --- diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index 30a9bd64405f..67dab4a8f1b4 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -1900,3 +1900,21 @@ To disable the debug mode, run the following command: .. prompt:: bash $ ceph dashboard debug disable + +BLAUM_ROTH_W_IS_NOT_PRIME +_________________________ + +An EC pool is using the ``blaum_roth`` technique and ``w + 1`` is not a prime number. +This can result in data corruption. + +To check the list of Erasure Code Profiles use the command: + +.. prompt:: bash $ + + ceph osd erasure-code-profile ls + +Then to check the ``w`` value for a particular profile use the command: + +.. prompt:: bash $ + + ceph osd erasure-code-profile get \ No newline at end of file diff --git a/src/erasure-code/jerasure/ErasureCodeJerasure.cc b/src/erasure-code/jerasure/ErasureCodeJerasure.cc index 401299d3ca7a..aea8d4e7bc03 100644 --- a/src/erasure-code/jerasure/ErasureCodeJerasure.cc +++ b/src/erasure-code/jerasure/ErasureCodeJerasure.cc @@ -707,14 +707,14 @@ void ErasureCodeJerasureLiberation::prepare() // ErasureCodeJerasureBlaumRoth // bool ErasureCodeJerasureBlaumRoth::check_w(ostream *ss) const -{ +{ // back in Firefly, w = 7 was the default and produced usable // chunks. Tolerate this value for backward compatibility. if (w == 7) return true; if (w <= 2 || !is_prime(w+1)) { *ss << "w=" << w << " must be greater than two and " - << "w+1 must be prime" << std::endl; + << "w+1 must be prime" << std::endl; return false; } else { return true; diff --git a/src/erasure-code/jerasure/ErasureCodeJerasure.h b/src/erasure-code/jerasure/ErasureCodeJerasure.h index 14ea1d99d580..fe084c0537c0 100644 --- a/src/erasure-code/jerasure/ErasureCodeJerasure.h +++ b/src/erasure-code/jerasure/ErasureCodeJerasure.h @@ -305,6 +305,7 @@ public: ErasureCodeJerasureBlaumRoth() : ErasureCodeJerasureLiberation("blaum_roth") { + DEFAULT_W = "6"; } bool check_w(std::ostream *ss) const override; diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index b78f900330ea..1a2f8639dc1d 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -27,6 +27,8 @@ #include "mon/Monitor.h" #include "mon/HealthMonitor.h" #include "mon/OSDMonitor.h" +#include "osd/OSDMap.h" + #include "messages/MMonCommand.h" #include "messages/MMonHealthChecks.h" @@ -749,6 +751,9 @@ bool HealthMonitor::check_leader_health() // STRETCH MODE check_mon_crush_loc_stretch_mode(&next); + //CHECK_ERASURE_CODE_PROFILE + check_erasure_code_profiles(&next); + if (next != leader_checks) { changed = true; leader_checks = next; @@ -1169,3 +1174,34 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set details; + + //This is a loop that will go through all the erasure code profiles + for (auto& erasure_code_profile : mon.osdmon()->osdmap.get_erasure_code_profiles()) { + dout(20) << "check_erasure_code_profiles" << "checking" << erasure_code_profile << dendl; + + //This will look at the erasure code profiles technique is blaum_roth and will check that the w key exists + if (erasure_code_profile.second.at("technique") == "blaum_roth" && + erasure_code_profile.second.count("w") == 1) { + //Read the w value from the profile and convert it to an int + int w = std::stoi(erasure_code_profile.second.at("w")); + + if (!mon.is_prime(w + 1)) { + ostringstream ds; + ds << "w+1="<< w+1 << " for the EC profile " << erasure_code_profile.first + << " is not prime and could lead to data corruption"; + details.push_back(ds.str()); + } + } + } + if (!details.empty()) { + ostringstream ss; + ss << "1 or more EC profiles have a w value such that w+1 is not prime." + << " This can result in data corruption"; + auto &d = checks->add("BLAUM_ROTH_W_IS_NOT_PRIME", HEALTH_WARN, ss.str(), details.size()); + d.detail.swap(details); + } +} diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h index 52dc82f35946..52393c599b7f 100644 --- a/src/mon/HealthMonitor.h +++ b/src/mon/HealthMonitor.h @@ -73,6 +73,7 @@ private: void check_for_clock_skew(health_check_map_t *checks); void check_mon_crush_loc_stretch_mode(health_check_map_t *checks); void check_if_msgr2_enabled(health_check_map_t *checks); + void check_erasure_code_profiles(health_check_map_t *checks); void check_netsplit(health_check_map_t *checks, std::set &mons_down); bool check_leader_health(); bool check_member_health(); diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 1d8245c98f04..810e8500e187 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -7083,3 +7083,19 @@ void Monitor::disconnect_disallowed_stretch_sessions() session_stretch_allowed(*j, blank); } } + +// A utility function that will check to see if the given value is prime using a set of the first 55 prime numbers +bool Monitor::is_prime(int value) { + int prime55[] = { + 2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71, + 73,79,83,89,97,101,103,107,109,113,127,131,137,139,149, + 151,157,163,167,173,179, + 181,191,193,197,199,211,223,227,229,233,239,241,251,257 + }; + + for (int i: prime55) + if (value == i) + return true; + + return false; +} diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 93c758dc5019..8e202dbb5e5a 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -283,6 +283,8 @@ public: bool is_degraded_stretch_mode() { return degraded_stretch_mode; } bool is_recovering_stretch_mode() { return recovering_stretch_mode; } + bool is_prime(int value); // A utility funtion that returns true if value is prime + /** * This set of functions maintains the in-memory stretch state * and sets up transitions of the map states by calling in to diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 29ceebe21061..b06515eb7e2f 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -11668,6 +11668,43 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, err = -EINVAL; goto reply_no_propose; } + + bool force_no_fake = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", force_no_fake); + + + + //This is the start of the validation for the w value in a blaum_roth profile + //this will search the Profile map, which contains the values for the parameters given in the command, for the technique parameter + if (auto found = profile_map.find("technique"); found != profile_map.end()) { + + //if the technique parameter is found then save the value of it + string technique = found->second; + + + //then search the profile map again for the w value, which doesnt have to be specified, and if it is found and the technique used is blaum-roth then check that the w value is correct. + if (found = profile_map.find("w"); technique == "blaum_roth" && found != profile_map.end()) { + int w = std::stoi(found->second); + + //checks if w+1 is not prime + if (w <= 2 || !mon.is_prime(w + 1)) { + + if (force ^ force_no_fake) { + err = -EPERM; + ss << "Creating a blaum-roth erasure code profile with a w+1 value that is not prime is dangerious," + << " as it can cause data corruption." + << " You need to use both --yes-i-really-mean-it and --force flags." << std::endl; + goto reply_no_propose; + } else if (!force && !force_no_fake) { + ss << "erasure-code-profile: " << profile_map + << " must use a w value such that w+1 is prime and w is greater than 2." << std::endl; + err = -EINVAL; + goto reply_no_propose; + } + } + } + } + string plugin = profile_map["plugin"]; if (pending_inc.has_erasure_code_profile(name)) { @@ -11689,8 +11726,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, err = 0; goto reply_no_propose; } - bool force_no_fake = false; - cmd_getval(cmdmap, "yes_i_really_mean_it", force_no_fake); + if (!force) { err = -EPERM; ss << "will not override erasure code profile " << name