From bca9920b003f5dd1bdb431ac76071756dc008ee1 Mon Sep 17 00:00:00 2001 From: Sridhar Seshasayee Date: Wed, 11 Sep 2024 18:55:10 +0530 Subject: [PATCH] common,osd: Use last valid OSD IOPS value if measured IOPS is unrealistic The OSD's IOPS capacity is used by the mClock scheduler to determine the quantum of bandwidth allocation for the various operations on the OSD. Prior to this commit, maybe_override_max_osd_capacity_for_qos() only checked whether the measured IOPS capacity exceeded the higher threshold defined by 'osd_mclock_iops_capacity_threshold_[hdd|ssd]' and, if so, fell back to the last valid or the default IOPS capacity as defined by osd_mclock_max_capacity_iops_[hdd|ssd]. It's quite possible that the reported IOPS is unrealistically low. This could be due to transient factors on the underlying device or it could indicate bad health of the device. Either way, the safer option would be to fall back to the last valid or the default IOPS setting for that OSD in order to avoid cluster performance (slow or stalled ops) issues down the line. Therefore, to handle this case, the commit introduces additional config options, viz.: - osd_mclock_iops_capacity_low_threshold_hdd - set to 50 IOPS and - osd_mclock_iops_capacity_low_threshold_ssd - set to 1000 IOPS If the measured IOPS capacity doesn't fall within the low and high threshold range, the default or the last valid IOPS capacity is used. The existing cluster log warning is suitably modified to convey the reason. Additionally, for a couple of valgrind-related teuthology tests, the cluster warning is added to the ignorelist since the reported IOPS can be very low due to slowness. 
Fixes: https://tracker.ceph.com/issues/67421 Signed-off-by: Sridhar Seshasayee (cherry picked from commit da4b85c55a15f49b241f3fc44dda2263b42dc637) --- doc/rados/configuration/mclock-config-ref.rst | 2 + qa/suites/rados/valgrind-leaks/1-start.yaml | 1 + .../rados/verify/validater/valgrind.yaml | 1 + src/common/options/osd.yaml.in | 58 ++++++++++++++++--- src/osd/OSD.cc | 22 ++++--- 5 files changed, 68 insertions(+), 16 deletions(-) diff --git a/doc/rados/configuration/mclock-config-ref.rst b/doc/rados/configuration/mclock-config-ref.rst index a338aa6da56..a31b43492b9 100644 --- a/doc/rados/configuration/mclock-config-ref.rst +++ b/doc/rados/configuration/mclock-config-ref.rst @@ -694,6 +694,8 @@ mClock Config Options .. confval:: osd_mclock_skip_benchmark .. confval:: osd_mclock_override_recovery_settings .. confval:: osd_mclock_iops_capacity_threshold_hdd +.. confval:: osd_mclock_iops_capacity_low_threshold_hdd .. confval:: osd_mclock_iops_capacity_threshold_ssd +.. confval:: osd_mclock_iops_capacity_low_threshold_ssd .. 
_the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf diff --git a/qa/suites/rados/valgrind-leaks/1-start.yaml b/qa/suites/rados/valgrind-leaks/1-start.yaml index 1cdd8a688e8..cc8c8e53766 100644 --- a/qa/suites/rados/valgrind-leaks/1-start.yaml +++ b/qa/suites/rados/valgrind-leaks/1-start.yaml @@ -12,6 +12,7 @@ overrides: - overall HEALTH_ - \(PG_ - \(POOL_APP_NOT_ENABLED\) + - OSD bench result conf: global: osd heartbeat grace: 40 diff --git a/qa/suites/rados/verify/validater/valgrind.yaml b/qa/suites/rados/verify/validater/valgrind.yaml index 03accceaff2..32ba6c2aab8 100644 --- a/qa/suites/rados/verify/validater/valgrind.yaml +++ b/qa/suites/rados/verify/validater/valgrind.yaml @@ -23,6 +23,7 @@ overrides: - \(MON_DOWN\) - \(SLOW_OPS\) - slow request + - OSD bench result valgrind: mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes] osd: [--tool=memcheck] diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index 9dc40735e48..9dff993d356 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -1207,12 +1207,33 @@ options: level: basic desc: The threshold IOPs capacity (at 4KiB block size) beyond which to ignore the OSD bench results for an OSD (for rotational media) - long_desc: This option specifies the threshold IOPS capacity for an OSD under - which the OSD bench results can be considered for QoS calculations. Only - considered for osd_op_queue = mclock_scheduler + long_desc: This option specifies the high threshold IOPS capacity for an OSD + below which the OSD bench results can be considered for QoS calculations. 
+ Only considered when osd_op_queue = mclock_scheduler fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to - ignore OSD bench results for an OSD (for rotational media) + ignore OSD bench results for an OSD (for rotational media) and fall back to + the last valid or default IOPS capacity defined by + ``osd_mclock_max_capacity_iops_hdd``. default: 500 + see_also: + - osd_mclock_max_capacity_iops_hdd + flags: + - runtime +- name: osd_mclock_iops_capacity_low_threshold_hdd + type: float + level: basic + desc: The threshold IOPs capacity (at 4KiB block size) below which to ignore + the OSD bench results for an OSD (for rotational media) + long_desc: This option specifies the low threshold IOPS capacity of an OSD + above which the OSD bench results can be considered for QoS calculations. + Only considered when osd_op_queue = mclock_scheduler + fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to + ignore OSD bench results for an OSD (for rotational media) and fall back to + the last valid or default IOPS capacity defined by + ``osd_mclock_max_capacity_iops_hdd``. + default: 50 + see_also: + - osd_mclock_max_capacity_iops_hdd flags: - runtime - name: osd_mclock_iops_capacity_threshold_ssd @@ -1220,12 +1241,33 @@ options: level: basic desc: The threshold IOPs capacity (at 4KiB block size) beyond which to ignore the OSD bench results for an OSD (for solid state media) - long_desc: This option specifies the threshold IOPS capacity for an OSD under - which the OSD bench results can be considered for QoS calculations. Only - considered for osd_op_queue = mclock_scheduler + long_desc: This option specifies the high threshold IOPS capacity for an OSD + below which the OSD bench results can be considered for QoS calculations. 
+ Only considered when osd_op_queue = mclock_scheduler fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to - ignore OSD bench results for an OSD (for solid state media) + ignore OSD bench results for an OSD (for solid state media) and fall back to + the last valid or default IOPS capacity defined by + ``osd_mclock_max_capacity_iops_ssd``. default: 80000 + see_also: + - osd_mclock_max_capacity_iops_ssd + flags: + - runtime +- name: osd_mclock_iops_capacity_low_threshold_ssd + type: float + level: basic + desc: The threshold IOPs capacity (at 4KiB block size) below which to ignore + the OSD bench results for an OSD (for solid state media) + long_desc: This option specifies the low threshold IOPS capacity for an OSD + above which the OSD bench results can be considered for QoS calculations. + Only considered when osd_op_queue = mclock_scheduler + fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to + ignore OSD bench results for an OSD (for solid state media) and fall back to + the last valid or default IOPS capacity defined by + ``osd_mclock_max_capacity_iops_ssd``. + default: 1000 + see_also: + - osd_mclock_max_capacity_iops_ssd flags: - runtime # Set to true for testing. Users should NOT set this. diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7b36ad612c3..a4455a96cae 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -10090,22 +10090,28 @@ void OSD::maybe_override_max_osd_capacity_for_qos() << dendl; // Get the threshold IOPS set for the underlying hdd/ssd. 
- double threshold_iops = 0.0; + double hi_threshold_iops = 0.0; + double lo_threshold_iops = 0.0; if (store_is_rotational) { - threshold_iops = cct->_conf.get_val( + hi_threshold_iops = cct->_conf.get_val( "osd_mclock_iops_capacity_threshold_hdd"); + lo_threshold_iops = cct->_conf.get_val( + "osd_mclock_iops_capacity_low_threshold_hdd"); } else { - threshold_iops = cct->_conf.get_val( + hi_threshold_iops = cct->_conf.get_val( "osd_mclock_iops_capacity_threshold_ssd"); + lo_threshold_iops = cct->_conf.get_val( + "osd_mclock_iops_capacity_low_threshold_ssd"); } // Persist the iops value to the MON store or throw cluster warning - // if the measured iops exceeds the set threshold. If the iops exceed - // the threshold, the default value is used. - if (iops > threshold_iops) { + // if the measured iops is not in the threshold range. If the iops is + // not within the threshold range, the current/default value is retained. + if (iops < lo_threshold_iops || iops > hi_threshold_iops) { clog->warn() << "OSD bench result of " << std::to_string(iops) - << " IOPS exceeded the threshold limit of " - << std::to_string(threshold_iops) << " IOPS for osd." + << " IOPS is not within the threshold limit range of " + << std::to_string(lo_threshold_iops) << " IOPS and " + << std::to_string(hi_threshold_iops) << " IOPS for osd." << std::to_string(whoami) << ". IOPS capacity is unchanged" << " at " << std::to_string(cur_iops) << " IOPS. The" << " recommendation is to establish the osd's IOPS capacity" -- 2.39.5