From: Sridhar Seshasayee Date: Mon, 19 Sep 2022 08:48:33 +0000 (+0530) Subject: osd: Reduce default max backfill/recoveries for mclock to realistic limits X-Git-Tag: v17.2.6~276^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=81c0ca6cdc623278f64efd1daf65887d57ece621;p=ceph.git osd: Reduce default max backfill/recoveries for mclock to realistic limits The earlier max number of backfill and recovery limits were overly optimistic and caused recovery ops to overwhelm client ops regardless of the mclock profile type. Set the max backfill limit to 10. Set the max recovery limit to 10 for osds having HDD backing device. For osds having SSD backing device, set the recovery limit to 20. Introduce and new config option, 'osd_mclock_override_recovery_settings', that when set (false by default) allows the modification of the max backfill and recovery options with mclock scheduler enabled. It is important to note that with 'osd_mclock_override_recovery_settings' disabled, the config subsystem will still allow the changes made to the recovery and backfill settings when using the cli commands as the level at which the change is made is always greater than CONF_DEFAULT. To counter this, the osd will forcibly revert the recovery and backfill limits to the mClock defaults by removing the existing setting from the config store and raises a cluster log warning indicating the revert. This should force the user to set the 'osd_mclock_override_recovery_settings' option first and then make changes to the recovery and backfill limits. Fixes: https://tracker.ceph.com/issues/57529 Signed-off-by: Sridhar Seshasayee (cherry picked from commit 89e48395f8b1329066a1d7e05a4e9e083c88c1a6) --- diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index d8c98b1f271..fea7f15342d 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -1107,6 +1107,27 @@ options: - custom flags: - runtime +- name: osd_mclock_override_recovery_settings + type: bool + level: advanced + desc: Setting this option enables the override of recovery/backfill limits + for the mClock scheduler. + long_desc: This option when set enables the override of the max recovery + active and the max backfills limits with mClock scheduler active. These + options are not modifiable when mClock scheduler is active. Any attempt + to modify these values without setting this option will reset the + recovery or backfill option back to its default value. + fmt_desc: Setting this option will enable the override of the + recovery/backfill limits for the mClock scheduler as defined by the + ``osd_recovery_max_active_hdd``, ``osd_recovery_max_active_ssd`` and + ``osd_max_backfills`` options. + default: false + see_also: + - osd_recovery_max_active_hdd + - osd_recovery_max_active_ssd + - osd_max_backfills + flags: + - runtime # Set to true for testing. Users should NOT set this. # If set to true even after reading enough shards to # decode the object, any error will be reported. diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 323d1edace3..b0be86a1998 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3861,6 +3861,7 @@ int OSD::init() start_boot(); // Override a few options if mclock scheduler is enabled. + maybe_override_sleep_options_for_qos(); maybe_override_options_for_qos(); maybe_override_max_osd_capacity_for_qos(); @@ -9964,7 +9965,17 @@ void OSD::handle_conf_change(const ConfigProxy& conf, std::lock_guard l{osd_lock}; if (changed.count("osd_max_backfills") || - changed.count("osd_delete_sleep") || + changed.count("osd_recovery_max_active") || + changed.count("osd_recovery_max_active_hdd") || + changed.count("osd_recovery_max_active_ssd")) { + if (!maybe_override_options_for_qos(&changed) && + changed.count("osd_max_backfills")) { + // Scheduler is not "mclock". Fallback to earlier behavior + service.local_reserver.set_max(cct->_conf->osd_max_backfills); + service.remote_reserver.set_max(cct->_conf->osd_max_backfills); + } + } + if (changed.count("osd_delete_sleep") || changed.count("osd_delete_sleep_hdd") || changed.count("osd_delete_sleep_ssd") || changed.count("osd_delete_sleep_hybrid") || @@ -9976,16 +9987,8 @@ void OSD::handle_conf_change(const ConfigProxy& conf, changed.count("osd_recovery_sleep") || changed.count("osd_recovery_sleep_hdd") || changed.count("osd_recovery_sleep_ssd") || - changed.count("osd_recovery_sleep_hybrid") || - changed.count("osd_recovery_max_active") || - changed.count("osd_recovery_max_active_hdd") || - changed.count("osd_recovery_max_active_ssd")) { - if (!maybe_override_options_for_qos() && - changed.count("osd_max_backfills")) { - // Scheduler is not "mclock". Fallback to earlier behavior - service.local_reserver.set_max(cct->_conf->osd_max_backfills); - service.remote_reserver.set_max(cct->_conf->osd_max_backfills); - } + changed.count("osd_recovery_sleep_hybrid")) { + maybe_override_sleep_options_for_qos(); } if (changed.count("osd_min_recovery_priority")) { service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority); @@ -10154,30 +10157,105 @@ void OSD::maybe_override_max_osd_capacity_for_qos() } } -bool OSD::maybe_override_options_for_qos() +bool OSD::maybe_override_options_for_qos(const std::set *changed) { - // If the scheduler enabled is mclock, override the recovery, backfill - // and sleep options so that mclock can meet the QoS goals. + // Override options only if the scheduler enabled is mclock and the + // underlying objectstore is supported by mclock if (cct->_conf.get_val("osd_op_queue") == "mclock_scheduler" && !unsupported_objstore_for_qos()) { - dout(1) << __func__ - << ": Changing recovery/backfill/sleep settings for QoS" << dendl; - - // Set high value for recovery max active - uint32_t rec_max_active = 1000; - cct->_conf.set_val( - "osd_recovery_max_active", std::to_string(rec_max_active)); - cct->_conf.set_val( - "osd_recovery_max_active_hdd", std::to_string(rec_max_active)); - cct->_conf.set_val( - "osd_recovery_max_active_ssd", std::to_string(rec_max_active)); - - // Set high value for osd_max_backfill - uint32_t max_backfills = 1000; - cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills)); - service.local_reserver.set_max(max_backfills); - service.remote_reserver.set_max(max_backfills); + static const std::map recovery_qos_defaults { + {"osd_recovery_max_active", 0}, + {"osd_recovery_max_active_hdd", 10}, + {"osd_recovery_max_active_ssd", 20}, + {"osd_max_backfills", 10}, + }; + // Check if we were called because of a configuration change + if (changed != nullptr) { + if (cct->_conf.get_val("osd_mclock_override_recovery_settings")) { + if (changed->count("osd_max_backfills")) { + dout(1) << __func__ << " Set local and remote max backfills to " + << cct->_conf->osd_max_backfills << dendl; + service.local_reserver.set_max(cct->_conf->osd_max_backfills); + service.remote_reserver.set_max(cct->_conf->osd_max_backfills); + } + } else { + // Recovery options change was attempted without setting + // the 'osd_mclock_override_recovery_settings' option. + // Find the key to remove from the configuration db. + std::string key; + if (changed->count("osd_max_backfills")) { + key = "osd_max_backfills"; + } else if (changed->count("osd_recovery_max_active")) { + key = "osd_recovery_max_active"; + } else if (changed->count("osd_recovery_max_active_hdd")) { + key = "osd_recovery_max_active_hdd"; + } else if (changed->count("osd_recovery_max_active_ssd")) { + key = "osd_recovery_max_active_ssd"; + } else { + // No key that we are interested in. Return. + return true; + } + + // Remove the current entry from the configuration if + // different from its default value. + auto val = recovery_qos_defaults.find(key); + if (val != recovery_qos_defaults.end() && + cct->_conf.get_val(key) != val->second) { + static const std::vector osds = { + "osd", + "osd." + std::to_string(whoami) + }; + + for (auto osd : osds) { + std::string cmd = + "{" + "\"prefix\": \"config rm\", " + "\"who\": \"" + osd + "\", " + "\"name\": \"" + key + "\"" + "}"; + vector vcmd{cmd}; + + dout(1) << __func__ << " Removing Key: " << key + << " for " << osd << " from Mon db" << dendl; + monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr); + } + + // Raise a cluster warning indicating that the changes did not + // take effect and indicate the reason why. + clog->warn() << "Change to " << key << " on osd." + << std::to_string(whoami) << " did not take effect." + << " Enable osd_mclock_override_recovery_settings before" + << " setting this option."; + } + } + } else { // if (changed != nullptr) (osd boot-up) + // Override the default recovery max active and max backfills to + // higher values based on the type of backing device (hdd/ssd). + // This section is executed only during osd boot-up. + for (auto opt : recovery_qos_defaults) { + cct->_conf.set_val_default(opt.first, std::to_string(opt.second)); + if (opt.first == "osd_max_backfills") { + service.local_reserver.set_max(opt.second); + service.remote_reserver.set_max(opt.second); + } + dout(1) << __func__ << " Set default value for " << opt.first + << " to " << opt.second << dendl; + } + } + return true; + } + return false; +} + +void OSD::maybe_override_sleep_options_for_qos() +{ + // Override options only if the scheduler enabled is mclock and the + // underlying objectstore is supported by mclock + if (cct->_conf.get_val("osd_op_queue") == "mclock_scheduler" && + !unsupported_objstore_for_qos()) { + + // Override the various sleep settings // Disable recovery sleep cct->_conf.set_val("osd_recovery_sleep", std::to_string(0)); cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0)); @@ -10198,9 +10276,7 @@ bool OSD::maybe_override_options_for_qos() // Disable scrub sleep cct->_conf.set_val("osd_scrub_sleep", std::to_string(0)); - return true; } - return false; } /** diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 843c021d0d3..18946f1e967 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -2057,7 +2057,9 @@ private: int get_recovery_max_active(); void maybe_override_max_osd_capacity_for_qos(); - bool maybe_override_options_for_qos(); + void maybe_override_sleep_options_for_qos(); + bool maybe_override_options_for_qos( + const std::set *changed = nullptr); int run_osd_bench_test(int64_t count, int64_t bsize, int64_t osize,