From 1f44f38359f58d37ff875136852078aebb438ba1 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Mon, 11 Aug 2025 14:14:45 -0700 Subject: [PATCH] common/mclock_common: don't use config keys to pass profile info to client_registry Rather than overriding config variables for qos params based on profile, use MclockConfig::current_profile. This avoids the need to update the mon for cases where users set both a non-custom profile and one or more of the qos params. Also updates config descriptions to clarify that they are ignored unless osd_mclock_profile is set to custom. Signed-off-by: Samuel Just --- src/common/mclock_common.cc | 228 ++++++++-------------------- src/common/mclock_common.h | 53 ++----- src/common/options/osd.yaml.in | 18 +++ src/osd/scheduler/mClockScheduler.h | 2 +- 4 files changed, 95 insertions(+), 206 deletions(-) diff --git a/src/common/mclock_common.cc b/src/common/mclock_common.cc index d8cb8fa6950..cb7be4289f9 100644 --- a/src/common/mclock_common.cc +++ b/src/common/mclock_common.cc @@ -80,9 +80,10 @@ std::ostream& operator<<(std::ostream& out, * for the osd_mclock_scheduler_client_* parameters prior to calling * update_from_config -- see set_config_defaults_from_profile(). */ -void ClientRegistry::update_from_config(const ConfigProxy &conf, - const double capacity_per_shard) +void ClientRegistry::update_from_profile(const profile_t ¤t_profile, + const double capacity_per_shard) { + auto get_res = [&](double res) { if (res) { return res * capacity_per_shard; @@ -99,43 +100,22 @@ void ClientRegistry::update_from_config(const ConfigProxy &conf, } }; - // Set external client infos - double res = conf.get_val( - "osd_mclock_scheduler_client_res"); - double lim = conf.get_val( - "osd_mclock_scheduler_client_lim"); - uint64_t wgt = conf.get_val( - "osd_mclock_scheduler_client_wgt"); default_external_client_info.update( - get_res(res), - wgt, - get_lim(lim)); - - // Set background recovery client infos - res = conf.get_val( - "osd_mclock_scheduler_background_recovery_res"); - lim = conf.get_val( - "osd_mclock_scheduler_background_recovery_lim"); - wgt = conf.get_val( - "osd_mclock_scheduler_background_recovery_wgt"); + get_res(current_profile.client.reservation), + current_profile.client.weight, + get_lim(current_profile.client.limit)); + internal_client_infos[ static_cast(SchedulerClass::background_recovery)].update( - get_res(res), - wgt, - get_lim(lim)); - - // Set background best effort client infos - res = conf.get_val( - "osd_mclock_scheduler_background_best_effort_res"); - lim = conf.get_val( - "osd_mclock_scheduler_background_best_effort_lim"); - wgt = conf.get_val( - "osd_mclock_scheduler_background_best_effort_wgt"); + get_res(current_profile.background_recovery.reservation), + current_profile.background_recovery.weight, + get_lim(current_profile.background_recovery.limit)); + internal_client_infos[ static_cast(SchedulerClass::background_best_effort)].update( - get_res(res), - wgt, - get_lim(lim)); + get_res(current_profile.background_best_effort.reservation), + current_profile.background_best_effort.weight, + get_lim(current_profile.background_best_effort.limit)); } const dmc::ClientInfo *ClientRegistry::get_external_client( @@ -180,66 +160,7 @@ static std::ostream &operator<<(std::ostream &lhs, const profile_t &rhs) << "]"; } -void MclockConfig::set_config_defaults_from_profile() -{ - // Let only a single osd shard (id:0) set the profile configs - if (shard_id > 0) { - return; - } - - const profile_t *profile = nullptr; - auto mclock_profile = cct->_conf.get_val("osd_mclock_profile"); - if (mclock_profile == "high_client_ops") { - profile = &HIGH_CLIENT_OPS; - dout(10) << "Setting high_client_ops profile " << *profile << dendl; - } else if (mclock_profile == "high_recovery_ops") { - profile = &HIGH_RECOVERY_OPS; - dout(10) << "Setting high_recovery_ops profile " << *profile << dendl; - } else if (mclock_profile == "balanced") { - profile = &BALANCED; - dout(10) << "Setting balanced profile " << *profile << dendl; - } else if (mclock_profile == "custom") { - dout(10) << "Profile set to custom, not setting defaults" << dendl; - return; - } else { - derr << "Invalid mclock profile: " << mclock_profile << dendl; - ceph_assert("Invalid choice of mclock profile" == 0); - return; - } - ceph_assert(nullptr != profile); - - auto set_config = [&conf = cct->_conf](const char *key, auto val) { - conf.set_val_default(key, std::to_string(val)); - }; - - set_config("osd_mclock_scheduler_client_res", profile->client.reservation); - set_config("osd_mclock_scheduler_client_wgt", profile->client.weight); - set_config("osd_mclock_scheduler_client_lim", profile->client.limit); - - set_config( - "osd_mclock_scheduler_background_recovery_res", - profile->background_recovery.reservation); - set_config( - "osd_mclock_scheduler_background_recovery_wgt", - profile->background_recovery.weight); - set_config( - "osd_mclock_scheduler_background_recovery_lim", - profile->background_recovery.limit); - - set_config( - "osd_mclock_scheduler_background_best_effort_res", - profile->background_best_effort.reservation); - set_config( - "osd_mclock_scheduler_background_best_effort_wgt", - profile->background_best_effort.weight); - set_config( - "osd_mclock_scheduler_background_best_effort_lim", - profile->background_best_effort.limit); - - cct->_conf.apply_changes(nullptr); -} - -void MclockConfig::set_osd_capacity_params_from_config() +void MclockConfig::set_from_config() { uint64_t osd_bandwidth_capacity; double osd_iop_capacity; @@ -272,6 +193,47 @@ void MclockConfig::set_osd_capacity_params_from_config() << ", osd_bandwidth_capacity_per_shard " << osd_bandwidth_capacity_per_shard << " bytes/second" << dendl; + + auto mclock_profile = cct->_conf.get_val("osd_mclock_profile"); + if (mclock_profile == "high_client_ops") { + current_profile = HIGH_CLIENT_OPS; + dout(10) << "Setting high_client_ops profile " << current_profile << dendl; + } else if (mclock_profile == "high_recovery_ops") { + current_profile = HIGH_RECOVERY_OPS; + dout(10) << "Setting high_recovery_ops profile " << current_profile << dendl; + } else if (mclock_profile == "balanced") { + current_profile = BALANCED; + dout(10) << "Setting balanced profile " << current_profile << dendl; + } else if (mclock_profile == "custom") { + current_profile = { + { + cct->_conf.get_val("osd_mclock_scheduler_client_res"), + cct->_conf.get_val("osd_mclock_scheduler_client_wgt"), + cct->_conf.get_val("osd_mclock_scheduler_client_lim") + }, { + cct->_conf.get_val( + "osd_mclock_scheduler_background_recovery_res"), + cct->_conf.get_val( + "osd_mclock_scheduler_background_recovery_wgt"), + cct->_conf.get_val( + "osd_mclock_scheduler_background_recovery_lim") + }, { + cct->_conf.get_val( + "osd_mclock_scheduler_background_best_effort_res"), + cct->_conf.get_val( + "osd_mclock_scheduler_background_best_effort_wgt"), + cct->_conf.get_val( + "osd_mclock_scheduler_background_best_effort_lim") + } + }; + dout(10) << "Setting custom profile " << current_profile << dendl; + } else { + derr << "Invalid mclock profile: " << mclock_profile << dendl; + ceph_assert("Invalid choice of mclock profile" == 0); + return; + } + client_registry.update_from_profile( + current_profile, osd_bandwidth_capacity_per_shard); } void MclockConfig::init_logger() @@ -380,80 +342,10 @@ uint32_t MclockConfig::calc_scaled_cost(int item_cost) void MclockConfig::handle_conf_change(const ConfigProxy& conf, const std::set &changed) { - if (changed.count("osd_mclock_max_capacity_iops_hdd") || - changed.count("osd_mclock_max_capacity_iops_ssd")) { - set_osd_capacity_params_from_config(); - client_registry.update_from_config( - conf, osd_bandwidth_capacity_per_shard); - } - if (changed.count("osd_mclock_max_sequential_bandwidth_hdd") || - changed.count("osd_mclock_max_sequential_bandwidth_ssd")) { - set_osd_capacity_params_from_config(); - client_registry.update_from_config( - conf, osd_bandwidth_capacity_per_shard); - } - if (changed.count("osd_mclock_profile")) { - set_config_defaults_from_profile(); - client_registry.update_from_config( - conf, osd_bandwidth_capacity_per_shard); - } - - auto get_changed_key = [&changed]() -> std::optional { - static const std::vector qos_params = { - "osd_mclock_scheduler_client_res", - "osd_mclock_scheduler_client_wgt", - "osd_mclock_scheduler_client_lim", - "osd_mclock_scheduler_background_recovery_res", - "osd_mclock_scheduler_background_recovery_wgt", - "osd_mclock_scheduler_background_recovery_lim", - "osd_mclock_scheduler_background_best_effort_res", - "osd_mclock_scheduler_background_best_effort_wgt", - "osd_mclock_scheduler_background_best_effort_lim" - }; - - for (auto &qp : qos_params) { - if (changed.count(qp)) { - return qp; - } - } - return std::nullopt; - }; - if (auto key = get_changed_key(); key.has_value()) { - auto mclock_profile = cct->_conf.get_val("osd_mclock_profile"); - if (mclock_profile == "custom") { - client_registry.update_from_config( - conf, osd_bandwidth_capacity_per_shard); - } else { - // Attempt to change QoS parameter for a built-in profile. Restore the - // profile defaults by making one of the OSD shards remove the key from - // config monitor store. Note: monc is included in the check since the - // mock unit test currently doesn't initialize it. - if (shard_id == 0 && monc) { - static const std::vector osds = { - "osd", - "osd." + std::to_string(whoami) - }; - - for (auto osd : osds) { - std::string cmd = - "{" - "\"prefix\": \"config rm\", " - "\"who\": \"" + osd + "\", " - "\"name\": \"" + *key + "\"" - "}"; - std::vector vcmd{cmd}; - - dout(10) << __func__ << " Removing Key: " << *key - << " for " << osd << " from Mon db" << dendl; - monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr); - } - } - } - // Alternatively, the QoS parameter, if set ephemerally for this OSD via - // the 'daemon' or 'tell' interfaces must be removed. - if (!cct->_conf.rm_val(*key)) { - dout(10) << __func__ << " Restored " << *key << " to default" << dendl; - cct->_conf.apply_changes(nullptr); + for (auto &key : get_tracked_keys()) { + if (changed.count(key)) { + set_from_config(); + return; } } } diff --git a/src/common/mclock_common.h b/src/common/mclock_common.h index 0c056996338..428ffeb491a 100644 --- a/src/common/mclock_common.h +++ b/src/common/mclock_common.h @@ -180,8 +180,10 @@ class ClientRegistry { internal_client_infos.emplace_back(1, 1, 1); } } - void update_from_config(const ConfigProxy &conf, - double capacity_per_shard); + void update_from_profile( + const profile_t ¤t_profile, + const double capacity_per_shard); + const crimson::dmclock::ClientInfo *get_info( const scheduler_id_t &id) const; }; @@ -191,55 +193,32 @@ private: CephContext *cct; uint32_t num_shards; bool is_rotational; - PerfCounters *logger; + PerfCounters *logger = nullptr; int shard_id; int whoami; - double osd_bandwidth_cost_per_io; - double osd_bandwidth_capacity_per_shard; + double osd_bandwidth_cost_per_io = 0.0; + double osd_bandwidth_capacity_per_shard = 0.0; ClientRegistry& client_registry; - #ifndef WITH_CRIMSON - MonClient *monc; - #endif + + // currently active profile, will be overridden from config on startup + // and upon config change + profile_t current_profile = BALANCED; public: - #ifdef WITH_CRIMSON MclockConfig(CephContext *cct, ClientRegistry& creg, uint32_t num_shards, bool is_rotational, int shard_id, int whoami):cct(cct), num_shards(num_shards), is_rotational(is_rotational), - logger(nullptr),shard_id(shard_id), - whoami(whoami), osd_bandwidth_cost_per_io(0.0), - osd_bandwidth_capacity_per_shard(0.0), + shard_id(shard_id), + whoami(whoami), client_registry(creg) { cct->_conf.add_observer(this); - set_osd_capacity_params_from_config(); - set_config_defaults_from_profile(); - client_registry.update_from_config( - cct->_conf, get_capacity_per_shard()); - } - #else - MclockConfig(CephContext *cct, ClientRegistry& creg, - MonClient *monc, uint32_t num_shards, bool is_rotational, - int shard_id, int whoami):cct(cct), - num_shards(num_shards), - is_rotational(is_rotational), - logger(nullptr),shard_id(shard_id), - whoami(whoami), - osd_bandwidth_cost_per_io(0.0), - osd_bandwidth_capacity_per_shard(0.0), - client_registry(creg), monc(monc) - { - cct->_conf.add_observer(this); - set_osd_capacity_params_from_config(); - set_config_defaults_from_profile(); - client_registry.update_from_config( - cct->_conf, get_capacity_per_shard()); + set_from_config(); } -#endif ~MclockConfig() final; - void set_config_defaults_from_profile(); - void set_osd_capacity_params_from_config(); + + void set_from_config(); void init_logger(); void get_mclock_counter(scheduler_id_t id); void put_mclock_counter(scheduler_id_t id); diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index 7fda01f7c24..03c05279044 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -1085,6 +1085,7 @@ options: of 0 specifies the lowest possible reservation. Any value greater than 0 and up to 1.0 specifies the minimum IO proportion to reserve for each client in terms of a fraction of the OSD's maximum IOPS capacity. + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for each client (default). default: 0 @@ -1092,15 +1093,18 @@ options: max: 1.0 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_client_wgt type: uint level: advanced desc: IO share for each client (default) over reservation + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO share for each client (default) over reservation. default: 1 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_client_lim type: float level: advanced @@ -1110,6 +1114,7 @@ options: than 0 and up to 1.0 specifies the upper IO limit over reservation that each client receives in terms of a fraction of the OSD's maximum IOPS capacity. + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for each client (default) over reservation. default: 0 @@ -1117,6 +1122,7 @@ options: max: 1.0 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_background_recovery_res type: float level: advanced @@ -1125,6 +1131,7 @@ options: greater than 0 and up to 1.0 specifies the minimum IO proportion to reserve for background recovery operations in terms of a fraction of the OSD's maximum IOPS capacity. + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for background recovery (default). default: 0 @@ -1132,15 +1139,18 @@ options: max: 1.0 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_background_recovery_wgt type: uint level: advanced desc: IO share for each background recovery over reservation + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO share for each background recovery over reservation. default: 1 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_background_recovery_lim type: float level: advanced @@ -1150,6 +1160,7 @@ options: OSD. Any value greater than 0 and up to 1.0 specifies the upper IO limit over reservation that background recovery operation receives in terms of a fraction of the OSD's maximum IOPS capacity. + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for background recovery over reservation. default: 0 @@ -1157,6 +1168,7 @@ options: max: 1.0 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_background_best_effort_res type: float level: advanced @@ -1165,6 +1177,7 @@ options: greater than 0 and up to 1.0 specifies the minimum IO proportion to reserve for background best_effort operations in terms of a fraction of the OSD's maximum IOPS capacity. + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for background best_effort (default). default: 0 @@ -1172,15 +1185,18 @@ options: max: 1.0 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_background_best_effort_wgt type: uint level: advanced desc: IO share for each background best_effort over reservation + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO share for each background best_effort over reservation. default: 1 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_background_best_effort_lim type: float level: advanced @@ -1190,6 +1206,7 @@ options: OSD. Any value greater than 0 and up to 1.0 specifies the upper IO limit over reservation that background best_effort operation receives in terms of a fraction of the OSD's maximum IOPS capacity. + Ignored unless osd_mclock_profile is set to 'custom'. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for background best_effort over reservation. default: 0 @@ -1197,6 +1214,7 @@ options: max: 1.0 see_also: - osd_op_queue + - osd_mclock_profile - name: osd_mclock_scheduler_anticipation_timeout type: float level: advanced diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h index 4442ad40b7c..190b5745bb8 100644 --- a/src/osd/scheduler/mClockScheduler.h +++ b/src/osd/scheduler/mClockScheduler.h @@ -86,7 +86,7 @@ public: : cct(cct), cutoff_priority(cutoff_priority), monc(monc), - mclock_conf(cct, client_registry, monc, num_shards, + mclock_conf(cct, client_registry, num_shards, is_rotational, shard_id, whoami), scheduler( std::bind(&ClientRegistry::get_info, -- 2.39.5