.. index:: mclock; configuration
Mclock profiles mask the low level details from users, making it
-easier for them to configure mclock.
+easier for them to configure mclock.
To use mclock, you must provide the following input parameters:
profile make it possible to tune the QoS between client I/O, recovery/backfill
operations, and other background operations (for example, scrub, snap trim, and
PG deletion). These background activities are considered best-effort internal
-clients of Ceph.
+clients of Ceph.
.. index:: mclock; profile definition
The low-level mclock resource control parameters are the *reservation*,
*limit*, and *weight* that provide control of the resource shares, as
-described in the `OSD Config Reference`_.
+described in the :ref:`dmclock-qos` section.
.. index:: mclock; profile types
This profile allocates equal reservation to client ops and background
recovery ops.
-- **Custom**: This profile gives users complete control over all mclock and
- Ceph configuration parameters. Using this profile is not recommended without
+- **Custom**: This profile gives users complete control over all the mclock
+ configuration parameters. Using this profile is not recommended without
a deep understanding of mclock and related Ceph-configuration options.
.. note:: Across the built-in profiles, internal clients of mclock (for example
- "scrub", "snap trim", and "pg deletion") are given slightly lower
- reservations, but higher weight and no limit. This ensures that
- these operations are able to complete quickly if there are no other
+ "scrub", "snap trim", and "pg deletion") are given slightly lower
+ reservations, but higher weight and no limit. This ensures that
+ these operations are able to complete quickly if there are no other
competing services.
client ops or recovery ops. In order to deal with such a situation, you can
enable one of the alternate built-in profiles mentioned above.
-If a built-in profile is active, the following Ceph config sleep options will
-be disabled,
+If any mClock profile (including "custom") is active, the following Ceph config
+sleep options will be disabled:
- :confval:`osd_recovery_sleep`
- :confval:`osd_recovery_sleep_hdd`
steps use the *Ceph Benchmarking Tool* (cbt_). Regardless of the tool
used, the steps described below remain the same.
-As already described in the `OSD Config Reference`_ section, the number of
+As already described in the :ref:`dmclock-qos` section, the number of
shards and the bluestore's throttle parameters have an impact on the mclock op
queues. Therefore, it is critical to set these values carefully in order to
maximize the impact of the mclock scheduler.
.. confval:: osd_mclock_cost_per_byte_usec
.. confval:: osd_mclock_cost_per_byte_usec_hdd
.. confval:: osd_mclock_cost_per_byte_usec_ssd
-
-.. _OSD Config Reference: ../osd-config-ref#dmclock-qos
Increasing this value will slow down snap trimming.
This option overrides backend specific variants.
default: 0
+ flags:
+ - runtime
with_legacy: true
- name: osd_snap_trim_sleep_hdd
type: float
level: advanced
desc: Time in seconds to sleep before next snap trim for HDDs
default: 5
+ flags:
+ - runtime
- name: osd_snap_trim_sleep_ssd
type: float
level: advanced
fmt_desc: Time in seconds to sleep before next snap trim op
for SSD OSDs (including NVMe).
default: 0
+ flags:
+ - runtime
- name: osd_snap_trim_sleep_hybrid
type: float
level: advanced
fmt_desc: Time in seconds to sleep before next snap trim op
when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
default: 2
+ flags:
+ - runtime
- name: osd_scrub_invalid_stats
type: bool
level: advanced
fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
down the overall rate of scrubbing so that client operations will be less impacted.
default: 0
+ flags:
+ - runtime
with_legacy: true
# more sleep between [deep]scrub ops
- name: osd_scrub_extended_sleep
fmt_desc: Time in seconds to sleep before the next removal transaction. This
throttles the PG deletion process.
default: 0
+ flags:
+ - runtime
- name: osd_delete_sleep_hdd
type: float
level: advanced
desc: Time in seconds to sleep before next removal transaction for HDDs
default: 5
+ flags:
+ - runtime
- name: osd_delete_sleep_ssd
type: float
level: advanced
desc: Time in seconds to sleep before next removal transaction for SSDs
default: 1
+ flags:
+ - runtime
- name: osd_delete_sleep_hybrid
type: float
level: advanced
desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
and OSD journal or WAL+DB is on SSD
default: 1
+ flags:
+ - runtime
# what % full makes an OSD "full" (failsafe)
- name: osd_failsafe_full_ratio
type: float
this);
shards.push_back(one_shard);
}
+
+ // override some config options if mclock is enabled on all the shards
+ maybe_override_options_for_qos();
}
OSD::~OSD()
"osd_recovery_sleep_hdd",
"osd_recovery_sleep_ssd",
"osd_recovery_sleep_hybrid",
+ "osd_delete_sleep",
+ "osd_delete_sleep_hdd",
+ "osd_delete_sleep_ssd",
+ "osd_delete_sleep_hybrid",
+ "osd_snap_trim_sleep",
+ "osd_snap_trim_sleep_hdd",
+ "osd_snap_trim_sleep_ssd",
+ "osd_snap_trim_sleep_hybrid",
+ "osd_scrub_sleep",
"osd_recovery_max_active",
"osd_recovery_max_active_hdd",
"osd_recovery_max_active_ssd",
changed.count("osd_recovery_max_active") ||
changed.count("osd_recovery_max_active_hdd") ||
changed.count("osd_recovery_max_active_ssd")) {
- if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
- cct->_conf.get_val<std::string>("osd_mclock_profile") != "custom") {
- // Set ceph config option to meet QoS goals
- // Set high value for recovery max active
- uint32_t recovery_max_active = 1000;
- if (cct->_conf->osd_recovery_max_active) {
- cct->_conf.set_val(
- "osd_recovery_max_active", std::to_string(recovery_max_active));
- }
- if (store_is_rotational) {
- cct->_conf.set_val(
- "osd_recovery_max_active_hdd", std::to_string(recovery_max_active));
- } else {
- cct->_conf.set_val(
- "osd_recovery_max_active_ssd", std::to_string(recovery_max_active));
- }
- // Set high value for osd_max_backfill
- cct->_conf.set_val("osd_max_backfills", std::to_string(1000));
-
- // Disable recovery sleep
- cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
- cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
- cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
- cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
-
- // Disable delete sleep
- cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
- cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
- cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
- cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
-
- // Disable snap trim sleep
- cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
- cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
- cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
- cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
-
- // Disable scrub sleep
- cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
- } else {
+ if (!maybe_override_options_for_qos() &&
+ changed.count("osd_max_backfills")) {
+ // Scheduler is not "mclock". Fallback to earlier behavior
service.local_reserver.set_max(cct->_conf->osd_max_backfills);
service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
}
}
}
+bool OSD::maybe_override_options_for_qos()
+{
+ // If the scheduler enabled is mclock, override the recovery, backfill
+ // and sleep options so that mclock can meet the QoS goals.
+ if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") {
+ dout(1) << __func__
+ << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
+
+ // Set high value for recovery max active
+ uint32_t rec_max_active = 1000;
+ cct->_conf.set_val(
+ "osd_recovery_max_active", std::to_string(rec_max_active));
+ cct->_conf.set_val(
+ "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
+ cct->_conf.set_val(
+ "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
+
+ // Set high value for osd_max_backfill
+ uint32_t max_backfills = 1000;
+ cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
+ service.local_reserver.set_max(max_backfills);
+ service.remote_reserver.set_max(max_backfills);
+
+ // Disable recovery sleep
+ cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
+
+ // Disable delete sleep
+ cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
+
+ // Disable snap trim sleep
+ cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
+
+ // Disable scrub sleep
+ cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
+ return true;
+ }
+ return false;
+}
+
void OSD::update_log_config()
{
map<string,string> log_to_monitors;