common/options: extract osd and mgr settings out

author Kefu Chai <kchai@redhat.com>

Fri, 23 Apr 2021 09:32:00 +0000 (17:32 +0800)

committer Kefu Chai <kchai@redhat.com>

Fri, 14 May 2021 11:18:47 +0000 (19:18 +0800)
author Kefu Chai <kchai@redhat.com>
Fri, 23 Apr 2021 09:32:00 +0000 (17:32 +0800)
committer Kefu Chai <kchai@redhat.com>
Fri, 14 May 2021 11:18:47 +0000 (19:18 +0800)
diff --git a/src/common/options/CMakeLists.txt b/src/common/options/CMakeLists.txt

index 6424271e381217385718d3e4b3ce93dd98a0988f..8705c31fdc836f04f253576911bc9ce45105c96f 100644 (file)
--- a/src/common/options/CMakeLists.txt
+++ b/src/common/options/CMakeLists.txt
@@ -82,8 +82,10 @@ endif()
  add_options(global)
  add_options(cephfs-mirror)
  add_options(crimson)
+add_options(mgr)
  add_options(mds)
  add_options(mds-client)
+add_options(osd)
  add_options(rbd)
  add_options(rbd-mirror)
  add_options(immutable-object-cache)
diff --git a/src/common/options/build_options.cc b/src/common/options/build_options.cc

index cf3f5ef6c92de58234fd4a1c84f5d93c93583ff6..ec3a1f9ecda8932f9d8537497c377984a2389f77 100644 (file)
--- a/src/common/options/build_options.cc
+++ b/src/common/options/build_options.cc
@@ -4,7 +4,9 @@
  #include "build_options.h"
  
  std::vector<Option> get_global_options();
+std::vector<Option> get_mgr_options();
  std::vector<Option> get_crimson_options();
+std::vector<Option> get_osd_options();
  std::vector<Option> get_rgw_options();
  std::vector<Option> get_rbd_options();
  std::vector<Option> get_rbd_mirror_options();
@@ -25,6 +27,8 @@ std::vector<Option> build_options()
    };
  
    ingest(get_crimson_options(), "osd");
+  ingest(get_mgr_options(), "mgr");
+  ingest(get_osd_options(), "osd");
    ingest(get_rgw_options(), "rgw");
    ingest(get_rbd_options(), "rbd");
    ingest(get_rbd_mirror_options(), "rbd-mirror");
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in

index fe2bdf96540644545958eefa9dc719b499bdd3d9..ec173f0aac2b3e335a3b2dfb9afd3db2e216cf58 100644 (file)
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -3671,239 +3671,6 @@ options:
    default: 100
    flags:
    - runtime
-- name: osd_numa_prefer_iface
-  type: bool
-  level: advanced
-  desc: prefer IP on network interface on same numa node as storage
-  default: true
-  see_also:
-  - osd_numa_auto_affinity
-  flags:
-  - startup
-- name: osd_numa_auto_affinity
-  type: bool
-  level: advanced
-  desc: automatically set affinity to numa node when storage and network match
-  default: true
-  flags:
-  - startup
-- name: osd_numa_node
-  type: int
-  level: advanced
-  desc: set affinity to a numa node (-1 for none)
-  default: -1
-  see_also:
-  - osd_numa_auto_affinity
-  flags:
-  - startup
-- name: osd_smart_report_timeout
-  type: uint
-  level: advanced
-  desc: Timeout (in seconds) for smarctl to run, default is set to 5
-  default: 5
-# verify backend can support configured max object name length
-- name: osd_check_max_object_name_len_on_startup
-  type: bool
-  level: dev
-  default: true
-  with_legacy: true
-- name: osd_max_backfills
-  type: uint
-  level: advanced
-  desc: Maximum number of concurrent local and remote backfills or recoveries per
-    OSD
-  long_desc: There can be osd_max_backfills local reservations AND the same remote
-    reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary
-    in recovery and 1 shard of another recovering PG.
-  fmt_desc: The maximum number of backfills allowed to or from a single OSD.
-    Note that this is applied separately for read and write operations.
-  default: 1
-  flags:
-  - runtime
-  with_legacy: true
-# Minimum recovery priority (255 = max, smaller = lower)
-- name: osd_min_recovery_priority
-  type: int
-  level: advanced
-  desc: Minimum priority below which recovery is not performed
-  long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
-    work (e.g., rebalancing) below this threshold and focus solely on higher priority
-    work (e.g., replicating degraded objects).
-  default: 0
-  with_legacy: true
-- name: osd_backfill_retry_interval
-  type: float
-  level: advanced
-  desc: how frequently to retry backfill reservations after being denied (e.g., due
-    to a full OSD)
-  fmt_desc: The number of seconds to wait before retrying backfill requests.
-  default: 30
-  with_legacy: true
-- name: osd_recovery_retry_interval
-  type: float
-  level: advanced
-  desc: how frequently to retry recovery reservations after being denied (e.g., due
-    to a full OSD)
-  default: 30
-  with_legacy: true
-- name: osd_agent_max_ops
-  type: int
-  level: advanced
-  desc: maximum concurrent tiering operations for tiering agent
-  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
-    in the high speed mode.
-  default: 4
-  with_legacy: true
-- name: osd_agent_max_low_ops
-  type: int
-  level: advanced
-  desc: maximum concurrent low-priority tiering operations for tiering agent
-  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
-    in the low speed mode.
-  default: 2
-  with_legacy: true
-- name: osd_agent_min_evict_effort
-  type: float
-  level: advanced
-  desc: minimum effort to expend evicting clean objects
-  default: 0.1
-  min: 0
-  max: 0.99
-  with_legacy: true
-- name: osd_agent_quantize_effort
-  type: float
-  level: advanced
-  desc: size of quantize unit for eviction effort
-  default: 0.1
-  with_legacy: true
-- name: osd_agent_delay_time
-  type: float
-  level: advanced
-  desc: how long agent should sleep if it has no work to do
-  default: 5
-  with_legacy: true
-- name: osd_find_best_info_ignore_history_les
-  type: bool
-  level: dev
-  desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
-  long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
-    DIRECTION OF A DEVELOPER.  It makes peering ignore the last_epoch_started value
-    when peering, which can allow the OSD to believe an OSD has an authoritative view
-    of a PG's contents even when it is in fact old and stale, typically leading to
-    data loss (by believing a stale PG is up to date).
-  default: false
-  with_legacy: true
-# decay atime and hist histograms after how many objects go by
-- name: osd_agent_hist_halflife
-  type: int
-  level: advanced
-  desc: halflife of agent atime and temp histograms
-  default: 1000
-  with_legacy: true
-# decay atime and hist histograms after how many objects go by
-- name: osd_agent_slop
-  type: float
-  level: advanced
-  desc: slop factor to avoid switching tiering flush and eviction mode
-  default: 0.02
-  with_legacy: true
-- name: osd_uuid
-  type: uuid
-  level: advanced
-  desc: uuid label for a new OSD
-  fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
-  note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
-    applies to the entire cluster.
-  flags:
-  - create
-  with_legacy: true
-- name: osd_data
-  type: str
-  level: advanced
-  desc: path to OSD data
-  fmt_desc: The path to the OSDs data. You must create the directory when
-    deploying Ceph. You should mount a drive for OSD data at this
-    mount point. We do not recommend changing the default.
-  default: /var/lib/ceph/osd/$cluster-$id
-  flags:
-  - no_mon_update
-  with_legacy: true
-- name: osd_journal
-  type: str
-  level: advanced
-  desc: path to OSD journal (when FileStore backend is in use)
-  fmt_desc: The path to the OSD's journal. This may be a path to a file or a
-    block device (such as a partition of an SSD). If it is a file,
-    you must create the directory to contain it. We recommend using a
-    separate fast device when the ``osd_data`` drive is an HDD.
-  default: /var/lib/ceph/osd/$cluster-$id/journal
-  flags:
-  - no_mon_update
-  with_legacy: true
-- name: osd_journal_size
-  type: size
-  level: advanced
-  desc: size of FileStore journal (in MiB)
-  fmt_desc: The size of the journal in megabytes.
-  default: 5_K
-  flags:
-  - create
-  with_legacy: true
-- name: osd_journal_flush_on_shutdown
-  type: bool
-  level: advanced
-  desc: flush FileStore journal contents during clean OSD shutdown
-  default: true
-  with_legacy: true
-- name: osd_compact_on_start
-  type: bool
-  level: advanced
-  desc: compact OSD's object store's OMAP on start
-  default: false
-# flags for specific control purpose during osd mount() process.
-# e.g., can be 1 to skip over replaying journal
-# or 2 to skip over mounting omap or 3 to skip over both.
-# This might be helpful in case the journal is totally corrupted
-# and we still want to bring the osd daemon back normally, etc.
-- name: osd_os_flags
-  type: uint
-  level: dev
-  desc: flags to skip filestore omap or journal initialization
-  default: 0
-- name: osd_max_write_size
-  type: size
-  level: advanced
-  desc: Maximum size of a RADOS write operation in megabytes
-  long_desc: This setting prevents clients from doing very large writes to RADOS.  If
-    you set this to a value below what clients expect, they will receive an error
-    when attempting to write to the cluster.
-  fmt_desc: The maximum size of a write in megabytes.
-  default: 90
-  min: 4
-  with_legacy: true
-- name: osd_max_pgls
-  type: uint
-  level: advanced
-  desc: maximum number of results when listing objects in a pool
-  fmt_desc: The maximum number of placement groups to list. A client
-    requesting a large number can tie up the Ceph OSD Daemon.
-  default: 1_K
-  with_legacy: true
-- name: osd_client_message_size_cap
-  type: size
-  level: advanced
-  desc: maximum memory to devote to in-flight client requests
-  long_desc: If this value is exceeded, the OSD will not read any new client data
-    off of the network until memory is freed.
-  fmt_desc: The largest client data message allowed in memory.
-  default: 500_M
-  with_legacy: true
-- name: osd_client_message_cap
-  type: uint
-  level: advanced
-  desc: maximum number of in-flight client requests
-  default: 0
-  with_legacy: true
  - name: osd_crush_update_weight_set
    type: bool
    level: advanced
@@ -3934,29 +3701,6 @@ options:
    long_desc: This setting only exists for compatibility with hammer (and older) clusters.
    default: true
    with_legacy: true
-- name: osd_crush_update_on_start
-  type: bool
-  level: advanced
-  desc: update OSD CRUSH location on startup
-  default: true
-  with_legacy: true
-- name: osd_class_update_on_start
-  type: bool
-  level: advanced
-  desc: set OSD device class on startup
-  default: true
-  with_legacy: true
-- name: osd_crush_initial_weight
-  type: float
-  level: advanced
-  desc: if >= 0, initial CRUSH weight for newly created OSDs
-  long_desc: If this value is negative, the size of the OSD in TiB is used.
-  fmt_desc: The initial CRUSH weight for newly added OSDs. The default
-    value of this option is ``the size of a newly added OSD in TB``. By default,
-    the initial CRUSH weight for a newly added OSD is set to its device size in
-    TB. See `Weighting Bucket Items`_ for details.
-  default: -1
-  with_legacy: true
  # whether turn on fast read on the pool or not
  - name: osd_pool_default_ec_fast_read
    type: bool
@@ -4086,15 +3830,6 @@ options:
    flags:
    - startup
    with_legacy: true
-# Allows the "peered" state for recovery and backfill below min_size
-- name: osd_allow_recovery_below_min_size
-  type: bool
-  level: dev
-  desc: allow replicated pools to recover with < min_size active members
-  default: true
-  services:
-  - osd
-  with_legacy: true
  - name: osd_pool_default_flags
    type: int
    level: dev
@@ -4300,1100 +4035,153 @@ options:
    default: true
    fmt_desc: Enable removing duplicates in the OSD map.
    with_legacy: true
-- name: osd_map_cache_size
-  type: int
-  level: advanced
-  default: 50
-  fmt_desc: The number of OSD maps to keep cached.
-  with_legacy: true
  - name: osd_map_message_max
    type: int
    level: advanced
    desc: maximum number of OSDMaps to include in a single message
    fmt_desc: The maximum map entries allowed per MOSDMap message.
    default: 40
+  services:
+  - osd
+  - mon
    with_legacy: true
-- name: osd_map_message_max_bytes
-  type: size
-  level: advanced
-  desc: maximum number of bytes worth of OSDMaps to include in a single message
-  default: 10_M
-  with_legacy: true
-# cap on # of inc maps we send to peers, clients
-- name: osd_map_share_max_epochs
-  type: int
-  level: advanced
-  default: 40
-  with_legacy: true
-- name: osd_pg_epoch_max_lag_factor
-  type: float
-  level: advanced
-  desc: Max multiple of the map cache that PGs can lag before we throttle map injest
-  default: 2
-  see_also:
-  - osd_map_cache_size
-- name: osd_inject_bad_map_crc_probability
-  type: float
-  level: dev
-  default: 0
-  with_legacy: true
-- name: osd_inject_failure_on_pg_removal
-  type: bool
-  level: dev
-  default: false
-  with_legacy: true
-# shutdown the OSD if stuatus flipping more than max_markdown_count times in recent max_markdown_period seconds
-- name: osd_max_markdown_period
-  type: int
-  level: advanced
-  default: 10_min
-  with_legacy: true
-- name: osd_max_markdown_count
-  type: int
-  level: advanced
-  default: 5
-  with_legacy: true
-- name: osd_op_pq_max_tokens_per_priority
-  type: uint
-  level: advanced
-  default: 4_M
-  with_legacy: true
-- name: osd_op_pq_min_cost
-  type: size
-  level: advanced
-  default: 64_K
-  with_legacy: true
-# preserve clone_overlap during recovery/migration
-- name: osd_recover_clone_overlap
-  type: bool
-  level: advanced
-  default: true
-  fmt_desc: Preserves clone overlap during recovery. Should always be set
-    to ``true``.
-  with_legacy: true
-- name: osd_num_cache_shards
-  type: size
-  level: advanced
-  desc: The number of cache shards to use in the object store.
-  default: 32
-  flags:
-  - startup
-- name: osd_op_num_threads_per_shard
-  type: int
-  level: advanced
-  default: 0
-  flags:
-  - startup
-  with_legacy: true
-- name: osd_op_num_threads_per_shard_hdd
-  type: int
-  level: advanced
-  default: 1
-  see_also:
-  - osd_op_num_threads_per_shard
-  flags:
-  - startup
-  with_legacy: true
-- name: osd_op_num_threads_per_shard_ssd
-  type: int
-  level: advanced
-  default: 2
-  see_also:
-  - osd_op_num_threads_per_shard
-  flags:
-  - startup
-  with_legacy: true
-- name: osd_op_num_shards
-  type: int
-  level: advanced
-  fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
-    PGs on the OSD are distributed evenly in the shard. This setting overrides _ssd and _hdd if
-    non-zero.
-  default: 0
-  flags:
-  - startup
-  with_legacy: true
-- name: osd_op_num_shards_hdd
-  type: int
-  level: advanced
-  fmt_desc: the number of shards allocated for a given OSD (for rotational media).
-  default: 5
-  see_also:
-  - osd_op_num_shards
-  flags:
-  - startup
-  with_legacy: true
-- name: osd_op_num_shards_ssd
-  type: int
-  level: advanced
-  fmt_desc: the number of shards allocated for a given OSD (for solid state media).
-  default: 8
-  see_also:
-  - osd_op_num_shards
-  flags:
-  - startup
-  with_legacy: true
-- name: osd_skip_data_digest
-  type: bool
-  level: dev
-  desc: Do not store full-object checksums if the backend (bluestore) does its own
-    checksums.  Only usable with all BlueStore OSDs.
-  default: false
-# PrioritzedQueue (prio), Weighted Priority Queue (wpq ; default),
-# mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
-# and "mclock_client" are based on the mClock/dmClock algorithm
-# (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
-# class the operation belongs to. "mclock_client" does the same but
-# also works to ienforce fairness between clients. "debug_random"
-# chooses among all four with equal probability.
-- name: osd_op_queue
-  type: str
-  level: advanced
-  desc: which operation priority queue algorithm to use
-  long_desc: which operation priority queue algorithm to use
-  fmt_desc: This sets the type of queue to be used for prioritizing ops
-    within each OSD. Both queues feature a strict sub-queue which is
-    dequeued before the normal queue. The normal queue is different
-    between implementations. The WeightedPriorityQueue (``wpq``)
-    dequeues operations in relation to their priorities to prevent
-    starvation of any queue. WPQ should help in cases where a few OSDs
-    are more overloaded than others. The mClockQueue
-    (``mclock_scheduler``) prioritizes operations based on which class
-    they belong to (recovery, scrub, snaptrim, client op, osd subop).
-    See `QoS Based on mClock`_. Requires a restart.
-  default: mclock_scheduler
-  see_also:
-  - osd_op_queue_cut_off
-  enum_values:
-  - wpq
-  - mclock_scheduler
-  - debug_random
-  with_legacy: true
-# Min priority to go to strict queue. (low, high)
-- name: osd_op_queue_cut_off
-  type: str
-  level: advanced
-  desc: the threshold between high priority ops and low priority ops
-  long_desc: the threshold between high priority ops that use strict priority ordering
-    and low priority ops that use a fairness algorithm that may or may not incorporate
-    priority
-  fmt_desc: This selects which priority ops will be sent to the strict
-    queue verses the normal queue. The ``low`` setting sends all
-    replication ops and higher to the strict queue, while the ``high``
-    option sends only replication acknowledgment ops and higher to
-    the strict queue. Setting this to ``high`` should help when a few
-    OSDs in the cluster are very busy especially when combined with
-    ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
-    handling replication traffic could starve primary client traffic
-    on these OSDs without these settings. Requires a restart.
-  default: high
-  see_also:
-  - osd_op_queue
-  enum_values:
-  - low
-  - high
-  - debug_random
-  with_legacy: true
-- name: osd_mclock_scheduler_client_res
-  type: uint
-  level: advanced
-  desc: IO proportion reserved for each client (default)
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO proportion reserved for each client (default).
-  default: 1
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_client_wgt
-  type: uint
-  level: advanced
-  desc: IO share for each client (default) over reservation
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO share for each client (default) over reservation.
-  default: 1
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_client_lim
-  type: uint
-  level: advanced
-  desc: IO limit for each client (default) over reservation
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO limit for each client (default) over reservation.
-  default: 999999
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_background_recovery_res
-  type: uint
-  level: advanced
-  desc: IO proportion reserved for background recovery (default)
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO proportion reserved for background recovery (default).
-  default: 1
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_background_recovery_wgt
-  type: uint
-  level: advanced
-  desc: IO share for each background recovery over reservation
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO share for each background recovery over reservation.
-  default: 1
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_background_recovery_lim
-  type: uint
-  level: advanced
-  desc: IO limit for background recovery over reservation
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO limit for background recovery over reservation.
-  default: 999999
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_background_best_effort_res
-  type: uint
-  level: advanced
-  desc: IO proportion reserved for background best_effort (default)
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO proportion reserved for background best_effort (default).
-  default: 1
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_background_best_effort_wgt
-  type: uint
-  level: advanced
-  desc: IO share for each background best_effort over reservation
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO share for each background best_effort over reservation.
-  default: 1
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_background_best_effort_lim
-  type: uint
-  level: advanced
-  desc: IO limit for background best_effort over reservation
-  long_desc: Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: IO limit for background best_effort over reservation.
-  default: 999999
-  see_also:
-  - osd_op_queue
-- name: osd_mclock_scheduler_anticipation_timeout
-  type: float
-  level: advanced
-  desc: mclock anticipation timeout in seconds
-  long_desc: the amount of time that mclock waits until the unused resource is forfeited
-  default: 0
-- name: osd_mclock_cost_per_io_usec
-  type: float
-  level: dev
-  desc: Cost per IO in microseconds to consider per OSD (overrides _ssd and _hdd if
-    non-zero)
-  long_desc: This option specifies the cost factor to consider in usec per OSD. This
-    is considered by the mclock scheduler to set an additional cost factor in QoS
-    calculations. Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: Cost per IO in microseconds to consider per OSD (overrides _ssd
-    and _hdd if non-zero)
-  default: 0
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_io_usec_hdd
-  type: float
-  level: dev
-  desc: Cost per IO in microseconds to consider per OSD (for rotational media)
-  long_desc: This option specifies the cost factor to consider in usec per OSD for
-    rotational device type. This is considered by the mclock_scheduler to set an additional
-    cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: Cost per IO in microseconds to consider per OSD (for rotational
-    media)
-  default: 25000
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_io_usec_ssd
-  type: float
-  level: dev
-  desc: Cost per IO in microseconds to consider per OSD (for solid state media)
-  long_desc: This option specifies the cost factor to consider in usec per OSD for
-    solid state device type. This is considered by the mclock_scheduler to set an
-    additional cost factor in QoS calculations. Only considered for osd_op_queue =
-    mclock_scheduler
-  fmt_desc: Cost per IO in microseconds to consider per OSD (for solid state
-    media)
-  default: 50
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_byte_usec
-  type: float
-  level: dev
-  desc: Cost per byte in microseconds to consider per OSD (overrides _ssd and _hdd
-    if non-zero)
-  long_desc: This option specifies the cost per byte to consider in microseconds per
-    OSD. This is considered by the mclock scheduler to set an additional cost factor
-    in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: Cost per byte in microseconds to consider per OSD (overrides _ssd
-    and _hdd if non-zero)
-  default: 0
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_byte_usec_hdd
-  type: float
-  level: dev
-  desc: Cost per byte in microseconds to consider per OSD (for rotational media)
-  long_desc: This option specifies the cost per byte to consider in microseconds per
-    OSD for rotational device type. This is considered by the mclock_scheduler to
-    set an additional cost factor in QoS calculations. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Cost per byte in microseconds to consider per OSD (for rotational
-    media)
-  default: 5.2
-  flags:
-  - runtime
-- name: osd_mclock_cost_per_byte_usec_ssd
-  type: float
-  level: dev
-  desc: Cost per byte in microseconds to consider per OSD (for solid state media)
-  long_desc: This option specifies the cost per byte to consider in microseconds per
-    OSD for solid state device type. This is considered by the mclock_scheduler to
-    set an additional cost factor in QoS calculations. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Cost per byte in microseconds to consider per OSD (for solid state
-    media)
-  default: 0.011
-  flags:
-  - runtime
-- name: osd_mclock_max_capacity_iops
-  type: float
-  level: basic
-  desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (overrides _ssd
-    and _hdd if non-zero)
-  long_desc: This option specifies the max osd capacity in iops per OSD. Helps in
-    QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD
-    (overrides _ssd and _hdd if non-zero)
-  default: 0
-  flags:
-  - runtime
-- name: osd_mclock_max_capacity_iops_hdd
-  type: float
-  level: basic
-  desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for rotational
-    media)
-  long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
-    QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
-    rotational media)
-  default: 315
-  flags:
-  - runtime
-- name: osd_mclock_max_capacity_iops_ssd
-  type: float
-  level: basic
-  desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for solid state
-    media)
-  long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
-    QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
-    = mclock_scheduler
-  fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
-    solid state media)
-  default: 21500
-  flags:
-  - runtime
-- name: osd_mclock_profile
-  type: str
-  level: advanced
-  desc: Which mclock profile to use
-  long_desc: This option specifies the mclock profile to enable - one among the set
-    of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
-  fmt_desc: |
-    This sets the type of mclock profile to use for providing QoS
-    based on operations belonging to different classes (background
-    recovery, scrub, snaptrim, client op, osd subop). Once a built-in
-    profile is enabled, the lower level mclock resource control
-    parameters [*reservation, weight, limit*] and some Ceph
-    configuration parameters are set transparently. Note that the
-    above does not apply for the *custom* profile.
-  default: high_client_ops
-  see_also:
-  - osd_op_queue
-  enum_values:
-  - balanced
-  - high_recovery_ops
-  - high_client_ops
-  - custom
-  flags:
-  - runtime
-# do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
-- name: osd_ignore_stale_divergent_priors
-  type: bool
-  level: advanced
-  default: false
-  with_legacy: true
-# Set to true for testing.  Users should NOT set this.
-# If set to true even after reading enough shards to
-# decode the object, any error will be reported.
-- name: osd_read_ec_check_for_errors
-  type: bool
-  level: advanced
-  default: false
-  with_legacy: true
-# Only use clone_overlap for recovery if there are fewer than
-# osd_recover_clone_overlap_limit entries in the overlap set
-- name: osd_recover_clone_overlap_limit
-  type: uint
-  level: advanced
-  default: 10
-  flags:
-  - runtime
-- name: osd_debug_feed_pullee
-  type: int
-  level: dev
-  desc: Feed a pullee, and force primary to pull a currently missing object from it
-  default: -1
-  with_legacy: true
-- name: osd_backfill_scan_min
-  type: int
-  level: advanced
-  default: 64
-  fmt_desc: The minimum number of objects per backfill scan.
-  with_legacy: true
-- name: osd_backfill_scan_max
-  type: int
-  level: advanced
-  default: 512
-  fmt_desc: The maximum number of objects per backfill scan.p
-  with_legacy: true
-- name: osd_op_thread_timeout
-  type: int
-  level: advanced
-  default: 15
-  fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
-  with_legacy: true
-- name: osd_op_thread_suicide_timeout
-  type: int
-  level: advanced
-  default: 150
-  with_legacy: true
-- name: osd_recovery_sleep
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next recovery or backfill op
-  fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
-    Increasing this value will slow down recovery operation while
-    client operations will be less impacted.
-  default: 0
-  flags:
-  - runtime
-  with_legacy: true
-- name: osd_recovery_sleep_hdd
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next recovery or backfill op for HDDs
-  fmt_desc: Time in seconds to sleep before next recovery or backfill op
-    for HDDs.
-  default: 0.1
-  flags:
-  - runtime
-  with_legacy: true
-- name: osd_recovery_sleep_ssd
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next recovery or backfill op for SSDs
-  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
-    for SSDs.
-  default: 0
-  see_also:
-  - osd_recovery_sleep
-  flags:
-  - runtime
-  with_legacy: true
-- name: osd_recovery_sleep_hybrid
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next recovery or backfill op when data is
-    on HDD and journal is on SSD
-  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
-    when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
-  default: 0.025
-  see_also:
-  - osd_recovery_sleep
-  flags:
-  - runtime
-- name: osd_snap_trim_sleep
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next snap trim (overrides values below)
-  fmt_desc: Time in seconds to sleep before next snap trim op.
-    Increasing this value will slow down snap trimming.
-    This option overrides backend specific variants.
-  default: 0
-  flags:
-  - runtime
-  with_legacy: true
-- name: osd_snap_trim_sleep_hdd
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next snap trim for HDDs
-  default: 5
-  flags:
-  - runtime
-- name: osd_snap_trim_sleep_ssd
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next snap trim for SSDs
-  fmt_desc: Time in seconds to sleep before next snap trim op
-    for SSD OSDs (including NVMe).
-  default: 0
-  flags:
-  - runtime
-- name: osd_snap_trim_sleep_hybrid
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
-    is on SSD
-  fmt_desc: Time in seconds to sleep before next snap trim op
-    when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
-  default: 2
-  flags:
-  - runtime
-- name: osd_scrub_invalid_stats
-  type: bool
-  level: advanced
-  default: true
-  with_legacy: true
-- name: osd_heartbeat_interval
-  type: int
-  level: dev
-  desc: Interval (in seconds) between peer pings
-  fmt_desc: How often an Ceph OSD Daemon pings its peers (in seconds).
-  default: 6
-  min: 1
-  max: 1_min
-  with_legacy: true
-# (seconds) how long before we decide a peer has failed
-# This setting is read by the MONs and OSDs and has to be set to a equal value in both settings of the configuration
-- name: osd_heartbeat_grace
-  type: int
-  level: advanced
-  default: 20
-  fmt_desc: The elapsed time when a Ceph OSD Daemon hasn't shown a heartbeat
-              that the Ceph Storage Cluster considers it ``down``.
-              This setting must be set in both the [mon] and [osd] or [global]
-              sections so that it is read by both monitor and OSD daemons.
-  with_legacy: true
-- name: osd_heartbeat_stale
-  type: int
-  level: advanced
-  desc: Interval (in seconds) we mark an unresponsive heartbeat peer as stale.
-  long_desc: Automatically mark unresponsive heartbeat sessions as stale and tear
-    them down. The primary benefit is that OSD doesn't need to keep a flood of blocked
-    heartbeat messages around in memory.
-  default: 10_min
-# minimum number of peers
-- name: osd_heartbeat_min_peers
-  type: int
-  level: advanced
-  default: 10
-  with_legacy: true
-# prio the heartbeat tcp socket and set dscp as CS6 on it if true
-- name: osd_heartbeat_use_min_delay_socket
-  type: bool
-  level: advanced
-  default: false
-  with_legacy: true
-# the minimum size of OSD heartbeat messages to send
-- name: osd_heartbeat_min_size
-  type: size
-  level: advanced
-  desc: Minimum heartbeat packet size in bytes. Will add dummy payload if heartbeat
-    packet is smaller than this.
-  default: 2000
-  with_legacy: true
-# max number of parallel snap trims/pg
-- name: osd_pg_max_concurrent_snap_trims
-  type: uint
-  level: advanced
-  default: 2
-  with_legacy: true
-# max number of trimming pgs
-- name: osd_max_trimming_pgs
-  type: uint
-  level: advanced
-  default: 2
-  with_legacy: true
-# minimum number of peers that must be reachable to mark ourselves
-# back up after being wrongly marked down.
-- name: osd_heartbeat_min_healthy_ratio
-  type: float
-  level: advanced
-  default: 0.33
-  with_legacy: true
-# (seconds) how often to ping monitor if no peers
-- name: osd_mon_heartbeat_interval
-  type: int
-  level: advanced
-  default: 30
-  fmt_desc: How often the Ceph OSD Daemon pings a Ceph Monitor if it has no
-              Ceph OSD Daemon peers.
-  with_legacy: true
-- name: osd_mon_heartbeat_stat_stale
-  type: int
-  level: advanced
-  desc: Stop reporting on heartbeat ping times not updated for this many seconds.
-  long_desc: Stop reporting on old heartbeat information unless this is set to zero
-  fmt_desc: Stop reporting on heartbeat ping times which haven't been updated for
-              this many seconds.  Set to zero to disable this action.
-  default: 1_hr
-# failures, up_thru, boot.
-- name: osd_mon_report_interval
-  type: int
-  level: advanced
-  desc: Frequency of OSD reports to mon for peer failures, fullness status changes
-  fmt_desc: The number of seconds a Ceph OSD Daemon may wait
-              from startup or another reportable event before reporting
-              to a Ceph Monitor.
-  default: 5
-  with_legacy: true
-# max updates in flight
-- name: osd_mon_report_max_in_flight
-  type: int
-  level: advanced
-  default: 2
-  with_legacy: true
-# (second) how often to send beacon message to monitor
-- name: osd_beacon_report_interval
-  type: int
-  level: advanced
-  default: 5_min
-  with_legacy: true
-# report pg stats for any given pg at least this often
-- name: osd_pg_stat_report_interval_max
-  type: int
-  level: advanced
-  default: 500
-  with_legacy: true
-# Max number of snap intervals to report to mgr in pg_stat_t
-- name: osd_max_snap_prune_intervals_per_epoch
-  type: uint
-  level: dev
-  desc: Max number of snap intervals to report to mgr in pg_stat_t
-  default: 512
-  with_legacy: true
-- name: osd_default_data_pool_replay_window
-  type: int
-  level: advanced
-  default: 45
-  fmt_desc: The time (in seconds) for an OSD to wait for a client to replay
-    a request.
-- name: osd_auto_mark_unfound_lost
-  type: bool
-  level: advanced
-  default: false
-  with_legacy: true
-- name: osd_recovery_delay_start
-  type: float
-  level: advanced
-  default: 0
-  fmt_desc: After peering completes, Ceph will delay for the specified number
-    of seconds before starting to recover RADOS objects.
-  with_legacy: true
-- name: osd_recovery_max_active
-  type: uint
-  level: advanced
-  desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
-    and _hdd if non-zero)
-  fmt_desc: The number of active recovery requests per OSD at one time. More
-    requests will accelerate recovery, but the requests places an
-    increased load on the cluster.
-  note: This value is only used if it is non-zero. Normally it
-    is ``0``, which means that the ``hdd`` or ``ssd`` values
-    (below) are used, depending on the type of the primary
-    device backing the OSD.
-  default: 0
-  see_also:
-  - osd_recovery_max_active_hdd
-  - osd_recovery_max_active_ssd
-  flags:
-  - runtime
-  with_legacy: true
-- name: osd_recovery_max_active_hdd
-  type: uint
-  level: advanced
-  desc: Number of simultaneous active recovery operations per OSD (for rotational
-    devices)
-  fmt_desc: The number of active recovery requests per OSD at one time, if the
-    primary device is rotational.
-  default: 3
-  see_also:
-  - osd_recovery_max_active
-  - osd_recovery_max_active_ssd
-  flags:
-  - runtime
-  with_legacy: true
-- name: osd_recovery_max_active_ssd
-  type: uint
-  level: advanced
-  desc: Number of simultaneous active recovery operations per OSD (for non-rotational
-    solid state devices)
-  fmt_desc: The number of active recovery requests per OSD at one time, if the
-    primary device is non-rotational (i.e., an SSD).
-  default: 10
-  see_also:
-  - osd_recovery_max_active
-  - osd_recovery_max_active_hdd
-  flags:
-  - runtime
-  with_legacy: true
-- name: osd_recovery_max_single_start
-  type: uint
-  level: advanced
-  default: 1
-  fmt_desc: The maximum number of recovery operations per OSD that will be
-    newly started when an OSD is recovering.
-  with_legacy: true
-# max size of push chunk
-- name: osd_recovery_max_chunk
-  type: size
-  level: advanced
-  default: 8_M
-  fmt_desc: the maximum total size of data chunks a recovery op can carry.
-  with_legacy: true
-# max number of omap entries per chunk; 0 to disable limit
-- name: osd_recovery_max_omap_entries_per_chunk
-  type: uint
-  level: advanced
-  default: 8096
-  with_legacy: true
-# max size of a COPYFROM chunk
-- name: osd_copyfrom_max_chunk
-  type: size
-  level: advanced
-  default: 8_M
-  with_legacy: true
-# push cost per object
-- name: osd_push_per_object_cost
-  type: size
-  level: advanced
-  default: 1000
-  fmt_desc: the overhead for serving a push op
-  with_legacy: true
-# max size of push message
-- name: osd_max_push_cost
-  type: size
-  level: advanced
-  default: 8_M
-  with_legacy: true
-# max objects in single push op
-- name: osd_max_push_objects
-  type: uint
-  level: advanced
-  default: 10
-  with_legacy: true
-- name: osd_max_scrubs
-  type: int
-  level: advanced
-  desc: Maximum concurrent scrubs on a single OSD
-  fmt_desc: The maximum number of simultaneous scrub operations for
-    a Ceph OSD Daemon.
-  default: 1
-  with_legacy: true
-- name: osd_scrub_during_recovery
-  type: bool
-  level: advanced
-  desc: Allow scrubbing when PGs on the OSD are undergoing recovery
-  fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
-    scheduling new scrub (and deep--scrub) while there is active recovery.
-    Already running scrubs will be continued. This might be useful to reduce
-    load on busy clusters.
-  default: false
-  with_legacy: true
-- name: osd_repair_during_recovery
-  type: bool
-  level: advanced
-  desc: Allow requested repairing when PGs on the OSD are undergoing recovery
-  default: false
-  with_legacy: true
-- name: osd_scrub_begin_hour
-  type: int
-  level: advanced
-  desc: Restrict scrubbing to this hour of the day or later
-  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
-  fmt_desc: This restricts scrubbing to this hour of the day or later.
-    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
-    to allow scrubbing the entire day.  Along with ``osd_scrub_end_hour``, they define a time
-    window, in which the scrubs can happen.
-    But a scrub will be performed
-    no matter whether the time window allows or not, as long as the placement
-    group's scrub interval exceeds ``osd_scrub_max_interval``.
-  default: 0
-  see_also:
-  - osd_scrub_end_hour
-  min: 0
-  max: 23
-  with_legacy: true
-- name: osd_scrub_end_hour
-  type: int
-  level: advanced
-  desc: Restrict scrubbing to hours of the day earlier than this
-  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
-  fmt_desc: This restricts scrubbing to the hour earlier than this.
-    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
-    for the entire day.  Along with ``osd_scrub_begin_hour``, they define a time
-    window, in which the scrubs can happen. But a scrub will be performed
-    no matter whether the time window allows or not, as long as the placement
-    group's scrub interval exceeds ``osd_scrub_max_interval``.
-  default: 0
-  see_also:
-  - osd_scrub_begin_hour
-  min: 0
-  max: 23
-  with_legacy: true
-- name: osd_scrub_begin_week_day
-  type: int
-  level: advanced
-  desc: Restrict scrubbing to this day of the week or later
-  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
-    for the entire week.
-  fmt_desc: This restricts scrubbing to this day of the week or later.
-    0  = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
-    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
-    Along with ``osd_scrub_end_week_day``, they define a time window in which
-    scrubs can happen. But a scrub will be performed
-    no matter whether the time window allows or not, when the PG's
-    scrub interval exceeds ``osd_scrub_max_interval``.
-  default: 0
-  see_also:
-  - osd_scrub_end_week_day
-  min: 0
-  max: 6
-  with_legacy: true
-- name: osd_scrub_end_week_day
-  type: int
-  level: advanced
-  desc: Restrict scrubbing to days of the week earlier than this
-  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
-    for the entire week.
-  fmt_desc: This restricts scrubbing to days of the week earlier than this.
-    0 = Sunday, 1 = Monday, etc.  Use ``osd_scrub_begin_week_day = 0``
-    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
-    Along with ``osd_scrub_begin_week_day``, they define a time
-    window, in which the scrubs can happen. But a scrub will be performed
-    no matter whether the time window allows or not, as long as the placement
-    group's scrub interval exceeds ``osd_scrub_max_interval``.
-  default: 0
-  see_also:
-  - osd_scrub_begin_week_day
-  min: 0
-  max: 6
-  with_legacy: true
-- name: osd_scrub_load_threshold
-  type: float
-  level: advanced
-  desc: Allow scrubbing when system load divided by number of CPUs is below this value
-  fmt_desc: The normalized maximum load. Ceph will not scrub when the system load
-    (as defined by ``getloadavg() / number of online CPUs``) is higher than this number.
-    Default is ``0.5``.
-  default: 0.5
-  with_legacy: true
-# if load is low
-- name: osd_scrub_min_interval
-  type: float
-  level: advanced
-  desc: Scrub each PG no more often than this interval
-  fmt_desc: The minimal interval in seconds for scrubbing the Ceph OSD Daemon
-    when the Ceph Storage Cluster load is low.
-  default: 1_day
-  see_also:
-  - osd_scrub_max_interval
-  with_legacy: true
-# regardless of load
-- name: osd_scrub_max_interval
-  type: float
+- name: osd_map_message_max_bytes
+  type: size
    level: advanced
-  desc: Scrub each PG no less often than this interval
-  fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
-    irrespective of cluster load.
-  default: 7_day
-  see_also:
-  - osd_scrub_min_interval
+  desc: maximum number of bytes worth of OSDMaps to include in a single message
+  default: 10_M
+  services:
+  - osd
+  - mon
    with_legacy: true
-# randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
-- name: osd_scrub_interval_randomize_ratio
-  type: float
+# do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
+- name: osd_ignore_stale_divergent_priors
+  type: bool
    level: advanced
-  desc: Ratio of scrub interval to randomly vary
-  long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
-    so that they are soon uniformly distributed over the week
-  fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
-    the next scrub job for a PG. The delay is a random
-    value less than ``osd_scrub_min_interval`` \*
-    ``osd_scrub_interval_randomized_ratio``. The default setting
-    spreads scrubs throughout the allowed time
-    window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
-  default: 0.5
-  see_also:
-  - osd_scrub_min_interval
+  default: false
    with_legacy: true
-# the probability to back off the scheduled scrub
-- name: osd_scrub_backoff_ratio
-  type: float
+- name: osd_heartbeat_interval
+  type: int
    level: dev
-  desc: Backoff ratio for scheduling scrubs
-  long_desc: This is the precentage of ticks that do NOT schedule scrubs, 66% means
-    that 1 out of 3 ticks will schedule scrubs
-  default: 0.66
+  desc: Interval (in seconds) between peer pings
+  fmt_desc: How often a Ceph OSD Daemon pings its peers (in seconds).
+  default: 6
+  min: 1
+  max: 1_min
    with_legacy: true
-- name: osd_scrub_chunk_min
+# (seconds) how long before we decide a peer has failed
+# This setting is read by the MONs and OSDs and has to be set to the same value in both the [mon] and [osd] sections of the configuration
+- name: osd_heartbeat_grace
    type: int
    level: advanced
-  desc: Minimum number of objects to scrub in a single chunk
-  fmt_desc: The minimal number of object store chunks to scrub during single operation.
-    Ceph blocks writes to single chunk during scrub.
-  default: 5
-  see_also:
-  - osd_scrub_chunk_max
+  default: 20
+  fmt_desc: The elapsed time without a heartbeat from a Ceph OSD Daemon after which
+              the Ceph Storage Cluster considers it ``down``.
+              This setting must be set in both the [mon] and [osd] or [global]
+              sections so that it is read by both monitor and OSD daemons.
    with_legacy: true
-- name: osd_scrub_chunk_max
+- name: osd_heartbeat_stale
    type: int
    level: advanced
-  desc: Maximum number of objects to scrub in a single chunk
-  fmt_desc: The maximum number of object store chunks to scrub during single operation.
-  default: 25
-  see_also:
-  - osd_scrub_chunk_min
-  with_legacy: true
-# sleep between [deep]scrub ops
-- name: osd_scrub_sleep
-  type: float
-  level: advanced
-  desc: Duration to inject a delay during scrubbing
-  fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
-    down the overall rate of scrubbing so that client operations will be less impacted.
-  default: 0
-  flags:
-  - runtime
-  with_legacy: true
-# more sleep between [deep]scrub ops
-- name: osd_scrub_extended_sleep
-  type: float
-  level: advanced
-  desc: Duration to inject a delay during scrubbing out of scrubbing hours
-  default: 0
-  see_also:
-  - osd_scrub_begin_hour
-  - osd_scrub_end_hour
-  - osd_scrub_begin_week_day
-  - osd_scrub_end_week_day
-  with_legacy: true
-# whether auto-repair inconsistencies upon deep-scrubbing
-- name: osd_scrub_auto_repair
+  desc: Interval (in seconds) we mark an unresponsive heartbeat peer as stale.
+  long_desc: Automatically mark unresponsive heartbeat sessions as stale and tear
+    them down. The primary benefit is that OSD doesn't need to keep a flood of blocked
+    heartbeat messages around in memory.
+  default: 10_min
+# prio the heartbeat tcp socket and set dscp as CS6 on it if true
+- name: osd_heartbeat_use_min_delay_socket
    type: bool
    level: advanced
-  desc: Automatically repair damaged objects detected during scrub
-  fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
-    are found by scrubs or deep-scrubs.  However, if more than
-    ``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed.
    default: false
    with_legacy: true
-# only auto-repair when number of errors is below this threshold
-- name: osd_scrub_auto_repair_num_errors
-  type: uint
+# the minimum size of OSD heartbeat messages to send
+- name: osd_heartbeat_min_size
+  type: size
    level: advanced
-  desc: Maximum number of detected errors to automatically repair
-  fmt_desc: Auto repair will not occur if more than this many errors are found.
-  default: 5
-  see_also:
-  - osd_scrub_auto_repair
+  desc: Minimum heartbeat packet size in bytes. Will add dummy payload if heartbeat
+    packet is smaller than this.
+  default: 2000
    with_legacy: true
-- name: osd_scrub_max_preemptions
+# max number of parallel snap trims/pg
+- name: osd_pg_max_concurrent_snap_trims
    type: uint
    level: advanced
-  desc: Set the maximum number of times we will preempt a deep scrub due to a client
-    operation before blocking client IO to complete the scrub
-  default: 5
-  min: 0
-  max: 30
-- name: osd_deep_scrub_interval
-  type: float
+  default: 2
+  with_legacy: true
+# max number of trimming pgs
+- name: osd_max_trimming_pgs
+  type: uint
    level: advanced
-  desc: Deep scrub each PG (i.e., verify data checksums) at least this often
-  fmt_desc: The interval for "deep" scrubbing (fully reading all data). The
-    ``osd_scrub_load_threshold`` does not affect this setting.
-  default: 7_day
+  default: 2
    with_legacy: true
-- name: osd_deep_scrub_randomize_ratio
+# minimum number of peers that must be reachable to mark ourselves
+# back up after being wrongly marked down.
+- name: osd_heartbeat_min_healthy_ratio
    type: float
    level: advanced
-  desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
-    are deep)
-  long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they
-    are uniformly distributed over the week
-  default: 0.15
+  default: 0.33
    with_legacy: true
-- name: osd_deep_scrub_stride
-  type: size
+# (seconds) how often to ping monitor if no peers
+- name: osd_mon_heartbeat_interval
+  type: int
    level: advanced
-  desc: Number of bytes to read from an object at a time during deep scrub
-  fmt_desc: Read size when doing a deep scrub.
-  default: 512_K
+  default: 30
+  fmt_desc: How often the Ceph OSD Daemon pings a Ceph Monitor if it has no
+              Ceph OSD Daemon peers.
    with_legacy: true
-- name: osd_deep_scrub_keys
+- name: osd_mon_heartbeat_stat_stale
    type: int
    level: advanced
-  desc: Number of keys to read from an object at a time during deep scrub
-  default: 1024
-  with_legacy: true
-# objects must be this old (seconds) before we update the whole-object digest on scrub
-- name: osd_deep_scrub_update_digest_min_age
+  desc: Stop reporting on heartbeat ping times not updated for this many seconds.
+  long_desc: Stop reporting on old heartbeat information unless this is set to zero
+  fmt_desc: Stop reporting on heartbeat ping times which haven't been updated for
+              this many seconds.  Set to zero to disable this action.
+  default: 1_hr
+# failures, up_thru, boot.
+- name: osd_mon_report_interval
    type: int
    level: advanced
-  desc: Update overall object digest only if object was last modified longer ago than
-    this
-  default: 2_hr
+  desc: Frequency of OSD reports to mon for peer failures, fullness status changes
+  fmt_desc: The number of seconds a Ceph OSD Daemon may wait
+              from startup or another reportable event before reporting
+              to a Ceph Monitor.
+  default: 5
    with_legacy: true
-- name: osd_deep_scrub_large_omap_object_key_threshold
-  type: uint
+# max updates in flight
+- name: osd_mon_report_max_in_flight
+  type: int
    level: advanced
-  desc: Warn when we encounter an object with more omap keys than this
-  default: 200000
-  services:
-  - osd
-  see_also:
-  - osd_deep_scrub_large_omap_object_value_sum_threshold
+  default: 2
    with_legacy: true
-- name: osd_deep_scrub_large_omap_object_value_sum_threshold
-  type: size
+# (second) how often to send beacon message to monitor
+- name: osd_beacon_report_interval
+  type: int
    level: advanced
-  desc: Warn when we encounter an object with more omap key bytes than this
-  default: 1_G
-  services:
-  - osd
-  see_also:
-  - osd_deep_scrub_large_omap_object_key_threshold
+  default: 5_min
    with_legacy: true
-# where rados plugins are stored
-- name: osd_class_dir
-  type: str
+# report pg stats for any given pg at least this often
+- name: osd_pg_stat_report_interval_max
+  type: int
    level: advanced
-  default: @CMAKE_INSTALL_LIBDIR@/rados-classes
-  fmt_desc: The class path for RADOS class plug-ins.
+  default: 500
    with_legacy: true
-- name: osd_open_classes_on_start
-  type: bool
-  level: advanced
-  default: true
+# Max number of snap intervals to report to mgr in pg_stat_t
+- name: osd_max_snap_prune_intervals_per_epoch
+  type: uint
+  level: dev
+  desc: Max number of snap intervals to report to mgr in pg_stat_t
+  default: 512
    with_legacy: true
-# list of object classes allowed to be loaded (allow all: *)
-- name: osd_class_load_list
-  type: str
+- name: osd_default_data_pool_replay_window
+  type: int
    level: advanced
-  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
-    user version cas cmpomap queue 2pc_queue fifo
-  with_legacy: true
-# list of object classes with default execute perm (allow all: *)
-- name: osd_class_default_list
-  type: str
+  default: 45
+  fmt_desc: The time (in seconds) for an OSD to wait for a client to replay
+    a request.
+- name: osd_auto_mark_unfound_lost
+  type: bool
    level: advanced
-  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
-    user version cas cmpomap queue 2pc_queue fifo
+  default: false
    with_legacy: true
  - name: osd_check_for_log_corruption
    type: bool
@@ -5744,38 +4532,6 @@ options:
    level: advanced
    default: 30
    with_legacy: true
-- name: osd_delete_sleep
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next removal transaction (overrides values
-    below)
-  fmt_desc: Time in seconds to sleep before the next removal transaction. This
-    throttles the PG deletion process.
-  default: 0
-  flags:
-  - runtime
-- name: osd_delete_sleep_hdd
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next removal transaction for HDDs
-  default: 5
-  flags:
-  - runtime
-- name: osd_delete_sleep_ssd
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next removal transaction for SSDs
-  default: 1
-  flags:
-  - runtime
-- name: osd_delete_sleep_hybrid
-  type: float
-  level: advanced
-  desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
-    and OSD journal or WAL+DB is on SSD
-  default: 1
-  flags:
-  - runtime
  # what % full makes an OSD "full" (failsafe)
  - name: osd_failsafe_full_ratio
    type: float
@@ -8446,69 +7202,6 @@ options:
    level: advanced
    default: false
    with_legacy: true
-- name: cephadm_path
-  type: str
-  level: advanced
-  desc: Path to cephadm utility
-  default: /usr/sbin/cephadm
-  services:
-  - mgr
-- name: mgr_module_path
-  type: str
-  level: advanced
-  desc: Filesystem path to manager modules.
-  default: @CEPH_INSTALL_DATADIR@/mgr
-  services:
-  - mgr
-- name: mgr_standby_modules
-  type: bool
-  default: true
-  level: advanced
-  desc: Start modules in standby (redirect) mode when mgr is standby
-  long_desc: By default, the standby modules will answer incoming requests with a
-    HTTP redirect to the active manager, allowing users to point their browser at any
-    mgr node and find their way to an active mgr.  However, this mode is problematic
-    when using a load balancer because (1) the redirect locations are usually private
-    IPs and (2) the load balancer can't identify which mgr is the right one to send
-    traffic to. If a load balancer is being used, set this to false.
-- name: mgr_disabled_modules
-  type: str
-  level: advanced
-  desc: List of manager modules never get loaded
-  long_desc: A comma delimited list of module names. This list is read by manager
-    when it starts. By default, manager loads all modules found in specified 'mgr_module_path',
-    and it starts the enabled ones as instructed. The modules in this list will not
-    be loaded at all.
-  default: @mgr_disabled_modules@
-  services:
-  - mgr
-  see_also:
-  - mgr_module_path
-  flags:
-  - startup
-- name: mgr_initial_modules
-  type: str
-  level: basic
-  desc: List of manager modules to enable when the cluster is first started
-  long_desc: This list of module names is read by the monitor when the cluster is
-    first started after installation, to populate the list of enabled manager modules.  Subsequent
-    updates are done using the 'mgr module [enable|disable]' commands.  List may be
-    comma or space separated.
-  default: restful iostat
-  services:
-  - mon
-  flags:
-  - no_mon_update
-  - cluster_create
-- name: mgr_data
-  type: str
-  level: advanced
-  desc: Filesystem path to the ceph-mgr data directory, used to contain keyring.
-  default: /var/lib/ceph/mgr/$cluster-$id
-  services:
-  - mgr
-  flags:
-  - no_mon_update
  - name: mgr_tick_period
    type: secs
    level: advanced
@@ -8516,90 +7209,18 @@ options:
    default: 2
    services:
    - mgr
-- name: mgr_stats_period
-  type: int
-  level: basic
-  desc: Period in seconds of OSD/MDS stats reports to manager
-  long_desc: Use this setting to control the granularity of time series data collection
-    from daemons.  Adjust upwards if the manager CPU load is too high, or if you simply
-    do not require the most up to date performance counter data.
-  default: 5
-  services:
-  - mgr
-- name: mgr_client_bytes
-  type: size
-  level: dev
-  default: 128_M
-  services:
-  - mgr
-- name: mgr_client_messages
-  type: uint
-  level: dev
-  default: 512
-  services:
-  - mgr
-- name: mgr_osd_bytes
-  type: size
-  level: dev
-  default: 512_M
-  services:
-  - mgr
-- name: mgr_osd_messages
-  type: uint
-  level: dev
-  default: 8_K
-  services:
-  - mgr
-- name: mgr_mds_bytes
-  type: size
-  level: dev
-  default: 128_M
-  services:
-  - mgr
-- name: mgr_mds_messages
-  type: uint
-  level: dev
-  default: 128
-  services:
-  - mgr
-- name: mgr_mon_bytes
-  type: size
-  level: dev
-  default: 128_M
-  services:
-  - mgr
-- name: mgr_mon_messages
-  type: uint
-  level: dev
-  default: 128
-  services:
-  - mgr
+  - mon
  - name: mgr_connect_retry_interval
    type: float
    level: dev
    default: 1
    services:
    - common
-- name: mgr_service_beacon_grace
-  type: float
-  level: advanced
-  desc: Period in seconds from last beacon to manager dropping state about a monitored
-    service (RGW, rbd-mirror etc)
-  default: 1_min
-  services:
-  - mgr
  - name: mgr_client_service_daemon_unregister_timeout
    type: float
    level: dev
    desc: Time to wait during shutdown to deregister service with mgr
    default: 1
-- name: mgr_debug_aggressive_pg_num_changes
-  type: bool
-  level: dev
-  desc: Bypass most throttling and safety checks in pg[p]_num controller
-  default: false
-  services:
-  - mgr
  - name: mon_mgr_digest_period
    type: int
    level: dev
diff --git a/src/common/options/legacy_config_opts.h b/src/common/options/legacy_config_opts.h

index d5156328250510b9f120525fed7afc55e170077c..d3429a675d77466d1d08c83739a596eada664415 100644 (file)
--- a/src/common/options/legacy_config_opts.h
+++ b/src/common/options/legacy_config_opts.h
@@ -2,6 +2,8 @@
  #include "cephfs-mirror_legacy_options.h"
  #include "mds_legacy_options.h"
  #include "mds-client_legacy_options.h"
+#include "mgr_legacy_options.h"
+#include "osd_legacy_options.h"
  #include "rbd_legacy_options.h"
  #include "rbd-mirror_legacy_options.h"
  #include "immutable-object-cache_legacy_options.h"
diff --git a/src/common/options/mgr.yaml.in b/src/common/options/mgr.yaml.in

new file mode 100644 (file)

index 0000000..8384b9e
--- /dev/null
+++ b/src/common/options/mgr.yaml.in
@@ -0,0 +1,142 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: mgr_data
+  type: str
+  level: advanced
+  desc: Filesystem path to the ceph-mgr data directory, used to contain keyring.
+  default: /var/lib/ceph/mgr/$cluster-$id
+  services:
+  - mgr
+  flags:
+  - no_mon_update
+- name: mgr_stats_period
+  type: int
+  level: basic
+  desc: Period in seconds of OSD/MDS stats reports to manager
+  long_desc: Use this setting to control the granularity of time series data collection
+    from daemons.  Adjust upwards if the manager CPU load is too high, or if you simply
+    do not require the most up to date performance counter data.
+  default: 5
+  services:
+  - mgr
+  - common
+- name: mgr_client_bytes
+  type: size
+  level: dev
+  default: 128_M
+  services:
+  - mgr
+- name: mgr_client_messages
+  type: uint
+  level: dev
+  default: 512
+  services:
+  - mgr
+- name: mgr_osd_bytes
+  type: size
+  level: dev
+  default: 512_M
+  services:
+  - mgr
+- name: mgr_osd_messages
+  type: uint
+  level: dev
+  default: 8_K
+  services:
+  - mgr
+- name: mgr_mds_bytes
+  type: size
+  level: dev
+  default: 128_M
+  services:
+  - mgr
+- name: mgr_mds_messages
+  type: uint
+  level: dev
+  default: 128
+  services:
+  - mgr
+- name: mgr_mon_bytes
+  type: size
+  level: dev
+  default: 128_M
+  services:
+  - mgr
+- name: mgr_mon_messages
+  type: uint
+  level: dev
+  default: 128
+  services:
+  - mgr
+- name: mgr_service_beacon_grace
+  type: float
+  level: advanced
+  desc: Period in seconds from last beacon to manager dropping state about a monitored
+    service (RGW, rbd-mirror etc)
+  default: 1_min
+  services:
+  - mgr
+- name: mgr_debug_aggressive_pg_num_changes
+  type: bool
+  level: dev
+  desc: Bypass most throttling and safety checks in pg[p]_num controller
+  default: false
+  services:
+  - mgr
+- name: mgr_module_path
+  type: str
+  level: advanced
+  desc: Filesystem path to manager modules.
+  default: @CEPH_INSTALL_DATADIR@/mgr
+  services:
+  - mgr
+- name: mgr_standby_modules
+  type: bool
+  default: true
+  level: advanced
+  desc: Start modules in standby (redirect) mode when mgr is standby
+  long_desc: By default, the standby modules will answer incoming requests with a
+    HTTP redirect to the active manager, allowing users to point their browser at any
+    mgr node and find their way to an active mgr.  However, this mode is problematic
+    when using a load balancer because (1) the redirect locations are usually private
+    IPs and (2) the load balancer can't identify which mgr is the right one to send
+    traffic to. If a load balancer is being used, set this to false.
+- name: mgr_disabled_modules
+  type: str
+  level: advanced
+  desc: List of manager modules that never get loaded
+  long_desc: A comma delimited list of module names. This list is read by manager
+    when it starts. By default, manager loads all modules found in specified 'mgr_module_path',
+    and it starts the enabled ones as instructed. The modules in this list will not
+    be loaded at all.
+  default: @mgr_disabled_modules@
+  services:
+  - mgr
+  see_also:
+  - mgr_module_path
+  flags:
+  - startup
+- name: mgr_initial_modules
+  type: str
+  level: basic
+  desc: List of manager modules to enable when the cluster is first started
+  long_desc: This list of module names is read by the monitor when the cluster is
+    first started after installation, to populate the list of enabled manager modules.  Subsequent
+    updates are done using the 'mgr module [enable|disable]' commands.  List may be
+    comma or space separated.
+  default: restful iostat
+  services:
+  - mon
+  - common
+  flags:
+  - no_mon_update
+  - cluster_create
+- name: cephadm_path
+  type: str
+  level: advanced
+  desc: Path to cephadm utility
+  default: /usr/sbin/cephadm
+  services:
+  - mgr
diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in

new file mode 100644 (file)

index 0000000..c5f33fe
--- /dev/null
+++ b/src/common/options/osd.yaml.in
@@ -0,0 +1,1255 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: osd_numa_prefer_iface
+  type: bool
+  level: advanced
+  desc: prefer IP on network interface on same numa node as storage
+  default: true
+  see_also:
+  - osd_numa_auto_affinity
+  flags:
+  - startup
+- name: osd_numa_auto_affinity
+  type: bool
+  level: advanced
+  desc: automatically set affinity to numa node when storage and network match
+  default: true
+  flags:
+  - startup
+- name: osd_numa_node
+  type: int
+  level: advanced
+  desc: set affinity to a numa node (-1 for none)
+  default: -1
+  see_also:
+  - osd_numa_auto_affinity
+  flags:
+  - startup
+- name: osd_smart_report_timeout
+  type: uint
+  level: advanced
+  desc: Timeout (in seconds) for smartctl to run, default is set to 5
+  default: 5
+# verify backend can support configured max object name length
+- name: osd_check_max_object_name_len_on_startup
+  type: bool
+  level: dev
+  default: true
+  with_legacy: true
+- name: osd_max_backfills
+  type: uint
+  level: advanced
+  desc: Maximum number of concurrent local and remote backfills or recoveries per
+    OSD
+  long_desc: There can be osd_max_backfills local reservations AND the same remote
+    reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary
+    in recovery and 1 shard of another recovering PG.
+  fmt_desc: The maximum number of backfills allowed to or from a single OSD.
+    Note that this is applied separately for read and write operations.
+  default: 1
+  flags:
+  - runtime
+  with_legacy: true
+# Minimum recovery priority (255 = max, smaller = lower)
+- name: osd_min_recovery_priority
+  type: int
+  level: advanced
+  desc: Minimum priority below which recovery is not performed
+  long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
+    work (e.g., rebalancing) below this threshold and focus solely on higher priority
+    work (e.g., replicating degraded objects).
+  default: 0
+  with_legacy: true
+- name: osd_backfill_retry_interval
+  type: float
+  level: advanced
+  desc: how frequently to retry backfill reservations after being denied (e.g., due
+    to a full OSD)
+  fmt_desc: The number of seconds to wait before retrying backfill requests.
+  default: 30
+  with_legacy: true
+- name: osd_recovery_retry_interval
+  type: float
+  level: advanced
+  desc: how frequently to retry recovery reservations after being denied (e.g., due
+    to a full OSD)
+  default: 30
+  with_legacy: true
+- name: osd_recovery_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
+    Increasing this value will slow down recovery operation while
+    client operations will be less impacted.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op for HDDs
+  fmt_desc: Time in seconds to sleep before next recovery or backfill op
+    for HDDs.
+  default: 0.1
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op for SSDs
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
+    for SSDs.
+  default: 0
+  see_also:
+  - osd_recovery_sleep
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op when data is
+    on HDD and journal is on SSD
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
+    when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
+  default: 0.025
+  see_also:
+  - osd_recovery_sleep
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim (overrides values below)
+  fmt_desc: Time in seconds to sleep before next snap trim op.
+    Increasing this value will slow down snap trimming.
+    This option overrides backend specific variants.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_snap_trim_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim for HDDs
+  default: 5
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim for SSDs
+  fmt_desc: Time in seconds to sleep before next snap trim op
+    for SSD OSDs (including NVMe).
+  default: 0
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
+    is on SSD
+  fmt_desc: Time in seconds to sleep before next snap trim op
+    when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
+  default: 2
+  flags:
+  - runtime
+- name: osd_scrub_invalid_stats
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: osd_max_scrubs
+  type: int
+  level: advanced
+  desc: Maximum concurrent scrubs on a single OSD
+  fmt_desc: The maximum number of simultaneous scrub operations for
+    a Ceph OSD Daemon.
+  default: 1
+  with_legacy: true
+- name: osd_scrub_during_recovery
+  type: bool
+  level: advanced
+  desc: Allow scrubbing when PGs on the OSD are undergoing recovery
+  fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
+    scheduling new scrub (and deep-scrub) while there is active recovery.
+    Already running scrubs will be continued. This might be useful to reduce
+    load on busy clusters.
+  default: false
+  with_legacy: true
+- name: osd_repair_during_recovery
+  type: bool
+  level: advanced
+  desc: Allow requested repairing when PGs on the OSD are undergoing recovery
+  default: false
+  with_legacy: true
+- name: osd_scrub_begin_hour
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to this hour of the day or later
+  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
+  fmt_desc: This restricts scrubbing to this hour of the day or later.
+    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
+    to allow scrubbing the entire day.  Along with ``osd_scrub_end_hour``, they define a time
+    window, in which the scrubs can happen.
+    But a scrub will be performed
+    no matter whether the time window allows or not, as long as the placement
+    group's scrub interval exceeds ``osd_scrub_max_interval``.
+  default: 0
+  see_also:
+  - osd_scrub_end_hour
+  min: 0
+  max: 23
+  with_legacy: true
+- name: osd_scrub_end_hour
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to hours of the day earlier than this
+  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
+  fmt_desc: This restricts scrubbing to the hour earlier than this.
+    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
+    for the entire day.  Along with ``osd_scrub_begin_hour``, they define a time
+    window, in which the scrubs can happen. But a scrub will be performed
+    no matter whether the time window allows or not, as long as the placement
+    group's scrub interval exceeds ``osd_scrub_max_interval``.
+  default: 0
+  see_also:
+  - osd_scrub_begin_hour
+  min: 0
+  max: 23
+  with_legacy: true
+- name: osd_scrub_begin_week_day
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to this day of the week or later
+  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
+    for the entire week.
+  fmt_desc: This restricts scrubbing to this day of the week or later.
+    0  = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
+    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
+    Along with ``osd_scrub_end_week_day``, they define a time window in which
+    scrubs can happen. But a scrub will be performed
+    no matter whether the time window allows or not, when the PG's
+    scrub interval exceeds ``osd_scrub_max_interval``.
+  default: 0
+  see_also:
+  - osd_scrub_end_week_day
+  min: 0
+  max: 6
+  with_legacy: true
+- name: osd_scrub_end_week_day
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to days of the week earlier than this
+  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
+    for the entire week.
+  fmt_desc: This restricts scrubbing to days of the week earlier than this.
+    0 = Sunday, 1 = Monday, etc.  Use ``osd_scrub_begin_week_day = 0``
+    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
+    Along with ``osd_scrub_begin_week_day``, they define a time
+    window, in which the scrubs can happen. But a scrub will be performed
+    no matter whether the time window allows or not, as long as the placement
+    group's scrub interval exceeds ``osd_scrub_max_interval``.
+  default: 0
+  see_also:
+  - osd_scrub_begin_week_day
+  min: 0
+  max: 6
+  with_legacy: true
+- name: osd_scrub_load_threshold
+  type: float
+  level: advanced
+  desc: Allow scrubbing when system load divided by number of CPUs is below this value
+  fmt_desc: The normalized maximum load. Ceph will not scrub when the system load
+    (as defined by ``getloadavg() / number of online CPUs``) is higher than this number.
+    Default is ``0.5``.
+  default: 0.5
+  with_legacy: true
+# if load is low
+- name: osd_scrub_min_interval
+  type: float
+  level: advanced
+  desc: Scrub each PG no more often than this interval
+  fmt_desc: The minimal interval in seconds for scrubbing the Ceph OSD Daemon
+    when the Ceph Storage Cluster load is low.
+  default: 1_day
+  see_also:
+  - osd_scrub_max_interval
+  with_legacy: true
+# regardless of load
+- name: osd_scrub_max_interval
+  type: float
+  level: advanced
+  desc: Scrub each PG no less often than this interval
+  fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
+    irrespective of cluster load.
+  default: 7_day
+  see_also:
+  - osd_scrub_min_interval
+  with_legacy: true
+# randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
+- name: osd_scrub_interval_randomize_ratio
+  type: float
+  level: advanced
+  desc: Ratio of scrub interval to randomly vary
+  long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
+    so that they are soon uniformly distributed over the week
+  fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
+    the next scrub job for a PG. The delay is a random
+    value less than ``osd_scrub_min_interval`` \*
+    ``osd_scrub_interval_randomize_ratio``. The default setting
+    spreads scrubs throughout the allowed time
+    window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
+  default: 0.5
+  see_also:
+  - osd_scrub_min_interval
+  with_legacy: true
+# the probability to back off the scheduled scrub
+- name: osd_scrub_backoff_ratio
+  type: float
+  level: dev
+  desc: Backoff ratio for scheduling scrubs
+  long_desc: This is the percentage of ticks that do NOT schedule scrubs, 66% means
+    that 1 out of 3 ticks will schedule scrubs
+  default: 0.66
+  with_legacy: true
+- name: osd_scrub_chunk_min
+  type: int
+  level: advanced
+  desc: Minimum number of objects to scrub in a single chunk
+  fmt_desc: The minimal number of object store chunks to scrub during a single operation.
+    Ceph blocks writes to a single chunk during scrub.
+  default: 5
+  see_also:
+  - osd_scrub_chunk_max
+  with_legacy: true
+- name: osd_scrub_chunk_max
+  type: int
+  level: advanced
+  desc: Maximum number of objects to scrub in a single chunk
+  fmt_desc: The maximum number of object store chunks to scrub during a single operation.
+  default: 25
+  see_also:
+  - osd_scrub_chunk_min
+  with_legacy: true
+# sleep between [deep]scrub ops
+- name: osd_scrub_sleep
+  type: float
+  level: advanced
+  desc: Duration to inject a delay during scrubbing
+  fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
+    down the overall rate of scrubbing so that client operations will be less impacted.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+# more sleep between [deep]scrub ops
+- name: osd_scrub_extended_sleep
+  type: float
+  level: advanced
+  desc: Duration to inject a delay during scrubbing out of scrubbing hours
+  default: 0
+  see_also:
+  - osd_scrub_begin_hour
+  - osd_scrub_end_hour
+  - osd_scrub_begin_week_day
+  - osd_scrub_end_week_day
+  with_legacy: true
+# whether auto-repair inconsistencies upon deep-scrubbing
+- name: osd_scrub_auto_repair
+  type: bool
+  level: advanced
+  desc: Automatically repair damaged objects detected during scrub
+  fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
+    are found by scrubs or deep-scrubs.  However, if more than
+    ``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed.
+  default: false
+  with_legacy: true
+# only auto-repair when number of errors is below this threshold
+- name: osd_scrub_auto_repair_num_errors
+  type: uint
+  level: advanced
+  desc: Maximum number of detected errors to automatically repair
+  fmt_desc: Auto repair will not occur if more than this many errors are found.
+  default: 5
+  see_also:
+  - osd_scrub_auto_repair
+  with_legacy: true
+- name: osd_scrub_max_preemptions
+  type: uint
+  level: advanced
+  desc: Set the maximum number of times we will preempt a deep scrub due to a client
+    operation before blocking client IO to complete the scrub
+  default: 5
+  min: 0
+  max: 30
+- name: osd_deep_scrub_interval
+  type: float
+  level: advanced
+  desc: Deep scrub each PG (i.e., verify data checksums) at least this often
+  fmt_desc: The interval for "deep" scrubbing (fully reading all data). The
+    ``osd_scrub_load_threshold`` does not affect this setting.
+  default: 7_day
+  with_legacy: true
+- name: osd_deep_scrub_randomize_ratio
+  type: float
+  level: advanced
+  desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
+    are deep)
+  long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they
+    are uniformly distributed over the week
+  default: 0.15
+  with_legacy: true
+- name: osd_deep_scrub_stride
+  type: size
+  level: advanced
+  desc: Number of bytes to read from an object at a time during deep scrub
+  fmt_desc: Read size when doing a deep scrub.
+  default: 512_K
+  with_legacy: true
+- name: osd_deep_scrub_keys
+  type: int
+  level: advanced
+  desc: Number of keys to read from an object at a time during deep scrub
+  default: 1024
+  with_legacy: true
+# objects must be this old (seconds) before we update the whole-object digest on scrub
+- name: osd_deep_scrub_update_digest_min_age
+  type: int
+  level: advanced
+  desc: Update overall object digest only if object was last modified longer ago than
+    this
+  default: 2_hr
+  with_legacy: true
+- name: osd_deep_scrub_large_omap_object_key_threshold
+  type: uint
+  level: advanced
+  desc: Warn when we encounter an object with more omap keys than this
+  default: 200000
+  services:
+  - osd
+  - mds
+  see_also:
+  - osd_deep_scrub_large_omap_object_value_sum_threshold
+  with_legacy: true
+- name: osd_deep_scrub_large_omap_object_value_sum_threshold
+  type: size
+  level: advanced
+  desc: Warn when we encounter an object with more omap key bytes than this
+  default: 1_G
+  services:
+  - osd
+  see_also:
+  - osd_deep_scrub_large_omap_object_key_threshold
+  with_legacy: true
+# where rados plugins are stored
+- name: osd_class_dir
+  type: str
+  level: advanced
+  default: @CMAKE_INSTALL_LIBDIR@/rados-classes
+  fmt_desc: The class path for RADOS class plug-ins.
+  with_legacy: true
+- name: osd_open_classes_on_start
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+# list of object classes allowed to be loaded (allow all: *)
+- name: osd_class_load_list
+  type: str
+  level: advanced
+  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
+    user version cas cmpomap queue 2pc_queue fifo
+  with_legacy: true
+# list of object classes with default execute perm (allow all: *)
+- name: osd_class_default_list
+  type: str
+  level: advanced
+  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
+    user version cas cmpomap queue 2pc_queue fifo
+  with_legacy: true
+- name: osd_agent_max_ops
+  type: int
+  level: advanced
+  desc: maximum concurrent tiering operations for tiering agent
+  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
+    in the high speed mode.
+  default: 4
+  with_legacy: true
+- name: osd_agent_max_low_ops
+  type: int
+  level: advanced
+  desc: maximum concurrent low-priority tiering operations for tiering agent
+  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
+    in the low speed mode.
+  default: 2
+  with_legacy: true
+- name: osd_agent_min_evict_effort
+  type: float
+  level: advanced
+  desc: minimum effort to expend evicting clean objects
+  default: 0.1
+  min: 0
+  max: 0.99
+  with_legacy: true
+- name: osd_agent_quantize_effort
+  type: float
+  level: advanced
+  desc: size of quantize unit for eviction effort
+  default: 0.1
+  with_legacy: true
+- name: osd_agent_delay_time
+  type: float
+  level: advanced
+  desc: how long agent should sleep if it has no work to do
+  default: 5
+  with_legacy: true
+# decay atime and hist histograms after how many objects go by
+- name: osd_agent_hist_halflife
+  type: int
+  level: advanced
+  desc: halflife of agent atime and temp histograms
+  default: 1000
+  with_legacy: true
+# slop factor to avoid switching tiering flush and eviction mode
+- name: osd_agent_slop
+  type: float
+  level: advanced
+  desc: slop factor to avoid switching tiering flush and eviction mode
+  default: 0.02
+  with_legacy: true
+- name: osd_find_best_info_ignore_history_les
+  type: bool
+  level: dev
+  desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
+  long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
+    DIRECTION OF A DEVELOPER.  It makes peering ignore the last_epoch_started value
+    when peering, which can allow the OSD to believe an OSD has an authoritative view
+    of a PG's contents even when it is in fact old and stale, typically leading to
+    data loss (by believing a stale PG is up to date).
+  default: false
+  with_legacy: true
+- name: osd_uuid
+  type: uuid
+  level: advanced
+  desc: uuid label for a new OSD
+  fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
+  note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
+    applies to the entire cluster.
+  flags:
+  - create
+  with_legacy: true
+- name: osd_data
+  type: str
+  level: advanced
+  desc: path to OSD data
+  fmt_desc: The path to the OSD's data. You must create the directory when
+    deploying Ceph. You should mount a drive for OSD data at this
+    mount point. We do not recommend changing the default.
+  default: /var/lib/ceph/osd/$cluster-$id
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: osd_journal
+  type: str
+  level: advanced
+  desc: path to OSD journal (when FileStore backend is in use)
+  fmt_desc: The path to the OSD's journal. This may be a path to a file or a
+    block device (such as a partition of an SSD). If it is a file,
+    you must create the directory to contain it. We recommend using a
+    separate fast device when the ``osd_data`` drive is an HDD.
+  default: /var/lib/ceph/osd/$cluster-$id/journal
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: osd_journal_size
+  type: size
+  level: advanced
+  desc: size of FileStore journal (in MiB)
+  fmt_desc: The size of the journal in megabytes.
+  default: 5_K
+  flags:
+  - create
+  with_legacy: true
+- name: osd_journal_flush_on_shutdown
+  type: bool
+  level: advanced
+  desc: flush FileStore journal contents during clean OSD shutdown
+  default: true
+  with_legacy: true
+- name: osd_compact_on_start
+  type: bool
+  level: advanced
+  desc: compact OSD's object store's OMAP on start
+  default: false
+# flags for specific control purpose during osd mount() process.
+# e.g., can be 1 to skip over replaying journal
+# or 2 to skip over mounting omap or 3 to skip over both.
+# This might be helpful in case the journal is totally corrupted
+# and we still want to bring the osd daemon back normally, etc.
+- name: osd_os_flags
+  type: uint
+  level: dev
+  desc: flags to skip filestore omap or journal initialization
+  default: 0
+- name: osd_max_write_size
+  type: size
+  level: advanced
+  desc: Maximum size of a RADOS write operation in megabytes
+  long_desc: This setting prevents clients from doing very large writes to RADOS.  If
+    you set this to a value below what clients expect, they will receive an error
+    when attempting to write to the cluster.
+  fmt_desc: The maximum size of a write in megabytes.
+  default: 90
+  min: 4
+  with_legacy: true
+- name: osd_max_pgls
+  type: uint
+  level: advanced
+  desc: maximum number of results when listing objects in a pool
+  fmt_desc: The maximum number of placement groups to list. A client
+    requesting a large number can tie up the Ceph OSD Daemon.
+  default: 1_K
+  with_legacy: true
+- name: osd_client_message_size_cap
+  type: size
+  level: advanced
+  desc: maximum memory to devote to in-flight client requests
+  long_desc: If this value is exceeded, the OSD will not read any new client data
+    off of the network until memory is freed.
+  fmt_desc: The largest client data message allowed in memory.
+  default: 500_M
+  with_legacy: true
+- name: osd_client_message_cap
+  type: uint
+  level: advanced
+  desc: maximum number of in-flight client requests
+  default: 0
+  with_legacy: true
+- name: osd_crush_update_on_start
+  type: bool
+  level: advanced
+  desc: update OSD CRUSH location on startup
+  default: true
+  with_legacy: true
+- name: osd_class_update_on_start
+  type: bool
+  level: advanced
+  desc: set OSD device class on startup
+  default: true
+  with_legacy: true
+- name: osd_crush_initial_weight
+  type: float
+  level: advanced
+  desc: if >= 0, initial CRUSH weight for newly created OSDs
+  long_desc: If this value is negative, the size of the OSD in TiB is used.
+  fmt_desc: The initial CRUSH weight for newly added OSDs. The default
+    value of this option is ``the size of a newly added OSD in TB``. By default,
+    the initial CRUSH weight for a newly added OSD is set to its device size in
+    TB. See `Weighting Bucket Items`_ for details.
+  default: -1
+  with_legacy: true
+# Allows the "peered" state for recovery and backfill below min_size
+- name: osd_allow_recovery_below_min_size
+  type: bool
+  level: dev
+  desc: allow replicated pools to recover with < min_size active members
+  default: true
+  services:
+  - osd
+  with_legacy: true
+# cap on # of inc maps we send to peers, clients
+- name: osd_map_share_max_epochs
+  type: int
+  level: advanced
+  default: 40
+  with_legacy: true
+- name: osd_map_cache_size
+  type: int
+  level: advanced
+  default: 50
+  fmt_desc: The number of OSD maps to keep cached.
+  with_legacy: true
+- name: osd_pg_epoch_max_lag_factor
+  type: float
+  level: advanced
+  desc: Max multiple of the map cache that PGs can lag before we throttle map ingest
+  default: 2
+  see_also:
+  - osd_map_cache_size
+- name: osd_inject_bad_map_crc_probability
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: osd_inject_failure_on_pg_removal
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# shut down the OSD if its status flips more than max_markdown_count times within the most recent max_markdown_period seconds
+- name: osd_max_markdown_period
+  type: int
+  level: advanced
+  default: 10_min
+  with_legacy: true
+- name: osd_max_markdown_count
+  type: int
+  level: advanced
+  default: 5
+  with_legacy: true
+- name: osd_op_thread_timeout
+  type: int
+  level: advanced
+  default: 15
+  fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
+  with_legacy: true
+- name: osd_op_thread_suicide_timeout
+  type: int
+  level: advanced
+  default: 150
+  with_legacy: true
+- name: osd_op_pq_max_tokens_per_priority
+  type: uint
+  level: advanced
+  default: 4_M
+  with_legacy: true
+- name: osd_op_pq_min_cost
+  type: size
+  level: advanced
+  default: 64_K
+  with_legacy: true
+# preserve clone_overlap during recovery/migration
+- name: osd_recover_clone_overlap
+  type: bool
+  level: advanced
+  default: true
+  fmt_desc: Preserves clone overlap during recovery. Should always be set
+    to ``true``.
+  with_legacy: true
+- name: osd_num_cache_shards
+  type: size
+  level: advanced
+  desc: The number of cache shards to use in the object store.
+  default: 32
+  flags:
+  - startup
+- name: osd_op_num_threads_per_shard
+  type: int
+  level: advanced
+  default: 0
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_threads_per_shard_hdd
+  type: int
+  level: advanced
+  default: 1
+  see_also:
+  - osd_op_num_threads_per_shard
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_threads_per_shard_ssd
+  type: int
+  level: advanced
+  default: 2
+  see_also:
+  - osd_op_num_threads_per_shard
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards
+  type: int
+  level: advanced
+  fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
+    PGs on the OSD are distributed evenly in the shard. This setting overrides _ssd and _hdd if
+    non-zero.
+  default: 0
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards_hdd
+  type: int
+  level: advanced
+  fmt_desc: the number of shards allocated for a given OSD (for rotational media).
+  default: 5
+  see_also:
+  - osd_op_num_shards
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards_ssd
+  type: int
+  level: advanced
+  fmt_desc: the number of shards allocated for a given OSD (for solid state media).
+  default: 8
+  see_also:
+  - osd_op_num_shards
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_skip_data_digest
+  type: bool
+  level: dev
+  desc: Do not store full-object checksums if the backend (bluestore) does its own
+    checksums.  Only usable with all BlueStore OSDs.
+  default: false
+# PrioritizedQueue (prio), Weighted Priority Queue (wpq ; default),
+# mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
+# and "mclock_client" are based on the mClock/dmClock algorithm
+# (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
+# class the operation belongs to. "mclock_client" does the same but
+# also works to enforce fairness between clients. "debug_random"
+# chooses among all four with equal probability.
+- name: osd_op_queue
+  type: str
+  level: advanced
+  desc: which operation priority queue algorithm to use
+  long_desc: which operation priority queue algorithm to use
+  fmt_desc: This sets the type of queue to be used for prioritizing ops
+    within each OSD. Both queues feature a strict sub-queue which is
+    dequeued before the normal queue. The normal queue is different
+    between implementations. The WeightedPriorityQueue (``wpq``)
+    dequeues operations in relation to their priorities to prevent
+    starvation of any queue. WPQ should help in cases where a few OSDs
+    are more overloaded than others. The mClockQueue
+    (``mclock_scheduler``) prioritizes operations based on which class
+    they belong to (recovery, scrub, snaptrim, client op, osd subop).
+    See `QoS Based on mClock`_. Requires a restart.
+  default: mclock_scheduler
+  see_also:
+  - osd_op_queue_cut_off
+  enum_values:
+  - wpq
+  - mclock_scheduler
+  - debug_random
+  with_legacy: true
+# Min priority to go to strict queue. (low, high)
+- name: osd_op_queue_cut_off
+  type: str
+  level: advanced
+  desc: the threshold between high priority ops and low priority ops
+  long_desc: the threshold between high priority ops that use strict priority ordering
+    and low priority ops that use a fairness algorithm that may or may not incorporate
+    priority
+  fmt_desc: This selects which priority ops will be sent to the strict
+    queue versus the normal queue. The ``low`` setting sends all
+    replication ops and higher to the strict queue, while the ``high``
+    option sends only replication acknowledgment ops and higher to
+    the strict queue. Setting this to ``high`` should help when a few
+    OSDs in the cluster are very busy especially when combined with
+    ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
+    handling replication traffic could starve primary client traffic
+    on these OSDs without these settings. Requires a restart.
+  default: high
+  see_also:
+  - osd_op_queue
+  enum_values:
+  - low
+  - high
+  - debug_random
+  with_legacy: true
+- name: osd_mclock_scheduler_client_res
+  type: uint
+  level: advanced
+  desc: IO proportion reserved for each client (default)
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO proportion reserved for each client (default).
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_client_wgt
+  type: uint
+  level: advanced
+  desc: IO share for each client (default) over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO share for each client (default) over reservation.
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_client_lim
+  type: uint
+  level: advanced
+  desc: IO limit for each client (default) over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO limit for each client (default) over reservation.
+  default: 999999
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_recovery_res
+  type: uint
+  level: advanced
+  desc: IO proportion reserved for background recovery (default)
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO proportion reserved for background recovery (default).
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_recovery_wgt
+  type: uint
+  level: advanced
+  desc: IO share for each background recovery over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO share for each background recovery over reservation.
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_recovery_lim
+  type: uint
+  level: advanced
+  desc: IO limit for background recovery over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO limit for background recovery over reservation.
+  default: 999999
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_best_effort_res
+  type: uint
+  level: advanced
+  desc: IO proportion reserved for background best_effort (default)
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO proportion reserved for background best_effort (default).
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_best_effort_wgt
+  type: uint
+  level: advanced
+  desc: IO share for each background best_effort over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO share for each background best_effort over reservation.
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_best_effort_lim
+  type: uint
+  level: advanced
+  desc: IO limit for background best_effort over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO limit for background best_effort over reservation.
+  default: 999999
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_anticipation_timeout
+  type: float
+  level: advanced
+  desc: mclock anticipation timeout in seconds
+  long_desc: the amount of time that mclock waits until the unused resource is forfeited
+  default: 0
+- name: osd_mclock_cost_per_io_usec
+  type: float
+  level: dev
+  desc: Cost per IO in microseconds to consider per OSD (overrides _ssd and _hdd if
+    non-zero)
+  long_desc: This option specifies the cost factor to consider in usec per OSD. This
+    is considered by the mclock scheduler to set an additional cost factor in QoS
+    calculations. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: Cost per IO in microseconds to consider per OSD (overrides _ssd
+    and _hdd if non-zero)
+  default: 0
+  flags:
+  - runtime
+- name: osd_mclock_cost_per_io_usec_hdd
+  type: float
+  level: dev
+  desc: Cost per IO in microseconds to consider per OSD (for rotational media)
+  long_desc: This option specifies the cost factor to consider in usec per OSD for
+    rotational device type. This is considered by the mclock_scheduler to set an additional
+    cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: Cost per IO in microseconds to consider per OSD (for rotational
+    media)
+  default: 25000
+  flags:
+  - runtime
+- name: osd_mclock_cost_per_io_usec_ssd
+  type: float
+  level: dev
+  desc: Cost per IO in microseconds to consider per OSD (for solid state media)
+  long_desc: This option specifies the cost factor to consider in usec per OSD for
+    solid state device type. This is considered by the mclock_scheduler to set an
+    additional cost factor in QoS calculations. Only considered for osd_op_queue =
+    mclock_scheduler
+  fmt_desc: Cost per IO in microseconds to consider per OSD (for solid state
+    media)
+  default: 50
+  flags:
+  - runtime
+- name: osd_mclock_cost_per_byte_usec
+  type: float
+  level: dev
+  desc: Cost per byte in microseconds to consider per OSD (overrides _ssd and _hdd
+    if non-zero)
+  long_desc: This option specifies the cost per byte to consider in microseconds per
+    OSD. This is considered by the mclock scheduler to set an additional cost factor
+    in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: Cost per byte in microseconds to consider per OSD (overrides _ssd
+    and _hdd if non-zero)
+  default: 0
+  flags:
+  - runtime
+- name: osd_mclock_cost_per_byte_usec_hdd
+  type: float
+  level: dev
+  desc: Cost per byte in microseconds to consider per OSD (for rotational media)
+  long_desc: This option specifies the cost per byte to consider in microseconds per
+    OSD for rotational device type. This is considered by the mclock_scheduler to
+    set an additional cost factor in QoS calculations. Only considered for osd_op_queue
+    = mclock_scheduler
+  fmt_desc: Cost per byte in microseconds to consider per OSD (for rotational
+    media)
+  default: 5.2
+  flags:
+  - runtime
+- name: osd_mclock_cost_per_byte_usec_ssd
+  type: float
+  level: dev
+  desc: Cost per byte in microseconds to consider per OSD (for solid state media)
+  long_desc: This option specifies the cost per byte to consider in microseconds per
+    OSD for solid state device type. This is considered by the mclock_scheduler to
+    set an additional cost factor in QoS calculations. Only considered for osd_op_queue
+    = mclock_scheduler
+  fmt_desc: Cost per byte in microseconds to consider per OSD (for solid state
+    media)
+  default: 0.011
+  flags:
+  - runtime
+- name: osd_mclock_max_capacity_iops
+  type: float
+  level: basic
+  desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (overrides _ssd
+    and _hdd if non-zero)
+  long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
+    QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
+    = mclock_scheduler
+  fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD
+    (overrides _ssd and _hdd if non-zero)
+  default: 0
+  flags:
+  - runtime
+- name: osd_mclock_max_capacity_iops_hdd
+  type: float
+  level: basic
+  desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for rotational
+    media)
+  long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
+    QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
+    = mclock_scheduler
+  fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
+    rotational media)
+  default: 315
+  flags:
+  - runtime
+- name: osd_mclock_max_capacity_iops_ssd
+  type: float
+  level: basic
+  desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for solid state
+    media)
+  long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
+    QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
+    = mclock_scheduler
+  fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
+    solid state media)
+  default: 21500
+  flags:
+  - runtime
+- name: osd_mclock_profile
+  type: str
+  level: advanced
+  desc: Which mclock profile to use
+  long_desc: This option specifies the mclock profile to enable - one among the set
+    of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: |
+    This sets the type of mclock profile to use for providing QoS
+    based on operations belonging to different classes (background
+    recovery, scrub, snaptrim, client op, osd subop). Once a built-in
+    profile is enabled, the lower level mclock resource control
+    parameters [*reservation, weight, limit*] and some Ceph
+    configuration parameters are set transparently. Note that the
+    above does not apply for the *custom* profile.
+  default: high_client_ops
+  see_also:
+  - osd_op_queue
+  enum_values:
+  - balanced
+  - high_recovery_ops
+  - high_client_ops
+  - custom
+  flags:
+  - runtime
+# Set to true for testing.  Users should NOT set this.
+# If set to true, any error will be reported even after
+# enough shards have been read to decode the object.
+- name: osd_read_ec_check_for_errors
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_recovery_delay_start
+  type: float
+  level: advanced
+  default: 0
+  fmt_desc: After peering completes, Ceph will delay for the specified number
+    of seconds before starting to recover RADOS objects.
+  with_legacy: true
+- name: osd_recovery_max_active
+  type: uint
+  level: advanced
+  desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
+    and _hdd if non-zero)
+  fmt_desc: The number of active recovery requests per OSD at one time. More
+    requests will accelerate recovery, but the requests place an
+    increased load on the cluster.
+  note: This value is only used if it is non-zero. Normally it
+    is ``0``, which means that the ``hdd`` or ``ssd`` values
+    (below) are used, depending on the type of the primary
+    device backing the OSD.
+  default: 0
+  see_also:
+  - osd_recovery_max_active_hdd
+  - osd_recovery_max_active_ssd
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_max_active_hdd
+  type: uint
+  level: advanced
+  desc: Number of simultaneous active recovery operations per OSD (for rotational
+    devices)
+  fmt_desc: The number of active recovery requests per OSD at one time, if the
+    primary device is rotational.
+  default: 3
+  see_also:
+  - osd_recovery_max_active
+  - osd_recovery_max_active_ssd
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_max_active_ssd
+  type: uint
+  level: advanced
+  desc: Number of simultaneous active recovery operations per OSD (for non-rotational
+    solid state devices)
+  fmt_desc: The number of active recovery requests per OSD at one time, if the
+    primary device is non-rotational (i.e., an SSD).
+  default: 10
+  see_also:
+  - osd_recovery_max_active
+  - osd_recovery_max_active_hdd
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_max_single_start
+  type: uint
+  level: advanced
+  default: 1
+  fmt_desc: The maximum number of recovery operations per OSD that will be
+    newly started when an OSD is recovering.
+  with_legacy: true
+# max size of push chunk
+- name: osd_recovery_max_chunk
+  type: size
+  level: advanced
+  default: 8_M
+  fmt_desc: the maximum total size of data chunks a recovery op can carry.
+  with_legacy: true
+# max number of omap entries per chunk; 0 to disable limit
+- name: osd_recovery_max_omap_entries_per_chunk
+  type: uint
+  level: advanced
+  default: 8096
+  with_legacy: true
+# max size of a COPYFROM chunk
+- name: osd_copyfrom_max_chunk
+  type: size
+  level: advanced
+  default: 8_M
+  with_legacy: true
+# push cost per object
+- name: osd_push_per_object_cost
+  type: size
+  level: advanced
+  default: 1000
+  fmt_desc: the overhead for serving a push op
+  with_legacy: true
+# max size of push message
+- name: osd_max_push_cost
+  type: size
+  level: advanced
+  default: 8_M
+  with_legacy: true
+# max objects in single push op
+- name: osd_max_push_objects
+  type: uint
+  level: advanced
+  default: 10
+  with_legacy: true
+# Only use clone_overlap for recovery if there are fewer than
+# osd_recover_clone_overlap_limit entries in the overlap set
+- name: osd_recover_clone_overlap_limit
+  type: uint
+  level: advanced
+  default: 10
+  flags:
+  - runtime
+- name: osd_debug_feed_pullee
+  type: int
+  level: dev
+  desc: Feed a pullee, and force primary to pull a currently missing object from it
+  default: -1
+  with_legacy: true
+- name: osd_backfill_scan_min
+  type: int
+  level: advanced
+  default: 64
+  fmt_desc: The minimum number of objects per backfill scan.
+  with_legacy: true
+- name: osd_backfill_scan_max
+  type: int
+  level: advanced
+  default: 512
+  fmt_desc: The maximum number of objects per backfill scan.
+  with_legacy: true
+# minimum number of peers
+- name: osd_heartbeat_min_peers
+  type: int
+  level: advanced
+  default: 10
+  with_legacy: true
+- name: osd_delete_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction (overrides values
+    below)
+  fmt_desc: Time in seconds to sleep before the next removal transaction. This
+    throttles the PG deletion process.
+  default: 0
+  flags:
+  - runtime
+- name: osd_delete_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction for HDDs
+  default: 5
+  flags:
+  - runtime
+- name: osd_delete_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction for SSDs
+  default: 1
+  flags:
+  - runtime
+- name: osd_delete_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
+    and OSD journal or WAL+DB is on SSD
+  default: 1
+  flags:
+  - runtime
author	Kefu Chai <kchai@redhat.com>
	Fri, 23 Apr 2021 09:32:00 +0000 (17:32 +0800)
committer	Kefu Chai <kchai@redhat.com>
	Fri, 14 May 2021 11:18:47 +0000 (19:18 +0800)
src/common/options/CMakeLists.txt		patch \| blob \| history
src/common/options/build_options.cc		patch \| blob \| history
src/common/options/global.yaml.in		patch \| blob \| history
src/common/options/legacy_config_opts.h		patch \| blob \| history
src/common/options/mgr.yaml.in	[new file with mode: 0644]	patch \| blob
src/common/options/osd.yaml.in	[new file with mode: 0644]	patch \| blob