From 8d7eb9f7e07bc91d0e887ffa3baefafa71bccd10 Mon Sep 17 00:00:00 2001 From: Josh Salomon <41079547+JoshSalomon@users.noreply.github.com> Date: Tue, 26 Dec 2023 10:41:18 +0200 Subject: [PATCH] osd: Add 'read_ratio' pool parameter This parameter is used for better read balancing with non-identical devices. - This parameter is controlled using the commands 'ceph osd pool set/get' - This parameter is applicable only for replicated pools - Valid values are integers in the range [0..100] and represent the percentage of read IOs out of all IOs in the pool - Value of 0 unsets this parameter and the value will be the default value (this is the generic behavior of the command 'ceph osd pool set') - Default value can be set by the config parameter `osd_pool_default_read_ratio` Signed-off-by: Josh Salomon <41079547+JoshSalomon@users.noreply.github.com> --- src/common/options/global.yaml.in | 12 ++++++++++ src/mon/MonCommands.h | 4 ++-- src/mon/OSDMonitor.cc | 37 +++++++++++++++++++++++++++++-- src/osd/osd_types.cc | 4 +++- src/osd/osd_types.h | 1 + 5 files changed, 53 insertions(+), 5 deletions(-) diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 88c896f9026..cebf59304a6 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -2550,6 +2550,18 @@ options: - mon flags: - runtime +- name: osd_pool_default_read_ratio + type: uint + level: advanced + desc: Default read ratio (the percent of read IOs out of all IOs) for a pool. + long_desc: Default read ratio (the percent of read IOs out of all IOs) for a pool. + applicable to replicated pools only. This value is used to improve read balancing + when OSDs have different weights. 
+ default: 70 + services: + - mon + flags: + - runtime - name: osd_erasure_code_plugins type: str level: advanced diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 954dcb077fb..14bb3602c9b 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -1137,11 +1137,11 @@ COMMAND("osd pool rename " "rename to ", "osd", "rw") COMMAND("osd pool get " "name=pool,type=CephPoolname " - "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk", + 
"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio", "get pool parameter ", "osd", "r") COMMAND("osd pool set " "name=pool,type=CephPoolname " - 
"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk " + "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio " 
"name=val,type=CephString " "name=yes_i_really_mean_it,type=CephBool,req=false", "set pool parameter to ", "osd", "rw") diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index f8e379326f2..37cb3033189 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -5398,7 +5398,7 @@ namespace { CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM, PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO, PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM, - DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX }; + DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX, READ_RATIO }; std::set subtract_second_from_first(const std::set& first, @@ -6148,7 +6148,8 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) {"dedup_tier", DEDUP_TIER}, {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM}, {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE}, - {"bulk", BULK} + {"bulk", BULK}, + {"read_ratio", READ_RATIO} }; typedef std::set choices_set_t; @@ -6165,6 +6166,9 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) const choices_set_t ONLY_ERASURE_CHOICES = { EC_OVERWRITES, ERASURE_CODE_PROFILE }; + const choices_set_t ONLY_REPLICA_CHOICES = { + READ_RATIO + }; choices_set_t selected_choices; if (var == "all") { @@ -6182,6 +6186,10 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) selected_choices = subtract_second_from_first(selected_choices, ONLY_ERASURE_CHOICES); } + if(!p->is_replicated()) { + selected_choices = subtract_second_from_first(selected_choices, + ONLY_REPLICA_CHOICES); + } } else /* var != "all" */ { choices_map_t::const_iterator found = ALL_CHOICES.find(var); if (found == ALL_CHOICES.end()) { @@ -6210,6 +6218,15 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) goto reply; } + if (!p->is_replicated() && + ONLY_REPLICA_CHOICES.find(selected) + != ONLY_REPLICA_CHOICES.end()) { + ss << "pool '" << poolstr + << "' is not a replicated pool: variable not applicable"; + r = -EACCES; + goto reply; + } + if 
(pool_opts_t::is_opt_name(var) && !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) { ss << "option '" << var << "' is not set on pool '" << poolstr << "'"; @@ -6378,6 +6395,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) case DEDUP_TIER: case DEDUP_CHUNK_ALGORITHM: case DEDUP_CDC_CHUNK_SIZE: + case READ_RATIO: pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key; if (p->opts.is_set(key)) { if(*it == CSUM_TYPE) { @@ -6541,6 +6559,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op) case DEDUP_TIER: case DEDUP_CHUNK_ALGORITHM: case DEDUP_CDC_CHUNK_SIZE: + case READ_RATIO: for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) { if (i->second == *it) break; @@ -8314,6 +8333,11 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, return -EACCES; } + if (!p.is_replicated() && + (var == "read_ratio")) { + return -EACCES; + } + if (var == "size") { if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) { ss << "pool size change is disabled; you must unset nosizechange flag for the pool first"; @@ -8948,6 +8972,15 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, ss << "error parsing int value '" << val << "': " << interr; return -EINVAL; } + } else if (var == "read_ratio") { + if (interr.length()) { + ss << "error parsing int value '" << val << "': " << interr; + return -EINVAL; + } + if (n < 0 || n > 100) { + ss << "read_ratio must be between 0 and 100"; + return -ERANGE; + } } pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 7596723a0e3..2af3894f1e7 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1376,7 +1376,9 @@ static opt_mapping_t opt_mapping = boost::assign::map_list_of ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t( pool_opts_t::DEDUP_CDC_CHUNK_SIZE, pool_opts_t::INT)) ("pg_num_max", pool_opts_t::opt_desc_t( - pool_opts_t::PG_NUM_MAX, pool_opts_t::INT)); + pool_opts_t::PG_NUM_MAX, pool_opts_t::INT)) + 
("read_ratio", pool_opts_t::opt_desc_t( + pool_opts_t::READ_RATIO, pool_opts_t::INT)); bool pool_opts_t::is_opt_name(const std::string& name) { diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 66f39a91ac4..8f08e298ee5 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1101,6 +1101,7 @@ public: DEDUP_CHUNK_ALGORITHM, DEDUP_CDC_CHUNK_SIZE, PG_NUM_MAX, // max pg_num + READ_RATIO, // read ratio (percent) used by the read balancer [0-100] }; enum type_t { -- 2.39.5