From: Kamoltat
Date: Fri, 15 Mar 2024 22:01:06 +0000 (+0000)
Subject: Allow setting peering_crush_bucket_{count|target|barrier}
X-Git-Tag: v19.2.1~260^2~3
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=072b08eddccd4654d42b1d65b36ed4d3f12c126c;p=ceph.git

Allow setting peering_crush_bucket_{count|target|barrier}

In the command `ceph osd pool stretch set` the user can set the
values of `peering_crush_bucket_{count|target|barrier}`. This then
allows the use of `calc_replicated_acting_stretch`: with
`peering_crush_bucket_count != 0` the pool is a stretch pool, and we
can handle pg_temp better by setting barriers and limits on how many
OSDs may be in a pg_temp. This enables the specified pool to handle
pg_temp properly during create_acting, as a stretch pool should.

The user can also run the command `osd pool stretch show <pool>` to
display all the stretch-related information for the pool:

  pool: cephfs.a.data
  pool_id: 3
  is_stretch_pool: 1
  peering_crush_bucket_count: 3
  peering_crush_bucket_target: 3
  peering_crush_bucket_barrier: 8
  crush_rule: replicated_rule_custom
  size: 3
  min_size: 2

The user can also unset the stretch pool with the command
`osd pool stretch unset <pool>`; however, the pool must be a stretch
pool.

Fixes: https://tracker.ceph.com/issues/64802

Signed-off-by: Kamoltat

(cherry picked from commit 661e8e22cff8763cf8775f204dda4490e464db2c)
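For illustration, an end-to-end invocation might look like the following. This is a sketch: the pool and rule names are taken from the example output above, the numeric values are hypothetical, and `datacenter` is assumed to be the barrier bucket type (in the default CRUSH hierarchy the `datacenter` type has id 8, matching the `peering_crush_bucket_barrier: 8` shown above). The argument order follows the MonCommands.h definitions in the diff below.

  # Stretch the pool across 3 'datacenter' CRUSH buckets:
  ceph osd pool stretch set cephfs.a.data 3 3 datacenter \
      replicated_rule_custom 3 2

  # Inspect the stretch settings:
  ceph osd pool stretch show cephfs.a.data

  # Revert to a regular pool (the pool must currently be a stretch pool):
  ceph osd pool stretch unset cephfs.a.data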
f->dump_string("peering_crush_bucket_barrier", stringify(osdmap.crush->get_type_name(p->peering_crush_bucket_barrier))); + if (osdmap.crush->rule_exists(p->get_crush_rule())) { + f->dump_string("crush_rule", osdmap.crush->get_rule_name(p->get_crush_rule())); + } else { + f->dump_string("crush_rule", stringify(p->get_crush_rule())); + // warn if the rule does not exist + mon.clog->warn() << __func__ << " pool " << poolstr << " crush rule " << stringify(p->get_crush_rule()) << " does not exist"; + } + f->dump_int("size", p->get_size()); + f->dump_int("min_size", p->get_min_size()); + f->close_section(); + f->flush(rdata); + } else { + stringstream ss; + ss << "pool: " << poolstr << "\n"; + ss << "pool_id: " << pool << "\n"; + ss << "is_stretch_pool: " << p->is_stretch_pool() << "\n"; + ss << "peering_crush_bucket_count: " << p->peering_crush_bucket_count << "\n"; + ss << "peering_crush_bucket_target: " << p->peering_crush_bucket_target << "\n"; + ss << "peering_crush_bucket_barrier: " << osdmap.crush->get_type_name(p->peering_crush_bucket_barrier) << "\n"; + if (osdmap.crush->rule_exists(p->get_crush_rule())) { + ss << "crush_rule: " << osdmap.crush->get_rule_name(p->get_crush_rule()) << "\n"; + } else { + ss << "crush_rule: " << p->get_crush_rule() << "\n"; + // warn if the rule does not exist + mon.clog->warn() << __func__ << " pool " << poolstr << " crush rule " << stringify(p->get_crush_rule()) << " does not exist"; + } + ss << "size: " << p->get_size() << "\n"; + ss << "min_size: " << p->get_min_size() << "\n"; + rdata.append(ss.str()); + } + } + r = 0; } else if (prefix == "osd pool get") { string poolstr; cmd_getval(cmdmap, "pool", poolstr); @@ -9046,6 +9102,149 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix, return _command_pool_application(prefix, cmdmap, ss, nullptr, true); } +int OSDMonitor::prepare_command_pool_stretch_set(const cmdmap_t& cmdmap, + stringstream& ss) +{ + string pool_name; + cmd_getval(cmdmap, "pool", pool_name); + int64_t pool = osdmap.lookup_pg_pool_name(pool_name); + if (pool < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + return -ENOENT; + } + + pg_pool_t p = *osdmap.get_pg_pool(pool); + if (pending_inc.new_pools.count(pool)) + p = pending_inc.new_pools[pool]; + + int64_t bucket_count = cmd_getval_or(cmdmap, "peering_crush_bucket_count", 0); + if (bucket_count <= 0) { + ss << "peering_crush_bucket_count must be >= 0! FYI use 'ceph osd pool stretch unset' to unset the stretch values"; + return -EINVAL; + } + + int64_t bucket_target = cmd_getval_or(cmdmap, "peering_crush_bucket_target", 0); + if (bucket_target <= 0) { + ss << "peering_crush_bucket_target must be >= 0! 
@@ -9046,6 +9102,149 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
   return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
 }
 
+int OSDMonitor::prepare_command_pool_stretch_set(const cmdmap_t& cmdmap,
+                                                 stringstream& ss)
+{
+  string pool_name;
+  cmd_getval(cmdmap, "pool", pool_name);
+  int64_t pool = osdmap.lookup_pg_pool_name(pool_name);
+  if (pool < 0) {
+    ss << "unrecognized pool '" << pool_name << "'";
+    return -ENOENT;
+  }
+
+  pg_pool_t p = *osdmap.get_pg_pool(pool);
+  if (pending_inc.new_pools.count(pool))
+    p = pending_inc.new_pools[pool];
+
+  int64_t bucket_count = cmd_getval_or<int64_t>(cmdmap, "peering_crush_bucket_count", 0);
+  if (bucket_count <= 0) {
+    ss << "peering_crush_bucket_count must be > 0! FYI use 'ceph osd pool stretch unset' to unset the stretch values";
+    return -EINVAL;
+  }
+
+  int64_t bucket_target = cmd_getval_or<int64_t>(cmdmap, "peering_crush_bucket_target", 0);
+  if (bucket_target <= 0) {
+    ss << "peering_crush_bucket_target must be > 0! FYI use 'ceph osd pool stretch unset' to unset the stretch values";
+    return -EINVAL;
+  }
+
+  int bucket_barrier = 0;
+  string bucket_barrier_str;
+  cmd_getval(cmdmap, "peering_crush_bucket_barrier", bucket_barrier_str);
+  CrushWrapper& crush = _get_stable_crush();
+  if (bucket_barrier_str.empty()) {
+    ss << "peering_crush_bucket_barrier must be provided";
+    return -EINVAL;
+  } else {
+    bucket_barrier = crush.get_type_id(bucket_barrier_str);
+    if (bucket_barrier < 0) {
+      ss << "peering_crush_bucket_barrier " << bucket_barrier_str << " does not exist";
+      return -EINVAL;
+    } else if (bucket_barrier == 0) {
+      ss << "peering_crush_bucket_barrier " << bucket_barrier_str << " is not a bucket type";
+      return -EINVAL;
+    }
+  }
+  // Check whether peering_crush_bucket_count or peering_crush_bucket_target
+  // exceeds the number of subtrees of the specified bucket_barrier in the cluster.
+  vector<int> subtrees;
+  bool sure = false;
+  crush.get_subtree_of_type(bucket_barrier, &subtrees);
+  cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+  if (static_cast<size_t>(bucket_count) > subtrees.size()) {
+    if (!sure) {
+      ss << "peering_crush_bucket_count=" << bucket_count
+         << " > " << bucket_barrier_str << "=" << subtrees.size()
+         << " can lead to data unavailability, pass --yes-i-really-mean-it to proceed";
+      return -EPERM;
+    }
+  } else if (static_cast<size_t>(bucket_target) > subtrees.size()) {
+    if (!sure) {
+      ss << "peering_crush_bucket_target=" << bucket_target
+         << " > " << bucket_barrier_str << "=" << subtrees.size()
+         << " can lead to data unavailability, pass --yes-i-really-mean-it to proceed";
+      return -EPERM;
+    }
+  }
+
+  string crush_rule_str;
+  cmd_getval(cmdmap, "crush_rule", crush_rule_str);
+  if (crush_rule_str.empty()) {
+    ss << "crush_rule must be provided";
+    return -EINVAL;
+  }
+  int crush_rule = crush.get_rule_id(crush_rule_str);
+  if (crush_rule < 0) {
+    ss << "crush rule " << crush_rule_str << " does not exist";
+    return -ENOENT;
+  }
+  if (!crush.rule_valid_for_pool_type(crush_rule, p.get_type())) {
+    ss << "crush rule " << crush_rule << " type does not match pool";
+    return -EINVAL;
+  }
+
+  int64_t pool_size = cmd_getval_or<int64_t>(cmdmap, "size", 0);
+  if (pool_size < 0) {
+    ss << "pool size must be non-negative";
+    return -EINVAL;
+  }
+
+  int64_t pool_min_size = cmd_getval_or<int64_t>(cmdmap, "min_size", 0);
+  if (pool_min_size < 0) {
+    ss << "pool min_size must be non-negative";
+    return -EINVAL;
+  }
+
+  p.peering_crush_bucket_count = static_cast<uint32_t>(bucket_count);
+  p.peering_crush_bucket_target = static_cast<uint32_t>(bucket_target);
+  p.peering_crush_bucket_barrier = static_cast<uint32_t>(bucket_barrier);
+  p.crush_rule = static_cast<__u8>(crush_rule);
+  p.size = static_cast<__u8>(pool_size);
+  p.min_size = static_cast<__u8>(pool_min_size);
+  p.last_change = pending_inc.epoch;
+  pending_inc.new_pools[pool] = p;
+  ss << "pool " << pool_name << " stretch values are set successfully";
+  return 0;
+}
+
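For illustration, the subtree guard above behaves roughly as follows on a cluster that has only two `datacenter` buckets (names and values are hypothetical; the error text mirrors the message built in the code):

  # Rejected: bucket_count=3 exceeds the 2 available 'datacenter' subtrees.
  ceph osd pool stretch set cephfs.a.data 3 3 datacenter \
      replicated_rule_custom 3 2
  # Error EPERM: peering_crush_bucket_count=3 > datacenter=2 can lead to
  # data unavailability, pass --yes-i-really-mean-it to proceed

  # Proceed anyway:
  ceph osd pool stretch set cephfs.a.data 3 3 datacenter \
      replicated_rule_custom 3 2 --yes-i-really-mean-it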
stretch pool"; + return -ENOENT; + } + + // unset stretch values + p.peering_crush_bucket_count = 0; + p.peering_crush_bucket_target = 0; + p.peering_crush_bucket_barrier = 0; + p.last_change = pending_inc.epoch; + pending_inc.new_pools[pool] = p; + ss << "pool " << pool_name + << " is no longer a stretch pool, all stretch values are unset successfully"; + return 0; +} + int OSDMonitor::preprocess_command_pool_application(const string &prefix, const cmdmap_t& cmdmap, stringstream& ss, @@ -13902,6 +14101,28 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, } else { goto update; } + } else if (prefix == "osd pool stretch set") { + err = prepare_command_pool_stretch_set(cmdmap, ss); + if (err == -EAGAIN) + goto wait; + if (err < 0) + goto reply_no_propose; + + getline(ss, rs); + wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd pool stretch unset") { + err = prepare_command_pool_stretch_unset(cmdmap, ss); + if (err == -EAGAIN) + goto wait; + if (err < 0) + goto reply_no_propose; + + getline(ss, rs); + wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; } else if (prefix == "osd force-create-pg") { pg_t pgid; string pgidstr; diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 398daf9243e10..8e01a0e7e1e25 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -735,6 +735,10 @@ public: const cmdmap_t& cmdmap, std::stringstream& ss, bool *modified); + int prepare_command_pool_stretch_set(const cmdmap_t& cmdmap, + std::stringstream& ss); + int prepare_command_pool_stretch_unset(const cmdmap_t& cmdmap, + std::stringstream& ss); int _command_pool_application(const std::string &prefix, const cmdmap_t& cmdmap, std::stringstream& ss, diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 2c6028f9cb416..0bd54acb96c7e 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1568,6 +1568,7 @@ void pg_pool_t::dump(Formatter *f) const f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target); f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier); f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member); + f->dump_bool("is_stretch_pool", is_stretch_pool()); f->dump_int("object_hash", get_object_hash()); f->dump_string("pg_autoscale_mode", get_pg_autoscale_mode_name(pg_autoscale_mode));