From: Sridhar Seshasayee Date: Mon, 28 Jul 2025 11:09:34 +0000 (+0530) Subject: src/messages, osd: Calculate and set cost for subOpReads for mClock scheduler X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e79befea328e58366c43e490c129e2a467e6db84;p=ceph.git src/messages, osd: Calculate and set cost for subOpReads for mClock scheduler Previously, sub-op reads returned a hardcoded cost of 0, bypassing mClock's background bandwidth and tag calculation mechanisms. This allowed backfill operations to proceed un-metered, occasionally causing backend resource contention and driving up client tail latencies. Cost is calculated based on whether the complete chunk/shard or a subchunk needs to be read. The possible cases are: 1. Read the complete chunk aligned length: - Cost is set to the length of the chunk aligned extent size. 2. Fragmented reads: - Consider the subchunk length and count to calculate the cost. - compute_cost evaluates the exact layout of fragmented shard bytes on disk by summing up the active subchunk allocations exactly once (`fragmented_shard_bytes += k.second * subchunk_size`). - Linear Extent Scaling: Scale the baseline footprint cleanly by multiplying it against the true count of read extents (`tl.size()`), achieving a highly efficient O(N) time complexity. This linear cost model is compatible with pools running with 'allow_ec_optimizations' set to true. Under the FastEC optimized pipeline, most operations are unified and bypass fragment slicing, meaning requests will primarily match the Case 1 chunk-aligned path. In Case 2 where applicable, the O(N) loop ensures that cost will scale proportionally according to the layout. It is important to note that the amount of data to read was set to an upper bound defined by osd_recovery_max_chunk (8 MiB) and was rounded up to the stripe width. The reason for setting a higher than actual upper bound is that there may be cases where the object doesn't have the xattrs yet to determine its size. Therefore, the amount to read was ultimatly set to ~(8 MiB / k) where k is the number of data shards. This can cause mClock to prolong the recovery times as items stay longer in the queue. To address this, the amount to read is set to the remaining length of the object to recover if the object size is known. Otherwise, the amount to read is set to the recovery chunk size as before. Therefore, in some cases, only the first recovery read could be costly if the object context is not known. The MOSDECSubOpRead class introduces the following: - cost member. This necessitates an increment to the HEAD_VERSION and appropriate handling within the encode and decode methods. - compute_cost() that is called when creating the message by ECCommonL::ReadPipeline::do_read_op(). This calls into ECSubRead::cost() that performs the actual calculations to set the cost based on the cases mentioned above. - The same sequence applies to the EC optimized path in ECCommon::ReadPipeline::do_read_op(). Fixes: https://tracker.ceph.com/issues/71655 Signed-off-by: Sridhar Seshasayee --- diff --git a/src/messages/MOSDECSubOpRead.h b/src/messages/MOSDECSubOpRead.h index 22614a910e9..9d57cef1696 100644 --- a/src/messages/MOSDECSubOpRead.h +++ b/src/messages/MOSDECSubOpRead.h @@ -21,16 +21,26 @@ class MOSDECSubOpRead : public MOSDFastDispatchOp { private: - static constexpr int HEAD_VERSION = 3; + static constexpr int HEAD_VERSION = 4; static constexpr int COMPAT_VERSION = 1; public: spg_t pgid; epoch_t map_epoch = 0, min_epoch = 0; ECSubRead op; + uint64_t cost = 0; + /** + * Calculate the cost of the SubOp read operation for mClock scheduler. + * + * @param *cct: *CephContext + * @param pair&: subchunk_count and subchunk_size + */ + void compute_cost(CephContext *cct, std::pair &subchunk_info) { + cost = op.cost(cct, subchunk_info); + } int get_cost() const override { - return 0; + return cost; } epoch_t get_map_epoch() const override { return map_epoch; @@ -58,6 +68,9 @@ public: } else { min_epoch = map_epoch; } + if (header.version >= 4) { + decode(cost, p); + } } void encode_payload(uint64_t features) override { @@ -67,6 +80,9 @@ public: encode(op, payload, features); encode(min_epoch, payload); encode_trace(payload, features); + if (header.version >= 4) { + encode(cost, payload); + } } std::string_view get_type_name() const override { return "MOSDECSubOpRead"; } diff --git a/src/osd/ECBackendL.cc b/src/osd/ECBackendL.cc index 487c7d53d59..7d98cad28f5 100644 --- a/src/osd/ECBackendL.cc +++ b/src/osd/ECBackendL.cc @@ -573,7 +573,18 @@ void ECBackendL::RecoveryBackend::continue_recovery_op( ceph_assert(!op.recovery_progress.data_complete); set want(op.missing_on_shards.begin(), op.missing_on_shards.end()); uint64_t from = op.recovery_progress.data_recovered_to; + /* When beginning recovery, the OI may not be known. As such the object + * size is not known. For the first read, attempt to read the default + * size. If this is larger than the object sizes, then the OSD will + * return truncated reads. If the object size is known, then attempt + * correctly sized reads, capped at recovery chunk size. + * (Ref: ECCommon.cc -> ECCommon::RecoveryBackend::continue_recovery_op()) + */ uint64_t amount = get_recovery_chunk_size(); + if (op.obc) { + uint64_t remaining = op.obc->obs.oi.size - from; + amount = std::min(remaining, amount); + } if (op.recovery_progress.first && op.obc) { if (auto [r, attrs, size] = ecbackend->get_attrs_n_size_from_disk(op.hoid); diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc index 9982b0ffc05..cb177eb5517 100644 --- a/src/osd/ECCommon.cc +++ b/src/osd/ECCommon.cc @@ -524,6 +524,9 @@ void ECCommon::ReadPipeline::do_read_op(ReadOp &rop) { std::optional local_read_op; std::vector> m; m.reserve(messages.size()); + std::pair subchunk_info = + std::make_pair(ec_impl->get_sub_chunk_count(), + sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count()); for (auto &&[pg_shard, read]: messages) { rop.in_progress.insert(pg_shard); shard_to_read_map[pg_shard].insert(rop.tid); @@ -547,6 +550,7 @@ void ECCommon::ReadPipeline::do_read_op(ReadOp &rop) { msg->trace.init("ec sub read", nullptr, &rop.trace); msg->trace.keyval("shard", pg_shard.shard.id); } + msg->compute_cost(cct, subchunk_info); m.push_back(std::make_pair(pg_shard.osd, msg)); dout(10) << __func__ << ": will send msg " << *msg << " to osd." << pg_shard.osd << dendl; diff --git a/src/osd/ECCommonL.cc b/src/osd/ECCommonL.cc index 8dc08df190a..e60cd15cbe5 100644 --- a/src/osd/ECCommonL.cc +++ b/src/osd/ECCommonL.cc @@ -468,6 +468,9 @@ void ECCommonL::ReadPipeline::do_read_op(ReadOp &op) std::vector> m; m.reserve(messages.size()); + std::pair subchunk_info = + std::make_pair(ec_impl->get_sub_chunk_count(), + sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count()); for (map::iterator i = messages.begin(); i != messages.end(); ++i) { @@ -489,6 +492,7 @@ void ECCommonL::ReadPipeline::do_read_op(ReadOp &op) msg->trace.init("ec sub read", nullptr, &op.trace); msg->trace.keyval("shard", i->first.shard.id); } + msg->compute_cost(cct, subchunk_info); m.push_back(std::make_pair(i->first.osd, msg)); } if (!m.empty()) { @@ -1179,4 +1183,4 @@ ECUtilL::HashInfoRef ECCommonL::UnstableHashInfoRegistry::get_hash_info( } } -END_IGNORE_DEPRECATED \ No newline at end of file +END_IGNORE_DEPRECATED diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc index 206378fe1f0..36480c20439 100644 --- a/src/osd/ECMsgTypes.cc +++ b/src/osd/ECMsgTypes.cc @@ -15,6 +15,8 @@ #include "ECMsgTypes.h" +#include "common/ceph_context.h" + using std::list; using std::make_pair; using std::map; @@ -266,6 +268,66 @@ void ECSubRead::decode(bufferlist::const_iterator &bl) DECODE_FINISH(bl); } +/** + * Calculate the cost of the SubOp read operation for mClock scheduler. + * Cost is calculated based on whether the complete chunk/shard + * or a subchunk needs to be read: + * Case 1. Read the complete chunk aligned length: + * - Cost is set to the length of the chunk aligned extent size. + * Case 2. Fragmented reads: + * - Cost is set by considering the subchunk length and count. + * + * Note: To retain the legacy behavior, a cost of '0' is returned as + * before for WeightedPriorityQueue scheduler. + */ +uint64_t ECSubRead::cost(CephContext *cct, std::pair& subchunk_info) +{ + uint64_t total_cost = 0; + /** + * While the cost is calculated by the primary shard with mClock + * scheduler being active, the replica shard could still be + * running on the legacy WPQ scheduler. In such a case, the + * replica shard's decoding logic would set the cost to 0, which + * is consistent with what the legacy WPQ scheduler expects. + * + * In the converse case, the primary shard running with the + * legacy WPQ scheduler sends a cost of 0. The replica shard + * running with mClock scheduler will interpret this and set + * the cost to 1 accordingly. + */ + if (cct->_conf->osd_op_queue != "mclock_scheduler") { + return total_cost; // Legacy behavior for WPQ scheduler + } + + uint64_t subchunk_size = subchunk_info.second; + + for (auto &&[hoid, tl] : to_read) { + auto it = subchunks.find(hoid); + if (it == subchunks.end()) continue; + + auto &sc = it->second; + if (sc.empty()) continue; + + // Case 1: Optimized / Complete chunk aligned read + if (sc.size() == 1 && sc.front().second == subchunk_info.first) { + for ([[maybe_unused]] auto &&[offset, len, flags] : tl) { + total_cost += len; + } + continue; + } + + // Case 2: Fragmented / Subchunk Reads + uint64_t fragmented_shard_bytes = 0; + for (auto &&k : sc) { + fragmented_shard_bytes += (uint64_t)k.second * subchunk_size; + } + total_cost += fragmented_shard_bytes * tl.size(); + } + + // Safety Boundary: mClock requires non-zero costs for tracking active ops + return std::max(total_cost, 1ULL); +} + std::ostream &operator<<( std::ostream &lhs, const ECSubRead &rhs) { diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h index 72aace2335c..282ba480dc8 100644 --- a/src/osd/ECMsgTypes.h +++ b/src/osd/ECMsgTypes.h @@ -119,6 +119,14 @@ struct ECSubRead { std::map>> subchunks; std::set omap_headers_to_read; std::map> omap_read_from; + /** + * Calculate the cost of the SubOp read operation for mClock scheduler. + * + * @param *cct: *CephContext + * @param pair&: subchunk_count and subchunk_size + * @return uint64_t - Cost of EC SubOpRead (size in Bytes) + */ + uint64_t cost(CephContext *cct, std::pair& subchunk_info); void encode(ceph::buffer::list &bl, uint64_t features) const; void decode(ceph::buffer::list::const_iterator &bl); void dump(ceph::Formatter *f) const;