From 5beee2ad46cfeb8ffc70d106c1180f531e455e3e Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Fri, 15 Aug 2025 13:15:07 +0300 Subject: [PATCH] os/bluestore: enforce extent split on shard boundary Partially fixes: https://tracker.ceph.com/issues/70390 Signed-off-by: Igor Fedotov --- src/os/bluestore/BlueStore.cc | 109 ++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 45 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 24a4e3848fc..9cd84051957 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -3850,7 +3850,7 @@ void BlueStore::ExtentMap::reshard_action( if (extent->logical_offset >= needs_reshard_end) { break; } - dout(30) << " extent " << *extent << dendl; + dout(30) << __func__ << " extent " << *extent << dendl; while (extent->logical_offset >= shard_end) { shard_start = shard_end; ceph_assert(current_shard != end_shard); @@ -3865,58 +3865,78 @@ void BlueStore::ExtentMap::reshard_action( } if (extent->blob_escapes_range(shard_start, shard_end - shard_start)) { - if (!extent->blob->is_spanning()) { + BlobRef b = extent->blob; + uint32_t bstart = extent->blob_start(); + uint32_t bend = extent->blob_end(); + if (!b->is_spanning()) { // We have two options: (1) split the blob into pieces at the // shard boundaries (and adjust extents accordingly), or (2) // mark it spanning. We prefer to cut the blob if we can. Note that // we may have to split it multiple times--potentially at every // shard boundary. - bool must_span = false; - BlobRef b = extent->blob; + auto _make_spanning = [&](BlobRef& b) { + auto bid = allocate_spanning_blob_id(); + b->id = bid; + spanning_blob_map[b->id] = b; + dout(20) << __func__ << " adding spanning " << *b << dendl; + if (!was_too_many_blobs_check && + too_many_blobs_threshold && + spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) { + + was_too_many_blobs_check = true; + for (size_t i = 0; i < dumped_onodes.size(); ++i) { + if (dumped_onodes[i].first == onode->oid) { + oid_slot = &dumped_onodes[i]; + break; + } + if (!oldest_slot || (oldest_slot && + dumped_onodes[i].second < oldest_slot->second)) { + oldest_slot = &dumped_onodes[i]; + } + } + } + }; if (b->can_split()) { - uint32_t bstart = extent->blob_start(); - uint32_t bend = extent->blob_end(); + auto bstart1 = bstart; for (const auto& sh : shards) { - if (bstart < sh.shard_info->offset && + if (bstart1 < sh.shard_info->offset && bend > sh.shard_info->offset) { - uint32_t blob_offset = sh.shard_info->offset - bstart; + uint32_t blob_offset = sh.shard_info->offset - bstart1; if (b->can_split_at(blob_offset)) { dout(20) << __func__ << " splitting blob, bstart 0x" - << std::hex << bstart << " blob_offset 0x" + << std::hex << bstart1 << " blob_offset 0x" << blob_offset << std::dec << " " << *b << dendl; b = split_blob(b, blob_offset, sh.shard_info->offset); // switch b to the new right-hand side, in case it // *also* has to get split. - bstart += blob_offset; + bstart1 = sh.shard_info->offset; onode->c->store->logger->inc(l_bluestore_blob_split); } else { - must_span = true; + _make_spanning(b); break; } } } } else { - must_span = true; + _make_spanning(b); } - if (must_span) { - auto bid = allocate_spanning_blob_id(); - b->id = bid; - spanning_blob_map[b->id] = b; - dout(20) << __func__ << " adding spanning " << *b << dendl; - if (!was_too_many_blobs_check && - too_many_blobs_threshold && - spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) { - - was_too_many_blobs_check = true; - for (size_t i = 0; i < dumped_onodes.size(); ++i) { - if (dumped_onodes[i].first == onode->oid) { - oid_slot = &dumped_onodes[i]; - break; - } - if (!oldest_slot || (oldest_slot && - dumped_onodes[i].second < oldest_slot->second)) { - oldest_slot = &dumped_onodes[i]; - } + } // if (!extent->blob->is_spanning()) + // Make sure extent with a spanning blob doesn't span over shard boundary + if (extent->blob->is_spanning()) { + BlobRef b = extent->blob; + uint32_t bstart = extent->blob_start(); + for (const auto& sh : shards) { + if (bstart < sh.shard_info->offset && bend > sh.shard_info->offset) { + uint32_t blob_offset = sh.shard_info->offset - bstart; + auto pos = sh.shard_info->offset; + if (extent->logical_offset < pos && extent->logical_end() > pos) { + // split extent + size_t left = pos - extent->logical_offset; + Extent* ne = new Extent(pos, blob_offset, extent->length - left, b); + extent_map.insert(*ne); + extent->length = left; + dout(20) << __func__ << " split " << *extent << dendl; + dout(20) << __func__ << " to " << *ne << dendl; } } } @@ -3925,7 +3945,7 @@ void BlueStore::ExtentMap::reshard_action( if (extent->blob->is_spanning()) { spanning_blob_map.erase(extent->blob->id); extent->blob->id = -1; - dout(30) << __func__ << " un-spanning " << *extent->blob << dendl; + dout(20) << __func__ << " un-spanning " << *extent->blob << dendl; } } } @@ -3977,7 +3997,6 @@ bool BlueStore::ExtentMap::encode_some( unsigned n = 0; size_t bound = 0; - bool must_reshard = false; uint32_t prev_offset_end = 0; for (auto p = start; p != extent_map.end() && p->logical_offset < end; @@ -3986,25 +4005,28 @@ bool BlueStore::ExtentMap::encode_some( if (complain_extent_overlap) { if (p->logical_offset < prev_offset_end) { using P = BlueStore::printer; - dout(-1) << __func__ << " extents overlap: " << std::endl - << onode->print(P::NICK + P::SDISK + P::SUSE + P::SBUF) << dendl; - ceph_abort(); + dout(-1) << __func__ << " extents overlap: " + << std::hex << offset <<"~" << length + << " " << p->logical_offset <<"~" << p->length + << std::dec << std::endl + << onode->print(P::NICK + P::SDISK + P::SUSE + P::SBUF) + << dendl; + ceph_abort_msg("extents overlaps"); } prev_offset_end = p->logical_end(); } p->blob->last_encoded_id = -1; if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) { - dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length + dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << " hit new spanning blob " << *p << dendl; request_reshard(p->blob_start(), p->blob_end()); - must_reshard = true; + return true; } else if (p->blob->is_spanning() && p->logical_end() > end) { - dout(30) << __func__ << std::hex << offset << "~" << length + dout(20) << __func__ << std::hex << offset << "~" << length << std::dec << " extent stands out " << *p << dendl; request_reshard(p->blob_start(), p->blob_end()); - must_reshard = true; - } - if (!must_reshard) { + return true; + } else { denc_varint(0, bound); // blobid denc_varint(0, bound); // logical_offset denc_varint(0, bound); // len @@ -4017,9 +4039,6 @@ bool BlueStore::ExtentMap::encode_some( false); } } - if (must_reshard) { - return true; - } denc(struct_v, bound); denc_varint(0, bound); // number of extents -- 2.39.5