From 15bd14cbed904bbea6fc6f13464db849b8f3dd06 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Wed, 7 May 2025 08:30:11 +0000 Subject: [PATCH] os/bluestore/recompression: Estimator omits large compressed blobs The problem was that Estimator accepted large compressed blobs for recompression. The fix is to discourage such actions by penalizing compressed blobs based on their size. In effect, a small compressed blob is likely to be recompressed, while a large compressed blob will not be. Fixes: https://tracker.ceph.com/issues/71244 Signed-off-by: Adam Kupczyk (cherry picked from commit bbc9e961e9046949138bb3d70e8dd91761fcb088) --- src/os/bluestore/BlueStore.cc | 5 ++--- src/os/bluestore/Compression.cc | 36 ++++++++++++++++++++++++++++----- src/os/bluestore/Compression.h | 11 +++++++--- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 5b6ff47ef1571..2e627945e7174 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -17640,6 +17640,7 @@ int BlueStore::_do_write_v2_compressed( o->extent_map.fault_range(db, scan_left, scan_right - scan_left); if (!c->estimator) c->estimator.reset(create_estimator()); Estimator* estimator = c->estimator.get(); + estimator->set_wctx(&wctx); Scanner scanner(this); scanner.write_lookaround(o.get(), offset, length, scan_left, scan_right, estimator); std::vector regions; @@ -17674,9 +17675,7 @@ int BlueStore::_do_write_v2_compressed( int32_t disk_for_compressed; int32_t disk_for_raw; uint32_t au_size = min_alloc_size; - uint32_t max_blob_size = c->pool_opts.value_or( - pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, (int64_t)comp_max_blob_size.load()); - disk_for_compressed = estimator->split_and_compress(wctx.compressor, max_blob_size, data_bl, bd); + disk_for_compressed = estimator->split_and_compress(data_bl, bd); disk_for_raw = p2roundup(i.offset + i.length, au_size) - p2align(i.offset, au_size); BlueStore::Writer wr(this, txc, &wctx, o); if
(disk_for_compressed < disk_for_raw) { diff --git a/src/os/bluestore/Compression.cc b/src/os/bluestore/Compression.cc index ca3befe3cf190..3b4b5f5692f67 100644 --- a/src/os/bluestore/Compression.cc +++ b/src/os/bluestore/Compression.cc @@ -52,23 +52,41 @@ using P = BlueStore::printer; void Estimator::cleanup() { + wctx = nullptr; new_size = 0; uncompressed_size = 0; compressed_occupied = 0; compressed_size = 0; + compressed_area = 0; total_uncompressed_size = 0; total_compressed_occupied = 0; total_compressed_size = 0; actual_compressed = 0; actual_compressed_plus_pad = 0; extra_recompress.clear(); + single_compressed_blob = true; + last_blob = nullptr; } + +void Estimator::set_wctx(const WriteContext* wctx) +{ + this->wctx = wctx; +} + inline void Estimator::batch(const BlueStore::Extent* e, uint32_t gain) { const Blob *h_Blob = &(*e->blob); const bluestore_blob_t &h_bblob = h_Blob->get_blob(); if (h_bblob.is_compressed()) { + if (last_blob) { + if (h_Blob != last_blob) { + single_compressed_blob = false; + } + } else { + last_blob = h_Blob; + } compressed_size += e->length * h_bblob.get_compressed_payload_length() / h_bblob.get_logical_length(); + compressed_area += e->length; compressed_occupied += gain; } else { uncompressed_size += e->length; @@ -82,6 +100,16 @@ inline bool Estimator::is_worth() { uint32_t cost = uncompressed_size * expected_compression_factor + compressed_size * expected_recompression_error; + if (uncompressed_size == 0 && single_compressed_blob) { + // The special case if all extents are from compressed blobs. + // We want to avoid the case of recompressing into exactly the same. + // The cost should increase proportionally to blob size; + // the rationale is that recompressing small blob is likely to provide gain, + // but recompressing whole large blob isn't. 
+ uint64_t padding_size = p2nphase(compressed_size, bluestore->min_alloc_size); + uint32_t split_tax = padding_size * compressed_area / wctx->target_blob_size; + cost += split_tax; + } uint32_t gain = uncompressed_size + compressed_occupied; double need_ratio = bluestore->cct->_conf->bluestore_recompression_min_gain; bool take = gain > cost * need_ratio; @@ -153,15 +181,13 @@ void Estimator::get_regions(std::vector& regions) } int32_t Estimator::split_and_compress( - CompressorRef compr, - uint32_t max_blob_size, ceph::buffer::list& data_bl, Writer::blob_vec& bd) { uint32_t au_size = bluestore->min_alloc_size; uint32_t size = data_bl.length(); ceph_assert(size > 0); - uint32_t blobs = (size + max_blob_size - 1) / max_blob_size; + uint32_t blobs = (size + wctx->target_blob_size - 1) / wctx->target_blob_size; uint32_t blob_size = p2roundup(size / blobs, au_size); std::vector blob_sizes(blobs); for (auto& i: blob_sizes) { @@ -179,10 +205,10 @@ int32_t Estimator::split_and_compress( // FIXME: memory alignment here is bad bufferlist t; std::optional compressor_message; - int r = compr->compress(bd.back().object_data, t, compressor_message); + int r = wctx->compressor->compress(bd.back().object_data, t, compressor_message); ceph_assert(r == 0); bluestore_compression_header_t chdr; - chdr.type = compr->get_type(); + chdr.type = wctx->compressor->get_type(); chdr.length = t.length(); chdr.compressor_message = compressor_message; encode(chdr, bd.back().disk_data); diff --git a/src/os/bluestore/Compression.h b/src/os/bluestore/Compression.h index ed78150019141..850a96bd70f06 100644 --- a/src/os/bluestore/Compression.h +++ b/src/os/bluestore/Compression.h @@ -22,6 +22,9 @@ public: Estimator(BlueStore* bluestore) :bluestore(bluestore) {} + // Each estimator run needs specific WriteContext + void set_wctx(const WriteContext* wctx); + // Inform estimator that an extent is a candidate for recompression. // Estimator has to calculate (guess) the cost (size) of the referenced data. 
// 'gain' is the size that will be released should extent be recompressed. @@ -45,8 +48,6 @@ public: void get_regions(std::vector& regions); int32_t split_and_compress( - CompressorRef compr, - uint32_t max_blob_size, ceph::buffer::list& data_bl, Writer::blob_vec& bd); @@ -57,9 +58,11 @@ private: double expected_compression_factor = 0.5; double expected_recompression_error = 1.1; double expected_pad_expansion = 1.1; + const WriteContext* wctx = nullptr; uint32_t new_size = 0; // fresh data to write uint32_t uncompressed_size = 0; // data that was not compressed - uint32_t compressed_size = 0; // data of compressed size + uint32_t compressed_size = 0; // estimated size of compressed data + uint32_t compressed_area = 0; // area that is compressed uint32_t compressed_occupied = 0; // disk size that will be freed uint32_t total_uncompressed_size = 0; uint32_t total_compressed_size = 0; @@ -68,6 +71,8 @@ private: uint32_t actual_compressed = 0; uint32_t actual_compressed_plus_pad = 0; std::map extra_recompress; + bool single_compressed_blob = true; + const Blob* last_blob = nullptr; // Prepare for new write void cleanup(); }; -- 2.39.5