From: Adam Kupczyk Date: Wed, 7 May 2025 08:30:11 +0000 (+0000) Subject: os/bluestore/recompression: Estimator omits large compressed blobs X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bbc9e961e9046949138bb3d70e8dd91761fcb088;p=ceph.git os/bluestore/recompression: Estimator omits large compressed blobs The problem was that Estimator accepted large compressed blobs for recompression. The fix is to discourage such actions by penalizing compressed blobs based on their size. In effect, a small compressed blob is likely to be recompressed, and a large compressed blob will not be. Fixes: https://tracker.ceph.com/issues/71168 Signed-off-by: Adam Kupczyk --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 5b6ff47ef15..2e627945e71 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -17640,6 +17640,7 @@ int BlueStore::_do_write_v2_compressed( o->extent_map.fault_range(db, scan_left, scan_right - scan_left); if (!c->estimator) c->estimator.reset(create_estimator()); Estimator* estimator = c->estimator.get(); + estimator->set_wctx(&wctx); Scanner scanner(this); scanner.write_lookaround(o.get(), offset, length, scan_left, scan_right, estimator); std::vector regions; @@ -17674,9 +17675,7 @@ int BlueStore::_do_write_v2_compressed( int32_t disk_for_compressed; int32_t disk_for_raw; uint32_t au_size = min_alloc_size; - uint32_t max_blob_size = c->pool_opts.value_or( - pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, (int64_t)comp_max_blob_size.load()); - disk_for_compressed = estimator->split_and_compress(wctx.compressor, max_blob_size, data_bl, bd); + disk_for_compressed = estimator->split_and_compress(data_bl, bd); disk_for_raw = p2roundup(i.offset + i.length, au_size) - p2align(i.offset, au_size); BlueStore::Writer wr(this, txc, &wctx, o); if (disk_for_compressed < disk_for_raw) { diff --git a/src/os/bluestore/Compression.cc b/src/os/bluestore/Compression.cc index ca3befe3cf1..3b4b5f5692f 100644
--- a/src/os/bluestore/Compression.cc +++ b/src/os/bluestore/Compression.cc @@ -52,23 +52,41 @@ using P = BlueStore::printer; void Estimator::cleanup() { + wctx = nullptr; new_size = 0; uncompressed_size = 0; compressed_occupied = 0; compressed_size = 0; + compressed_area = 0; total_uncompressed_size = 0; total_compressed_occupied = 0; total_compressed_size = 0; actual_compressed = 0; actual_compressed_plus_pad = 0; extra_recompress.clear(); + single_compressed_blob = true; + last_blob = nullptr; } + +void Estimator::set_wctx(const WriteContext* wctx) +{ + this->wctx = wctx; +} + inline void Estimator::batch(const BlueStore::Extent* e, uint32_t gain) { const Blob *h_Blob = &(*e->blob); const bluestore_blob_t &h_bblob = h_Blob->get_blob(); if (h_bblob.is_compressed()) { + if (last_blob) { + if (h_Blob != last_blob) { + single_compressed_blob = false; + } + } else { + last_blob = h_Blob; + } compressed_size += e->length * h_bblob.get_compressed_payload_length() / h_bblob.get_logical_length(); + compressed_area += e->length; compressed_occupied += gain; } else { uncompressed_size += e->length; @@ -82,6 +100,16 @@ inline bool Estimator::is_worth() { uint32_t cost = uncompressed_size * expected_compression_factor + compressed_size * expected_recompression_error; + if (uncompressed_size == 0 && single_compressed_blob) { + // The special case if all extents are from compressed blobs. + // We want to avoid the case of recompressing into exactly the same. + // The cost should increase proportionally to blob size; + // the rationale is that recompressing small blob is likely to provide gain, + // but recompressing whole large blob isn't. 
+ uint64_t padding_size = p2nphase(compressed_size, bluestore->min_alloc_size); + uint32_t split_tax = padding_size * compressed_area / wctx->target_blob_size; + cost += split_tax; + } uint32_t gain = uncompressed_size + compressed_occupied; double need_ratio = bluestore->cct->_conf->bluestore_recompression_min_gain; bool take = gain > cost * need_ratio; @@ -153,15 +181,13 @@ void Estimator::get_regions(std::vector& regions) } int32_t Estimator::split_and_compress( - CompressorRef compr, - uint32_t max_blob_size, ceph::buffer::list& data_bl, Writer::blob_vec& bd) { uint32_t au_size = bluestore->min_alloc_size; uint32_t size = data_bl.length(); ceph_assert(size > 0); - uint32_t blobs = (size + max_blob_size - 1) / max_blob_size; + uint32_t blobs = (size + wctx->target_blob_size - 1) / wctx->target_blob_size; uint32_t blob_size = p2roundup(size / blobs, au_size); std::vector blob_sizes(blobs); for (auto& i: blob_sizes) { @@ -179,10 +205,10 @@ int32_t Estimator::split_and_compress( // FIXME: memory alignment here is bad bufferlist t; std::optional compressor_message; - int r = compr->compress(bd.back().object_data, t, compressor_message); + int r = wctx->compressor->compress(bd.back().object_data, t, compressor_message); ceph_assert(r == 0); bluestore_compression_header_t chdr; - chdr.type = compr->get_type(); + chdr.type = wctx->compressor->get_type(); chdr.length = t.length(); chdr.compressor_message = compressor_message; encode(chdr, bd.back().disk_data); diff --git a/src/os/bluestore/Compression.h b/src/os/bluestore/Compression.h index ed781500191..850a96bd70f 100644 --- a/src/os/bluestore/Compression.h +++ b/src/os/bluestore/Compression.h @@ -22,6 +22,9 @@ public: Estimator(BlueStore* bluestore) :bluestore(bluestore) {} + // Each estimator run needs specific WriteContext + void set_wctx(const WriteContext* wctx); + // Inform estimator that an extent is a candidate for recompression. // Estimator has to calculate (guess) the cost (size) of the referenced data. 
// 'gain' is the size that will be released should extent be recompressed. @@ -45,8 +48,6 @@ public: void get_regions(std::vector& regions); int32_t split_and_compress( - CompressorRef compr, - uint32_t max_blob_size, ceph::buffer::list& data_bl, Writer::blob_vec& bd); @@ -57,9 +58,11 @@ private: double expected_compression_factor = 0.5; double expected_recompression_error = 1.1; double expected_pad_expansion = 1.1; + const WriteContext* wctx = nullptr; uint32_t new_size = 0; // fresh data to write uint32_t uncompressed_size = 0; // data that was not compressed - uint32_t compressed_size = 0; // data of compressed size + uint32_t compressed_size = 0; // estimated size of compressed data + uint32_t compressed_area = 0; // area that is compressed uint32_t compressed_occupied = 0; // disk size that will be freed uint32_t total_uncompressed_size = 0; uint32_t total_compressed_size = 0; @@ -68,6 +71,8 @@ private: uint32_t actual_compressed = 0; uint32_t actual_compressed_plus_pad = 0; std::map extra_recompress; + bool single_compressed_blob = true; + const Blob* last_blob = nullptr; // Prepare for new write void cleanup(); };