From 193c8fb153703443a8b18a57034a537b89d38678 Mon Sep 17 00:00:00 2001
From: Adam Kupczyk
Date: Fri, 5 Jan 2024 08:08:09 +0000
Subject: [PATCH] os/bluestore: Introducing BlueStore::Writer

BlueStore::Writer is a toolkit that gives more options to control writes.
It gives more control over the compression process, letting the user of the
class manually split incoming data into blobs.
Now, for large writes, all but the last blob can be fully filled with data.
There is now a single place that decides between deferred and direct writes.

Signed-off-by: Adam Kupczyk
---
 src/os/bluestore/BlueStore.h |    3 +
 src/os/bluestore/Writer.cc   | 1233 ++++++++++++++++++++++++++++++++++
 src/os/bluestore/Writer.h    |  210 ++++++
 3 files changed, 1446 insertions(+)
 create mode 100644 src/os/bluestore/Writer.h

diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 6eec97b1336..fe64713f8ea 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -2353,6 +2353,9 @@ public:
     bool apply_defer();
   };
 
+  class Writer;
+  friend class Writer;
+
   // --------------------------------------------------------
   // members
 private:
diff --git a/src/os/bluestore/Writer.cc b/src/os/bluestore/Writer.cc
index 3ab3fb53115..09ba5338d21 100644
--- a/src/os/bluestore/Writer.cc
+++ b/src/os/bluestore/Writer.cc
@@ -16,6 +16,20 @@
 #include "include/intarith.h"
 #include "os/bluestore/bluestore_types.h"
 
+std::ostream& operator<<(std::ostream& out, const BlueStore::Writer::blob_data_printer& printer)
+{
+  out << std::hex;
+  uint32_t lof = printer.base_position;
+  for (auto q: printer.blobs) {
+    out << " " << lof << "~" << q.real_length;
+    if (q.is_compressed()) {
+      out << "(" << q.compressed_length << ")";
+    }
+    lof += q.real_length;
+  }
+  out << std::dec;
+  return out;
+}
 
 /// Empties range [offset~length] of object o that is in collection c.
 /// Collects unused elements:
@@ -149,3 +163,1222 @@ inline void bluestore_blob_t::allocated_full(
   logical_length = length;
 }
 
+// split data
+inline bufferlist split_left(bufferlist& data, uint32_t split_pos)
+{
+  bufferlist left;
+  left.substr_of(data, 0, split_pos);
+  data.splice(0, split_pos);
+  return left;
+}
+inline bufferlist split_right(bufferlist& data, uint32_t split_pos)
+{
+  bufferlist right;
+  data.splice(split_pos, data.length() - split_pos, &right);
+  return right;
+}
+
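
[editor's note] A minimal sketch (not part of the patch) showing how the two
helpers above partition a bufferlist; the buffer contents are hypothetical:

    bufferlist data;
    data.append("HelloWorld");
    bufferlist left = split_left(data, 5);    // left = "Hello", data = "World"
    bufferlist right = split_right(data, 2);  // now data = "Wo", right = "rld"
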
+// should _maybe_expand_blob go to Blob?
+inline void BlueStore::Writer::_maybe_expand_blob(
+  Blob* blob,
+  uint32_t new_blob_size)
+{
+  ceph_assert(blob->get_blob().get_logical_length() > 0);
+  if (blob->get_blob().get_logical_length() < new_blob_size) {
+    uint32_t min_release_size = blob->get_blob_use_tracker().au_size;
+    blob->add_tail(new_blob_size, min_release_size);
+  }
+}
+
+#define dout_context bstore->cct
+#define dout_subsys ceph_subsys_bluestore
+
+//general levels:
+// 10 init, fundamental state changes (not present here)
+// 15 key functions, important params
+// 20 most functions, most params
+// 25 all functions, key variables
+// 30 prints passing data (not used here)
+// modifiers of extent, blob, onode printout:
+// +0 nick + sdisk + suse
+// +1 nick + sdisk + suse + sbuf
+// +2 nick + sdisk + suse + sbuf + schk + attrs
+// +3 ptr + disk + use + buf
+// +4 ptr + disk + use + chk + buf + attrs
+using exmp_it = BlueStore::extent_map_t::iterator;
+
+uint16_t BlueStore::Writer::debug_level_to_pp_mode(CephContext* cct) {
+  static constexpr uint16_t modes[5] = {
+    P::NICK + P::SDISK + P::SUSE,
+    P::NICK + P::SDISK + P::SUSE + P::SBUF,
+    P::NICK + P::SDISK + P::SUSE + P::SBUF + P::SCHK + P::ATTRS,
+    P::PTR + P::DISK + P::USE + P::BUF,
+    P::PTR + P::DISK + P::USE + P::BUF + P::CHK + P::ATTRS
+  };
+  int level = cct->_conf->subsys.get_gather_level(dout_subsys);
+  if (level >= 30) return modes[4];
+  if (level <= 15) return modes[0];
+  return modes[level % 5];
+}
+
+
+inline BlueStore::extent_map_t::iterator BlueStore::Writer::_find_mutable_blob_left(
+  BlueStore::extent_map_t::iterator it,
+  uint32_t search_begin, // only interested in blobs that are
+  uint32_t search_end,   // within range [begin - end)
+  uint32_t mapmust_begin,// for 'unused' case: the area
+  uint32_t mapmust_end)  // [begin - end) must be mapped
+{
+  extent_map_t& map = onode->extent_map.extent_map;
+  if (it == map.begin()) {
+    return map.end();
+  }
+  do {
+    --it;
+    if (it->logical_offset < search_begin) break;
+    if (search_begin > it->blob_start()) continue;
+    if (it->blob_end() > search_end) continue;
+    if (it->blob_start() > mapmust_begin) continue;
+    auto bblob = it->blob->get_blob();
+    if (!bblob.is_mutable()) continue;
+    if (bblob.has_csum()) {
+      uint32_t mask = mapmust_begin | mapmust_end;
+      if (p2phase(mask, bblob.get_csum_chunk_size()) != 0) continue;
+    }
+    return it;
+  } while (it != map.begin());
+  return map.end();
+}
+
+inline BlueStore::extent_map_t::iterator BlueStore::Writer::_find_mutable_blob_right(
+  BlueStore::extent_map_t::iterator it,
+  uint32_t search_begin,  // only interested in blobs that are
+  uint32_t search_end,    // within range [begin - end)
+  uint32_t mapmust_begin, // for 'unused' case: the area
+  uint32_t mapmust_end)   // [begin - end) must be mapped
+{
+  extent_map_t& map = onode->extent_map.extent_map;
+  for (; it != map.end(); ++it) {
+    if (it->logical_offset >= search_end) break;
+    if (search_begin > it->blob_start()) continue;
+    if (it->blob_end() > search_end) continue;
+    if (it->blob_start() > mapmust_begin) continue;
+    auto bblob = it->blob->get_blob();
+    if (!bblob.is_mutable()) continue;
+    if (bblob.has_csum()) {
+      uint32_t mask = mapmust_begin | mapmust_end;
+      if (p2phase(mask, bblob.get_csum_chunk_size()) != 0) continue;
+    }
+    return it;
+  }
+  return map.end();
+}
+
+void BlueStore::Writer::_get_disk_space(
+  uint32_t length,
+  PExtentVector& dst)
+{
+  while (length > 0) {
+    ceph_assert(disk_allocs.it->length > 0);
+    uint32_t s = std::min(length, disk_allocs.it->length - disk_allocs.pos);
+    length -= s;
+    dst.emplace_back(disk_allocs.it->offset + disk_allocs.pos, s);
+    disk_allocs.pos += s;
+    if (disk_allocs.it->length == disk_allocs.pos) {
+      ++disk_allocs.it;
+      disk_allocs.pos = 0;
+    }
+  }
+}
+
+
+/*
+1. _blob_put_data (tool)
+   Modifies existing blob to contain specific data, does not care
+   for allocations. Does not check anything.
+
+2. _blob_put_data_subau
+   Modifies existing blob on a range that is allocated, but 'unused'.
+   Data is block aligned. No ref++;
+
+3. _blob_put_data_allocate
+   Modifies existing blob on an unallocated range, puts allocations.
+   Data is au aligned. No ref++;
+
+4. _blob_put_data_combined
+   No reason to combine 2 + 3.
+
+5. _blob_create_with_data
+   Create new blob with wctx specs.
+   Gives blob allocation units. Puts data to blob. Sets unused.
+   No ref++.
+
+6. _blob_create_full
+   Create new blob with wctx specs.
+   Gives blob allocation units. Puts data to blob. No unused.
+   Full ref++ done.
+*/
+
+inline void BlueStore::Writer::_blob_put_data(
+  Blob* blob,
+  uint32_t in_blob_offset,
+  bufferlist disk_data)
+{
+  auto& bblob = blob->dirty_blob();
+  uint32_t in_blob_end = in_blob_offset + disk_data.length();
+  // update csum, used_in_blob and unused
+  if (bblob.has_csum()) {
+    // calc_csum has a fallback for csum == NONE, but is not inlined
+    bblob.calc_csum(in_blob_offset, disk_data);
+  }
+  bblob.mark_used(in_blob_offset, in_blob_end - in_blob_offset);
+  // do not update ref, we do not know how much of the data is actually used
+}
+
+/// Modifies blob to accommodate new data.
+/// For partial AU overwrites only.
+/// Requirements:
+/// - target range is block aligned
+/// - has unused
+/// - target range is 'unused'
+/// By extension:
+/// - csum & tracker are large enough
+/// No ref++.
+/// Similar to _blob_put_data_allocate, but does not put new allocations.
+inline void BlueStore::Writer::_blob_put_data_subau(
+  Blob* blob,
+  uint32_t in_blob_offset,
+  bufferlist disk_data)
+{
+  auto& bblob = blob->dirty_blob();
+  uint32_t in_blob_end = in_blob_offset + disk_data.length();
+  ceph_assert(bblob.is_mutable());
+  //TODO WHY? - ceph_assert(bblob.has_unused());
+  //TODO WHY? - ceph_assert(bblob.is_unused(in_blob_offset, in_blob_end - in_blob_offset));
+  uint32_t chunk_size = bblob.get_chunk_size(bstore->block_size);
+  ceph_assert(p2phase(in_blob_offset, chunk_size) == 0);
+  ceph_assert(p2phase(in_blob_end, chunk_size) == 0);
+  ceph_assert(bblob.get_logical_length() >= in_blob_end);
+  _blob_put_data(blob, in_blob_offset, disk_data);
+}
+
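
[editor's note] The alignment asserts in the helpers above and below rely on
the p2* family from include/intarith.h. A worked example for
au_size = 0x10000, verifiable at compile time since these helpers are constexpr:

    static_assert(p2align(0x12345u, 0x10000u) == 0x10000u);   // round down to AU start
    static_assert(p2roundup(0x12345u, 0x10000u) == 0x20000u); // round up to AU end
    static_assert(p2phase(0x12345u, 0x10000u) == 0x2345u);    // offset within the AU
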
+/// Modifies blob to accommodate new data.
+/// For AU aligned operations only.
+/// Requirements:
+/// - blob is mutable
+/// - target range is AU aligned
+/// - csum and tracker are large enough
+/// Calculates csum, clears unused.
+/// Moves disk space from disk_allocs to blob.
+/// No ref++.
+inline void BlueStore::Writer::_blob_put_data_allocate(
+  Blob* blob,
+  uint32_t in_blob_offset,
+  bufferlist disk_data)
+{
+  dout(25) << __func__ << "@" << std::hex << in_blob_offset
+           << "~" << disk_data.length() << std::dec << " -> " << blob->print(pp_mode) << dendl;
+  auto& bblob = blob->dirty_blob();
+  uint32_t in_blob_end = in_blob_offset + disk_data.length();
+  ceph_assert(bblob.is_mutable());
+  ceph_assert(p2phase(in_blob_offset, (uint32_t)bstore->min_alloc_size) == 0);
+  ceph_assert(p2phase(in_blob_end, (uint32_t)bstore->min_alloc_size) == 0);
+  ceph_assert(bblob.get_logical_length() >= in_blob_end);
+  _blob_put_data(blob, in_blob_offset, disk_data);
+  PExtentVector blob_allocs;
+  _get_disk_space(in_blob_end - in_blob_offset, blob_allocs);
+  bblob.allocated(in_blob_offset, in_blob_end - in_blob_offset, blob_allocs);
+  _schedule_io(blob_allocs, 0, disk_data);
+  bstore->_buffer_cache_write(txc, blob, in_blob_offset, disk_data,
+                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+  dout(25) << __func__ << " 0x" << std::hex << disk_data.length()
+           << "@" << in_blob_offset << std::dec << " -> "
+           << blob->print(pp_mode) << " no ref yet" << dendl;
+}
+
+/// Modifies blob to accommodate new data.
+/// Only operates on new AUs. Takes those AUs from 'disk_allocs'.
+/// Requirements:
+/// - blob is mutable
+/// - target range is csum and tracker aligned
+/// - csum and tracker are large enough
+/// No AU alignment requirement.
+/// Calculates csum, clears unused.
+/// No ref++.
+/// Very similar to _blob_put_data_allocate, but also allows partial AU writes
+/// to newly allocated AUs.
+inline void BlueStore::Writer::_blob_put_data_subau_allocate(
+  Blob* blob,
+  uint32_t in_blob_offset,
+  bufferlist disk_data)
+{
+  dout(25) << __func__ << "@" << std::hex << in_blob_offset
+           << "~" << disk_data.length() << std::dec << " -> " << blob->print(pp_mode) << dendl;
+  auto& bblob = blob->dirty_blob();
+  uint32_t au_size = bstore->min_alloc_size;
+  uint32_t in_blob_end = in_blob_offset + disk_data.length();
+  uint32_t chunk_size = bblob.get_chunk_size(bstore->block_size);
+  ceph_assert(bblob.is_mutable());
+  ceph_assert(p2phase(in_blob_offset, chunk_size) == 0);
+  ceph_assert(p2phase(in_blob_end, chunk_size) == 0);
+  ceph_assert(bblob.get_logical_length() >= in_blob_end);
+  uint32_t in_blob_alloc_offset = p2align(in_blob_offset, au_size);
+  uint32_t in_blob_alloc_end = p2roundup(in_blob_end, au_size);
+  _blob_put_data(blob, in_blob_offset, disk_data);
+  PExtentVector blob_allocs;
+  _get_disk_space(in_blob_alloc_end - in_blob_alloc_offset, blob_allocs);
+  bblob.allocated(in_blob_alloc_offset, in_blob_alloc_end - in_blob_alloc_offset, blob_allocs);
+  _schedule_io(blob_allocs, in_blob_offset - in_blob_alloc_offset, disk_data);
+  bstore->_buffer_cache_write(txc, blob, in_blob_offset, disk_data,
+                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+  dout(25) << __func__ << " 0x" << std::hex << disk_data.length()
+           << "@" << in_blob_offset << std::dec << " -> "
+           << blob->print(pp_mode) << " no ref yet" << dendl;
+}
+
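
[editor's note] A worked example of the csum_order clamp used by
_blob_create_with_data below. The checksum chunk must evenly divide both the
in-blob offset and the data length, so the order is capped by the lowest set
bit of their OR (the values here are hypothetical):

    uint32_t in_blob_offset = 0x3000, data_length = 0x5000;
    uint32_t csum_length_mask = in_blob_offset | data_length;  // == 0x7000
    static_assert(std::countr_zero(0x7000u) == 12);  // at most 4K csum chunks
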
+/// Create new blob with wctx specs.
+/// Allowed for block and AU alignments.
+/// Requirements:
+/// - target range is block aligned
+/// Calculates csum, sets unused.
+/// Moves disk space from disk_allocs to blob.
+/// No ref++.
+BlueStore::BlobRef BlueStore::Writer::_blob_create_with_data(
+  uint32_t in_blob_offset,
+  bufferlist& disk_data)
+{
+  uint32_t block_size = bstore->block_size;
+  uint32_t min_alloc_size = bstore->min_alloc_size;
+  ceph_assert(p2phase(in_blob_offset, block_size) == 0);
+  ceph_assert(p2phase(disk_data.length(), block_size) == 0);
+  BlobRef blob = onode->c->new_blob();
+  bluestore_blob_t &bblob = blob->dirty_blob();
+  uint32_t data_length = disk_data.length();
+  uint32_t alloc_offset = p2align(in_blob_offset, min_alloc_size);
+  uint32_t blob_length = p2roundup(in_blob_offset + data_length, min_alloc_size);
+  uint32_t tracked_unit = min_alloc_size;
+  uint32_t csum_length_mask = in_blob_offset | data_length; //to find the 2^n common denominator
+  uint32_t csum_order = // conv 8 -> 32 so "<<" does not overflow
+    std::min<uint32_t>(wctx->csum_order, std::countr_zero(csum_length_mask));
+  if (wctx->csum_type != Checksummer::CSUM_NONE) {
+    bblob.init_csum(wctx->csum_type, csum_order, blob_length);
+    bblob.calc_csum(in_blob_offset, disk_data);
+    tracked_unit = std::max(1u << csum_order, min_alloc_size);
+  }
+  blob->dirty_blob_use_tracker().init(blob_length, tracked_unit);
+  PExtentVector blob_allocs;
+  _get_disk_space(blob_length - alloc_offset, blob_allocs);
+  bblob.allocated(alloc_offset, blob_length - alloc_offset, blob_allocs);
+  //^also sets logical_length = blob_length
+  dout(25) << __func__ << " @0x" << std::hex << in_blob_offset
+           << "~" << disk_data.length()
+           << " alloc_offset=" << alloc_offset
+           << " -> " << blob->print(pp_mode) << dendl;
+  _schedule_io(blob_allocs, in_blob_offset - alloc_offset, disk_data);
+  bstore->_buffer_cache_write(txc, blob, in_blob_offset, disk_data,
+                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+  return blob;
+}
+
+/// Create new blob with wctx specs, fill with data.
+/// Requirements:
+/// - data is AU aligned
+/// Calculates csum, sets unused.
+/// Moves disk space from disk_allocs to blob.
+/// Full ref done.
+BlueStore::BlobRef BlueStore::Writer::_blob_create_full(
+  bufferlist& disk_data)
+{
+  uint32_t min_alloc_size = bstore->min_alloc_size;
+  uint32_t blob_length = disk_data.length();
+  ceph_assert(p2phase(blob_length, bstore->min_alloc_size) == 0);
+  BlobRef blob = onode->c->new_blob();
+
+  bluestore_blob_t &bblob = blob->dirty_blob();
+  uint32_t tracked_unit = min_alloc_size;
+  uint32_t csum_order = // conv 8 -> 32 so "<<" does not overflow
+    std::min<uint32_t>(wctx->csum_order, std::countr_zero(blob_length));
+  if (wctx->csum_type != Checksummer::CSUM_NONE) {
+    bblob.init_csum(wctx->csum_type, csum_order, blob_length);
+    bblob.calc_csum(0, disk_data);
+    tracked_unit = std::max(1u << csum_order, min_alloc_size);
+  }
+  blob->dirty_blob_use_tracker().init_and_ref(blob_length, tracked_unit);
+  PExtentVector blob_allocs;
+  _get_disk_space(blob_length, blob_allocs);
+  _schedule_io(blob_allocs, 0, disk_data); //have to do this before move()
+  bblob.allocated_full(blob_length, std::move(blob_allocs));
+  bblob.mark_used(0, blob_length); //todo - optimize; this obviously clears it
+  bstore->_buffer_cache_write(txc, blob, 0, disk_data,
+                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+  return blob;
+}
+
+/**
+ * Note from developer
+ * This module tries to keep a consistent naming convention:
+ * 1) Data location in the object is named "position/location/begin", not "offset".
+ * 2) Data location within a blob is named "offset".
+ * 3) Disk location is named "position/location", not "offset".
+ */
+
+/*
+  note for myself
+  I decided not to mix sub-au writes and normal writes.
+  When there is a sub-au write to a blob there are 2 cases:
+  a) the entire write region is "unused";
+     in this case we can do a sped-up direct write
+  b) some part is "used"
+     1) read block-wise and do deferred
+     2) read au-wise and have the choice deferred / direct
+  end note for myself
+
+  Let's treat the 'unused' case as special.
+  If such a thing happens, move its execution outside
+  the optimization logic.
+  So, before going to the main processing we handle 'unused'.
+  Then we crop the data and continue with the rest.
+  It is only the first and last blob that can be unused.
+
+  The real use case of unused is when the AU is 64k and the block is 4k.
+  There is a difference in the expected size of deferred writes on appends:
+  without it the average is ~32k, with it only ~2k.
+  The unused feature would be more useful if ZERO_OP could reset used->unused,
+  but that is not easy.
+  This is why we do not bother with considering unused
+  in non-head / non-tail blobs.
+  With the change of the default AU from 64k to 4k, its importance dwindles.
+
+  note for myself
+  Does the presence of blobs with unused bits impact alignment restrictions?
+  It seems reasonable that expand-read should be to block size.
+  Even if we allocate a larger AU, there is no need to write to empty space.
+  An overwrite must be deferred or go to unused space.
+  Can I just make a determination that unused is an excuse not to do deferred?
+  Or is writing to unused just a signal that reallocation is not an option?
+  Clearly if something is unused, then it does exist.
+  So a write-selection function could make a determination what to do.
+  But having limitations complicates the optimization algorithm.
+  If I sacrifice optimization of deferred, will I be done?
+*/
+
+/**
+ * Transfer to disk modulated by unused() bits
+ *
+ * A blob can have unused() bits; they encode which disk blocks are allocated,
+ * but have never been used. Those bits determine whether we can write
+ * directly or a deferred write is required.
+ * The function has a \ref Writer::test_write_divertor bypass for testing purposes.
+ *
+ * disk_position - Location, must be disk block aligned.
+ * data          - Data to write.
+ * mask          - Set of unused() bits, starting from bit 0.
+ * chunk_size    - Size covered by one "mask" bit.
+ */
+inline void BlueStore::Writer::_schedule_io_masked(
+  uint64_t disk_position,
+  bufferlist data,
+  bluestore_blob_t::unused_t mask,
+  uint32_t chunk_size)
+{
+  if (test_write_divertor == nullptr) {
+    int32_t data_left = data.length();
+    while (data_left > 0) {
+      bool chunk_is_unused = (mask & 1) != 0;
+      bufferlist ddata;
+      data.splice(0, chunk_size, &ddata);
+      if (chunk_is_unused) {
+        bstore->bdev->aio_write(disk_position, ddata, &txc->ioc, false);
+      } else {
+        bluestore_deferred_op_t *op = bstore->_get_deferred_op(txc, ddata.length());
+        op->op = bluestore_deferred_op_t::OP_WRITE;
+        op->extents.emplace_back(bluestore_pextent_t(disk_position, chunk_size));
+        op->data = ddata;
+      }
+      disk_position += chunk_size;
+      data_left -= chunk_size;
+      mask >>= 1;
+    }
+    ceph_assert(data_left == 0);
+  } else {
+    int32_t data_left = data.length();
+    while (data_left > 0) {
+      bool chunk_is_unused = (mask & 1) != 0;
+      bufferlist ddata;
+      data.splice(0, chunk_size, &ddata);
+      test_write_divertor->write(disk_position, ddata, !chunk_is_unused);
+      disk_position += chunk_size;
+      data_left -= chunk_size;
+      mask >>= 1;
+    }
+    ceph_assert(data_left == 0);
+  }
+}
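
[editor's note] An illustration of the mask handling above: with
chunk_size = 0x1000 and mask = 0b101, the chunks at +0x0000 and +0x2000 are
marked unused (never written before), so they go straight to aio_write; the
chunk at +0x1000 holds live data, so it is queued as a deferred OP_WRITE.
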
+/**
+ * Transfer to disk
+ *
+ * Initiates transfer of data to disk.
+ * Depends on \ref Writer::do_deferred to select direct or deferred action.
+ * If the \ref Writer::test_write_divertor bypass is set it overrides the default path.
+ *
+ * disk_allocs    - Target disk allocation units.
+ * initial_offset - Offset within the first AU; used when a sub-AU write is ongoing.
+ * data           - Data.
+ */
+inline void BlueStore::Writer::_schedule_io(
+  const PExtentVector& disk_allocs,
+  uint32_t initial_offset,
+  bufferlist data)
+{
+  if (test_write_divertor == nullptr) {
+    if (do_deferred) {
+      bluestore_deferred_op_t *op = bstore->_get_deferred_op(txc, data.length());
+      op->op = bluestore_deferred_op_t::OP_WRITE;
+      for (auto& e : disk_allocs) {
+        op->extents.emplace_back(e);
+      }
+      op->data = data;
+    } else {
+      for (auto loc : disk_allocs) {
+        ceph_assert(initial_offset <= loc.length);
+        bufferlist data_chunk;
+        uint32_t data_to_write = std::min(data.length(), loc.length - initial_offset);
+        data.splice(0, data_to_write, &data_chunk);
+        bstore->bdev->aio_write(loc.offset + initial_offset, data_chunk, &txc->ioc, false);
+        initial_offset = 0;
+      }
+      ceph_assert(data.length() == 0);
+    }
+  } else {
+    for (auto loc: disk_allocs) {
+      ceph_assert(initial_offset <= loc.length);
+      bufferlist data_chunk;
+      uint32_t data_to_write = std::min(data.length(), loc.length - initial_offset);
+      data.splice(0, data_to_write, &data_chunk);
+      test_write_divertor->write(loc.offset + initial_offset, data_chunk, do_deferred);
+      initial_offset = 0;
+    }
+    ceph_assert(data.length() == 0);
+  }
+}
+
+/**
+ * Read part of own data
+ *
+ * The RADOS protocol allows byte-aligned writes. Disk blocks are larger,
+ * so we need to read the surrounding data to form whole blocks.
+ *
+ * If \ref Writer::test_read_divertor is set it overrides the default.
+ */
+inline bufferlist BlueStore::Writer::_read_self(
+  uint32_t position,
+  uint32_t length)
+{
+  if (test_read_divertor == nullptr) {
+    bufferlist result;
+    int r;
+    r = bstore->_do_read(onode->c, onode, position, length, result);
+    ceph_assert(r >= 0 && r <= (int)length);
+    size_t zlen = length - r;
+    if (zlen) {
+      result.append_zero(zlen);
+      //logger->inc(l_bluestore_write_pad_bytes, zlen);
+    }
+    return result;
+  } else {
+    return test_read_divertor->read(position, length);
+  }
+}
+
+// used to put data into blobs that do not require allocation
+// crops consumed data from the bufferlist,
+// returns disk pos, length and mask,
+// or updates wctx and does deferred/direct
+void BlueStore::Writer::_try_reuse_allocated_l(
+  exmp_it after_punch_it,   // hint, we could have found it ourselves
+  uint32_t& logical_offset, // will fix value if something consumed
+  uint32_t ref_end_offset,  // limit to ref, if data was padded
+  blob_data_t& bd)          // modified when consumed
+{
+  uint32_t search_stop = p2align(logical_offset, (uint32_t)wctx->target_blob_size);
+  uint32_t au_size = bstore->min_alloc_size;
+  uint32_t block_size = bstore->block_size;
+  ceph_assert(!bd.is_compressed());
+  ceph_assert(p2phase(logical_offset, au_size) != 0);
+  BlueStore::ExtentMap& emap = onode->extent_map;
+  auto it = after_punch_it;
+  while (it != emap.extent_map.begin()) {
+    --it;
+    // first of all, check if we can even use the blob here
+    if (it->blob_end() < search_stop) break;
+    if (it->blob_end() <= logical_offset) continue; // need at least something
+    Blob* b = it->blob.get();
+    dout(25) << __func__ << " trying " << b->print(pp_mode) << dendl;
+    bluestore_blob_t bb = b->dirty_blob();
+    if (!bb.is_mutable()) continue;
+    // all offsets must be aligned to the blob chunk_size,
+    // which is the larger of csum and device block granularity
+    bufferlist& data = bd.disk_data;
+    uint32_t chunk_size = it->blob->get_blob().get_chunk_size(block_size);
+    if (p2phase(logical_offset, chunk_size) != 0) continue;
+    // this blob can handle the required granularity
+    // the blob might, or might not be allocated where we need it
+    // note we operate on 1 AU max
+    uint32_t blob_offset = it->blob_start();
+    uint32_t want_subau_begin = logical_offset; //it is chunk_size aligned
+    uint32_t want_subau_end = p2roundup(logical_offset, au_size);
+    if (logical_offset + data.length() < want_subau_end) {
+      // we do not have enough data to cut at AU, try chunk
+      want_subau_end = logical_offset + data.length();
+      if (p2phase(want_subau_end, chunk_size) != 0) continue;
+    }
+    uint32_t in_blob_offset = want_subau_begin - blob_offset;
+    uint64_t subau_disk_offset = bb.get_allocation_at(want_subau_begin - blob_offset);
+    if (subau_disk_offset == bluestore_blob_t::NO_ALLOCATION) continue;
+    dout(25) << __func__ << " 0x" << std::hex << want_subau_begin << "-"
+             << want_subau_end << std::dec << " -> " << b->print(pp_mode) << dendl;
+    uint32_t data_size = want_subau_end - want_subau_begin;
+    bufferlist data_at_left = split_left(data, data_size);
+    bd.real_length -= data_size;
+    uint32_t mask = bb.get_unused_mask(in_blob_offset, data_size, chunk_size);
+    _blob_put_data_subau(b, in_blob_offset, data_at_left);
+    // transfer to disk
+    _schedule_io_masked(subau_disk_offset, data_at_left, mask, chunk_size);
+
+    uint32_t ref_end = std::min(ref_end_offset, want_subau_end);
+    //fixme/improve - need something without stupid extras - that is without coll
+    b->get_ref(onode->c, in_blob_offset, ref_end - want_subau_begin);
+    Extent *le = new Extent(
+      want_subau_begin, in_blob_offset, ref_end - want_subau_begin, it->blob);
+    dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
+    emap.extent_map.insert(*le);
+
+    logical_offset += data_size;
+    bstore->_buffer_cache_write(txc, b, in_blob_offset, data_at_left,
+                                wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+    break;
+  }
+}
+
+// used to put data into blobs that do not require allocation
+// crops consumed data from the bufferlist,
+// returns disk pos, length and mask,
+// or updates wctx and does deferred/direct
+//  AU   |   AU   |   AU
+// |bl|bl|bl|bl|bl|bl|bl|bl|
+// |csum |csum |csum |csum |
+//  datadatadatadatada         case A - input rejected
+//      tadatadat              case B - input rejected
+void BlueStore::Writer::_try_reuse_allocated_r(
+  exmp_it after_punch_it,  // hint, we could have found it ourselves
+  uint32_t& end_offset,    // will fix value if something consumed
+  uint32_t ref_end_offset, // limit to ref, if data was padded
+  blob_data_t& bd)         // modified when consumed
+{
+  // this function should be called only when it is applicable,
+  // that is, data is not compressed and is not AU aligned
+  uint32_t au_size = bstore->min_alloc_size;
+  uint32_t block_size = bstore->block_size;
+  ceph_assert(!bd.is_compressed());
+  ceph_assert(p2phase(end_offset, au_size) != 0);
+  BlueStore::ExtentMap& emap = onode->extent_map;
+  for (auto it = after_punch_it; it != emap.extent_map.end(); ++it) {
+    // first of all, check if we can even use the blob here
+    if (it->blob_start() > end_offset) break; // need at least something
+    Blob* b = it->blob.get();
+    dout(25) << __func__ << " trying " << b->print(pp_mode) << dendl;
+    bluestore_blob_t bb = b->dirty_blob();
+    if (!bb.is_mutable()) continue;
+    // all offsets must be aligned to the blob chunk_size,
+    // which is the larger of csum and device block granularity
+    bufferlist& data = bd.disk_data;
+    uint32_t chunk_size = it->blob->get_blob().get_chunk_size(block_size);
+    if (p2phase(end_offset, chunk_size) != 0) continue; //case A
+    uint32_t blob_offset = it->blob_start();
+    uint32_t want_subau_begin = p2align(end_offset, au_size); //we operate on 1 AU max
+    uint32_t want_subau_end = end_offset; //it is chunk_size aligned
+    if (data.length() < end_offset - want_subau_begin) {
+      // we do not have enough data to cut at AU, fall back to chunk
+      want_subau_begin = end_offset - data.length();
+      if (p2phase(want_subau_begin, chunk_size) != 0) continue; //case B
+    }
+    uint32_t in_blob_offset = want_subau_begin - blob_offset;
+    uint64_t subau_disk_offset = bb.get_allocation_at(want_subau_begin - blob_offset);
+    if (subau_disk_offset == bluestore_blob_t::NO_ALLOCATION) continue;
+    dout(25) << __func__ << " 0x" << std::hex << want_subau_begin << "-"
+             << want_subau_end << std::dec << " -> " << b->print(pp_mode) << dendl;
+    uint32_t data_size = want_subau_end - want_subau_begin;
+    bufferlist data_at_right = split_right(data, data.length() - data_size);
+    bd.real_length -= data_size;
+    uint32_t mask = bb.get_unused_mask(in_blob_offset, data_size, chunk_size);
+    _blob_put_data_subau(b, in_blob_offset, data_at_right);
+    //transfer to disk
+    _schedule_io_masked(subau_disk_offset, data_at_right, mask, chunk_size);
+
+    uint32_t ref_end = std::min(ref_end_offset, want_subau_end);
+    //fixme/improve - need something without stupid extras - that is without coll
+    b->get_ref(onode->c, in_blob_offset, ref_end - want_subau_begin);
+    Extent *le = new Extent(
+      want_subau_begin, in_blob_offset, ref_end - want_subau_begin, it->blob);
+    dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
+    emap.extent_map.insert(*le);
+
+    end_offset -= data_size;
+    bstore->_buffer_cache_write(txc, b, in_blob_offset, data_at_right,
+                                wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+    break;
+  }
+}
+
+/**
+ * Export some data to neighboring blobs.
+ *
+ * Sometimes punch_hole_2 will clear only part of an AU.
+ * Example: AU = 64K, DiskBlock = 4K, CSUM = 16K.
+ * Punch_hole_2 will always align to max(DiskBlock, CSUM) and get rid of whole AUs,
+ * but the boundary ones might need to leave some data intact, leaving some
+ * space unused. This function tries to use that space.
+ *
+ * If possible, the function cuts portions of data from the first and last
+ * elements of the blob_data_t sequence. Params logical_offset, end_offset and
+ * ref_end_offset are updated to reflect data truncation.
+ * Only uncompressed input data is eligible for being moved to other blobs.
+ *
+ * logical_offset - In-object offset of the first byte in bd.
+ * end_offset     - Offset of the last byte in bd.
+ * ref_end_offset - Last byte that should be part of the object; ref_end_offset <= end_offset.
+ * bd             - Contiguous sequence of data blocks to be put into the object.
+ * after_punch_it - Hint from punch_hole_2.
+ *                  Blobs to modify will be either left of it (for left search),
+ *                  or right of it (for right side search).
+ */
+void BlueStore::Writer::_try_put_data_on_allocated(
+  uint32_t& logical_offset,
+  uint32_t& end_offset,
+  uint32_t& ref_end_offset,
+  blob_vec& bd,
+  exmp_it after_punch_it)
+{
+  const char* func_name = __func__;
+  auto print = [&](const char* caption) {
+    dout(25) << func_name << caption << std::hex << logical_offset << ".."
+             << end_offset << " ref_end=" << ref_end_offset << " bd=";
+    uint32_t lof = logical_offset;
+    for (auto q: bd) {
+      *_dout << " " << lof << "~" << q.disk_data.length();
+      lof += q.disk_data.length();
+    }
+    *_dout << std::dec << dendl;
+  };
+  print(" IN ");
+  ceph_assert(bstore->min_alloc_size != bstore->block_size);
+  ceph_assert(bd.size() >= 1);
+  if (!bd[0].is_compressed() &&
+      p2phase(logical_offset, bstore->min_alloc_size) != 0) {
+    // check if we have already allocated space to fill
+    _try_reuse_allocated_l(after_punch_it, logical_offset, ref_end_offset, bd[0]);
+  }
+  if (bd[0].real_length == 0) {
+    bd.erase(bd.begin());
+  }
+  if (logical_offset == end_offset) {
+    // it is possible that we have already consumed everything
+    goto out;
+  }
+  print(" MID ");
+  {
+    ceph_assert(bd.size() >= 1);
+    auto &bd_back = bd.back();
+    if (!bd_back.is_compressed() &&
+        p2phase(end_offset, bstore->min_alloc_size) != 0) {
+      // check if we have some allocated space to fill
+      _try_reuse_allocated_r(after_punch_it, end_offset, ref_end_offset, bd_back);
+    }
+    if (bd_back.real_length == 0) {
+      bd.erase(bd.end() - 1);
+    }
+  }
+  out:
+  print(" OUT ");
+}
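
[editor's note] A worked example of the bd / ref_end_offset relationship the
following functions rely on, with target_blob_size = 0x10000 and 4K disk
blocks: a write of 0x0~0x23456 is split by _split_data into chunks of
0x10000, 0x10000 and 0x3456, then _align_to_disk_block zero-pads the last
chunk to 0x4000. The blobs now cover 0x0..0x24000, while ref_end_offset stays
at 0x23456, so the last extent only takes a ref up to the real data end.
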
+/**
+ * Puts data to the onode by creating new blobs/extents.
+ *
+ * Does not check whether the data could be merged into other blobs.
+ * Requires that the target region is already emptied (\ref punch_hole_2).
+ *
+ * Input data is a contiguous sequence of blob_data_t segments
+ * that starts at logical_offset.
+ * This is the final step in processing a write op.
+ *
+ * logical_offset - Offset of the first blob_data_t element.
+ * ref_end_offset - Actual data end; it might be earlier than the end of the
+ *                  last blob_data_t. This happens because we pad data to disk
+ *                  block alignment, while we preserve the logical range of the put data.
+ * bd_it..bd_end  - Sequence of blob_data_t to put.
+ */
+void BlueStore::Writer::_do_put_new_blobs(
+  uint32_t logical_offset,
+  uint32_t ref_end_offset,
+  blob_vec::iterator& bd_it,
+  blob_vec::iterator bd_end)
+{
+  extent_map_t& emap = onode->extent_map.extent_map;
+  uint32_t blob_size = wctx->target_blob_size;
+  while (bd_it != bd_end) {
+    if (!bd_it->is_compressed()) {
+      // only the 1st blob to write can have blob_location != logical_offset
+      uint32_t blob_location = p2align(logical_offset, blob_size);
+      BlobRef new_blob;
+      uint32_t in_blob_offset = logical_offset - blob_location;
+      uint32_t ref_end = std::min(ref_end_offset, logical_offset + bd_it->disk_data.length());
+      if (blob_location == logical_offset &&
+          bd_it->disk_data.length() >= blob_size &&
+          ref_end_offset - blob_location >= blob_size) {
+        new_blob = _blob_create_full(bd_it->disk_data);
+        // all already ref'ed
+      } else {
+        new_blob = _blob_create_with_data(in_blob_offset, bd_it->disk_data);
+        new_blob->get_ref(onode->c, in_blob_offset, ref_end - blob_location - in_blob_offset);
+      }
+      Extent *le = new Extent(
+        logical_offset, in_blob_offset, ref_end - logical_offset, new_blob);
+      dout(20) << __func__ << " new extent+blob " << le->print(pp_mode) << dendl;
+      emap.insert(*le);
+      logical_offset = ref_end;
+    } else {
+      // compressed
+      ceph_assert(false);
+    }
+    ++bd_it;
+  }
+}
+
+void BlueStore::Writer::_do_put_blobs(
+  uint32_t logical_offset,
+  uint32_t data_end_offset,
+  uint32_t ref_end_offset,
+  blob_vec& bd,
+  exmp_it after_punch_it)
+{
+  Collection* coll = onode->c;
+  extent_map_t& emap = onode->extent_map.extent_map;
+  uint32_t au_size = bstore->min_alloc_size;
+  uint32_t blob_size = wctx->target_blob_size;
+  auto bd_it = bd.begin();
+  exmp_it to_it;
+  uint32_t left_bound = p2align(logical_offset, blob_size);
+  uint32_t right_bound = p2roundup(logical_offset, blob_size);
+  // Try to put the first data pack into an already existing blob
+  if (!bd_it->is_compressed()) {
+    // it is conceivable to put the data into some blob
+    exmp_it left_b = _find_mutable_blob_left(
+      after_punch_it, left_bound, right_bound,
+      logical_offset, logical_offset + bd_it->disk_data.length());
+    if (left_b != emap.end()) {
+      uint32_t in_blob_offset = logical_offset - left_b->blob_start();
+      uint32_t in_blob_end = in_blob_offset + bd_it->disk_data.length();
+      uint32_t data_end_offset = logical_offset + bd_it->disk_data.length();
+      _maybe_expand_blob(left_b->blob.get(), p2roundup(in_blob_end, au_size));
+      _blob_put_data_subau_allocate(
+        left_b->blob.get(), in_blob_offset, bd_it->disk_data);
+      uint32_t ref_end = std::min(ref_end_offset, data_end_offset);
+      //fixme/improve - need something without stupid extras - that is without coll
+      left_b->blob->get_ref(coll, in_blob_offset, ref_end - logical_offset);
+      Extent *le = new Extent(
+        logical_offset, in_blob_offset, ref_end - logical_offset, left_b->blob);
+      dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
+      emap.insert(*le);
+      logical_offset = ref_end;
+      ++bd_it;
+    } else {
+      // it is still possible to use the first bd and put it into
+      // the blob after punch_hole
+      // can the blob before punch_hole be different than the blob after punch_hole ?
+    }
+  }
+  if (bd_it != bd.end()) {
+    // still something to process
+    auto back_it = bd.end() - 1;
+    if (!back_it->is_compressed()) {
+      // it is conceivable to put the data into some blob after
+      uint32_t left_bound = p2align(data_end_offset, blob_size);
+      uint32_t right_bound = p2roundup(data_end_offset, blob_size);
+      exmp_it right_b = _find_mutable_blob_right(
+        after_punch_it, left_bound, right_bound,
+        data_end_offset - back_it->disk_data.length(), data_end_offset);
+      if (right_b != emap.end()) {
+        // before putting the last blob, put the previous ones;
+        // it is nicer to have AUs in order.
+        _do_put_new_blobs(logical_offset, ref_end_offset, bd_it, back_it);
+        uint32_t data_begin_offset = data_end_offset - back_it->disk_data.length();
+        uint32_t in_blob_offset = data_begin_offset - right_b->blob_start();
+        _maybe_expand_blob(right_b->blob.get(), in_blob_offset + bd_it->disk_data.length());
+        _blob_put_data_subau_allocate(
+          right_b->blob.get(), in_blob_offset, back_it->disk_data);
+        uint32_t ref_end = std::min(ref_end_offset, data_begin_offset + back_it->disk_data.length());
+        //fixme - need something without stupid extras
+        right_b->blob->get_ref(coll, in_blob_offset, ref_end - data_begin_offset);
+        Extent *le = new Extent(
+          data_begin_offset, in_blob_offset, ref_end - data_begin_offset, right_b->blob);
+        dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
+        emap.insert(*le);
+        bd.erase(back_it); //TODO - or some other way of limiting the end
+      }
+    }
+  }
+
+  // that's it for blob reuse, now it is time to write full blobs
+  if (bd_it != bd.end()) {
+    _do_put_new_blobs(logical_offset, ref_end_offset, bd_it, bd.end());
+  }
+}
+
+/**
+ * The idea is to give us a chance to reuse a blob.
+ * To do so, we must have enough data for block/csum/AU alignment.
+ * The decision is either to read or to pad with zeros.
+ * We return a pair:
+ * first:  true = pad with 0s, false = read the region
+ * second: new logical offset for data
+ * NOTE: Unlike _write_expand_r, the expanded punch_hole region
+ *       is always equal to the ref'ed region.
+ * NOTE2: This function can be called without split_at(logical_offset).
+ * NOTE3: If logical_offset is AU aligned, some blobs have larger csum.
+ *        We ignore them, as a result not even wanting to expand.
+ */
+std::pair<bool, uint32_t> BlueStore::Writer::_write_expand_l(
+  uint32_t logical_offset)
+{
+  uint32_t block_size = bstore->block_size;
+  uint32_t off_stop = p2align(logical_offset, bstore->min_alloc_size);
+  // no need to go earlier than one AU
+  ceph_assert(off_stop != logical_offset); // to prevent superfluous invocation
+  uint32_t min_off = p2align(logical_offset, block_size);
+  uint32_t new_data_off = min_off;
+  bool new_data_pad = true; // unless otherwise stated, we pad
+  exmp_it it = onode->extent_map.seek_lextent(logical_offset);
+  // it can be the extent we are interested in
+  if (it == onode->extent_map.extent_map.end()) {
+    if (it == onode->extent_map.extent_map.begin()) {
+      goto done;
+    }
+    --it; //step back to the last element
+  }
+  do {
+    if (it->logical_end() < off_stop) {
+      // Nothing before this point will be interesting.
+      // No blob to adapt to was found.
+      break;
+    }
+    if (!it->blob->get_blob().is_mutable()) {
+      new_data_pad = false; // we have to read data here
+      if (it == onode->extent_map.extent_map.begin()) break;
+      --it;
+      continue;
+    }
+    // we take the first blob that we can
+    uint32_t can_off = p2align(logical_offset, it->blob->get_blob().get_chunk_size(block_size));
+    // ^smallest stop point that the blob can accommodate
+    // the blob is mapped, so it has space for at least up to the begin of AU@logical_offset
+    if (it->logical_offset < logical_offset && logical_offset < it->logical_end()) {
+      // ^ this only works for the first extent we check
+      new_data_off = can_off;
+      new_data_pad = false;
+    } else {
+      if (it->logical_end() <= can_off) {
+        // we have a fortunate area in the blob that was mapped but not used
+        new_data_off = can_off;
+        // the new_data_pad here depends on whether we have visited immutable blobs
+      } else {
+        // interested in using this blob, but there is data, must read
+        new_data_off = can_off;
+        new_data_pad = false;
+        //^ read means we must expand punch_hole / ref, but not outside object size
+      }
+    }
+    break;
+  } while (true);
+  done:
+  dout(25) << __func__ << std::hex << " logical_offset=0x" << logical_offset
+           << " -> 0x" << new_data_off << (new_data_pad ? " pad" : " read") << dendl;
+  return std::make_pair(new_data_pad, new_data_off);
+}
+
+/**
+ * The idea is to give us a chance to reuse a blob.
+ * To do so, we must have enough data for block/csum/AU alignment.
+ * The decision is either to read or to pad with zeros.
+ * We return a pair:
+ * first:  true = pad with 0s, false = read the region
+ * second: new end offset for data
+ * NOTE: When we pad with 0s, we do not expand the ref range.
+ *       When we read, we expand the ref range.
+ *       The ref range cannot go outside the object size.
+ * NOTE2: This function can be called without split_at(end_offset).
+ * NOTE3: If logical_offset is AU aligned, some blobs have larger csum.
+ *        We ignore them, as a result not even wanting to expand.
+ */
+std::pair<bool, uint32_t> BlueStore::Writer::_write_expand_r(
+  uint32_t end_offset)
+{
+  uint32_t block_size = bstore->block_size;
+  uint32_t end_stop = p2roundup(end_offset, bstore->min_alloc_size);
+  // no need to go further than one AU; a new blob, if created, can allocate a whole AU
+  ceph_assert(end_stop != end_offset); // to prevent superfluous invocation
+  uint32_t min_end = p2roundup(end_offset, block_size);
+  uint32_t new_data_end = min_end;
+  bool new_data_pad = true; // unless otherwise stated, we pad
+  exmp_it it = onode->extent_map.seek_lextent(end_offset);
+  int tries = 0; //check 3 blobs
+  for (; ++tries <= 3 && it != onode->extent_map.extent_map.end(); ++it) {
+    if (it->logical_offset >= end_stop) {
+      // nothing beyond this point is interesting
+      // no blob should have a free AU outside its logical mapping
+      // This is a failure of the reuse search.
+      break;
+    }
+    if (it->blob_start() >= end_offset) {
+      //not interested in a blob that we do not overlap
+      continue;
+      //^ not break, we can still find a valid blob to reuse
+    }
+    if (!it->blob->get_blob().is_mutable()) {
+      new_data_pad = false; //must read...
+      continue;
+    }
+    //if @ end_offset there is something then this blob certainly qualifies
+    // we take the first blob that we can
+    uint32_t can_end = p2roundup(end_offset, it->blob->get_blob().get_chunk_size(block_size));
+    // ^smallest stop point that the blob can accommodate
+    // the blob is mapped, so it has space for at least up to the end of AU@end_offset
+    if (it->logical_offset <= end_offset && end_offset < it->logical_end()) {
+      // ^ this only works for the first extent we check
+      new_data_end = can_end;
+      new_data_pad = false;
+      //^ read means we must expand punch_hole / ref, but not outside object size
+    } else {
+      if (can_end <= it->logical_offset) {
+        // we have a fortunate area in the blob that was mapped but not used
+        new_data_end = can_end;
+        // the new_data_pad here depends on whether we have visited immutable blobs
+      } else {
+        // interested in using this blob, but there is data, must read
+        new_data_end = can_end;
+        new_data_pad = false;
+        //^ read means we must expand punch_hole / ref, but not outside object size
+      }
+    }
+    break;
+  }
+  dout(25) << __func__ << std::hex << " end_offset=0x" << end_offset
+           << " -> 0x" << new_data_end << (new_data_pad ? " pad" : " read") << dendl;
+  return std::make_pair(new_data_pad, new_data_end);
+}
+
+// This function is a centralized place to make a decision on
+// whether to use deferred or direct writes.
+// The assumption behind it is that having parts of a write executed as
+// deferred and other parts as direct is suboptimal in any case.
+void BlueStore::Writer::_deferred_decision(uint32_t need_size)
+{
+  // make a deferred decision
+  uint32_t released_size = 0;
+  for (auto& r : released) {
+    released_size += r.length;
+  }
+  uint32_t au_size = bstore->min_alloc_size;
+  do_deferred = need_size <= released_size && released_size < bstore->prefer_deferred_size;
+  dout(15) << __func__ << " released=0x" << std::hex << released_size
+           << " need=0x" << need_size << std::dec
+           << (do_deferred ? " deferred" : " direct") << dendl;
" deferred" : " direct") << dendl; + + if (do_deferred) { + disk_allocs.it = released.begin(); + statfs_delta.allocated() += need_size; + disk_allocs.pos = 0; + } else { + int64_t new_alloc_size = bstore->alloc->allocate(need_size, au_size, 0, 0, &allocated); + ceph_assert(need_size == new_alloc_size); + statfs_delta.allocated() += new_alloc_size; + disk_allocs.it = allocated.begin(); + disk_allocs.pos = 0; + } +} + +// data (input) is split into chunks bd (output) +// data is emptied as a result +void BlueStore::Writer::_split_data( + uint32_t location, + bufferlist& data, + blob_vec& bd) +{ + auto lof = location; + uint32_t end_offset = location + data.length(); + while (lof < end_offset) { + uint32_t p = p2remain(lof, wctx->target_blob_size); + if (p > end_offset - lof) p = end_offset - lof; + bufferlist tmp; + data.splice(0, p, &tmp); + bd.emplace_back(p, 0, tmp, tmp); + lof += p; + } +} + +void BlueStore::Writer::_align_to_disk_block( + uint32_t& location, + uint32_t& data_end, + blob_vec& blobs) +{ + ceph_assert(!blobs.empty()); + uint32_t au_size = bstore->min_alloc_size; + bool left_do_pad; + bool right_do_pad; + uint32_t left_location; + uint32_t right_location; + if (p2phase(location, au_size) != 0) { + blob_data_t& first_blob = blobs.front(); + if (!first_blob.is_compressed()) { + // try to make at least disk block aligned + std::tie(left_do_pad, left_location) = _write_expand_l(location); + if (left_location < location) { + bufferlist tmp; + if (left_do_pad) { + tmp.append_zero(location - left_location); + } else { + tmp = _read_self(left_location, location - left_location); + } + tmp.claim_append(first_blob.disk_data); + first_blob.disk_data.swap(tmp); + first_blob.real_length += location - left_location; + location = left_location; + } + } + } + if (p2phase(data_end, au_size) != 0) { + blob_data_t& last_blob = blobs.back(); + if (!last_blob.is_compressed()) { + // try to make at least disk block aligned + std::tie(right_do_pad, right_location) = _write_expand_r(data_end); + if (data_end < right_location) { + // TODO - when we right-expand because of some blob csum restriction, it is possible + // we will be left-blob-csum-unaligned. It is wasted space. + // Think if we want to fix it. + if (right_do_pad) { + last_blob.disk_data.append_zero(right_location - data_end); + } else { + bufferlist tmp; + tmp = _read_self(data_end, right_location - data_end); + last_blob.disk_data.append(tmp); + } + last_blob.real_length += right_location - data_end; + } + data_end = right_location; + } + } +} + +// Writes uncompressed data. 
+// Writes uncompressed data.
+void BlueStore::Writer::do_write(
+  uint32_t location,
+  bufferlist& data)
+{
+  do_deferred = false;
+  disk_allocs.it = allocated.end();
+  disk_allocs.pos = 0;
+  dout(20) << __func__ << " 0x" << std::hex << location << "~" << data.length() << dendl;
+  dout(25) << "on: " << onode->print(pp_mode) << dendl;
+  blob_vec bd;
+  uint32_t ref_end = location + data.length();
+  uint32_t data_end = location + data.length();
+  _split_data(location, data, bd);
+  _align_to_disk_block(location, data_end, bd);
+  if (ref_end < onode->onode.size) {
+    ref_end = std::min(data_end, onode->onode.size);
+  }
+  dout(20) << "blobs to put:" << blob_data_printer(bd, location) << dendl;
+  statfs_delta.stored() += ref_end - location;
+  exmp_it after_punch_it =
+    bstore->_punch_hole_2(onode->c, onode, location, data_end - location,
+                          released, pruned_blobs, shared_changed, statfs_delta);
+  dout(25) << "after punch_hole_2: " << std::endl << onode->print(pp_mode) << dendl;
+
+  uint32_t au_size = bstore->min_alloc_size;
+  if (au_size != bstore->block_size) {
+    _try_put_data_on_allocated(location, data_end, ref_end, bd, after_punch_it);
+  }
+  if (location != data_end) {
+    uint32_t need_size = p2roundup(data_end, au_size) - p2align(location, au_size);
+    // make a deferred decision
+    _deferred_decision(need_size);
+    _do_put_blobs(location, data_end, ref_end, bd, after_punch_it);
+  } else {
+    // Unlikely, but we have already put everything.
+    ceph_assert(bd.size() == 0);
+  }
+  if (onode->onode.size < ref_end)
+    onode->onode.size = ref_end;
+  _collect_released_allocated();
+  dout(25) << "result: " << std::endl << onode->print(pp_mode) << dendl;
+}
+
+/**
+ * Move allocated and released regions to txc.
+ * NOTE: Consider in the future using the txc variables directly.
+ */
+void BlueStore::Writer::_collect_released_allocated()
+{
+  if (!do_deferred) {
+    // When we do direct writes, everything released is really released.
+    for (auto e : released) {
+      txc->released.insert(e.offset, e.length);
+    }
+    // We do not accept allocating more than we actually use later.
+    ceph_assert(disk_allocs.it == allocated.end());
+  } else {
+    // When we do deferred it is possible not to use everything.
+    // Release the unused rest.
+    uint32_t pos = disk_allocs.pos;
+    while (disk_allocs.it != released.end()) {
+      auto& e = *disk_allocs.it;
+      dout(15) << "Deferred, some left unused location=0x"
+               << std::hex << e.offset + pos << "~" << e.length - pos << std::dec << dendl;
+      txc->released.insert(e.offset + pos, e.length - pos);
+      pos = 0;
+      ++disk_allocs.it;
+    }
+  }
+  for (auto e : allocated) {
+    txc->allocated.insert(e.offset, e.length);
+  }
+  released.clear();
+  allocated.clear();
+}
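
[editor's note] An illustration of the deferred bookkeeping above: if
punch_hole_2 released 0x30000~0x20000 and the write needed only 0x10000, the
deferred path reuses 0x30000~0x10000 through disk_allocs, and the loop above
returns the remaining 0x40000~0x10000 to txc->released.
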
+/**
+ * Debug function that extracts data from BufferSpace buffers.
+ * Typically it is of limited use; it is not guaranteed that buffers
+ * will not be evicted.
+ */
+void BlueStore::Writer::debug_iterate_buffers(
+  std::function<void(uint32_t offset, bufferlist& data)> data_callback)
+{
+  for (auto& e : onode->extent_map.extent_map) {
+    auto b = e.blob->dirty_bc()._data_lower_bound(e.blob_offset);
+    uint32_t in_blob_offset = e.blob_offset;
+    ceph_assert(b->second.offset <= in_blob_offset);
+    ceph_assert(in_blob_offset + e.length <= b->second.offset + b->second.data.length());
+    bufferlist data;
+    data.substr_of(b->second.data, in_blob_offset - b->second.offset, e.length);
+    data_callback(e.logical_offset, data);
+  }
+}
diff --git a/src/os/bluestore/Writer.h b/src/os/bluestore/Writer.h
new file mode 100644
index 00000000000..79f24b52d22
--- /dev/null
+++ b/src/os/bluestore/Writer.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef BLUESTORE_WRITER
+#define BLUESTORE_WRITER
+
+#include "BlueStore.h"
+#include "Allocator.h"
+
+class BlueStore::Writer {
+public:
+  using exmp_it = extent_map_t::iterator;
+  using P = BlueStore::printer;
+
+  // Data that is to be put to the object.
+  struct blob_data_t {
+    //uint32_t location; // There is no need for each chunk to have a separate location.
+    uint32_t real_length;       // Size of object data covered by this chunk. Same as object_data.length().
+    uint32_t compressed_length; // Size of compressed representation. 0 or disk_data.length().
+    bufferlist disk_data;       // Bitstream to go to disk. It is either the same as object_data,
+                                // or contains compressed data. Block aligned.
+    bufferlist object_data;     // Object data. Needed to put into caches.
+    bool is_compressed() {return compressed_length != 0;}
+  };
+  using blob_vec = std::vector<blob_data_t>;
+  struct blob_data_printer {
+    const blob_vec& blobs;
+    uint32_t base_position;
+    blob_data_printer(const blob_vec& blobs, uint32_t base_position)
+    : blobs(blobs), base_position(base_position) {}
+  };
+
+  struct write_divertor {
+    virtual ~write_divertor() = default;
+    virtual void write(
+      uint64_t disk_offset, const bufferlist& data, bool deferred) = 0;
+  };
+  struct read_divertor {
+    virtual ~read_divertor() = default;
+    virtual bufferlist read(uint32_t object_offset, uint32_t object_length) = 0;
+  };
+  Writer(BlueStore* bstore, TransContext* txc, WriteContext* wctx, OnodeRef o)
+  :bstore(bstore), txc(txc), wctx(wctx), onode(o) {
+    pp_mode = debug_level_to_pp_mode(bstore->cct);
+  }
+public:
+  void do_write(
+    uint32_t location,
+    bufferlist& data
+  );
+
+  void debug_iterate_buffers(
+    std::function<void(uint32_t offset, bufferlist& data)> data_callback
+  );
+
+  write_divertor* test_write_divertor = nullptr;
+  read_divertor* test_read_divertor = nullptr;
+  std::vector<BlobRef> pruned_blobs;
+  std::set<SharedBlobRef> shared_changed;
+  volatile_statfs statfs_delta;
+
+private:
+  BlueStore* bstore;
+  TransContext* txc;
+  WriteContext* wctx;
+  OnodeRef onode;
+  PExtentVector released;  //filled by punch_hole
+  PExtentVector allocated; //filled by alloc()
+  bool do_deferred = false;
+  // note: disk_allocs.it is uninitialized.
+  // it must be initialized in do_write
+  struct {
+    PExtentVector::iterator it; //iterator
+    uint32_t pos;               //in-iterator position
+  } disk_allocs; //disk locations to use when placing data
+  uint16_t pp_mode = 0; //pretty print mode
+  uint16_t debug_level_to_pp_mode(CephContext* cct);
+
+  inline exmp_it _find_mutable_blob_left(
+    exmp_it it,
+    uint32_t search_begin, // only interested in blobs that are
+    uint32_t search_end,   // within range [begin - end)
+    uint32_t mapmust_begin,// for 'unused' case: the area
+    uint32_t mapmust_end); // [begin - end) must be mapped
+
+  inline exmp_it _find_mutable_blob_right(
+    exmp_it it,
+    uint32_t search_begin,  // only interested in blobs that are
+    uint32_t search_end,    // within range [begin - end)
+    uint32_t mapmust_begin, // for 'unused' case: the area
+    uint32_t mapmust_end);  // [begin - end) must be mapped
+
+  inline void _schedule_io_masked(
+    uint64_t disk_offset,
+    bufferlist data,
+    bluestore_blob_t::unused_t mask,
+    uint32_t chunk_size);
+
+  inline void _schedule_io(
+    const PExtentVector& disk_allocs,
+    uint32_t initial_offset,
+    bufferlist data);
+
+  //Take `length` of space from `this->disk_allocs` and put it into `dst`.
+  void _get_disk_space(
+    uint32_t length,
+    PExtentVector& dst);
+
+  inline bufferlist _read_self(
+    uint32_t offset,
+    uint32_t length);
+
+  inline void _maybe_expand_blob(
+    Blob* blob,
+    uint32_t new_blob_size);
+
+  inline void _blob_put_data(
+    Blob* blob,
+    uint32_t in_blob_offset,
+    bufferlist disk_data);
+
+  void _split_data(
+    uint32_t location,
+    bufferlist& data,
+    blob_vec& bd);
+
+  void _align_to_disk_block(
+    uint32_t& location,
+    uint32_t& data_end,
+    blob_vec& blobs
+  );
+
+  inline void _blob_put_data_subau(
+    Blob* blob,
+    uint32_t in_blob_offset,
+    bufferlist disk_data);
+
+  inline void _blob_put_data_allocate(
+    Blob* blob,
+    uint32_t in_blob_offset,
+    bufferlist disk_data);
+
+  inline void _blob_put_data_subau_allocate(
+    Blob* blob,
+    uint32_t in_blob_offset,
+    bufferlist disk_data);
+
+  BlobRef _blob_create_with_data(
+    uint32_t in_blob_offset,
+    bufferlist& disk_data);
+
+  BlobRef _blob_create_full(
+    bufferlist& disk_data);
+
+  void _try_reuse_allocated_l(
+    exmp_it after_punch_it,   // hint, we could have found it ourselves
+    uint32_t& logical_offset, // will fix value if something consumed
+    uint32_t ref_end_offset,  // useful when data is padded
+    blob_data_t& bd);         // modified when consumed
+
+  void _try_reuse_allocated_r(
+    exmp_it after_punch_it,  // hint, we could have found it ourselves
+    uint32_t& end_offset,    // will fix value if something consumed
+    uint32_t ref_end_offset, // useful when data is padded
+    blob_data_t& bd);        // modified when consumed
+
+  void _try_put_data_on_allocated(
+    uint32_t& logical_offset,
+    uint32_t& end_offset,
+    uint32_t& ref_end_offset,
+    blob_vec& bd,
+    exmp_it after_punch_it);
+
+  void _do_put_new_blobs(
+    uint32_t logical_offset,
+    uint32_t ref_end_offset,
+    blob_vec::iterator& bd_it,
+    blob_vec::iterator bd_end);
+
+  void _do_put_blobs(
+    uint32_t logical_offset,
+    uint32_t data_end_offset,
+    uint32_t ref_end_offset,
+    blob_vec& bd,
+    exmp_it after_punch_it);
+
+  std::pair<bool, uint32_t> _write_expand_l(
+    uint32_t logical_offset);
+
+  std::pair<bool, uint32_t> _write_expand_r(
+    uint32_t end_offset);
+
+  void _collect_released_allocated();
+
+  void _deferred_decision(uint32_t need_size);
+};
+
+std::ostream& operator<<(std::ostream& out, const BlueStore::Writer::blob_data_printer& printer);
+
+#endif // BLUESTORE_WRITER
-- 
2.39.5
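
[editor's appendix, not part of the patch] The test_write_divertor /
test_read_divertor hooks declared in Writer.h let unit tests capture I/O
instead of reaching a BlockDevice. A minimal sketch against the declared
interface:

    struct capture_writes : BlueStore::Writer::write_divertor {
      std::vector<std::tuple<uint64_t, bufferlist, bool>> ops;
      void write(uint64_t disk_offset, const bufferlist& data,
                 bool deferred) override {
        ops.emplace_back(disk_offset, data, deferred); // record, don't write
      }
    };
    // usage: wr.test_write_divertor = &capture;  // wr is a BlueStore::Writer
    // then run wr.do_write(...) and inspect capture.ops.
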