From d59108c0acbd87620f77f0311f371f0ef52aef31 Mon Sep 17 00:00:00 2001
From: Pere Diaz Bou
Date: Tue, 2 Jan 2024 15:04:40 +0100
Subject: [PATCH] os/bluestore: BufferSpace per Onode

Instead of indexing buffers by blob offset, we now do it by logical offset. Previously we tried using physical offsets, but that was a pretty bad idea, as it required extra bounds checks on physical extents and extra loops to add them, including chopping off buffer data.

Logical offsets don't have these problems, but they bring different ones. Bufferlist data is padded before being used in `_do_alloc_write`, which means that before adding it to BufferSpace we must take the unpadded substr_of of the data, because padded data might overflow into an invalid logical offset; in contrast, when buffers were indexed by blob offset, the padding was harmless. The same is true in `generate_read_result_bl`: we read by regions of blob extents but must translate them back to logical offsets, which means an extra loop is needed to cache the correct logical regions.

Signed-off-by: Pere Diaz Bou
---
 src/os/bluestore/BlueStore.cc | 514 +++++++++++++++-------------------
 src/os/bluestore/BlueStore.h | 67 +++--
 2 files changed, 270 insertions(+), 311 deletions(-)

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 0dc03c78e3af6..eddf065312e1f 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -25,6 +25,7 @@ #include #include +#include "common/dout.h" #include "include/cpp-btree/btree_set.h" #include "BlueStore.h" @@ -582,12 +583,6 @@ void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em) dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec << dendl; } - std::lock_guard l(e.blob->get_cache()->lock); - for (auto& i : e.blob->get_bc().buffer_map) { - dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first - << "~" << i.second.length << std::dec - << " " << i.second << dendl; - } } } @@ -617,6 +612,12 @@ void _dump_onode(CephContext *cct, const BlueStore::Onode& o) << " len " << p->second.length() << dendl; } _dump_extent_map(cct, o.extent_map); + + for (auto& i : o.bc.buffer_map) { + dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first << "~" + << i.second.length << std::dec << " " << i.second + << dendl; + } } template @@ -1653,20 +1654,29 @@ BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create( #undef dout_prefix #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") " -void BlueStore::BufferSpace::_clear(BufferCacheShard* cache) +bool BlueStore::BufferSpace::_dup_writing(TransContext* txc, BufferCacheShard* cache, OnodeRef onode) { - // note: we already hold cache->lock - ldout(cache->cct, 20) << __func__ << dendl; - while (!buffer_map.empty()) { - _rm_buffer(cache, buffer_map.begin()); + bool copied = false; + if (!writing.empty()) { + copied = true; + for (auto it = writing.begin(); it != writing.end(); ++it) { + Buffer& b = *it; + Buffer to_b(&onode->bc, b.state, b.seq, b.offset, b.data, b.flags); + BufferSpace& to = onode->bc; + ceph_assert(to_b.is_writing()); + to._add_buffer(cache, &to, std::move(to_b), b.cache_private, 0, nullptr); + txc->buffers_written.insert({onode.get(), b.offset, b.seq}); + } } + return copied; } + int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) { // note: we already hold cache->lock - ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length - << std::dec << dendl; + ldout(cache->cct, 20) << __func__ <<
std::hex << " 0x" << offset << "~" + << length << std::dec << dendl; int cache_private = 0; cache->_audit("discard start"); auto i = _data_lower_bound(offset); @@ -1676,6 +1686,12 @@ int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, u if (b->offset >= end) { break; } + // If no overlap is found between what we want to read and what we have + // cached, then forget about continuing + if (std::max(b->end(), offset + length) - std::min(b->offset, offset) > + length + b->length) { + break; + } if (b->cache_private > cache_private) { cache_private = b->cache_private; } @@ -1750,7 +1766,14 @@ void BlueStore::BufferSpace::read( for (auto i = _data_lower_bound(offset); i != buffer_map.end() && offset < end && i->first < end; ++i) { Buffer *b = &i->second; - ceph_assert(b->end() > offset); + if (b->end() <= offset) { + break; + } + + // If no overlap is found between what we want to read and what we have cached, then forget about continuing + if (std::max(b->end(), offset+length) - std::min(b->offset, offset) > length + b->length) { + break; + } bool val = false; if (flags & BYPASS_CLEAN_CACHE) @@ -1804,7 +1827,7 @@ void BlueStore::BufferSpace::read( cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes); } -void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq) +void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint32_t offset, uint64_t seq) { auto i = writing.begin(); while (i != writing.end()) { @@ -1836,28 +1859,6 @@ void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq cache->_audit("finish_write end"); } -/* - copy Buffers that are in writing queue - returns: - true if something copied - false if nothing copied -*/ -bool BlueStore::BufferSpace::_dup_writing(BufferCacheShard* cache, BufferSpace* to) -{ - bool copied = false; - if (!writing.empty()) { - copied = true; - for (auto it = writing.begin(); it != writing.end(); ++it) { - Buffer& b = *it; - ceph_assert(b.is_writing()); - to->_add_buffer(cache, to, - Buffer(to, b.state, b.seq, b.offset, b.data, b.flags), 0, - 0, nullptr); - } - } - return copied; -} - void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r) { std::lock_guard lk(cache->lock); @@ -2170,11 +2171,8 @@ BlueStore::Blob::~Blob() if (coll_cache != get_cache()) { goto again; } - bc._clear(coll_cache); coll_cache->rm_blob(); } - SharedBlob* sb = shared_blob.get(); - ceph_assert(sb || (!sb && bc.buffer_map.empty())); } void BlueStore::Blob::dump(Formatter* f) const @@ -2204,7 +2202,7 @@ ostream& operator<<(ostream& out, const BlueStore::Blob& b) return out; } -void BlueStore::Blob::discard_unallocated(Collection *coll) +void BlueStore::Blob::discard_unallocated(Collection *coll, Onode* onode, uint32_t logical_offset) { if (get_blob().is_shared()) { return; @@ -2222,17 +2220,17 @@ void BlueStore::Blob::discard_unallocated(Collection *coll) ceph_assert(discard == all_invalid); // in case of compressed blob all // or none pextents are invalid. 
if (discard) { - dirty_bc().discard(get_cache(), 0, + onode->bc.discard(onode->c->cache, logical_offset, get_blob().get_logical_length()); } } else { size_t pos = 0; for (auto e : get_blob().get_extents()) { if (!e.is_valid()) { - dout(20) << __func__ << " 0x" << std::hex << pos + dout(20) << __func__ << " 0x" << std::hex << pos << "~" << e.length - << std::dec << dendl; - dirty_bc().discard(get_cache(), pos, e.length); + << std::dec << dendl; + onode->bc.discard(onode->c->cache, logical_offset + pos, e.length); } pos += e.length; } @@ -2375,32 +2373,6 @@ bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size, #undef dout_context #define dout_context cct -// Cut Buffers that are not covered by extents. -// It happens when we punch hole in Blob, but not refill with new data. -// Normally it is not a problem (other then wasted memory), -// but when 2 Blobs are merged Buffers might collide. -// Todo: in future cut Buffers when we delete extents from Blobs, -// and get rid of this function. -void BlueStore::Blob::discard_unused_buffers(CephContext* cct, BufferCacheShard* cache) -{ - dout(25) << __func__ << " input " << *this << " bc=" << bc << dendl; - const PExtentVector& extents = get_blob().get_extents(); - uint32_t epos = 0; - auto e = extents.begin(); - while(e != extents.end()) { - if (!e->is_valid()) { - bc._discard(cache, epos, e->length); - } - epos += e->length; - ++e; - } - ceph_assert(epos <= blob.get_logical_length()); - // Preferably, we would trim up to blob.get_logical_length(), - // but we copied writing buffers (see _dup_writing) before blob logical_length is fixed. - bc._discard(cache, epos, OBJECT_MAX_SIZE - epos); - dout(25) << __func__ << " output bc=" << bc << dendl; -} - void BlueStore::Blob::dup(const Blob& from, bool copy_used_in_blob) { set_shared_blob(from.shared_blob); @@ -2857,24 +2829,6 @@ uint32_t BlueStore::Blob::merge_blob(CephContext* cct, Blob* blob_to_dissolve) // now apply freshly merged tmp_extents into dst blob dst_blob.dirty_extents().swap(tmp_extents); - // move BufferSpace buffers - while(!src->bc.buffer_map.empty()) { - auto buf = src->bc.buffer_map.extract(src->bc.buffer_map.cbegin()); - buf.mapped().space = &dst->bc; - if (dst->bc.buffer_map.count(buf.key()) == 0) { - dst->bc.buffer_map.emplace(buf.key(), std::move(buf.mapped())); - } - } - // move BufferSpace writing - auto wrt_dst_it = dst->bc.writing.begin(); - while(!src->bc.writing.empty()) { - Buffer& buf = src->bc.writing.front(); - src->bc.writing.pop_front(); - while (wrt_dst_it != dst->bc.writing.end() && wrt_dst_it->seq < buf.seq) { - ++wrt_dst_it; - } - dst->bc.writing.insert(wrt_dst_it, buf); - } dout(20) << __func__ << " result=" << *dst << dendl; return dst_blob.get_logical_length(); } @@ -2882,24 +2836,6 @@ uint32_t BlueStore::Blob::merge_blob(CephContext* cct, Blob* blob_to_dissolve) #undef dout_context #define dout_context collection->store->cct -void BlueStore::Blob::finish_write(uint64_t seq) -{ - while (true) { - auto coll = get_collection(); - BufferCacheShard *cache = coll->cache; - std::lock_guard l(cache->lock); - if (coll->cache != cache) { - dout(20) << __func__ - << " raced with sb cache update, was " << cache - << ", now " << coll->cache << ", retrying" - << dendl; - continue; - } - bc._finish_write(cache, seq); - break; - } -} - void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r) { dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec @@ -2914,7 +2850,6 @@ void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, 
Blob *r) &(r->used_in_blob)); lb.split(blob_offset, rb); - dirty_bc().split(get_cache(), blob_offset, r->dirty_bc()); dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec << " finish " << *this << dendl; @@ -3122,44 +3057,42 @@ void BlueStore::ExtentMap::make_range_shared_maybe_merge( if (e.logical_offset >= end) { break; } - dout(25) << __func__ << " src " << e - << " bc=" << e.blob->bc << dendl; - const bluestore_blob_t& blob = e.blob->get_blob(); + dout(25) << __func__ << " src " << e << " bc=" << onoderef->bc << dendl; + const bluestore_blob_t &blob = e.blob->get_blob(); // make sure it is shared if (!blob.is_shared()) { dirty_range_begin = std::min(dirty_range_begin, e.blob_start()); // first try to find a shared blob nearby // that can accomodate extra extents - uint32_t blob_width; //to signal when extents end - dout(20) << __func__ << std::hex - << " e.blob_start=" << e.blob_start() - << " e.logical_offset=" << e.logical_offset - << std::dec << dendl; - Blob* b = blob.is_compressed() ? nullptr : - find_mergable_companion(e.blob.get(), e.blob_start(), blob_width, candidates); + uint32_t blob_width; // to signal when extents end + dout(20) << __func__ << std::hex << " e.blob_start=" << e.blob_start() + << " e.logical_offset=" << e.logical_offset << std::dec << dendl; + Blob *b = blob.is_compressed() + ? nullptr + : find_mergable_companion(e.blob.get(), e.blob_start(), + blob_width, candidates); if (b) { - dout(20) << __func__ << " merging to: " << *b << " bc=" << b->bc << dendl; - e.blob->discard_unused_buffers(store->cct, c->cache); - b->discard_unused_buffers(store->cct, c->cache); - uint32_t b_logical_length = b->merge_blob(store->cct, e.blob.get()); - for (auto p : blob.get_extents()) { - if (p.is_valid()) { - b->get_shared_blob()->get_ref(p.offset, p.length); - } - } - // reblob extents might erase e - dirty_range_end = std::max(dirty_range_end, e.blob_start() + b_logical_length); - uint32_t goto_logical_offset = e.logical_offset + e.length; - reblob_extents(e.blob_start(), e.blob_start() + blob_width, + dout(20) << __func__ << " merging to: " << *b << " bc=" << onode->bc + << dendl; + uint32_t b_logical_length = b->merge_blob(store->cct, e.blob.get()); + for (auto p : blob.get_extents()) { + if (p.is_valid()) { + b->shared_blob->get_ref(p.offset, p.length); + } + } + // reblob extents might erase e + dirty_range_end = std::max(dirty_range_end, e.blob_start() + b_logical_length); + uint32_t goto_logical_offset = e.logical_offset + e.length; + reblob_extents(e.blob_start(), e.blob_start() + blob_width, e.blob, b); - ep = seek_lextent(goto_logical_offset); - dout(20) << __func__ << " merged: " << *b << dendl; + ep = seek_lextent(goto_logical_offset); + dout(20) << __func__ << " merged: " << *b << dendl; } else { - // no candidate, has to convert to shared - c->make_blob_shared(store->_assign_blobid(txc), e.blob); - ceph_assert(e.logical_end() > 0); - dirty_range_end = std::max(dirty_range_end, e.logical_end()); - ++ep; + // no candidate, has to convert to shared + c->make_blob_shared(store->_assign_blobid(txc), e.blob); + ceph_assert(e.logical_end() > 0); + dirty_range_end = std::max(dirty_range_end, e.logical_end()); + ++ep; } } else { c->load_shared_blob(e.blob->get_shared_blob()); @@ -3229,15 +3162,6 @@ void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc, e.blob->last_encoded_id = n; id_to_blob[n] = cb; e.blob->dup(*cb); - // By default do not copy buffers to clones, and let them read data by themselves. 
- // The exception are 'writing' buffers, which are not yet stable on device. - bool some_copied = e.blob->bc._dup_writing(cb->get_cache(), &cb->bc); - if (some_copied) { - // Pretend we just wrote those buffers; - // we need to get _finish_write called, so we can clear then from writing list. - // Otherwise it will be stuck until someone does write-op on clone. - txc->blobs_written.insert(cb); - } // bump the extent refs on the copied blob's extents for (auto p : blob.get_extents()) { @@ -3272,12 +3196,17 @@ void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc, txc->statfs_delta.compressed_original() += ne->length; if (blob_duped) { txc->statfs_delta.compressed() += - cb->get_blob().get_compressed_payload_length(); + cb->get_blob().get_compressed_payload_length(); } } dout(20) << __func__ << " dst " << *ne << dendl; ++n; } + // By default do not copy buffers to clones, and let them read data by + // themselves. The exception are 'writing' buffers, which are not yet + // stable on device. + oldo->bc._dup_writing(txc, onode->c->cache, newo); + if (src_dirty) { oldo->extent_map.dirty_range(dirty_range_begin, dirty_range_end - dirty_range_begin); @@ -3356,15 +3285,6 @@ void BlueStore::ExtentMap::dup_esb(BlueStore* b, TransContext* txc, cb->dirty_blob().set_flag(bluestore_blob_t::FLAG_SHARED); cb->set_shared_blob(e.blob->get_shared_blob()); } - // By default do not copy buffers to clones, and let them read data by themselves. - // The exception are 'writing' buffers, which are not yet stable on device. - bool some_copied = e.blob->bc._dup_writing(cb->get_cache(), &cb->bc); - if (some_copied) { - // Pretend we just wrote those buffers; - // we need to get _finish_write called, so we can clear then from writing list. - // Otherwise it will be stuck until someone does write-op on the clone. - txc->blobs_written.insert(cb); - } txc->write_shared_blob(e.blob->get_shared_blob()); dout(20) << __func__ << " new " << *cb << dendl; @@ -3411,6 +3331,11 @@ void BlueStore::ExtentMap::dup_esb(BlueStore* b, TransContext* txc, dout(20) << __func__ << " dst " << *ne << dendl; ++n; } + // By default do not copy buffers to clones, and let them read data by + // themselves. The exception are 'writing' buffers, which are not yet + // stable on device. + oldo->bc._dup_writing(txc, onode->c->cache, newo); + if (src_dirty) { dirty_range(dirty_range_begin, dirty_range_end - dirty_range_begin); txc->write_onode(oldo); @@ -3794,40 +3719,40 @@ void BlueStore::ExtentMap::reshard( } if (e->blob_escapes_range(shard_start, shard_end - shard_start)) { - if (!e->blob->is_spanning()) { - // We have two options: (1) split the blob into pieces at the - // shard boundaries (and adjust extents accordingly), or (2) - // mark it spanning. We prefer to cut the blob if we can. Note that - // we may have to split it multiple times--potentially at every - // shard boundary. - bool must_span = false; - BlobRef b = e->blob; - if (b->can_split()) { - uint32_t bstart = e->blob_start(); - uint32_t bend = e->blob_end(); - for (const auto& sh : shards) { - if (bstart < sh.shard_info->offset && - bend > sh.shard_info->offset) { - uint32_t blob_offset = sh.shard_info->offset - bstart; - if (b->can_split_at(blob_offset)) { - dout(20) << __func__ << " splitting blob, bstart 0x" - << std::hex << bstart << " blob_offset 0x" - << blob_offset << std::dec << " " << *b << dendl; - b = split_blob(b, blob_offset, sh.shard_info->offset); - // switch b to the new right-hand side, in case it - // *also* has to get split. 
- bstart += blob_offset; - onode->c->store->logger->inc(l_bluestore_blob_split); - } else { - must_span = true; - break; - } - } - } - } else { - must_span = true; - } - if (must_span) { + if (!e->blob->is_spanning()) { + // We have two options: (1) split the blob into pieces at the + // shard boundaries (and adjust extents accordingly), or (2) + // mark it spanning. We prefer to cut the blob if we can. Note that + // we may have to split it multiple times--potentially at every + // shard boundary. + bool must_span = false; + BlobRef b = e->blob; + if (onode->bc.writing.empty() && b->can_split()) { + uint32_t bstart = e->blob_start(); + uint32_t bend = e->blob_end(); + for (const auto& sh : shards) { + if (bstart < sh.shard_info->offset && + bend > sh.shard_info->offset) { + uint32_t blob_offset = sh.shard_info->offset - bstart; + if (b->can_split_at(blob_offset)) { + dout(20) << __func__ << " splitting blob, bstart 0x" + << std::hex << bstart << " blob_offset 0x" + << blob_offset << std::dec << " " << *b << dendl; + b = split_blob(b, blob_offset, sh.shard_info->offset); + // switch b to the new right-hand side, in case it + // *also* has to get split. + bstart += blob_offset; + onode->c->store->logger->inc(l_bluestore_blob_split); + } else { + must_span = true; + break; + } + } + } + } else { + must_span = true; + } + if (must_span) { auto bid = allocate_spanning_blob_id(); b->id = bid; spanning_blob_map[b->id] = b; @@ -5153,16 +5078,6 @@ void BlueStore::Collection::split_cache( // may not be faulted in) auto rehome_blob = [&](Blob* b) { - for (auto& i : b->bc.buffer_map) { - if (!i.second.is_writing()) { - ldout(store->cct, 1) << __func__ << " moving " << i.second - << dendl; - dest->cache->_move(cache, &i.second); - } else { - ldout(store->cct, 1) << __func__ << " not moving " << i.second - << dendl; - } - } cache->rm_blob(); dest->cache->add_blob(); SharedBlob* sb = b->get_shared_blob().get(); @@ -5182,10 +5097,23 @@ void BlueStore::Collection::split_cache( }; for (auto& e : o->extent_map.extent_map) { - e.blob->last_encoded_id = -1; + e.blob->last_encoded_id = -1; } for (auto& b : o->extent_map.spanning_blob_map) { - b.second->last_encoded_id = -1; + b.second->last_encoded_id = -1; + } + // By default do not copy buffers to clones, and let them read data by + // themselves. The exception are 'writing' buffers, which are not yet + // stable on device. 
+ for (auto &i : o->bc.buffer_map) { + if (!i.second.is_writing()) { + ldout(store->cct, 1) + << __func__ << " moving " << i.second << dendl; + dest->cache->_move(cache, &i.second); + } else { + ldout(store->cct, 1) + << __func__ << " not moving " << i.second << dendl; + } } for (auto& e : o->extent_map.extent_map) { cache->rm_extent(); @@ -11766,11 +11694,11 @@ void BlueStore::_read_cache( ready_regions_t cache_res; interval_set cache_interval; - bptr->dirty_bc().read( - bptr->get_cache(), b_off, b_len, cache_res, cache_interval, + o->bc.read( + o->c->cache, pos, b_len, cache_res, cache_interval, read_cache_policy); dout(20) << __func__ << " blob " << *bptr << std::hex - << " need 0x" << b_off << "~" << b_len + << " need 0x" << pos << "~" << b_len << " cache has 0x" << cache_interval << std::dec << dendl; @@ -11779,17 +11707,17 @@ void BlueStore::_read_cache( while (b_len > 0) { unsigned l; if (pc != cache_res.end() && - pc->first == b_off) { + pc->first == pos) { l = pc->second.length(); ready_regions[pos] = std::move(pc->second); dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x" - << b_off << "~" << l << std::dec << dendl; + << pos << "~" << l << std::dec << dendl; ++pc; } else { l = b_len; if (pc != cache_res.end()) { - ceph_assert(pc->first > b_off); - l = pc->first - b_off; + ceph_assert(pc->first > pos); + l = pc->first - pos; } dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x" << b_off << "~" << l << std::dec << dendl; @@ -11924,8 +11852,9 @@ int BlueStore::_generate_read_result_bl( if (bptr->get_blob().is_compressed()) { ceph_assert(p != compressed_blob_bls.end()); bufferlist& compressed_bl = *p++; - if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl, - r2r.front().regs.front().logical_offset) < 0) { + uint32_t offset = r2r.front().regs.front().logical_offset; + uint32_t length = r2r.front().regs.front().length; + if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl, offset) < 0) { *csum_error = true; return -EIO; } @@ -11934,8 +11863,12 @@ int BlueStore::_generate_read_result_bl( if (r < 0) return r; if (buffered) { - bptr->dirty_bc().did_read(bptr->get_cache(), 0, - raw_bl); + bufferlist region_buffer; + // todo bad offset + region_buffer.substr_of(raw_bl, offset, length); + // need offset before padding + o->bc.did_read(o->c->cache, offset, + length, std::move(region_buffer)); } for (auto& req : r2r) { for (auto& r : req.regs) { @@ -11945,18 +11878,20 @@ int BlueStore::_generate_read_result_bl( } } else { for (auto& req : r2r) { - if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl, - req.regs.front().logical_offset) < 0) { + uint64_t offset = r2r.front().regs.front().logical_offset; + if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl, offset) < 0) { *csum_error = true; return -EIO; } - if (buffered) { - bptr->dirty_bc().did_read(bptr->get_cache(), - req.r_off, req.bl); - } // prune and keep result for (const auto& r : req.regs) { + if (buffered) { + bufferlist region_buffer; + region_buffer.substr_of(req.bl, r.front, r.length); + // need offset before padding + o->bc.did_read(o->c->cache, r.logical_offset, r.length, std::move(region_buffer)); + } ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length); } } @@ -13798,10 +13733,11 @@ void BlueStore::_txc_finish(TransContext *txc) dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; ceph_assert(txc->get_state() == TransContext::STATE_FINISHING); - for (auto& sb : txc->blobs_written) { - sb->finish_write(txc->seq); + for (auto& [onode, 
offset, seq] : txc->buffers_written) { + onode->bc._finish_write(onode->c->cache, offset, seq); } - txc->blobs_written.clear(); + txc->buffers_written.clear(); + while (!txc->removed_collections.empty()) { _queue_reap_collection(txc->removed_collections.front()); txc->removed_collections.pop_front(); @@ -15468,45 +15404,46 @@ void BlueStore::_do_write_small( uint64_t b_off = offset - head_pad - bstart; uint64_t b_len = length + head_pad + tail_pad; - // direct write into unused blocks of an existing mutable blob? - if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) && - b->get_blob().get_ondisk_length() >= b_off + b_len && - b->get_blob().is_unused(b_off, b_len) && - b->get_blob().is_allocated(b_off, b_len)) { - _apply_padding(head_pad, tail_pad, bl); - - dout(20) << __func__ << " write to unused 0x" << std::hex - << b_off << "~" << b_len - << " pad 0x" << head_pad << " + 0x" << tail_pad - << std::dec << " of mutable " << *b << dendl; - _buffer_cache_write(txc, b, b_off, bl, - wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); - - if (!g_conf()->bluestore_debug_omit_block_device_write) { - if (b_len < prefer_deferred_size) { - dout(20) << __func__ << " deferring small 0x" << std::hex + // direct write into unused blocks of an existing mutable blob? + if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) && + b->get_blob().get_ondisk_length() >= b_off + b_len && + b->get_blob().is_unused(b_off, b_len) && + b->get_blob().is_allocated(b_off, b_len)) { + _buffer_cache_write(txc, b, o, offset, bl, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + _apply_padding(head_pad, tail_pad, bl); + + dout(20) << __func__ << " write to unused 0x" << std::hex << b_off + << "~" << b_len << " pad 0x" << head_pad << " + 0x" + << tail_pad << std::dec << " of mutable " << *b << dendl; + _buffer_cache_write(txc, b, o, offset - head_pad, bl, + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); + + if (!g_conf()->bluestore_debug_omit_block_device_write) { + if (b_len < prefer_deferred_size) { + dout(20) << __func__ << " deferring small 0x" << std::hex << b_len << std::dec << " unused write via deferred" << dendl; - bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length()); - op->op = bluestore_deferred_op_t::OP_WRITE; - b->get_blob().map( + bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length()); + op->op = bluestore_deferred_op_t::OP_WRITE; + b->get_blob().map( b_off, b_len, - [&](uint64_t offset, uint64_t length) { - op->extents.emplace_back(bluestore_pextent_t(offset, length)); - return 0; - }); - op->data = bl; - } else { - b->get_blob().map_bl( - b_off, bl, + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + op->data = bl; + } else { + b->get_blob().map_bl( + b_off, bl, [&](uint64_t offset, bufferlist& t) { - bdev->aio_write(offset, t, + bdev->aio_write(offset, t, &txc->ioc, wctx->buffered); - }); - } - } - b->dirty_blob().calc_csum(b_off, bl); - dout(20) << __func__ << " lex old " << *ep << dendl; - Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length, + }); + } + } + b->dirty_blob().calc_csum(b_off, bl); + dout(20) << __func__ << " lex old " << *ep << dendl; + Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length, b, &wctx->old_extents); b->dirty_blob().mark_used(le->blob_offset, le->length); @@ -15568,36 +15505,38 @@ void BlueStore::_do_write_small( } logger->inc(l_bluestore_write_small_pre_read); - _buffer_cache_write(txc, b, b_off, bl, - wctx->buffered ? 
0 : Buffer::FLAG_NOCACHE); + bufferlist without_padding; + without_padding.substr_of(bl, head_pad, bl.length() - head_pad); + _buffer_cache_write(txc, b, o, offset - head_read, std::move(without_padding), + wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); - b->dirty_blob().calc_csum(b_off, bl); + b->dirty_blob().calc_csum(b_off, bl); - if (!g_conf()->bluestore_debug_omit_block_device_write) { - bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length()); - op->op = bluestore_deferred_op_t::OP_WRITE; - int r = b->get_blob().map( - b_off, b_len, + if (!g_conf()->bluestore_debug_omit_block_device_write) { + bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length()); + op->op = bluestore_deferred_op_t::OP_WRITE; + int r = b->get_blob().map( + b_off, b_len, [&](uint64_t offset, uint64_t length) { - op->extents.emplace_back(bluestore_pextent_t(offset, length)); - return 0; - }); - ceph_assert(r == 0); - op->data = std::move(bl); - dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~" + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + ceph_assert(r == 0); + op->data = std::move(bl); + dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~" << b_len << std::dec << " of mutable " << *b << " at " << op->extents << dendl; - } + } - Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length, + Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length, b, &wctx->old_extents); - b->dirty_blob().mark_used(le->blob_offset, le->length); - txc->statfs_delta.stored() += le->length; - dout(20) << __func__ << " lex " << *le << dendl; - return; - } - // try to reuse blob if we can - if (b->can_reuse_blob(min_alloc_size, + b->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + return; + } + // try to reuse blob if we can + if (b->can_reuse_blob(min_alloc_size, max_bsize, offset0 - bstart, &alloc_len)) { @@ -15856,7 +15795,7 @@ void BlueStore::_do_write_big_apply_deferred( logger->inc(l_bluestore_write_penalty_read_ops); } auto& b0 = dctx.blob_ref; - _buffer_cache_write(txc, b0, dctx.b_off, bl, + _buffer_cache_write(txc, b0, o, dctx.off - dctx.head_read, bl, wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); b0->dirty_blob().calc_csum(dctx.b_off, bl); @@ -16393,7 +16332,9 @@ int BlueStore::_do_alloc_write( wi.b->dirty_blob().mark_used(le->blob_offset, le->length); txc->statfs_delta.stored() += le->length; dout(20) << __func__ << " lex " << *le << dendl; - _buffer_cache_write(txc, wi.b, b_off, wi.bl, + bufferlist without_pad; + without_pad.substr_of(wi.bl, wi.b_off0-wi.b_off, wi.length0); + _buffer_cache_write(txc, wi.b, o, wi.logical_offset, std::move(without_pad), wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); // queue io @@ -16444,7 +16385,7 @@ void BlueStore::_wctx_finish( const bluestore_blob_t& blob = b->get_blob(); if (blob.is_compressed()) { if (lo.blob_empty) { - txc->statfs_delta.compressed() -= blob.get_compressed_payload_length(); + txc->statfs_delta.compressed() -= blob.get_compressed_payload_length(); } txc->statfs_delta.compressed_original() -= lo.e.length; } @@ -16480,7 +16421,7 @@ void BlueStore::_wctx_finish( // longer allocated. Note that this will leave behind edge bits // that are no longer referenced but not deallocated (until they // age out of the cache naturally). 
- b->discard_unallocated(c.get()); + // b->discard_unallocated(c.get(), o.get(), lo.e.logical_offset); for (auto e : r) { dout(20) << __func__ << " release " << e << dendl; txc->released.insert(e.offset, e.length); @@ -16887,6 +16828,7 @@ void BlueStore::_do_truncate( WriteContext wctx; if (offset < o->onode.size) { uint64_t length = o->onode.size - offset; + o->bc.discard(o->c->cache, offset, length); o->extent_map.fault_range(db, offset, length); o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); o->extent_map.dirty_range(offset, length); @@ -17486,6 +17428,8 @@ int BlueStore::_do_clone_range( << " 0x" << dstoff << "~" << length << std::dec << dendl; oldo->extent_map.fault_range(db, srcoff, length); newo->extent_map.fault_range(db, dstoff, length); + // it is possible the onode had previous buffers written + newo->bc.discard(c->cache, dstoff, length); _dump_onode<30>(cct, *oldo); _dump_onode<30>(cct, *newo); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 2932a59b89b2b..1296943e0a4bc 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -263,7 +263,9 @@ public: struct BufferSpace; struct Collection; + struct Onode; typedef boost::intrusive_ptr CollectionRef; + typedef boost::intrusive_ptr OnodeRef; struct AioContext { virtual void aio_finish(BlueStore *store) = 0; @@ -317,6 +319,10 @@ public: unsigned f = 0) : space(space), state(s), flags(f), seq(q), offset(o), length(b.length()), data(b) {} + Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, ceph::buffer::list&& b, + unsigned f = 0) + : space(space), state(s), flags(f), seq(q), offset(o), + length(b.length()), data(b) {} Buffer(Buffer &&other) { std::swap(space, other.space); @@ -474,6 +480,16 @@ public: } int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length); + void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, ceph::buffer::list&& bl, + unsigned flags) { + std::lock_guard l(cache->lock); + Buffer b(this, Buffer::STATE_WRITING, seq, offset, std::move(bl), + flags); + uint16_t cache_private = _discard(cache, offset, bl.length()); + _add_buffer(cache, this, Buffer(this, Buffer::STATE_WRITING, seq, offset, std::move(bl), + flags), cache_private, (flags & Buffer::FLAG_NOCACHE) ? 
0 : 1, nullptr); + cache->_trim(); + } void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, ceph::buffer::list& bl, unsigned flags) { std::lock_guard l(cache->lock); @@ -485,8 +501,8 @@ public: nullptr); cache->_trim(); } - void _finish_write(BufferCacheShard* cache, uint64_t seq); - void did_read(BufferCacheShard* cache, uint32_t offset, ceph::buffer::list& bl) { + void _finish_write(BufferCacheShard* cache, uint32_t offset, uint64_t seq); + void did_read(BufferCacheShard* cache, uint32_t offset, uint32_t length, ceph::buffer::list&& bl) { std::lock_guard l(cache->lock); uint16_t cache_private = _discard(cache, offset, bl.length()); _add_buffer( @@ -505,7 +521,7 @@ public: discard(cache, offset, (uint32_t)-1 - offset); } - bool _dup_writing(BufferCacheShard* cache, BufferSpace* to); + bool _dup_writing(TransContext* txc, BufferCacheShard* cache, OnodeRef onode); void split(BufferCacheShard* cache, size_t pos, BufferSpace &r); void dump(BufferCacheShard* cache, ceph::Formatter *f) const { @@ -647,7 +663,6 @@ public: ceph_assert(get_cache()); } Blob(CollectionRef collection) : collection(collection) {} - BufferSpace bc; private: SharedBlobRef shared_blob; ///< shared blob state (if any) mutable bluestore_blob_t blob; ///< decoded blob metadata @@ -694,8 +709,7 @@ public: bool can_split() { std::lock_guard l(get_cache()->lock); // splitting a BufferSpace writing list is too hard; don't try. - return get_bc().writing.empty() && - used_in_blob.can_split() && + return used_in_blob.can_split() && get_blob().can_split(); } @@ -735,26 +749,15 @@ public: #endif return blob; } - /// clear buffers from unused sections - void discard_unused_buffers(CephContext* cct, BufferCacheShard* cache); - - inline const BufferSpace& get_bc() const { - return bc; - } - inline BufferSpace& dirty_bc() { - return bc; - } /// discard buffers for unallocated regions - void discard_unallocated(Collection *coll); + void discard_unallocated(Collection *coll, BlueStore::Onode *onode, uint32_t logical_offset); /// get logical references void get_ref(Collection *coll, uint32_t offset, uint32_t length); /// put logical references, and get back any released extents bool put_ref(Collection *coll, uint32_t offset, uint32_t length, PExtentVector *r); - // update caches to reflect content up to seq - void finish_write(uint64_t seq); /// split the blob void split(Collection *coll, uint32_t blob_offset, Blob *o); @@ -948,14 +951,12 @@ public: boost::intrusive::list_member_hook<>, &OldExtent::old_extent_item> > old_extent_map_t; - struct Onode; /// a sharded extent map, mapping offsets to lextents to blobs struct ExtentMap { Onode *onode; extent_map_t extent_map; ///< map of Extents to Blobs blob_map_t spanning_blob_map; ///< blobs that span shards - typedef boost::intrusive_ptr OnodeRef; struct Shard { bluestore_onode_t::shard_info *shard_info = nullptr; @@ -1328,6 +1329,7 @@ public: /// (it can be pinned and hence physically out /// of it at the moment though) ExtentMap extent_map; + BufferSpace bc; ///< buffer cache // track txc's that have not been committed to kv store (and whose // effects cannot be read via the kvdb read methods) @@ -1436,7 +1438,6 @@ public: private: void _decode(const ceph::buffer::list& v); }; - typedef boost::intrusive_ptr OnodeRef; /// A generic Cache Shard struct CacheShard { @@ -1894,7 +1895,8 @@ private: std::set modified_objects; ///< objects we modified (and need a ref) std::set shared_blobs; ///< these need to be updated/written - std::set blobs_written; ///< update these on io 
completion + std::set> buffers_written; ///< update these on io completion (buffer -> offset, seq pair) + KeyValueDB::Transaction t; ///< then we will commit this std::list oncommits; ///< more commit completions std::list removed_collections; ///< colls we removed @@ -2884,13 +2886,26 @@ private: void _buffer_cache_write( TransContext *txc, - BlobRef b, - uint64_t offset, + BlobRef blob, + OnodeRef onode, + uint32_t offset, + ceph::buffer::list&& bl, + unsigned flags) { + onode->bc.write(onode->c->cache, txc->seq, offset, std::move(bl), + flags); + txc->buffers_written.insert({onode.get(), offset, txc->seq}); + } + + void _buffer_cache_write( + TransContext *txc, + BlobRef blob, + OnodeRef onode, + uint32_t offset, ceph::buffer::list& bl, unsigned flags) { - b->dirty_bc().write(b->get_cache(), txc->seq, offset, bl, + onode->bc.write(onode->c->cache, txc->seq, offset, bl, flags); - txc->blobs_written.insert(b); + txc->buffers_written.insert({onode.get(), offset, txc->seq}); } int _collection_list( -- 2.39.5
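The padding rule described in the commit message can be illustrated with a small self-contained sketch. This is not BlueStore code: std::string stands in for ceph::buffer::list, substr() for substr_of(), and ToyBufferMap/cache_write are hypothetical stand-ins for the per-Onode BufferSpace::buffer_map and the write path; chunk size and offsets are made up for illustration.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// logical offset -> cached bytes; a stand-in for the per-onode
// BufferSpace::buffer_map keyed by logical (not blob) offset.
using ToyBufferMap = std::map<uint32_t, std::string>;

// Cache a write whose bufferlist was padded out to chunk alignment.
// 'padded' covers [logical_off - head_pad, logical_off + length + tail_pad);
// only the middle 'length' bytes belong at 'logical_off'.
void cache_write(ToyBufferMap& bc, uint32_t logical_off,
                 const std::string& padded,
                 uint32_t head_pad, uint32_t length) {
  // bufferlist::substr_of(padded, head_pad, length) in disguise: drop the
  // pad so no bytes are cached at logical offsets the object does not own.
  bc[logical_off] = padded.substr(head_pad, length);
}

int main() {
  constexpr uint32_t chunk = 4096;
  uint32_t logical_off = 4098;              // user write target
  std::string data(10, 'D');                // 10 bytes of payload

  uint32_t head_pad = logical_off % chunk;  // 2 bytes of leading pad
  uint32_t tail_pad = (chunk - (head_pad + data.size()) % chunk) % chunk;
  std::string padded = std::string(head_pad, '\0') + data +
                       std::string(tail_pad, '\0');  // what goes to disk

  ToyBufferMap bc;
  cache_write(bc, logical_off, padded, head_pad, data.size());

  // Exactly [4098, 4108) is cached; the pad never leaks into
  // neighbouring logical offsets.
  for (auto& [off, buf] : bc)
    std::cout << "cached 0x" << std::hex << off << "~" << buf.size() << "\n";
}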
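A matching sketch of the read side's "extra loop": a blob read returns one contiguous buffer, but caching is keyed by logical offset, so each region has to be carved out and inserted individually. Region, cache_blob_read, and the field names are illustrative only, loosely following the patch's region2read bookkeeping rather than the real types.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// One readable region of a blob: 'front' is the byte offset of the region
// inside the returned blob buffer, 'logical_offset' is where those bytes
// live in the object.
struct Region {
  uint32_t logical_offset;
  uint32_t front;
  uint32_t length;
};

using ToyBufferMap = std::map<uint32_t, std::string>;

// Translate blob-extent regions back to logical offsets before caching.
void cache_blob_read(ToyBufferMap& bc, const std::string& blob_bl,
                     const std::vector<Region>& regions) {
  for (const auto& r : regions)
    // roughly: region_buffer.substr_of(req.bl, r.front, r.length)
    bc[r.logical_offset] = blob_bl.substr(r.front, r.length);
}

int main() {
  // One blob buffer satisfying two discontiguous logical regions.
  std::string blob_bl = "AAAABBBBCCCC";
  std::vector<Region> regs = {
    {0x1000, 0, 4},   // "AAAA" lives at logical 0x1000
    {0x3000, 8, 4},   // "CCCC" lives at logical 0x3000
  };

  ToyBufferMap bc;
  cache_blob_read(bc, blob_bl, regs);
  for (auto& [off, buf] : bc)
    std::cout << "0x" << std::hex << off << " -> " << buf << "\n";
}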
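Finally, the early-exit condition added to BufferSpace::_discard() and BufferSpace::read() is the usual interval-disjointness test: two ranges overlap (or at least touch) exactly when the span of their union is no larger than the sum of their lengths. A tiny standalone version, keeping the patch's strict '>' so that merely adjacent ranges do not stop the scan:

#include <algorithm>
#include <cstdint>
#include <iostream>

// [a_off, a_off+a_len) and [b_off, b_off+b_len) are disjoint with a real
// gap between them iff the span of their union exceeds the sum of lengths.
bool disjoint(uint32_t a_off, uint32_t a_len,
              uint32_t b_off, uint32_t b_len) {
  uint32_t span = std::max(a_off + a_len, b_off + b_len) -
                  std::min(a_off, b_off);
  return span > a_len + b_len;
}

int main() {
  std::cout << disjoint(0, 4, 8, 4) << "\n";  // 1: gap at [4,8)
  std::cout << disjoint(0, 4, 4, 4) << "\n";  // 0: adjacent, span == sum
  std::cout << disjoint(0, 8, 4, 4) << "\n";  // 0: overlapping
}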