From: Adam Kupczyk Date: Thu, 4 May 2023 08:50:10 +0000 (+0000) Subject: os/bluestore: Add UT for ExtentMap::dup_esb() X-Git-Tag: v19.0.0~486^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=310bd7e685caa6b6c800021c293f85867a93660e;p=ceph.git os/bluestore: Add UT for ExtentMap::dup_esb() Add unit tests for ExtentMap::dup_esb() to test_bluestore_types. Extra hooks added to BlueStore: 1) bluestore_extent_ref_map_t::debug_peek inspects how many times specific AU is used 2) ExtentMap::debug_list_disk_layout to extract how onode maps to AUs 3) BlueStore::debug_punch_hole to use logic from _wctx_finish internal function Unit tests are limited to operations aligned to allocation unit. Signed-off-by: Adam Kupczyk --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 3d464eb67b94..3e016c9352a1 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4512,6 +4512,94 @@ BlueStore::BlobRef BlueStore::ExtentMap::split_blob( return rb; } +BlueStore::ExtentMap::debug_au_vector_t +BlueStore::ExtentMap::debug_list_disk_layout() +{ + BlueStore::ExtentMap::debug_au_vector_t res; + uint32_t l_pos = 0; + for (auto ep = extent_map.begin(); ep != extent_map.end(); ++ep) { + if (l_pos < ep->logical_offset) { + // a hole in logical mapping, mark it + res.emplace_back(-1ULL, ep->logical_offset - l_pos, 0, 0); + } + l_pos = ep->logical_offset + ep->length; + const bluestore_blob_t& bblob = ep->blob->get_blob(); + uint32_t chunk_size = bblob.get_chunk_size(onode->c->store->block_size); + uint32_t length_left = ep->length; + + bluestore_extent_ref_map_t* ref_map = nullptr; + if (bblob.is_shared()) { + ceph_assert(ep->blob->shared_blob->is_loaded()); + bluestore_shared_blob_t* bsblob = ep->blob->shared_blob->persistent; + ref_map = &bsblob->ref_map; + } + + unsigned csum_i = 0; + size_t csum_cnt; + uint32_t length; + if (bblob.has_csum()) { + csum_cnt = bblob.get_csum_count(); + uint32_t 
csum_chunk_size = bblob.get_csum_chunk_size(); + uint64_t csum_offset_align = p2align(ep->blob_offset, csum_chunk_size); + csum_i = csum_offset_align / csum_chunk_size; + // size of first chunk + length = p2align(ep->blob_offset + csum_chunk_size, csum_chunk_size) - ep->blob_offset; + length = std::min(length_left, length); + if (csum_chunk_size < chunk_size) { + chunk_size = csum_chunk_size; + } + } else { + length = p2align(ep->blob_offset + chunk_size, chunk_size) - ep->blob_offset; + length = std::min(length_left, length); + } + + uint32_t bo = ep->blob_offset; + while (length_left > 0) { + uint64_t csum_val = 0; + if (bblob.has_csum()) { + ceph_assert(csum_cnt > csum_i); + csum_val = bblob.get_csum_item(csum_i); + ++csum_i; + } + //extract AU from extents + uint64_t disk_extent_left; // length till the end of disk extent + uint64_t disk_offset = bblob.calc_offset(bo, &disk_extent_left); + bluestore_extent_ref_map_t::debug_len_cnt l_c = {0, std::numeric_limits::max()}; + if (bblob.is_shared()) { + l_c = ref_map->debug_peek(disk_offset); + if (l_c.len < length) { + length = l_c.len; + } + } + res.emplace_back(disk_offset, length, csum_val, l_c.cnt); + bo += length; + length_left -= length; + length = chunk_size; + }; + } + return res; +} + +std::ostream& operator<<(std::ostream& out, const BlueStore::ExtentMap::debug_au_vector_t& auv) +{ + out << "["; + for (size_t i = 0; i < auv.size(); ++i) { + if (i != 0) { + out << " "; + } + out << "0x" << std::hex; + if (auv[i].disk_offset != -1ULL) { + out << auv[i].disk_offset << "~" << auv[i].disk_length + << "(" << std::dec << int32_t(auv[i].ref_cnts) + << "):" << std::hex << auv[i].chksum; + } else { + out << "~" << auv[i].disk_length << std::dec; + } + } + out << "]" << std::dec; + return out; +} + // Onode #undef dout_prefix @@ -16775,7 +16863,8 @@ void BlueStore::_wctx_finish( set *maybe_unshared_blobs) { #ifdef HAVE_LIBZBD - if (bdev->is_smr()) { + bool is_smr = bdev && bdev->is_smr(); + if (is_smr) { for (auto& w 
: wctx->writes) { for (auto& e : w.b->get_blob().get_extents()) { if (!e.is_valid()) { @@ -16823,7 +16912,7 @@ void BlueStore::_wctx_finish( unshare_ptr); #ifdef HAVE_LIBZBD // we also drop zone ref for shared blob extents - if (bdev->is_smr() && e.is_valid()) { + if (is_smr && e.is_valid()) { zones_with_releases.insert(e.offset / zone_size); } #endif @@ -16854,7 +16943,7 @@ void BlueStore::_wctx_finish( txc->statfs_delta.compressed_allocated() -= e.length; } #ifdef HAVE_LIBZBD - if (bdev->is_smr() && e.is_valid()) { + if (is_smr && e.is_valid()) { zones_with_releases.insert(e.offset / zone_size); } #endif diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index c5d548f8b540..c795af015be4 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1112,6 +1112,28 @@ public: /// split a blob (and referring extents) BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos); + + /// allocation unit status + struct debug_au_state_t { + uint64_t disk_offset; //< offset of the data on disk (in bytes) + uint32_t disk_length; //< length of the data on disk + // ; + /// Produces a sequence of allocation units representing logical offsets. + /// If there is a discontinuity, it is encoded as disk_offset==-1. 
+ debug_au_vector_t debug_list_disk_layout(); + + friend std::ostream& operator<<(std::ostream& out, const debug_au_vector_t& auv); }; /// Compressed Blob Garbage collector @@ -3352,6 +3374,18 @@ public: bool has_builtin_csum() const override { return true; } + // a debug punch_hole function, to use internals of _wctx_finish + // to remove old_extents from object + void debug_punch_hole( + CollectionRef& c, + OnodeRef& o, + uint32_t off, + uint32_t len) { + BlueStore::TransContext txc(cct, c.get(), nullptr, nullptr); + BlueStore::WriteContext wctx; + o->extent_map.punch_hole(c, off, len, &wctx.old_extents); + _wctx_finish(&txc, c, o, &wctx, nullptr); + } inline void log_latency(const char* name, int idx, diff --git a/src/os/bluestore/bluestore_types.cc b/src/os/bluestore/bluestore_types.cc index 229215fc0783..a3d0d41acb54 100644 --- a/src/os/bluestore/bluestore_types.cc +++ b/src/os/bluestore/bluestore_types.cc @@ -279,6 +279,42 @@ out: } } +/// Inspects reference region at specified offset. +/// If reference region is located, returns its reference count and +/// distance from offset to the end of the region. +/// If reference region is not located, returns 0 reference count and +/// distance to the next region. +/// If offset is beyond last region, returns 0 for both +/// reference count and the distance. 
+bluestore_extent_ref_map_t::debug_len_cnt bluestore_extent_ref_map_t::debug_peek(uint64_t offset) const +{ + // locate offset + auto p = ref_map.lower_bound(offset); + if (p != ref_map.end() && p->first == offset) { + // direct request for us + return {p->second.length, p->second.refs}; + } + if (p != ref_map.begin()) { + --p; + if (p->first + p->second.length <= offset) { + // nah, it ends too soon, we landed in a hole + ++p; + if (p != ref_map.end()) { + // there is a region after + return {uint32_t(p->first - offset), 0}; + } else { + // nothing after + return {0, 0}; + } + } else { + // we're in the range + return {uint32_t((p->first + p->second.length) - offset), p->second.refs}; + } + } else { + return {p != ref_map.end() ? uint32_t(p->first - offset) : 0u, 0}; // p is begin(); it equals end() only for an empty map, which must not be dereferenced + } +} + bool bluestore_extent_ref_map_t::contains(uint64_t offset, uint32_t length) const { auto p = ref_map.lower_bound(offset); diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index 040f76723797..8a0e0b7c9f71 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -176,6 +176,11 @@ struct bluestore_extent_ref_map_t { void get(uint64_t offset, uint32_t len); void put(uint64_t offset, uint32_t len, PExtentVector *release, bool *maybe_unshared); + struct debug_len_cnt { + uint32_t len; // length for which cnt is valid + uint32_t cnt; // reference count for the region + }; + debug_len_cnt debug_peek(uint64_t offset) const; bool contains(uint64_t offset, uint32_t len) const; bool intersects(uint64_t offset, uint32_t len) const; @@ -845,6 +850,28 @@ public: ceph_abort_msg("unrecognized csum word size"); } } + void set_csum_item(unsigned i, uint64_t val) { + size_t cs = get_csum_value_size(); + char *p = csum_data.c_str(); + switch (cs) { + case 0: + ceph_abort_msg("no csum data, bad index"); + case 1: + reinterpret_cast(p)[i] = val; + break; + case 2: + reinterpret_cast(p)[i] = val; + break; + case 4: + reinterpret_cast(p)[i] = val; + break; + case 8: + 
reinterpret_cast(p)[i] = val; + break; + default: + ceph_abort_msg("unrecognized csum word size"); + } + } const char *get_csum_item_ptr(unsigned i) const { size_t cs = get_csum_value_size(); return csum_data.c_str() + (cs * i); diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc index eac7b223cdbd..2d10d3839ae6 100644 --- a/src/test/objectstore/test_bluestore_types.cc +++ b/src/test/objectstore/test_bluestore_types.cc @@ -1284,6 +1284,380 @@ TEST(ExtentMap, compress_extent_map) ASSERT_EQ(6u, em.extent_map.size()); } +class ExtentMapFixture : virtual public ::testing::Test { + +public: + BlueStore store; + BlueStore::OnodeCacheShard *oc; + BlueStore::BufferCacheShard *bc; + BlueStore::CollectionRef coll; + + static constexpr uint32_t au_size = 4096; + uint32_t blob_size = 65536; + size_t csum_order = 12; //2^12 = 4096 bytes + + struct au { + uint32_t chksum; + uint32_t refs; + }; + std::vector disk; + + // test onode that glues some simplifications in representation + // with actual BlueStore's onode + struct t_onode { + BlueStore::OnodeRef onode; //actual BS onode + std::vector data; //map to AUs + static constexpr uint32_t empty = std::numeric_limits::max(); + }; + void print(std::ostream& out, t_onode& onode) + { + for (size_t i = 0; i < onode.data.size(); ++i) { + if (i != 0) out << " "; + if (onode.data[i] == t_onode::empty) { + out << "-"; + } else { + out << std::hex << onode.data[i] + << "/" << disk[onode.data[i]].chksum + << ":" << std::dec << disk[onode.data[i]].refs; + } + } + } + explicit ExtentMapFixture() + : store(g_ceph_context, "", au_size) + { + oc = BlueStore::OnodeCacheShard::create(g_ceph_context, "lru", NULL); + bc = BlueStore::BufferCacheShard::create(g_ceph_context, "lru", NULL); + coll = ceph::make_ref(&store, oc, bc, coll_t()); + } + + void SetUp() override { + } + void TearDown() override { + } + + // takes new space from disk, initializes csums + // returns index of first au + uint32_t 
allocate(uint32_t num_au) { + uint32_t pos = disk.size(); + disk.resize(pos + num_au); + for (uint32_t i = 0; i < num_au; i++) { + uint32_t p = pos + i; + disk[p].chksum = 2 * p + 1; + disk[p].refs = 0; + } + return pos; + } + void release(uint32_t& au_idx) { + if (au_idx != t_onode::empty) { + disk_unref(au_idx); + } + au_idx = t_onode::empty; + } + void disk_ref(uint32_t au_idx) { + ++disk[au_idx].refs; + } + void disk_unref(uint32_t au_idx) { + ceph_assert(disk[au_idx].refs > 0); + --disk[au_idx].refs; + } + + t_onode create() { + t_onode res; + res.onode = new BlueStore::Onode(coll.get(), ghobject_t(), ""); + return res; + } + + void fillup(t_onode& onode, uint32_t end) { + if (end > onode.data.size()) { + size_t e = onode.data.size(); + onode.data.resize(end); + for (; e < end; ++e) { + onode.data[e] = t_onode::empty; + } + } + } + void punch_hole(t_onode& onode, uint32_t off, uint32_t len) { + ceph_assert((off % au_size) == 0); + ceph_assert((len % au_size) == 0); + uint32_t i = off / au_size; + uint32_t end = (off + len) / au_size; + fillup(onode, end); + while (i < end && i < onode.data.size()) { + if (onode.data[i] != t_onode::empty) + release(onode.data[i]); + onode.data[i] = t_onode::empty; + i++; + } + store.debug_punch_hole(coll, onode.onode, off, len); + } + + void write(t_onode& onode, uint32_t off, uint32_t len) { + ceph_assert((off % au_size) == 0); + ceph_assert((len % au_size) == 0); + punch_hole(onode, off, len); + + uint32_t i = off / au_size; + uint32_t end = (off + len) / au_size; + fillup(onode, end); + + uint32_t au_idx = allocate(end - i); + uint32_t idx = au_idx; + while (i < end) { + onode.data[i] = idx; + disk_ref(idx); + ++idx; + ++i; + } + + // below simulation of write performed by BlueStore::do_write() + auto helper_blob_write = [&]( + uint32_t log_off, // logical offset of blob to put to onode + uint32_t empty_aus, // amount of unreferenced aus in the beginning + uint32_t first_au, // first au that will be referenced + uint32_t 
num_aus // number of aus, first, first+1.. first+num_au-1 + ) { + uint32_t blob_length = (empty_aus + num_aus) * au_size; + BlueStore::BlobRef b(new BlueStore::Blob); + b->shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& bb = b->dirty_blob(); + bb.init_csum(Checksummer::CSUM_CRC32C, csum_order, blob_length); + for(size_t i = 0; i < num_aus; ++i) { + bb.set_csum_item(empty_aus + i, disk[first_au + i].chksum); + } + + PExtentVector pextents; + pextents.emplace_back(first_au * au_size, num_aus * au_size); + bb.allocated(empty_aus * au_size, num_aus * au_size, pextents); + + auto *ext = new BlueStore::Extent(log_off, empty_aus * au_size, + num_aus * au_size, b); + onode.onode->extent_map.extent_map.insert(*ext); + b->get_ref(coll.get(), empty_aus * au_size, num_aus * au_size); + bb.mark_used(empty_aus * au_size, num_aus * au_size); + }; + + size_t off_blob_aligned = p2align(off, blob_size); + size_t off_blob_roundup = p2align(off + blob_size, blob_size); + uint32_t skip_aus = (off - off_blob_aligned) / au_size; + size_t l = std::min(off_blob_roundup - off, len); + uint32_t num_aus = l / au_size; + + while (len > 0) { + helper_blob_write(off, skip_aus, au_idx, num_aus); + skip_aus = 0; + au_idx += num_aus; + len -= num_aus * au_size; + off += (skip_aus + num_aus) * au_size; + l = std::min(blob_size, len); + num_aus = l / au_size; + }; + } + + void dup(t_onode& ofrom, t_onode& oto, uint64_t off, uint64_t len) { + ceph_assert((off % au_size) == 0); + ceph_assert((len % au_size) == 0); + punch_hole(oto, off, len); + + uint32_t i = off / au_size; + uint32_t end = (off + len) / au_size; + fillup(ofrom, end); + ceph_assert(end <= ofrom.data.size()); + while (i < end) { + oto.data[i] = ofrom.data[i]; + if (oto.data[i] != t_onode::empty) { + disk_ref(oto.data[i]); + } + ++i; + } + BlueStore::TransContext txc(store.cct, coll.get(), nullptr, nullptr); + ofrom.onode->extent_map.dup_esb(&store, &txc, coll, ofrom.onode, oto.onode, off, len, off); + } + + 
int32_t compare(t_onode& onode) { + BlueStore::ExtentMap::debug_au_vector_t debug = + onode.onode->extent_map.debug_list_disk_layout(); + size_t pos = 0; + for (size_t i = 0; i < debug.size(); ++i) { + if (debug[i].disk_offset == -1ULL) { + size_t len = debug[i].disk_length; + size_t l = len / au_size; + if (pos + l > onode.data.size()) { + return pos + l; + } + while (l > 0) { + if (onode.data[pos] != t_onode::empty) { + return pos; + } + --l; + ++pos; + }; + } else { + ceph_assert(pos < onode.data.size()); + uint32_t au = onode.data[pos]; + if (debug[i].disk_offset != au * au_size || + debug[i].disk_length != au_size || + debug[i].chksum != disk[au].chksum) { + return pos; + } + if ((int32_t)debug[i].ref_cnts == -1) { + if (disk[au].refs != 1) { + return pos; + } + } else { + if (disk[au].refs != debug[i].ref_cnts) { + return pos; + } + } + ++pos; + } + } + // remaining aus must be empty + while (pos < onode.data.size()) { + if (onode.data[pos] != t_onode::empty) { + return pos; + } + ++pos; + } + return -1; + } + + bool check(t_onode& onode) { + int32_t res = compare(onode); + if (res != -1) { + cout << "Discrepancy at 0x" << std::hex << res * au_size << std::dec << std::endl; + cout << "Simulated: "; + print(cout, onode); + cout << std::endl; + cout << "Onode: " << onode.onode->extent_map.debug_list_disk_layout() << std::endl; + return false; + } + return true; + } + void print(t_onode& onode) { + cout << "Simulated: "; + print(cout, onode); + cout << std::endl; + cout << "Onode: " << onode.onode->extent_map.debug_list_disk_layout() << std::endl; + } +}; + +TEST_F(ExtentMapFixture, walk) +{ + std::vector X; + for (size_t i = 0; i < 100; i++) { + X.push_back(create()); + } + + for (size_t i = 0; i < 100 - 1; i++) { + write(X[i], (i + 2) * au_size, 4 * au_size); + dup(X[i], X[i+1], (i + 1) * au_size, 8 * au_size); + } + for (size_t i = 0; i < 100; i++) { + ASSERT_EQ(check(X[i]), true); + } +} + +TEST_F(ExtentMapFixture, pyramid) +{ + constexpr size_t H = 100; + 
std::vector X; + for (size_t i = 0; i < H; i++) { + X.push_back(create()); + } + write(X[0], 0, (H * 2 + 1) * au_size); + + for (size_t i = 0; i < H - 1; i++) { + dup(X[i], X[i + 1], i * au_size, (H * 2 + 1 - i * 2) * au_size); + } + for (size_t i = 0; i < H; i++) { + ASSERT_EQ(check(X[i]), true); + } +} + +TEST_F(ExtentMapFixture, rain) +{ + constexpr size_t H = 100; + constexpr size_t W = 100; + std::vector X; + for (size_t i = 0; i < H; i++) { + X.push_back(create()); + } + for (size_t i = 0; i < H - 1; i++) { + write(X[i], (rand() % (W - 1)) * au_size, au_size); // parenthesized: `rand() % W - 1` wraps around in size_t when rand() % W == 0 + dup(X[i], X[i + 1], 0, W * au_size); + } + for (size_t i = 0; i < H; i++) { + ASSERT_EQ(check(X[i]), true); + } +} + +TEST_F(ExtentMapFixture, pollock) +{ + constexpr size_t H = 100; + constexpr size_t W = 100; + std::vector X; + for (size_t i = 0; i < H; i++) { + X.push_back(create()); + } + for (size_t i = 0; i < H - 1; i++) { + size_t w = rand() % (W / 3) + 1; + size_t l = rand() % (W - w); + write(X[i], l * au_size, w * au_size); + w = rand() % (W / 3) + 1; + l = rand() % (W - w); + dup(X[i], X[i + 1], l * au_size, w * au_size); + } + for (size_t i = 0; i < H; i++) { + ASSERT_EQ(check(X[i]), true); + } +} + +TEST_F(ExtentMapFixture, carousel) +{ + constexpr size_t R = 10; + constexpr size_t CNT = 300; + constexpr size_t W = 100; + std::vector X; + for (size_t i = 0; i < R; i++) { + X.push_back(create()); + } + for (size_t i = 0; i < CNT; i++) { + size_t w = rand() % (W / 3) + 1; + size_t l = rand() % (W - w); + write(X[i % R], l * au_size, w * au_size); + w = rand() % (W / 3) + 1; + l = rand() % (W - w); + dup(X[i % R], X[(i + 1) % R], l * au_size, w * au_size); + } + for (size_t i = 0; i < R; i++) { + ASSERT_EQ(check(X[i]), true); + } +} + +TEST_F(ExtentMapFixture, petri) +{ + constexpr size_t R = 10; + constexpr size_t CNT = 300; + constexpr size_t W = 100; + std::vector X; + for (size_t i = 0; i < R; i++) { + X.push_back(create()); + write(X[i], 0 * au_size, W * au_size); + } + for (size_t i = 0; 
i < CNT; i++) { + size_t from = rand() % R; + size_t to = from; + while (to == from) { + to = rand() % R; + } + size_t w = rand() % (W / 5) + 1; + size_t l = rand() % (W - w); + dup(X[from], X[to], l * au_size, w * au_size); + } + for (size_t i = 0; i < R; i++) { + ASSERT_EQ(check(X[i]), true); + } +} TEST(ExtentMap, dup_extent_map) { @@ -1335,7 +1709,7 @@ TEST(ExtentMap, dup_extent_map) BlueStore::TransContext txc(store.cct, coll.get(), nullptr, nullptr); //em1.dup(&store, &txc, coll, em2, ext1_offs, ext1_len, ext1_offs); - onode1->extent_map.dup(&store, &txc, coll, onode1, onode2, ext1_offs, ext1_len, ext1_offs); + onode1->extent_map.dup_esb(&store, &txc, coll, onode1, onode2, ext1_offs, ext1_len, ext1_offs); em1.dump(formatter.get()); // see the log if any formatter->flush(std::cout); @@ -1371,7 +1745,7 @@ TEST(ExtentMap, dup_extent_map) size_t clone_len = ext1_len - clone_shift; BlueStore::TransContext txc(store.cct, coll.get(), nullptr, nullptr); - onode1->extent_map.dup(&store, &txc, coll, onode1, onode3, clone_offs, clone_len, clone_offs); + onode1->extent_map.dup_esb(&store, &txc, coll, onode1, onode3, clone_offs, clone_len, clone_offs); em1.dump(formatter.get()); // see the log if any formatter->flush(std::cout); std::cout << std::endl; @@ -1414,7 +1788,7 @@ TEST(ExtentMap, dup_extent_map) size_t clone_offs = ext1_offs + clone_shift; BlueStore::TransContext txc(store.cct, coll.get(), nullptr, nullptr); - onode2->extent_map.dup(&store, &txc, coll, onode2, onode4, clone_offs, clone_len, clone_offs); + onode2->extent_map.dup_esb(&store, &txc, coll, onode2, onode4, clone_offs, clone_len, clone_offs); em2.dump(formatter.get()); // see the log if any formatter->flush(std::cout); std::cout << std::endl;