From 94d4b65ec12d495f2988312b55289ce86722152f Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Wed, 8 Sep 2021 10:30:24 -0500
Subject: [PATCH] os/bluestore: fsck smr allocations (verify num_dead_bytes,
 alloc past write pointer)

Signed-off-by: Sage Weil
---
 src/os/bluestore/BlueStore.cc     | 204 ++++++++++++++++++++----------
 src/os/bluestore/ZonedAllocator.h |   4 ++++
 2 files changed, 144 insertions(+), 64 deletions(-)

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 3744a3b9a1ea4..169ecc2257235 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -9218,77 +9218,153 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
     }
   }
 
-  dout(1) << __func__ << " checking freelist vs allocated" << dendl;
   // skip freelist vs allocated compare when we have Null fm
   if (!fm->is_null_manager()) {
-    fm->enumerate_reset();
-    uint64_t offset, length;
-    while (fm->enumerate_next(db, &offset, &length)) {
-      bool intersects = false;
-      apply_for_bitset_range(
-        offset, length, alloc_size, used_blocks,
-        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
-          ceph_assert(pos < bs.size());
-          if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
-            if (offset == SUPER_RESERVED &&
-                length == min_alloc_size - SUPER_RESERVED) {
-              // this is due to the change just after luminous to min_alloc_size
-              // granularity allocations, and our baked in assumption at the top
-              // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
-              // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
-              // since we will never allocate this region below min_alloc_size.
-              dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
-                       << " and min_alloc_size, 0x" << std::hex << offset << "~"
-                       << length << std::dec << dendl;
-            } else {
-              intersects = true;
-              if (repair) {
-                repairer.fix_false_free(db, fm,
-                                        pos * min_alloc_size,
-                                        min_alloc_size);
-              }
-            }
-          } else {
-            bs.set(pos);
-          }
-        }
-      );
-      if (intersects) {
-        derr << "fsck error: free extent 0x" << std::hex << offset
-             << "~" << length << std::dec
-             << " intersects allocated blocks" << dendl;
-        ++errors;
-      }
-    }
-    fm->enumerate_reset();
-    size_t count = used_blocks.count();
-    if (used_blocks.size() != count) {
-      ceph_assert(used_blocks.size() > count);
-      used_blocks.flip();
-      size_t start = used_blocks.find_first();
-      while (start != decltype(used_blocks)::npos) {
-        size_t cur = start;
-        while (true) {
-          size_t next = used_blocks.find_next(cur);
-          if (next != cur + 1) {
-            ++errors;
-            derr << "fsck error: leaked extent 0x" << std::hex
-                 << ((uint64_t)start * fm->get_alloc_size()) << "~"
-                 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
-                 << dendl;
-            if (repair) {
-              repairer.fix_leaked(db,
-                                  fm,
-                                  start * min_alloc_size,
-                                  (cur + 1 - start) * min_alloc_size);
-            }
-            start = next;
-            break;
-          }
-          cur = next;
-        }
-      }
-      used_blocks.flip();
-    }
+    dout(1) << __func__ << " checking freelist vs allocated" << dendl;
+#ifdef HAVE_LIBZBD
+    if (freelist_type == "zoned") {
+      // verify per-zone state
+      //  - verify no allocations beyond write pointer
+      //  - verify num_dead_bytes count (neither allocated nor
+      //    free space past the write pointer)
+      auto a = dynamic_cast<ZonedAllocator*>(shared_alloc.a);
+      auto num_zones = bdev->get_size() / zone_size;
+
+      // mark the free space past the write pointer
+      for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
+        auto wp = a->get_write_pointer(zone);
+        uint64_t offset = zone_size * zone + wp;
+        uint64_t length = zone_size - wp;
+        if (!length) {
+          continue;
+        }
+        bool intersects = false;
+        dout(10) << " marking zone 0x" << std::hex << zone
+                 << " region after wp 0x" << offset << "~" << length
+                 << std::dec << dendl;
+        apply_for_bitset_range(
+          offset, length, alloc_size, used_blocks,
+          [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+            if (bs.test(pos)) {
+              derr << "fsck error: zone 0x" << std::hex << zone
+                   << " has used space at 0x" << pos * alloc_size
+                   << " beyond write pointer 0x" << wp
+                   << std::dec << dendl;
+              intersects = true;
+            } else {
+              bs.set(pos);
+            }
+          }
+        );
+        if (intersects) {
+          ++errors;
+        }
+      }
+
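+      // at this point every allocated block and every block past a write
+      // pointer is marked in used_blocks; flipping it leaves a set bit for
+      // the blocks that are neither, which in the sequential zones are
+      // exactly the dead blocks that num_dead_bytes is meant to count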
+      used_blocks.flip();
+
+      // skip conventional zones
+      uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
+      pos = used_blocks.find_next(pos);
+
+      uint64_t zone_dead = 0;
+      for (uint32_t zone = first_sequential_zone;
+           zone < num_zones;
+           ++zone, zone_dead = 0) {
+        while (pos != decltype(used_blocks)::npos &&
+               (pos * min_alloc_size) / zone_size == zone) {
+          dout(40) << " zone 0x" << std::hex << zone
+                   << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
+                   << std::dec << dendl;
+          zone_dead += min_alloc_size;
+          pos = used_blocks.find_next(pos);
+        }
+        dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
+                 << std::dec << dendl;
+        // cross-check dead bytes against zone state
+        if (a->get_dead_bytes(zone) != zone_dead) {
+          derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
+               << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
+               << dendl;
+          ++errors;
+          // TODO: repair
+        }
+      }
+      used_blocks.flip();
+    } else
+#endif
+    {
+      fm->enumerate_reset();
+      uint64_t offset, length;
+      while (fm->enumerate_next(db, &offset, &length)) {
+        bool intersects = false;
+        apply_for_bitset_range(
+          offset, length, alloc_size, used_blocks,
+          [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+            ceph_assert(pos < bs.size());
+            if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
+              if (offset == SUPER_RESERVED &&
+                  length == min_alloc_size - SUPER_RESERVED) {
+                // this is due to the change just after luminous to min_alloc_size
+                // granularity allocations, and our baked in assumption at the top
+                // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
+                // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
+                // since we will never allocate this region below min_alloc_size.
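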
+                dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+                         << " and min_alloc_size, 0x" << std::hex << offset << "~"
+                         << length << std::dec << dendl;
+              } else {
+                intersects = true;
+                if (repair) {
+                  repairer.fix_false_free(db, fm,
+                                          pos * min_alloc_size,
+                                          min_alloc_size);
+                }
+              }
+            } else {
+              bs.set(pos);
+            }
+          }
+        );
+        if (intersects) {
+          derr << "fsck error: free extent 0x" << std::hex << offset
+               << "~" << length << std::dec
+               << " intersects allocated blocks" << dendl;
+          ++errors;
+        }
+      }
+      fm->enumerate_reset();
+
+      // check for leaked extents
+      size_t count = used_blocks.count();
+      if (used_blocks.size() != count) {
+        ceph_assert(used_blocks.size() > count);
+        used_blocks.flip();
+        size_t start = used_blocks.find_first();
+        while (start != decltype(used_blocks)::npos) {
+          size_t cur = start;
+          while (true) {
+            size_t next = used_blocks.find_next(cur);
+            if (next != cur + 1) {
+              ++errors;
+              derr << "fsck error: leaked extent 0x" << std::hex
+                   << ((uint64_t)start * fm->get_alloc_size()) << "~"
+                   << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
+                   << dendl;
+              if (repair) {
+                repairer.fix_leaked(db,
+                                    fm,
+                                    start * min_alloc_size,
+                                    (cur + 1 - start) * min_alloc_size);
+              }
+              start = next;
+              break;
+            }
+            cur = next;
+          }
+        }
+        used_blocks.flip();
+      }
+    }
   }
 }
diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h
index 903b62fd1a09a..872ee29325aa8 100644
--- a/src/os/bluestore/ZonedAllocator.h
+++ b/src/os/bluestore/ZonedAllocator.h
@@ -80,6 +80,10 @@ public:
     return "zoned";
   }
 
+  // bytes in this zone that were written but are no longer live; exposed
+  // so fsck can cross-check the allocator's accounting against the
+  // used-block bitmap
+  uint64_t get_dead_bytes(uint32_t zone) {
+    return zone_states[zone].num_dead_bytes;
+  }
+
   int64_t allocate(
     uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
     int64_t hint, PExtentVector *extents) override;
-- 
2.39.5
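
The invariant this fsck pass enforces can be stated compactly: within a
sequential zone, every block below the write pointer is either live
(allocated) or dead, and nothing may be allocated at or beyond the write
pointer, so num_dead_bytes must equal the write pointer minus the live
bytes. A minimal standalone sketch of that rule follows; ZoneState,
check_zone, and the block-granularity "allocated" bitmap are illustrative
stand-ins, not BlueStore's actual types:

  #include <cstdint>
  #include <iostream>
  #include <vector>

  struct ZoneState {
    uint64_t write_pointer = 0;   // bytes written so far in this zone
    uint64_t num_dead_bytes = 0;  // bytes below wp that are no longer live
  };

  // Returns true if the zone's accounting is consistent: no allocation at
  // or past the write pointer, and dead bytes == write_pointer - live bytes.
  bool check_zone(const ZoneState& z,
                  const std::vector<bool>& allocated,  // one bit per block
                  uint64_t block_size) {
    uint64_t live = 0;
    for (uint64_t b = 0; b < allocated.size(); ++b) {
      if (!allocated[b]) {
        continue;
      }
      if (b * block_size >= z.write_pointer) {
        return false;  // fsck error: used space beyond the write pointer
      }
      live += block_size;
    }
    // everything below wp that is not live must be dead
    return z.num_dead_bytes == z.write_pointer - live;
  }

  int main() {
    // zone with 8 blocks of 64K written, blocks 2 and 5 freed since
    ZoneState z{8 * 65536, 2 * 65536};
    std::vector<bool> allocated{1, 1, 0, 1, 1, 0, 1, 1};
    std::cout << (check_zone(z, allocated, 65536) ? "ok" : "mismatch")
              << std::endl;
  }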