From b185fb2b69f995720d52ada14fc1109672ee04a7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 11 Aug 2021 11:48:45 -0500 Subject: [PATCH] os/bluestore: adjust allocator+freelist interfaces for smr params Instead of shoehorning these fields into alloc_size, adjust the interfaces to explicitly pass in zone_size and first_sequential_zone for Allocator and FreelistManager. Signed-off-by: Sage Weil --- src/os/bluestore/Allocator.cc | 16 +++- src/os/bluestore/Allocator.h | 11 ++- src/os/bluestore/BitmapFreelistManager.cc | 1 + src/os/bluestore/BitmapFreelistManager.h | 1 + src/os/bluestore/BlueFS.cc | 4 +- src/os/bluestore/BlueStore.cc | 77 ++++++++++++++----- src/os/bluestore/BlueStore.h | 5 +- src/os/bluestore/FreelistManager.h | 1 + src/os/bluestore/ZonedAllocator.cc | 13 ++-- src/os/bluestore/ZonedAllocator.h | 4 +- src/os/bluestore/ZonedFreelistManager.cc | 12 ++- src/os/bluestore/ZonedFreelistManager.h | 2 + src/test/objectstore/Allocator_test.cc | 3 +- src/test/objectstore/allocator_replay_test.cc | 2 +- 14 files changed, 107 insertions(+), 45 deletions(-) diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc index 8e4a08b2587ab..731ae5de73c5b 100644 --- a/src/os/bluestore/Allocator.cc +++ b/src/os/bluestore/Allocator.cc @@ -109,7 +109,8 @@ public: Allocator::Allocator(std::string_view name, int64_t _capacity, int64_t _block_size) - : device_size(_capacity), block_size(_block_size) + : device_size(_capacity), + block_size(_block_size) { asok_hook = new SocketHook(this, name); } @@ -124,8 +125,14 @@ const string& Allocator::get_name() const { return asok_hook->name; } -Allocator *Allocator::create(CephContext* cct, std::string_view type, - int64_t size, int64_t block_size, std::string_view name) +Allocator *Allocator::create( + CephContext* cct, + std::string_view type, + int64_t size, + int64_t block_size, + int64_t zone_size, + int64_t first_sequential_zone, + std::string_view name) { Allocator* alloc = nullptr; if (type == "stupid") { @@ 
-142,7 +149,8 @@ Allocator *Allocator::create(CephContext* cct, std::string_view type, name); #ifdef HAVE_LIBZBD } else if (type == "zoned") { - return new ZonedAllocator(cct, size, block_size, name); + return new ZonedAllocator(cct, size, block_size, zone_size, first_sequential_zone, + name); #endif } if (alloc == nullptr) { diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h index 6f6325d57af9f..5503ed213fb58 100644 --- a/src/os/bluestore/Allocator.h +++ b/src/os/bluestore/Allocator.h @@ -66,8 +66,15 @@ public: virtual double get_fragmentation_score(); virtual void shutdown() = 0; - static Allocator *create(CephContext* cct, std::string_view type, int64_t size, - int64_t block_size, const std::string_view name = ""); + static Allocator *create( + CephContext* cct, + std::string_view type, + int64_t size, + int64_t block_size, + int64_t zone_size = 0, + int64_t firs_sequential_zone = 0, + const std::string_view name = "" + ); const std::string& get_name() const; diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc index 2c89712962684..e03a6ecacb748 100644 --- a/src/os/bluestore/BitmapFreelistManager.cc +++ b/src/os/bluestore/BitmapFreelistManager.cc @@ -65,6 +65,7 @@ BitmapFreelistManager::BitmapFreelistManager(CephContext* cct, } int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity, + uint64_t zone_size, uint64_t first_sequential_zone, KeyValueDB::Transaction txn) { bytes_per_block = granularity; diff --git a/src/os/bluestore/BitmapFreelistManager.h b/src/os/bluestore/BitmapFreelistManager.h index 5b04e8fd28cc0..8e4ea8fd385c4 100644 --- a/src/os/bluestore/BitmapFreelistManager.h +++ b/src/os/bluestore/BitmapFreelistManager.h @@ -63,6 +63,7 @@ public: static void setup_merge_operator(KeyValueDB *db, std::string prefix); int create(uint64_t size, uint64_t granularity, + uint64_t zone_size, uint64_t first_sequential_zone, KeyValueDB::Transaction txn) override; int init(KeyValueDB 
*kvdb, bool db_in_read_only, diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index f34996be3e000..5b00c2e9a9121 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -590,7 +590,9 @@ void BlueFS::_init_alloc() << std::dec << dendl; alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, bdev[id]->get_size(), - alloc_size[id], name); + alloc_size[id], + 0, 0, + name); alloc[id]->init_add_free( block_reserved[id], _get_total(id)); diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 030c1f59429f4..dd4d8f42d596e 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -5397,7 +5397,6 @@ int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_resto << dendl; return -EINVAL; } - alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size); } else #endif if (freelist_type == "zoned") { @@ -5406,7 +5405,13 @@ int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_resto return -EINVAL; } - fm->create(bdev->get_size(), alloc_size, t); + fm->create(bdev->get_size(), alloc_size, + zone_size, first_sequential_zone, + t); + + // allocate superblock reserved space. note that we do not mark + // bluefs space as allocated in the freelist; we instead rely on + // bluefs doing that itself. auto reserved = _get_ondisk_reserved(); if (fm_restore) { // we need to allocate the full space in restore case @@ -5545,14 +5550,17 @@ int BlueStore::_create_alloc() << "Please set to 0." 
<< dendl; return -EINVAL; } - - alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size); } #endif - shared_alloc.set(Allocator::create(cct, allocator_type, - bdev->get_size(), - alloc_size, "block")); + shared_alloc.set( + Allocator::create( + cct, allocator_type, + bdev->get_size(), + alloc_size, + zone_size, + first_sequential_zone, + "block")); if (!shared_alloc.a) { lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator" @@ -6700,6 +6708,8 @@ int BlueStore::mkfs() #ifdef HAVE_LIBZBD if (bdev->is_smr()) { freelist_type = "zoned"; + zone_size = bdev->get_zone_size(); + first_sequential_zone = bdev->get_conventional_region_size() / zone_size; } else #endif { @@ -6765,6 +6775,22 @@ int BlueStore::mkfs() bl.append(stringify(OMAP_PER_PG)); t->set(PREFIX_SUPER, "per_pool_omap", bl); } + +#ifdef HAVE_LIBZBD + if (bdev->is_smr()) { + { + bufferlist bl; + encode((uint64_t)zone_size, bl); + t->set(PREFIX_SUPER, "zone_size", bl); + } + { + bufferlist bl; + encode((uint64_t)first_sequential_zone, bl); + t->set(PREFIX_SUPER, "first_sequential_zone", bl); + } + } +#endif + ondisk_format = latest_ondisk_format; _prepare_ondisk_format_super(t); db->submit_transaction_sync(t); @@ -11467,6 +11493,27 @@ int BlueStore::_open_super_meta() << std::dec << dendl; } + // smr fields + { + bufferlist bl; + int r = db->get(PREFIX_SUPER, "zone_size", &bl); + if (r >= 0) { + auto p = bl.cbegin(); + decode(zone_size, p); + dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl; + } + } + { + bufferlist bl; + int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl); + if (r >= 0) { + auto p = bl.cbegin(); + decode(first_sequential_zone, p); + dout(1) << __func__ << " first_sequential_zone 0x" << std::hex + << first_sequential_zone << std::dec << dendl; + } + } + _set_per_pool_omap(); _open_statfs(); @@ -11880,18 +11927,6 @@ std::string BlueStore::_zoned_key(uint64_t offset, const ghobject_t *oid) { return zone_key + object_key; 
} -// For now, to avoid interface changes we piggyback zone_size (in MiB) and the -// first sequential zone number onto min_alloc_size and pass it to functions -// Allocator::create and FreelistManager::create. -uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) { - uint64_t zone_size = bdev->get_zone_size(); - uint64_t zone_size_mb = zone_size / (1024 * 1024); - uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size; - min_alloc_size |= (zone_size_mb << 32); - min_alloc_size |= (first_seq_zone << 48); - return min_alloc_size; -} - #endif void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) @@ -17349,7 +17384,9 @@ int BlueStore::store_allocator(Allocator* src_allocator) Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) { // create allocator uint64_t alloc_size = min_alloc_size; - Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size, "recovery"); + Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size, + zone_size, first_sequential_zone, + "recovery"); if (alloc) { return alloc; } else { diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 4a4171f41b081..c932ca36b528d 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2149,6 +2149,10 @@ private: std::numeric_limits::digits, "not enough bits for min_alloc_size"); + // smr-only + uint64_t zone_size = 0; ///< size of one SMR zone, in bytes + uint64_t first_sequential_zone = 0; ///< first SMR zone that is sequential-only + enum { // Please preserve the order since it's DB persistent OMAP_BULK = 0, @@ -2419,7 +2423,6 @@ private: #ifdef HAVE_LIBZBD // Functions related to zoned storage. 
- uint64_t _zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size); void _zoned_update_cleaning_metadata(TransContext *txc); std::string _zoned_key(uint64_t offset, const ghobject_t *oid); #endif diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h index 4d375b4309456..54d27f10804be 100644 --- a/src/os/bluestore/FreelistManager.h +++ b/src/os/bluestore/FreelistManager.h @@ -26,6 +26,7 @@ public: static void setup_merge_operators(KeyValueDB *db, const std::string &type); virtual int create(uint64_t size, uint64_t granularity, + uint64_t zone_size, uint64_t first_sequential_zone, KeyValueDB::Transaction txn) = 0; virtual int init(KeyValueDB *kvdb, bool db_in_read_only, diff --git a/src/os/bluestore/ZonedAllocator.cc b/src/os/bluestore/ZonedAllocator.cc index 6144779cb68e0..4a7a1f7ef5d4d 100644 --- a/src/os/bluestore/ZonedAllocator.cc +++ b/src/os/bluestore/ZonedAllocator.cc @@ -21,17 +21,16 @@ ZonedAllocator::ZonedAllocator(CephContext* cct, int64_t size, int64_t blk_size, + int64_t _zone_size, + int64_t _first_sequential_zone, std::string_view name) - : Allocator(name, size, blk_size & 0x00000000ffffffff), + : Allocator(name, size, blk_size), cct(cct), num_free(0), size(size), - // To avoid interface changes, we piggyback zone size and the first - // sequential zone number onto the first 32 bits of 64-bit |blk_size|. - // The last 32 bits of |blk_size| is holding the actual block size. 
- block_size((blk_size & 0x00000000ffffffff)), - zone_size(((blk_size & 0x0000ffff00000000) >> 32) * 1024 * 1024), - first_seq_zone_num((blk_size >> 48) & 0xffff), + block_size(blk_size), + zone_size(_zone_size), + first_seq_zone_num(_first_sequential_zone), starting_zone_num(first_seq_zone_num), num_zones(size / zone_size), num_zones_to_clean(0) { diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h index 5deedcae9ab02..585e8943cec67 100644 --- a/src/os/bluestore/ZonedAllocator.h +++ b/src/os/bluestore/ZonedAllocator.h @@ -70,7 +70,9 @@ class ZonedAllocator : public Allocator { public: ZonedAllocator(CephContext* cct, int64_t size, int64_t block_size, - std::string_view name); + int64_t _zone_size, + int64_t _first_sequential_zone, + std::string_view name); ~ZonedAllocator() override; const char *get_type() const override { diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc index 3b31e202fae68..60899939200e7 100644 --- a/src/os/bluestore/ZonedFreelistManager.cc +++ b/src/os/bluestore/ZonedFreelistManager.cc @@ -81,16 +81,14 @@ ZonedFreelistManager::ZonedFreelistManager( int ZonedFreelistManager::create( uint64_t new_size, uint64_t granularity, + uint64_t new_zone_size, + uint64_t first_sequential_zone, KeyValueDB::Transaction txn) { - // To avoid interface changes, we piggyback zone size and the first sequential - // zone number onto the first 32 bits of 64-bit |granularity|. The last 32 - // bits of |granularity| is holding the actual allocation granularity, which - // is bytes_per_block. 
size = new_size; - bytes_per_block = granularity & 0x00000000ffffffff; - zone_size = ((granularity & 0x0000ffff00000000) >> 32) * 1024 * 1024; + bytes_per_block = granularity; + zone_size = new_zone_size; num_zones = size / zone_size; - starting_zone_num = (granularity & 0xffff000000000000) >> 48; + starting_zone_num = first_sequential_zone; enumerate_zone_num = ~0UL; ceph_assert(size % zone_size == 0); diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h index c6f9be3151e9e..7ad0723369857 100644 --- a/src/os/bluestore/ZonedFreelistManager.h +++ b/src/os/bluestore/ZonedFreelistManager.h @@ -63,6 +63,8 @@ public: int create(uint64_t size, uint64_t granularity, + uint64_t zone_size, + uint64_t first_sequential_zone, KeyValueDB::Transaction txn) override; int init(KeyValueDB *kvdb, diff --git a/src/test/objectstore/Allocator_test.cc b/src/test/objectstore/Allocator_test.cc index 40fbed12bd3a0..210bc6d9d7457 100644 --- a/src/test/objectstore/Allocator_test.cc +++ b/src/test/objectstore/Allocator_test.cc @@ -26,7 +26,8 @@ public: void init_alloc(int64_t size, uint64_t min_alloc_size) { std::cout << "Creating alloc type " << string(GetParam()) << " \n"; alloc.reset(Allocator::create(g_ceph_context, GetParam(), size, - min_alloc_size)); + min_alloc_size, + 256*1048576, 100*256*1048576ull)); } void init_close() { diff --git a/src/test/objectstore/allocator_replay_test.cc b/src/test/objectstore/allocator_replay_test.cc index 7927790c485f3..811cc92cdeaf4 100644 --- a/src/test/objectstore/allocator_replay_test.cc +++ b/src/test/objectstore/allocator_replay_test.cc @@ -274,7 +274,7 @@ int replay_free_dump_and_apply(char* fname, unique_ptr alloc; alloc.reset(Allocator::create(g_ceph_context, alloc_type, - capacity, alloc_unit, alloc_name)); + capacity, alloc_unit, 0, 0, alloc_name)); auto it = o->find_first(); while (!it.end()) { -- 2.39.5