From: Pere Diaz Bou Date: Wed, 3 Jan 2024 15:32:06 +0000 (+0100) Subject: os/bluestore: remove zoned namespace support X-Git-Tag: v19.3.0~73^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=169bd8553ed5c0985e46c5da366a07965bc362e6;p=ceph.git os/bluestore: remove zoned namespace support Lately we've been adding a lot of commits that could've interfered with smr support but since no one is actively reviewing/supporting smr in bluestore, it doesn't make sense for us to maintain it. Signed-off-by: Pere Diaz Bou --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 286a1126a467..f4a92295e48a 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -4897,7 +4897,6 @@ options: - stupid - avl - hybrid - - zoned with_legacy: true - name: bluestore_freelist_blocks_per_key type: size @@ -6326,7 +6325,6 @@ options: - aio - spdk - pmem - - hm_smr - name: bluestore_cleaner_sleep_interval type: float level: advanced diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt index 55415fb37228..1cd85d3b213c 100644 --- a/src/os/CMakeLists.txt +++ b/src/os/CMakeLists.txt @@ -27,12 +27,6 @@ if(WITH_BLUESTORE) ) endif(WITH_BLUESTORE) -if(WITH_ZBD) - list(APPEND libos_srcs - bluestore/ZonedFreelistManager.cc - bluestore/ZonedAllocator.cc) -endif() - if(WITH_FUSE) list(APPEND libos_srcs FuseStore.cc) diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc index 5c5b8db70ecc..7029420b53d0 100644 --- a/src/os/bluestore/Allocator.cc +++ b/src/os/bluestore/Allocator.cc @@ -173,8 +173,6 @@ Allocator *Allocator::create( std::string_view type, int64_t size, int64_t block_size, - int64_t zone_size, - int64_t first_sequential_zone, std::string_view name) { Allocator* alloc = nullptr; @@ -190,11 +188,6 @@ Allocator *Allocator::create( return new HybridAllocator(cct, size, block_size, cct->_conf.get_val("bluestore_hybrid_alloc_mem_cap"), name); -#ifdef HAVE_LIBZBD - } else 
if (type == "zoned") { - return new ZonedAllocator(cct, size, block_size, zone_size, first_sequential_zone, - name); -#endif } if (alloc == nullptr) { lderr(cct) << "Allocator::" << __func__ << " unknown alloc type " diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h index f136c98b2926..f5a128fda876 100644 --- a/src/os/bluestore/Allocator.h +++ b/src/os/bluestore/Allocator.h @@ -72,8 +72,6 @@ public: std::string_view type, int64_t size, int64_t block_size, - int64_t zone_size = 0, - int64_t firs_sequential_zone = 0, const std::string_view name = "" ); diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc index bec6ace868b1..f1f4831d5671 100644 --- a/src/os/bluestore/BitmapFreelistManager.cc +++ b/src/os/bluestore/BitmapFreelistManager.cc @@ -67,7 +67,6 @@ BitmapFreelistManager::BitmapFreelistManager(CephContext* cct, } int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity, - uint64_t zone_size, uint64_t first_sequential_zone, KeyValueDB::Transaction txn) { bytes_per_block = granularity; diff --git a/src/os/bluestore/BitmapFreelistManager.h b/src/os/bluestore/BitmapFreelistManager.h index 8e4ea8fd385c..5b04e8fd28cc 100644 --- a/src/os/bluestore/BitmapFreelistManager.h +++ b/src/os/bluestore/BitmapFreelistManager.h @@ -63,7 +63,6 @@ public: static void setup_merge_operator(KeyValueDB *db, std::string prefix); int create(uint64_t size, uint64_t granularity, - uint64_t zone_size, uint64_t first_sequential_zone, KeyValueDB::Transaction txn) override; int init(KeyValueDB *kvdb, bool db_in_read_only, diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 54d0bbc38e57..04b680d950d4 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -749,7 +749,6 @@ void BlueFS::_init_alloc() alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, bdev[id]->get_size(), alloc_size[id], - 0, 0, name); alloc[id]->init_add_free( block_reserved[id], diff 
--git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 4f449c3d1c08..0c0cb5c00cf5 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -53,11 +53,6 @@ #include "common/WorkQueue.h" #include "kv/KeyValueHistogram.h" -#ifdef HAVE_LIBZBD -#include "ZonedAllocator.h" -#include "ZonedFreelistManager.h" -#endif - #if defined(WITH_LTTNG) #define TRACEPOINT_DEFINE #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE @@ -134,12 +129,6 @@ const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist) const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager) const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t -#ifdef HAVE_LIBZBD -const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager) -const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager) -const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata) -#endif - const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs"; #define OBJECT_MAX_SIZE 0xffffffff // 32 bits @@ -569,37 +558,6 @@ static int get_key_pool_stat(const string& key, uint64_t* pool_id) return 0; } -#ifdef HAVE_LIBZBD -static void get_zone_offset_object_key( - uint32_t zone, - uint64_t offset, - ghobject_t oid, - std::string *key) -{ - key->clear(); - _key_encode_u32(zone, key); - _key_encode_u64(offset, key); - _get_object_key(oid, key); -} - -static int get_key_zone_offset_object( - const string& key, - uint32_t *zone, - uint64_t *offset, - ghobject_t *oid) -{ - const char *p = key.c_str(); - if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1) - return -1; - p = _key_decode_u32(p, zone); - p = _key_decode_u64(p, offset); - int r = _get_key_object(p, oid); - if (r < 0) { - return r; - } - return 0; -} -#endif template void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em) @@ -5690,9 +5648,6 @@ BlueStore::BlueStore(CephContext *cct, finisher(cct, "commit_finisher", "cfin"), 
kv_sync_thread(this), kv_finalize_thread(this), -#ifdef HAVE_LIBZBD - zoned_cleaner_thread(this), -#endif min_alloc_size(_min_alloc_size), min_alloc_size_order(std::countr_zero(_min_alloc_size)), mempool_thread(this) @@ -6628,12 +6583,6 @@ void BlueStore::_set_alloc_sizes(void) { max_alloc_size = cct->_conf->bluestore_max_alloc_size; -#ifdef HAVE_LIBZBD - ceph_assert(bdev); - if (bdev->is_smr()) { - prefer_deferred_size = 0; - } else -#endif if (cct->_conf->bluestore_prefer_deferred_size) { prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size; } else { @@ -6741,8 +6690,7 @@ int BlueStore::_open_fm(KeyValueDB::Transaction t, bool can_have_null_fm = !is_db_rotational() && !read_only && db_avail && - cct->_conf->bluestore_allocation_from_file && - !bdev->is_smr(); + cct->_conf->bluestore_allocation_from_file; // When allocation-info is stored in a single file we set freelist_type to "null" if (can_have_null_fm) { @@ -6764,20 +6712,13 @@ int BlueStore::_open_fm(KeyValueDB::Transaction t, ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size); uint64_t alloc_size = min_alloc_size; - if (bdev->is_smr() && freelist_type != "zoned") { - derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)" - << dendl; - return -EINVAL; - } if (!bdev->is_smr() && freelist_type == "zoned") { derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned" << dendl; return -EINVAL; } - fm->create(bdev->get_size(), alloc_size, - zone_size, first_sequential_zone, - t); + fm->create(bdev->get_size(), alloc_size, t); // allocate superblock reserved space. 
note that we do not mark // bluefs space as allocated in the freelist; we instead rely on @@ -6902,18 +6843,10 @@ int BlueStore::_create_alloc() std::string allocator_type = cct->_conf->bluestore_allocator; -#ifdef HAVE_LIBZBD - if (freelist_type == "zoned") { - allocator_type = "zoned"; - } -#endif - alloc = Allocator::create( cct, allocator_type, bdev->get_size(), alloc_size, - zone_size, - first_sequential_zone, "block"); if (!alloc) { lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator" @@ -6921,27 +6854,8 @@ int BlueStore::_create_alloc() return -EINVAL; } -#ifdef HAVE_LIBZBD - if (freelist_type == "zoned") { - Allocator *a = Allocator::create( - cct, cct->_conf->bluestore_allocator, - bdev->get_conventional_region_size(), - alloc_size, - zone_size, 0, - "zoned_block"); - if (!a) { - lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator - << " allocator" << dendl; - delete alloc; - return -EINVAL; - } - shared_alloc.set(a, alloc_size); - } else -#endif - { - // BlueFS will share the same allocator - shared_alloc.set(alloc, alloc_size); - } + // BlueFS will share the same allocator + shared_alloc.set(alloc, alloc_size); return 0; } @@ -6954,66 +6868,6 @@ int BlueStore::_init_alloc(std::map *zone_adjustments) } ceph_assert(alloc != NULL); -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - auto a = dynamic_cast(alloc); - ceph_assert(a); - auto f = dynamic_cast(fm); - ceph_assert(f); - vector wp = bdev->get_zones(); - vector zones = f->get_zone_states(db); - ceph_assert(wp.size() == zones.size()); - - // reconcile zone state - auto num_zones = bdev->get_size() / zone_size; - for (unsigned i = first_sequential_zone; i < num_zones; ++i) { - ceph_assert(wp[i] >= i * zone_size); - ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone - uint64_t p = wp[i] - i * zone_size; - if (zones[i].write_pointer > p) { - derr << __func__ << " zone 0x" << std::hex << i - << " bluestore write pointer 0x" 
<< zones[i].write_pointer - << " > device write pointer 0x" << p - << std::dec << " -- VERY SUSPICIOUS!" << dendl; - } else if (zones[i].write_pointer < p) { - // this is "normal" in that it can happen after any crash (if we have a - // write in flight but did not manage to commit the transaction) - auto delta = p - zones[i].write_pointer; - dout(1) << __func__ << " zone 0x" << std::hex << i - << " device write pointer 0x" << p - << " > bluestore pointer 0x" << zones[i].write_pointer - << ", advancing 0x" << delta << std::dec << dendl; - (*zone_adjustments)[zones[i].write_pointer] = delta; - zones[i].num_dead_bytes += delta; - zones[i].write_pointer = p; - } - } - - // start with conventional zone "free" (bluefs may adjust this when it starts up) - auto reserved = _get_ondisk_reserved(); - // for now we require a conventional zone - ceph_assert(bdev->get_conventional_region_size()); - ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region - shared_alloc.a->init_add_free( - reserved, - p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved); - - // init sequential zone based on the device's write pointers - a->init_from_zone_pointers(std::move(zones)); - dout(1) << __func__ - << " loaded zone pointers: " - << std::hex - << ", allocator type " << alloc->get_type() - << ", capacity 0x" << alloc->get_capacity() - << ", block size 0x" << alloc->get_block_size() - << ", free 0x" << alloc->get_free() - << ", fragmentation " << alloc->get_fragmentation() - << std::dec << dendl; - - return 0; - } -#endif - uint64_t num = 0, bytes = 0; utime_t start_time = ceph_clock_now(); if (!fm->is_null_manager()) { @@ -7070,24 +6924,6 @@ int BlueStore::_init_alloc(std::map *zone_adjustments) void BlueStore::_post_init_alloc(const std::map& zone_adjustments) { int r = 0; -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - if (zone_adjustments.empty()) { - return; - } - dout(1) << __func__ << " adjusting freelist based on device write 
pointers" << dendl; - auto f = dynamic_cast(fm); - ceph_assert(f); - KeyValueDB::Transaction t = db->get_transaction(); - for (auto& i : zone_adjustments) { - // allocate AND release since this gap is now dead space - // note that the offset is imprecise, but only need to select the zone - f->allocate(i.first, i.second, t); - f->release(i.first, i.second, t); - } - r = db->submit_transaction_sync(t); - } else -#endif if (fm->is_null_manager()) { // Now that we load the allocation map we need to invalidate the file as new allocation won't be reflected // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount() @@ -7584,11 +7420,7 @@ int BlueStore::_open_db_and_around(bool read_only, bool to_repair) } // when function is called in repair mode (to_repair=true) we skip db->open()/create() - if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file -#ifdef HAVE_LIBZBD - && !bdev->is_smr() -#endif - ) { + if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file) { dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl; commit_to_null_manager(); need_to_destage_allocation_file = true; @@ -8271,18 +8103,7 @@ int BlueStore::mkfs() if (r < 0) goto out_close_fsid; - // choose freelist manager -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - freelist_type = "zoned"; - zone_size = bdev->get_zone_size(); - first_sequential_zone = bdev->get_conventional_region_size() / zone_size; - bdev->reset_all_zones(); - } else -#endif - { - freelist_type = "bitmap"; - } + freelist_type = "bitmap"; dout(10) << " freelist_type " << freelist_type << dendl; // choose min_alloc_size @@ -8333,13 +8154,6 @@ int BlueStore::mkfs() reserved = _get_ondisk_reserved(); alloc->init_add_free(reserved, p2align(bdev->get_size(), min_alloc_size) - reserved); -#ifdef HAVE_LIBZBD - if (bdev->is_smr() && alloc != shared_alloc.a) { - shared_alloc.a->init_add_free(reserved, - 
p2align(bdev->get_conventional_region_size(), - min_alloc_size) - reserved); - } -#endif r = _open_db(true); if (r < 0) @@ -8372,21 +8186,6 @@ int BlueStore::mkfs() t->set(PREFIX_SUPER, "per_pool_omap", bl); } -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - { - bufferlist bl; - encode((uint64_t)zone_size, bl); - t->set(PREFIX_SUPER, "zone_size", bl); - } - { - bufferlist bl; - encode((uint64_t)first_sequential_zone, bl); - t->set(PREFIX_SUPER, "first_sequential_zone", bl); - } - } -#endif - ondisk_format = latest_ondisk_format; _prepare_ondisk_format_super(t); db->submit_transaction_sync(t); @@ -8900,12 +8699,6 @@ int BlueStore::_mount() return r; } -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - _zoned_cleaner_start(); - } -#endif - mempool_thread.init(); if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) && @@ -8937,12 +8730,6 @@ int BlueStore::umount() if (!_kv_only) { mempool_thread.shutdown(); -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - dout(20) << __func__ << " stopping zone cleaner thread" << dendl; - _zoned_cleaner_stop(); - } -#endif dout(20) << __func__ << " stopping kv thread" << dendl; _kv_stop(); // skip cache cleanup step on fast shutdown @@ -9339,7 +9126,6 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow( &ctx.expected_pool_statfs[pool_id] : &ctx.expected_store_statfs; - map zone_first_offsets; // for zoned/smr devices dout(10) << __func__ << " " << oid << dendl; OnodeRef o; @@ -9396,22 +9182,6 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow( ceph_assert(l.blob); const bluestore_blob_t& blob = l.blob->get_blob(); -#ifdef HAVE_LIBZBD - if (bdev->is_smr() && depth != FSCK_SHALLOW) { - for (auto& e : blob.get_extents()) { - if (e.is_valid()) { - uint32_t zone = e.offset / zone_size; - uint64_t offset = e.offset % zone_size; - auto p = zone_first_offsets.find(zone); - if (p == zone_first_offsets.end() || p->second > offset) { - // FIXME: use interator for guided insert? 
- zone_first_offsets[zone] = offset; - } - } - } - } -#endif - auto& ref = ref_map[l.blob]; if (ref.is_empty()) { uint32_t min_release_size = blob.get_release_size(min_alloc_size); @@ -9545,33 +9315,6 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow( } } -#ifdef HAVE_LIBZBD - if (bdev->is_smr() && depth != FSCK_SHALLOW) { - for (auto& [zone, first_offset] : zone_first_offsets) { - auto p = (*ctx.zone_refs)[zone].find(oid); - if (p != (*ctx.zone_refs)[zone].end()) { - if (first_offset < p->second) { - dout(20) << " slightly wonky zone ref 0x" << std::hex << zone - << " offset 0x" << p->second - << " but first offset is 0x" << first_offset - << "; this can happen due to clone_range" - << dendl; - } else { - dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second - << " <= first offset 0x" << first_offset - << std::dec << dendl; - } - (*ctx.zone_refs)[zone].erase(p); - } else { - derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone - << " but there is no zone ref" << std::dec << dendl; - // FIXME: add repair - ++errors; - } - } - } -#endif - if (broken) { derr << "fsck error: " << oid << " - " << broken << " zombie spanning blob(s) found, the first one: " @@ -10449,69 +10192,6 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) goto out_scan; } -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - auto a = dynamic_cast(alloc); - ceph_assert(a); - auto f = dynamic_cast(fm); - ceph_assert(f); - vector wp = bdev->get_zones(); - vector zones = f->get_zone_states(db); - ceph_assert(wp.size() == zones.size()); - auto num_zones = bdev->get_size() / zone_size; - for (unsigned i = first_sequential_zone; i < num_zones; ++i) { - uint64_t p = wp[i] == (i + 1) * zone_size ? 
zone_size : wp[i] % zone_size; - if (zones[i].write_pointer > p && - zones[i].num_dead_bytes < zones[i].write_pointer) { - derr << "fsck error: zone 0x" << std::hex << i - << " bluestore write pointer 0x" << zones[i].write_pointer - << " > device write pointer 0x" << p - << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)" - << std::dec << dendl; - ++errors; - } - } - - if (depth != FSCK_SHALLOW) { - // load zone refs - zone_refs.resize(bdev->get_size() / zone_size); - it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE); - if (it) { - for (it->lower_bound(string()); - it->valid(); - it->next()) { - uint32_t zone = 0; - uint64_t offset = 0; - ghobject_t oid; - string key = it->key(); - int r = get_key_zone_offset_object(key, &zone, &offset, &oid); - if (r < 0) { - derr << "fsck error: invalid zone ref key " << pretty_binary_string(key) - << dendl; - if (repair) { - repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key); - } - ++errors; - continue; - } - dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset - << " -> " << std::dec << oid << dendl; - if (zone_refs[zone].count(oid)) { - derr << "fsck error: second zone ref in zone 0x" << std::hex << zone - << " offset 0x" << offset << std::dec << " for " << oid << dendl; - if (repair) { - repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key); - } - ++errors; - continue; - } - zone_refs[zone][oid] = offset; - } - } - } - } -#endif - dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl; it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE); if (it) { @@ -10576,20 +10256,6 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) _fsck_check_objects(depth, ctx); } -#ifdef HAVE_LIBZBD - if (bdev->is_smr() && depth != FSCK_SHALLOW) { - dout(1) << __func__ << " checking for leaked zone refs" << dendl; - for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) { - for (auto& [oid, offset] : zone_refs[zone]) { - derr << "fsck error: 
stray zone ref 0x" << std::hex << zone - << " offset 0x" << offset << " -> " << std::dec << oid << dendl; - // FIXME: add repair - ++errors; - } - } - } -#endif - sb_ref_mismatches = sb_ref_counts.count_non_zero(); if (sb_ref_mismatches != 0) { derr << "fsck error:" << "*" << sb_ref_mismatches @@ -11035,150 +10701,76 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) // skip freelist vs allocated compare when we have Null fm if (!fm->is_null_manager()) { dout(1) << __func__ << " checking freelist vs allocated" << dendl; -#ifdef HAVE_LIBZBD - if (freelist_type == "zoned") { - // verify per-zone state - // - verify no allocations beyond write pointer - // - verify num_dead_bytes count (neither allocated nor - // free space past the write pointer) - auto a = dynamic_cast(alloc); - auto num_zones = bdev->get_size() / zone_size; - - // mark the free space past the write pointer - for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) { - auto wp = a->get_write_pointer(zone); - uint64_t offset = zone_size * zone + wp; - uint64_t length = zone_size - wp; - if (!length) { - continue; - } - bool intersects = false; - dout(10) << " marking zone 0x" << std::hex << zone - << " region after wp 0x" << offset << "~" << length - << std::dec << dendl; - apply_for_bitset_range( - offset, length, alloc_size, used_blocks, - [&](uint64_t pos, mempool_dynamic_bitset &bs) { - if (bs.test(pos)) { - derr << "fsck error: zone 0x" << std::hex << zone - << " has used space at 0x" << pos * alloc_size - << " beyond write pointer 0x" << wp - << std::dec << dendl; - intersects = true; - } else { - bs.set(pos); - } - } - ); - if (intersects) { - ++errors; - } - } - - used_blocks.flip(); - - // skip conventional zones - uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1; - pos = used_blocks.find_next(pos); - - uint64_t zone_dead = 0; - for (uint32_t zone = first_sequential_zone; - zone < num_zones; - ++zone, zone_dead = 0) { - while (pos != 
decltype(used_blocks)::npos && - (pos * min_alloc_size) / zone_size == zone) { - dout(40) << " zone 0x" << std::hex << zone - << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size - << std::dec << dendl; - zone_dead += min_alloc_size; - pos = used_blocks.find_next(pos); - } - dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead - << std::dec << dendl; - // cross-check dead bytes against zone state - if (a->get_dead_bytes(zone) != zone_dead) { - derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead - << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone) - << dendl; - ++errors; - // TODO: repair - } - } - used_blocks.flip(); - } else -#endif - { - fm->enumerate_reset(); - uint64_t offset, length; - while (fm->enumerate_next(db, &offset, &length)) { - bool intersects = false; - apply_for_bitset_range( - offset, length, alloc_size, used_blocks, - [&](uint64_t pos, mempool_dynamic_bitset &bs) { - ceph_assert(pos < bs.size()); - if (bs.test(pos) && !bluefs_used_blocks.test(pos)) { - if (offset == DB_SUPER_RESERVED && - length == min_alloc_size - DB_SUPER_RESERVED) { - // this is due to the change just after luminous to min_alloc_size - // granularity allocations, and our baked in assumption at the top - // of _fsck that 0~round_up_to(DB_SUPER_RESERVED,min_alloc_size) is used - // (vs luminous's round_up_to(DB_SUPER_RESERVED,block_size)). harmless, - // since we will never allocate this region below min_alloc_size. 
- dout(10) << __func__ << " ignoring free extent between DB_SUPER_RESERVED" - << " and min_alloc_size, 0x" << std::hex << offset << "~" - << length << std::dec << dendl; - } else { - intersects = true; - if (repair) { - repairer.fix_false_free(db, fm, - pos * min_alloc_size, - min_alloc_size); - } - } - } else { - bs.set(pos); - } - } - ); - if (intersects) { - derr << "fsck error: free extent 0x" << std::hex << offset - << "~" << length << std::dec - << " intersects allocated blocks" << dendl; - ++errors; - } - } - fm->enumerate_reset(); - - // check for leaked extents - size_t count = used_blocks.count(); - if (used_blocks.size() != count) { - ceph_assert(used_blocks.size() > count); - used_blocks.flip(); - size_t start = used_blocks.find_first(); - while (start != decltype(used_blocks)::npos) { - size_t cur = start; - while (true) { - size_t next = used_blocks.find_next(cur); - if (next != cur + 1) { - ++errors; - derr << "fsck error: leaked extent 0x" << std::hex - << ((uint64_t)start * fm->get_alloc_size()) << "~" - << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec - << dendl; - if (repair) { - repairer.fix_leaked(db, - fm, - start * min_alloc_size, - (cur + 1 - start) * min_alloc_size); - } - start = next; - break; - } - cur = next; - } - } - used_blocks.flip(); - } + fm->enumerate_reset(); + uint64_t offset, length; + while (fm->enumerate_next(db, &offset, &length)) { + bool intersects = false; + apply_for_bitset_range( + offset, length, alloc_size, used_blocks, + [&](uint64_t pos, mempool_dynamic_bitset &bs) { + ceph_assert(pos < bs.size()); + if (bs.test(pos) && !bluefs_used_blocks.test(pos)) { + if (offset == DB_SUPER_RESERVED && + length == min_alloc_size - DB_SUPER_RESERVED) { + // this is due to the change just after luminous to min_alloc_size + // granularity allocations, and our baked in assumption at the top + // of _fsck that 0~round_up_to(DB_SUPER_RESERVED,min_alloc_size) is used + // (vs luminous's 
round_up_to(DB_SUPER_RESERVED,block_size)). harmless, + // since we will never allocate this region below min_alloc_size. + dout(10) << __func__ << " ignoring free extent between DB_SUPER_RESERVED" + << " and min_alloc_size, 0x" << std::hex << offset << "~" + << length << std::dec << dendl; + } else { + intersects = true; + if (repair) { + repairer.fix_false_free(db, fm, + pos * min_alloc_size, + min_alloc_size); + } + } + } else { + bs.set(pos); + } + } + ); + if (intersects) { + derr << "fsck error: free extent 0x" << std::hex << offset + << "~" << length << std::dec + << " intersects allocated blocks" << dendl; + ++errors; + } + } + fm->enumerate_reset(); + + // check for leaked extents + size_t count = used_blocks.count(); + if (used_blocks.size() != count) { + ceph_assert(used_blocks.size() > count); + used_blocks.flip(); + size_t start = used_blocks.find_first(); + while (start != decltype(used_blocks)::npos) { + size_t cur = start; + while (true) { + size_t next = used_blocks.find_next(cur); + if (next != cur + 1) { + ++errors; + derr << "fsck error: leaked extent 0x" << std::hex + << ((uint64_t)start * fm->get_alloc_size()) << "~" + << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec + << dendl; + if (repair) { + repairer.fix_leaked(db, + fm, + start * min_alloc_size, + (cur + 1 - start) * min_alloc_size); + } + start = next; + break; + } + cur = next; + } + } + used_blocks.flip(); } } } @@ -13545,33 +13137,6 @@ int BlueStore::_open_super_meta() logger->set(l_bluestore_alloc_unit, min_alloc_size); } - // smr fields - { - bufferlist bl; - int r = db->get(PREFIX_SUPER, "zone_size", &bl); - if (r >= 0) { - auto p = bl.cbegin(); - decode(zone_size, p); - dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl; - ceph_assert(bdev->is_smr()); - } else { - ceph_assert(!bdev->is_smr()); - } - } - { - bufferlist bl; - int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl); - if (r >= 0) { - auto p = bl.cbegin(); - 
decode(first_sequential_zone, p); - dout(1) << __func__ << " first_sequential_zone 0x" << std::hex - << first_sequential_zone << std::dec << dendl; - ceph_assert(bdev->is_smr()); - } else { - ceph_assert(!bdev->is_smr()); - } - } - _set_per_pool_omap(); _open_statfs(); @@ -14003,29 +13568,6 @@ void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) } } -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - for (auto& i : txc->old_zone_offset_refs) { - dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second - << " offset 0x" << i.second << std::dec - << " -> " << i.first.first->oid << dendl; - string key; - get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key); - txc->t->rmkey(PREFIX_ZONED_CL_INFO, key); - } - for (auto& i : txc->new_zone_offset_refs) { - // (zone, offset) -> oid - dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second - << " offset 0x" << i.second << std::dec - << " -> " << i.first.first->oid << dendl; - string key; - get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key); - bufferlist v; - txc->t->set(PREFIX_ZONED_CL_INFO, key, v); - } - } -#endif - _txc_update_store_statfs(txc); } @@ -14747,210 +14289,6 @@ void BlueStore::_kv_finalize_thread() kv_finalize_started = false; } -#ifdef HAVE_LIBZBD -void BlueStore::_zoned_cleaner_start() -{ - dout(10) << __func__ << dendl; - zoned_cleaner_thread.create("bstore_zcleaner"); -} - -void BlueStore::_zoned_cleaner_stop() -{ - dout(10) << __func__ << dendl; - { - std::unique_lock l{zoned_cleaner_lock}; - while (!zoned_cleaner_started) { - zoned_cleaner_cond.wait(l); - } - zoned_cleaner_stop = true; - zoned_cleaner_cond.notify_all(); - } - zoned_cleaner_thread.join(); - { - std::lock_guard l{zoned_cleaner_lock}; - zoned_cleaner_stop = false; - } - dout(10) << __func__ << " done" << dendl; -} - -void BlueStore::_zoned_cleaner_thread() -{ - dout(10) << __func__ << " start" << dendl; - std::unique_lock 
l{zoned_cleaner_lock}; - ceph_assert(!zoned_cleaner_started); - zoned_cleaner_started = true; - zoned_cleaner_cond.notify_all(); - auto a = dynamic_cast(alloc); - ceph_assert(a); - auto f = dynamic_cast(fm); - ceph_assert(f); - while (true) { - // thresholds to trigger cleaning - // FIXME - float min_score = .05; // score: bytes saved / bytes moved - uint64_t min_saved = zone_size / 32; // min bytes saved to consider cleaning - auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved); - if (zone_to_clean < 0) { - if (zoned_cleaner_stop) { - break; - } - auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval); - dout(20) << __func__ << " sleep for " << period << dendl; - zoned_cleaner_cond.wait_for(l, period); - dout(20) << __func__ << " wake" << dendl; - } else { - l.unlock(); - a->set_cleaning_zone(zone_to_clean); - _zoned_clean_zone(zone_to_clean, a, f); - a->clear_cleaning_zone(zone_to_clean); - l.lock(); - } - } - dout(10) << __func__ << " finish" << dendl; - zoned_cleaner_started = false; -} - -void BlueStore::_zoned_clean_zone( - uint64_t zone, - ZonedAllocator *a, - ZonedFreelistManager *f - ) -{ - dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl; - - KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO); - std::string zone_start; - get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start); - for (it->lower_bound(zone_start); it->valid(); it->next()) { - uint32_t z; - uint64_t offset; - ghobject_t oid; - string k = it->key(); - int r = get_key_zone_offset_object(k, &z, &offset, &oid); - if (r < 0) { - derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k) - << dendl; - continue; - } - if (zone != z) { - dout(10) << __func__ << " reached end of zone refs" << dendl; - break; - } - dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset - << std::dec << " " << oid << dendl; - _clean_some(oid, zone); - } - - if 
(a->get_live_bytes(zone) > 0) { - derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone) - << " live bytes" << std::dec << dendl; - // should we do something else here to avoid a live-lock in the event of a problem? - return; - } - - // make sure transactions flush/drain/commit (and data is all rewritten - // safely elsewhere) before we blow away the cleaned zone - _osr_drain_all(); - - // reset the device zone - dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl; - bdev->reset_zone(zone); - - // record that we can now write there - f->mark_zone_to_clean_free(zone, db); - bdev->flush(); - - // then allow ourselves to start allocating there - dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec - << dendl; - a->reset_zone(zone); -} - -void BlueStore::_clean_some(ghobject_t oid, uint32_t zone) -{ - dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec - << dendl; - - CollectionRef cref = _get_collection_by_oid(oid); - if (!cref) { - dout(10) << __func__ << " can't find collection for " << oid << dendl; - return; - } - Collection *c = cref.get(); - - // serialize io dispatch vs other transactions - std::lock_guard l(atomic_alloc_and_submit_lock); - std::unique_lock l2(c->lock); - - auto o = c->get_onode(oid, false); - if (!o) { - dout(10) << __func__ << " can't find " << oid << dendl; - return; - } - - o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); - _dump_onode<30>(cct, *o); - - // NOTE: This is a naive rewrite strategy. If any blobs are - // shared, they will be duplicated for each object that references - // them. That means any cloned/snapshotted objects will explode - // their utilization. This won't matter for RGW workloads, but - // for RBD and CephFS it is completely unacceptable, and it's - // entirely reasonable to have "archival" data workloads on SMR - // for CephFS and (possibly/probably) RBD. 
- // - // At some point we need to replace this with something more - // sophisticated that ensures that a shared blob gets moved once - // and all referencing objects get updated to point to the new - // location. - - map to_move; - for (auto& e : o->extent_map.extent_map) { - bool touches_zone = false; - for (auto& be : e.blob->get_blob().get_extents()) { - if (be.is_valid()) { - uint32_t z = be.offset / zone_size; - if (z == zone) { - touches_zone = true; - break; - } - } - } - if (touches_zone) { - to_move[e.logical_offset] = e.length; - } - } - if (to_move.empty()) { - dout(10) << __func__ << " no references to zone 0x" << std::hex << zone - << std::dec << " from " << oid << dendl; - return; - } - - dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move - << std::dec << dendl; - OpSequencer *osr = c->osr.get(); - TransContext *txc = _txc_create(c, osr, nullptr); - - spg_t pgid; - if (c->cid.is_pg(&pgid)) { - txc->osd_pool_id = pgid.pool(); - } - - for (auto& [offset, length] : to_move) { - bufferlist bl; - int r = _do_read(c, o, offset, length, bl, 0); - ceph_assert(r == (int)length); - - r = _do_write(txc, cref, o, offset, length, bl, 0); - ceph_assert(r >= 0); - } - txc->write_onode(o); - - _txc_write_nodes(txc, txc->t); - _txc_finalize_kv(txc, txc->t); - _txc_state_proc(txc); -} -#endif bluestore_deferred_op_t *BlueStore::_get_deferred_op( TransContext *txc, uint64_t len) @@ -15316,16 +14654,6 @@ int BlueStore::queue_transactions( OpSequencer *osr = c->osr.get(); dout(10) << __func__ << " ch " << c << " " << c->cid << dendl; - // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O - // submission to happen atomically because if I/O submission happens in a - // different order than I/O allocation, we end up issuing non-sequential - // writes to the drive. This is a temporary solution until ZONE APPEND - // support matures in the kernel. 
For more information please see: - // https://www.usenix.org/conference/vault20/presentation/bjorling - if (bdev->is_smr()) { - atomic_alloc_and_submit_lock.lock(); - } - // prepare TransContext *txc = _txc_create(static_cast(ch.get()), osr, &on_commit, op); @@ -15391,10 +14719,6 @@ int BlueStore::queue_transactions( // execute (start) _txc_state_proc(txc); - if (bdev->is_smr()) { - atomic_alloc_and_submit_lock.unlock(); - } - // we're immediately readable (unlike FileStore) for (auto c : on_applied_sync) { c->complete(0); @@ -15913,36 +15237,6 @@ void BlueStore::_do_write_small( // than 'offset' only). o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off); -#ifdef HAVE_LIBZBD - // On zoned devices, the first goal is to support non-overwrite workloads, - // such as RGW, with large, aligned objects. Therefore, for user writes - // _do_write_small should not trigger. OSDs, however, write and update a tiny - // amount of metadata, such as OSD maps, to disk. For those cases, we - // temporarily just pad them to min_alloc_size and write them to a new place - // on every update. - if (bdev->is_smr()) { - uint64_t b_off = p2phase(offset, alloc_len); - uint64_t b_off0 = b_off; - o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); - - // Zero detection -- small block - if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) { - BlobRef b = c->new_blob(); - _pad_zeros(&bl, &b_off0, min_alloc_size); - wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true); - } else { // if (bl.is_zero()) - dout(20) << __func__ << " skip small zero block " << std::hex - << " (0x" << b_off0 << "~" << bl.length() << ")" - << " (0x" << b_off << "~" << length << ")" - << std::dec << dendl; - logger->inc(l_bluestore_write_small_skipped); - logger->inc(l_bluestore_write_small_skipped_bytes, length); - } - - return; - } -#endif - // Look for an existing mutable blob we can use. 
auto begin = o->extent_map.extent_map.begin(); auto end = o->extent_map.extent_map.end(); @@ -16972,27 +16266,6 @@ void BlueStore::_wctx_finish( WriteContext *wctx, set *maybe_unshared_blobs) { -#ifdef HAVE_LIBZBD - bool is_smr = bdev && bdev->is_smr(); - if (is_smr) { - for (auto& w : wctx->writes) { - for (auto& e : w.b->get_blob().get_extents()) { - if (!e.is_valid()) { - continue; - } - uint32_t zone = e.offset / zone_size; - if (!o->onode.zone_offset_refs.count(zone)) { - uint64_t zoff = e.offset % zone_size; - dout(20) << __func__ << " add ref zone 0x" << std::hex << zone - << " offset 0x" << zoff << std::dec << dendl; - txc->note_write_zone_offset(o, zone, zoff); - } - } - } - } - set zones_with_releases; -#endif - auto oep = wctx->old_extents.begin(); while (oep != wctx->old_extents.end()) { auto &lo = *oep; @@ -17020,12 +16293,6 @@ void BlueStore::_wctx_finish( b->shared_blob->put_ref( e.offset, e.length, &final, unshare_ptr); -#ifdef HAVE_LIBZBD - // we also drop zone ref for shared blob extents - if (is_smr && e.is_valid()) { - zones_with_releases.insert(e.offset / zone_size); - } -#endif } if (unshare) { ceph_assert(maybe_unshared_blobs); @@ -17052,11 +16319,6 @@ void BlueStore::_wctx_finish( if (blob.is_compressed()) { txc->statfs_delta.compressed_allocated() -= e.length; } -#ifdef HAVE_LIBZBD - if (is_smr && e.is_valid()) { - zones_with_releases.insert(e.offset / zone_size); - } -#endif } if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) { @@ -17066,29 +16328,6 @@ void BlueStore::_wctx_finish( } delete &lo; } - -#ifdef HAVE_LIBZBD - if (!zones_with_releases.empty()) { - // we need to fault the entire extent range in here to determinte if we've dropped - // all refs to a zone. 
- o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); - for (auto& b : o->extent_map.extent_map) { - for (auto& e : b.blob->get_blob().get_extents()) { - if (e.is_valid()) { - zones_with_releases.erase(e.offset / zone_size); - } - } - } - for (auto zone : zones_with_releases) { - auto p = o->onode.zone_offset_refs.find(zone); - if (p != o->onode.zone_offset_refs.end()) { - dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone - << " offset 0x" << p->second << std::dec << dendl; - txc->note_release_zone_offset(o, zone, p->second); - } - } - } -#endif } void BlueStore::_do_write_data( @@ -18092,37 +17331,6 @@ int BlueStore::_do_clone_range( oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff); } -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - // duplicate the refs for the shared region. - Extent dummy(dstoff); - for (auto e = newo->extent_map.extent_map.lower_bound(dummy); - e != newo->extent_map.extent_map.end(); - ++e) { - if (e->logical_offset >= dstoff + length) { - break; - } - for (auto& ex : e->blob->get_blob().get_extents()) { - // note that we may introduce a new extent reference that is - // earlier than the first zone ref. we allow this since it is - // a lot of work to avoid and has marginal impact on cleaning - // performance. - if (!ex.is_valid()) { - continue; - } - uint32_t zone = ex.offset / zone_size; - if (!newo->onode.zone_offset_refs.count(zone)) { - uint64_t zoff = ex.offset % zone_size; - dout(20) << __func__ << " add ref zone 0x" << std::hex << zone - << " offset 0x" << zoff << std::dec - << " -> " << newo->oid << dendl; - txc->note_write_zone_offset(newo, zone, zoff); - } - } - } - } -#endif - _dump_onode<30>(cct, *oldo); _dump_onode<30>(cct, *newo); return 0; @@ -18227,27 +17435,6 @@ int BlueStore::_rename(TransContext *txc, // and read newo's metadata via the old name). 
txc->note_modified_object(oldo); -#ifdef HAVE_LIBZBD - if (bdev->is_smr()) { - // adjust zone refs - for (auto& [zone, offset] : newo->onode.zone_offset_refs) { - dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone - << " offset 0x" << offset << std::dec - << " -> " << oldo->oid << dendl; - string key; - get_zone_offset_object_key(zone, offset, oldo->oid, &key); - txc->t->rmkey(PREFIX_ZONED_CL_INFO, key); - - dout(20) << __func__ << " add ref zone 0x" << std::hex << zone - << " offset 0x" << offset << std::dec - << " -> " << newo->oid << dendl; - get_zone_offset_object_key(zone, offset, newo->oid, &key); - bufferlist v; - txc->t->set(PREFIX_ZONED_CL_INFO, key, v); - } - } -#endif - out: dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " << new_oid << " = " << r << dendl; @@ -19861,7 +19048,6 @@ Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) { // create allocator uint64_t alloc_size = min_alloc_size; Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size, - zone_size, first_sequential_zone, "recovery"); if (alloc) { return alloc; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 03c1ce124de1..4b832f7cce1a 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1376,17 +1376,6 @@ public: void rewrite_omap_key(const std::string& old, std::string *out); void decode_omap_key(const std::string& key, std::string *user_key); -#ifdef HAVE_LIBZBD - // Return the offset of an object on disk. This function is intended *only* - // for use with zoned storage devices because in these devices, the objects - // are laid out contiguously on disk, which is not the case in general. - // Also, it should always be called after calling extent_map.fault_range(), - // so that the extent map is loaded. 
- int64_t zoned_get_ondisk_starting_offset() const { - return extent_map.extent_map.begin()->blob-> - get_blob().calc_offset(0, nullptr); - } -#endif private: void _decode(const ceph::buffer::list& v); }; @@ -1847,16 +1836,6 @@ private: std::set onodes; ///< these need to be updated/written std::set modified_objects; ///< objects we modified (and need a ref) -#ifdef HAVE_LIBZBD - // zone refs to add/remove. each zone ref is a (zone, offset) tuple. The offset - // is the first offset in the zone that the onode touched; subsequent writes - // to that zone do not generate additional refs. This is a bit imprecise but - // is sufficient to generate reasonably sequential reads when doing zone - // cleaning with less metadata than a ref for every extent. - std::map, uint64_t> new_zone_offset_refs; - std::map, uint64_t> old_zone_offset_refs; -#endif - std::set shared_blobs; ///< these need to be updated/written std::set blobs_written; ///< update these on io completion KeyValueDB::Transaction t; ///< then we will commit this @@ -1928,17 +1907,6 @@ private: onodes.erase(o); } -#ifdef HAVE_LIBZBD - void note_write_zone_offset(OnodeRef& o, uint32_t zone, uint64_t offset) { - o->onode.zone_offset_refs[zone] = offset; - new_zone_offset_refs[std::make_pair(o, zone)] = offset; - } - void note_release_zone_offset(OnodeRef& o, uint32_t zone, uint64_t offset) { - old_zone_offset_refs[std::make_pair(o, zone)] = offset; - o->onode.zone_offset_refs.erase(zone); - } -#endif - void aio_finish(BlueStore *store) override { store->txc_aio_finish(this); } @@ -2242,17 +2210,6 @@ private: } }; -#ifdef HAVE_LIBZBD - struct ZonedCleanerThread : public Thread { - BlueStore *store; - explicit ZonedCleanerThread(BlueStore *s) : store(s) {} - void *entry() override { - store->_zoned_cleaner_thread(); - return nullptr; - } - }; -#endif - struct BigDeferredWriteContext { uint64_t off = 0; // original logical offset uint32_t b_off = 0; // blob relative offset @@ -2349,15 +2306,6 @@ private: std::deque 
deferred_stable_to_finalize; ///< pending finalization bool kv_finalize_in_progress = false; -#ifdef HAVE_LIBZBD - ZonedCleanerThread zoned_cleaner_thread; - ceph::mutex zoned_cleaner_lock = ceph::make_mutex("BlueStore::zoned_cleaner_lock"); - ceph::condition_variable zoned_cleaner_cond; - bool zoned_cleaner_started = false; - bool zoned_cleaner_stop = false; - std::deque zoned_cleaner_queue; -#endif - PerfCounters *logger = nullptr; std::list removed_collections; @@ -2382,10 +2330,6 @@ private: "not enough bits for min_alloc_size"); bool elastic_shared_blobs = false; ///< use smart ExtentMap::dup to reduce shared blob count - // smr-only - uint64_t zone_size = 0; ///< number of SMR zones - uint64_t first_sequential_zone = 0; ///< first SMR zone that is sequential-only - enum { // Please preserve the order since it's DB persistent OMAP_BULK = 0, @@ -2820,16 +2764,6 @@ private: void _kv_sync_thread(); void _kv_finalize_thread(); -#ifdef HAVE_LIBZBD - void _zoned_cleaner_start(); - void _zoned_cleaner_stop(); - void _zoned_cleaner_thread(); - void _zoned_clean_zone(uint64_t zone_num, - class ZonedAllocator *a, - class ZonedFreelistManager *f); - void _clean_some(ghobject_t oid, uint32_t zone_num); -#endif - bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, uint64_t len); void _deferred_queue(TransContext *txc); public: diff --git a/src/os/bluestore/FreelistManager.cc b/src/os/bluestore/FreelistManager.cc index 69866fa40cb5..37347ced66bd 100644 --- a/src/os/bluestore/FreelistManager.cc +++ b/src/os/bluestore/FreelistManager.cc @@ -27,17 +27,6 @@ FreelistManager *FreelistManager::create( return fm; } -#ifdef HAVE_LIBZBD - // With zoned drives there is only one FreelistManager implementation that we - // can use, and we also know if a drive is zoned right after opening it - // (BlueStore::_open_bdev). Hence, we set freelist_type to "zoned" whenever - // we open the device and it turns out to be is zoned. 
We ignore |prefix| - // passed to create and use the prefixes defined for zoned devices at the top - // of BlueStore.cc. - if (type == "zoned") - return new ZonedFreelistManager(cct, "Z", "z"); -#endif - return NULL; } diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h index 7f44fe957316..b647e55c2c16 100644 --- a/src/os/bluestore/FreelistManager.h +++ b/src/os/bluestore/FreelistManager.h @@ -26,7 +26,6 @@ public: static void setup_merge_operators(KeyValueDB *db, const std::string &type); virtual int create(uint64_t size, uint64_t granularity, - uint64_t zone_size, uint64_t first_sequential_zone, KeyValueDB::Transaction txn) = 0; virtual int init(KeyValueDB *kvdb, bool db_in_read_only, diff --git a/src/os/bluestore/ZonedAllocator.cc b/src/os/bluestore/ZonedAllocator.cc deleted file mode 100644 index 4139b4755697..000000000000 --- a/src/os/bluestore/ZonedAllocator.cc +++ /dev/null @@ -1,240 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -// -// A simple allocator that just hands out space from the next empty zone. This -// is temporary, just to get the simplest append-only write workload to work. 
-// -// Copyright (C) 2020 Abutalib Aghayev -// - -#include "ZonedAllocator.h" -#include "bluestore_types.h" -#include "zoned_types.h" -#include "common/debug.h" - -#define dout_context cct -#define dout_subsys ceph_subsys_bluestore -#undef dout_prefix -#define dout_prefix *_dout << "ZonedAllocator(" << this << ") " << __func__ << " " - -ZonedAllocator::ZonedAllocator(CephContext* cct, - int64_t size, - int64_t blk_size, - int64_t _zone_size, - int64_t _first_sequential_zone, - std::string_view name) - : Allocator(name, size, blk_size), - cct(cct), - size(size), - conventional_size(_first_sequential_zone * _zone_size), - sequential_size(size - conventional_size), - num_sequential_free(0), - block_size(blk_size), - zone_size(_zone_size), - first_seq_zone_num(_first_sequential_zone), - starting_zone_num(first_seq_zone_num), - num_zones(size / zone_size) -{ - ldout(cct, 10) << " size 0x" << std::hex << size - << ", zone size 0x" << zone_size << std::dec - << ", number of zones 0x" << num_zones - << ", first sequential zone 0x" << starting_zone_num - << ", sequential size 0x" << sequential_size - << std::dec - << dendl; - ceph_assert(size % zone_size == 0); - - zone_states.resize(num_zones); -} - -ZonedAllocator::~ZonedAllocator() -{ -} - -int64_t ZonedAllocator::allocate( - uint64_t want_size, - uint64_t alloc_unit, - uint64_t max_alloc_size, - int64_t hint, - PExtentVector *extents) -{ - std::lock_guard l(lock); - - ceph_assert(want_size % 4096 == 0); - - ldout(cct, 10) << " trying to allocate 0x" - << std::hex << want_size << std::dec << dendl; - - uint64_t left = num_zones - first_seq_zone_num; - uint64_t zone_num = starting_zone_num; - for ( ; left > 0; ++zone_num, --left) { - if (zone_num == num_zones) { - zone_num = first_seq_zone_num; - } - if (zone_num == cleaning_zone) { - ldout(cct, 10) << " skipping zone 0x" << std::hex << zone_num - << " because we are cleaning it" << std::dec << dendl; - continue; - } - if (!fits(want_size, zone_num)) { - ldout(cct, 10) 
<< " skipping zone 0x" << std::hex << zone_num - << " because there is not enough space: " - << " want_size = 0x" << want_size - << " available = 0x" << get_remaining_space(zone_num) - << std::dec - << dendl; - continue; - } - break; - } - - if (left == 0) { - ldout(cct, 10) << " failed to allocate" << dendl; - return -ENOSPC; - } - - uint64_t offset = get_offset(zone_num); - - ldout(cct, 10) << " moving zone 0x" << std::hex - << zone_num << " write pointer from 0x" << offset - << " -> 0x" << offset + want_size - << std::dec << dendl; - - increment_write_pointer(zone_num, want_size); - num_sequential_free -= want_size; - if (get_remaining_space(zone_num) == 0) { - starting_zone_num = zone_num + 1; - } - - ldout(cct, 10) << " allocated 0x" << std::hex << offset << "~" << want_size - << " from zone 0x" << zone_num - << " and zone offset 0x" << (offset % zone_size) - << std::dec << dendl; - - extents->emplace_back(bluestore_pextent_t(offset, want_size)); - return want_size; -} - -void ZonedAllocator::release(const interval_set& release_set) -{ - std::lock_guard l(lock); - for (auto p = cbegin(release_set); p != cend(release_set); ++p) { - auto offset = p.get_start(); - auto length = p.get_len(); - uint64_t zone_num = offset / zone_size; - ldout(cct, 10) << " 0x" << std::hex << offset << "~" << length - << " from zone 0x" << zone_num << std::dec << dendl; - uint64_t num_dead = std::min(zone_size - offset % zone_size, length); - for ( ; length; ++zone_num) { - increment_num_dead_bytes(zone_num, num_dead); - length -= num_dead; - num_dead = std::min(zone_size, length); - } - } -} - -uint64_t ZonedAllocator::get_free() -{ - return num_sequential_free; -} - -void ZonedAllocator::dump() -{ - std::lock_guard l(lock); -} - -void ZonedAllocator::foreach( - std::function notify) -{ - std::lock_guard l(lock); -} - -void ZonedAllocator::init_from_zone_pointers( - std::vector &&_zone_states) -{ - // this is called once, based on the device's zone pointers - std::lock_guard 
l(lock); - ldout(cct, 10) << dendl; - zone_states = std::move(_zone_states); - num_sequential_free = 0; - for (size_t i = first_seq_zone_num; i < num_zones; ++i) { - num_sequential_free += zone_size - (zone_states[i].write_pointer % zone_size); - } - ldout(cct, 10) << "free 0x" << std::hex << num_sequential_free - << " / 0x" << sequential_size << std::dec - << dendl; -} - -int64_t ZonedAllocator::pick_zone_to_clean(float min_score, uint64_t min_saved) -{ - std::lock_guard l(lock); - int32_t best = -1; - float best_score = 0.0; - for (size_t i = first_seq_zone_num; i < num_zones; ++i) { - // value (score) = benefit / cost - // benefit = how much net free space we'll get (dead bytes) - // cost = how many bytes we'll have to rewrite (live bytes) - // avoid divide by zero on a zone with no live bytes - float score = - (float)zone_states[i].num_dead_bytes / - (float)(zone_states[i].get_num_live_bytes() + 1); - if (score > 0) { - ldout(cct, 20) << " zone 0x" << std::hex << i - << " dead 0x" << zone_states[i].num_dead_bytes - << " score " << score - << dendl; - } - if (zone_states[i].num_dead_bytes < min_saved) { - continue; - } - if (best < 0 || score > best_score) { - best = i; - best_score = score; - } - } - if (best_score >= min_score) { - ldout(cct, 10) << " zone 0x" << std::hex << best << " with score " << best_score - << ": 0x" << zone_states[best].num_dead_bytes - << " dead and 0x" - << zone_states[best].write_pointer - zone_states[best].num_dead_bytes - << " live bytes" << std::dec << dendl; - } else if (best > 0) { - ldout(cct, 10) << " zone 0x" << std::hex << best << " with score " << best_score - << ": 0x" << zone_states[best].num_dead_bytes - << " dead and 0x" - << zone_states[best].write_pointer - zone_states[best].num_dead_bytes - << " live bytes" << std::dec - << " but below min_score " << min_score - << dendl; - best = -1; - } else { - ldout(cct, 10) << " no zones found that are good cleaning candidates" << dendl; - } - return best; -} - -void 
ZonedAllocator::reset_zone(uint32_t zone) -{ - num_sequential_free += zone_states[zone].write_pointer; - zone_states[zone].reset(); -} - -bool ZonedAllocator::low_on_space(void) -{ - std::lock_guard l(lock); - double free_ratio = static_cast(num_sequential_free) / sequential_size; - - ldout(cct, 10) << " free 0x" << std::hex << num_sequential_free - << "/ 0x" << sequential_size << std::dec - << ", free ratio is " << free_ratio << dendl; - ceph_assert(num_sequential_free <= (int64_t)sequential_size); - - // TODO: make 0.25 tunable - return free_ratio <= 0.25; -} - -void ZonedAllocator::shutdown() -{ - ldout(cct, 1) << dendl; -} diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h deleted file mode 100644 index 0778bd0da9e6..000000000000 --- a/src/os/bluestore/ZonedAllocator.h +++ /dev/null @@ -1,120 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -// -// A simple allocator that just hands out space from the next empty zone. This -// is temporary, just to get the simplest append-only write workload to work. -// -// Copyright (C) 2020 Abutalib Aghayev -// - -#ifndef CEPH_OS_BLUESTORE_ZONEDALLOCATOR_H -#define CEPH_OS_BLUESTORE_ZONEDALLOCATOR_H - -#include - -#include "Allocator.h" -#include "common/ceph_mutex.h" -#include "include/btree_map.h" -#include "include/interval_set.h" -#include "include/mempool.h" -#include "bluestore_types.h" -#include "zoned_types.h" - -class ZonedAllocator : public Allocator { - CephContext* cct; - - // Currently only one thread at a time calls into ZonedAllocator due to - // atomic_alloc_and_submit_lock in BlueStore.cc, but we do locking anyway - // because eventually ZONE_APPEND support will land and - // atomic_alloc_and_submit_lock will be removed. 
- ceph::mutex lock = ceph::make_mutex("ZonedAllocator::lock"); - - uint64_t size; - uint64_t conventional_size, sequential_size; - std::atomic num_sequential_free; ///< total bytes in freelist - uint64_t block_size; - uint64_t zone_size; - uint64_t first_seq_zone_num; - uint64_t starting_zone_num; - uint64_t num_zones; - std::atomic cleaning_zone = -1; - std::vector zone_states; - - inline uint64_t get_offset(uint64_t zone_num) const { - return zone_num * zone_size + get_write_pointer(zone_num); - } - -public: - inline uint64_t get_write_pointer(uint64_t zone_num) const { - return zone_states[zone_num].get_write_pointer(); - } -private: - inline uint64_t get_remaining_space(uint64_t zone_num) const { - return zone_size - get_write_pointer(zone_num); - } - - inline void increment_write_pointer(uint64_t zone_num, uint64_t want_size) { - zone_states[zone_num].increment_write_pointer(want_size); - } - - inline void increment_num_dead_bytes(uint64_t zone_num, uint64_t length) { - zone_states[zone_num].increment_num_dead_bytes(length); - } - - inline bool fits(uint64_t want_size, uint64_t zone_num) const { - return want_size <= get_remaining_space(zone_num); - } - -public: - ZonedAllocator(CephContext* cct, int64_t size, int64_t block_size, - int64_t _zone_size, - int64_t _first_sequential_zone, - std::string_view name); - ~ZonedAllocator() override; - - const char *get_type() const override { - return "zoned"; - } - - uint64_t get_dead_bytes(uint32_t zone) { - return zone_states[zone].num_dead_bytes; - } - uint64_t get_live_bytes(uint32_t zone) { - std::scoped_lock l(lock); - return zone_states[zone].write_pointer - zone_states[zone].num_dead_bytes; - } - - int64_t allocate( - uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, - int64_t hint, PExtentVector *extents) override; - - void release(const interval_set& release_set) override; - - uint64_t get_free() override; - - void dump() override; - void foreach( - std::function notify) override; - - int64_t 
pick_zone_to_clean(float min_score, uint64_t min_saved); - void set_cleaning_zone(uint32_t zone) { - cleaning_zone = zone; - } - void clear_cleaning_zone(uint32_t zone) { - cleaning_zone = -1; - } - void reset_zone(uint32_t zone); - - void init_from_zone_pointers( - std::vector &&_zone_states); - void init_add_free(uint64_t offset, uint64_t length) override {} - void init_rm_free(uint64_t offset, uint64_t length) override {} - - void shutdown() override; - -private: - bool low_on_space(void); -}; - -#endif diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc deleted file mode 100644 index 3a5bce66fe5d..000000000000 --- a/src/os/bluestore/ZonedFreelistManager.cc +++ /dev/null @@ -1,372 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -// -// A freelist manager for zoned devices. This iteration just keeps the write -// pointer per zone. Following iterations will add enough information to enable -// cleaning of zones. 
-// -// Copyright (C) 2020 Abutalib Aghayev -// - -#include "ZonedFreelistManager.h" -#include "bluestore_common.h" -#include "include/stringify.h" -#include "kv/KeyValueDB.h" -#include "os/kv.h" -#include "zoned_types.h" - -#include "common/debug.h" - -#define dout_context cct -#define dout_subsys ceph_subsys_bluestore -#undef dout_prefix -#define dout_prefix *_dout << "zoned freelist " - -using std::string; - -using ceph::bufferlist; -using ceph::bufferptr; -using ceph::decode; -using ceph::encode; - -void ZonedFreelistManager::write_zone_state_delta_to_db( - uint64_t zone_num, - const zone_state_t &zone_state, - KeyValueDB::Transaction txn) -{ - string key; - _key_encode_u64(zone_num, &key); - bufferlist bl; - zone_state.encode(bl); - txn->merge(info_prefix, key, bl); -} - -void ZonedFreelistManager::write_zone_state_reset_to_db( - uint64_t zone_num, - const zone_state_t &zone_state, - KeyValueDB::Transaction txn) -{ - string key; - _key_encode_u64(zone_num, &key); - bufferlist bl; - zone_state.encode(bl); - txn->set(info_prefix, key, bl); -} - -void ZonedFreelistManager::load_zone_state_from_db( - uint64_t zone_num, - zone_state_t &zone_state, - KeyValueDB::Iterator& it) const -{ - string k = it->key(); - uint64_t zone_num_from_db; - _key_decode_u64(k.c_str(), &zone_num_from_db); - ceph_assert(zone_num_from_db == zone_num); - - bufferlist bl = it->value(); - auto p = bl.cbegin(); - zone_state.decode(p); -} - -void ZonedFreelistManager::init_zone_states(KeyValueDB::Transaction txn) -{ - dout(10) << __func__ << dendl; - for (uint64_t zone_num = 0; zone_num < num_zones; ++zone_num) { - zone_state_t zone_state; - write_zone_state_reset_to_db(zone_num, zone_state, txn); - } -} - -void ZonedFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix) -{ - std::shared_ptr merge_op( - new Int64ArrayMergeOperator); - db->set_merge_operator(prefix, merge_op); -} - -ZonedFreelistManager::ZonedFreelistManager( - CephContext* cct, - string meta_prefix, - string 
info_prefix) - : FreelistManager(cct), - meta_prefix(meta_prefix), - info_prefix(info_prefix), - enumerate_zone_num(~0UL) -{ -} - -int ZonedFreelistManager::create( - uint64_t new_size, - uint64_t granularity, - uint64_t new_zone_size, - uint64_t first_sequential_zone, - KeyValueDB::Transaction txn) -{ - size = new_size; - bytes_per_block = granularity; - zone_size = new_zone_size; - num_zones = size / zone_size; - starting_zone_num = first_sequential_zone; - enumerate_zone_num = ~0UL; - - ceph_assert(size % zone_size == 0); - - dout(1) << __func__ << std::hex - << " size 0x" << size - << " bytes_per_block 0x" << bytes_per_block - << " zone size 0x " << zone_size - << " num_zones 0x" << num_zones - << " starting_zone 0x" << starting_zone_num << dendl; - { - bufferlist bl; - encode(size, bl); - txn->set(meta_prefix, "size", bl); - } - { - bufferlist bl; - encode(bytes_per_block, bl); - txn->set(meta_prefix, "bytes_per_block", bl); - } - { - bufferlist bl; - encode(zone_size, bl); - txn->set(meta_prefix, "zone_size", bl); - } - { - bufferlist bl; - encode(num_zones, bl); - txn->set(meta_prefix, "num_zones", bl); - } - { - bufferlist bl; - encode(starting_zone_num, bl); - txn->set(meta_prefix, "starting_zone_num", bl); - } - - init_zone_states(txn); - - return 0; -} - -int ZonedFreelistManager::init( - KeyValueDB *kvdb, - bool db_in_read_only, - cfg_reader_t cfg_reader) -{ - dout(1) << __func__ << dendl; - int r = _read_cfg(cfg_reader); - if (r != 0) { - return r; - } - - ceph_assert(num_zones == size / zone_size); - - dout(10) << __func__ << std::hex - << " size 0x" << size - << " bytes_per_block 0x" << bytes_per_block - << " zone size 0x" << zone_size - << " num_zones 0x" << num_zones - << " starting_zone 0x" << starting_zone_num - << std::dec << dendl; - return 0; -} - -void ZonedFreelistManager::sync(KeyValueDB* kvdb) -{ -} - -void ZonedFreelistManager::shutdown() -{ - dout(1) << __func__ << dendl; -} - -void ZonedFreelistManager::enumerate_reset() -{ - 
std::lock_guard l(lock); - - dout(1) << __func__ << dendl; - - enumerate_p.reset(); - enumerate_zone_num = ~0UL; -} - -// Currently, this just iterates over the list of zones and sets |offset| and -// |length| to the write pointer and the number of remaining free bytes in a -// given zone. Hence, it can set |length| to 0 if a zone is full, and it can -// also return two contiguous empty zones in two calls. This does not violate -// current semantics of the call and appears to work fine with the clients of -// this call. -bool ZonedFreelistManager::enumerate_next( - KeyValueDB *kvdb, - uint64_t *offset, - uint64_t *length) -{ - std::lock_guard l(lock); - - // starting case - if (enumerate_zone_num == ~0UL) { - dout(30) << __func__ << " start" << dendl; - enumerate_p = kvdb->get_iterator(info_prefix); - enumerate_p->lower_bound(string()); - ceph_assert(enumerate_p->valid()); - enumerate_zone_num = 0; - } else { - enumerate_p->next(); - if (!enumerate_p->valid()) { - dout(30) << __func__ << " end" << dendl; - return false; - } - ++enumerate_zone_num; - } - - zone_state_t zone_state; - load_zone_state_from_db(enumerate_zone_num, zone_state, enumerate_p); - - *offset = enumerate_zone_num * zone_size + zone_state.get_write_pointer(); - *length = zone_size - zone_state.get_write_pointer(); - - dout(30) << __func__ << std::hex << " 0x" << *offset << "~" << *length - << std::dec << dendl; - - return true; -} - -void ZonedFreelistManager::dump(KeyValueDB *kvdb) -{ - enumerate_reset(); - uint64_t offset, length; - while (enumerate_next(kvdb, &offset, &length)) { - dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length - << std::dec << dendl; - } -} - -// Advances the write pointer and writes the updated write pointer to database. 
-void ZonedFreelistManager::allocate( - uint64_t offset, - uint64_t length, - KeyValueDB::Transaction txn) -{ - while (length > 0) { - uint64_t zone_num = offset / zone_size; - uint64_t this_len = std::min(length, zone_size - offset % zone_size); - dout(10) << __func__ << " 0x" << std::hex << offset << "~" << this_len - << " zone 0x" << zone_num << std::dec << dendl; - zone_state_t zone_state; - zone_state.increment_write_pointer(this_len); - write_zone_state_delta_to_db(zone_num, zone_state, txn); - offset += this_len; - length -= this_len; - } -} - -// Increments the number of dead bytes in a zone and writes the updated value to -// database. The dead bytes in the zone are not usable. The cleaner will later -// copy live objects from the zone to another zone an make the zone writable -// again. The number of dead bytes in a zone is used by the cleaner to select -// which zones to clean -- the ones with most dead bytes are good candidates -// since they require less I/O. -void ZonedFreelistManager::release( - uint64_t offset, - uint64_t length, - KeyValueDB::Transaction txn) -{ - while (length > 0) { - uint64_t zone_num = offset / zone_size; - uint64_t this_len = std::min(length, zone_size - offset % zone_size); - dout(10) << __func__ << " 0x" << std::hex << offset << "~" << this_len - << " zone 0x" << zone_num << std::dec << dendl; - zone_state_t zone_state; - zone_state.increment_num_dead_bytes(this_len); - write_zone_state_delta_to_db(zone_num, zone_state, txn); - length -= this_len; - offset += this_len; - } -} - -void ZonedFreelistManager::get_meta( - uint64_t target_size, - std::vector>* res) const -{ - // We do not support expanding devices for now. 
- ceph_assert(target_size == 0); - res->emplace_back("zfm_size", stringify(size)); - res->emplace_back("zfm_bytes_per_block", stringify(bytes_per_block)); - res->emplace_back("zfm_zone_size", stringify(zone_size)); - res->emplace_back("zfm_num_zones", stringify(num_zones)); - res->emplace_back("zfm_starting_zone_num", stringify(starting_zone_num)); -} - -std::vector ZonedFreelistManager::get_zone_states( - KeyValueDB *kvdb) const -{ - std::vector zone_states; - auto p = kvdb->get_iterator(info_prefix); - uint64_t zone_num = 0; - for (p->lower_bound(string()); p->valid(); p->next(), ++zone_num) { - zone_state_t zone_state; - load_zone_state_from_db(zone_num, zone_state, p); - zone_states.emplace_back(zone_state); - } - return zone_states; -} - -// TODO: The following function is copied almost verbatim from -// BitmapFreelistManager. Eliminate duplication. -int ZonedFreelistManager::_read_cfg(cfg_reader_t cfg_reader) -{ - dout(1) << __func__ << dendl; - - string err; - - const size_t key_count = 5; - string keys[key_count] = { - "zfm_size", - "zfm_bytes_per_block", - "zfm_zone_size", - "zfm_num_zones", - "zfm_starting_zone_num" - }; - uint64_t* vals[key_count] = { - &size, - &bytes_per_block, - &zone_size, - &num_zones, - &starting_zone_num}; - - for (size_t i = 0; i < key_count; i++) { - string val; - int r = cfg_reader(keys[i], &val); - if (r == 0) { - *(vals[i]) = strict_iecstrtoll(val.c_str(), &err); - if (!err.empty()) { - derr << __func__ << " Failed to parse - " - << keys[i] << ":" << val - << ", error: " << err << dendl; - return -EINVAL; - } - } else { - // this is expected for legacy deployed OSDs - dout(0) << __func__ << " " << keys[i] << " not found in bdev meta" << dendl; - return r; - } - } - return 0; -} - -void ZonedFreelistManager::mark_zone_to_clean_free( - uint64_t zone, - KeyValueDB *kvdb) -{ - dout(10) << __func__ << " zone 0x" << std::hex << zone << std::dec << dendl; - - KeyValueDB::Transaction txn = kvdb->get_transaction(); - - zone_state_t 
empty_zone_state; - write_zone_state_reset_to_db(zone, empty_zone_state, txn); - - // block here until this commits so that we don't end up starting to allocate and - // write to the new zone before this fully commits. - kvdb->submit_transaction_sync(txn); -} diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h deleted file mode 100644 index 378a20f0a796..000000000000 --- a/src/os/bluestore/ZonedFreelistManager.h +++ /dev/null @@ -1,113 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -// -// A freelist manager for zoned devices. -// -// Copyright (C) 2020 Abutalib Aghayev -// - -#ifndef CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H -#define CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H - -#include "FreelistManager.h" - -#include -#include - -#include "common/ceph_mutex.h" -#include "include/buffer.h" -#include "kv/KeyValueDB.h" -#include "zoned_types.h" - -using cfg_reader_t = std::function; - -class ZonedFreelistManager : public FreelistManager { - std::string meta_prefix; ///< device size, zone size, etc. 
- std::string info_prefix; ///< per zone write pointer, dead bytes - mutable ceph::mutex lock = ceph::make_mutex("ZonedFreelistManager::lock"); - - uint64_t size; ///< size of sequential region (bytes) - uint64_t bytes_per_block; ///< bytes per allocation unit (bytes) - uint64_t zone_size; ///< size of a single zone (bytes) - uint64_t num_zones; ///< number of sequential zones - uint64_t starting_zone_num; ///< the first sequential zone number - - KeyValueDB::Iterator enumerate_p; - uint64_t enumerate_zone_num; - - void write_zone_state_delta_to_db(uint64_t zone_num, - const zone_state_t &zone_state, - KeyValueDB::Transaction txn); - void write_zone_state_reset_to_db(uint64_t zone_num, - const zone_state_t &zone_state, - KeyValueDB::Transaction txn); - void load_zone_state_from_db(uint64_t zone_num, - zone_state_t &zone_state, - KeyValueDB::Iterator &it) const; - - void init_zone_states(KeyValueDB::Transaction txn); - - void increment_write_pointer( - uint64_t zone, uint64_t length, KeyValueDB::Transaction txn); - void increment_num_dead_bytes( - uint64_t zone, uint64_t num_bytes, KeyValueDB::Transaction txn); - - int _read_cfg(cfg_reader_t cfg_reader); - -public: - ZonedFreelistManager(CephContext* cct, - std::string meta_prefix, - std::string info_prefix); - - static void setup_merge_operator(KeyValueDB *db, std::string prefix); - - int create(uint64_t size, - uint64_t granularity, - uint64_t zone_size, - uint64_t first_sequential_zone, - KeyValueDB::Transaction txn) override; - - int init(KeyValueDB *kvdb, - bool db_in_read_only, - cfg_reader_t cfg_reader) override; - - void shutdown() override; - void sync(KeyValueDB* kvdb) override; - void dump(KeyValueDB *kvdb) override; - - void enumerate_reset() override; - bool enumerate_next(KeyValueDB *kvdb, - uint64_t *offset, - uint64_t *length) override; - - void allocate(uint64_t offset, - uint64_t length, - KeyValueDB::Transaction txn) override; - - void release(uint64_t offset, - uint64_t length, - 
KeyValueDB::Transaction txn) override; - - inline uint64_t get_size() const override { - return size; - } - - inline uint64_t get_alloc_units() const override { - return size / bytes_per_block; - } - - inline uint64_t get_alloc_size() const override { - return bytes_per_block; - } - - void get_meta(uint64_t target_size, - std::vector>*) const override; - - std::vector get_zone_states(KeyValueDB *kvdb) const; - - void mark_zone_to_clean_free(uint64_t zone, - KeyValueDB *kvdb); -}; - -#endif diff --git a/src/os/bluestore/zoned_types.h b/src/os/bluestore/zoned_types.h deleted file mode 100644 index d8ca3a0c7c63..000000000000 --- a/src/os/bluestore/zoned_types.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef CEPH_OS_BLUESTORE_ZONED_TYPES_H -#define CEPH_OS_BLUESTORE_ZONED_TYPES_H - -#include "include/types.h" -#include "kv/KeyValueDB.h" -#include "os/kv.h" - -// Tracks two bits of information about the state of a zone: (1) number of dead -// bytes in a zone and (2) the write pointer. We use the existing -// Int64ArrayMergeOperator for merge and avoid the cost of point queries. -// -// We use the same struct for an on-disk and in-memory representation of the -// state. 
-struct zone_state_t { - uint64_t num_dead_bytes = 0; ///< dead bytes deallocated (behind the write pointer) - uint64_t write_pointer = 0; ///< relative offset within the zone - - void encode(ceph::buffer::list &bl) const { - using ceph::encode; - encode(write_pointer, bl); - encode(num_dead_bytes, bl); - } - void decode(ceph::buffer::list::const_iterator &p) { - using ceph::decode; - decode(write_pointer, p); - decode(num_dead_bytes, p); - } - - void reset() { - write_pointer = 0; - num_dead_bytes = 0; - } - - uint64_t get_num_dead_bytes() const { - return num_dead_bytes; - } - - uint64_t get_num_live_bytes() const { - return write_pointer - num_dead_bytes; - } - - uint64_t get_write_pointer() const { - return write_pointer; - } - - void increment_num_dead_bytes(uint64_t num_bytes) { - num_dead_bytes += num_bytes; - } - - void increment_write_pointer(uint64_t num_bytes) { - write_pointer += num_bytes; - } - - friend std::ostream& operator<<( - std::ostream& out, - const zone_state_t& zone_state) { - return out << std::hex - << " dead bytes: 0x" << zone_state.get_num_dead_bytes() - << " write pointer: 0x" << zone_state.get_write_pointer() - << " " << std::dec; - } -}; - -#endif diff --git a/src/test/objectstore/run_smr_bluestore_test.sh b/src/test/objectstore/run_smr_bluestore_test.sh deleted file mode 100644 index d689cf2c5011..000000000000 --- a/src/test/objectstore/run_smr_bluestore_test.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -ex - -# 1) run_smr_bluestore_test.sh -# Setup smr device, run all tests - -# 2) run_smr_bluestore_test.sh --smr -# Setup smr device but skip tests failing on smr - - -before_creation=$(mktemp) -lsscsi > $before_creation - -echo "cd /backstores/user:zbc -create name=zbc0 size=20G cfgstring=model-HM/zsize-256/conv-10@zbc0.raw -/loopback create -cd /loopback -create naa.50014055e5f25aa0 -cd naa.50014055e5f25aa0/luns -create /backstores/user:zbc/zbc0 0 -" | sudo targetcli - -sleep 1 #if too fast device does not show up 
-after_creation=$(mktemp) -lsscsi > $after_creation -if [[ $(diff $before_creation $after_creation | wc -l ) != 2 ]] -then - echo New zbc device not created - false -fi - -function cleanup() { - echo "cd /loopback -delete naa.50014055e5f25aa0 -cd /backstores/user:zbc -delete zbc0" | sudo targetcli - sudo rm -f zbc0.raw - rm -f $before_creation $after_creation -} -trap cleanup EXIT - -DEV=$(diff $before_creation $after_creation |grep zbc |sed "s@.* /@/@") -sudo chmod 666 $DEV -# Need sudo -# https://patchwork.kernel.org/project/linux-block/patch/20210811110505.29649-3-Niklas.Cassel@wdc.com/ -sudo ceph_test_objectstore \ - --bluestore-block-path $DEV \ - --gtest_filter=*/2 \ - $* diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 99da6002ef00..de779eb039dc 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -55,8 +55,6 @@ typedef boost::mt11213b gen_type; const uint64_t DEF_STORE_TEST_BLOCKDEV_SIZE = 10240000000; #define dout_context g_ceph_context -bool smr = false; - static bool bl_eq(bufferlist& expected, bufferlist& actual) { if (expected.contents_equal(actual)) @@ -1278,16 +1276,11 @@ void StoreTest::doCompressionTest() TEST_P(StoreTest, CompressionTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "TODO: need to adjust statfs check for smr" << std::endl; - return; - } SetVal(g_conf(), "bluestore_compression_algorithm", "snappy"); SetVal(g_conf(), "bluestore_compression_mode", "force"); g_ceph_context->_conf.apply_changes(nullptr); doCompressionTest(); - SetVal(g_conf(), "bluestore_compression_algorithm", "zlib"); SetVal(g_conf(), "bluestore_compression_mode", "aggressive"); g_ceph_context->_conf.apply_changes(nullptr); @@ -1514,10 +1507,6 @@ TEST_P(StoreTest, SimpleObjectTest) { TEST_P(StoreTestSpecificAUSize, ReproBug41901Test) { if(string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP (smr)" << std::endl; - return; - } SetVal(g_conf(), 
"bluestore_max_blob_size", "524288"); SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd"); @@ -1613,10 +1602,6 @@ TEST_P(StoreTestSpecificAUSize, ReproBug41901Test) { TEST_P(StoreTestSpecificAUSize, BluestoreStatFSTest) { if(string(GetParam()) != "bluestore") return; - if (smr) { - cout << "TODO: fix this for smr" << std::endl; - return; - } SetVal(g_conf(), "bluestore_block_db_path", ""); StartDeferred(65536); SetVal(g_conf(), "bluestore_compression_mode", "force"); @@ -2147,10 +2132,6 @@ TEST_P(StoreTestSpecificAUSize, BluestoreStatFSTest) { TEST_P(StoreTestSpecificAUSize, BluestoreFragmentedBlobTest) { if(string(GetParam()) != "bluestore") return; - if (smr) { - cout << "TODO: fix this for smr" << std::endl; - return; - } SetVal(g_conf(), "bluestore_block_db_path", ""); StartDeferred(0x10000); @@ -3782,10 +3763,6 @@ TEST_P(StoreTest, SimpleCloneRangeTest) { TEST_P(StoreTest, BlueStoreUnshareBlobTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: non-deterministic behavior with smr" << std::endl; - return; - } int r; coll_t cid; auto ch = store->create_new_collection(cid); @@ -7164,11 +7141,6 @@ void doMany4KWritesTest(ObjectStore* store, TEST_P(StoreTestSpecificAUSize, Many4KWritesTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply" - << std::endl; - return; - } StartDeferred(0x10000); @@ -7179,11 +7151,6 @@ TEST_P(StoreTestSpecificAUSize, Many4KWritesTest) { TEST_P(StoreTestSpecificAUSize, Many4KWritesNoCSumTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply" - << std::endl; - return; - } StartDeferred(0x10000); SetVal(g_conf(), "bluestore_csum_type", "none"); g_ceph_context->_conf.apply_changes(nullptr); @@ -7195,11 +7162,6 @@ TEST_P(StoreTestSpecificAUSize, Many4KWritesNoCSumTest) { TEST_P(StoreTestSpecificAUSize, TooManyBlobsTest) 
{ if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply" - << std::endl; - return; - } StartDeferred(0x10000); const unsigned max_object = 4*1024*1024; doMany4KWritesTest(store.get(), 1, 1000, max_object, 4*1024, 0); @@ -7575,9 +7537,6 @@ TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionSmallOverwrite) { if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) { GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping"; } - if (smr) { - GTEST_SKIP() << "smr, skipping"; - } size_t block_size = 65536; StartDeferred(block_size); @@ -7755,9 +7714,6 @@ TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionBigOverwrite) { if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) { GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping"; } - if (smr) { - GTEST_SKIP() << "smr, skipping"; - } size_t block_size = 4096; StartDeferred(block_size); @@ -7858,10 +7814,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite1) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred" << std::endl; - return; - } size_t block_size = 4096; StartDeferred(block_size); @@ -8046,10 +7998,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite2) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred" << std::endl; - return; - } size_t block_size = 4096; StartDeferred(block_size); @@ -8196,10 +8144,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite3) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred" << std::endl; - return; - } size_t block_size = 4096; StartDeferred(block_size); @@ -8362,10 +8306,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite4) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred" << std::endl; - return; - } 
size_t block_size = 4096; SetVal(g_conf(), "bluestore_block_db_create", "true"); @@ -8440,10 +8380,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite5) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred" << std::endl; - return; - } size_t block_size = 4096; SetVal(g_conf(), "bluestore_block_db_create", "true"); @@ -8518,10 +8454,6 @@ TEST_P(StoreTestSpecificAUSize, DeferredDifferentChunks) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred" << std::endl; - return; - } size_t alloc_size = 4096; size_t large_object_size = 1 * 1024 * 1024; @@ -8625,10 +8557,6 @@ TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwriteReverse) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no overwrite" << std::endl; - return; - } size_t block_size = 4096; StartDeferred(block_size); @@ -8805,10 +8733,6 @@ TEST_P(StoreTestSpecificAUSize, BlobReuseOnSmallOverwrite) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no overwrite" << std::endl; - return; - } size_t block_size = 4096; StartDeferred(block_size); @@ -8971,10 +8895,6 @@ TEST_P(StoreTestSpecificAUSize, ReproBug56488Test) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: no deferred" << std::endl; - return; - } size_t alloc_size = 65536; size_t write_size = 4096; @@ -9147,10 +9067,6 @@ TEST_P(StoreTestSpecificAUSize, garbageCollection) { int write_offset = buf_len; if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: assertions about allocations need to be adjusted" << std::endl; - return; - } #define WRITE_AT(offset, _length) {\ ObjectStore::Transaction t;\ @@ -9346,10 +9262,6 @@ namespace { TEST_P(StoreTestSpecificAUSize, BluestoreRepairTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl; - return; - } const size_t offs_base = 
65536 / 2; @@ -9554,10 +9466,6 @@ TEST_P(StoreTestSpecificAUSize, BluestoreRepairTest) { TEST_P(StoreTestSpecificAUSize, BluestoreBrokenZombieRepairTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: smr repair is different" << std::endl; - return; - } SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); @@ -9627,10 +9535,6 @@ TEST_P(StoreTestSpecificAUSize, BluestoreBrokenZombieRepairTest) { TEST_P(StoreTestSpecificAUSize, BluestoreRepairSharedBlobTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl; - return; - } SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); @@ -9703,10 +9607,6 @@ TEST_P(StoreTestSpecificAUSize, BluestoreRepairSharedBlobTest) { TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: smr repair is different" << std::endl; - return; - } SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); @@ -10320,41 +10220,6 @@ TEST_P(StoreTest, mergeRegionTest) { } } -TEST_P(StoreTest, FixSMRWritePointer) { - if(string(GetParam()) != "bluestore") - return; - if (!smr) - return; - int r = store->umount(); - ASSERT_EQ(0, r); - - // copied from StoreTestFixture - std::string path = GetParam() + ".test_temp_dir"s; - - std::string p = path + "/block"; - BlockDevice* bdev = BlockDevice::create(g_ceph_context, p, nullptr, nullptr, nullptr, nullptr); - r = bdev->open(p); - ASSERT_EQ(0, r); - ASSERT_EQ(true, bdev->is_smr()); - - std::vector wp = bdev->get_zones(); - uint64_t first_seq_zone = bdev->get_conventional_region_size() / bdev->get_zone_size(); - - IOContext ioc(g_ceph_context, NULL, true); - bufferlist bl; - bl.append(std::string(1024 * 1024, 'x')); - r = 
bdev->aio_write(wp[first_seq_zone], bl, &ioc, false); - ASSERT_EQ(0, r); - bdev->aio_submit(&ioc); - ioc.aio_wait(); - bdev->close(); - delete bdev; - - r = store->mount(); - ASSERT_EQ(0, r); -} - - TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsHdd) { if (string(GetParam()) != "bluestore") return; @@ -10425,10 +10290,6 @@ TEST_P(StoreTestSpecificAUSize, ReproNoBlobMultiTest) { if(string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP (FIXME): bluestore gc does not seem to do the trick here" << std::endl; - return; - } SetVal(g_conf(), "bluestore_block_db_create", "true"); SetVal(g_conf(), "bluestore_block_db_size", "4294967296"); @@ -10546,10 +10407,6 @@ void doManySetAttr(ObjectStore* store, TEST_P(StoreTestSpecificAUSize, SpilloverLegacyTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl; - return; - } SetVal(g_conf(), "bluestore_block_db_create", "true"); SetVal(g_conf(), "bluestore_block_db_size", "3221225472"); @@ -10581,10 +10438,6 @@ TEST_P(StoreTestSpecificAUSize, SpilloverLegacyTest) { TEST_P(StoreTestSpecificAUSize, SpilloverLegacyFixedByFitToFastTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl; - return; - } SetVal(g_conf(), "bluestore_block_db_create", "true"); SetVal(g_conf(), "bluestore_block_db_size", "3221225472"); @@ -10632,10 +10485,6 @@ void do_bluefs_write(BlueFS* _fs, TEST_P(StoreTestSpecificAUSize, SpilloverTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: (FIXME?) adjust me for smr at some point?" 
<< std::endl; - return; - } SetVal(g_conf(), "bluestore_block_db_create", "true"); SetVal(g_conf(), "bluestore_block_db_size", "3221225472"); @@ -10666,10 +10515,6 @@ TEST_P(StoreTestSpecificAUSize, SpilloverTest) { TEST_P(StoreTestSpecificAUSize, SpilloverFixedCompletelyTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl; - return; - } SetVal(g_conf(), "bluestore_block_db_create", "true"); SetVal(g_conf(), "bluestore_block_db_size", "3221225472"); @@ -10693,10 +10538,6 @@ TEST_P(StoreTestSpecificAUSize, SpilloverFixedCompletelyTest) { TEST_P(StoreTestSpecificAUSize, SpilloverFixedPartialTest) { if (string(GetParam()) != "bluestore") return; - if (smr) { - cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl; - return; - } SetVal(g_conf(), "bluestore_block_db_create", "true"); SetVal(g_conf(), "bluestore_block_db_size", stringify(3ull << 30).c_str()); @@ -10732,9 +10573,6 @@ TEST_P(StoreTestSpecificAUSize, SpilloverFixedPartialTest) { TEST_P(StoreTestSpecificAUSize, Ticket45195Repro) { if (string(GetParam()) != "bluestore") return; - if (smr) { - return; - } SetVal(g_conf(), "bluestore_default_buffered_write", "true"); SetVal(g_conf(), "bluestore_max_blob_size", "65536"); @@ -11059,18 +10897,6 @@ int main(int argc, char **argv) { CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); common_init_finish(g_ceph_context); - for (auto& i : args) { - if (i == "--smr"s) { -#if defined(HAVE_LIBZBD) - derr << "Adjusting tests for smr mode." 
<< dendl; - smr = true; -#else - derr << "smr mode selected, but support not compiled in" << dendl; - return 1; -#endif - } - } - // make sure we can adjust any config settings g_ceph_context->_conf._clear_safe_to_start_threads(); diff --git a/src/vstart.sh b/src/vstart.sh index 1187e86b9ed8..a76eadfb54ec 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -252,7 +252,6 @@ options: --bluestore-devs: comma-separated list of blockdevs to use for bluestore --bluestore-db-devs: comma-separated list of db-devs to use for bluestore --bluestore-wal-devs: comma-separated list of wal-devs to use for bluestore - --bluestore-zoned: blockdevs listed by --bluestore-devs are zoned devices (HM-SMR HDD or ZNS SSD) --bluestore-io-uring: enable io_uring backend --inc-osd: append some more osds into existing vcluster --cephadm: enable cephadm orchestrator with ~/.ssh/id_rsa[.pub] @@ -586,9 +585,6 @@ case $1 in parse_bluestore_wal_devs --bluestore-wal-devs "$2" shift ;; - --bluestore-zoned) - zoned_enabled=1 - ;; --bluestore-io-uring) io_uring_enabled=1 shift @@ -852,7 +848,6 @@ EOF bluestore prefer deferred size = 0 bluestore prefer deferred size hdd = 0 bluestore prefer deferred size ssd = 0 - bluestore allocator = zoned" fi if [ "$io_uring_enabled" -eq 1 ]; then BLUESTORE_OPTS+="