- stupid
- avl
- hybrid
- - zoned
with_legacy: true
- name: bluestore_freelist_blocks_per_key
type: size
- aio
- spdk
- pmem
- - hm_smr
- name: bluestore_cleaner_sleep_interval
type: float
level: advanced
)
endif(WITH_BLUESTORE)
-if(WITH_ZBD)
- list(APPEND libos_srcs
- bluestore/ZonedFreelistManager.cc
- bluestore/ZonedAllocator.cc)
-endif()
-
if(WITH_FUSE)
list(APPEND libos_srcs
FuseStore.cc)
std::string_view type,
int64_t size,
int64_t block_size,
- int64_t zone_size,
- int64_t first_sequential_zone,
std::string_view name)
{
Allocator* alloc = nullptr;
return new HybridAllocator(cct, size, block_size,
cct->_conf.get_val<uint64_t>("bluestore_hybrid_alloc_mem_cap"),
name);
-#ifdef HAVE_LIBZBD
- } else if (type == "zoned") {
- return new ZonedAllocator(cct, size, block_size, zone_size, first_sequential_zone,
- name);
-#endif
}
if (alloc == nullptr) {
lderr(cct) << "Allocator::" << __func__ << " unknown alloc type "
std::string_view type,
int64_t size,
int64_t block_size,
- int64_t zone_size = 0,
- int64_t first_sequential_zone = 0,
const std::string_view name = ""
);
}
int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity,
- uint64_t zone_size, uint64_t first_sequential_zone,
KeyValueDB::Transaction txn)
{
bytes_per_block = granularity;
static void setup_merge_operator(KeyValueDB *db, std::string prefix);
int create(uint64_t size, uint64_t granularity,
- uint64_t zone_size, uint64_t first_sequential_zone,
KeyValueDB::Transaction txn) override;
int init(KeyValueDB *kvdb, bool db_in_read_only,
alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
bdev[id]->get_size(),
alloc_size[id],
- 0, 0,
name);
alloc[id]->init_add_free(
block_reserved[id],
#include "common/WorkQueue.h"
#include "kv/KeyValueHistogram.h"
-#ifdef HAVE_LIBZBD
-#include "ZonedAllocator.h"
-#include "ZonedFreelistManager.h"
-#endif
-
#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t
-#ifdef HAVE_LIBZBD
-const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
-const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
-const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
-#endif
-
const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
return 0;
}
-#ifdef HAVE_LIBZBD
-static void get_zone_offset_object_key(
- uint32_t zone,
- uint64_t offset,
- ghobject_t oid,
- std::string *key)
-{
- key->clear();
- _key_encode_u32(zone, key);
- _key_encode_u64(offset, key);
- _get_object_key(oid, key);
-}
-
-static int get_key_zone_offset_object(
- const string& key,
- uint32_t *zone,
- uint64_t *offset,
- ghobject_t *oid)
-{
- const char *p = key.c_str();
- if (key.length() < sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1)
- return -1;
- p = _key_decode_u32(p, zone);
- p = _key_decode_u64(p, offset);
- int r = _get_key_object(p, oid);
- if (r < 0) {
- return r;
- }
- return 0;
-}
-#endif
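For orientation, a sketch of the key layout these removed helpers produce, assuming the big-endian encoders in os/kv.h (the concrete bytes are illustrative):

// zone ref key for zone 0x12, intra-zone offset 0x4000, object O:
//   _key_encode_u32(0x12)   -> "\x00\x00\x00\x12"           (4 bytes)
//   _key_encode_u64(0x4000) -> "\x00...\x00\x40\x00"        (8 bytes)
//   _get_object_key(O, key) -> encoded object key suffix
// Keys therefore sort by (zone, offset, object), so a single iterator pass
// over PREFIX_ZONED_CL_INFO visits one zone's refs in roughly disk order,
// which is what _zoned_clean_zone() relies on below.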
template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
finisher(cct, "commit_finisher", "cfin"),
kv_sync_thread(this),
kv_finalize_thread(this),
-#ifdef HAVE_LIBZBD
- zoned_cleaner_thread(this),
-#endif
min_alloc_size(_min_alloc_size),
min_alloc_size_order(std::countr_zero(_min_alloc_size)),
mempool_thread(this)
{
max_alloc_size = cct->_conf->bluestore_max_alloc_size;
-#ifdef HAVE_LIBZBD
- ceph_assert(bdev);
- if (bdev->is_smr()) {
- prefer_deferred_size = 0;
- } else
-#endif
if (cct->_conf->bluestore_prefer_deferred_size) {
prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
} else {
bool can_have_null_fm = !is_db_rotational() &&
!read_only &&
db_avail &&
- cct->_conf->bluestore_allocation_from_file &&
- !bdev->is_smr();
+ cct->_conf->bluestore_allocation_from_file;
// When allocation-info is stored in a single file we set freelist_type to "null"
if (can_have_null_fm) {
ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);
uint64_t alloc_size = min_alloc_size;
- if (bdev->is_smr() && freelist_type != "zoned") {
- derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
- << dendl;
- return -EINVAL;
- }
if (!bdev->is_smr() && freelist_type == "zoned") {
derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
<< dendl;
return -EINVAL;
}
- fm->create(bdev->get_size(), alloc_size,
- zone_size, first_sequential_zone,
- t);
+ fm->create(bdev->get_size(), alloc_size, t);
// allocate superblock reserved space. note that we do not mark
// bluefs space as allocated in the freelist; we instead rely on
std::string allocator_type = cct->_conf->bluestore_allocator;
-#ifdef HAVE_LIBZBD
- if (freelist_type == "zoned") {
- allocator_type = "zoned";
- }
-#endif
-
alloc = Allocator::create(
cct, allocator_type,
bdev->get_size(),
alloc_size,
- zone_size,
- first_sequential_zone,
"block");
if (!alloc) {
lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
return -EINVAL;
}
-#ifdef HAVE_LIBZBD
- if (freelist_type == "zoned") {
- Allocator *a = Allocator::create(
- cct, cct->_conf->bluestore_allocator,
- bdev->get_conventional_region_size(),
- alloc_size,
- zone_size, 0,
- "zoned_block");
- if (!a) {
- lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator
- << " allocator" << dendl;
- delete alloc;
- return -EINVAL;
- }
- shared_alloc.set(a, alloc_size);
- } else
-#endif
- {
- // BlueFS will share the same allocator
- shared_alloc.set(alloc, alloc_size);
- }
+ // BlueFS will share the same allocator
+ shared_alloc.set(alloc, alloc_size);
return 0;
}
}
ceph_assert(alloc != NULL);
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- auto a = dynamic_cast<ZonedAllocator*>(alloc);
- ceph_assert(a);
- auto f = dynamic_cast<ZonedFreelistManager*>(fm);
- ceph_assert(f);
- vector<uint64_t> wp = bdev->get_zones();
- vector<zone_state_t> zones = f->get_zone_states(db);
- ceph_assert(wp.size() == zones.size());
-
- // reconcile zone state
- auto num_zones = bdev->get_size() / zone_size;
- for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
- ceph_assert(wp[i] >= i * zone_size);
- ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
- uint64_t p = wp[i] - i * zone_size;
- if (zones[i].write_pointer > p) {
- derr << __func__ << " zone 0x" << std::hex << i
- << " bluestore write pointer 0x" << zones[i].write_pointer
- << " > device write pointer 0x" << p
- << std::dec << " -- VERY SUSPICIOUS!" << dendl;
- } else if (zones[i].write_pointer < p) {
- // this is "normal" in that it can happen after any crash (if we have a
- // write in flight but did not manage to commit the transaction)
- auto delta = p - zones[i].write_pointer;
- dout(1) << __func__ << " zone 0x" << std::hex << i
- << " device write pointer 0x" << p
- << " > bluestore pointer 0x" << zones[i].write_pointer
- << ", advancing 0x" << delta << std::dec << dendl;
- (*zone_adjustments)[zones[i].write_pointer] = delta;
- zones[i].num_dead_bytes += delta;
- zones[i].write_pointer = p;
- }
- }
-
- // start with conventional zone "free" (bluefs may adjust this when it starts up)
- auto reserved = _get_ondisk_reserved();
- // for now we require a conventional zone
- ceph_assert(bdev->get_conventional_region_size());
- ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region
- shared_alloc.a->init_add_free(
- reserved,
- p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved);
-
- // init sequential zone based on the device's write pointers
- a->init_from_zone_pointers(std::move(zones));
- dout(1) << __func__
- << " loaded zone pointers: "
- << std::hex
- << ", allocator type " << alloc->get_type()
- << ", capacity 0x" << alloc->get_capacity()
- << ", block size 0x" << alloc->get_block_size()
- << ", free 0x" << alloc->get_free()
- << ", fragmentation " << alloc->get_fragmentation()
- << std::dec << dendl;
-
- return 0;
- }
-#endif
-
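A worked example of the write-pointer reconciliation above, with hypothetical numbers:

// zone_size = 0x100000, zone i = 5:
//   device:    wp[5] = 0x504000, so p = wp[5] - 5 * zone_size = 0x4000
//   bluestore: zones[5].write_pointer = 0x3000 (an in-flight commit was
//              lost in a crash)
// delta = 0x4000 - 0x3000 = 0x1000: the gap is entered into
// zone_adjustments, counted as dead bytes, and the in-memory pointer
// advances to 0x4000 so both views agree again. The reverse case
// (bluestore ahead of the device) is only logged, since it should never
// happen.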
uint64_t num = 0, bytes = 0;
utime_t start_time = ceph_clock_now();
if (!fm->is_null_manager()) {
void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
{
int r = 0;
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- if (zone_adjustments.empty()) {
- return;
- }
- dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
- auto f = dynamic_cast<ZonedFreelistManager*>(fm);
- ceph_assert(f);
- KeyValueDB::Transaction t = db->get_transaction();
- for (auto& i : zone_adjustments) {
- // allocate AND release since this gap is now dead space
- // note that the offset is imprecise, but we only need to select the zone
- f->allocate(i.first, i.second, t);
- f->release(i.first, i.second, t);
- }
- r = db->submit_transaction_sync(t);
- } else
-#endif
if (fm->is_null_manager()) {
// Now that we load the allocation map we need to invalidate the file as new allocation won't be reflected
// Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount()
}
// when function is called in repair mode (to_repair=true) we skip db->open()/create()
- if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file
-#ifdef HAVE_LIBZBD
- && !bdev->is_smr()
-#endif
- ) {
+ if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file) {
dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl;
commit_to_null_manager();
need_to_destage_allocation_file = true;
if (r < 0)
goto out_close_fsid;
- // choose freelist manager
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- freelist_type = "zoned";
- zone_size = bdev->get_zone_size();
- first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
- bdev->reset_all_zones();
- } else
-#endif
- {
- freelist_type = "bitmap";
- }
+ freelist_type = "bitmap";
dout(10) << " freelist_type " << freelist_type << dendl;
// choose min_alloc_size
reserved = _get_ondisk_reserved();
alloc->init_add_free(reserved,
p2align(bdev->get_size(), min_alloc_size) - reserved);
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr() && alloc != shared_alloc.a) {
- shared_alloc.a->init_add_free(reserved,
- p2align(bdev->get_conventional_region_size(),
- min_alloc_size) - reserved);
- }
-#endif
r = _open_db(true);
if (r < 0)
t->set(PREFIX_SUPER, "per_pool_omap", bl);
}
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- {
- bufferlist bl;
- encode((uint64_t)zone_size, bl);
- t->set(PREFIX_SUPER, "zone_size", bl);
- }
- {
- bufferlist bl;
- encode((uint64_t)first_sequential_zone, bl);
- t->set(PREFIX_SUPER, "first_sequential_zone", bl);
- }
- }
-#endif
-
ondisk_format = latest_ondisk_format;
_prepare_ondisk_format_super(t);
db->submit_transaction_sync(t);
return r;
}
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- _zoned_cleaner_start();
- }
-#endif
-
mempool_thread.init();
if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
if (!_kv_only) {
mempool_thread.shutdown();
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
- _zoned_cleaner_stop();
- }
-#endif
dout(20) << __func__ << " stopping kv thread" << dendl;
_kv_stop();
// skip cache cleanup step on fast shutdown
&ctx.expected_pool_statfs[pool_id] :
&ctx.expected_store_statfs;
- map<uint32_t, uint64_t> zone_first_offsets; // for zoned/smr devices
dout(10) << __func__ << " " << oid << dendl;
OnodeRef o;
ceph_assert(l.blob);
const bluestore_blob_t& blob = l.blob->get_blob();
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr() && depth != FSCK_SHALLOW) {
- for (auto& e : blob.get_extents()) {
- if (e.is_valid()) {
- uint32_t zone = e.offset / zone_size;
- uint64_t offset = e.offset % zone_size;
- auto p = zone_first_offsets.find(zone);
- if (p == zone_first_offsets.end() || p->second > offset) {
- // FIXME: use iterator for guided insert?
- zone_first_offsets[zone] = offset;
- }
- }
- }
- }
-#endif
-
auto& ref = ref_map[l.blob];
if (ref.is_empty()) {
uint32_t min_release_size = blob.get_release_size(min_alloc_size);
}
}
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr() && depth != FSCK_SHALLOW) {
- for (auto& [zone, first_offset] : zone_first_offsets) {
- auto p = (*ctx.zone_refs)[zone].find(oid);
- if (p != (*ctx.zone_refs)[zone].end()) {
- if (first_offset < p->second) {
- dout(20) << " slightly wonky zone ref 0x" << std::hex << zone
- << " offset 0x" << p->second
- << " but first offset is 0x" << first_offset
- << "; this can happen due to clone_range"
- << dendl;
- } else {
- dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second
- << " <= first offset 0x" << first_offset
- << std::dec << dendl;
- }
- (*ctx.zone_refs)[zone].erase(p);
- } else {
- derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone
- << " but there is no zone ref" << std::dec << dendl;
- // FIXME: add repair
- ++errors;
- }
- }
- }
-#endif
-
if (broken) {
derr << "fsck error: " << oid << " - " << broken
<< " zombie spanning blob(s) found, the first one: "
goto out_scan;
}
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- auto a = dynamic_cast<ZonedAllocator*>(alloc);
- ceph_assert(a);
- auto f = dynamic_cast<ZonedFreelistManager*>(fm);
- ceph_assert(f);
- vector<uint64_t> wp = bdev->get_zones();
- vector<zone_state_t> zones = f->get_zone_states(db);
- ceph_assert(wp.size() == zones.size());
- auto num_zones = bdev->get_size() / zone_size;
- for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
- uint64_t p = wp[i] == (i + 1) * zone_size ? zone_size : wp[i] % zone_size;
- if (zones[i].write_pointer > p &&
- zones[i].num_dead_bytes < zones[i].write_pointer) {
- derr << "fsck error: zone 0x" << std::hex << i
- << " bluestore write pointer 0x" << zones[i].write_pointer
- << " > device write pointer 0x" << p
- << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)"
- << std::dec << dendl;
- ++errors;
- }
- }
-
- if (depth != FSCK_SHALLOW) {
- // load zone refs
- zone_refs.resize(bdev->get_size() / zone_size);
- it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE);
- if (it) {
- for (it->lower_bound(string());
- it->valid();
- it->next()) {
- uint32_t zone = 0;
- uint64_t offset = 0;
- ghobject_t oid;
- string key = it->key();
- int r = get_key_zone_offset_object(key, &zone, &offset, &oid);
- if (r < 0) {
- derr << "fsck error: invalid zone ref key " << pretty_binary_string(key)
- << dendl;
- if (repair) {
- repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
- }
- ++errors;
- continue;
- }
- dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset
- << " -> " << std::dec << oid << dendl;
- if (zone_refs[zone].count(oid)) {
- derr << "fsck error: second zone ref in zone 0x" << std::hex << zone
- << " offset 0x" << offset << std::dec << " for " << oid << dendl;
- if (repair) {
- repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
- }
- ++errors;
- continue;
- }
- zone_refs[zone][oid] = offset;
- }
- }
- }
- }
-#endif
-
dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
if (it) {
_fsck_check_objects(depth, ctx);
}
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr() && depth != FSCK_SHALLOW) {
- dout(1) << __func__ << " checking for leaked zone refs" << dendl;
- for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) {
- for (auto& [oid, offset] : zone_refs[zone]) {
- derr << "fsck error: stray zone ref 0x" << std::hex << zone
- << " offset 0x" << offset << " -> " << std::dec << oid << dendl;
- // FIXME: add repair
- ++errors;
- }
- }
- }
-#endif
-
sb_ref_mismatches = sb_ref_counts.count_non_zero();
if (sb_ref_mismatches != 0) {
derr << "fsck error:" << "*" << sb_ref_mismatches
// skip freelist vs allocated compare when we have Null fm
if (!fm->is_null_manager()) {
dout(1) << __func__ << " checking freelist vs allocated" << dendl;
-#ifdef HAVE_LIBZBD
- if (freelist_type == "zoned") {
- // verify per-zone state
- // - verify no allocations beyond write pointer
- // - verify num_dead_bytes count (neither allocated nor
- // free space past the write pointer)
- auto a = dynamic_cast<ZonedAllocator*>(alloc);
- auto num_zones = bdev->get_size() / zone_size;
-
- // mark the free space past the write pointer
- for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
- auto wp = a->get_write_pointer(zone);
- uint64_t offset = zone_size * zone + wp;
- uint64_t length = zone_size - wp;
- if (!length) {
- continue;
- }
- bool intersects = false;
- dout(10) << " marking zone 0x" << std::hex << zone
- << " region after wp 0x" << offset << "~" << length
- << std::dec << dendl;
- apply_for_bitset_range(
- offset, length, alloc_size, used_blocks,
- [&](uint64_t pos, mempool_dynamic_bitset &bs) {
- if (bs.test(pos)) {
- derr << "fsck error: zone 0x" << std::hex << zone
- << " has used space at 0x" << pos * alloc_size
- << " beyond write pointer 0x" << wp
- << std::dec << dendl;
- intersects = true;
- } else {
- bs.set(pos);
- }
- }
- );
- if (intersects) {
- ++errors;
- }
- }
-
- used_blocks.flip();
-
- // skip conventional zones
- uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
- pos = used_blocks.find_next(pos);
-
- uint64_t zone_dead = 0;
- for (uint32_t zone = first_sequential_zone;
- zone < num_zones;
- ++zone, zone_dead = 0) {
- while (pos != decltype(used_blocks)::npos &&
- (pos * min_alloc_size) / zone_size == zone) {
- dout(40) << " zone 0x" << std::hex << zone
- << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
- << std::dec << dendl;
- zone_dead += min_alloc_size;
- pos = used_blocks.find_next(pos);
- }
- dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
- << std::dec << dendl;
- // cross-check dead bytes against zone state
- if (a->get_dead_bytes(zone) != zone_dead) {
- derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
- << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
- << dendl;
- ++errors;
- // TODO: repair
- }
- }
- used_blocks.flip();
- } else
-#endif
- {
- fm->enumerate_reset();
- uint64_t offset, length;
- while (fm->enumerate_next(db, &offset, &length)) {
- bool intersects = false;
- apply_for_bitset_range(
- offset, length, alloc_size, used_blocks,
- [&](uint64_t pos, mempool_dynamic_bitset &bs) {
- ceph_assert(pos < bs.size());
- if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
- if (offset == DB_SUPER_RESERVED &&
- length == min_alloc_size - DB_SUPER_RESERVED) {
- // this is due to the change just after luminous to min_alloc_size
- // granularity allocations, and our baked in assumption at the top
- // of _fsck that 0~round_up_to(DB_SUPER_RESERVED,min_alloc_size) is used
- // (vs luminous's round_up_to(DB_SUPER_RESERVED,block_size)). harmless,
- // since we will never allocate this region below min_alloc_size.
- dout(10) << __func__ << " ignoring free extent between DB_SUPER_RESERVED"
- << " and min_alloc_size, 0x" << std::hex << offset << "~"
- << length << std::dec << dendl;
- } else {
- intersects = true;
- if (repair) {
- repairer.fix_false_free(db, fm,
- pos * min_alloc_size,
- min_alloc_size);
- }
- }
- } else {
- bs.set(pos);
- }
- }
- );
- if (intersects) {
- derr << "fsck error: free extent 0x" << std::hex << offset
- << "~" << length << std::dec
- << " intersects allocated blocks" << dendl;
- ++errors;
- }
- }
- fm->enumerate_reset();
-
- // check for leaked extents
- size_t count = used_blocks.count();
- if (used_blocks.size() != count) {
- ceph_assert(used_blocks.size() > count);
- used_blocks.flip();
- size_t start = used_blocks.find_first();
- while (start != decltype(used_blocks)::npos) {
- size_t cur = start;
- while (true) {
- size_t next = used_blocks.find_next(cur);
- if (next != cur + 1) {
- ++errors;
- derr << "fsck error: leaked extent 0x" << std::hex
- << ((uint64_t)start * fm->get_alloc_size()) << "~"
- << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
- << dendl;
- if (repair) {
- repairer.fix_leaked(db,
- fm,
- start * min_alloc_size,
- (cur + 1 - start) * min_alloc_size);
- }
- start = next;
- break;
- }
- cur = next;
- }
- }
- used_blocks.flip();
- }
+ fm->enumerate_reset();
+ uint64_t offset, length;
+ while (fm->enumerate_next(db, &offset, &length)) {
+ bool intersects = false;
+ apply_for_bitset_range(
+ offset, length, alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
+ if (offset == DB_SUPER_RESERVED &&
+ length == min_alloc_size - DB_SUPER_RESERVED) {
+ // this is due to the change just after luminous to min_alloc_size
+ // granularity allocations, and our baked in assumption at the top
+ // of _fsck that 0~round_up_to(DB_SUPER_RESERVED,min_alloc_size) is used
+ // (vs luminous's round_up_to(DB_SUPER_RESERVED,block_size)). harmless,
+ // since we will never allocate this region below min_alloc_size.
+ dout(10) << __func__ << " ignoring free extent between DB_SUPER_RESERVED"
+ << " and min_alloc_size, 0x" << std::hex << offset << "~"
+ << length << std::dec << dendl;
+ } else {
+ intersects = true;
+ if (repair) {
+ repairer.fix_false_free(db, fm,
+ pos * min_alloc_size,
+ min_alloc_size);
+ }
+ }
+ } else {
+ bs.set(pos);
+ }
+ }
+ );
+ if (intersects) {
+ derr << "fsck error: free extent 0x" << std::hex << offset
+ << "~" << length << std::dec
+ << " intersects allocated blocks" << dendl;
+ ++errors;
+ }
+ }
+ fm->enumerate_reset();
+
+ // check for leaked extents
+ size_t count = used_blocks.count();
+ if (used_blocks.size() != count) {
+ ceph_assert(used_blocks.size() > count);
+ used_blocks.flip();
+ size_t start = used_blocks.find_first();
+ while (start != decltype(used_blocks)::npos) {
+ size_t cur = start;
+ while (true) {
+ size_t next = used_blocks.find_next(cur);
+ if (next != cur + 1) {
+ ++errors;
+ derr << "fsck error: leaked extent 0x" << std::hex
+ << ((uint64_t)start * fm->get_alloc_size()) << "~"
+ << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
+ << dendl;
+ if (repair) {
+ repairer.fix_leaked(db,
+ fm,
+ start * min_alloc_size,
+ (cur + 1 - start) * min_alloc_size);
+ }
+ start = next;
+ break;
+ }
+ cur = next;
+ }
+ }
+ used_blocks.flip();
}
}
}
logger->set(l_bluestore_alloc_unit, min_alloc_size);
}
- // smr fields
- {
- bufferlist bl;
- int r = db->get(PREFIX_SUPER, "zone_size", &bl);
- if (r >= 0) {
- auto p = bl.cbegin();
- decode(zone_size, p);
- dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
- ceph_assert(bdev->is_smr());
- } else {
- ceph_assert(!bdev->is_smr());
- }
- }
- {
- bufferlist bl;
- int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
- if (r >= 0) {
- auto p = bl.cbegin();
- decode(first_sequential_zone, p);
- dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
- << first_sequential_zone << std::dec << dendl;
- ceph_assert(bdev->is_smr());
- } else {
- ceph_assert(!bdev->is_smr());
- }
- }
-
_set_per_pool_omap();
_open_statfs();
}
}
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- for (auto& i : txc->old_zone_offset_refs) {
- dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second
- << " offset 0x" << i.second << std::dec
- << " -> " << i.first.first->oid << dendl;
- string key;
- get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
- txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
- }
- for (auto& i : txc->new_zone_offset_refs) {
- // (zone, offset) -> oid
- dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second
- << " offset 0x" << i.second << std::dec
- << " -> " << i.first.first->oid << dendl;
- string key;
- get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
- bufferlist v;
- txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
- }
- }
-#endif
-
_txc_update_store_statfs(txc);
}
kv_finalize_started = false;
}
-#ifdef HAVE_LIBZBD
-void BlueStore::_zoned_cleaner_start()
-{
- dout(10) << __func__ << dendl;
- zoned_cleaner_thread.create("bstore_zcleaner");
-}
-
-void BlueStore::_zoned_cleaner_stop()
-{
- dout(10) << __func__ << dendl;
- {
- std::unique_lock l{zoned_cleaner_lock};
- while (!zoned_cleaner_started) {
- zoned_cleaner_cond.wait(l);
- }
- zoned_cleaner_stop = true;
- zoned_cleaner_cond.notify_all();
- }
- zoned_cleaner_thread.join();
- {
- std::lock_guard l{zoned_cleaner_lock};
- zoned_cleaner_stop = false;
- }
- dout(10) << __func__ << " done" << dendl;
-}
-
-void BlueStore::_zoned_cleaner_thread()
-{
- dout(10) << __func__ << " start" << dendl;
- std::unique_lock l{zoned_cleaner_lock};
- ceph_assert(!zoned_cleaner_started);
- zoned_cleaner_started = true;
- zoned_cleaner_cond.notify_all();
- auto a = dynamic_cast<ZonedAllocator*>(alloc);
- ceph_assert(a);
- auto f = dynamic_cast<ZonedFreelistManager*>(fm);
- ceph_assert(f);
- while (true) {
- // thresholds to trigger cleaning
- // FIXME
- float min_score = .05; // score: bytes saved / bytes moved
- uint64_t min_saved = zone_size / 32; // min bytes saved to consider cleaning
- auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved);
- if (zone_to_clean < 0) {
- if (zoned_cleaner_stop) {
- break;
- }
- auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval);
- dout(20) << __func__ << " sleep for " << period << dendl;
- zoned_cleaner_cond.wait_for(l, period);
- dout(20) << __func__ << " wake" << dendl;
- } else {
- l.unlock();
- a->set_cleaning_zone(zone_to_clean);
- _zoned_clean_zone(zone_to_clean, a, f);
- a->clear_cleaning_zone(zone_to_clean);
- l.lock();
- }
- }
- dout(10) << __func__ << " finish" << dendl;
- zoned_cleaner_started = false;
-}
-
-void BlueStore::_zoned_clean_zone(
- uint64_t zone,
- ZonedAllocator *a,
- ZonedFreelistManager *f
- )
-{
- dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl;
-
- KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO);
- std::string zone_start;
- get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start);
- for (it->lower_bound(zone_start); it->valid(); it->next()) {
- uint32_t z;
- uint64_t offset;
- ghobject_t oid;
- string k = it->key();
- int r = get_key_zone_offset_object(k, &z, &offset, &oid);
- if (r < 0) {
- derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k)
- << dendl;
- continue;
- }
- if (zone != z) {
- dout(10) << __func__ << " reached end of zone refs" << dendl;
- break;
- }
- dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset
- << std::dec << " " << oid << dendl;
- _clean_some(oid, zone);
- }
-
- if (a->get_live_bytes(zone) > 0) {
- derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone)
- << " live bytes" << std::dec << dendl;
- // should we do something else here to avoid a live-lock in the event of a problem?
- return;
- }
-
- // make sure transactions flush/drain/commit (and data is all rewritten
- // safely elsewhere) before we blow away the cleaned zone
- _osr_drain_all();
-
- // reset the device zone
- dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl;
- bdev->reset_zone(zone);
-
- // record that we can now write there
- f->mark_zone_to_clean_free(zone, db);
- bdev->flush();
-
- // then allow ourselves to start allocating there
- dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec
- << dendl;
- a->reset_zone(zone);
-}
-
-void BlueStore::_clean_some(ghobject_t oid, uint32_t zone)
-{
- dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec
- << dendl;
-
- CollectionRef cref = _get_collection_by_oid(oid);
- if (!cref) {
- dout(10) << __func__ << " can't find collection for " << oid << dendl;
- return;
- }
- Collection *c = cref.get();
-
- // serialize io dispatch vs other transactions
- std::lock_guard l(atomic_alloc_and_submit_lock);
- std::unique_lock l2(c->lock);
-
- auto o = c->get_onode(oid, false);
- if (!o) {
- dout(10) << __func__ << " can't find " << oid << dendl;
- return;
- }
-
- o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
- _dump_onode<30>(cct, *o);
-
- // NOTE: This is a naive rewrite strategy. If any blobs are
- // shared, they will be duplicated for each object that references
- // them. That means any cloned/snapshotted objects will explode
- // their utilization. This won't matter for RGW workloads, but
- // for RBD and CephFS it is completely unacceptable, and it's
- // entirely reasonable to have "archival" data workloads on SMR
- // for CephFS and (possibly/probably) RBD.
- //
- // At some point we need to replace this with something more
- // sophisticated that ensures that a shared blob gets moved once
- // and all referencing objects get updated to point to the new
- // location.
-
- map<uint32_t, uint32_t> to_move;
- for (auto& e : o->extent_map.extent_map) {
- bool touches_zone = false;
- for (auto& be : e.blob->get_blob().get_extents()) {
- if (be.is_valid()) {
- uint32_t z = be.offset / zone_size;
- if (z == zone) {
- touches_zone = true;
- break;
- }
- }
- }
- if (touches_zone) {
- to_move[e.logical_offset] = e.length;
- }
- }
- if (to_move.empty()) {
- dout(10) << __func__ << " no references to zone 0x" << std::hex << zone
- << std::dec << " from " << oid << dendl;
- return;
- }
-
- dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move
- << std::dec << dendl;
- OpSequencer *osr = c->osr.get();
- TransContext *txc = _txc_create(c, osr, nullptr);
-
- spg_t pgid;
- if (c->cid.is_pg(&pgid)) {
- txc->osd_pool_id = pgid.pool();
- }
-
- for (auto& [offset, length] : to_move) {
- bufferlist bl;
- int r = _do_read(c, o, offset, length, bl, 0);
- ceph_assert(r == (int)length);
-
- r = _do_write(txc, cref, o, offset, length, bl, 0);
- ceph_assert(r >= 0);
- }
- txc->write_onode(o);
-
- _txc_write_nodes(txc, txc->t);
- _txc_finalize_kv(txc, txc->t);
- _txc_state_proc(txc);
-}
-#endif
bluestore_deferred_op_t *BlueStore::_get_deferred_op(
TransContext *txc, uint64_t len)
OpSequencer *osr = c->osr.get();
dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
- // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
- // submission to happen atomically because if I/O submission happens in a
- // different order than I/O allocation, we end up issuing non-sequential
- // writes to the drive. This is a temporary solution until ZONE APPEND
- // support matures in the kernel. For more information please see:
- // https://www.usenix.org/conference/vault20/presentation/bjorling
- if (bdev->is_smr()) {
- atomic_alloc_and_submit_lock.lock();
- }
-
// prepare
TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
&on_commit, op);
// execute (start)
_txc_state_proc(txc);
- if (bdev->is_smr()) {
- atomic_alloc_and_submit_lock.unlock();
- }
-
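The discipline being removed here, restated as a minimal RAII sketch (same lock as the surrounding code; illustrative only, not the committed form):

std::unique_lock<ceph::mutex> l(atomic_alloc_and_submit_lock, std::defer_lock);
if (bdev->is_smr()) {
  l.lock();  // hold across allocation *and* aio submission
}
// ... _txc_create() through _txc_state_proc() run here ...
// l releases at scope exit. The point of the critical section: if thread A
// allocates zone offset 0x0 and thread B allocates 0x1000, but B's write
// reaches the drive first, B's I/O lands ahead of the zone's write pointer
// and a sequential-only zone rejects it.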
// we're immediately readable (unlike FileStore)
for (auto c : on_applied_sync) {
c->complete(0);
// than 'offset' only).
o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
-#ifdef HAVE_LIBZBD
- // On zoned devices, the first goal is to support non-overwrite workloads,
- // such as RGW, with large, aligned objects. Therefore, for user writes
- // _do_write_small should not trigger. OSDs, however, write and update a tiny
- // amount of metadata, such as OSD maps, to disk. For those cases, we
- // temporarily just pad them to min_alloc_size and write them to a new place
- // on every update.
- if (bdev->is_smr()) {
- uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
- uint64_t b_off0 = b_off;
- o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
-
- // Zero detection -- small block
- if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
- BlobRef b = c->new_blob();
- _pad_zeros(&bl, &b_off0, min_alloc_size);
- wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
- } else { // if (bl.is_zero())
- dout(20) << __func__ << " skip small zero block " << std::hex
- << " (0x" << b_off0 << "~" << bl.length() << ")"
- << " (0x" << b_off << "~" << length << ")"
- << std::dec << dendl;
- logger->inc(l_bluestore_write_small_skipped);
- logger->inc(l_bluestore_write_small_skipped_bytes, length);
- }
-
- return;
- }
-#endif
-
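Illustrative arithmetic for the removed padding path (hypothetical values):

// min_alloc_size = alloc_len = 0x10000; small write at offset 0x23000,
// length 0x800:
//   b_off = p2phase<uint64_t>(0x23000, 0x10000) = 0x3000
// _pad_zeros() then grows the buffer so the blob write covers a full,
// aligned min_alloc_size unit -- the data lands as one fresh sequential
// write instead of an in-place overwrite, which a zoned device cannot do.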
// Look for an existing mutable blob we can use.
auto begin = o->extent_map.extent_map.begin();
auto end = o->extent_map.extent_map.end();
WriteContext *wctx,
set<SharedBlob*> *maybe_unshared_blobs)
{
-#ifdef HAVE_LIBZBD
- bool is_smr = bdev && bdev->is_smr();
- if (is_smr) {
- for (auto& w : wctx->writes) {
- for (auto& e : w.b->get_blob().get_extents()) {
- if (!e.is_valid()) {
- continue;
- }
- uint32_t zone = e.offset / zone_size;
- if (!o->onode.zone_offset_refs.count(zone)) {
- uint64_t zoff = e.offset % zone_size;
- dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
- << " offset 0x" << zoff << std::dec << dendl;
- txc->note_write_zone_offset(o, zone, zoff);
- }
- }
- }
- }
- set<uint32_t> zones_with_releases;
-#endif
-
auto oep = wctx->old_extents.begin();
while (oep != wctx->old_extents.end()) {
auto &lo = *oep;
b->shared_blob->put_ref(
e.offset, e.length, &final,
unshare_ptr);
-#ifdef HAVE_LIBZBD
- // we also drop zone ref for shared blob extents
- if (is_smr && e.is_valid()) {
- zones_with_releases.insert(e.offset / zone_size);
- }
-#endif
}
if (unshare) {
ceph_assert(maybe_unshared_blobs);
if (blob.is_compressed()) {
txc->statfs_delta.compressed_allocated() -= e.length;
}
-#ifdef HAVE_LIBZBD
- if (is_smr && e.is_valid()) {
- zones_with_releases.insert(e.offset / zone_size);
- }
-#endif
}
if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
}
delete &lo;
}
-
-#ifdef HAVE_LIBZBD
- if (!zones_with_releases.empty()) {
- // we need to fault the entire extent range in here to determine if we've dropped
- // all refs to a zone.
- o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
- for (auto& b : o->extent_map.extent_map) {
- for (auto& e : b.blob->get_blob().get_extents()) {
- if (e.is_valid()) {
- zones_with_releases.erase(e.offset / zone_size);
- }
- }
- }
- for (auto zone : zones_with_releases) {
- auto p = o->onode.zone_offset_refs.find(zone);
- if (p != o->onode.zone_offset_refs.end()) {
- dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
- << " offset 0x" << p->second << std::dec << dendl;
- txc->note_release_zone_offset(o, zone, p->second);
- }
- }
- }
-#endif
}
void BlueStore::_do_write_data(
oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
}
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- // duplicate the refs for the shared region.
- Extent dummy(dstoff);
- for (auto e = newo->extent_map.extent_map.lower_bound(dummy);
- e != newo->extent_map.extent_map.end();
- ++e) {
- if (e->logical_offset >= dstoff + length) {
- break;
- }
- for (auto& ex : e->blob->get_blob().get_extents()) {
- // note that we may introduce a new extent reference that is
- // earlier than the first zone ref. we allow this since it is
- // a lot of work to avoid and has marginal impact on cleaning
- // performance.
- if (!ex.is_valid()) {
- continue;
- }
- uint32_t zone = ex.offset / zone_size;
- if (!newo->onode.zone_offset_refs.count(zone)) {
- uint64_t zoff = ex.offset % zone_size;
- dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
- << " offset 0x" << zoff << std::dec
- << " -> " << newo->oid << dendl;
- txc->note_write_zone_offset(newo, zone, zoff);
- }
- }
- }
- }
-#endif
-
_dump_onode<30>(cct, *oldo);
_dump_onode<30>(cct, *newo);
return 0;
// and read newo's metadata via the old name).
txc->note_modified_object(oldo);
-#ifdef HAVE_LIBZBD
- if (bdev->is_smr()) {
- // adjust zone refs
- for (auto& [zone, offset] : newo->onode.zone_offset_refs) {
- dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
- << " offset 0x" << offset << std::dec
- << " -> " << oldo->oid << dendl;
- string key;
- get_zone_offset_object_key(zone, offset, oldo->oid, &key);
- txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
-
- dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
- << " offset 0x" << offset << std::dec
- << " -> " << newo->oid << dendl;
- get_zone_offset_object_key(zone, offset, newo->oid, &key);
- bufferlist v;
- txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
- }
- }
-#endif
-
out:
dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
<< new_oid << " = " << r << dendl;
// create allocator
uint64_t alloc_size = min_alloc_size;
Allocator* alloc = Allocator::create(cct, "bitmap", bdev_size, alloc_size,
- zone_size, first_sequential_zone,
"recovery");
if (alloc) {
return alloc;
void rewrite_omap_key(const std::string& old, std::string *out);
void decode_omap_key(const std::string& key, std::string *user_key);
-#ifdef HAVE_LIBZBD
- // Return the offset of an object on disk. This function is intended *only*
- // for use with zoned storage devices because in these devices, the objects
- // are laid out contiguously on disk, which is not the case in general.
- // Also, it should always be called after calling extent_map.fault_range(),
- // so that the extent map is loaded.
- int64_t zoned_get_ondisk_starting_offset() const {
- return extent_map.extent_map.begin()->blob->
- get_blob().calc_offset(0, nullptr);
- }
-#endif
private:
void _decode(const ceph::buffer::list& v);
};
std::set<OnodeRef> onodes; ///< these need to be updated/written
std::set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
-#ifdef HAVE_LIBZBD
- // zone refs to add/remove. each zone ref is a (zone, offset) tuple. The offset
- // is the first offset in the zone that the onode touched; subsequent writes
- // to that zone do not generate additional refs. This is a bit imprecise but
- // is sufficient to generate reasonably sequential reads when doing zone
- // cleaning with less metadata than a ref for every extent.
- std::map<std::pair<OnodeRef, uint32_t>, uint64_t> new_zone_offset_refs;
- std::map<std::pair<OnodeRef, uint32_t>, uint64_t> old_zone_offset_refs;
-#endif
-
std::set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
std::set<BlobRef> blobs_written; ///< update these on io completion
KeyValueDB::Transaction t; ///< then we will commit this
onodes.erase(o);
}
-#ifdef HAVE_LIBZBD
- void note_write_zone_offset(OnodeRef& o, uint32_t zone, uint64_t offset) {
- o->onode.zone_offset_refs[zone] = offset;
- new_zone_offset_refs[std::make_pair(o, zone)] = offset;
- }
- void note_release_zone_offset(OnodeRef& o, uint32_t zone, uint64_t offset) {
- old_zone_offset_refs[std::make_pair(o, zone)] = offset;
- o->onode.zone_offset_refs.erase(zone);
- }
-#endif
-
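Hypothetical usage of the two helpers above:

// first write of onode o landing in zone 7 at intra-zone offset 0x4000:
txc->note_write_zone_offset(o, 7, 0x4000);
// -> o->onode.zone_offset_refs[7] == 0x4000, and one KV ref is queued in
//    new_zone_offset_refs for _txc_finalize_kv
// A later write to the same zone at 0x9000 adds no second ref (callers
// check zone_offset_refs.count(zone) first), keeping cleaner metadata to
// one entry per (onode, zone).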
void aio_finish(BlueStore *store) override {
store->txc_aio_finish(this);
}
}
};
-#ifdef HAVE_LIBZBD
- struct ZonedCleanerThread : public Thread {
- BlueStore *store;
- explicit ZonedCleanerThread(BlueStore *s) : store(s) {}
- void *entry() override {
- store->_zoned_cleaner_thread();
- return nullptr;
- }
- };
-#endif
-
struct BigDeferredWriteContext {
uint64_t off = 0; // original logical offset
uint32_t b_off = 0; // blob relative offset
std::deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
bool kv_finalize_in_progress = false;
-#ifdef HAVE_LIBZBD
- ZonedCleanerThread zoned_cleaner_thread;
- ceph::mutex zoned_cleaner_lock = ceph::make_mutex("BlueStore::zoned_cleaner_lock");
- ceph::condition_variable zoned_cleaner_cond;
- bool zoned_cleaner_started = false;
- bool zoned_cleaner_stop = false;
- std::deque<uint64_t> zoned_cleaner_queue;
-#endif
-
PerfCounters *logger = nullptr;
std::list<CollectionRef> removed_collections;
"not enough bits for min_alloc_size");
bool elastic_shared_blobs = false; ///< use smart ExtentMap::dup to reduce shared blob count
- // smr-only
- uint64_t zone_size = 0; ///< size of a single SMR zone
- uint64_t first_sequential_zone = 0; ///< first SMR zone that is sequential-only
-
enum {
// Please preserve the order since it's DB persistent
OMAP_BULK = 0,
void _kv_sync_thread();
void _kv_finalize_thread();
-#ifdef HAVE_LIBZBD
- void _zoned_cleaner_start();
- void _zoned_cleaner_stop();
- void _zoned_cleaner_thread();
- void _zoned_clean_zone(uint64_t zone_num,
- class ZonedAllocator *a,
- class ZonedFreelistManager *f);
- void _clean_some(ghobject_t oid, uint32_t zone_num);
-#endif
-
bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, uint64_t len);
void _deferred_queue(TransContext *txc);
public:
return fm;
}
-#ifdef HAVE_LIBZBD
- // With zoned drives there is only one FreelistManager implementation that we
- // can use, and we also know if a drive is zoned right after opening it
- // (BlueStore::_open_bdev). Hence, we set freelist_type to "zoned" whenever
- // we open the device and it turns out to be zoned. We ignore |prefix|
- // passed to create and use the prefixes defined for zoned devices at the top
- // of BlueStore.cc.
- if (type == "zoned")
- return new ZonedFreelistManager(cct, "Z", "z");
-#endif
-
return NULL;
}
static void setup_merge_operators(KeyValueDB *db, const std::string &type);
virtual int create(uint64_t size, uint64_t granularity,
- uint64_t zone_size, uint64_t first_sequential_zone,
KeyValueDB::Transaction txn) = 0;
virtual int init(KeyValueDB *kvdb, bool db_in_read_only,
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-//
-// A simple allocator that just hands out space from the next empty zone. This
-// is temporary, just to get the simplest append-only write workload to work.
-//
-// Copyright (C) 2020 Abutalib Aghayev
-//
-
-#include "ZonedAllocator.h"
-#include "bluestore_types.h"
-#include "zoned_types.h"
-#include "common/debug.h"
-
-#define dout_context cct
-#define dout_subsys ceph_subsys_bluestore
-#undef dout_prefix
-#define dout_prefix *_dout << "ZonedAllocator(" << this << ") " << __func__ << " "
-
-ZonedAllocator::ZonedAllocator(CephContext* cct,
- int64_t size,
- int64_t blk_size,
- int64_t _zone_size,
- int64_t _first_sequential_zone,
- std::string_view name)
- : Allocator(name, size, blk_size),
- cct(cct),
- size(size),
- conventional_size(_first_sequential_zone * _zone_size),
- sequential_size(size - conventional_size),
- num_sequential_free(0),
- block_size(blk_size),
- zone_size(_zone_size),
- first_seq_zone_num(_first_sequential_zone),
- starting_zone_num(first_seq_zone_num),
- num_zones(size / zone_size)
-{
- ldout(cct, 10) << " size 0x" << std::hex << size
- << ", zone size 0x" << zone_size << std::dec
- << ", number of zones 0x" << num_zones
- << ", first sequential zone 0x" << starting_zone_num
- << ", sequential size 0x" << sequential_size
- << std::dec
- << dendl;
- ceph_assert(size % zone_size == 0);
-
- zone_states.resize(num_zones);
-}
-
-ZonedAllocator::~ZonedAllocator()
-{
-}
-
-int64_t ZonedAllocator::allocate(
- uint64_t want_size,
- uint64_t alloc_unit,
- uint64_t max_alloc_size,
- int64_t hint,
- PExtentVector *extents)
-{
- std::lock_guard l(lock);
-
- ceph_assert(want_size % 4096 == 0);
-
- ldout(cct, 10) << " trying to allocate 0x"
- << std::hex << want_size << std::dec << dendl;
-
- uint64_t left = num_zones - first_seq_zone_num;
- uint64_t zone_num = starting_zone_num;
- for ( ; left > 0; ++zone_num, --left) {
- if (zone_num == num_zones) {
- zone_num = first_seq_zone_num;
- }
- if (zone_num == cleaning_zone) {
- ldout(cct, 10) << " skipping zone 0x" << std::hex << zone_num
- << " because we are cleaning it" << std::dec << dendl;
- continue;
- }
- if (!fits(want_size, zone_num)) {
- ldout(cct, 10) << " skipping zone 0x" << std::hex << zone_num
- << " because there is not enough space: "
- << " want_size = 0x" << want_size
- << " available = 0x" << get_remaining_space(zone_num)
- << std::dec
- << dendl;
- continue;
- }
- break;
- }
-
- if (left == 0) {
- ldout(cct, 10) << " failed to allocate" << dendl;
- return -ENOSPC;
- }
-
- uint64_t offset = get_offset(zone_num);
-
- ldout(cct, 10) << " moving zone 0x" << std::hex
- << zone_num << " write pointer from 0x" << offset
- << " -> 0x" << offset + want_size
- << std::dec << dendl;
-
- increment_write_pointer(zone_num, want_size);
- num_sequential_free -= want_size;
- if (get_remaining_space(zone_num) == 0) {
- starting_zone_num = zone_num + 1;
- }
-
- ldout(cct, 10) << " allocated 0x" << std::hex << offset << "~" << want_size
- << " from zone 0x" << zone_num
- << " and zone offset 0x" << (offset % zone_size)
- << std::dec << dendl;
-
- extents->emplace_back(bluestore_pextent_t(offset, want_size));
- return want_size;
-}
-
-void ZonedAllocator::release(const interval_set<uint64_t>& release_set)
-{
- std::lock_guard l(lock);
- for (auto p = cbegin(release_set); p != cend(release_set); ++p) {
- auto offset = p.get_start();
- auto length = p.get_len();
- uint64_t zone_num = offset / zone_size;
- ldout(cct, 10) << " 0x" << std::hex << offset << "~" << length
- << " from zone 0x" << zone_num << std::dec << dendl;
- uint64_t num_dead = std::min(zone_size - offset % zone_size, length);
- for ( ; length; ++zone_num) {
- increment_num_dead_bytes(zone_num, num_dead);
- length -= num_dead;
- num_dead = std::min(zone_size, length);
- }
- }
-}
-
-uint64_t ZonedAllocator::get_free()
-{
- return num_sequential_free;
-}
-
-void ZonedAllocator::dump()
-{
- std::lock_guard l(lock);
-}
-
-void ZonedAllocator::foreach(
- std::function<void(uint64_t offset, uint64_t length)> notify)
-{
- std::lock_guard l(lock);
-}
-
-void ZonedAllocator::init_from_zone_pointers(
- std::vector<zone_state_t> &&_zone_states)
-{
- // this is called once, based on the device's zone pointers
- std::lock_guard l(lock);
- ldout(cct, 10) << dendl;
- zone_states = std::move(_zone_states);
- num_sequential_free = 0;
- for (size_t i = first_seq_zone_num; i < num_zones; ++i) {
- num_sequential_free += zone_size - zone_states[i].write_pointer;
- }
- ldout(cct, 10) << "free 0x" << std::hex << num_sequential_free
- << " / 0x" << sequential_size << std::dec
- << dendl;
-}
-
-int64_t ZonedAllocator::pick_zone_to_clean(float min_score, uint64_t min_saved)
-{
- std::lock_guard l(lock);
- int32_t best = -1;
- float best_score = 0.0;
- for (size_t i = first_seq_zone_num; i < num_zones; ++i) {
- // value (score) = benefit / cost
- // benefit = how much net free space we'll get (dead bytes)
- // cost = how many bytes we'll have to rewrite (live bytes)
- // avoid divide by zero on a zone with no live bytes
- float score =
- (float)zone_states[i].num_dead_bytes /
- (float)(zone_states[i].get_num_live_bytes() + 1);
- if (score > 0) {
- ldout(cct, 20) << " zone 0x" << std::hex << i
- << " dead 0x" << zone_states[i].num_dead_bytes
- << " score " << score
- << dendl;
- }
- if (zone_states[i].num_dead_bytes < min_saved) {
- continue;
- }
- if (best < 0 || score > best_score) {
- best = i;
- best_score = score;
- }
- }
- if (best_score >= min_score) {
- ldout(cct, 10) << " zone 0x" << std::hex << best << " with score " << best_score
- << ": 0x" << zone_states[best].num_dead_bytes
- << " dead and 0x"
- << zone_states[best].write_pointer - zone_states[best].num_dead_bytes
- << " live bytes" << std::dec << dendl;
- } else if (best >= 0) {
- ldout(cct, 10) << " zone 0x" << std::hex << best << " with score " << best_score
- << ": 0x" << zone_states[best].num_dead_bytes
- << " dead and 0x"
- << zone_states[best].write_pointer - zone_states[best].num_dead_bytes
- << " live bytes" << std::dec
- << " but below min_score " << min_score
- << dendl;
- best = -1;
- } else {
- ldout(cct, 10) << " no zones found that are good cleaning candidates" << dendl;
- }
- return best;
-}
-
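A worked example of the benefit/cost score above, with hypothetical zone states and thresholds (min_score = 0.05, min_saved = 0x100000):

// zone A: dead = 0x200000, live = 0x1E00000
//   score = 0x200000 / (0x1E00000 + 1) ~= 0.066   -> candidate
// zone B: dead = 0x200000, live = 0x6000000
//   score = 0x200000 / (0x6000000 + 1) ~= 0.021   -> rejected
// Both zones would save >= min_saved bytes, but only A clears min_score:
// cleaning B would rewrite 0x6000000 live bytes to recover the same
// 0x200000 of dead space.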
-void ZonedAllocator::reset_zone(uint32_t zone)
-{
- num_sequential_free += zone_states[zone].write_pointer;
- zone_states[zone].reset();
-}
-
-bool ZonedAllocator::low_on_space(void)
-{
- std::lock_guard l(lock);
- double free_ratio = static_cast<double>(num_sequential_free) / sequential_size;
-
- ldout(cct, 10) << " free 0x" << std::hex << num_sequential_free
- << "/ 0x" << sequential_size << std::dec
- << ", free ratio is " << free_ratio << dendl;
- ceph_assert(num_sequential_free <= (int64_t)sequential_size);
-
- // TODO: make 0.25 tunable
- return free_ratio <= 0.25;
-}
-
-void ZonedAllocator::shutdown()
-{
- ldout(cct, 1) << dendl;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-//
-// A simple allocator that just hands out space from the next empty zone. This
-// is temporary, just to get the simplest append-only write workload to work.
-//
-// Copyright (C) 2020 Abutalib Aghayev
-//
-
-#ifndef CEPH_OS_BLUESTORE_ZONEDALLOCATOR_H
-#define CEPH_OS_BLUESTORE_ZONEDALLOCATOR_H
-
-#include <mutex>
-
-#include "Allocator.h"
-#include "common/ceph_mutex.h"
-#include "include/btree_map.h"
-#include "include/interval_set.h"
-#include "include/mempool.h"
-#include "bluestore_types.h"
-#include "zoned_types.h"
-
-class ZonedAllocator : public Allocator {
- CephContext* cct;
-
- // Currently only one thread at a time calls into ZonedAllocator due to
- // atomic_alloc_and_submit_lock in BlueStore.cc, but we do locking anyway
- // because eventually ZONE_APPEND support will land and
- // atomic_alloc_and_submit_lock will be removed.
- ceph::mutex lock = ceph::make_mutex("ZonedAllocator::lock");
-
- uint64_t size;
- uint64_t conventional_size, sequential_size;
- std::atomic<int64_t> num_sequential_free; ///< total bytes in freelist
- uint64_t block_size;
- uint64_t zone_size;
- uint64_t first_seq_zone_num;
- uint64_t starting_zone_num;
- uint64_t num_zones;
- std::atomic<uint32_t> cleaning_zone = -1;
- std::vector<zone_state_t> zone_states;
-
- inline uint64_t get_offset(uint64_t zone_num) const {
- return zone_num * zone_size + get_write_pointer(zone_num);
- }
-
-public:
- inline uint64_t get_write_pointer(uint64_t zone_num) const {
- return zone_states[zone_num].get_write_pointer();
- }
-private:
- inline uint64_t get_remaining_space(uint64_t zone_num) const {
- return zone_size - get_write_pointer(zone_num);
- }
-
- inline void increment_write_pointer(uint64_t zone_num, uint64_t want_size) {
- zone_states[zone_num].increment_write_pointer(want_size);
- }
-
- inline void increment_num_dead_bytes(uint64_t zone_num, uint64_t length) {
- zone_states[zone_num].increment_num_dead_bytes(length);
- }
-
- inline bool fits(uint64_t want_size, uint64_t zone_num) const {
- return want_size <= get_remaining_space(zone_num);
- }
-
-public:
- ZonedAllocator(CephContext* cct, int64_t size, int64_t block_size,
- int64_t _zone_size,
- int64_t _first_sequential_zone,
- std::string_view name);
- ~ZonedAllocator() override;
-
- const char *get_type() const override {
- return "zoned";
- }
-
- uint64_t get_dead_bytes(uint32_t zone) {
- return zone_states[zone].num_dead_bytes;
- }
- uint64_t get_live_bytes(uint32_t zone) {
- std::scoped_lock l(lock);
- return zone_states[zone].write_pointer - zone_states[zone].num_dead_bytes;
- }
-
- int64_t allocate(
- uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
- int64_t hint, PExtentVector *extents) override;
-
- void release(const interval_set<uint64_t>& release_set) override;
-
- uint64_t get_free() override;
-
- void dump() override;
- void foreach(
- std::function<void(uint64_t offset, uint64_t length)> notify) override;
-
- int64_t pick_zone_to_clean(float min_score, uint64_t min_saved);
- void set_cleaning_zone(uint32_t zone) {
- cleaning_zone = zone;
- }
- void clear_cleaning_zone(uint32_t zone) {
- cleaning_zone = -1;
- }
- void reset_zone(uint32_t zone);
-
- void init_from_zone_pointers(
- std::vector<zone_state_t> &&_zone_states);
- void init_add_free(uint64_t offset, uint64_t length) override {}
- void init_rm_free(uint64_t offset, uint64_t length) override {}
-
- void shutdown() override;
-
-private:
- bool low_on_space(void);
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-//
-// A freelist manager for zoned devices. This iteration just keeps the write
-// pointer per zone. Following iterations will add enough information to enable
-// cleaning of zones.
-//
-// Copyright (C) 2020 Abutalib Aghayev
-//
-
-#include "ZonedFreelistManager.h"
-#include "bluestore_common.h"
-#include "include/stringify.h"
-#include "kv/KeyValueDB.h"
-#include "os/kv.h"
-#include "zoned_types.h"
-
-#include "common/debug.h"
-
-#define dout_context cct
-#define dout_subsys ceph_subsys_bluestore
-#undef dout_prefix
-#define dout_prefix *_dout << "zoned freelist "
-
-using std::string;
-
-using ceph::bufferlist;
-using ceph::bufferptr;
-using ceph::decode;
-using ceph::encode;
-
-void ZonedFreelistManager::write_zone_state_delta_to_db(
- uint64_t zone_num,
- const zone_state_t &zone_state,
- KeyValueDB::Transaction txn)
-{
- string key;
- _key_encode_u64(zone_num, &key);
- bufferlist bl;
- zone_state.encode(bl);
- txn->merge(info_prefix, key, bl);
-}
-
-void ZonedFreelistManager::write_zone_state_reset_to_db(
- uint64_t zone_num,
- const zone_state_t &zone_state,
- KeyValueDB::Transaction txn)
-{
- string key;
- _key_encode_u64(zone_num, &key);
- bufferlist bl;
- zone_state.encode(bl);
- txn->set(info_prefix, key, bl);
-}
-
-void ZonedFreelistManager::load_zone_state_from_db(
- uint64_t zone_num,
- zone_state_t &zone_state,
- KeyValueDB::Iterator& it) const
-{
- string k = it->key();
- uint64_t zone_num_from_db;
- _key_decode_u64(k.c_str(), &zone_num_from_db);
- ceph_assert(zone_num_from_db == zone_num);
-
- bufferlist bl = it->value();
- auto p = bl.cbegin();
- zone_state.decode(p);
-}
-
-void ZonedFreelistManager::init_zone_states(KeyValueDB::Transaction txn)
-{
- dout(10) << __func__ << dendl;
- for (uint64_t zone_num = 0; zone_num < num_zones; ++zone_num) {
- zone_state_t zone_state;
- write_zone_state_reset_to_db(zone_num, zone_state, txn);
- }
-}
-
-void ZonedFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
-{
- std::shared_ptr<Int64ArrayMergeOperator> merge_op(
- new Int64ArrayMergeOperator);
- db->set_merge_operator(prefix, merge_op);
-}
-
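How the delta writes above combine, sketched under the assumption that zone_state_t encodes its two counters as consecutive 64-bit integers (field order illustrative):

// stored zone state : [ write_pointer = 0x8000, num_dead_bytes = 0x1000 ]
// merge delta       : [ 0x2000, 0     ]   // allocate() of 0x2000 bytes
// merge delta       : [ 0,      0x800 ]   // release() of 0x800 bytes
// value read back   : [ 0xA000, 0x1800 ]
// Int64ArrayMergeOperator sums element-wise, so updating a zone never
// needs a read-modify-write; write_zone_state_reset_to_db() deliberately
// bypasses the merge with a plain set() when a cleaned zone must restart
// from zero.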
-ZonedFreelistManager::ZonedFreelistManager(
- CephContext* cct,
- string meta_prefix,
- string info_prefix)
- : FreelistManager(cct),
- meta_prefix(meta_prefix),
- info_prefix(info_prefix),
- enumerate_zone_num(~0UL)
-{
-}
-
-int ZonedFreelistManager::create(
- uint64_t new_size,
- uint64_t granularity,
- uint64_t new_zone_size,
- uint64_t first_sequential_zone,
- KeyValueDB::Transaction txn)
-{
- size = new_size;
- bytes_per_block = granularity;
- zone_size = new_zone_size;
- num_zones = size / zone_size;
- starting_zone_num = first_sequential_zone;
- enumerate_zone_num = ~0UL;
-
- ceph_assert(size % zone_size == 0);
-
- dout(1) << __func__ << std::hex
- << " size 0x" << size
- << " bytes_per_block 0x" << bytes_per_block
- << " zone size 0x " << zone_size
- << " num_zones 0x" << num_zones
- << " starting_zone 0x" << starting_zone_num << dendl;
- {
- bufferlist bl;
- encode(size, bl);
- txn->set(meta_prefix, "size", bl);
- }
- {
- bufferlist bl;
- encode(bytes_per_block, bl);
- txn->set(meta_prefix, "bytes_per_block", bl);
- }
- {
- bufferlist bl;
- encode(zone_size, bl);
- txn->set(meta_prefix, "zone_size", bl);
- }
- {
- bufferlist bl;
- encode(num_zones, bl);
- txn->set(meta_prefix, "num_zones", bl);
- }
- {
- bufferlist bl;
- encode(starting_zone_num, bl);
- txn->set(meta_prefix, "starting_zone_num", bl);
- }
-
- init_zone_states(txn);
-
- return 0;
-}
-
-int ZonedFreelistManager::init(
- KeyValueDB *kvdb,
- bool db_in_read_only,
- cfg_reader_t cfg_reader)
-{
- dout(1) << __func__ << dendl;
- int r = _read_cfg(cfg_reader);
- if (r != 0) {
- return r;
- }
-
- ceph_assert(num_zones == size / zone_size);
-
- dout(10) << __func__ << std::hex
- << " size 0x" << size
- << " bytes_per_block 0x" << bytes_per_block
- << " zone size 0x" << zone_size
- << " num_zones 0x" << num_zones
- << " starting_zone 0x" << starting_zone_num
- << std::dec << dendl;
- return 0;
-}
-
-void ZonedFreelistManager::sync(KeyValueDB* kvdb)
-{
-}
-
-void ZonedFreelistManager::shutdown()
-{
- dout(1) << __func__ << dendl;
-}
-
-void ZonedFreelistManager::enumerate_reset()
-{
- std::lock_guard l(lock);
-
- dout(1) << __func__ << dendl;
-
- enumerate_p.reset();
- enumerate_zone_num = ~0UL;
-}
-
-// Currently, this just iterates over the list of zones and sets |offset| and
-// |length| to the write pointer and the number of remaining free bytes in a
-// given zone. Hence, it can set |length| to 0 if a zone is full, and it can
-// also return two contiguous empty zones in two calls. This does not violate
-// current semantics of the call and appears to work fine with the clients of
-// this call.
-bool ZonedFreelistManager::enumerate_next(
- KeyValueDB *kvdb,
- uint64_t *offset,
- uint64_t *length)
-{
- std::lock_guard l(lock);
-
- // starting case
- if (enumerate_zone_num == ~0UL) {
- dout(30) << __func__ << " start" << dendl;
- enumerate_p = kvdb->get_iterator(info_prefix);
- enumerate_p->lower_bound(string());
- ceph_assert(enumerate_p->valid());
- enumerate_zone_num = 0;
- } else {
- enumerate_p->next();
- if (!enumerate_p->valid()) {
- dout(30) << __func__ << " end" << dendl;
- return false;
- }
- ++enumerate_zone_num;
- }
-
- zone_state_t zone_state;
- load_zone_state_from_db(enumerate_zone_num, zone_state, enumerate_p);
-
- *offset = enumerate_zone_num * zone_size + zone_state.get_write_pointer();
- *length = zone_size - zone_state.get_write_pointer();
-
- dout(30) << __func__ << std::hex << " 0x" << *offset << "~" << *length
- << std::dec << dendl;
-
- return true;
-}
-
-void ZonedFreelistManager::dump(KeyValueDB *kvdb)
-{
- enumerate_reset();
- uint64_t offset, length;
- while (enumerate_next(kvdb, &offset, &length)) {
- dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
- << std::dec << dendl;
- }
-}
-
-// Advances the write pointer and writes the updated write pointer to database.
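-//
-// A worked example (hypothetical numbers): with zone_size = 0x10000, a call
-// allocate(0x1f000, 0x2000, txn) straddles a zone boundary and is split into
-// two deltas: zone 1's write pointer advances by 0x1000 and zone 2's by
-// 0x1000, one merge write per zone.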
-void ZonedFreelistManager::allocate(
- uint64_t offset,
- uint64_t length,
- KeyValueDB::Transaction txn)
-{
- while (length > 0) {
- uint64_t zone_num = offset / zone_size;
- uint64_t this_len = std::min(length, zone_size - offset % zone_size);
- dout(10) << __func__ << " 0x" << std::hex << offset << "~" << this_len
- << " zone 0x" << zone_num << std::dec << dendl;
- zone_state_t zone_state;
- zone_state.increment_write_pointer(this_len);
- write_zone_state_delta_to_db(zone_num, zone_state, txn);
- offset += this_len;
- length -= this_len;
- }
-}
-
-// Increments the number of dead bytes in a zone and writes the updated value
-// to the database. Dead bytes are not reusable in place: the cleaner later
-// copies the remaining live objects out of the zone and makes the whole zone
-// writable again. The per-zone dead-byte count is how the cleaner picks its
-// victims -- zones with the most dead bytes are the best candidates because
-// cleaning them requires the least I/O.
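-//
-// A worked example (hypothetical numbers): with zone_size = 0x10000,
-// release(0xf000, 0x2000, txn) marks 0x1000 bytes dead in zone 0 and 0x1000
-// bytes dead in zone 1; neither write pointer moves, so the space becomes
-// reclaimable only once the cleaner resets the zone.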
-void ZonedFreelistManager::release(
- uint64_t offset,
- uint64_t length,
- KeyValueDB::Transaction txn)
-{
- while (length > 0) {
- uint64_t zone_num = offset / zone_size;
- uint64_t this_len = std::min(length, zone_size - offset % zone_size);
- dout(10) << __func__ << " 0x" << std::hex << offset << "~" << this_len
- << " zone 0x" << zone_num << std::dec << dendl;
- zone_state_t zone_state;
- zone_state.increment_num_dead_bytes(this_len);
- write_zone_state_delta_to_db(zone_num, zone_state, txn);
- length -= this_len;
- offset += this_len;
- }
-}
-
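-// Note on persistence: the zfm_* values below are written with stringify()
-// (plain decimal) and are read back by _read_cfg() further down via
-// strict_iecstrtoll(), which accepts plain integers as well as IEC-suffixed
-// sizes such as "20G".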
-void ZonedFreelistManager::get_meta(
- uint64_t target_size,
- std::vector<std::pair<string, string>>* res) const
-{
- // We do not support expanding devices for now.
- ceph_assert(target_size == 0);
- res->emplace_back("zfm_size", stringify(size));
- res->emplace_back("zfm_bytes_per_block", stringify(bytes_per_block));
- res->emplace_back("zfm_zone_size", stringify(zone_size));
- res->emplace_back("zfm_num_zones", stringify(num_zones));
- res->emplace_back("zfm_starting_zone_num", stringify(starting_zone_num));
-}
-
-std::vector<zone_state_t> ZonedFreelistManager::get_zone_states(
- KeyValueDB *kvdb) const
-{
- std::vector<zone_state_t> zone_states;
- auto p = kvdb->get_iterator(info_prefix);
- uint64_t zone_num = 0;
- for (p->lower_bound(string()); p->valid(); p->next(), ++zone_num) {
- zone_state_t zone_state;
- load_zone_state_from_db(zone_num, zone_state, p);
- zone_states.emplace_back(zone_state);
- }
- return zone_states;
-}
-
-// TODO: The following function is copied almost verbatim from
-// BitmapFreelistManager. Eliminate duplication.
-int ZonedFreelistManager::_read_cfg(cfg_reader_t cfg_reader)
-{
- dout(1) << __func__ << dendl;
-
- string err;
-
- const size_t key_count = 5;
- string keys[key_count] = {
- "zfm_size",
- "zfm_bytes_per_block",
- "zfm_zone_size",
- "zfm_num_zones",
- "zfm_starting_zone_num"
- };
- uint64_t* vals[key_count] = {
- &size,
- &bytes_per_block,
- &zone_size,
- &num_zones,
- &starting_zone_num};
-
- for (size_t i = 0; i < key_count; i++) {
- string val;
- int r = cfg_reader(keys[i], &val);
- if (r == 0) {
- *(vals[i]) = strict_iecstrtoll(val.c_str(), &err);
- if (!err.empty()) {
- derr << __func__ << " Failed to parse - "
- << keys[i] << ":" << val
- << ", error: " << err << dendl;
- return -EINVAL;
- }
- } else {
- // this is expected for legacy deployed OSDs
- dout(0) << __func__ << " " << keys[i] << " not found in bdev meta" << dendl;
- return r;
- }
- }
- return 0;
-}
-
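-// A sketch of the intended call sequence (the driving logic lives in the
-// BlueStore cleaner, not in this class): copy the live objects out of the
-// victim zone, reset the zone on the device, and only then call
-// mark_zone_to_clean_free() so the freelist agrees that the zone is empty.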
-void ZonedFreelistManager::mark_zone_to_clean_free(
- uint64_t zone,
- KeyValueDB *kvdb)
-{
- dout(10) << __func__ << " zone 0x" << std::hex << zone << std::dec << dendl;
-
- KeyValueDB::Transaction txn = kvdb->get_transaction();
-
- zone_state_t empty_zone_state;
- write_zone_state_reset_to_db(zone, empty_zone_state, txn);
-
-  // Block until the transaction commits so that we cannot start allocating
-  // from (and writing to) this zone before its freelist state is durably
-  // reset.
- kvdb->submit_transaction_sync(txn);
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-//
-// A freelist manager for zoned devices.
-//
-// Copyright (C) 2020 Abutalib Aghayev
-//
-
-#ifndef CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H
-#define CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H
-
-#include "FreelistManager.h"
-
-#include <string>
-#include <mutex>
-
-#include "common/ceph_mutex.h"
-#include "include/buffer.h"
-#include "kv/KeyValueDB.h"
-#include "zoned_types.h"
-
-using cfg_reader_t = std::function<int(const std::string&, std::string*)>;
-
-class ZonedFreelistManager : public FreelistManager {
- std::string meta_prefix; ///< device size, zone size, etc.
- std::string info_prefix; ///< per zone write pointer, dead bytes
- mutable ceph::mutex lock = ceph::make_mutex("ZonedFreelistManager::lock");
-
- uint64_t size; ///< size of sequential region (bytes)
- uint64_t bytes_per_block; ///< bytes per allocation unit (bytes)
- uint64_t zone_size; ///< size of a single zone (bytes)
- uint64_t num_zones; ///< number of sequential zones
- uint64_t starting_zone_num; ///< the first sequential zone number
-
- KeyValueDB::Iterator enumerate_p;
- uint64_t enumerate_zone_num;
-
- void write_zone_state_delta_to_db(uint64_t zone_num,
- const zone_state_t &zone_state,
- KeyValueDB::Transaction txn);
- void write_zone_state_reset_to_db(uint64_t zone_num,
- const zone_state_t &zone_state,
- KeyValueDB::Transaction txn);
- void load_zone_state_from_db(uint64_t zone_num,
- zone_state_t &zone_state,
- KeyValueDB::Iterator &it) const;
-
- void init_zone_states(KeyValueDB::Transaction txn);
-
- void increment_write_pointer(
- uint64_t zone, uint64_t length, KeyValueDB::Transaction txn);
- void increment_num_dead_bytes(
- uint64_t zone, uint64_t num_bytes, KeyValueDB::Transaction txn);
-
- int _read_cfg(cfg_reader_t cfg_reader);
-
-public:
- ZonedFreelistManager(CephContext* cct,
- std::string meta_prefix,
- std::string info_prefix);
-
- static void setup_merge_operator(KeyValueDB *db, std::string prefix);
-
- int create(uint64_t size,
- uint64_t granularity,
- uint64_t zone_size,
- uint64_t first_sequential_zone,
- KeyValueDB::Transaction txn) override;
-
- int init(KeyValueDB *kvdb,
- bool db_in_read_only,
- cfg_reader_t cfg_reader) override;
-
- void shutdown() override;
- void sync(KeyValueDB* kvdb) override;
- void dump(KeyValueDB *kvdb) override;
-
- void enumerate_reset() override;
- bool enumerate_next(KeyValueDB *kvdb,
- uint64_t *offset,
- uint64_t *length) override;
-
- void allocate(uint64_t offset,
- uint64_t length,
- KeyValueDB::Transaction txn) override;
-
- void release(uint64_t offset,
- uint64_t length,
- KeyValueDB::Transaction txn) override;
-
- inline uint64_t get_size() const override {
- return size;
- }
-
- inline uint64_t get_alloc_units() const override {
- return size / bytes_per_block;
- }
-
- inline uint64_t get_alloc_size() const override {
- return bytes_per_block;
- }
-
- void get_meta(uint64_t target_size,
- std::vector<std::pair<std::string, std::string>>*) const override;
-
- std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const;
-
- void mark_zone_to_clean_free(uint64_t zone,
- KeyValueDB *kvdb);
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_OS_BLUESTORE_ZONED_TYPES_H
-#define CEPH_OS_BLUESTORE_ZONED_TYPES_H
-
-#include "include/types.h"
-#include "kv/KeyValueDB.h"
-#include "os/kv.h"
-
-// Tracks two pieces of per-zone state: (1) the number of dead bytes in the
-// zone and (2) the write pointer. We use the existing Int64ArrayMergeOperator
-// for merge and avoid the cost of point queries.
-//
-// The same struct serves as both the on-disk and the in-memory representation
-// of the state.
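-//
-// A hedged example of the merge semantics (assuming Int64ArrayMergeOperator
-// adds the arrays element-wise, which is what the delta writes rely on):
-// merging a delta {write_pointer: +0x1000, num_dead_bytes: +0x200} into a
-// stored state {write_pointer: 0x2000, num_dead_bytes: 0x500} yields
-// {write_pointer: 0x3000, num_dead_bytes: 0x700}, so get_num_live_bytes()
-// would then report 0x3000 - 0x700 = 0x2900.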
-struct zone_state_t {
- uint64_t num_dead_bytes = 0; ///< dead bytes deallocated (behind the write pointer)
- uint64_t write_pointer = 0; ///< relative offset within the zone
-
- void encode(ceph::buffer::list &bl) const {
- using ceph::encode;
- encode(write_pointer, bl);
- encode(num_dead_bytes, bl);
- }
- void decode(ceph::buffer::list::const_iterator &p) {
- using ceph::decode;
- decode(write_pointer, p);
- decode(num_dead_bytes, p);
- }
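-
-  // Note: the encoded value is a two-element int64 array in the order
-  // [write_pointer, num_dead_bytes]; the Int64ArrayMergeOperator presumably
-  // merges it element-wise in that order, which is what keeps the delta
-  // writes in the freelist manager cheap.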
-
- void reset() {
- write_pointer = 0;
- num_dead_bytes = 0;
- }
-
- uint64_t get_num_dead_bytes() const {
- return num_dead_bytes;
- }
-
- uint64_t get_num_live_bytes() const {
- return write_pointer - num_dead_bytes;
- }
-
- uint64_t get_write_pointer() const {
- return write_pointer;
- }
-
- void increment_num_dead_bytes(uint64_t num_bytes) {
- num_dead_bytes += num_bytes;
- }
-
- void increment_write_pointer(uint64_t num_bytes) {
- write_pointer += num_bytes;
- }
-
- friend std::ostream& operator<<(
- std::ostream& out,
- const zone_state_t& zone_state) {
- return out << std::hex
- << " dead bytes: 0x" << zone_state.get_num_dead_bytes()
- << " write pointer: 0x" << zone_state.get_write_pointer()
- << " " << std::dec;
- }
-};
-
-#endif
+++ /dev/null
-#!/bin/bash -ex
-
-# 1) run_smr_bluestore_test.sh
-#    Set up an emulated SMR device, then run all tests.
-
-# 2) run_smr_bluestore_test.sh --smr
-#    Set up an emulated SMR device, but skip tests known to fail on SMR.
-
-
-before_creation=$(mktemp)
-lsscsi > $before_creation
-
-echo "cd /backstores/user:zbc
-create name=zbc0 size=20G cfgstring=model-HM/zsize-256/conv-10@zbc0.raw
-/loopback create
-cd /loopback
-create naa.50014055e5f25aa0
-cd naa.50014055e5f25aa0/luns
-create /backstores/user:zbc/zbc0 0
-" | sudo targetcli
-
-sleep 1 # give the kernel a moment; the device does not show up immediately
-after_creation=$(mktemp)
-lsscsi > $after_creation
-if [[ $(diff $before_creation $after_creation | wc -l ) != 2 ]]
-then
-  echo "New zbc device not created"
- false
-fi
-
-function cleanup() {
- echo "cd /loopback
-delete naa.50014055e5f25aa0
-cd /backstores/user:zbc
-delete zbc0" | sudo targetcli
- sudo rm -f zbc0.raw
- rm -f $before_creation $after_creation
-}
-trap cleanup EXIT
-
-DEV=$(diff $before_creation $after_creation | grep zbc | sed "s@.* /@/@")
-sudo chmod 666 $DEV
-# Need sudo
-# https://patchwork.kernel.org/project/linux-block/patch/20210811110505.29649-3-Niklas.Cassel@wdc.com/
-sudo ceph_test_objectstore \
- --bluestore-block-path $DEV \
- --gtest_filter=*/2 \
- $*
const uint64_t DEF_STORE_TEST_BLOCKDEV_SIZE = 10240000000;
#define dout_context g_ceph_context
-bool smr = false;
-
static bool bl_eq(bufferlist& expected, bufferlist& actual)
{
if (expected.contents_equal(actual))
TEST_P(StoreTest, CompressionTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "TODO: need to adjust statfs check for smr" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_compression_algorithm", "snappy");
SetVal(g_conf(), "bluestore_compression_mode", "force");
g_ceph_context->_conf.apply_changes(nullptr);
doCompressionTest();
-
SetVal(g_conf(), "bluestore_compression_algorithm", "zlib");
SetVal(g_conf(), "bluestore_compression_mode", "aggressive");
g_ceph_context->_conf.apply_changes(nullptr);
TEST_P(StoreTestSpecificAUSize, ReproBug41901Test) {
if(string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP (smr)" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_max_blob_size", "524288");
SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd");
TEST_P(StoreTestSpecificAUSize, BluestoreStatFSTest) {
if(string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "TODO: fix this for smr" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_block_db_path", "");
StartDeferred(65536);
SetVal(g_conf(), "bluestore_compression_mode", "force");
TEST_P(StoreTestSpecificAUSize, BluestoreFragmentedBlobTest) {
if(string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "TODO: fix this for smr" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_block_db_path", "");
StartDeferred(0x10000);
TEST_P(StoreTest, BlueStoreUnshareBlobTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: non-deterministic behavior with smr" << std::endl;
- return;
- }
int r;
coll_t cid;
auto ch = store->create_new_collection(cid);
TEST_P(StoreTestSpecificAUSize, Many4KWritesTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply"
- << std::endl;
- return;
- }
StartDeferred(0x10000);
TEST_P(StoreTestSpecificAUSize, Many4KWritesNoCSumTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply"
- << std::endl;
- return;
- }
StartDeferred(0x10000);
SetVal(g_conf(), "bluestore_csum_type", "none");
g_ceph_context->_conf.apply_changes(nullptr);
TEST_P(StoreTestSpecificAUSize, TooManyBlobsTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply"
- << std::endl;
- return;
- }
StartDeferred(0x10000);
const unsigned max_object = 4*1024*1024;
doMany4KWritesTest(store.get(), 1, 1000, max_object, 4*1024, 0);
if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) {
GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping";
}
- if (smr) {
- GTEST_SKIP() << "smr, skipping";
- }
size_t block_size = 65536;
StartDeferred(block_size);
if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) {
GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping";
}
- if (smr) {
- GTEST_SKIP() << "smr, skipping";
- }
size_t block_size = 4096;
StartDeferred(block_size);
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred" << std::endl;
- return;
- }
size_t block_size = 4096;
StartDeferred(block_size);
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred" << std::endl;
- return;
- }
size_t block_size = 4096;
StartDeferred(block_size);
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred" << std::endl;
- return;
- }
size_t block_size = 4096;
StartDeferred(block_size);
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred" << std::endl;
- return;
- }
size_t block_size = 4096;
SetVal(g_conf(), "bluestore_block_db_create", "true");
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred" << std::endl;
- return;
- }
size_t block_size = 4096;
SetVal(g_conf(), "bluestore_block_db_create", "true");
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred" << std::endl;
- return;
- }
size_t alloc_size = 4096;
size_t large_object_size = 1 * 1024 * 1024;
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no overwrite" << std::endl;
- return;
- }
size_t block_size = 4096;
StartDeferred(block_size);
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no overwrite" << std::endl;
- return;
- }
size_t block_size = 4096;
StartDeferred(block_size);
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: no deferred" << std::endl;
- return;
- }
size_t alloc_size = 65536;
size_t write_size = 4096;
int write_offset = buf_len;
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: assertions about allocations need to be adjusted" << std::endl;
- return;
- }
#define WRITE_AT(offset, _length) {\
ObjectStore::Transaction t;\
TEST_P(StoreTestSpecificAUSize, BluestoreRepairTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl;
- return;
- }
const size_t offs_base = 65536 / 2;
TEST_P(StoreTestSpecificAUSize, BluestoreBrokenZombieRepairTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: smr repair is different" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
TEST_P(StoreTestSpecificAUSize, BluestoreRepairSharedBlobTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: smr repair is different" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
}
}
-TEST_P(StoreTest, FixSMRWritePointer) {
- if(string(GetParam()) != "bluestore")
- return;
- if (!smr)
- return;
- int r = store->umount();
- ASSERT_EQ(0, r);
-
- // copied from StoreTestFixture
- std::string path = GetParam() + ".test_temp_dir"s;
-
- std::string p = path + "/block";
- BlockDevice* bdev = BlockDevice::create(g_ceph_context, p, nullptr, nullptr, nullptr, nullptr);
- r = bdev->open(p);
- ASSERT_EQ(0, r);
- ASSERT_EQ(true, bdev->is_smr());
-
- std::vector<uint64_t> wp = bdev->get_zones();
- uint64_t first_seq_zone = bdev->get_conventional_region_size() / bdev->get_zone_size();
-
- IOContext ioc(g_ceph_context, NULL, true);
- bufferlist bl;
- bl.append(std::string(1024 * 1024, 'x'));
- r = bdev->aio_write(wp[first_seq_zone], bl, &ioc, false);
- ASSERT_EQ(0, r);
- bdev->aio_submit(&ioc);
- ioc.aio_wait();
- bdev->close();
- delete bdev;
-
- r = store->mount();
- ASSERT_EQ(0, r);
-}
-
-
TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsHdd) {
if (string(GetParam()) != "bluestore")
return;
if(string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP (FIXME): bluestore gc does not seem to do the trick here" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_block_db_create", "true");
SetVal(g_conf(), "bluestore_block_db_size", "4294967296");
TEST_P(StoreTestSpecificAUSize, SpilloverLegacyTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_block_db_create", "true");
SetVal(g_conf(), "bluestore_block_db_size", "3221225472");
TEST_P(StoreTestSpecificAUSize, SpilloverLegacyFixedByFitToFastTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_block_db_create", "true");
SetVal(g_conf(), "bluestore_block_db_size", "3221225472");
TEST_P(StoreTestSpecificAUSize, SpilloverTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_block_db_create", "true");
SetVal(g_conf(), "bluestore_block_db_size", "3221225472");
TEST_P(StoreTestSpecificAUSize, SpilloverFixedCompletelyTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_block_db_create", "true");
SetVal(g_conf(), "bluestore_block_db_size", "3221225472");
TEST_P(StoreTestSpecificAUSize, SpilloverFixedPartialTest) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
- return;
- }
SetVal(g_conf(), "bluestore_block_db_create", "true");
SetVal(g_conf(), "bluestore_block_db_size", stringify(3ull << 30).c_str());
TEST_P(StoreTestSpecificAUSize, Ticket45195Repro) {
if (string(GetParam()) != "bluestore")
return;
- if (smr) {
- return;
- }
SetVal(g_conf(), "bluestore_default_buffered_write", "true");
SetVal(g_conf(), "bluestore_max_blob_size", "65536");
CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
common_init_finish(g_ceph_context);
- for (auto& i : args) {
- if (i == "--smr"s) {
-#if defined(HAVE_LIBZBD)
- derr << "Adjusting tests for smr mode." << dendl;
- smr = true;
-#else
- derr << "smr mode selected, but support not compiled in" << dendl;
- return 1;
-#endif
- }
- }
-
// make sure we can adjust any config settings
g_ceph_context->_conf._clear_safe_to_start_threads();
--bluestore-devs: comma-separated list of blockdevs to use for bluestore
--bluestore-db-devs: comma-separated list of db-devs to use for bluestore
--bluestore-wal-devs: comma-separated list of wal-devs to use for bluestore
- --bluestore-zoned: blockdevs listed by --bluestore-devs are zoned devices (HM-SMR HDD or ZNS SSD)
--bluestore-io-uring: enable io_uring backend
--inc-osd: append some more osds into existing vcluster
--cephadm: enable cephadm orchestrator with ~/.ssh/id_rsa[.pub]
parse_bluestore_wal_devs --bluestore-wal-devs "$2"
shift
;;
- --bluestore-zoned)
- zoned_enabled=1
- ;;
--bluestore-io-uring)
io_uring_enabled=1
shift
bluestore prefer deferred size = 0
bluestore prefer deferred size hdd = 0
bluestore prefer deferred size ssd = 0
- bluestore allocator = zoned"
fi
if [ "$io_uring_enabled" -eq 1 ]; then
BLUESTORE_OPTS+="