Replaces the BitmapAllocator used by NCB Recovery code with a dedicated SimpleBitmap.
The SimpleBitmap allows for bits to be set multiple times without any adverse effect.
This is needed beacuse shared-blobs will report the same allocation multiple times.
Fixes: https://tracker.ceph.com/issues/53678
Signed-off-by: Gabriel Benhanokh <gbenhano@redhat.com>
${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueRocksEnv.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/simple_bitmap.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc
${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc
bluestore/bluefs_types.cc
bluestore/BlueRocksEnv.cc
bluestore/BlueStore.cc
+ bluestore/simple_bitmap.cc
bluestore/bluestore_types.cc
bluestore/fastbmap_allocator_impl.cc
bluestore/FreelistManager.cc
#include "BlueStore.h"
#include "bluestore_common.h"
+#include "simple_bitmap.h"
#include "os/kv.h"
#include "include/compat.h"
#include "include/intarith.h"
decode(val, p);
min_alloc_size = val;
min_alloc_size_order = ctz(val);
+ min_alloc_size_mask = min_alloc_size - 1;
+
ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
} catch (ceph::buffer::error& e) {
derr << __func__ << " unable to read min_alloc_size" << dendl;
};
WRITE_CLASS_DENC(allocator_image_header)
-struct extent_t {
- uint64_t offset;
- uint64_t length;
-
- //extent_t(uint64_t _offset, uint64_t _length) : offset(_offset), length(_length) {}
-};
-
// 56 Bytes trailer for on-disk alloator image
struct allocator_image_trailer {
extent_t null_extent; // 0x00
derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl;
ceph_assert(idx <= *p_num_entries);
}
-
+
*p_num_entries = idx;
for (idx = 0; idx < *p_num_entries; idx++) {
derr << "Failed Allocator Creation" << dendl;
return nullptr;
}
-
}
//-----------------------------------------------------------------------------------
int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes)
{
utime_t start = ceph_clock_now();
- Allocator *temp_allocator = create_bitmap_allocator(bdev->get_size());
- if (temp_allocator == nullptr) {
- derr << "Failed create_bitmap_allocator()" << dendl;
- return -1;
- }
-
- int ret = __restore_allocator(temp_allocator, num, bytes);
+ auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
+ int ret = __restore_allocator(temp_allocator.get(), num, bytes);
if (ret != 0) {
- delete temp_allocator;
return ret;
}
uint64_t num_entries = 0;
- dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
- copy_allocator(temp_allocator, dest_allocator, &num_entries);
- delete temp_allocator;
+ dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
+ copy_allocator(temp_allocator.get(), dest_allocator, &num_entries);
utime_t duration = ceph_clock_now() - start;
dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl;
return ret;
}
}
+//-----------------------------------------------------------------------------------
+void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
+{
+ ceph_assert((offset & min_alloc_size_mask) == 0);
+ ceph_assert((length & min_alloc_size_mask) == 0);
+ sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
+}
+
//---------------------------------------------------------
// Process all physical extents from a given Onode (including all its shards)
void BlueStore::read_allocation_from_single_onode(
- Allocator* allocator,
+ SimpleBitmap* sbmap,
BlueStore::OnodeRef& onode_ref,
read_alloc_stats_t& stats)
{
stats.skipped_repeated_extent++;
} else {
lcl_extnt_map[offset] = length;
- allocator->init_rm_free(offset, length);
+ set_allocation_in_simple_bmap(sbmap, offset, length);
stats.extent_count++;
}
} else {
// extents using shared blobs might have differnt length
- allocator->init_rm_free(offset, length);
+ set_allocation_in_simple_bmap(sbmap, offset, length);
stats.extent_count++;
}
stats.blobs_in_onode[blobs_count]++;
} else {
// store all counts higher than MAX_BLOBS_IN_ONODE in a single bucket at offset zero
- //std::cout << "***BCF::Blob-count=" << blobs_count << std::endl;
stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]++;
}
}
//-------------------------------------------------------------------------
-int BlueStore::read_allocation_from_onodes(Allocator* allocator, read_alloc_stats_t& stats)
+int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
{
// finally add all space take by user data
auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
// make sure we got all shards of this object
if (shard_id == onode_ref->extent_map.shards.size()) {
// We completed an Onode Object -> pass it to be processed
- read_allocation_from_single_onode(allocator, onode_ref, stats);
+ read_allocation_from_single_onode(sbmap, onode_ref, stats);
} else {
derr << "Missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
ceph_assert(shard_id == onode_ref->extent_map.shards.size());
collection_ref->cid.is_pg(&pgid);
}
-
- //std::cout << "[" << stats.onode_count << "] oid="<< oid << " key=" << pretty_binary_string(it->key()) << std::endl;
onode_ref.reset(BlueStore::Onode::decode(collection_ref, oid, it->key(), it->value()));
}
}
// make sure we got all shards of this object
if (shard_id == onode_ref->extent_map.shards.size()) {
// We completed an Onode Object -> pass it to be processed
- read_allocation_from_single_onode(allocator, onode_ref, stats);
+ read_allocation_from_single_onode(sbmap, onode_ref, stats);
} else {
derr << "Last Object is missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
ceph_assert(shard_id == onode_ref->extent_map.shards.size());
}
//---------------------------------------------------------
-int BlueStore::reconstruct_allocations(Allocator* allocator, read_alloc_stats_t &stats)
+int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
{
- uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
- uint64_t bdev_size = bdev->get_size();
- dout(5) << "memory_target=" << memory_target << ", bdev_size=" << bdev_size << dendl;
-
- // start by marking the full device space as allocated and then remove each extent we find
- dout(5) << "init_add_free(0, " << bdev_size << ")" << dendl;
- allocator->init_add_free(0, bdev_size);
-
- // first add space used by superblock
+ // first set space used by superblock
auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
- dout(5) << "init_rm_free(0, " << super_length << ")" << dendl;
- allocator->init_rm_free(0, super_length);
+ set_allocation_in_simple_bmap(sbmap, 0, super_length);
stats.extent_count++;
- dout(5) << "calling read_allocation_from_onodes()" << dendl;
- // then add all space taken by Objects
- int ret = read_allocation_from_onodes(allocator, stats);
+ // then set all space taken by Objects
+ int ret = read_allocation_from_onodes(sbmap, stats);
if (ret < 0) {
derr << "failed read_allocation_from_onodes()" << dendl;
return ret;
return 0;
}
+//-----------------------------------------------------------------------------------
+static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
+{
+ int alloc_size_shift = ctz(alloc_size);
+ uint64_t offset = 0;
+ extent_t ext = sbmap->get_next_clr_extent(offset);
+ while (ext.length != 0) {
+ dest_alloc->init_add_free(ext.offset << alloc_size_shift, ext.length << alloc_size_shift);
+ offset = ext.offset + ext.length;
+ ext = sbmap->get_next_clr_extent(offset);
+ }
+}
+
//---------------------------------------------------------
int BlueStore::read_allocation_from_drive_on_startup()
{
_shutdown_cache();
});
+ utime_t start = ceph_clock_now();
read_alloc_stats_t stats = {};
- utime_t start = ceph_clock_now();
- Allocator *allocator = create_bitmap_allocator(bdev->get_size());
- if (allocator == nullptr) {
- derr << "****failed create_bitmap_allocator()" << dendl;
- return -1;
- }
-
- ret = reconstruct_allocations(allocator, stats);
+ SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size));
+ ret = reconstruct_allocations(&sbmap, stats);
if (ret != 0) {
- delete allocator;
return ret;
}
- uint64_t num_entries = 0;
- dout(5) << " calling copy_allocator(bitmap_allocator -> alloc)" << dendl;
- copy_allocator(allocator, alloc, &num_entries);
- delete allocator;
+ copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size);
+
utime_t duration = ceph_clock_now() - start;
- dout(5) << " <<<FINISH>>> in " << duration << " seconds, num_entries=" << num_entries << dendl;
- dout(5) << "num_entries=" << num_entries << ", extent_count=" << stats.extent_count << dendl;
- dout(5) << "Allocation Recovery was completed" << dendl;
+ dout(1) << "::Allocation Recovery was completed in " << duration << " seconds, extent_count=" << stats.extent_count << dendl;
return ret;
}
uint64_t size1 = 0, size2 = 0;
uint64_t idx1 = 0, idx2 = 0;
auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) {
- //std::cout << "[" << idx1 << "]<" << offset << "," << length << ">" << std::endl;
size1 += length;
if (idx1 < extent_count) {
arr1[idx1++] = {offset, length};
};
auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) {
- //std::cout << "[" << idx2 << "]<" << offset << "," << length << ">" << std::endl;
size2 += length;
if (idx2 < extent_count) {
arr2[idx2++] = {offset, length};
};
alloc1->dump(iterated_mapper1);
- //std::cout << __func__ << "alloc1->dump()::entry_count=" << idx1 << " size=" << size1 << std::endl;
-
alloc2->dump(iterated_mapper2);
- //std::cout << __func__ << "::alloc2->dump()::entry_count=" << idx2 << " size=" << size2 << std::endl;
qsort(arr1.get(), std::min(idx1, extent_count), sizeof(extent_t), cmpfunc);
qsort(arr2.get(), std::min(idx2, extent_count), sizeof(extent_t), cmpfunc);
}
//---------------------------------------------------------
-int BlueStore::read_allocation_from_drive_for_bluestore_tool(bool test_store_and_restore)
+int BlueStore::read_allocation_from_drive_for_bluestore_tool()
{
- dout(5) << "test_store_and_restore=" << test_store_and_restore << dendl;
+ dout(5) << __func__ << dendl;
int ret = 0;
uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
ret = _open_db_and_around(true, false);
return ret;
}
+ utime_t duration;
read_alloc_stats_t stats = {};
- uint64_t bdev_size = bdev->get_size();
- Allocator* allocator = create_bitmap_allocator(bdev_size);
- if (allocator) {
- dout(5) << "bitmap-allocator=" << allocator << dendl;
- } else {
- return -1;
- }
- dout(5) << " calling reconstruct_allocations()" << dendl;
- utime_t start = ceph_clock_now();
- ret = reconstruct_allocations(allocator, stats);
- if (ret != 0) {
- _close_db_and_around();
- return ret;
- }
+ utime_t start = ceph_clock_now();
- // add allocation space used by the bluefs itself
- ret = add_existing_bluefs_allocation(allocator, stats);
- if (ret < 0) {
+ auto shutdown_cache = make_scope_guard([&] {
+ std::cout << "Allocation Recovery was completed in " << duration
+ << " seconds; insert_count=" << stats.insert_count
+ << "; extent_count=" << stats.extent_count << std::endl;
+ _shutdown_cache();
_close_db_and_around();
- return ret;
- }
-
- utime_t duration = ceph_clock_now() - start;
- stats.insert_count = 0;
- auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
- stats.insert_count++;
- };
- allocator->dump(count_entries);
+ });
- dout(5) << "\n" << " <<<FINISH>>> in " << duration << " seconds; insert_count=" << stats.insert_count << dendl;
- dout(5) << "\n" << " <<<FINISH>>> in " << duration << " seconds; extent_count=" << stats.extent_count << dendl;
+ {
+ auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
+ //reconstruct allocations into a temp simple-bitmap and copy into allocator
+ {
+ SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size));
+ ret = reconstruct_allocations(&sbmap, stats);
+ if (ret != 0) {
+ return ret;
+ }
+ copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size);
+ }
+ // add allocation space used by the bluefs itself
+ ret = add_existing_bluefs_allocation(allocator.get(), stats);
+ if (ret < 0) {
+ return ret;
+ }
- dout(5) << "calling compare_allocator(alloc) insert_count=" << stats.insert_count << dendl;
- ret = compare_allocators(allocator, alloc, stats.insert_count, memory_target);
- if (ret == 0) {
- dout(5) << "SUCCESS!!! compare(allocator, alloc)" << dendl;
- } else {
- derr << "**** FAILURE compare(allocator, alloc)::ret=" << ret << dendl;
- }
-
- if (test_store_and_restore) {
- _close_db_leave_bluefs();
- dout(5) << "calling store_allocator(alloc)" << dendl;
- store_allocator(alloc);
- Allocator* alloc2 = create_bitmap_allocator(bdev_size);
- if (alloc2) {
- dout(5) << "bitmap-allocator=" << alloc2 << dendl;
- dout(5) << "calling restore_allocator()" << dendl;
- uint64_t num, bytes;
- int ret = restore_allocator(alloc2, &num, &bytes);
- if (ret == 0) {
- // add allocation space used by the bluefs itself
- ret = add_existing_bluefs_allocation(alloc2, stats);
- if (ret < 0) {
- _close_db_and_around();
- return ret;
- }
- // verify that we can store and restore allocator to/from drive
- ret = compare_allocators(alloc2, alloc, stats.insert_count, memory_target);
- if (ret == 0) {
- dout(5) << "SUCCESS!!! compare(alloc2, alloc)" << dendl;
- } else {
- derr << "**** FAILURE compare(alloc2, alloc)::ret=" << ret << dendl;
- }
- } else {
- derr << "******Failed restore_allocator******\n" << dendl;
- }
- delete alloc2;
+ duration = ceph_clock_now() - start;
+ stats.insert_count = 0;
+ auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
+ stats.insert_count++;
+ };
+ allocator->dump(count_entries);
+ ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
+ if (ret != 0) {
+ dout(5) << "Allocator drive - file integrity check OK" << dendl;
} else {
- derr << "Failed allcoator2 create" << dendl;
+ derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
}
}
- std::cout << "<<<FINISH>>> in " << duration << " seconds; insert_count=" << stats.insert_count << "\n\n" << std::endl;
std::cout << stats << std::endl;
-
- //out_db:
- delete allocator;
- _shutdown_cache();
- _close_db_and_around();
return ret;
}
class Allocator;
class FreelistManager;
class BlueStoreRepairer;
+class SimpleBitmap;
//#define DEBUG_CACHE
//#define DEBUG_DEFERRED
size_t block_size_order = 0; ///< bits to shift to get block size
uint64_t optimal_io_size = 0;///< best performance io size for block device
- uint64_t min_alloc_size; ///< minimum allocation unit (power of 2)
- ///< bits for min_alloc_size
- uint8_t min_alloc_size_order = 0;
+ uint64_t min_alloc_size; ///< minimum allocation unit (power of 2)
+ uint8_t min_alloc_size_order = 0;///< bits to shift to get min_alloc_size
+ uint64_t min_alloc_size_mask;///< mask for fast checking of allocation alignment
static_assert(std::numeric_limits<uint8_t>::max() >
std::numeric_limits<decltype(min_alloc_size)>::digits,
"not enough bits for min_alloc_size");
const BlueStore::FSCK_ObjectCtx& ctx);
#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
int push_allocation_to_rocksdb();
- int read_allocation_from_drive_for_bluestore_tool(bool test_store_and_restore);
- int read_allocation_from_drive_for_fsck() { return read_allocation_from_drive_for_bluestore_tool(false); }
+ int read_allocation_from_drive_for_bluestore_tool();
#endif
private:
#define MAX_BLOBS_IN_ONODE 128
int __restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes);
int restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes);
int read_allocation_from_drive_on_startup();
- int reconstruct_allocations(Allocator* allocator, read_alloc_stats_t &stats);
- int read_allocation_from_onodes(Allocator* allocator, read_alloc_stats_t& stats);
+ int reconstruct_allocations(SimpleBitmap *smbmp, read_alloc_stats_t &stats);
+ int read_allocation_from_onodes(SimpleBitmap *smbmp, read_alloc_stats_t& stats);
+ void read_allocation_from_single_onode(SimpleBitmap *smbmp, BlueStore::OnodeRef& onode_ref, read_alloc_stats_t& stats);
+ void set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length);
int commit_to_null_manager();
int commit_to_real_manager();
int db_cleanup(int ret);
Allocator* clone_allocator_without_bluefs(Allocator *src_allocator);
Allocator* initialize_allocator_from_freelist(FreelistManager *real_fm);
void copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm);
- void read_allocation_from_single_onode(Allocator* allocator, BlueStore::OnodeRef& onode_ref, read_alloc_stats_t& stats);
+
void _fsck_check_object_omap(FSCKDepth depth,
OnodeRef& o,
cout << action << " bluestore.allocmap" << std::endl;
validate_path(cct.get(), path, false);
BlueStore bluestore(cct.get(), path);
- int r = bluestore.read_allocation_from_drive_for_bluestore_tool(true);
+ int r = bluestore.read_allocation_from_drive_for_bluestore_tool();
if (r < 0) {
cerr << action << " failed: " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
cout << action << " bluestore.quick-fsck" << std::endl;
validate_path(cct.get(), path, false);
BlueStore bluestore(cct.get(), path);
- int r = bluestore.read_allocation_from_drive_for_fsck();
+ int r = bluestore.read_allocation_from_drive_for_bluestore_tool();
if (r < 0) {
cerr << action << " failed: " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "simple_bitmap.h"
+
+#include "include/ceph_assert.h"
+#include "bluestore_types.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << "::SBMAP::" << this << " "
+
+static struct extent_t null_extent = {0, 0};
+
+//----------------------------------------------------------------------------
+//throw bad_alloc
+SimpleBitmap::SimpleBitmap(CephContext *_cct, uint64_t num_bits) :cct(_cct)
+{
+ m_num_bits = num_bits;
+ m_word_count = bits_to_words(num_bits);
+ if (num_bits & BITS_IN_WORD_MASK) {
+ m_word_count++;
+ }
+ m_arr = new uint64_t [m_word_count];
+ clear_all();
+}
+
+//----------------------------------------------------------------------------
+SimpleBitmap::~SimpleBitmap()
+{
+ delete [] m_arr;
+}
+
+//----------------------------------------------------------------------------
+bool SimpleBitmap::set(uint64_t offset, uint64_t length)
+{
+ dout(20) <<" [" << std::hex << offset << ", " << length << "]" << dendl;
+
+ if (offset + length >= m_num_bits) {
+ derr << __func__ << "::offset + length = " << offset + length << " exceeds map size = " << m_num_bits << dendl;
+ ceph_assert(offset + length < m_num_bits);
+ return false;
+ }
+
+ auto [word_index, first_bit_set] = split(offset);
+ // special case optimization
+ if (length == 1) {
+ uint64_t set_mask = 1ULL << first_bit_set;
+ m_arr[word_index] |= set_mask;
+ return true;
+ }
+
+ // handle the first word which might be incomplete
+ if (first_bit_set != 0) {
+ uint64_t set_mask = FULL_MASK << first_bit_set;
+ uint64_t first_bit_clr = first_bit_set + length;
+ if (first_bit_clr <= BITS_IN_WORD) {
+ if (first_bit_clr < BITS_IN_WORD) {
+ uint64_t clr_bits = BITS_IN_WORD - first_bit_clr;
+ uint64_t clr_mask = FULL_MASK >> clr_bits;
+ set_mask &= clr_mask;
+ }
+ m_arr[word_index] |= set_mask;
+ return true;
+ } else {
+ // set all bits in this word starting from first_bit_set
+ m_arr[word_index] |= set_mask;
+ word_index ++;
+ length -= (BITS_IN_WORD - first_bit_set);
+ }
+ }
+
+ // set a range of full words
+ uint64_t full_words_count = bits_to_words(length);
+ uint64_t end = word_index + full_words_count;
+ for (; word_index < end; word_index++) {
+ m_arr[word_index] = FULL_MASK;
+ }
+ length -= words_to_bits(full_words_count);
+
+ // set bits in the last word
+ if (length) {
+ uint64_t set_mask = ~(FULL_MASK << length);
+ m_arr[word_index] |= set_mask;
+ }
+
+ return true;
+}
+
+//----------------------------------------------------------------------------
+bool SimpleBitmap::clr(uint64_t offset, uint64_t length)
+{
+ if (offset + length >= m_num_bits) {
+ derr << __func__ << "::offset + length = " << offset + length << " exceeds map size = " << m_num_bits << dendl;
+ ceph_assert(offset + length < m_num_bits);
+ return false;
+ }
+
+ auto [word_index, first_bit_clr] = split(offset);
+ // special case optimization
+ if (length == 1) {
+ uint64_t set_mask = 1ULL << first_bit_clr;
+ uint64_t clr_mask = ~set_mask;
+ m_arr[word_index] &= clr_mask;
+
+ return true;
+ }
+
+ // handle the first word when we we are unaligned on word boundaries
+ if (first_bit_clr != 0) {
+ uint64_t clr_mask = ~(FULL_MASK << first_bit_clr);
+ uint64_t first_bit_set = first_bit_clr + length;
+ // special case - we only work on a single word
+ if (first_bit_set <= BITS_IN_WORD) {
+ if (first_bit_set < BITS_IN_WORD) {
+ uint64_t set_mask = FULL_MASK << first_bit_set;
+ clr_mask |= set_mask;
+ }
+ m_arr[word_index] &= clr_mask;
+ return true;
+ }
+ else {
+ // clear all bits in this word starting from first_bit_clr
+ // and continue to the next word
+ m_arr[word_index] &= clr_mask;
+ word_index ++;
+ length -= (BITS_IN_WORD - first_bit_clr);
+ }
+ }
+
+
+ // clear a range of full words
+ uint64_t full_words_count = bits_to_words(length);
+ uint64_t end = word_index + full_words_count;
+ for (; word_index < end; word_index++) {
+ m_arr[word_index] = 0;
+ }
+ length -= words_to_bits(full_words_count);
+
+ // set bits in the last word
+ if (length) {
+ uint64_t clr_mask = (FULL_MASK << length);
+ m_arr[word_index] &= clr_mask;
+ }
+
+ return true;
+}
+
+//----------------------------------------------------------------------------
+extent_t SimpleBitmap::get_next_set_extent(uint64_t offset)
+{
+ if (offset >= m_num_bits ) {
+ return null_extent;
+ }
+
+ auto [word_idx, bits_to_clear] = split(offset);
+ uint64_t word = m_arr[word_idx];
+ word &= (FULL_MASK << bits_to_clear);
+
+ // if there are no set bits in this word
+ if (word == 0) {
+ // skip past all clear words
+ while (++word_idx < m_word_count && !m_arr[word_idx]);
+
+ if (word_idx < m_word_count ) {
+ word = m_arr[word_idx];
+ } else {
+ return null_extent;
+ }
+ }
+
+ // ffs is 1 based, must dec by one as we are zero based
+ int ffs = __builtin_ffsll(word) - 1;
+ extent_t ext;
+ ext.offset = words_to_bits(word_idx) + ffs;
+
+ // set all bits from current to LSB
+ uint64_t clr_mask = FULL_MASK << ffs;
+ uint64_t set_mask = ~clr_mask;
+ word |= set_mask;
+
+ // skipped past fully set words
+ if (word == FULL_MASK) {
+ while ( (++word_idx < m_word_count) && (m_arr[word_idx] == FULL_MASK) );
+
+ if (word_idx < m_word_count) {
+ word = m_arr[word_idx];
+ } else {
+ // bitmap is set from ext.offset until the last bit
+ ext.length = (m_num_bits - ext.offset);
+ return ext;
+ }
+ }
+
+ ceph_assert(word != FULL_MASK);
+ int ffz = __builtin_ffsll(~word) - 1;
+ uint64_t zoffset = words_to_bits(word_idx) + ffz;
+ ext.length = (zoffset - ext.offset);
+
+ return ext;
+}
+
+//----------------------------------------------------------------------------
+extent_t SimpleBitmap::get_next_clr_extent(uint64_t offset)
+{
+ if (offset >= m_num_bits ) {
+ return null_extent;
+ }
+
+ uint64_t word_idx = offset_to_index(offset);
+ uint64_t word = m_arr[word_idx];
+
+ // set all bit set before offset
+ offset &= BITS_IN_WORD_MASK;
+ if (offset != 0) {
+ uint64_t bits_to_set = BITS_IN_WORD - offset;
+ uint64_t set_mask = FULL_MASK >> bits_to_set;
+ word |= set_mask;
+ }
+ if (word == FULL_MASK) {
+ // skipped past fully set words
+ while ( (++word_idx < m_word_count) && (m_arr[word_idx] == FULL_MASK) );
+
+ if (word_idx < m_word_count) {
+ word = m_arr[word_idx];
+ } else {
+ dout(10) << "2)Reached the end of the bitmap" << dendl;
+ return null_extent;
+ }
+ }
+
+ int ffz = __builtin_ffsll(~word) - 1;
+ extent_t ext;
+ ext.offset = words_to_bits(word_idx) + ffz;
+
+ // clear all bits from current position to LSB
+ word &= (FULL_MASK << ffz);
+
+ // skip past all clear words
+ if (word == 0) {
+ while ( (++word_idx < m_word_count) && (m_arr[word_idx] == 0) );
+
+ if (word_idx < m_word_count) {
+ word = m_arr[word_idx];
+ } else {
+ // bitmap is set from ext.offset until the last bit
+ ext.length = (m_num_bits - ext.offset);
+ return ext;
+ }
+ }
+
+ // ffs is 1 based, must dec by one as we are zero based
+ int ffs = __builtin_ffsll(word) - 1;
+ uint64_t soffset = words_to_bits(word_idx) + ffs;
+ ext.length = (soffset - ext.offset);
+ return ext;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#pragma once
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <cstring>
+#include <cmath>
+#include <iomanip>
+
+#include "include/ceph_assert.h"
+
+struct extent_t {
+ uint64_t offset;
+ uint64_t length;
+};
+
+class SimpleBitmap {
+public:
+ SimpleBitmap(CephContext *_cct, uint64_t num_bits);
+ ~SimpleBitmap();
+
+ SimpleBitmap(const SimpleBitmap&) = delete;
+ SimpleBitmap& operator=(const SimpleBitmap&) = delete;
+
+
+ // set a bit range range of @length starting at @offset
+ bool set(uint64_t offset, uint64_t length);
+ // clear a bit range range of @length starting at @offset
+ bool clr(uint64_t offset, uint64_t length);
+
+ // returns a copy of the next set extent starting at @offset
+ extent_t get_next_set_extent(uint64_t offset);
+
+ // returns a copy of the next clear extent starting at @offset
+ extent_t get_next_clr_extent(uint64_t offset);
+
+ //----------------------------------------------------------------------------
+ inline uint64_t get_size() {
+ return m_num_bits;
+ }
+
+ //----------------------------------------------------------------------------
+ // clears all bits in the bitmap
+ inline void clear_all() {
+ std::memset(m_arr, 0, words_to_bytes(m_word_count));
+ }
+
+ //----------------------------------------------------------------------------
+ // sets all bits in the bitmap
+ inline void set_all() {
+ std::memset(m_arr, 0xFF, words_to_bytes(m_word_count));
+ // clear bits in the last word past the last legal bit
+ uint64_t incomplete_word_bit_offset = (m_num_bits & BITS_IN_WORD_MASK);
+ if (incomplete_word_bit_offset) {
+ uint64_t clr_mask = ~(FULL_MASK << incomplete_word_bit_offset);
+ m_arr[m_word_count - 1] &= clr_mask;
+ }
+ }
+
+ //----------------------------------------------------------------------------
+ bool bit_is_set(uint64_t offset) {
+ if (offset < m_num_bits) {
+ auto [word_index, bit_offset] = split(offset);
+ uint64_t mask = 1ULL << bit_offset;
+ return (m_arr[word_index] & mask);
+ } else {
+ ceph_assert(offset < m_num_bits);
+ return false;
+ }
+ }
+
+ //----------------------------------------------------------------------------
+ bool bit_is_clr(uint64_t offset) {
+ if (offset < m_num_bits) {
+ auto [word_index, bit_offset] = split(offset);
+ uint64_t mask = 1ULL << bit_offset;
+ return ( (m_arr[word_index] & mask) == 0 );
+ } else {
+ ceph_assert(offset < m_num_bits);
+ return false;
+ }
+ }
+
+private:
+ //----------------------------------------------------------------------------
+ static inline std::pair<uint64_t, uint64_t> split(uint64_t offset) {
+ return { offset_to_index(offset), (offset & BITS_IN_WORD_MASK) };
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t offset_to_index(uint64_t offset) {
+ return offset >> BITS_IN_WORD_SHIFT;
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t index_to_offset(uint64_t index) {
+ return index << BITS_IN_WORD_SHIFT;
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t bits_to_words(uint64_t bit_count) {
+ return bit_count >> BITS_IN_WORD_SHIFT;
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t words_to_bits(uint64_t words_count) {
+ return words_count << BITS_IN_WORD_SHIFT;
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t bytes_to_words(uint64_t byte_count) {
+ return byte_count >> BYTES_IN_WORD_SHIFT;
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t words_to_bytes(uint64_t words_count) {
+ return (words_count << BYTES_IN_WORD_SHIFT);
+ }
+
+ constexpr static uint64_t BYTES_IN_WORD = sizeof(uint64_t);
+ constexpr static uint64_t BYTES_IN_WORD_SHIFT = 3;
+ constexpr static uint64_t BITS_IN_WORD = (BYTES_IN_WORD * 8);
+ constexpr static uint64_t BITS_IN_WORD_MASK = (BITS_IN_WORD - 1);
+ constexpr static uint64_t BITS_IN_WORD_SHIFT = 6;
+ constexpr static uint64_t FULL_MASK = (~((uint64_t)0));
+
+ CephContext *cct;
+ uint64_t *m_arr;
+ uint64_t m_num_bits;
+ uint64_t m_word_count;
+};
#include "include/stringify.h"
#include "common/ceph_time.h"
#include "os/bluestore/BlueStore.h"
+#include "os/bluestore/simple_bitmap.h"
#include "os/bluestore/AvlAllocator.h"
#include "common/ceph_argparse.h"
#include "global/global_init.h"
P(bufferptr);
P(range_seg_t);
P(sb_info_t);
+ P(SimpleBitmap);
cout << "map<uint64_t,uint64_t>\t" << sizeof(map<uint64_t,uint64_t>) << std::endl;
cout << "map<char,char>\t" << sizeof(map<char,char>) << std::endl;
}
}
}
+//---------------------------------------------------------------------------------
+static int verify_extent(const extent_t & ext, const extent_t *ext_arr, uint64_t ext_arr_size, uint64_t idx)
+{
+ const extent_t & ext_ref = ext_arr[idx];
+ if (ext.offset == ext_ref.offset && ext.length == ext_ref.length) {
+ return 0;
+ } else {
+ std::cerr << "mismatch was found at index " << idx << std::endl;
+ if (ext.length == 0) {
+ std::cerr << "Null extent was returned at idx = " << idx << std::endl;
+ }
+ unsigned start = std::max(((int32_t)(idx)-3), 0);
+ unsigned end = std::min(idx+3, ext_arr_size);
+ for (unsigned j = start; j < end; j++) {
+ const extent_t & ext_ref = ext_arr[j];
+ std::cerr << j << ") ref_ext = [" << ext_ref.offset << ", " << ext_ref.length << "]" << std::endl;
+ }
+ std::cerr << idx << ") ext = [" << ext.offset << ", " << ext.length << "]" << std::endl;
+ return -1;
+ }
+}
+
+//---------------------------------------------------------------------------------
+static int test_extents(uint64_t index, extent_t *ext_arr, uint64_t ext_arr_size, SimpleBitmap& sbmap, bool set)
+{
+ const uint64_t MAX_JUMP_BIG = 1523;
+ const uint64_t MAX_JUMP_SMALL = 19;
+ const uint64_t MAX_LEN_BIG = 523;
+ const uint64_t MAX_LEN_SMALL = 23;
+
+ uint64_t n = sbmap.get_size();
+ uint64_t offset = 0;
+ unsigned length, jump, i;
+ for (i = 0; i < ext_arr_size; i++) {
+ if (i & 3) {
+ jump = std::rand() % MAX_JUMP_BIG;
+ } else {
+ jump = std::rand() % MAX_JUMP_SMALL;
+ }
+ offset += jump;
+ if (i & 1) {
+ length = std::rand() % MAX_LEN_BIG;
+ } else {
+ length = std::rand() % MAX_LEN_SMALL;
+ }
+ // make sure no zero length will be used
+ length++;
+ if (offset + length >= n) {
+ break;
+ }
+
+ bool success;
+ if (set) {
+ success = sbmap.set(offset, length);
+ } else {
+ success = sbmap.clr(offset, length);
+ }
+ if (!success) {
+ std::cerr << "Failed sbmap." << (set ? "set(" : "clr(") << offset << ", " << length << ")"<< std::endl;
+ return -1;
+ }
+
+ // if this is not the first entry and no jump -> merge extents
+ if ( (i==0) || (jump > 0) ) {
+ ext_arr[i] = {offset, length};
+ } else {
+ // merge 2 extents
+ i --;
+ ext_arr[i].length += length;
+ }
+ offset += length;
+ }
+ unsigned arr_size = std::min((uint64_t)i, ext_arr_size);
+ std::cout << std::hex << std::right;
+ std::cout << "[" << index << "] " << (set ? "Set::" : "Clr::") << " extents count = 0x" << arr_size;
+ std::cout << std::dec << std::endl;
+
+ offset = 0;
+ extent_t ext;
+ for(unsigned i = 0; i < arr_size; i++) {
+ if (set) {
+ ext = sbmap.get_next_set_extent(offset);
+ } else {
+ ext = sbmap.get_next_clr_extent(offset);
+ }
+
+ if (verify_extent(ext, ext_arr, ext_arr_size, i) != 0) {
+ return -1;
+ }
+ offset = ext.offset + ext.length;
+ }
+
+ if (set) {
+ ext = sbmap.get_next_set_extent(offset);
+ } else {
+ ext = sbmap.get_next_clr_extent(offset);
+ }
+ if (ext.length == 0) {
+ return 0;
+ } else {
+ std::cerr << "sbmap.get_next_" << (set ? "set" : "clr") << "_extent(" << offset << ") return length = " << ext.length << std::endl;
+ return -1;
+ }
+}
+
+//---------------------------------------------------------------------------------
+TEST(SimpleBitmap, basic)
+{
+ const uint64_t MAX_EXTENTS_COUNT = 7131177;
+ std::unique_ptr<extent_t[]> ext_arr = std::make_unique<extent_t[]>(MAX_EXTENTS_COUNT);
+ ASSERT_TRUE(ext_arr != nullptr);
+ const uint64_t BIT_COUNT = 4ULL << 30; // 4Gb = 512MB
+ SimpleBitmap sbmap(g_ceph_context, BIT_COUNT);
+
+ // use current time as seed for random generator
+ std::srand(std::time(nullptr));
+ for (unsigned i = 0; i < 3; i++ ) {
+ memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT);
+ sbmap.clear_all();
+ ASSERT_TRUE(test_extents(i, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, true) == 0);
+
+ memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT);
+ sbmap.set_all();
+ ASSERT_TRUE(test_extents(i, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, false) == 0);
+ }
+}
+
+//---------------------------------------------------------------------------------
+static int test_intersections(unsigned test_idx, SimpleBitmap &sbmap, uint8_t map[], uint64_t map_size)
+{
+ const uint64_t MAX_LEN_BIG = 523;
+ const uint64_t MAX_LEN_SMALL = 23;
+
+ bool success;
+ uint64_t set_op_count = 0, clr_op_count = 0;
+ unsigned length, i;
+ for (i = 0; i < map_size / (MAX_LEN_BIG*2); i++) {
+ uint64_t offset = (std::rand() % (map_size - 1));
+ if (i & 1) {
+ length = std::rand() % MAX_LEN_BIG;
+ } else {
+ length = std::rand() % MAX_LEN_SMALL;
+ }
+ // make sure no zero length will be used
+ length++;
+ if (offset + length >= map_size) {
+ continue;
+ }
+ // 2:1 set/clr
+ bool set = (std::rand() % 3);
+ if (set) {
+ success = sbmap.set(offset, length);
+ memset(map+offset, 0xFF, length);
+ set_op_count++;
+ } else {
+ success = sbmap.clr(offset, length);
+ memset(map+offset, 0x0, length);
+ clr_op_count++;
+ }
+ if (!success) {
+ std::cerr << "Failed sbmap." << (set ? "set(" : "clr(") << offset << ", " << length << ")"<< std::endl;
+ return -1;
+ }
+ }
+
+ uint64_t set_bit_count = 0;
+ uint64_t clr_bit_count = 0;
+ for(uint64_t idx = 0; idx < map_size; idx++) {
+ if (map[idx]) {
+ set_bit_count++;
+ success = sbmap.bit_is_set(idx);
+ } else {
+ clr_bit_count++;
+ success = sbmap.bit_is_clr(idx);
+ }
+ if (!success) {
+ std::cerr << "expected: sbmap.bit_is_" << (map[idx] ? "set(" : "clr(") << idx << ")"<< std::endl;
+ return -1;
+ }
+
+ }
+ std::cout << std::hex << std::right << __func__ ;
+ std::cout << " [" << test_idx << "] set_bit_count = 0x" << std::setfill('0') << std::setw(8) << set_bit_count
+ << ", clr_bit_count = 0x" << std::setfill('0') << std::setw(8) << clr_bit_count
+ << ", sum = 0x" << set_bit_count + clr_bit_count << std::endl;
+ std::cout << std::dec;
+ uint64_t offset = 0;
+ for(uint64_t i = 0; i < (set_op_count + clr_op_count); i++) {
+ extent_t ext = sbmap.get_next_set_extent(offset);
+ //std::cout << "set_ext:: " << i << ") [" << ext.offset << ", " << ext.length << "]" << std::endl;
+ for (uint64_t idx = ext.offset; idx < ext.offset + ext.length; idx++) {
+ if (map[idx] != 0xFF) {
+ std::cerr << "map[" << idx << "] is clear, but extent [" << ext.offset << ", " << ext.length << "] is set" << std::endl;
+ return -1;
+ }
+ }
+ offset = ext.offset + ext.length;
+ }
+
+ offset = 0;
+ for(uint64_t i = 0; i < (set_op_count + clr_op_count); i++) {
+ extent_t ext = sbmap.get_next_clr_extent(offset);
+ //std::cout << "clr_ext:: " << i << ") [" << ext.offset << ", " << ext.length << "]" << std::endl;
+ for (uint64_t idx = ext.offset; idx < ext.offset + ext.length; idx++) {
+ if (map[idx] ) {
+ std::cerr << "map[" << idx << "] is set, but extent [" << ext.offset << ", " << ext.length << "] is free" << std::endl;
+ return -1;
+ }
+ }
+ offset = ext.offset + ext.length;
+ }
+
+ return 0;
+}
+
+//---------------------------------------------------------------------------------
+TEST(SimpleBitmap, intersection)
+{
+ const uint64_t MAP_SIZE = 1ULL << 30; // 1G
+ SimpleBitmap sbmap(g_ceph_context, MAP_SIZE);
+
+ // use current time as seed for random generator
+ std::srand(std::time(nullptr));
+
+ std::unique_ptr<uint8_t[]> map = std::make_unique<uint8_t[]> (MAP_SIZE);
+ ASSERT_TRUE(map != nullptr);
+
+ for (unsigned i = 0; i < 1; i++ ) {
+ sbmap.clear_all();
+ memset(map.get(), 0, MAP_SIZE);
+ ASSERT_TRUE(test_intersections(i, sbmap, map.get(), MAP_SIZE) == 0);
+
+ sbmap.set_all();
+ memset(map.get(), 0xFF, MAP_SIZE);
+ ASSERT_TRUE(test_intersections(i, sbmap, map.get(), MAP_SIZE) == 0);
+ }
+}
+
+
+//---------------------------------------------------------------------------------
+static int test_extents_boundaries(uint64_t index, extent_t *ext_arr, uint64_t ext_arr_size, SimpleBitmap& sbmap, bool set)
+{
+ uint64_t n = sbmap.get_size();
+ uint64_t offset = 0, k = 0;
+ for(unsigned i = 0; i < 64; i++) {
+ offset += i;
+ if (offset >= n) {
+ break;
+ }
+
+ for(unsigned length = 1; length <= 128; length++) {
+ if (offset + length >= n) {
+ break;
+ }
+
+ if (k >= ext_arr_size) {
+ break;
+ }
+ bool success;
+ if (set) {
+ success = sbmap.set(offset, length);
+ } else {
+ success = sbmap.clr(offset, length);
+ }
+ if (!success) {
+ std::cerr << "Failed sbmap." << (set ? "set(" : "clr(") << offset << ", " << length << ")"<< std::endl;
+ return -1;
+ }
+ ext_arr[k++] = {offset, length};
+ if (length < 64) {
+ offset += 64;
+ } else {
+ offset += 128;
+ }
+ }
+ if (k >= ext_arr_size) {
+ break;
+ }
+ }
+
+ unsigned arr_size = std::min((uint64_t)k, ext_arr_size);
+ std::cout << std::hex << std::right << __func__ ;
+ std::cout << " [" << index << "] " << (set ? "Set::" : "Clr::") << " extents count = 0x" << arr_size;
+ std::cout << std::dec << std::endl;
+
+ offset = 0;
+ extent_t ext;
+ for(unsigned i = 0; i < arr_size; i++) {
+ if (set) {
+ ext = sbmap.get_next_set_extent(offset);
+ } else {
+ ext = sbmap.get_next_clr_extent(offset);
+ }
+
+ if (verify_extent(ext, ext_arr, ext_arr_size, i) != 0) {
+ return -1;
+ }
+ offset = ext.offset + ext.length;
+ }
+
+ if (set) {
+ ext = sbmap.get_next_set_extent(offset);
+ } else {
+ ext = sbmap.get_next_clr_extent(offset);
+ }
+ if (ext.length == 0) {
+ return 0;
+ } else {
+ std::cerr << "sbmap.get_next_" << (set ? "set" : "clr") << "_extent(" << offset << ") return length = " << ext.length << std::endl;
+ return -1;
+ }
+
+}
+
+//---------------------------------------------------------------------------------
+TEST(SimpleBitmap, boundaries)
+{
+ const uint64_t MAX_EXTENTS_COUNT = 64 << 10;
+ std::unique_ptr<extent_t[]> ext_arr = std::make_unique<extent_t[]>(MAX_EXTENTS_COUNT);
+ ASSERT_TRUE(ext_arr != nullptr);
+
+ // use current time as seed for random generator
+ std::srand(std::time(nullptr));
+
+ uint64_t bit_count = 32 << 20; // 32Mb = 4MB
+ unsigned count = 0;
+ for (unsigned i = 0; i < 64; i++) {
+ SimpleBitmap sbmap(g_ceph_context, bit_count+i);
+ memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT);
+ sbmap.clear_all();
+ ASSERT_TRUE(test_extents_boundaries(count, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, true) == 0);
+
+ memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT);
+ sbmap.set_all();
+ ASSERT_TRUE(test_extents_boundaries(count++, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, false) == 0);
+ }
+}
+
TEST(shared_blob_2hash_tracker_t, basic_test)
{
shared_blob_2hash_tracker_t t1(1024 * 1024, 4096);