From: myoungwon oh Date: Tue, 26 Jul 2022 05:25:27 +0000 (+0900) Subject: seastore/rbm: rename NVMeManager to BlockRBManager X-Git-Tag: v18.0.0~392^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=163b51f2efdde0735a7459ad44fb574d3d2d6b56;p=ceph.git seastore/rbm: rename NVMeManager to BlockRBManager Signed-off-by: Myoungwon Oh --- diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index ee3aa47cc533..408f7b494dc7 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -36,7 +36,7 @@ set(crimson_seastore_srcs extent_placement_manager.cc object_data_handler.cc seastore.cc - random_block_manager/nvme_manager.cc + random_block_manager/block_rb_manager.cc random_block_manager/nvmedevice.cc journal/segmented_journal.cc journal/segment_allocator.cc diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc new file mode 100644 index 000000000000..eff58f832985 --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc @@ -0,0 +1,708 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "crimson/os/seastore/logging.h" + +#include "include/buffer.h" +#include "nvmedevice.h" +#include "include/interval_set.h" +#include "include/intarith.h" +#include "block_rb_manager.h" + +SET_SUBSYS(seastore_device); + +namespace crimson::os::seastore { + +BlockRBManager::write_ertr::future<> BlockRBManager::rbm_sync_block_bitmap( + rbm_bitmap_block_t &block, blk_no_t block_no) +{ + LOG_PREFIX(BlockRBManager::rbm_sync_block_bitmap); + bufferptr bptr; + try { + bptr = bufferptr(ceph::buffer::create_page_aligned(block.get_size())); + bufferlist bl; + encode(block, bl); + auto iter = bl.cbegin(); + iter.copy(block.get_size(), bptr.c_str()); + } catch (const std::exception &e) { + DEBUG("rbm_sync_block_bitmap: exception creating aligned buffer {}", e); + ceph_assert(0 == "unhandled exception"); + } + uint64_t bitmap_block_no = convert_block_no_to_bitmap_block(block_no); + return device->write(super.start_alloc_area + + bitmap_block_no * super.block_size, + bptr); +} + +BlockRBManager::mkfs_ertr::future<> BlockRBManager::initialize_blk_alloc_area() +{ + LOG_PREFIX(BlockRBManager::initialize_blk_alloc_area); + auto start = super.start_data_area / super.block_size; + DEBUG("initialize_alloc_area: start to read at {} ", start); + + /* write allocated bitmap info to rbm meta block */ + rbm_bitmap_block_t b_block(super.block_size); + alloc_rbm_bitmap_block_buf(b_block); + for (uint64_t i = 0; i < start; i++) { + b_block.set_bit(i); + } + + // CRC calculation is offloaded to NVMeDevice if data protection is enabled. 
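// A minimal standalone sketch of the offset arithmetic used by
// rbm_sync_block_bitmap() above: the bit for data block `block_no` lives in
// bitmap block `block_no / bits_per_bb`, which is written at
// start_alloc_area + bitmap_block_no * block_size. The block size, the
// superblock size and the 16-byte encoded bitmap-header size are assumed
// example values, not taken from a real device.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t block_size       = 4096;  // assumed super.block_size
  const uint64_t start_alloc_area = 4096;  // assumed RBM_SUPERBLOCK_SIZE
  const uint64_t header_bytes     = 16;    // assumed encoded rbm_bitmap_block_header_t size
  const uint64_t bits_per_bb      = (block_size - header_bytes) * 8;

  uint64_t block_no        = 100000;                  // data block whose bit is updated
  uint64_t bitmap_block_no = block_no / bits_per_bb;  // convert_block_no_to_bitmap_block()
  uint64_t device_offset   = start_alloc_area + bitmap_block_no * block_size;

  std::printf("block %llu -> bitmap block %llu at device offset %llu\n",
              (unsigned long long)block_no,
              (unsigned long long)bitmap_block_no,
              (unsigned long long)device_offset);
  return 0;
}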
+ if (device->is_data_protection_enabled() == false) { + b_block.set_crc(); + } + + return seastar::do_with( + b_block, + [this, start, FNAME](auto &b_block) { + return rbm_sync_block_bitmap(b_block, + super.start_alloc_area / super.block_size + ).safe_then([this, &b_block, start, FNAME]() { + + /* initialize bitmap blocks as unused */ + auto max = max_block_by_bitmap_block(); + auto max_block = super.size / super.block_size; + blk_no_t end = round_up_to(max_block, max) - 1; + DEBUG("init start {} end {} ", start, end); + return rbm_sync_block_bitmap_by_range( + start, + end, + bitmap_op_types_t::ALL_CLEAR + ).safe_then([this, &b_block, FNAME]() { + /* + * Set rest of the block bitmap, which is not used, to 1 + * To do so, we only mark 1 to empty bitmap blocks + */ + uint64_t na_block_no = super.size/super.block_size; + uint64_t remain_block = na_block_no % max_block_by_bitmap_block(); + DEBUG("na_block_no: {}, remain_block: {} ", + na_block_no, remain_block); + if (remain_block) { + DEBUG("try to remained write alloc info "); + if (na_block_no > max_block_by_bitmap_block()) { + b_block.buf.clear(); + alloc_rbm_bitmap_block_buf(b_block); + } + for (uint64_t i = remain_block; i < max_block_by_bitmap_block(); i++) { + b_block.set_bit(i); + } + b_block.set_crc(); + return rbm_sync_block_bitmap(b_block, na_block_no + ).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error rbm_sync_block_bitmap to update \ + last bitmap block in BlockRBManager::initialize_blk_alloc_area" + } + ); + } + return mkfs_ertr::now(); + }).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error rbm_sync_block_bitmap \ + in BlockRBManager::initialize_blk_alloc_area" + } + ); + }).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error rbm_sync_block_bitmap_by_range \ + in BlockRBManager::initialize_blk_alloc_area" + } + ); + }); +} + +BlockRBManager::mkfs_ertr::future<> BlockRBManager::mkfs(mkfs_config_t config) +{ + LOG_PREFIX(BlockRBManager::mkfs); + DEBUG("path {}", path); + return _open_device(path).safe_then([this, &config, FNAME]() { + rbm_abs_addr addr = convert_paddr_to_abs_addr( + config.start); + return read_rbm_header(addr).safe_then([FNAME](auto super) { + DEBUG("already exists "); + return mkfs_ertr::now(); + }).handle_error( + crimson::ct_error::enoent::handle([this, &config, FNAME](auto) { + super.uuid = uuid_d(); // TODO + super.magic = 0xFF; // TODO + super.start = convert_paddr_to_abs_addr( + config.start); + super.end = convert_paddr_to_abs_addr( + config.end); + super.block_size = config.block_size; + super.size = config.total_size; + super.free_block_count = config.total_size/config.block_size - 2; + super.alloc_area_size = get_alloc_area_size(); + super.start_alloc_area = RBM_SUPERBLOCK_SIZE; + super.start_data_area = + super.start_alloc_area + super.alloc_area_size; + super.crc = 0; + super.feature |= RBM_BITMAP_BLOCK_CRC; + super.device_id = config.device_id; + + DEBUG(" super {} ", super); + // write super block + return write_rbm_header().safe_then([this] { + return initialize_blk_alloc_area(); + }).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error write_rbm_header in BlockRBManager::mkfs" + }); + }), + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error read_rbm_header in BlockRBManager::mkfs" + } + ); + }).safe_then([this]() { + if (device) { + return device->close( + ).safe_then([]() { + return mkfs_ertr::now(); + 
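// A rough sketch of the on-disk layout arithmetic mkfs() fills into `super`
// above: the superblock comes first, then the bitmap (alloc) area sized by
// rounding the total block count up to whole bitmap blocks, then the data
// area. The device size, block size and encoded bitmap-header size are
// assumed example values.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t total_size   = 1ull << 30;  // assumed 1 GiB partition
  const uint64_t block_size   = 4096;        // assumed super.block_size
  const uint64_t header_bytes = 16;          // assumed encoded bitmap header size
  const uint64_t superblock   = 4096;        // RBM_SUPERBLOCK_SIZE

  uint64_t total_blocks  = total_size / block_size;
  uint64_t bits_per_bb   = (block_size - header_bytes) * 8;
  uint64_t bitmap_blocks = (total_blocks + bits_per_bb - 1) / bits_per_bb;  // as in get_alloc_area_size()
  uint64_t alloc_area_size = bitmap_blocks * block_size;

  uint64_t start_alloc_area = superblock;
  uint64_t start_data_area  = start_alloc_area + alloc_area_size;

  std::printf("blocks=%llu bitmap_blocks=%llu alloc_area=%llu data_area@%llu\n",
              (unsigned long long)total_blocks, (unsigned long long)bitmap_blocks,
              (unsigned long long)alloc_area_size, (unsigned long long)start_data_area);
  return 0;
}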
}); + } + return mkfs_ertr::now(); + }).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error open_device in BlockRBManager::mkfs" + }); +} + +BlockRBManager::find_block_ret BlockRBManager::find_free_block(Transaction &t, size_t size) +{ + LOG_PREFIX(BlockRBManager::find_free_block); + auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); + return seastar::do_with(uint64_t(0), + uint64_t(super.start_alloc_area), + interval_set(), + bp, + [&, this, FNAME](auto &allocated, auto &addr, auto &alloc_extent, auto &bp) mutable { + return crimson::repeat( + [&, this, FNAME]() mutable { + return device->read( + addr, + bp + ).safe_then( + [&bp, &addr, size, &allocated, &alloc_extent, this, FNAME]() mutable { + DEBUG("find_free_list: allocate {}, addr {}", allocated, addr); + rbm_bitmap_block_t b_block(super.block_size); + bufferlist bl_bitmap_block; + bl_bitmap_block.append(bp); + decode(b_block, bl_bitmap_block); + auto max = max_block_by_bitmap_block(); + for (uint64_t i = 0; + i < max && (uint64_t)size/super.block_size > allocated; i++) { + auto block_id = convert_bitmap_block_no_to_block_id(i, addr); + if (b_block.is_allocated(i)) { + continue; + } + DEBUG("find_free_list: allocated block no {} i {}", + convert_bitmap_block_no_to_block_id(i, addr), i); + if (allocated != 0 && alloc_extent.range_end() != block_id) { + /* + * if not continous block, just restart to find continuous blocks + * at the next block. + * in-memory allocator can handle this efficiently. + */ + allocated = 0; + alloc_extent.clear(); // a range of block allocation + DEBUG("find_free_list: rety to find continuous blocks"); + continue; + } + allocated += 1; + alloc_extent.insert(block_id); + } + addr += super.block_size; + DEBUG("find_free_list: allocated: {} alloc_extent {}", + allocated, alloc_extent); + if (((uint64_t)size)/super.block_size == allocated) { + return seastar::stop_iteration::yes; + } else if (addr >= super.start_data_area) { + alloc_extent.clear(); + return seastar::stop_iteration::yes; + } + return seastar::stop_iteration::no; + }); + }).safe_then([&allocated, &alloc_extent, size, this, FNAME]() { + DEBUG(" allocated: {} size {} ", + allocated * super.block_size, size); + if (allocated * super.block_size < size) { + alloc_extent.clear(); + } + return find_block_ret( + find_block_ertr::ready_future_marker{}, + alloc_extent); + }).handle_error( + find_block_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in BlockRBManager::find_free_block" + } + ); + }); +} + +/* TODO : block allocator */ +BlockRBManager::allocate_ret BlockRBManager::alloc_extent( + Transaction &t, size_t size) +{ + + /* + * 1. find free blocks using block allocator + * 2. add free blocks to transaction + * (the free block is reserved state, not stored) + * 3. link free blocks to onode + * Due to in-memory block allocator is the next work to do, + * just read the block bitmap directly to find free blocks. 
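// A simplified sketch of the scan find_free_block() above performs: walk the
// free bitmap and collect a run of `need` contiguous clear bits, restarting
// the run whenever an allocated bit breaks contiguity. It operates on a plain
// std::vector<bool> instead of on-disk rbm_bitmap_block_t blocks.
#include <cstdint>
#include <cstdio>
#include <vector>

// Returns the first block id of a contiguous free run of length `need`,
// or -1 if no such run exists.
int64_t find_free_run(const std::vector<bool>& allocated, uint64_t need) {
  uint64_t run_start = 0, run_len = 0;
  for (uint64_t id = 0; id < allocated.size(); ++id) {
    if (allocated[id]) {            // an allocated bit breaks the run
      run_len = 0;
      continue;
    }
    if (run_len == 0) run_start = id;
    if (++run_len == need) return (int64_t)run_start;
  }
  return -1;
}

int main() {
  std::vector<bool> bitmap = {1,1,0,1,0,0,0,1};  // 1 = allocated
  std::printf("run of 3 starts at block %lld\n",
              (long long)find_free_run(bitmap, 3));  // prints 4
  return 0;
}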
+ * + */ + LOG_PREFIX(BlockRBManager::alloc_extent); + return find_free_block(t, size + ).safe_then([this, FNAME](auto alloc_extent) mutable + -> allocate_ertr::future { + DEBUG("after find_free_block: allocated {}", alloc_extent); + if (alloc_extent.empty()) { + return crimson::ct_error::enospc::make(); + } + paddr_t paddr = convert_abs_addr_to_paddr( + alloc_extent.range_start() * super.block_size, + super.device_id); + return allocate_ret( + allocate_ertr::ready_future_marker{}, + paddr); + }).handle_error( + allocate_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error find_free_block in BlockRBManager::alloc_extent" + } + ); +} + +void BlockRBManager::add_free_extent( + std::vector& v, rbm_abs_addr from, size_t len) +{ + ceph_assert(!(len % super.block_size)); + paddr_t paddr = convert_abs_addr_to_paddr( + from, + super.device_id); + alloc_delta_t alloc_info; + alloc_info.alloc_blk_ranges.emplace_back( + paddr, L_ADDR_NULL, len, extent_types_t::ROOT); + alloc_info.op = alloc_delta_t::op_types_t::CLEAR; + v.push_back(alloc_info); +} + +BlockRBManager::write_ertr::future<> BlockRBManager::rbm_sync_block_bitmap_by_range( + blk_no_t start, blk_no_t end, bitmap_op_types_t op) +{ + LOG_PREFIX(BlockRBManager::rbm_sync_block_bitmap_by_range); + auto addr = super.start_alloc_area + + (start / max_block_by_bitmap_block()) + * super.block_size; + // aligned write + if (start % max_block_by_bitmap_block() == 0 && + end % (max_block_by_bitmap_block() - 1) == 0) { + auto num_block = num_block_between_blk_ids(start, end); + bufferlist bl_bitmap_block; + add_cont_bitmap_blocks_to_buf(bl_bitmap_block, num_block, op); + return write( + addr, + bl_bitmap_block); + } + auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); + // try to read first block, then check the block is aligned + return device->read( + addr, + bp + ).safe_then([bp, start, end, op, addr, this, FNAME]() { + rbm_bitmap_block_t b_block(super.block_size); + bufferlist bl_bitmap_block; + bl_bitmap_block.append(bp); + decode(b_block, bl_bitmap_block); + auto max = max_block_by_bitmap_block(); + auto loop_end = end < (start / max + 1) * max ? 
+ end % max : max - 1; + for (uint64_t i = (start % max); i <= loop_end; i++) { + if (op == bitmap_op_types_t::ALL_SET) { + b_block.set_bit(i); + } else { + b_block.clear_bit(i); + } + } + auto num_block = num_block_between_blk_ids(start, end); + DEBUG("rbm_sync_block_bitmap_by_range: start {}, end {}, \ + loop_end {}, num_block {}", + start, end, loop_end, num_block); + + bl_bitmap_block.clear(); + encode(b_block, bl_bitmap_block); + if (num_block == 1) { + // | front (unaligned) | + return write( + addr, + bl_bitmap_block); + } else if (!((end + 1) % max)) { + // | front (unaligned) | middle (aligned) | + add_cont_bitmap_blocks_to_buf(bl_bitmap_block, num_block - 1, op); + DEBUG("partially aligned write: addr {} length {}", + addr, bl_bitmap_block.length()); + return write( + addr, + bl_bitmap_block); + } else if (num_block > 2) { + // | front (unaligned) | middle | end (unaligned) | + // fill up the middle + add_cont_bitmap_blocks_to_buf(bl_bitmap_block, num_block - 2, op); + } + + auto next_addr = super.start_alloc_area + + (end / max_block_by_bitmap_block()) + * super.block_size; + auto bptr = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); + // | front (unaligned) | middle | end (unaligned) | or + // | front (unaligned) | end (unaligned) | + return device->read( + next_addr, + bptr + ).safe_then( + [bptr, bl_bitmap_block, end, op, addr, this, FNAME]() mutable { + rbm_bitmap_block_t b_block(super.block_size); + bufferlist block; + block.append(bptr); + decode(b_block, block); + auto max = max_block_by_bitmap_block(); + for (uint64_t i = (end - (end % max)) % max; + i <= (end % max); i++) { + if (op == bitmap_op_types_t::ALL_SET) { + b_block.set_bit(i); + } else { + b_block.clear_bit(i); + } + } + DEBUG("start {} end {} ", end - (end % max), end); + bl_bitmap_block.claim_append(block); + return write( + addr, + bl_bitmap_block); + }).handle_error( + write_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in BlockRBManager::rbm_sync_block_bitmap_by_range" + } + ); + }).handle_error( + write_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in BlockRBManager::rbm_sync_block_bitmap_by_range" + } + ); +} + +BlockRBManager::abort_allocation_ertr::future<> BlockRBManager::abort_allocation( + Transaction &t) +{ + /* + * TODO: clear all allocation infos associated with transaction in in-memory allocator + */ + return abort_allocation_ertr::now(); +} + +BlockRBManager::write_ertr::future<> BlockRBManager::complete_allocation( + Transaction &t) +{ + return write_ertr::now(); +} + +BlockRBManager::write_ertr::future<> BlockRBManager::sync_allocation( + std::vector &alloc_blocks) +{ + LOG_PREFIX(BlockRBManager::sync_allocation); + if (alloc_blocks.empty()) { + return write_ertr::now(); + } + return seastar::do_with(move(alloc_blocks), + [&, this, FNAME](auto &alloc_blocks) mutable { + return crimson::do_for_each(alloc_blocks, + [this, FNAME](auto &alloc) { + return crimson::do_for_each(alloc.alloc_blk_ranges, + [this, &alloc, FNAME](auto &range) -> write_ertr::future<> { + DEBUG("range {} ~ {}", range.paddr, range.len); + bitmap_op_types_t op = + (alloc.op == alloc_delta_t::op_types_t::SET) ? 
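// A standalone sketch of how rbm_sync_block_bitmap_by_range() above splits a
// [start, end] block-id range across bitmap blocks: a single bitmap block is
// one read-modify-write, an unaligned front followed by an aligned remainder
// can append fully set/cleared blocks, and otherwise an unaligned tail needs
// its own read-modify-write. bits_per_bb stands in for
// max_block_by_bitmap_block() and its value here is only an assumed example.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t bits_per_bb = 32640;  // assumed (block_size - header) * 8
  uint64_t start = 10, end = 70000;    // example block-id range to update

  uint64_t first = start / bits_per_bb;
  uint64_t last  = end / bits_per_bb;
  uint64_t num_block = last - first + 1;  // num_block_between_blk_ids()

  std::printf("range touches bitmap blocks %llu..%llu (%llu total)\n",
              (unsigned long long)first, (unsigned long long)last,
              (unsigned long long)num_block);
  if (num_block == 1) {
    std::puts("| front (unaligned) | -> one read-modify-write");
  } else if ((end + 1) % bits_per_bb == 0) {
    std::puts("| front (unaligned) | middle (aligned) | -> rewrite front, append full blocks");
  } else {
    std::printf("| front | %llu full middle block(s) | tail | -> two read-modify-writes\n",
                (unsigned long long)(num_block - 2));
  }
  return 0;
}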
+ bitmap_op_types_t::ALL_SET : + bitmap_op_types_t::ALL_CLEAR; + rbm_abs_addr addr = convert_paddr_to_abs_addr( + range.paddr); + blk_no_t start = addr / super.block_size; + blk_no_t end = start + + (round_up_to(range.len, super.block_size)) / super.block_size + - 1; + return rbm_sync_block_bitmap_by_range( + start, + end, + op); + }); + }).safe_then([this, &alloc_blocks, FNAME]() mutable { + int alloc_block_count = 0; + for (const auto& b : alloc_blocks) { + for (auto r : b.alloc_blk_ranges) { + if (b.op == alloc_delta_t::op_types_t::SET) { + alloc_block_count += + round_up_to(r.len, super.block_size) / super.block_size; + DEBUG("complete alloc block: start {} len {} ", + r.paddr, r.len); + } else { + alloc_block_count -= + round_up_to(r.len, super.block_size) / super.block_size; + DEBUG("complete alloc block: start {} len {} ", + r.paddr, r.len); + } + } + } + DEBUG("complete_alloction: complete to allocate {} blocks", + alloc_block_count); + super.free_block_count -= alloc_block_count; + return write_ertr::now(); + }); + }); +} + +BlockRBManager::open_ertr::future<> BlockRBManager::open( + const std::string &path, paddr_t paddr) +{ + LOG_PREFIX(BlockRBManager::open); + DEBUG("open: path{}", path); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + return _open_device(path + ).safe_then([this, addr]() { + return read_rbm_header(addr).safe_then([&](auto s) + -> open_ertr::future<> { + if (s.magic != 0xFF) { + return crimson::ct_error::enoent::make(); + } + super = s; + return check_bitmap_blocks().safe_then([]() { + return open_ertr::now(); + }); + }).handle_error( + open_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error read_rbm_header in BlockRBManager::open" + } + ); + }); +} + +BlockRBManager::write_ertr::future<> BlockRBManager::write( + paddr_t paddr, + bufferptr &bptr) +{ + ceph_assert(device); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + if (addr > super.end || addr < super.start || + bptr.length() > super.end - super.start) { + return crimson::ct_error::erange::make(); + } + return device->write( + addr, + bptr); +} + +BlockRBManager::read_ertr::future<> BlockRBManager::read( + paddr_t paddr, + bufferptr &bptr) +{ + ceph_assert(device); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + if (addr > super.end || addr < super.start || + bptr.length() > super.end - super.start) { + return crimson::ct_error::erange::make(); + } + return device->read( + addr, + bptr); +} + +BlockRBManager::close_ertr::future<> BlockRBManager::close() +{ + ceph_assert(device); + return device->close(); +} + +BlockRBManager::open_ertr::future<> BlockRBManager::_open_device( + const std::string path) +{ + ceph_assert(device); + return device->open(path, seastar::open_flags::rw); +} + +BlockRBManager::write_ertr::future<> BlockRBManager::write_rbm_header() +{ + bufferlist meta_b_header; + super.crc = 0; + encode(super, meta_b_header); + // If NVMeDevice supports data protection, CRC for checksum is not required + // NVMeDevice is expected to generate and store checksum internally. + // CPU overhead for CRC might be saved. 
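// A minimal sketch of the checksum pattern write_rbm_header() and
// read_rbm_header() follow: zero the crc field, serialize the header,
// checksum those bytes, then store the result back into the header (or skip
// the checksum entirely when the device provides end-to-end data protection).
// The struct, the field-wise serializer and the toy checksum below are
// stand-ins for the real denc encoding and bufferlist::crc32c(), not the
// actual API.
#include <cstdint>
#include <cstdio>
#include <vector>

struct toy_header {
  uint64_t size = 1ull << 30;
  uint64_t block_size = 4096;
  uint32_t crc = 0;
};

static uint32_t toy_checksum(const std::vector<uint8_t>& buf) {
  uint32_t sum = 0;                 // placeholder for bufferlist::crc32c(-1)
  for (uint8_t b : buf) sum = sum * 131 + b;
  return sum;
}

static uint32_t header_crc(toy_header h) {
  h.crc = 0;                        // crc is always computed with the field zeroed
  std::vector<uint8_t> buf;
  auto put = [&](const void* p, size_t n) {
    const uint8_t* b = static_cast<const uint8_t*>(p);
    buf.insert(buf.end(), b, b + n);
  };
  put(&h.size, sizeof(h.size));     // stand-in for encode(super, meta_b_header)
  put(&h.block_size, sizeof(h.block_size));
  put(&h.crc, sizeof(h.crc));
  return toy_checksum(buf);
}

int main() {
  bool device_data_protection = false;  // device->is_data_protection_enabled()
  toy_header super;
  super.crc = device_data_protection ? uint32_t(-1) : header_crc(super);
  std::printf("stored crc = 0x%08x\n", super.crc);

  // read path: recompute with crc zeroed and compare against the stored value
  bool ok = device_data_protection || header_crc(super) == super.crc;
  std::printf("verify: %s\n", ok ? "ok" : "mismatch");
  return 0;
}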
+ if (device->is_data_protection_enabled()) { + super.crc = -1; + } + else { + super.crc = meta_b_header.crc32c(-1); + } + + bufferlist bl; + encode(super, bl); + auto iter = bl.begin(); + auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); + assert(bl.length() < super.block_size); + iter.copy(bl.length(), bp.c_str()); + + return device->write(super.start, bp); +} + +BlockRBManager::read_ertr::future BlockRBManager::read_rbm_header( + rbm_abs_addr addr) +{ + LOG_PREFIX(BlockRBManager::read_rbm_header); + ceph_assert(device); + bufferptr bptr = + bufferptr(ceph::buffer::create_page_aligned(RBM_SUPERBLOCK_SIZE)); + bptr.zero(); + return device->read( + addr, + bptr + ).safe_then([length=bptr.length(), this, bptr, FNAME]() + -> read_ertr::future { + bufferlist bl; + bl.append(bptr); + auto p = bl.cbegin(); + rbm_metadata_header_t super_block; + try { + decode(super_block, p); + } + catch (ceph::buffer::error& e) { + DEBUG("read_rbm_header: unable to decode rbm super block {}", + e.what()); + return crimson::ct_error::enoent::make(); + } + checksum_t crc = super_block.crc; + bufferlist meta_b_header; + super_block.crc = 0; + encode(super_block, meta_b_header); + + // Do CRC verification only if data protection is not supported. + if (device->is_data_protection_enabled() == false) { + if (meta_b_header.crc32c(-1) != crc) { + DEBUG("bad crc on super block, expected {} != actual {} ", + meta_b_header.crc32c(-1), crc); + return crimson::ct_error::input_output_error::make(); + } + } + DEBUG("got {} ", super); + return read_ertr::future( + read_ertr::ready_future_marker{}, + super_block + ); + }).handle_error( + read_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in BlockRBManager::read_rbm_header" + } + ); +} + +BlockRBManager::check_bitmap_blocks_ertr::future<> BlockRBManager::check_bitmap_blocks() +{ + LOG_PREFIX(BlockRBManager::check_bitmap_blocks); + auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); + return seastar::do_with(uint64_t(super.start_alloc_area), uint64_t(0), bp, + [&, this, FNAME](auto &addr, auto &free_blocks, auto &bp) mutable { + return crimson::repeat([&, this, FNAME]() mutable { + return device->read(addr, bp + ).safe_then( + [&bp, &addr, &free_blocks, this, FNAME]() mutable { + DEBUG("verify_bitmap_blocks: addr {}", addr); + rbm_bitmap_block_t b_block(super.block_size); + bufferlist bl_bitmap_block; + bl_bitmap_block.append(bp); + decode(b_block, bl_bitmap_block); + auto max = max_block_by_bitmap_block(); + for (uint64_t i = 0; i < max; i++) { + if (!b_block.is_allocated(i)) { + free_blocks++; + } + } + addr += super.block_size; + if (addr >= super.start_data_area) { + return seastar::stop_iteration::yes; + } + return seastar::stop_iteration::no; + }); + }).safe_then([&free_blocks, this, FNAME]() { + DEBUG("free_blocks: {} ", free_blocks); + super.free_block_count = free_blocks; + return check_bitmap_blocks_ertr::now(); + }).handle_error( + check_bitmap_blocks_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in BlockRBManager::find_free_block" + } + ); + }); +} + +BlockRBManager::write_ertr::future<> BlockRBManager::write( + rbm_abs_addr addr, + bufferlist &bl) +{ + LOG_PREFIX(BlockRBManager::write); + ceph_assert(device); + bufferptr bptr; + try { + bptr = bufferptr(ceph::buffer::create_page_aligned(bl.length())); + auto iter = bl.cbegin(); + iter.copy(bl.length(), bptr.c_str()); + } catch (const std::exception &e) { + DEBUG("write: exception creating aligned buffer {}", e); + 
ceph_assert(0 == "unhandled exception"); + } + return device->write( + addr, + bptr); +} + +std::ostream &operator<<(std::ostream &out, const rbm_metadata_header_t &header) +{ + out << " rbm_metadata_header_t(size=" << header.size + << ", block_size=" << header.block_size + << ", start=" << header.start + << ", end=" << header.end + << ", magic=" << header.magic + << ", uuid=" << header.uuid + << ", free_block_count=" << header.free_block_count + << ", alloc_area_size=" << header.alloc_area_size + << ", start_alloc_area=" << header.start_alloc_area + << ", start_data_area=" << header.start_data_area + << ", flag=" << header.flag + << ", feature=" << header.feature + << ", crc=" << header.crc; + return out << ")"; +} + +std::ostream &operator<<(std::ostream &out, + const rbm_bitmap_block_header_t &header) +{ + out << " rbm_bitmap_block_header_t(size=" << header.size + << ", checksum=" << header.checksum; + return out << ")"; +} + +} diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.h b/src/crimson/os/seastore/random_block_manager/block_rb_manager.h new file mode 100644 index 000000000000..0b46355d5244 --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.h @@ -0,0 +1,379 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include +#include +#include + +#include "include/ceph_assert.h" +#include "crimson/os/seastore/seastore_types.h" +#include "include/buffer_fwd.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/transaction.h" +#include "nvmedevice.h" +#include "crimson/os/seastore/random_block_manager.h" + +#include "crimson/common/layout.h" +#include "include/buffer.h" +#include "include/uuid.h" + +namespace crimson::os::seastore { + +constexpr uint32_t RBM_SUPERBLOCK_SIZE = 4096; + +using NVMeBlockDevice = nvme_device::NVMeBlockDevice; +using NVMeBlockDeviceRef = std::unique_ptr; + +enum { + // TODO: This allows the device to manage crc on a block by itself + RBM_NVME_END_TO_END_PROTECTION = 1, + RBM_BITMAP_BLOCK_CRC = 2, +}; + +constexpr uint32_t BITS_PER_CHAR = 8; +inline char BIT_CHAR_MASK(uint64_t nr) +{ + return (char(1) << (nr % BITS_PER_CHAR)); +} + +struct rbm_metadata_header_t { + size_t size = 0; + size_t block_size = 0; + uint64_t start; // start location of the device + uint64_t end; // end location of the device + uint64_t magic; // to indicate randomblock_manager + uuid_d uuid; + uint64_t free_block_count; + uint64_t alloc_area_size; // bitmap + uint32_t start_alloc_area; // block number + uint32_t start_data_area; + uint64_t flag; // reserved + uint64_t feature; + device_id_t device_id; + checksum_t crc; + + DENC(rbm_metadata_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.block_size, p); + denc(v.start, p); + denc(v.end, p); + denc(v.magic, p); + denc(v.uuid, p); + denc(v.free_block_count, p); + denc(v.alloc_area_size, p); + denc(v.start_alloc_area, p); + denc(v.start_data_area, p); + denc(v.flag, p); + denc(v.feature, p); + denc(v.device_id, p); + + denc(v.crc, p); + DENC_FINISH(p); + } + +}; + +struct rbm_bitmap_block_header_t { + uint32_t size; + checksum_t checksum; + DENC(rbm_bitmap_block_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.checksum, p); + DENC_FINISH(p); + } +}; + +std::ostream &operator<<(std::ostream &out, const rbm_metadata_header_t &header); +std::ostream &operator<<(std::ostream &out, const rbm_bitmap_block_header_t &header); + +enum class 
bitmap_op_types_t : uint8_t { + ALL_CLEAR = 1, + ALL_SET = 2 +}; + +struct rbm_bitmap_block_t { + rbm_bitmap_block_header_t header; + bufferlist buf; + + uint64_t get_size() { + return header.size; + } + void set_crc() { + header.checksum = buf.crc32c(-1); + } + + bool is_correct_crc() { + ceph_assert(buf.length()); + return buf.crc32c(-1) == header.checksum; + } + + void set_bit(uint64_t nr) { + ceph_assert(buf.length()); + char mask = BIT_CHAR_MASK(nr); + char *p = buf.c_str() + (nr / BITS_PER_CHAR); + *p |= mask; + } + + void set_all_bits() { + ceph_assert(buf.length()); + ::memset(buf.c_str(), std::numeric_limits::max(), buf.length()); + } + + void set_clear_bits() { + ceph_assert(buf.length()); + ::memset(buf.c_str(), 0, buf.length()); + } + + void clear_bit(uint64_t nr) { + ceph_assert(buf.length()); + char mask = ~BIT_CHAR_MASK(nr); + char *p = buf.c_str() + (nr / BITS_PER_CHAR); + *p &= mask; + } + + bool is_allocated(uint64_t nr) { + ceph_assert(buf.length()); + char mask = BIT_CHAR_MASK(nr); + char *p = buf.c_str() + (nr / BITS_PER_CHAR); + return *p & mask; + } + + rbm_bitmap_block_t(size_t size) { + header.size = size; + } + + rbm_bitmap_block_t() = default; + + DENC(rbm_bitmap_block_t, v, p) { + DENC_START(1, 1, p); + denc(v.header, p); + denc(v.buf, p); + DENC_FINISH(p); + } +}; + +} + +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::rbm_metadata_header_t +) +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::rbm_bitmap_block_t +) +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::rbm_bitmap_block_header_t +) + +namespace crimson::os::seastore { + +class BlockRBManager final : public RandomBlockManager { +public: + /* + * Ondisk layout + * + * --------------------------------------------------------------------------- + * | rbm_metadata_header_t | rbm_bitmap_block_t 1 | ... | data blocks | + * --------------------------------------------------------------------------- + */ + + mkfs_ertr::future<> mkfs(mkfs_config_t) final; + read_ertr::future<> read(paddr_t addr, bufferptr &buffer) final; + write_ertr::future<> write(paddr_t addr, bufferptr &buf) final; + open_ertr::future<> open(const std::string &path, paddr_t start) final; + close_ertr::future<> close() final; + + /* + * alloc_extent + * + * The role of this function is to find out free blocks the transaction requires. + * To do so, alloc_extent() looks into both in-memory allocator + * and freebitmap blocks. + * But, in-memory allocator is the future work, and is not implemented yet, + * we use freebitmap directly to allocate freeblocks for now. + * + * Each bit in freebitmap block represents whether a block is allocated or not. 
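// A self-contained sketch of the bit arithmetic rbm_bitmap_block_t uses: bit
// `nr` lives in byte nr / 8 under mask char(1) << (nr % 8), exactly as in
// BIT_CHAR_MASK(); here the payload is a plain std::vector<char> instead of a
// bufferlist.
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr uint32_t bits_per_char = 8;

inline char bit_char_mask(uint64_t nr) {
  return char(1) << (nr % bits_per_char);
}

int main() {
  std::vector<char> bitmap(4096, 0);                   // one bitmap payload, all free

  uint64_t nr = 100;
  bitmap[nr / bits_per_char] |= bit_char_mask(nr);     // set_bit(nr)
  bool allocated = bitmap[nr / bits_per_char] & bit_char_mask(nr);  // is_allocated(nr)
  std::printf("bit %llu allocated: %d\n", (unsigned long long)nr, (int)allocated);

  bitmap[nr / bits_per_char] &= ~bit_char_mask(nr);    // clear_bit(nr)
  std::printf("after clear: %d\n",
              (int)((bitmap[nr / bits_per_char] & bit_char_mask(nr)) != 0));
  return 0;
}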
+ * + * TODO: multiple allocation + * + */ + allocate_ret alloc_extent( + Transaction &t, size_t size) final; // allocator, return blocks + + /* + * free_extent + * + * add a range of free blocks to transaction + * + */ + abort_allocation_ertr::future<> abort_allocation(Transaction &t) final; + write_ertr::future<> complete_allocation(Transaction &t) final; + + open_ertr::future<> _open_device(const std::string path); + read_ertr::future read_rbm_header(rbm_abs_addr addr); + write_ertr::future<> write_rbm_header(); + + size_t get_size() const final { return super.size; }; + size_t get_block_size() const final { return super.block_size; } + + // max block number a block can represent using bitmap + uint64_t max_block_by_bitmap_block() { + return (super.block_size - ceph::encoded_sizeof_bounded()) * 8; + } + + uint64_t convert_block_no_to_bitmap_block(blk_no_t block_no) + { + ceph_assert(super.block_size); + return block_no / max_block_by_bitmap_block(); + } + + /* + * convert_bitmap_block_no_to_block_id + * + * return block id using address where freebitmap is stored and offset + */ + blk_no_t convert_bitmap_block_no_to_block_id(uint64_t offset, rbm_abs_addr addr) + { + ceph_assert(super.block_size); + // freebitmap begins at block 1 + return (addr / super.block_size - 1) * max_block_by_bitmap_block() + offset; + } + + uint64_t get_alloc_area_size() { + ceph_assert(super.size); + ceph_assert(super.block_size); + uint64_t total_block_num = super.size / super.block_size; + uint64_t need_blocks = (total_block_num % max_block_by_bitmap_block()) ? + (total_block_num / max_block_by_bitmap_block() + 1) : + (total_block_num / max_block_by_bitmap_block()); + ceph_assert(need_blocks); + return need_blocks * super.block_size; + } + + using find_block_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::enoent>; + using find_block_ret = find_block_ertr::future>; + /* + * find_free_block + * + * Try to find free blocks by reading bitmap blocks on the disk sequentially + * The free blocks will be added to allocated_blocks in Transaction. + * This needs to be improved after in-memory block allocation is introduced. + * + */ + find_block_ret find_free_block(Transaction &t, size_t size); + + /* + * rbm_sync_block_bitmap + * + * Write rbm_bitmap_block_t to the device + * + * @param rbm_bitmap_block_t + * @param uint64_t the block number the rbm_bitmap_block_t will be stored + * + */ + write_ertr::future<> rbm_sync_block_bitmap( + rbm_bitmap_block_t &block, blk_no_t block_no); + + using check_bitmap_blocks_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg>; + check_bitmap_blocks_ertr::future<> check_bitmap_blocks(); + uint64_t get_free_blocks() const { + return super.free_block_count; + } + /* + * We will have mulitple partitions (circularjournals and randbomblockmanagers) + * on a device, so start and end location of the device are needed to + * support such case. + */ + BlockRBManager(NVMeBlockDevice * device, std::string path) + : device(device), path(path) {} + + /* + * bitmap block area (freebitmap) layout + * + * ----------------------------------------------------------- + * | header 1 | bitmap 1 | header 2 | bitmap 2 | + * ----------------------------------------------------------- + * <-- 1 block --> <-- 1 block --> + * + * 1 block contains both bitmap header and bitmap. + * We use this layout as a default layout here. + * But, we'll consider to exploit end to end data protection. 
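// A small sketch of the mapping implied by the bitmap-area layout described
// below: because each on-disk bitmap block spends a few bytes on
// rbm_bitmap_block_header_t, it can track (block_size - header_bytes) * 8
// data blocks, and the freebitmap starts one block after the superblock, as
// in max_block_by_bitmap_block() and convert_bitmap_block_no_to_block_id().
// header_bytes and block_size are assumed example values.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t block_size   = 4096;  // assumed super.block_size
  const uint64_t header_bytes = 16;    // assumed encoded header size
  const uint64_t bits_per_bb  = (block_size - header_bytes) * 8;

  // bitmap block at absolute address `addr`, bit offset `off` within it
  uint64_t addr = 3 * block_size;      // third device block (freebitmap begins at block 1)
  uint64_t off  = 42;
  uint64_t block_id = (addr / block_size - 1) * bits_per_bb + off;

  std::printf("bitmap block @%llu, bit %llu -> data block %llu (%llu blocks per bitmap block)\n",
              (unsigned long long)addr, (unsigned long long)off,
              (unsigned long long)block_id, (unsigned long long)bits_per_bb);
  return 0;
}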
+ * If we use the end to end data protection, which is a feature specified in NVMe, + * we can avoid any calculation for checksum. The checksum regarding the block + * will be managed by the NVMe device. + * + */ + mkfs_ertr::future<> initialize_blk_alloc_area(); + uint64_t get_start_block_alloc_area() { + return super.start_alloc_area; + } + + void alloc_rbm_bitmap_block_buf(rbm_bitmap_block_t &b_block) { + auto bitmap_blk = ceph::bufferptr(buffer::create_page_aligned( + super.block_size - + ceph::encoded_sizeof_bounded())); + bitmap_blk.zero(); + b_block.buf.append(bitmap_blk); + } + + rbm_abs_addr get_blk_paddr_by_block_no(blk_no_t id) { + return (id * super.block_size) + super.start; + } + + int num_block_between_blk_ids(blk_no_t start, blk_no_t end) { + auto max = max_block_by_bitmap_block(); + auto block_start = start / max; + auto block_end = end / max; + return block_end - block_start + 1; + } + + write_ertr::future<> rbm_sync_block_bitmap_by_range( + blk_no_t start, blk_no_t end, bitmap_op_types_t op); + void add_cont_bitmap_blocks_to_buf( + bufferlist& buf, int num_block, bitmap_op_types_t op) { + rbm_bitmap_block_t b_block(super.block_size); + alloc_rbm_bitmap_block_buf(b_block); + if (op == bitmap_op_types_t::ALL_SET) { + b_block.set_all_bits(); + } else { + b_block.set_clear_bits(); + } + for (int i = 0; i < num_block; i++) { + encode(b_block, buf); + } + } + + write_ertr::future<> write(rbm_abs_addr addr, bufferlist &bl); + write_ertr::future<> sync_allocation( + std::vector& alloc_blocks); + void add_free_extent( + std::vector& v, rbm_abs_addr from, size_t len); + + device_id_t get_device_id() const final { + return super.device_id; + } + +private: + /* + * this contains the number of bitmap blocks, free blocks and + * rbm specific information + */ + rbm_metadata_header_t super; + //FreelistManager free_manager; // TODO: block management + NVMeBlockDevice * device; + std::string path; + int stream_id; // for multi-stream +}; +using BlockRBManagerRef = std::unique_ptr; + +} diff --git a/src/crimson/os/seastore/random_block_manager/nvme_manager.cc b/src/crimson/os/seastore/random_block_manager/nvme_manager.cc deleted file mode 100644 index 51a8fde12eb8..000000000000 --- a/src/crimson/os/seastore/random_block_manager/nvme_manager.cc +++ /dev/null @@ -1,708 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include - -#include "crimson/os/seastore/logging.h" - -#include "include/buffer.h" -#include "nvmedevice.h" -#include "include/interval_set.h" -#include "include/intarith.h" -#include "nvme_manager.h" - -SET_SUBSYS(seastore_device); - -namespace crimson::os::seastore { - -NVMeManager::write_ertr::future<> NVMeManager::rbm_sync_block_bitmap( - rbm_bitmap_block_t &block, blk_no_t block_no) -{ - LOG_PREFIX(NVMeManager::rbm_sync_block_bitmap); - bufferptr bptr; - try { - bptr = bufferptr(ceph::buffer::create_page_aligned(block.get_size())); - bufferlist bl; - encode(block, bl); - auto iter = bl.cbegin(); - iter.copy(block.get_size(), bptr.c_str()); - } catch (const std::exception &e) { - DEBUG("rbm_sync_block_bitmap: exception creating aligned buffer {}", e); - ceph_assert(0 == "unhandled exception"); - } - uint64_t bitmap_block_no = convert_block_no_to_bitmap_block(block_no); - return device->write(super.start_alloc_area + - bitmap_block_no * super.block_size, - bptr); -} - -NVMeManager::mkfs_ertr::future<> NVMeManager::initialize_blk_alloc_area() -{ - LOG_PREFIX(NVMeManager::initialize_blk_alloc_area); - 
auto start = super.start_data_area / super.block_size; - DEBUG("initialize_alloc_area: start to read at {} ", start); - - /* write allocated bitmap info to rbm meta block */ - rbm_bitmap_block_t b_block(super.block_size); - alloc_rbm_bitmap_block_buf(b_block); - for (uint64_t i = 0; i < start; i++) { - b_block.set_bit(i); - } - - // CRC calculation is offloaded to NVMeDevice if data protection is enabled. - if (device->is_data_protection_enabled() == false) { - b_block.set_crc(); - } - - return seastar::do_with( - b_block, - [this, start, FNAME](auto &b_block) { - return rbm_sync_block_bitmap(b_block, - super.start_alloc_area / super.block_size - ).safe_then([this, &b_block, start, FNAME]() { - - /* initialize bitmap blocks as unused */ - auto max = max_block_by_bitmap_block(); - auto max_block = super.size / super.block_size; - blk_no_t end = round_up_to(max_block, max) - 1; - DEBUG("init start {} end {} ", start, end); - return rbm_sync_block_bitmap_by_range( - start, - end, - bitmap_op_types_t::ALL_CLEAR - ).safe_then([this, &b_block, FNAME]() { - /* - * Set rest of the block bitmap, which is not used, to 1 - * To do so, we only mark 1 to empty bitmap blocks - */ - uint64_t na_block_no = super.size/super.block_size; - uint64_t remain_block = na_block_no % max_block_by_bitmap_block(); - DEBUG("na_block_no: {}, remain_block: {} ", - na_block_no, remain_block); - if (remain_block) { - DEBUG("try to remained write alloc info "); - if (na_block_no > max_block_by_bitmap_block()) { - b_block.buf.clear(); - alloc_rbm_bitmap_block_buf(b_block); - } - for (uint64_t i = remain_block; i < max_block_by_bitmap_block(); i++) { - b_block.set_bit(i); - } - b_block.set_crc(); - return rbm_sync_block_bitmap(b_block, na_block_no - ).handle_error( - mkfs_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error rbm_sync_block_bitmap to update \ - last bitmap block in NVMeManager::initialize_blk_alloc_area" - } - ); - } - return mkfs_ertr::now(); - }).handle_error( - mkfs_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error rbm_sync_block_bitmap \ - in NVMeManager::initialize_blk_alloc_area" - } - ); - }).handle_error( - mkfs_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error rbm_sync_block_bitmap_by_range \ - in NVMeManager::initialize_blk_alloc_area" - } - ); - }); -} - -NVMeManager::mkfs_ertr::future<> NVMeManager::mkfs(mkfs_config_t config) -{ - LOG_PREFIX(NVMeManager::mkfs); - DEBUG("path {}", path); - return _open_device(path).safe_then([this, &config, FNAME]() { - rbm_abs_addr addr = convert_paddr_to_abs_addr( - config.start); - return read_rbm_header(addr).safe_then([FNAME](auto super) { - DEBUG("already exists "); - return mkfs_ertr::now(); - }).handle_error( - crimson::ct_error::enoent::handle([this, &config, FNAME](auto) { - super.uuid = uuid_d(); // TODO - super.magic = 0xFF; // TODO - super.start = convert_paddr_to_abs_addr( - config.start); - super.end = convert_paddr_to_abs_addr( - config.end); - super.block_size = config.block_size; - super.size = config.total_size; - super.free_block_count = config.total_size/config.block_size - 2; - super.alloc_area_size = get_alloc_area_size(); - super.start_alloc_area = RBM_SUPERBLOCK_SIZE; - super.start_data_area = - super.start_alloc_area + super.alloc_area_size; - super.crc = 0; - super.feature |= RBM_BITMAP_BLOCK_CRC; - super.device_id = config.device_id; - - DEBUG(" super {} ", super); - // write super block - return write_rbm_header().safe_then([this] { - return initialize_blk_alloc_area(); - 
}).handle_error( - mkfs_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error write_rbm_header in NVMeManager::mkfs" - }); - }), - mkfs_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error read_rbm_header in NVMeManager::mkfs" - } - ); - }).safe_then([this]() { - if (device) { - return device->close( - ).safe_then([]() { - return mkfs_ertr::now(); - }); - } - return mkfs_ertr::now(); - }).handle_error( - mkfs_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error open_device in NVMeManager::mkfs" - }); -} - -NVMeManager::find_block_ret NVMeManager::find_free_block(Transaction &t, size_t size) -{ - LOG_PREFIX(NVMeManager::find_free_block); - auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); - return seastar::do_with(uint64_t(0), - uint64_t(super.start_alloc_area), - interval_set(), - bp, - [&, this, FNAME](auto &allocated, auto &addr, auto &alloc_extent, auto &bp) mutable { - return crimson::repeat( - [&, this, FNAME]() mutable { - return device->read( - addr, - bp - ).safe_then( - [&bp, &addr, size, &allocated, &alloc_extent, this, FNAME]() mutable { - DEBUG("find_free_list: allocate {}, addr {}", allocated, addr); - rbm_bitmap_block_t b_block(super.block_size); - bufferlist bl_bitmap_block; - bl_bitmap_block.append(bp); - decode(b_block, bl_bitmap_block); - auto max = max_block_by_bitmap_block(); - for (uint64_t i = 0; - i < max && (uint64_t)size/super.block_size > allocated; i++) { - auto block_id = convert_bitmap_block_no_to_block_id(i, addr); - if (b_block.is_allocated(i)) { - continue; - } - DEBUG("find_free_list: allocated block no {} i {}", - convert_bitmap_block_no_to_block_id(i, addr), i); - if (allocated != 0 && alloc_extent.range_end() != block_id) { - /* - * if not continous block, just restart to find continuous blocks - * at the next block. - * in-memory allocator can handle this efficiently. - */ - allocated = 0; - alloc_extent.clear(); // a range of block allocation - DEBUG("find_free_list: rety to find continuous blocks"); - continue; - } - allocated += 1; - alloc_extent.insert(block_id); - } - addr += super.block_size; - DEBUG("find_free_list: allocated: {} alloc_extent {}", - allocated, alloc_extent); - if (((uint64_t)size)/super.block_size == allocated) { - return seastar::stop_iteration::yes; - } else if (addr >= super.start_data_area) { - alloc_extent.clear(); - return seastar::stop_iteration::yes; - } - return seastar::stop_iteration::no; - }); - }).safe_then([&allocated, &alloc_extent, size, this, FNAME]() { - DEBUG(" allocated: {} size {} ", - allocated * super.block_size, size); - if (allocated * super.block_size < size) { - alloc_extent.clear(); - } - return find_block_ret( - find_block_ertr::ready_future_marker{}, - alloc_extent); - }).handle_error( - find_block_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error in NVMeManager::find_free_block" - } - ); - }); -} - -/* TODO : block allocator */ -NVMeManager::allocate_ret NVMeManager::alloc_extent( - Transaction &t, size_t size) -{ - - /* - * 1. find free blocks using block allocator - * 2. add free blocks to transaction - * (the free block is reserved state, not stored) - * 3. link free blocks to onode - * Due to in-memory block allocator is the next work to do, - * just read the block bitmap directly to find free blocks. 
- * - */ - LOG_PREFIX(NVMeManager::alloc_extent); - return find_free_block(t, size - ).safe_then([this, FNAME](auto alloc_extent) mutable - -> allocate_ertr::future { - DEBUG("after find_free_block: allocated {}", alloc_extent); - if (alloc_extent.empty()) { - return crimson::ct_error::enospc::make(); - } - paddr_t paddr = convert_abs_addr_to_paddr( - alloc_extent.range_start() * super.block_size, - super.device_id); - return allocate_ret( - allocate_ertr::ready_future_marker{}, - paddr); - }).handle_error( - allocate_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error find_free_block in NVMeManager::alloc_extent" - } - ); -} - -void NVMeManager::add_free_extent( - std::vector& v, rbm_abs_addr from, size_t len) -{ - ceph_assert(!(len % super.block_size)); - paddr_t paddr = convert_abs_addr_to_paddr( - from, - super.device_id); - alloc_delta_t alloc_info; - alloc_info.alloc_blk_ranges.emplace_back( - paddr, L_ADDR_NULL, len, extent_types_t::ROOT); - alloc_info.op = alloc_delta_t::op_types_t::CLEAR; - v.push_back(alloc_info); -} - -NVMeManager::write_ertr::future<> NVMeManager::rbm_sync_block_bitmap_by_range( - blk_no_t start, blk_no_t end, bitmap_op_types_t op) -{ - LOG_PREFIX(NVMeManager::rbm_sync_block_bitmap_by_range); - auto addr = super.start_alloc_area + - (start / max_block_by_bitmap_block()) - * super.block_size; - // aligned write - if (start % max_block_by_bitmap_block() == 0 && - end % (max_block_by_bitmap_block() - 1) == 0) { - auto num_block = num_block_between_blk_ids(start, end); - bufferlist bl_bitmap_block; - add_cont_bitmap_blocks_to_buf(bl_bitmap_block, num_block, op); - return write( - addr, - bl_bitmap_block); - } - auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); - // try to read first block, then check the block is aligned - return device->read( - addr, - bp - ).safe_then([bp, start, end, op, addr, this, FNAME]() { - rbm_bitmap_block_t b_block(super.block_size); - bufferlist bl_bitmap_block; - bl_bitmap_block.append(bp); - decode(b_block, bl_bitmap_block); - auto max = max_block_by_bitmap_block(); - auto loop_end = end < (start / max + 1) * max ? 
- end % max : max - 1; - for (uint64_t i = (start % max); i <= loop_end; i++) { - if (op == bitmap_op_types_t::ALL_SET) { - b_block.set_bit(i); - } else { - b_block.clear_bit(i); - } - } - auto num_block = num_block_between_blk_ids(start, end); - DEBUG("rbm_sync_block_bitmap_by_range: start {}, end {}, \ - loop_end {}, num_block {}", - start, end, loop_end, num_block); - - bl_bitmap_block.clear(); - encode(b_block, bl_bitmap_block); - if (num_block == 1) { - // | front (unaligned) | - return write( - addr, - bl_bitmap_block); - } else if (!((end + 1) % max)) { - // | front (unaligned) | middle (aligned) | - add_cont_bitmap_blocks_to_buf(bl_bitmap_block, num_block - 1, op); - DEBUG("partially aligned write: addr {} length {}", - addr, bl_bitmap_block.length()); - return write( - addr, - bl_bitmap_block); - } else if (num_block > 2) { - // | front (unaligned) | middle | end (unaligned) | - // fill up the middle - add_cont_bitmap_blocks_to_buf(bl_bitmap_block, num_block - 2, op); - } - - auto next_addr = super.start_alloc_area + - (end / max_block_by_bitmap_block()) - * super.block_size; - auto bptr = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); - // | front (unaligned) | middle | end (unaligned) | or - // | front (unaligned) | end (unaligned) | - return device->read( - next_addr, - bptr - ).safe_then( - [bptr, bl_bitmap_block, end, op, addr, this, FNAME]() mutable { - rbm_bitmap_block_t b_block(super.block_size); - bufferlist block; - block.append(bptr); - decode(b_block, block); - auto max = max_block_by_bitmap_block(); - for (uint64_t i = (end - (end % max)) % max; - i <= (end % max); i++) { - if (op == bitmap_op_types_t::ALL_SET) { - b_block.set_bit(i); - } else { - b_block.clear_bit(i); - } - } - DEBUG("start {} end {} ", end - (end % max), end); - bl_bitmap_block.claim_append(block); - return write( - addr, - bl_bitmap_block); - }).handle_error( - write_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error in NVMeManager::rbm_sync_block_bitmap_by_range" - } - ); - }).handle_error( - write_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error in NVMeManager::rbm_sync_block_bitmap_by_range" - } - ); -} - -NVMeManager::abort_allocation_ertr::future<> NVMeManager::abort_allocation( - Transaction &t) -{ - /* - * TODO: clear all allocation infos associated with transaction in in-memory allocator - */ - return abort_allocation_ertr::now(); -} - -NVMeManager::write_ertr::future<> NVMeManager::complete_allocation( - Transaction &t) -{ - return write_ertr::now(); -} - -NVMeManager::write_ertr::future<> NVMeManager::sync_allocation( - std::vector &alloc_blocks) -{ - LOG_PREFIX(NVMeManager::sync_allocation); - if (alloc_blocks.empty()) { - return write_ertr::now(); - } - return seastar::do_with(move(alloc_blocks), - [&, this, FNAME](auto &alloc_blocks) mutable { - return crimson::do_for_each(alloc_blocks, - [this, FNAME](auto &alloc) { - return crimson::do_for_each(alloc.alloc_blk_ranges, - [this, &alloc, FNAME](auto &range) -> write_ertr::future<> { - DEBUG("range {} ~ {}", range.paddr, range.len); - bitmap_op_types_t op = - (alloc.op == alloc_delta_t::op_types_t::SET) ? 
- bitmap_op_types_t::ALL_SET : - bitmap_op_types_t::ALL_CLEAR; - rbm_abs_addr addr = convert_paddr_to_abs_addr( - range.paddr); - blk_no_t start = addr / super.block_size; - blk_no_t end = start + - (round_up_to(range.len, super.block_size)) / super.block_size - - 1; - return rbm_sync_block_bitmap_by_range( - start, - end, - op); - }); - }).safe_then([this, &alloc_blocks, FNAME]() mutable { - int alloc_block_count = 0; - for (const auto& b : alloc_blocks) { - for (auto r : b.alloc_blk_ranges) { - if (b.op == alloc_delta_t::op_types_t::SET) { - alloc_block_count += - round_up_to(r.len, super.block_size) / super.block_size; - DEBUG("complete alloc block: start {} len {} ", - r.paddr, r.len); - } else { - alloc_block_count -= - round_up_to(r.len, super.block_size) / super.block_size; - DEBUG("complete alloc block: start {} len {} ", - r.paddr, r.len); - } - } - } - DEBUG("complete_alloction: complete to allocate {} blocks", - alloc_block_count); - super.free_block_count -= alloc_block_count; - return write_ertr::now(); - }); - }); -} - -NVMeManager::open_ertr::future<> NVMeManager::open( - const std::string &path, paddr_t paddr) -{ - LOG_PREFIX(NVMeManager::open); - DEBUG("open: path{}", path); - rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); - return _open_device(path - ).safe_then([this, addr]() { - return read_rbm_header(addr).safe_then([&](auto s) - -> open_ertr::future<> { - if (s.magic != 0xFF) { - return crimson::ct_error::enoent::make(); - } - super = s; - return check_bitmap_blocks().safe_then([]() { - return open_ertr::now(); - }); - }).handle_error( - open_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error read_rbm_header in NVMeManager::open" - } - ); - }); -} - -NVMeManager::write_ertr::future<> NVMeManager::write( - paddr_t paddr, - bufferptr &bptr) -{ - ceph_assert(device); - rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); - if (addr > super.end || addr < super.start || - bptr.length() > super.end - super.start) { - return crimson::ct_error::erange::make(); - } - return device->write( - addr, - bptr); -} - -NVMeManager::read_ertr::future<> NVMeManager::read( - paddr_t paddr, - bufferptr &bptr) -{ - ceph_assert(device); - rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); - if (addr > super.end || addr < super.start || - bptr.length() > super.end - super.start) { - return crimson::ct_error::erange::make(); - } - return device->read( - addr, - bptr); -} - -NVMeManager::close_ertr::future<> NVMeManager::close() -{ - ceph_assert(device); - return device->close(); -} - -NVMeManager::open_ertr::future<> NVMeManager::_open_device( - const std::string path) -{ - ceph_assert(device); - return device->open(path, seastar::open_flags::rw); -} - -NVMeManager::write_ertr::future<> NVMeManager::write_rbm_header() -{ - bufferlist meta_b_header; - super.crc = 0; - encode(super, meta_b_header); - // If NVMeDevice supports data protection, CRC for checksum is not required - // NVMeDevice is expected to generate and store checksum internally. - // CPU overhead for CRC might be saved. 
- if (device->is_data_protection_enabled()) { - super.crc = -1; - } - else { - super.crc = meta_b_header.crc32c(-1); - } - - bufferlist bl; - encode(super, bl); - auto iter = bl.begin(); - auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); - assert(bl.length() < super.block_size); - iter.copy(bl.length(), bp.c_str()); - - return device->write(super.start, bp); -} - -NVMeManager::read_ertr::future NVMeManager::read_rbm_header( - rbm_abs_addr addr) -{ - LOG_PREFIX(NVMeManager::read_rbm_header); - ceph_assert(device); - bufferptr bptr = - bufferptr(ceph::buffer::create_page_aligned(RBM_SUPERBLOCK_SIZE)); - bptr.zero(); - return device->read( - addr, - bptr - ).safe_then([length=bptr.length(), this, bptr, FNAME]() - -> read_ertr::future { - bufferlist bl; - bl.append(bptr); - auto p = bl.cbegin(); - rbm_metadata_header_t super_block; - try { - decode(super_block, p); - } - catch (ceph::buffer::error& e) { - DEBUG("read_rbm_header: unable to decode rbm super block {}", - e.what()); - return crimson::ct_error::enoent::make(); - } - checksum_t crc = super_block.crc; - bufferlist meta_b_header; - super_block.crc = 0; - encode(super_block, meta_b_header); - - // Do CRC verification only if data protection is not supported. - if (device->is_data_protection_enabled() == false) { - if (meta_b_header.crc32c(-1) != crc) { - DEBUG("bad crc on super block, expected {} != actual {} ", - meta_b_header.crc32c(-1), crc); - return crimson::ct_error::input_output_error::make(); - } - } - DEBUG("got {} ", super); - return read_ertr::future( - read_ertr::ready_future_marker{}, - super_block - ); - }).handle_error( - read_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error in NVMeManager::read_rbm_header" - } - ); -} - -NVMeManager::check_bitmap_blocks_ertr::future<> NVMeManager::check_bitmap_blocks() -{ - LOG_PREFIX(NVMeManager::check_bitmap_blocks); - auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); - return seastar::do_with(uint64_t(super.start_alloc_area), uint64_t(0), bp, - [&, this, FNAME](auto &addr, auto &free_blocks, auto &bp) mutable { - return crimson::repeat([&, this, FNAME]() mutable { - return device->read(addr, bp - ).safe_then( - [&bp, &addr, &free_blocks, this, FNAME]() mutable { - DEBUG("verify_bitmap_blocks: addr {}", addr); - rbm_bitmap_block_t b_block(super.block_size); - bufferlist bl_bitmap_block; - bl_bitmap_block.append(bp); - decode(b_block, bl_bitmap_block); - auto max = max_block_by_bitmap_block(); - for (uint64_t i = 0; i < max; i++) { - if (!b_block.is_allocated(i)) { - free_blocks++; - } - } - addr += super.block_size; - if (addr >= super.start_data_area) { - return seastar::stop_iteration::yes; - } - return seastar::stop_iteration::no; - }); - }).safe_then([&free_blocks, this, FNAME]() { - DEBUG("free_blocks: {} ", free_blocks); - super.free_block_count = free_blocks; - return check_bitmap_blocks_ertr::now(); - }).handle_error( - check_bitmap_blocks_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error in NVMeManager::find_free_block" - } - ); - }); -} - -NVMeManager::write_ertr::future<> NVMeManager::write( - rbm_abs_addr addr, - bufferlist &bl) -{ - LOG_PREFIX(NVMeManager::write); - ceph_assert(device); - bufferptr bptr; - try { - bptr = bufferptr(ceph::buffer::create_page_aligned(bl.length())); - auto iter = bl.cbegin(); - iter.copy(bl.length(), bptr.c_str()); - } catch (const std::exception &e) { - DEBUG("write: exception creating aligned buffer {}", e); - ceph_assert(0 == "unhandled 
exception"); - } - return device->write( - addr, - bptr); -} - -std::ostream &operator<<(std::ostream &out, const rbm_metadata_header_t &header) -{ - out << " rbm_metadata_header_t(size=" << header.size - << ", block_size=" << header.block_size - << ", start=" << header.start - << ", end=" << header.end - << ", magic=" << header.magic - << ", uuid=" << header.uuid - << ", free_block_count=" << header.free_block_count - << ", alloc_area_size=" << header.alloc_area_size - << ", start_alloc_area=" << header.start_alloc_area - << ", start_data_area=" << header.start_data_area - << ", flag=" << header.flag - << ", feature=" << header.feature - << ", crc=" << header.crc; - return out << ")"; -} - -std::ostream &operator<<(std::ostream &out, - const rbm_bitmap_block_header_t &header) -{ - out << " rbm_bitmap_block_header_t(size=" << header.size - << ", checksum=" << header.checksum; - return out << ")"; -} - -} diff --git a/src/crimson/os/seastore/random_block_manager/nvme_manager.h b/src/crimson/os/seastore/random_block_manager/nvme_manager.h deleted file mode 100644 index 20f0087a7275..000000000000 --- a/src/crimson/os/seastore/random_block_manager/nvme_manager.h +++ /dev/null @@ -1,379 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include - -#include -#include -#include - -#include "include/ceph_assert.h" -#include "crimson/os/seastore/seastore_types.h" -#include "include/buffer_fwd.h" -#include "crimson/osd/exceptions.h" - -#include "crimson/os/seastore/transaction.h" -#include "nvmedevice.h" -#include "crimson/os/seastore/random_block_manager.h" - -#include "crimson/common/layout.h" -#include "include/buffer.h" -#include "include/uuid.h" - -namespace crimson::os::seastore { - -constexpr uint32_t RBM_SUPERBLOCK_SIZE = 4096; - -using NVMeBlockDevice = nvme_device::NVMeBlockDevice; -using NVMeBlockDeviceRef = std::unique_ptr; - -enum { - // TODO: This allows the device to manage crc on a block by itself - RBM_NVME_END_TO_END_PROTECTION = 1, - RBM_BITMAP_BLOCK_CRC = 2, -}; - -constexpr uint32_t BITS_PER_CHAR = 8; -inline char BIT_CHAR_MASK(uint64_t nr) -{ - return (char(1) << (nr % BITS_PER_CHAR)); -} - -struct rbm_metadata_header_t { - size_t size = 0; - size_t block_size = 0; - uint64_t start; // start location of the device - uint64_t end; // end location of the device - uint64_t magic; // to indicate randomblock_manager - uuid_d uuid; - uint64_t free_block_count; - uint64_t alloc_area_size; // bitmap - uint32_t start_alloc_area; // block number - uint32_t start_data_area; - uint64_t flag; // reserved - uint64_t feature; - device_id_t device_id; - checksum_t crc; - - DENC(rbm_metadata_header_t, v, p) { - DENC_START(1, 1, p); - denc(v.size, p); - denc(v.block_size, p); - denc(v.start, p); - denc(v.end, p); - denc(v.magic, p); - denc(v.uuid, p); - denc(v.free_block_count, p); - denc(v.alloc_area_size, p); - denc(v.start_alloc_area, p); - denc(v.start_data_area, p); - denc(v.flag, p); - denc(v.feature, p); - denc(v.device_id, p); - - denc(v.crc, p); - DENC_FINISH(p); - } - -}; - -struct rbm_bitmap_block_header_t { - uint32_t size; - checksum_t checksum; - DENC(rbm_bitmap_block_header_t, v, p) { - DENC_START(1, 1, p); - denc(v.size, p); - denc(v.checksum, p); - DENC_FINISH(p); - } -}; - -std::ostream &operator<<(std::ostream &out, const rbm_metadata_header_t &header); -std::ostream &operator<<(std::ostream &out, const rbm_bitmap_block_header_t &header); - -enum class bitmap_op_types_t : uint8_t { - ALL_CLEAR = 1, - 
-  ALL_SET = 2
-};
-
-struct rbm_bitmap_block_t {
-  rbm_bitmap_block_header_t header;
-  bufferlist buf;
-
-  uint64_t get_size() {
-    return header.size;
-  }
-  void set_crc() {
-    header.checksum = buf.crc32c(-1);
-  }
-
-  bool is_correct_crc() {
-    ceph_assert(buf.length());
-    return buf.crc32c(-1) == header.checksum;
-  }
-
-  void set_bit(uint64_t nr) {
-    ceph_assert(buf.length());
-    char mask = BIT_CHAR_MASK(nr);
-    char *p = buf.c_str() + (nr / BITS_PER_CHAR);
-    *p |= mask;
-  }
-
-  void set_all_bits() {
-    ceph_assert(buf.length());
-    ::memset(buf.c_str(), std::numeric_limits::max(), buf.length());
-  }
-
-  void set_clear_bits() {
-    ceph_assert(buf.length());
-    ::memset(buf.c_str(), 0, buf.length());
-  }
-
-  void clear_bit(uint64_t nr) {
-    ceph_assert(buf.length());
-    char mask = ~BIT_CHAR_MASK(nr);
-    char *p = buf.c_str() + (nr / BITS_PER_CHAR);
-    *p &= mask;
-  }
-
-  bool is_allocated(uint64_t nr) {
-    ceph_assert(buf.length());
-    char mask = BIT_CHAR_MASK(nr);
-    char *p = buf.c_str() + (nr / BITS_PER_CHAR);
-    return *p & mask;
-  }
-
-  rbm_bitmap_block_t(size_t size) {
-    header.size = size;
-  }
-
-  rbm_bitmap_block_t() = default;
-
-  DENC(rbm_bitmap_block_t, v, p) {
-    DENC_START(1, 1, p);
-    denc(v.header, p);
-    denc(v.buf, p);
-    DENC_FINISH(p);
-  }
-};
-
-}
-
-WRITE_CLASS_DENC_BOUNDED(
-  crimson::os::seastore::rbm_metadata_header_t
-)
-WRITE_CLASS_DENC_BOUNDED(
-  crimson::os::seastore::rbm_bitmap_block_t
-)
-WRITE_CLASS_DENC_BOUNDED(
-  crimson::os::seastore::rbm_bitmap_block_header_t
-)
-
-namespace crimson::os::seastore {
-
-class NVMeManager final : public RandomBlockManager {
-public:
-  /*
-   * Ondisk layout
-   *
-   * ---------------------------------------------------------------------------
-   * | rbm_metadata_header_t | rbm_bitmap_block_t 1 |  ...  | data blocks      |
-   * ---------------------------------------------------------------------------
-   */
-
-  mkfs_ertr::future<> mkfs(mkfs_config_t) final;
-  read_ertr::future<> read(paddr_t addr, bufferptr &buffer) final;
-  write_ertr::future<> write(paddr_t addr, bufferptr &buf) final;
-  open_ertr::future<> open(const std::string &path, paddr_t start) final;
-  close_ertr::future<> close() final;
-
-  /*
-   * alloc_extent
-   *
-   * The role of this function is to find free blocks that the transaction requires.
-   * To do so, alloc_extent() looks into both the in-memory allocator
-   * and the freebitmap blocks.
-   * However, the in-memory allocator is future work and is not implemented yet,
-   * so we use the freebitmap directly to allocate free blocks for now.
-   *
-   * Each bit in a freebitmap block represents whether a block is allocated or not.
-   *
-   * TODO: multiple allocation
-   *
-   */
-  allocate_ret alloc_extent(
-    Transaction &t, size_t size) final; // allocator, return blocks
-
-  /*
-   * free_extent
-   *
-   * add a range of free blocks to the transaction
-   *
-   */
-  abort_allocation_ertr::future<> abort_allocation(Transaction &t) final;
-  write_ertr::future<> complete_allocation(Transaction &t) final;
-
-  open_ertr::future<> _open_device(const std::string path);
-  read_ertr::future read_rbm_header(rbm_abs_addr addr);
-  write_ertr::future<> write_rbm_header();
-
-  size_t get_size() const final { return super.size; };
-  size_t get_block_size() const final { return super.block_size; }
-
-  // max number of blocks a single bitmap block can represent
-  uint64_t max_block_by_bitmap_block() {
-    return (super.block_size - ceph::encoded_sizeof_bounded()) * 8;
-  }
-
-  uint64_t convert_block_no_to_bitmap_block(blk_no_t block_no)
-  {
-    ceph_assert(super.block_size);
-    return block_no / max_block_by_bitmap_block();
-  }
-
-  /*
-   * convert_bitmap_block_no_to_block_id
-   *
-   * return the block id using the address where the freebitmap is stored and the offset
-   */
-  blk_no_t convert_bitmap_block_no_to_block_id(uint64_t offset, rbm_abs_addr addr)
-  {
-    ceph_assert(super.block_size);
-    // freebitmap begins at block 1
-    return (addr / super.block_size - 1) * max_block_by_bitmap_block() + offset;
-  }
-
-  uint64_t get_alloc_area_size() {
-    ceph_assert(super.size);
-    ceph_assert(super.block_size);
-    uint64_t total_block_num = super.size / super.block_size;
-    uint64_t need_blocks = (total_block_num % max_block_by_bitmap_block()) ?
-                           (total_block_num / max_block_by_bitmap_block() + 1) :
-                           (total_block_num / max_block_by_bitmap_block());
-    ceph_assert(need_blocks);
-    return need_blocks * super.block_size;
-  }
-
-  using find_block_ertr = crimson::errorator<
-    crimson::ct_error::input_output_error,
-    crimson::ct_error::enoent>;
-  using find_block_ret = find_block_ertr::future>;
-  /*
-   * find_free_block
-   *
-   * Try to find free blocks by reading bitmap blocks on the disk sequentially.
-   * The free blocks will be added to allocated_blocks in Transaction.
-   * This needs to be improved after in-memory block allocation is introduced.
-   *
-   */
-  find_block_ret find_free_block(Transaction &t, size_t size);
-
-  /*
-   * rbm_sync_block_bitmap
-   *
-   * Write rbm_bitmap_block_t to the device
-   *
-   * @param rbm_bitmap_block_t
-   * @param uint64_t the block number where the rbm_bitmap_block_t will be stored
-   *
-   */
-  write_ertr::future<> rbm_sync_block_bitmap(
-    rbm_bitmap_block_t &block, blk_no_t block_no);
-
-  using check_bitmap_blocks_ertr = crimson::errorator<
-    crimson::ct_error::input_output_error,
-    crimson::ct_error::invarg>;
-  check_bitmap_blocks_ertr::future<> check_bitmap_blocks();
-  uint64_t get_free_blocks() const {
-    return super.free_block_count;
-  }
-  /*
-   * We will have multiple partitions (circular journals and random block managers)
-   * on a device, so the start and end locations of the device are needed to
-   * support such a case.
-   */
-  NVMeManager(NVMeBlockDevice * device, std::string path)
-    : device(device), path(path) {}
-
-  /*
-   * bitmap block area (freebitmap) layout
-   *
-   * -----------------------------------------------------------
-   * | header 1 | bitmap 1 | header 2 | bitmap 2 |
-   * -----------------------------------------------------------
-   * <--    1 block    --> <--    1 block    -->
-   *
-   * 1 block contains both a bitmap header and a bitmap.
-   * We use this layout as the default layout here.
-   * However, we'll consider exploiting end-to-end data protection.
-   * If we use end-to-end data protection, which is a feature specified in NVMe,
-   * we can avoid any checksum calculation. The checksum for the block
-   * will be managed by the NVMe device.
-   *
-   */
-  mkfs_ertr::future<> initialize_blk_alloc_area();
-  uint64_t get_start_block_alloc_area() {
-    return super.start_alloc_area;
-  }
-
-  void alloc_rbm_bitmap_block_buf(rbm_bitmap_block_t &b_block) {
-    auto bitmap_blk = ceph::bufferptr(buffer::create_page_aligned(
-      super.block_size -
-      ceph::encoded_sizeof_bounded()));
-    bitmap_blk.zero();
-    b_block.buf.append(bitmap_blk);
-  }
-
-  rbm_abs_addr get_blk_paddr_by_block_no(blk_no_t id) {
-    return (id * super.block_size) + super.start;
-  }
-
-  int num_block_between_blk_ids(blk_no_t start, blk_no_t end) {
-    auto max = max_block_by_bitmap_block();
-    auto block_start = start / max;
-    auto block_end = end / max;
-    return block_end - block_start + 1;
-  }
-
-  write_ertr::future<> rbm_sync_block_bitmap_by_range(
-    blk_no_t start, blk_no_t end, bitmap_op_types_t op);
-  void add_cont_bitmap_blocks_to_buf(
-    bufferlist& buf, int num_block, bitmap_op_types_t op) {
-    rbm_bitmap_block_t b_block(super.block_size);
-    alloc_rbm_bitmap_block_buf(b_block);
-    if (op == bitmap_op_types_t::ALL_SET) {
-      b_block.set_all_bits();
-    } else {
-      b_block.set_clear_bits();
-    }
-    for (int i = 0; i < num_block; i++) {
-      encode(b_block, buf);
-    }
-  }
-
-  write_ertr::future<> write(rbm_abs_addr addr, bufferlist &bl);
-  write_ertr::future<> sync_allocation(
-    std::vector& alloc_blocks);
-  void add_free_extent(
-    std::vector& v, rbm_abs_addr from, size_t len);
-
-  device_id_t get_device_id() const final {
-    return super.device_id;
-  }
-
-private:
-  /*
-   * this contains the number of bitmap blocks, free blocks and
-   * rbm-specific information
-   */
-  rbm_metadata_header_t super;
-  //FreelistManager free_manager; // TODO: block management
-  NVMeBlockDevice * device;
-  std::string path;
-  int stream_id; // for multi-stream
-};
-using NVMeManagerRef = std::unique_ptr<NVMeManager>;
-
-}
diff --git a/src/test/crimson/seastore/test_randomblock_manager.cc b/src/test/crimson/seastore/test_randomblock_manager.cc
index dafdcdee8ab6..8696828f2c72 100644
--- a/src/test/crimson/seastore/test_randomblock_manager.cc
+++ b/src/test/crimson/seastore/test_randomblock_manager.cc
@@ -6,7 +6,7 @@
 #include

 #include "crimson/common/log.h"
-#include "crimson/os/seastore/random_block_manager/nvme_manager.h"
+#include "crimson/os/seastore/random_block_manager/block_rb_manager.h"
 #include "crimson/os/seastore/random_block_manager/nvmedevice.h"
 #include "test/crimson/seastore/transaction_manager_test_state.h"

@@ -25,7 +25,7 @@ constexpr uint64_t DEFAULT_BLOCK_SIZE = 4096;

 struct rbm_test_t :
   public seastar_test_suite_t, TMTestState {
-  std::unique_ptr<NVMeManager> rbm_manager;
+  std::unique_ptr<BlockRBManager> rbm_manager;
   std::unique_ptr device;

   struct rbm_transaction {
@@ -54,7 +54,7 @@ struct rbm_test_t :

   seastar::future<> set_up_fut() final {
     device.reset(new nvme_device::TestMemory(DEFAULT_TEST_SIZE));
-    rbm_manager.reset(new NVMeManager(device.get(), std::string()));
+    rbm_manager.reset(new BlockRBManager(device.get(), std::string()));
     device_id_t d_id = 1 << (std::numeric_limits::digits - 1);
     config.start = paddr_t::make_blk_paddr(d_id, 0);
     config.end = paddr_t::make_blk_paddr(d_id, DEFAULT_TEST_SIZE);
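
For reference, the freebitmap arithmetic that the renamed BlockRBManager inherits from NVMeManager can be summarized in a small standalone sketch. It mirrors max_block_by_bitmap_block(), convert_block_no_to_bitmap_block(), get_alloc_area_size(), and BIT_CHAR_MASK() from the header shown above; the 4096-byte block size and 16-byte encoded bitmap-header size are illustrative assumptions, not values taken from the tree.

// Standalone illustration (not part of the patch) of the freebitmap arithmetic
// described above.  BLOCK_SIZE and ENCODED_HEADER_SIZE are assumed example
// values, not constants from the Ceph sources.
#include <cstdint>
#include <iostream>

constexpr uint64_t BLOCK_SIZE = 4096;          // assumed device block size
constexpr uint64_t ENCODED_HEADER_SIZE = 16;   // assumed encoded bitmap-header size
constexpr uint32_t BITS_PER_CHAR = 8;

// One bitmap block covers this many data blocks (one bit per block),
// mirroring max_block_by_bitmap_block().
constexpr uint64_t max_block_by_bitmap_block() {
  return (BLOCK_SIZE - ENCODED_HEADER_SIZE) * BITS_PER_CHAR;
}

// Which bitmap block holds the bit for a given block number,
// mirroring convert_block_no_to_bitmap_block().
constexpr uint64_t bitmap_block_of(uint64_t block_no) {
  return block_no / max_block_by_bitmap_block();
}

// Bytes of bitmap blocks (rounded up) needed to track every block on the
// device, mirroring get_alloc_area_size().
constexpr uint64_t alloc_area_size(uint64_t device_size) {
  const uint64_t total_blocks = device_size / BLOCK_SIZE;
  const uint64_t per_bitmap = max_block_by_bitmap_block();
  const uint64_t need_blocks = (total_blocks + per_bitmap - 1) / per_bitmap; // round up
  return need_blocks * BLOCK_SIZE;
}

// Byte offset and mask of a block's bit inside the bitmap payload,
// mirroring BIT_CHAR_MASK() / set_bit().
struct bit_pos_t {
  uint64_t byte;
  unsigned char mask;
};

constexpr bit_pos_t bit_position(uint64_t block_no) {
  const uint64_t nr = block_no % max_block_by_bitmap_block();
  return {nr / BITS_PER_CHAR,
          static_cast<unsigned char>(1u << (nr % BITS_PER_CHAR))};
}

int main() {
  const uint64_t block_no = 100000;
  const bit_pos_t pos = bit_position(block_no);
  std::cout << "blocks per bitmap block : " << max_block_by_bitmap_block() << "\n";
  std::cout << "alloc area for 1 GiB dev: " << alloc_area_size(uint64_t(1) << 30)
            << " bytes\n";
  std::cout << "block " << block_no << " -> bitmap block " << bitmap_block_of(block_no)
            << ", byte " << pos.byte << ", mask 0x" << std::hex
            << unsigned(pos.mask) << std::dec << "\n";
  return 0;
}

With these assumed sizes one bitmap block tracks (4096 - 16) * 8 = 32640 data blocks, which is where the round-up in get_alloc_area_size() comes from.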