From: Aravind Ramesh Date: Tue, 2 May 2023 05:55:37 +0000 (+0200) Subject: crimson/os/seastore: change zoned device interface name from ZNS to ZBD X-Git-Tag: v18.2.1~181^2~1 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=246c9fad7419736294f96e986cfef986b38a4f63;p=ceph-ci.git crimson/os/seastore: change zoned device interface name from ZNS to ZBD We can essentially support SMR devices(HDD) and ZNS device(SSDs) which are both subsets of Zoned Block Devices under the same interface ZNSSegmentManager with small changes in design. Hence changing the interface name to a more contextual name ZBDSegmentManager. This also helps to add SMR device support for crimson. Signed-off-by: Aravind Ramesh (cherry picked from commit 7fbdeabe38c361a3628e906ab1b9df2c98d62741) --- diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index 5b1c6187ca2..de11292084a 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -61,7 +61,7 @@ CMAKE_DEPENDENT_OPTION(WITH_ZNS "enable Linux ZNS support" OFF if(WITH_ZNS) find_package(LinuxZNS REQUIRED) list(APPEND crimson_seastore_srcs - segment_manager/zns.cc) + segment_manager/zbd.cc) endif() add_library(crimson-seastore STATIC diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index 9328a03094c..6a47dcea34b 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -779,8 +779,8 @@ device_type_t string_to_device_type(std::string type) { if (type == "SSD") { return device_type_t::SSD; } - if (type == "ZNS") { - return device_type_t::ZNS; + if (type == "ZBD") { + return device_type_t::ZBD; } if (type == "RANDOM_BLOCK_SSD") { return device_type_t::RANDOM_BLOCK_SSD; @@ -797,8 +797,8 @@ std::ostream& operator<<(std::ostream& out, device_type_t t) return out << "HDD"; case device_type_t::SSD: return out << "SSD"; - case device_type_t::ZNS: - return out << "ZNS"; + case device_type_t::ZBD: + return out << "ZBD"; case device_type_t::EPHEMERAL_COLD: return out << "EPHEMERAL_COLD"; case device_type_t::EPHEMERAL_MAIN: diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 55d8eb4a260..b95e1542954 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -882,7 +882,7 @@ enum class device_type_t : uint8_t { NONE = 0, HDD, SSD, - ZNS, + ZBD, EPHEMERAL_COLD, EPHEMERAL_MAIN, RANDOM_BLOCK_SSD, @@ -896,7 +896,7 @@ bool can_delay_allocation(device_type_t type); device_type_t string_to_device_type(std::string type); enum class backend_type_t { - SEGMENTED, // SegmentManager: SSD, ZNS, HDD + SEGMENTED, // SegmentManager: SSD, ZBD, HDD RANDOM_BLOCK // RBMDevice: RANDOM_BLOCK_SSD }; diff --git a/src/crimson/os/seastore/segment_manager.cc b/src/crimson/os/seastore/segment_manager.cc index 098a9b068f8..dbba31fd207 100644 --- a/src/crimson/os/seastore/segment_manager.cc +++ b/src/crimson/os/seastore/segment_manager.cc @@ -6,7 +6,7 @@ #include "crimson/os/seastore/logging.h" #ifdef HAVE_ZNS -#include "crimson/os/seastore/segment_manager/zns.h" +#include "crimson/os/seastore/segment_manager/zbd.h" SET_SUBSYS(seastore_device); #endif @@ -79,7 +79,7 @@ LOG_PREFIX(SegmentManager::get_segment_manager); INFO("Found {} zones.", nr_zones); if (nr_zones != 0) { return std::make_unique< - segment_manager::zns::ZNSSegmentManager + segment_manager::zbd::ZBDSegmentManager >(device + "/block"); } else { return std::make_unique< diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h index 1669d124a6b..719fa6075ed 100644 --- a/src/crimson/os/seastore/segment_manager.h +++ b/src/crimson/os/seastore/segment_manager.h @@ -153,7 +153,7 @@ public: * advance_wp * * advance the segment write pointer, - * needed when writing at wp is strictly implemented. ex: ZNS backed segments + * needed when writing at wp is strictly implemented. ex: ZBD backed segments * @param offset: advance write pointer till the given offset */ virtual write_ertr::future<> advance_wp( diff --git a/src/crimson/os/seastore/segment_manager/zbd.cc b/src/crimson/os/seastore/segment_manager/zbd.cc new file mode 100644 index 00000000000..be1d71dacaa --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/zbd.cc @@ -0,0 +1,750 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include + +#include +#include "crimson/os/seastore/segment_manager/zbd.h" +#include "crimson/common/config_proxy.h" +#include "crimson/os/seastore/logging.h" +#include "include/buffer.h" + +SET_SUBSYS(seastore_device); + +#define SECT_SHIFT 9 +#define RESERVED_ZONES 1 +// limit the max padding buf size to 1MB +#define MAX_PADDING_SIZE 1048576 + +using z_op = crimson::os::seastore::segment_manager::zbd::zone_op; +template <> struct fmt::formatter: fmt::formatter { + template + auto format(z_op s, FormatContext& ctx) { + std::string_view name = "Unknown"; + switch (s) { + using enum z_op; + case OPEN: + name = "BLKOPENZONE"; + break; + case FINISH: + name = "BLKFINISHZONE"; + break; + case CLOSE: + name = "BLKCLOSEZONE"; + break; + case RESET: + name = "BLKRESETZONE"; + break; + } + return formatter::format(name, ctx); + } +}; + +namespace crimson::os::seastore::segment_manager::zbd { + +using open_device_ret = ZBDSegmentManager::access_ertr::future< + std::pair>; +static open_device_ret open_device( + const std::string &path, + seastar::open_flags mode) +{ + LOG_PREFIX(ZBDSegmentManager::open_device); + return seastar::file_stat( + path, seastar::follow_symlink::yes + ).then([FNAME, mode, &path](auto stat) mutable { + return seastar::open_file_dma(path, mode).then([=](auto file) { + DEBUG("open of device {} successful, size {}", + path, + stat.size); + return std::make_pair(file, stat); + }); + }).handle_exception( + [FNAME](auto e) -> open_device_ret { + ERROR("got error {}", + e); + return crimson::ct_error::input_output_error::make(); + } + ); +} + +static zbd_sm_metadata_t make_metadata( + uint64_t total_size, + seastore_meta_t meta, + const seastar::stat_data &data, + size_t zone_size_sectors, + size_t zone_capacity_sectors, + size_t num_zones) +{ + LOG_PREFIX(ZBDSegmentManager::make_metadata); + + // TODO: support Option::size_t seastore_segment_size + // to allow zones_per_segment > 1 with striping. + size_t zone_size = zone_size_sectors << SECT_SHIFT; + size_t zone_capacity = zone_capacity_sectors << SECT_SHIFT; + size_t segment_size = zone_size; + size_t zones_per_segment = segment_size / zone_size; + size_t segments = (num_zones - RESERVED_ZONES) / zones_per_segment; + size_t per_shard_segments = segments / seastar::smp::count; + size_t available_size = zone_capacity * segments; + size_t per_shard_available_size = zone_capacity * per_shard_segments; + std::vector shard_infos(seastar::smp::count); + for (unsigned int i = 0; i < seastar::smp::count; i++) { + shard_infos[i].size = per_shard_available_size; + shard_infos[i].segments = per_shard_segments; + shard_infos[i].first_segment_offset = zone_size * RESERVED_ZONES + + i * segment_size* per_shard_segments; + } + + assert(total_size == num_zones * zone_size); + + WARN("Ignoring configuration values for device and segment size"); + INFO( + "device size {}, available_size {}, block_size {}, allocated_size {}," + " total zones {}, zone_size {}, zone_capacity {}," + " total segments {}, zones per segment {}, segment size {}", + total_size, + available_size, + data.block_size, + data.allocated_size, + num_zones, + zone_size, + zone_capacity, + segments, + zones_per_segment, + zone_capacity * zones_per_segment); + + zbd_sm_metadata_t ret = zbd_sm_metadata_t{ + seastar::smp::count, + segment_size, + zone_capacity * zones_per_segment, + zones_per_segment, + zone_capacity, + data.block_size, + zone_size, + shard_infos, + meta}; + ret.validate(); + return ret; +} + +struct ZoneReport { + struct blk_zone_report *hdr; + ZoneReport(int nr_zones) + : hdr((blk_zone_report *)malloc( + sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone))){;} + ~ZoneReport(){ + free(hdr); + } + ZoneReport(const ZoneReport &) = delete; + ZoneReport(ZoneReport &&rhs) : hdr(rhs.hdr) { + rhs.hdr = nullptr; + } +}; + +static seastar::future get_blk_dev_size( + seastar::file &device) +{ + return seastar::do_with( + (uint64_t)0, + [&](auto& size_sects) { + return device.ioctl( + BLKGETSIZE, + (void *)&size_sects + ).then([&](int ret) { + ceph_assert(size_sects); + size_t size = size_sects << SECT_SHIFT; + return seastar::make_ready_future(size); + }); + }); +} + +// zone_size should be in 512B sectors +static seastar::future<> reset_device( + seastar::file &device, + uint64_t zone_size_sects, + uint64_t nr_zones) +{ + return seastar::do_with( + blk_zone_range{}, + [&, nr_zones, zone_size_sects](auto &range) { + range.sector = 0; + range.nr_sectors = zone_size_sects * nr_zones; + return device.ioctl( + BLKRESETZONE, + &range + ).then([&](int ret){ + return seastar::now(); + }); + } + ); +} + +static seastar::future get_zone_capacity( + seastar::file &device, + uint32_t nr_zones) +{ + return seastar::do_with( + ZoneReport(nr_zones), + [&](auto &zr) { + zr.hdr->sector = 0; + zr.hdr->nr_zones = nr_zones; + return device.ioctl( + BLKREPORTZONE, + zr.hdr + ).then([&](int ret) { + return seastar::make_ready_future(zr.hdr->zones[0].capacity); + }); + } + ); +} + +static write_ertr::future<> do_write( + seastar::file &device, + uint64_t offset, + bufferptr &bptr) +{ + LOG_PREFIX(ZBDSegmentManager::do_write); + DEBUG("offset {} len {}", + offset, + bptr.length()); + return device.dma_write( + offset, + bptr.c_str(), + bptr.length() + ).handle_exception( + [FNAME](auto e) -> write_ertr::future { + ERROR("dma_write got error {}", + e); + return crimson::ct_error::input_output_error::make(); + } + ).then([length = bptr.length()](auto result) -> write_ertr::future<> { + if (result != length) { + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); + }); +} + +static write_ertr::future<> do_writev( + seastar::file &device, + uint64_t offset, + bufferlist&& bl, + size_t block_size) +{ + LOG_PREFIX(ZBDSegmentManager::do_writev); + DEBUG("offset {} len {}", + offset, + bl.length()); + // writev requires each buffer to be aligned to the disks' block + // size, we need to rebuild here + bl.rebuild_aligned(block_size); + + std::vector iov; + bl.prepare_iov(&iov); + return device.dma_write( + offset, + std::move(iov) + ).handle_exception( + [FNAME](auto e) -> write_ertr::future { + ERROR("dma_write got error {}", + e); + return crimson::ct_error::input_output_error::make(); + } + ).then([bl=std::move(bl)/* hold the buf until the end of io */](size_t written) + -> write_ertr::future<> { + if (written != bl.length()) { + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); + }); +} + +static ZBDSegmentManager::access_ertr::future<> +write_metadata(seastar::file &device, zbd_sm_metadata_t sb) +{ + assert(ceph::encoded_sizeof_bounded() < + sb.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), + [=, &device](auto &bp) { + LOG_PREFIX(ZBDSegmentManager::write_metadata); + DEBUG("block_size {}", sb.block_size); + bufferlist bl; + encode(sb, bl); + auto iter = bl.begin(); + assert(bl.length() < sb.block_size); + DEBUG("buffer length {}", bl.length()); + iter.copy(bl.length(), bp.c_str()); + DEBUG("doing writeout"); + return do_write(device, 0, bp); + }); +} + +static read_ertr::future<> do_read( + seastar::file &device, + uint64_t offset, + size_t len, + bufferptr &bptr) +{ + LOG_PREFIX(ZBDSegmentManager::do_read); + assert(len <= bptr.length()); + DEBUG("offset {} len {}", + offset, + len); + return device.dma_read( + offset, + bptr.c_str(), + len + ).handle_exception( + [FNAME](auto e) -> read_ertr::future { + ERROR("dma_read got error {}", + e); + return crimson::ct_error::input_output_error::make(); + } + ).then([len](auto result) -> read_ertr::future<> { + if (result != len) { + return crimson::ct_error::input_output_error::make(); + } + return read_ertr::now(); + }); +} + +static +ZBDSegmentManager::access_ertr::future +read_metadata(seastar::file &device, seastar::stat_data sd) +{ + assert(ceph::encoded_sizeof_bounded() < + sd.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sd.block_size)), + [=, &device](auto &bp) { + return do_read( + device, + 0, + bp.length(), + bp + ).safe_then([=, &bp] { + bufferlist bl; + bl.push_back(bp); + zbd_sm_metadata_t ret; + auto bliter = bl.cbegin(); + decode(ret, bliter); + ret.validate(); + return ZBDSegmentManager::access_ertr::future( + ZBDSegmentManager::access_ertr::ready_future_marker{}, + ret); + }); + }); +} + +ZBDSegmentManager::mount_ret ZBDSegmentManager::mount() +{ + return shard_devices.invoke_on_all([](auto &local_device) { + return local_device.shard_mount( + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in ZBDSegmentManager::mount" + }); + }); +} + +ZBDSegmentManager::mount_ret ZBDSegmentManager::shard_mount() +{ + return open_device( + device_path, seastar::open_flags::rw + ).safe_then([=, this](auto p) { + device = std::move(p.first); + auto sd = p.second; + return read_metadata(device, sd); + }).safe_then([=, this](auto meta){ + shard_info = meta.shard_infos[seastar::this_shard_id()]; + metadata = meta; + return mount_ertr::now(); + }); +} + +ZBDSegmentManager::mkfs_ret ZBDSegmentManager::mkfs( + device_config_t config) +{ + return shard_devices.local().primary_mkfs(config + ).safe_then([this] { + return shard_devices.invoke_on_all([](auto &local_device) { + return local_device.shard_mkfs( + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in ZBDSegmentManager::mkfs" + }); + }); + }); +} + +ZBDSegmentManager::mkfs_ret ZBDSegmentManager::primary_mkfs( + device_config_t config) +{ + LOG_PREFIX(ZBDSegmentManager::primary_mkfs); + INFO("starting, device_path {}", device_path); + return seastar::do_with( + seastar::file{}, + seastar::stat_data{}, + zbd_sm_metadata_t{}, + size_t(), + size_t(), + size_t(), + [=, this](auto &device, auto &stat, auto &sb, auto &zone_size_sects, auto &nr_zones, auto &size) { + return open_device( + device_path, + seastar::open_flags::rw + ).safe_then([=, this, &device, &stat, &sb, &zone_size_sects, &nr_zones, &size](auto p) { + device = p.first; + stat = p.second; + return device.ioctl( + BLKGETNRZONES, + (void *)&nr_zones + ).then([&](int ret) { + if (nr_zones == 0) { + return seastar::make_exception_future( + std::system_error(std::make_error_code(std::errc::io_error))); + } + return device.ioctl(BLKGETZONESZ, (void *)&zone_size_sects); + }).then([&](int ret) { + ceph_assert(zone_size_sects); + return reset_device(device, zone_size_sects, nr_zones); + }).then([&] { + return get_blk_dev_size(device); + }).then([&](auto devsize) { + size = devsize; + return get_zone_capacity(device, nr_zones); + }).then([&, FNAME, config](auto zone_capacity_sects) { + ceph_assert(zone_capacity_sects); + DEBUG("zone_size in sectors {}, zone_capacity in sectors {}", + zone_size_sects, zone_capacity_sects); + sb = make_metadata( + size, + config.meta, + stat, + zone_size_sects, + zone_capacity_sects, + nr_zones); + metadata = sb; + stats.metadata_write.increment( + ceph::encoded_sizeof_bounded()); + DEBUG("Wrote to stats."); + return write_metadata(device, sb); + }).finally([&, FNAME] { + DEBUG("Closing device."); + return device.close(); + }).safe_then([FNAME] { + DEBUG("Returning from mkfs."); + return mkfs_ertr::now(); + }); + }); + }); +} + +ZBDSegmentManager::mkfs_ret ZBDSegmentManager::shard_mkfs() +{ + LOG_PREFIX(ZBDSegmentManager::shard_mkfs); + INFO("starting, device_path {}", device_path); + return open_device( + device_path, seastar::open_flags::rw + ).safe_then([=, this](auto p) { + device = std::move(p.first); + auto sd = p.second; + return read_metadata(device, sd); + }).safe_then([=, this](auto meta){ + shard_info = meta.shard_infos[seastar::this_shard_id()]; + metadata = meta; + return device.close(); + }).safe_then([FNAME] { + DEBUG("Returning from shard_mkfs."); + return mkfs_ertr::now(); + }); +} + +// Return range of sectors to operate on. +struct blk_zone_range make_range( + segment_id_t id, + size_t segment_size, + size_t first_segment_offset) +{ + return blk_zone_range{ + (id.device_segment_id() * (segment_size >> SECT_SHIFT) + + (first_segment_offset >> SECT_SHIFT)), + (segment_size >> SECT_SHIFT) + }; +} + +using blk_zone_op_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; +using blk_zone_op_ret = blk_zone_op_ertr::future<>; +blk_zone_op_ret blk_zone_op(seastar::file &device, + blk_zone_range &range, + zone_op op) { + LOG_PREFIX(ZBDSegmentManager::blk_zone_op); + + unsigned long ioctl_op = 0; + switch (op) { + using enum zone_op; + case OPEN: + ioctl_op = BLKOPENZONE; + break; + case FINISH: + ioctl_op = BLKFINISHZONE; + break; + case RESET: + ioctl_op = BLKRESETZONE; + break; + case CLOSE: + ioctl_op = BLKCLOSEZONE; + break; + default: + ERROR("Invalid zone operation {}", op); + ceph_assert(ioctl_op); + } + + return device.ioctl( + ioctl_op, + &range + ).then_wrapped([=](auto f) -> blk_zone_op_ret { + if (f.failed()) { + ERROR("{} ioctl failed", op); + return crimson::ct_error::input_output_error::make(); + } else { + int ret = f.get(); + if (ret == 0) { + return seastar::now(); + } else { + ERROR("{} ioctl failed with return code {}", op, ret); + return crimson::ct_error::input_output_error::make(); + } + } + }); +} + +ZBDSegmentManager::open_ertr::future ZBDSegmentManager::open( + segment_id_t id) +{ + LOG_PREFIX(ZBDSegmentManager::open); + return seastar::do_with( + blk_zone_range{}, + [=, this](auto &range) { + range = make_range( + id, + metadata.segment_size, + shard_info.first_segment_offset); + return blk_zone_op( + device, + range, + zone_op::OPEN + ); + } + ).safe_then([=, this] { + DEBUG("segment {}, open successful", id); + return open_ertr::future( + open_ertr::ready_future_marker{}, + SegmentRef(new ZBDSegment(*this, id)) + ); + }); +} + +ZBDSegmentManager::release_ertr::future<> ZBDSegmentManager::release( + segment_id_t id) +{ + LOG_PREFIX(ZBDSegmentManager::release); + DEBUG("Resetting zone/segment {}", id); + return seastar::do_with( + blk_zone_range{}, + [=, this](auto &range) { + range = make_range( + id, + metadata.segment_size, + shard_info.first_segment_offset); + return blk_zone_op( + device, + range, + zone_op::RESET + ); + } + ).safe_then([=] { + DEBUG("segment release successful"); + return release_ertr::now(); + }); +} + +SegmentManager::read_ertr::future<> ZBDSegmentManager::read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) +{ + LOG_PREFIX(ZBDSegmentManager::read); + auto& seg_addr = addr.as_seg_paddr(); + if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) { + ERROR("invalid segment {}", + seg_addr.get_segment_id().device_segment_id()); + return crimson::ct_error::invarg::make(); + } + + if (seg_addr.get_segment_off() + len > metadata.segment_capacity) { + ERROR("invalid read offset {}, len {}", + addr, + len); + return crimson::ct_error::invarg::make(); + } + return do_read( + device, + get_offset(addr), + len, + out); +} + +Segment::close_ertr::future<> ZBDSegmentManager::segment_close( + segment_id_t id, segment_off_t write_pointer) +{ + LOG_PREFIX(ZBDSegmentManager::segment_close); + return seastar::do_with( + blk_zone_range{}, + [=, this](auto &range) { + range = make_range( + id, + metadata.segment_size, + shard_info.first_segment_offset); + return blk_zone_op( + device, + range, + zone_op::FINISH + ); + } + ).safe_then([=] { + DEBUG("zone finish successful"); + return Segment::close_ertr::now(); + }); +} + +Segment::write_ertr::future<> ZBDSegmentManager::segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check) +{ + LOG_PREFIX(ZBDSegmentManager::segment_write); + assert(addr.get_device_id() == get_device_id()); + assert((bl.length() % metadata.block_size) == 0); + auto& seg_addr = addr.as_seg_paddr(); + DEBUG("write to segment {} at offset {}, physical offset {}, len {}", + seg_addr.get_segment_id(), + seg_addr.get_segment_off(), + get_offset(addr), + bl.length()); + stats.data_write.increment(bl.length()); + return do_writev( + device, + get_offset(addr), + std::move(bl), + metadata.block_size); +} + +device_id_t ZBDSegmentManager::get_device_id() const +{ + return metadata.device_id; +}; + +secondary_device_set_t& ZBDSegmentManager::get_secondary_devices() +{ + return metadata.secondary_devices; +}; + +magic_t ZBDSegmentManager::get_magic() const +{ + return metadata.magic; +}; + +segment_off_t ZBDSegment::get_write_capacity() const +{ + return manager.get_segment_size(); +} + +SegmentManager::close_ertr::future<> ZBDSegmentManager::close() +{ + if (device) { + return device.close(); + } + return seastar::now(); +} + +Segment::close_ertr::future<> ZBDSegment::close() +{ + return manager.segment_close(id, write_pointer); +} + +Segment::write_ertr::future<> ZBDSegment::write( + segment_off_t offset, ceph::bufferlist bl) +{ + LOG_PREFIX(ZBDSegment::write); + if (offset != write_pointer || offset % manager.metadata.block_size != 0) { + ERROR("Segment offset and zone write pointer mismatch. " + "segment {} segment-offset {} write pointer {}", + id, offset, write_pointer); + return crimson::ct_error::invarg::make(); + } + if (offset + bl.length() > manager.metadata.segment_capacity) { + return crimson::ct_error::enospc::make(); + } + + write_pointer = offset + bl.length(); + return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl); +} + +Segment::write_ertr::future<> ZBDSegment::write_padding_bytes( + size_t padding_bytes) +{ + LOG_PREFIX(ZBDSegment::write_padding_bytes); + DEBUG("Writing {} padding bytes to segment {} at wp {}", + padding_bytes, id, write_pointer); + + return crimson::repeat([FNAME, padding_bytes, this] () mutable { + size_t bufsize = 0; + if (padding_bytes >= MAX_PADDING_SIZE) { + bufsize = MAX_PADDING_SIZE; + } else { + bufsize = padding_bytes; + } + + padding_bytes -= bufsize; + bufferptr bp(ceph::buffer::create_page_aligned(bufsize)); + bp.zero(); + bufferlist padd_bl; + padd_bl.append(bp); + return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() { + if (padding_bytes == 0) { + return write_ertr::make_ready_future(seastar::stop_iteration::yes); + } else { + return write_ertr::make_ready_future(seastar::stop_iteration::no); + } + }); + }); +} + +// Advance write pointer, to given offset. +Segment::write_ertr::future<> ZBDSegment::advance_wp( + segment_off_t offset) +{ + LOG_PREFIX(ZBDSegment::advance_wp); + + DEBUG("Advancing write pointer from {} to {}", write_pointer, offset); + if (offset < write_pointer) { + return crimson::ct_error::invarg::make(); + } + + size_t padding_bytes = offset - write_pointer; + + if (padding_bytes == 0) { + return write_ertr::now(); + } + + assert(padding_bytes % manager.metadata.block_size == 0); + + return write_padding_bytes(padding_bytes); +} + +} diff --git a/src/crimson/os/seastore/segment_manager/zbd.h b/src/crimson/os/seastore/segment_manager/zbd.h new file mode 100644 index 00000000000..c18f46336ae --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/zbd.h @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include + +#include +#include + +#include +#include +#include + +#include "crimson/common/layout.h" + +#include "crimson/os/seastore/segment_manager.h" + +#include "include/uuid.h" + +namespace crimson::os::seastore::segment_manager::zbd { + + struct zbd_shard_info_t { + size_t size = 0; + size_t segments = 0; + size_t first_segment_offset = 0; + + DENC(zbd_shard_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.segments, p); + denc(v.first_segment_offset, p); + DENC_FINISH(p); + } + }; + + struct zbd_sm_metadata_t { + unsigned int shard_num = 0; + size_t segment_size = 0; + size_t segment_capacity = 0; + size_t zones_per_segment = 0; + size_t zone_capacity = 0; + size_t block_size = 0; + size_t zone_size = 0; + + std::vector shard_infos; + + seastore_meta_t meta; + + bool major_dev = false; + magic_t magic = 0; + device_type_t dtype = device_type_t::NONE; + device_id_t device_id = 0; + secondary_device_set_t secondary_devices; + + DENC(zbd_sm_metadata_t, v, p) { + DENC_START(1, 1, p); + denc(v.shard_num, p); + denc(v.segment_size, p); + denc(v.segment_capacity, p); + denc(v.zones_per_segment, p); + denc(v.zone_capacity, p); + denc(v.block_size, p); + denc(v.zone_size, p); + denc(v.shard_infos, p); + denc(v.meta, p); + denc(v.magic, p); + denc(v.dtype, p); + denc(v.device_id, p); + if (v.major_dev) { + denc(v.secondary_devices, p); + } + DENC_FINISH(p); + } + + void validate() const { + ceph_assert_always(shard_num == seastar::smp::count); + for (unsigned int i = 0; i < seastar::smp::count; i++) { + ceph_assert_always(shard_infos[i].size > 0); + ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX); + ceph_assert_always(shard_infos[i].segments > 0); + ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX); + } + ceph_assert_always(segment_capacity > 0); + ceph_assert_always(segment_capacity <= SEGMENT_OFF_MAX); + } + }; + + using write_ertr = crimson::errorator; + using read_ertr = crimson::errorator; + + enum class zone_op { + OPEN, + FINISH, + CLOSE, + RESET, + }; + + class ZBDSegmentManager; + + class ZBDSegment final : public Segment { + public: + ZBDSegment(ZBDSegmentManager &man, segment_id_t i) : manager(man), id(i){}; + + segment_id_t get_segment_id() const final { return id; } + segment_off_t get_write_capacity() const final; + segment_off_t get_write_ptr() const final { return write_pointer; } + close_ertr::future<> close() final; + write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final; + write_ertr::future<> advance_wp(segment_off_t offset) final; + + ~ZBDSegment() {} + private: + friend class ZBDSegmentManager; + ZBDSegmentManager &manager; + const segment_id_t id; + segment_off_t write_pointer = 0; + write_ertr::future<> write_padding_bytes(size_t padding_bytes); + }; + + class ZBDSegmentManager final : public SegmentManager{ + // interfaces used by Device + public: + seastar::future<> start() { + return shard_devices.start(device_path); + } + + seastar::future<> stop() { + return shard_devices.stop(); + } + + Device& get_sharded_device() final { + return shard_devices.local(); + } + + mount_ret mount() final; + mkfs_ret mkfs(device_config_t meta) final; + + ZBDSegmentManager(const std::string &path) : device_path(path) {} + + ~ZBDSegmentManager() final = default; + + //interfaces used by each shard device + public: + open_ertr::future open(segment_id_t id) final; + close_ertr::future<> close() final; + + release_ertr::future<> release(segment_id_t id) final; + + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) final; + + device_type_t get_device_type() const final { + return device_type_t::ZBD; + } + + size_t get_available_size() const final { + return shard_info.size; + }; + + extent_len_t get_block_size() const final { + return metadata.block_size; + }; + + segment_off_t get_segment_size() const final { + return metadata.segment_capacity; + }; + + const seastore_meta_t &get_meta() const { + return metadata.meta; + }; + + device_id_t get_device_id() const final; + + secondary_device_set_t& get_secondary_devices() final; + + magic_t get_magic() const final; + + Segment::write_ertr::future<> segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check=false); + + private: + friend class ZBDSegment; + std::string device_path; + zbd_shard_info_t shard_info; + zbd_sm_metadata_t metadata; + seastar::file device; + uint32_t nr_zones; + struct effort_t { + uint64_t num = 0; + uint64_t bytes = 0; + + void increment(uint64_t read_bytes) { + ++num; + bytes += read_bytes; + } + }; + + struct zbd_sm_stats { + effort_t data_read = {}; + effort_t data_write = {}; + effort_t metadata_write = {}; + uint64_t opened_segments = 0; + uint64_t closed_segments = 0; + uint64_t closed_segments_unused_bytes = 0; + uint64_t released_segments = 0; + + void reset() { + *this = zbd_sm_stats{}; + } + } stats; + + void register_metrics(); + seastar::metrics::metric_group metrics; + + Segment::close_ertr::future<> segment_close( + segment_id_t id, segment_off_t write_pointer); + + uint64_t get_offset(paddr_t addr) { + auto& seg_addr = addr.as_seg_paddr(); + return (shard_info.first_segment_offset + + (seg_addr.get_segment_id().device_segment_id() * + metadata.segment_size)) + seg_addr.get_segment_off(); + } + private: + // shard 0 mkfs + mkfs_ret primary_mkfs(device_config_t meta); + // all shards mkfs + mkfs_ret shard_mkfs(); + + mount_ret shard_mount(); + + seastar::sharded shard_devices; + }; + +} + +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::segment_manager::zbd::zbd_shard_info_t +) +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::segment_manager::zbd::zbd_sm_metadata_t +) diff --git a/src/crimson/os/seastore/segment_manager/zns.cc b/src/crimson/os/seastore/segment_manager/zns.cc deleted file mode 100644 index deaaadf6668..00000000000 --- a/src/crimson/os/seastore/segment_manager/zns.cc +++ /dev/null @@ -1,750 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include -#include - -#include -#include "crimson/os/seastore/segment_manager/zns.h" -#include "crimson/common/config_proxy.h" -#include "crimson/os/seastore/logging.h" -#include "include/buffer.h" - -SET_SUBSYS(seastore_device); - -#define SECT_SHIFT 9 -#define RESERVED_ZONES 1 -// limit the max padding buf size to 1MB -#define MAX_PADDING_SIZE 1048576 - -using z_op = crimson::os::seastore::segment_manager::zns::zone_op; -template <> struct fmt::formatter: fmt::formatter { - template - auto format(z_op s, FormatContext& ctx) { - std::string_view name = "Unknown"; - switch (s) { - using enum z_op; - case OPEN: - name = "BLKOPENZONE"; - break; - case FINISH: - name = "BLKFINISHZONE"; - break; - case CLOSE: - name = "BLKCLOSEZONE"; - break; - case RESET: - name = "BLKRESETZONE"; - break; - } - return formatter::format(name, ctx); - } -}; - -namespace crimson::os::seastore::segment_manager::zns { - -using open_device_ret = ZNSSegmentManager::access_ertr::future< - std::pair>; -static open_device_ret open_device( - const std::string &path, - seastar::open_flags mode) -{ - LOG_PREFIX(ZNSSegmentManager::open_device); - return seastar::file_stat( - path, seastar::follow_symlink::yes - ).then([FNAME, mode, &path](auto stat) mutable { - return seastar::open_file_dma(path, mode).then([=](auto file) { - DEBUG("open of device {} successful, size {}", - path, - stat.size); - return std::make_pair(file, stat); - }); - }).handle_exception( - [FNAME](auto e) -> open_device_ret { - ERROR("got error {}", - e); - return crimson::ct_error::input_output_error::make(); - } - ); -} - -static zns_sm_metadata_t make_metadata( - uint64_t total_size, - seastore_meta_t meta, - const seastar::stat_data &data, - size_t zone_size_sectors, - size_t zone_capacity_sectors, - size_t num_zones) -{ - LOG_PREFIX(ZNSSegmentManager::make_metadata); - - // TODO: support Option::size_t seastore_segment_size - // to allow zones_per_segment > 1 with striping. - size_t zone_size = zone_size_sectors << SECT_SHIFT; - size_t zone_capacity = zone_capacity_sectors << SECT_SHIFT; - size_t segment_size = zone_size; - size_t zones_per_segment = segment_size / zone_size; - size_t segments = (num_zones - RESERVED_ZONES) / zones_per_segment; - size_t per_shard_segments = segments / seastar::smp::count; - size_t available_size = zone_capacity * segments; - size_t per_shard_available_size = zone_capacity * per_shard_segments; - std::vector shard_infos(seastar::smp::count); - for (unsigned int i = 0; i < seastar::smp::count; i++) { - shard_infos[i].size = per_shard_available_size; - shard_infos[i].segments = per_shard_segments; - shard_infos[i].first_segment_offset = zone_size * RESERVED_ZONES - + i * segment_size* per_shard_segments; - } - - assert(total_size == num_zones * zone_size); - - WARN("Ignoring configuration values for device and segment size"); - INFO( - "device size {}, available_size {}, block_size {}, allocated_size {}," - " total zones {}, zone_size {}, zone_capacity {}," - " total segments {}, zones per segment {}, segment size {}", - total_size, - available_size, - data.block_size, - data.allocated_size, - num_zones, - zone_size, - zone_capacity, - segments, - zones_per_segment, - zone_capacity * zones_per_segment); - - zns_sm_metadata_t ret = zns_sm_metadata_t{ - seastar::smp::count, - segment_size, - zone_capacity * zones_per_segment, - zones_per_segment, - zone_capacity, - data.block_size, - zone_size, - shard_infos, - meta}; - ret.validate(); - return ret; -} - -struct ZoneReport { - struct blk_zone_report *hdr; - ZoneReport(int nr_zones) - : hdr((blk_zone_report *)malloc( - sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone))){;} - ~ZoneReport(){ - free(hdr); - } - ZoneReport(const ZoneReport &) = delete; - ZoneReport(ZoneReport &&rhs) : hdr(rhs.hdr) { - rhs.hdr = nullptr; - } -}; - -static seastar::future get_blk_dev_size( - seastar::file &device) -{ - return seastar::do_with( - (uint64_t)0, - [&](auto& size_sects) { - return device.ioctl( - BLKGETSIZE, - (void *)&size_sects - ).then([&](int ret) { - ceph_assert(size_sects); - size_t size = size_sects << SECT_SHIFT; - return seastar::make_ready_future(size); - }); - }); -} - -// zone_size should be in 512B sectors -static seastar::future<> reset_device( - seastar::file &device, - uint64_t zone_size_sects, - uint64_t nr_zones) -{ - return seastar::do_with( - blk_zone_range{}, - [&, nr_zones, zone_size_sects](auto &range) { - range.sector = 0; - range.nr_sectors = zone_size_sects * nr_zones; - return device.ioctl( - BLKRESETZONE, - &range - ).then([&](int ret){ - return seastar::now(); - }); - } - ); -} - -static seastar::future get_zone_capacity( - seastar::file &device, - uint32_t nr_zones) -{ - return seastar::do_with( - ZoneReport(nr_zones), - [&](auto &zr) { - zr.hdr->sector = 0; - zr.hdr->nr_zones = nr_zones; - return device.ioctl( - BLKREPORTZONE, - zr.hdr - ).then([&](int ret) { - return seastar::make_ready_future(zr.hdr->zones[0].capacity); - }); - } - ); -} - -static write_ertr::future<> do_write( - seastar::file &device, - uint64_t offset, - bufferptr &bptr) -{ - LOG_PREFIX(ZNSSegmentManager::do_write); - DEBUG("offset {} len {}", - offset, - bptr.length()); - return device.dma_write( - offset, - bptr.c_str(), - bptr.length() - ).handle_exception( - [FNAME](auto e) -> write_ertr::future { - ERROR("dma_write got error {}", - e); - return crimson::ct_error::input_output_error::make(); - } - ).then([length = bptr.length()](auto result) -> write_ertr::future<> { - if (result != length) { - return crimson::ct_error::input_output_error::make(); - } - return write_ertr::now(); - }); -} - -static write_ertr::future<> do_writev( - seastar::file &device, - uint64_t offset, - bufferlist&& bl, - size_t block_size) -{ - LOG_PREFIX(ZNSSegmentManager::do_writev); - DEBUG("offset {} len {}", - offset, - bl.length()); - // writev requires each buffer to be aligned to the disks' block - // size, we need to rebuild here - bl.rebuild_aligned(block_size); - - std::vector iov; - bl.prepare_iov(&iov); - return device.dma_write( - offset, - std::move(iov) - ).handle_exception( - [FNAME](auto e) -> write_ertr::future { - ERROR("dma_write got error {}", - e); - return crimson::ct_error::input_output_error::make(); - } - ).then([bl=std::move(bl)/* hold the buf until the end of io */](size_t written) - -> write_ertr::future<> { - if (written != bl.length()) { - return crimson::ct_error::input_output_error::make(); - } - return write_ertr::now(); - }); -} - -static ZNSSegmentManager::access_ertr::future<> -write_metadata(seastar::file &device, zns_sm_metadata_t sb) -{ - assert(ceph::encoded_sizeof_bounded() < - sb.block_size); - return seastar::do_with( - bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), - [=, &device](auto &bp) { - LOG_PREFIX(ZNSSegmentManager::write_metadata); - DEBUG("block_size {}", sb.block_size); - bufferlist bl; - encode(sb, bl); - auto iter = bl.begin(); - assert(bl.length() < sb.block_size); - DEBUG("buffer length {}", bl.length()); - iter.copy(bl.length(), bp.c_str()); - DEBUG("doing writeout"); - return do_write(device, 0, bp); - }); -} - -static read_ertr::future<> do_read( - seastar::file &device, - uint64_t offset, - size_t len, - bufferptr &bptr) -{ - LOG_PREFIX(ZNSSegmentManager::do_read); - assert(len <= bptr.length()); - DEBUG("offset {} len {}", - offset, - len); - return device.dma_read( - offset, - bptr.c_str(), - len - ).handle_exception( - [FNAME](auto e) -> read_ertr::future { - ERROR("dma_read got error {}", - e); - return crimson::ct_error::input_output_error::make(); - } - ).then([len](auto result) -> read_ertr::future<> { - if (result != len) { - return crimson::ct_error::input_output_error::make(); - } - return read_ertr::now(); - }); -} - -static -ZNSSegmentManager::access_ertr::future -read_metadata(seastar::file &device, seastar::stat_data sd) -{ - assert(ceph::encoded_sizeof_bounded() < - sd.block_size); - return seastar::do_with( - bufferptr(ceph::buffer::create_page_aligned(sd.block_size)), - [=, &device](auto &bp) { - return do_read( - device, - 0, - bp.length(), - bp - ).safe_then([=, &bp] { - bufferlist bl; - bl.push_back(bp); - zns_sm_metadata_t ret; - auto bliter = bl.cbegin(); - decode(ret, bliter); - ret.validate(); - return ZNSSegmentManager::access_ertr::future( - ZNSSegmentManager::access_ertr::ready_future_marker{}, - ret); - }); - }); -} - -ZNSSegmentManager::mount_ret ZNSSegmentManager::mount() -{ - return shard_devices.invoke_on_all([](auto &local_device) { - return local_device.shard_mount( - ).handle_error( - crimson::ct_error::assert_all{ - "Invalid error in ZNSSegmentManager::mount" - }); - }); -} - -ZNSSegmentManager::mount_ret ZNSSegmentManager::shard_mount() -{ - return open_device( - device_path, seastar::open_flags::rw - ).safe_then([=, this](auto p) { - device = std::move(p.first); - auto sd = p.second; - return read_metadata(device, sd); - }).safe_then([=, this](auto meta){ - shard_info = meta.shard_infos[seastar::this_shard_id()]; - metadata = meta; - return mount_ertr::now(); - }); -} - -ZNSSegmentManager::mkfs_ret ZNSSegmentManager::mkfs( - device_config_t config) -{ - return shard_devices.local().primary_mkfs(config - ).safe_then([this] { - return shard_devices.invoke_on_all([](auto &local_device) { - return local_device.shard_mkfs( - ).handle_error( - crimson::ct_error::assert_all{ - "Invalid error in ZNSSegmentManager::mkfs" - }); - }); - }); -} - -ZNSSegmentManager::mkfs_ret ZNSSegmentManager::primary_mkfs( - device_config_t config) -{ - LOG_PREFIX(ZNSSegmentManager::primary_mkfs); - INFO("starting, device_path {}", device_path); - return seastar::do_with( - seastar::file{}, - seastar::stat_data{}, - zns_sm_metadata_t{}, - size_t(), - size_t(), - size_t(), - [=, this](auto &device, auto &stat, auto &sb, auto &zone_size_sects, auto &nr_zones, auto &size) { - return open_device( - device_path, - seastar::open_flags::rw - ).safe_then([=, this, &device, &stat, &sb, &zone_size_sects, &nr_zones, &size](auto p) { - device = p.first; - stat = p.second; - return device.ioctl( - BLKGETNRZONES, - (void *)&nr_zones - ).then([&](int ret) { - if (nr_zones == 0) { - return seastar::make_exception_future( - std::system_error(std::make_error_code(std::errc::io_error))); - } - return device.ioctl(BLKGETZONESZ, (void *)&zone_size_sects); - }).then([&](int ret) { - ceph_assert(zone_size_sects); - return reset_device(device, zone_size_sects, nr_zones); - }).then([&] { - return get_blk_dev_size(device); - }).then([&](auto devsize) { - size = devsize; - return get_zone_capacity(device, nr_zones); - }).then([&, FNAME, config](auto zone_capacity_sects) { - ceph_assert(zone_capacity_sects); - DEBUG("zone_size in sectors {}, zone_capacity in sectors {}", - zone_size_sects, zone_capacity_sects); - sb = make_metadata( - size, - config.meta, - stat, - zone_size_sects, - zone_capacity_sects, - nr_zones); - metadata = sb; - stats.metadata_write.increment( - ceph::encoded_sizeof_bounded()); - DEBUG("Wrote to stats."); - return write_metadata(device, sb); - }).finally([&, FNAME] { - DEBUG("Closing device."); - return device.close(); - }).safe_then([FNAME] { - DEBUG("Returning from mkfs."); - return mkfs_ertr::now(); - }); - }); - }); -} - -ZNSSegmentManager::mkfs_ret ZNSSegmentManager::shard_mkfs() -{ - LOG_PREFIX(ZNSSegmentManager::shard_mkfs); - INFO("starting, device_path {}", device_path); - return open_device( - device_path, seastar::open_flags::rw - ).safe_then([=, this](auto p) { - device = std::move(p.first); - auto sd = p.second; - return read_metadata(device, sd); - }).safe_then([=, this](auto meta){ - shard_info = meta.shard_infos[seastar::this_shard_id()]; - metadata = meta; - return device.close(); - }).safe_then([FNAME] { - DEBUG("Returning from shard_mkfs."); - return mkfs_ertr::now(); - }); -} - -// Return range of sectors to operate on. -struct blk_zone_range make_range( - segment_id_t id, - size_t segment_size, - size_t first_segment_offset) -{ - return blk_zone_range{ - (id.device_segment_id() * (segment_size >> SECT_SHIFT) - + (first_segment_offset >> SECT_SHIFT)), - (segment_size >> SECT_SHIFT) - }; -} - -using blk_zone_op_ertr = crimson::errorator< - crimson::ct_error::input_output_error>; -using blk_zone_op_ret = blk_zone_op_ertr::future<>; -blk_zone_op_ret blk_zone_op(seastar::file &device, - blk_zone_range &range, - zone_op op) { - LOG_PREFIX(ZNSSegmentManager::blk_zone_op); - - unsigned long ioctl_op = 0; - switch (op) { - using enum zone_op; - case OPEN: - ioctl_op = BLKOPENZONE; - break; - case FINISH: - ioctl_op = BLKFINISHZONE; - break; - case RESET: - ioctl_op = BLKRESETZONE; - break; - case CLOSE: - ioctl_op = BLKCLOSEZONE; - break; - default: - ERROR("Invalid zone operation {}", op); - ceph_assert(ioctl_op); - } - - return device.ioctl( - ioctl_op, - &range - ).then_wrapped([=](auto f) -> blk_zone_op_ret { - if (f.failed()) { - ERROR("{} ioctl failed", op); - return crimson::ct_error::input_output_error::make(); - } else { - int ret = f.get(); - if (ret == 0) { - return seastar::now(); - } else { - ERROR("{} ioctl failed with return code {}", op, ret); - return crimson::ct_error::input_output_error::make(); - } - } - }); -} - -ZNSSegmentManager::open_ertr::future ZNSSegmentManager::open( - segment_id_t id) -{ - LOG_PREFIX(ZNSSegmentManager::open); - return seastar::do_with( - blk_zone_range{}, - [=, this](auto &range) { - range = make_range( - id, - metadata.segment_size, - shard_info.first_segment_offset); - return blk_zone_op( - device, - range, - zone_op::OPEN - ); - } - ).safe_then([=, this] { - DEBUG("segment {}, open successful", id); - return open_ertr::future( - open_ertr::ready_future_marker{}, - SegmentRef(new ZNSSegment(*this, id)) - ); - }); -} - -ZNSSegmentManager::release_ertr::future<> ZNSSegmentManager::release( - segment_id_t id) -{ - LOG_PREFIX(ZNSSegmentManager::release); - DEBUG("Resetting zone/segment {}", id); - return seastar::do_with( - blk_zone_range{}, - [=, this](auto &range) { - range = make_range( - id, - metadata.segment_size, - shard_info.first_segment_offset); - return blk_zone_op( - device, - range, - zone_op::RESET - ); - } - ).safe_then([=] { - DEBUG("segment release successful"); - return release_ertr::now(); - }); -} - -SegmentManager::read_ertr::future<> ZNSSegmentManager::read( - paddr_t addr, - size_t len, - ceph::bufferptr &out) -{ - LOG_PREFIX(ZNSSegmentManager::read); - auto& seg_addr = addr.as_seg_paddr(); - if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) { - ERROR("invalid segment {}", - seg_addr.get_segment_id().device_segment_id()); - return crimson::ct_error::invarg::make(); - } - - if (seg_addr.get_segment_off() + len > metadata.segment_capacity) { - ERROR("invalid read offset {}, len {}", - addr, - len); - return crimson::ct_error::invarg::make(); - } - return do_read( - device, - get_offset(addr), - len, - out); -} - -Segment::close_ertr::future<> ZNSSegmentManager::segment_close( - segment_id_t id, segment_off_t write_pointer) -{ - LOG_PREFIX(ZNSSegmentManager::segment_close); - return seastar::do_with( - blk_zone_range{}, - [=, this](auto &range) { - range = make_range( - id, - metadata.segment_size, - shard_info.first_segment_offset); - return blk_zone_op( - device, - range, - zone_op::FINISH - ); - } - ).safe_then([=] { - DEBUG("zone finish successful"); - return Segment::close_ertr::now(); - }); -} - -Segment::write_ertr::future<> ZNSSegmentManager::segment_write( - paddr_t addr, - ceph::bufferlist bl, - bool ignore_check) -{ - LOG_PREFIX(ZNSSegmentManager::segment_write); - assert(addr.get_device_id() == get_device_id()); - assert((bl.length() % metadata.block_size) == 0); - auto& seg_addr = addr.as_seg_paddr(); - DEBUG("write to segment {} at offset {}, physical offset {}, len {}", - seg_addr.get_segment_id(), - seg_addr.get_segment_off(), - get_offset(addr), - bl.length()); - stats.data_write.increment(bl.length()); - return do_writev( - device, - get_offset(addr), - std::move(bl), - metadata.block_size); -} - -device_id_t ZNSSegmentManager::get_device_id() const -{ - return metadata.device_id; -}; - -secondary_device_set_t& ZNSSegmentManager::get_secondary_devices() -{ - return metadata.secondary_devices; -}; - -magic_t ZNSSegmentManager::get_magic() const -{ - return metadata.magic; -}; - -segment_off_t ZNSSegment::get_write_capacity() const -{ - return manager.get_segment_size(); -} - -SegmentManager::close_ertr::future<> ZNSSegmentManager::close() -{ - if (device) { - return device.close(); - } - return seastar::now(); -} - -Segment::close_ertr::future<> ZNSSegment::close() -{ - return manager.segment_close(id, write_pointer); -} - -Segment::write_ertr::future<> ZNSSegment::write( - segment_off_t offset, ceph::bufferlist bl) -{ - LOG_PREFIX(ZNSSegment::write); - if (offset != write_pointer || offset % manager.metadata.block_size != 0) { - ERROR("Segment offset and zone write pointer mismatch. " - "segment {} segment-offset {} write pointer {}", - id, offset, write_pointer); - return crimson::ct_error::invarg::make(); - } - if (offset + bl.length() > manager.metadata.segment_capacity) { - return crimson::ct_error::enospc::make(); - } - - write_pointer = offset + bl.length(); - return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl); -} - -Segment::write_ertr::future<> ZNSSegment::write_padding_bytes( - size_t padding_bytes) -{ - LOG_PREFIX(ZNSSegment::write_padding_bytes); - DEBUG("Writing {} padding bytes to segment {} at wp {}", - padding_bytes, id, write_pointer); - - return crimson::repeat([FNAME, padding_bytes, this] () mutable { - size_t bufsize = 0; - if (padding_bytes >= MAX_PADDING_SIZE) { - bufsize = MAX_PADDING_SIZE; - } else { - bufsize = padding_bytes; - } - - padding_bytes -= bufsize; - bufferptr bp(ceph::buffer::create_page_aligned(bufsize)); - bp.zero(); - bufferlist padd_bl; - padd_bl.append(bp); - return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() { - if (padding_bytes == 0) { - return write_ertr::make_ready_future(seastar::stop_iteration::yes); - } else { - return write_ertr::make_ready_future(seastar::stop_iteration::no); - } - }); - }); -} - -// Advance write pointer, to given offset. -Segment::write_ertr::future<> ZNSSegment::advance_wp( - segment_off_t offset) -{ - LOG_PREFIX(ZNSSegment::advance_wp); - - DEBUG("Advancing write pointer from {} to {}", write_pointer, offset); - if (offset < write_pointer) { - return crimson::ct_error::invarg::make(); - } - - size_t padding_bytes = offset - write_pointer; - - if (padding_bytes == 0) { - return write_ertr::now(); - } - - assert(padding_bytes % manager.metadata.block_size == 0); - - return write_padding_bytes(padding_bytes); -} - -} diff --git a/src/crimson/os/seastore/segment_manager/zns.h b/src/crimson/os/seastore/segment_manager/zns.h deleted file mode 100644 index b98ff1c89f4..00000000000 --- a/src/crimson/os/seastore/segment_manager/zns.h +++ /dev/null @@ -1,246 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#pragma once - -#include - -#include -#include - -#include -#include -#include - -#include "crimson/common/layout.h" - -#include "crimson/os/seastore/segment_manager.h" - -#include "include/uuid.h" - -namespace crimson::os::seastore::segment_manager::zns { - - struct zns_shard_info_t { - size_t size = 0; - size_t segments = 0; - size_t first_segment_offset = 0; - - DENC(zns_shard_info_t, v, p) { - DENC_START(1, 1, p); - denc(v.size, p); - denc(v.segments, p); - denc(v.first_segment_offset, p); - DENC_FINISH(p); - } - }; - - struct zns_sm_metadata_t { - unsigned int shard_num = 0; - size_t segment_size = 0; - size_t segment_capacity = 0; - size_t zones_per_segment = 0; - size_t zone_capacity = 0; - size_t block_size = 0; - size_t zone_size = 0; - - std::vector shard_infos; - - seastore_meta_t meta; - - bool major_dev = false; - magic_t magic = 0; - device_type_t dtype = device_type_t::NONE; - device_id_t device_id = 0; - secondary_device_set_t secondary_devices; - - DENC(zns_sm_metadata_t, v, p) { - DENC_START(1, 1, p); - denc(v.shard_num, p); - denc(v.segment_size, p); - denc(v.segment_capacity, p); - denc(v.zones_per_segment, p); - denc(v.zone_capacity, p); - denc(v.block_size, p); - denc(v.zone_size, p); - denc(v.shard_infos, p); - denc(v.meta, p); - denc(v.magic, p); - denc(v.dtype, p); - denc(v.device_id, p); - if (v.major_dev) { - denc(v.secondary_devices, p); - } - DENC_FINISH(p); - } - - void validate() const { - ceph_assert_always(shard_num == seastar::smp::count); - for (unsigned int i = 0; i < seastar::smp::count; i++) { - ceph_assert_always(shard_infos[i].size > 0); - ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX); - ceph_assert_always(shard_infos[i].segments > 0); - ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX); - } - ceph_assert_always(segment_capacity > 0); - ceph_assert_always(segment_capacity <= SEGMENT_OFF_MAX); - } - }; - - using write_ertr = crimson::errorator; - using read_ertr = crimson::errorator; - - enum class zone_op { - OPEN, - FINISH, - CLOSE, - RESET, - }; - - class ZNSSegmentManager; - - class ZNSSegment final : public Segment { - public: - ZNSSegment(ZNSSegmentManager &man, segment_id_t i) : manager(man), id(i){}; - - segment_id_t get_segment_id() const final { return id; } - segment_off_t get_write_capacity() const final; - segment_off_t get_write_ptr() const final { return write_pointer; } - close_ertr::future<> close() final; - write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final; - write_ertr::future<> advance_wp(segment_off_t offset) final; - - ~ZNSSegment() {} - private: - friend class ZNSSegmentManager; - ZNSSegmentManager &manager; - const segment_id_t id; - segment_off_t write_pointer = 0; - write_ertr::future<> write_padding_bytes(size_t padding_bytes); - }; - - class ZNSSegmentManager final : public SegmentManager{ - // interfaces used by Device - public: - seastar::future<> start() { - return shard_devices.start(device_path); - } - - seastar::future<> stop() { - return shard_devices.stop(); - } - - Device& get_sharded_device() final { - return shard_devices.local(); - } - - mount_ret mount() final; - mkfs_ret mkfs(device_config_t meta) final; - - ZNSSegmentManager(const std::string &path) : device_path(path) {} - - ~ZNSSegmentManager() final = default; - - //interfaces used by each shard device - public: - open_ertr::future open(segment_id_t id) final; - close_ertr::future<> close() final; - - release_ertr::future<> release(segment_id_t id) final; - - read_ertr::future<> read( - paddr_t addr, - size_t len, - ceph::bufferptr &out) final; - - device_type_t get_device_type() const final { - return device_type_t::ZNS; - } - - size_t get_available_size() const final { - return shard_info.size; - }; - - extent_len_t get_block_size() const final { - return metadata.block_size; - }; - - segment_off_t get_segment_size() const final { - return metadata.segment_capacity; - }; - - const seastore_meta_t &get_meta() const { - return metadata.meta; - }; - - device_id_t get_device_id() const final; - - secondary_device_set_t& get_secondary_devices() final; - - magic_t get_magic() const final; - - Segment::write_ertr::future<> segment_write( - paddr_t addr, - ceph::bufferlist bl, - bool ignore_check=false); - - private: - friend class ZNSSegment; - std::string device_path; - zns_shard_info_t shard_info; - zns_sm_metadata_t metadata; - seastar::file device; - uint32_t nr_zones; - struct effort_t { - uint64_t num = 0; - uint64_t bytes = 0; - - void increment(uint64_t read_bytes) { - ++num; - bytes += read_bytes; - } - }; - - struct zns_sm_stats { - effort_t data_read = {}; - effort_t data_write = {}; - effort_t metadata_write = {}; - uint64_t opened_segments = 0; - uint64_t closed_segments = 0; - uint64_t closed_segments_unused_bytes = 0; - uint64_t released_segments = 0; - - void reset() { - *this = zns_sm_stats{}; - } - } stats; - - void register_metrics(); - seastar::metrics::metric_group metrics; - - Segment::close_ertr::future<> segment_close( - segment_id_t id, segment_off_t write_pointer); - - uint64_t get_offset(paddr_t addr) { - auto& seg_addr = addr.as_seg_paddr(); - return (shard_info.first_segment_offset + - (seg_addr.get_segment_id().device_segment_id() * - metadata.segment_size)) + seg_addr.get_segment_off(); - } - private: - // shard 0 mkfs - mkfs_ret primary_mkfs(device_config_t meta); - // all shards mkfs - mkfs_ret shard_mkfs(); - - mount_ret shard_mount(); - - seastar::sharded shard_devices; - }; - -} - -WRITE_CLASS_DENC_BOUNDED( - crimson::os::seastore::segment_manager::zns::zns_shard_info_t -) -WRITE_CLASS_DENC_BOUNDED( - crimson::os::seastore::segment_manager::zns::zns_sm_metadata_t -)