if(WITH_ZNS)
find_package(LinuxZNS REQUIRED)
list(APPEND crimson_seastore_srcs
- segment_manager/zns.cc)
+ segment_manager/zbd.cc)
endif()
add_library(crimson-seastore STATIC
if (type == "SSD") {
return device_type_t::SSD;
}
- if (type == "ZNS") {
- return device_type_t::ZNS;
+ if (type == "ZBD") {
+ return device_type_t::ZBD;
}
if (type == "RANDOM_BLOCK_SSD") {
return device_type_t::RANDOM_BLOCK_SSD;
return out << "HDD";
case device_type_t::SSD:
return out << "SSD";
- case device_type_t::ZNS:
- return out << "ZNS";
+ case device_type_t::ZBD:
+ return out << "ZBD";
case device_type_t::EPHEMERAL_COLD:
return out << "EPHEMERAL_COLD";
case device_type_t::EPHEMERAL_MAIN:
NONE = 0,
HDD,
SSD,
- ZNS,
+ ZBD,
EPHEMERAL_COLD,
EPHEMERAL_MAIN,
RANDOM_BLOCK_SSD,
device_type_t string_to_device_type(std::string type);
enum class backend_type_t {
- SEGMENTED, // SegmentManager: SSD, ZNS, HDD
+ SEGMENTED, // SegmentManager: SSD, ZBD, HDD
RANDOM_BLOCK // RBMDevice: RANDOM_BLOCK_SSD
};
#include "crimson/os/seastore/logging.h"
#ifdef HAVE_ZNS
-#include "crimson/os/seastore/segment_manager/zns.h"
+#include "crimson/os/seastore/segment_manager/zbd.h"
SET_SUBSYS(seastore_device);
#endif
INFO("Found {} zones.", nr_zones);
if (nr_zones != 0) {
return std::make_unique<
- segment_manager::zns::ZNSSegmentManager
+ segment_manager::zbd::ZBDSegmentManager
>(device + "/block");
} else {
return std::make_unique<
* advance_wp
*
* advance the segment write pointer,
- * needed when writing at wp is strictly implemented. ex: ZNS backed segments
+ * needed when writing at wp is strictly implemented. ex: ZBD backed segments
* @param offset: advance write pointer till the given offset
*/
virtual write_ertr::future<> advance_wp(
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+#include <linux/blkzoned.h>
+
+#include <fmt/format.h>
+#include "crimson/os/seastore/segment_manager/zbd.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/os/seastore/logging.h"
+#include "include/buffer.h"
+
+SET_SUBSYS(seastore_device);
+
+#define SECT_SHIFT 9
+#define RESERVED_ZONES 1
+// limit the max padding buf size to 1MB
+#define MAX_PADDING_SIZE 1048576
+
+using z_op = crimson::os::seastore::segment_manager::zbd::zone_op;
+/// fmt formatter that prints a zone_op as the name of the Linux zoned block
+/// device ioctl it maps to (e.g. zone_op::OPEN -> "BLKOPENZONE").
+/// Values without a matching case are printed as "Unknown".
+template <> struct fmt::formatter<z_op>: fmt::formatter<std::string_view> {
+  // format() is const-qualified: newer fmt releases invoke it on a const
+  // formatter object, and a const member is accepted by older releases too.
+  template <typename FormatContext>
+  auto format(z_op s, FormatContext& ctx) const {
+    std::string_view name = "Unknown";
+    switch (s) {
+      using enum z_op;
+    case OPEN:
+      name = "BLKOPENZONE";
+      break;
+    case FINISH:
+      name = "BLKFINISHZONE";
+      break;
+    case CLOSE:
+      name = "BLKCLOSEZONE";
+      break;
+    case RESET:
+      name = "BLKRESETZONE";
+      break;
+    }
+    return formatter<string_view>::format(name, ctx);
+  }
+};
+
+namespace crimson::os::seastore::segment_manager::zbd {
+
+/// Result of open_device(): the opened file paired with the stat_data
+/// obtained for the same path.
+using open_device_ret = ZBDSegmentManager::access_ertr::future<
+  std::pair<seastar::file, seastar::stat_data>>;
+/// Stat `path` (following symlinks), then open it for DMA I/O with `mode`.
+///
+/// Any exception raised by either step is logged and converted into
+/// input_output_error.
+///
+/// @param path device path; captured by reference by the first
+///             continuation, so the string must stay alive until the stat
+///             has completed
+/// @param mode flags passed to seastar::open_file_dma
+static open_device_ret open_device(
+  const std::string &path,
+  seastar::open_flags mode)
+{
+  LOG_PREFIX(ZBDSegmentManager::open_device);
+  return seastar::file_stat(
+    path, seastar::follow_symlink::yes
+  ).then([FNAME, mode, &path](auto stat) mutable {
+    // The inner lambda captures by copy, so it keeps its own copies of
+    // `path` and `stat` while the open is in flight.
+    return seastar::open_file_dma(path, mode).then([=](auto file) {
+      DEBUG("open of device {} successful, size {}",
+        path,
+        stat.size);
+      return std::make_pair(file, stat);
+    });
+  }).handle_exception(
+    [FNAME](auto e) -> open_device_ret {
+      ERROR("got error {}",
+        e);
+      return crimson::ct_error::input_output_error::make();
+    }
+  );
+}
+
+/// Build the segment manager metadata from the device geometry.
+///
+/// One segment currently maps onto exactly one zone. The first
+/// RESERVED_ZONES zones are skipped and the remaining segments are divided
+/// evenly between the seastar shards; each shard gets a contiguous range.
+///
+/// @param total_size            device size in bytes
+/// @param meta                  seastore metadata stored verbatim in the result
+/// @param data                  stat data of the device (block size is taken from it)
+/// @param zone_size_sectors     zone size in 512-byte sectors
+/// @param zone_capacity_sectors zone capacity in 512-byte sectors
+/// @param num_zones             number of zones on the device
+/// @return the validated metadata
+static zbd_sm_metadata_t make_metadata(
+  uint64_t total_size,
+  seastore_meta_t meta,
+  const seastar::stat_data &data,
+  size_t zone_size_sectors,
+  size_t zone_capacity_sectors,
+  size_t num_zones)
+{
+  LOG_PREFIX(ZBDSegmentManager::make_metadata);
+
+  // TODO: support Option::size_t seastore_segment_size
+  // to allow zones_per_segment > 1 with striping.
+  const size_t zone_bytes = zone_size_sectors << SECT_SHIFT;
+  const size_t zone_cap_bytes = zone_capacity_sectors << SECT_SHIFT;
+  const size_t seg_bytes = zone_bytes;
+  const size_t zones_in_seg = seg_bytes / zone_bytes;
+  const size_t total_segs = (num_zones - RESERVED_ZONES) / zones_in_seg;
+  const size_t segs_per_shard = total_segs / seastar::smp::count;
+
+  // Each shard owns segs_per_shard consecutive segments, laid out shard
+  // after shard behind the reserved zones.
+  std::vector<zbd_shard_info_t> shard_infos(seastar::smp::count);
+  unsigned int shard = 0;
+  for (auto &info : shard_infos) {
+    info.size = zone_cap_bytes * segs_per_shard;
+    info.segments = segs_per_shard;
+    info.first_segment_offset = zone_bytes * RESERVED_ZONES
+      + shard * seg_bytes * segs_per_shard;
+    ++shard;
+  }
+
+  assert(total_size == num_zones * zone_bytes);
+
+  WARN("Ignoring configuration values for device and segment size");
+  INFO(
+    "device size {}, available_size {}, block_size {}, allocated_size {},"
+    " total zones {}, zone_size {}, zone_capacity {},"
+    " total segments {}, zones per segment {}, segment size {}",
+    total_size,
+    zone_cap_bytes * total_segs,
+    data.block_size,
+    data.allocated_size,
+    num_zones,
+    zone_bytes,
+    zone_cap_bytes,
+    total_segs,
+    zones_in_seg,
+    zone_cap_bytes * zones_in_seg);
+
+  // Initializers follow the member order of zbd_sm_metadata_t.
+  zbd_sm_metadata_t ret{
+    seastar::smp::count,
+    seg_bytes,
+    zone_cap_bytes * zones_in_seg,
+    zones_in_seg,
+    zone_cap_bytes,
+    data.block_size,
+    zone_bytes,
+    shard_infos,
+    meta};
+  ret.validate();
+  return ret;
+}
+
+/// Owning wrapper around a heap buffer laid out as a blk_zone_report header
+/// followed by room for `nr_zones` blk_zone entries, as passed to the
+/// BLKREPORTZONE ioctl. Move-only; the buffer is released on destruction.
+struct ZoneReport {
+  struct blk_zone_report *hdr;
+  /// Allocate a zero-initialized report with room for nr_zones entries.
+  explicit ZoneReport(int nr_zones)
+    : hdr(static_cast<blk_zone_report *>(calloc(
+        1,
+        sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone)))) {
+    // Fail fast on allocation failure instead of dereferencing nullptr later.
+    ceph_assert(hdr);
+  }
+  ~ZoneReport(){
+    // free(nullptr) is a no-op, so a moved-from object is safe to destroy.
+    free(hdr);
+  }
+  ZoneReport(const ZoneReport &) = delete;
+  ZoneReport &operator=(const ZoneReport &) = delete;
+  ZoneReport(ZoneReport &&rhs) noexcept : hdr(rhs.hdr) {
+    rhs.hdr = nullptr;
+  }
+  ZoneReport &operator=(ZoneReport &&) = delete;
+};
+
+/// Query the size of the block device in bytes.
+///
+/// Uses BLKGETSIZE64, which stores the size in bytes into a 64-bit integer;
+/// this matches the uint64_t storage handed to the ioctl (BLKGETSIZE writes
+/// an unsigned long sector count instead).
+///
+/// @param device opened block device
+/// @return device size in bytes; asserts that it is non-zero
+static seastar::future<size_t> get_blk_dev_size(
+  seastar::file &device)
+{
+  return seastar::do_with(
+    (uint64_t)0,
+    [&device](auto& size_bytes) {
+      return device.ioctl(
+        BLKGETSIZE64,
+        (void *)&size_bytes
+      ).then([&size_bytes](int) {
+        ceph_assert(size_bytes);
+        return seastar::make_ready_future<size_t>(size_bytes);
+      });
+    });
+}
+
+/// Issue a single BLKRESETZONE over the sector range
+/// [0, zone_size_sects * nr_zones).
+///
+/// @param device          opened zoned block device
+/// @param zone_size_sects zone size, in 512B sectors
+/// @param nr_zones        number of zones covered by the reset
+static seastar::future<> reset_device(
+  seastar::file &device,
+  uint64_t zone_size_sects,
+  uint64_t nr_zones)
+{
+  // do_with keeps the range alive until the ioctl has completed.
+  return seastar::do_with(
+    blk_zone_range{},
+    [&device, nr_zones, zone_size_sects](auto &whole_dev) {
+      whole_dev.sector = 0;
+      whole_dev.nr_sectors = zone_size_sects * nr_zones;
+      return device.ioctl(BLKRESETZONE, &whole_dev).then([](int) {
+        return seastar::now();
+      });
+    });
+}
+
+/// Fetch a zone report for `nr_zones` zones starting at sector 0 and return
+/// the `capacity` field of the first reported zone.
+///
+/// @param device   opened zoned block device
+/// @param nr_zones number of zone entries requested from BLKREPORTZONE
+static seastar::future<size_t> get_zone_capacity(
+  seastar::file &device,
+  uint32_t nr_zones)
+{
+  // do_with owns the report buffer for the duration of the ioctl.
+  return seastar::do_with(
+    ZoneReport(nr_zones),
+    [&device, nr_zones](auto &report) {
+      report.hdr->sector = 0;
+      report.hdr->nr_zones = nr_zones;
+      return device.ioctl(BLKREPORTZONE, report.hdr).then([&report](int) {
+        // Only the first zone entry is consulted.
+        return seastar::make_ready_future<size_t>(
+          report.hdr->zones[0].capacity);
+      });
+    });
+}
+
+/// Write the whole of `bptr` to `device` at byte `offset` with one DMA write.
+///
+/// An exception from the write, or a write shorter than bptr.length(),
+/// is reported as input_output_error.
+static write_ertr::future<> do_write(
+  seastar::file &device,
+  uint64_t offset,
+  bufferptr &bptr)
+{
+  LOG_PREFIX(ZBDSegmentManager::do_write);
+  DEBUG("offset {} len {}",
+    offset,
+    bptr.length());
+
+  auto on_failure = [FNAME](auto e) -> write_ertr::future<size_t> {
+    ERROR("dma_write got error {}",
+      e);
+    return crimson::ct_error::input_output_error::make();
+  };
+  // The expected length is captured up front, before the write completes.
+  auto check_len = [length = bptr.length()](auto nwritten)
+    -> write_ertr::future<> {
+    if (nwritten == length) {
+      return write_ertr::now();
+    }
+    return crimson::ct_error::input_output_error::make();
+  };
+
+  return device.dma_write(offset, bptr.c_str(), bptr.length())
+    .handle_exception(std::move(on_failure))
+    .then(std::move(check_len));
+}
+
+/// Write the contents of `bl` to `device` at byte `offset` as one vectored
+/// DMA write.
+///
+/// The buffer list is first rebuilt so each buffer is aligned to
+/// `block_size`. An exception from the write, or a write shorter than
+/// bl.length(), is reported as input_output_error.
+///
+/// @param device     opened device
+/// @param offset     byte offset to write at
+/// @param bl         data to write; ownership is taken and kept until the
+///                   write has completed
+/// @param block_size alignment used when rebuilding `bl`
+static write_ertr::future<> do_writev(
+  seastar::file &device,
+  uint64_t offset,
+  bufferlist&& bl,
+  size_t block_size)
+{
+  LOG_PREFIX(ZBDSegmentManager::do_writev);
+  DEBUG("offset {} len {}",
+    offset,
+    bl.length());
+  // writev requires each buffer to be aligned to the disks' block
+  // size, we need to rebuild here
+  bl.rebuild_aligned(block_size);
+
+  // The iovec entries are built from bl here; bl itself is only moved
+  // into the final continuation after dma_write() has been issued.
+  std::vector<iovec> iov;
+  bl.prepare_iov(&iov);
+  return device.dma_write(
+    offset,
+    std::move(iov)
+  ).handle_exception(
+    [FNAME](auto e) -> write_ertr::future<size_t> {
+      ERROR("dma_write got error {}",
+        e);
+      return crimson::ct_error::input_output_error::make();
+    }
+  ).then([bl=std::move(bl)/* hold the buf until the end of io */](size_t written)
+    -> write_ertr::future<> {
+    // A short write is treated as an I/O error.
+    if (written != bl.length()) {
+      return crimson::ct_error::input_output_error::make();
+    }
+    return write_ertr::now();
+  });
+}
+
+/// Encode `sb` and write it as one block of sb.block_size bytes at offset 0
+/// of `device`.
+///
+/// The block buffer is zeroed before the encoded metadata is copied in, so
+/// the bytes behind the encoding are deterministic rather than leftover heap
+/// content.
+///
+/// @param device opened device to write to
+/// @param sb     metadata to persist; its encoding must be smaller than
+///               sb.block_size (asserted)
+static ZBDSegmentManager::access_ertr::future<>
+write_metadata(seastar::file &device, zbd_sm_metadata_t sb)
+{
+  assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() <
+    sb.block_size);
+  // do_with keeps the block buffer alive until the write has completed.
+  return seastar::do_with(
+    bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
+    [=, &device](auto &bp) {
+      LOG_PREFIX(ZBDSegmentManager::write_metadata);
+      DEBUG("block_size {}", sb.block_size);
+      // create_page_aligned() is not relied on to clear the buffer.
+      bp.zero();
+      bufferlist bl;
+      encode(sb, bl);
+      auto iter = bl.begin();
+      assert(bl.length() < sb.block_size);
+      DEBUG("buffer length {}", bl.length());
+      iter.copy(bl.length(), bp.c_str());
+      DEBUG("doing writeout");
+      return do_write(device, 0, bp);
+    });
+}
+
+/// Read `len` bytes from `device` at byte `offset` into `bptr` with one DMA
+/// read.
+///
+/// `bptr` must be at least `len` bytes long (asserted). An exception from
+/// the read, or a read that does not return exactly `len` bytes, is
+/// reported as input_output_error.
+static read_ertr::future<> do_read(
+  seastar::file &device,
+  uint64_t offset,
+  size_t len,
+  bufferptr &bptr)
+{
+  LOG_PREFIX(ZBDSegmentManager::do_read);
+  assert(len <= bptr.length());
+  DEBUG("offset {} len {}",
+    offset,
+    len);
+
+  auto on_failure = [FNAME](auto e) -> read_ertr::future<size_t> {
+    ERROR("dma_read got error {}",
+      e);
+    return crimson::ct_error::input_output_error::make();
+  };
+  auto check_len = [len](auto nread) -> read_ertr::future<> {
+    if (nread == len) {
+      return read_ertr::now();
+    }
+    return crimson::ct_error::input_output_error::make();
+  };
+
+  return device.dma_read(offset, bptr.c_str(), len)
+    .handle_exception(std::move(on_failure))
+    .then(std::move(check_len));
+}
+
+/// Read one block of sd.block_size bytes from offset 0 of `device`, decode
+/// it as zbd_sm_metadata_t and validate the result.
+///
+/// @param device opened device to read from
+/// @param sd     stat data of the device; only block_size is used
+static
+ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t>
+read_metadata(seastar::file &device, seastar::stat_data sd)
+{
+  assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() <
+    sd.block_size);
+  using metadata_fut =
+    ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t>;
+  // do_with keeps the block buffer alive across the read and the decode.
+  return seastar::do_with(
+    bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
+    [&device](auto &block) {
+      return do_read(device, 0, block.length(), block).safe_then([&block] {
+        bufferlist bl;
+        bl.push_back(block);
+        zbd_sm_metadata_t ret;
+        auto bliter = bl.cbegin();
+        decode(ret, bliter);
+        ret.validate();
+        return metadata_fut(
+          ZBDSegmentManager::access_ertr::ready_future_marker{},
+          ret);
+      });
+    });
+}
+
+/// Mount the device on every shard by running shard_mount() on each sharded
+/// instance. Any error from shard_mount() trips an assertion.
+ZBDSegmentManager::mount_ret ZBDSegmentManager::mount()
+{
+  auto mount_one = [](auto &shard) {
+    return shard.shard_mount().handle_error(
+      crimson::ct_error::assert_all{
+        "Invalid error in ZBDSegmentManager::mount"
+      });
+  };
+  return shard_devices.invoke_on_all(std::move(mount_one));
+}
+
+/// Per-shard mount: open the device read/write, load the on-disk metadata
+/// and remember both the metadata and this shard's slice of it.
+/// The device is left open.
+ZBDSegmentManager::mount_ret ZBDSegmentManager::shard_mount()
+{
+  return open_device(
+    device_path, seastar::open_flags::rw
+  ).safe_then([this](auto opened) {
+    device = std::move(opened.first);
+    return read_metadata(device, opened.second);
+  }).safe_then([this](auto loaded) {
+    metadata = loaded;
+    shard_info = metadata.shard_infos[seastar::this_shard_id()];
+    return mount_ertr::now();
+  });
+}
+
+/// Format the device: run primary_mkfs() on the local sharded instance,
+/// then run shard_mkfs() on every shard. Any error from shard_mkfs() trips
+/// an assertion.
+ZBDSegmentManager::mkfs_ret ZBDSegmentManager::mkfs(
+  device_config_t config)
+{
+  auto mkfs_one = [](auto &shard) {
+    return shard.shard_mkfs().handle_error(
+      crimson::ct_error::assert_all{
+        "Invalid error in ZBDSegmentManager::mkfs"
+      });
+  };
+  return shard_devices.local().primary_mkfs(config
+  ).safe_then([this, mkfs_one] {
+    return shard_devices.invoke_on_all(mkfs_one);
+  });
+}
+
+/// Format step that writes the metadata block.
+///
+/// Opens the device, queries the zone count and zone size, resets all zones,
+/// queries the device size and zone capacity, builds the metadata, writes it
+/// to the start of the device and closes the device again.
+///
+/// @param config device configuration; only config.meta is used here
+ZBDSegmentManager::mkfs_ret ZBDSegmentManager::primary_mkfs(
+  device_config_t config)
+{
+  LOG_PREFIX(ZBDSegmentManager::primary_mkfs);
+  INFO("starting, device_path {}", device_path);
+  // BLKGETZONESZ and BLKGETNRZONES both store a 32-bit value, so the
+  // receiving variables are uint32_t. With wider storage the kernel would
+  // fill only part of the object, which yields a wrong value on big-endian
+  // hosts.
+  return seastar::do_with(
+    seastar::file{},
+    seastar::stat_data{},
+    zbd_sm_metadata_t{},
+    uint32_t(0),
+    uint32_t(0),
+    size_t(),
+    [=, this](auto &device, auto &stat, auto &sb, auto &zone_size_sects, auto &nr_zones, auto &size) {
+      return open_device(
+        device_path,
+        seastar::open_flags::rw
+      ).safe_then([=, this, &device, &stat, &sb, &zone_size_sects, &nr_zones, &size](auto p) {
+        device = p.first;
+        stat = p.second;
+        return device.ioctl(
+          BLKGETNRZONES,
+          (void *)&nr_zones
+        ).then([&](int ret) {
+          // A device reporting no zones is not a zoned block device.
+          if (nr_zones == 0) {
+            return seastar::make_exception_future<int>(
+              std::system_error(std::make_error_code(std::errc::io_error)));
+          }
+          return device.ioctl(BLKGETZONESZ, (void *)&zone_size_sects);
+        }).then([&](int ret) {
+          ceph_assert(zone_size_sects);
+          return reset_device(device, zone_size_sects, nr_zones);
+        }).then([&] {
+          return get_blk_dev_size(device);
+        }).then([&](auto devsize) {
+          size = devsize;
+          return get_zone_capacity(device, nr_zones);
+        }).then([&, FNAME, config](auto zone_capacity_sects) {
+          ceph_assert(zone_capacity_sects);
+          DEBUG("zone_size in sectors {}, zone_capacity in sectors {}",
+            zone_size_sects, zone_capacity_sects);
+          sb = make_metadata(
+            size,
+            config.meta,
+            stat,
+            zone_size_sects,
+            zone_capacity_sects,
+            nr_zones);
+          metadata = sb;
+          stats.metadata_write.increment(
+            ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>());
+          DEBUG("Wrote to stats.");
+          return write_metadata(device, sb);
+        }).finally([&, FNAME] {
+          // The device is closed whether or not the steps above succeeded.
+          DEBUG("Closing device.");
+          return device.close();
+        }).safe_then([FNAME] {
+          DEBUG("Returning from mkfs.");
+          return mkfs_ertr::now();
+        });
+      });
+    });
+}
+
+/// Per-shard format step: open the device, load the metadata block, cache
+/// it together with this shard's slice, and close the device again.
+ZBDSegmentManager::mkfs_ret ZBDSegmentManager::shard_mkfs()
+{
+  LOG_PREFIX(ZBDSegmentManager::shard_mkfs);
+  INFO("starting, device_path {}", device_path);
+  return open_device(
+    device_path, seastar::open_flags::rw
+  ).safe_then([this](auto opened) {
+    device = std::move(opened.first);
+    return read_metadata(device, opened.second);
+  }).safe_then([this](auto loaded) {
+    metadata = loaded;
+    shard_info = metadata.shard_infos[seastar::this_shard_id()];
+    return device.close();
+  }).safe_then([FNAME] {
+    DEBUG("Returning from shard_mkfs.");
+    return mkfs_ertr::now();
+  });
+}
+
+/// Return the range of sectors to operate on for segment `id`.
+///
+/// @param id                   segment whose sector range is wanted
+/// @param segment_size         segment size in bytes
+/// @param first_segment_offset byte offset of the first segment
+/// @return range starting at the segment's first sector and spanning one
+///         segment, both expressed in 512-byte sectors
+struct blk_zone_range make_range(
+  segment_id_t id,
+  size_t segment_size,
+  size_t first_segment_offset)
+{
+  const auto sectors_per_segment = segment_size >> SECT_SHIFT;
+  const auto first_segment_sector = first_segment_offset >> SECT_SHIFT;
+  return blk_zone_range{
+    (id.device_segment_id() * sectors_per_segment
+     + first_segment_sector),
+    sectors_per_segment
+  };
+}
+
+using blk_zone_op_ertr = crimson::errorator<
+  crimson::ct_error::input_output_error>;
+using blk_zone_op_ret = blk_zone_op_ertr::future<>;
+/// Run the zone management ioctl selected by `op` on `range`.
+///
+/// @param device opened zoned block device
+/// @param range  sector range handed to the ioctl; must stay alive until the
+///               returned future resolves
+/// @param op     zone operation to perform
+/// @return ready future on success; input_output_error if the ioctl raises
+///         an exception or returns a non-zero value
+blk_zone_op_ret blk_zone_op(seastar::file &device,
+  blk_zone_range &range,
+  zone_op op) {
+  LOG_PREFIX(ZBDSegmentManager::blk_zone_op);
+
+  unsigned long ioctl_op = 0;
+  switch (op) {
+    using enum zone_op;
+  case OPEN:
+    ioctl_op = BLKOPENZONE;
+    break;
+  case FINISH:
+    ioctl_op = BLKFINISHZONE;
+    break;
+  case RESET:
+    ioctl_op = BLKRESETZONE;
+    break;
+  case CLOSE:
+    ioctl_op = BLKCLOSEZONE;
+    break;
+  default:
+    // ioctl_op is still 0 here, so the assertion below always fires.
+    ERROR("Invalid zone operation {}", op);
+    ceph_assert(ioctl_op);
+  }
+
+  return device.ioctl(
+    ioctl_op,
+    &range
+  ).then_wrapped([=](auto f) -> blk_zone_op_ret {
+    if (f.failed()) {
+      // Take the exception out of the future: a failed future destroyed
+      // without being consumed is reported by seastar as ignored, and the
+      // cause would otherwise be missing from the log.
+      auto eptr = f.get_exception();
+      ERROR("{} ioctl failed: {}", op, eptr);
+      return crimson::ct_error::input_output_error::make();
+    } else {
+      int ret = f.get();
+      if (ret == 0) {
+        return seastar::now();
+      } else {
+        ERROR("{} ioctl failed with return code {}", op, ret);
+        return crimson::ct_error::input_output_error::make();
+      }
+    }
+  });
+}
+
+/// Open the zone backing segment `id` and hand back a new ZBDSegment for it.
+ZBDSegmentManager::open_ertr::future<SegmentRef> ZBDSegmentManager::open(
+  segment_id_t id)
+{
+  LOG_PREFIX(ZBDSegmentManager::open);
+  // do_with keeps the sector range alive until the ioctl has completed.
+  return seastar::do_with(
+    make_range(id, metadata.segment_size, shard_info.first_segment_offset),
+    [this](auto &zone_range) {
+      return blk_zone_op(device, zone_range, zone_op::OPEN);
+    }
+  ).safe_then([FNAME, id, this] {
+    DEBUG("segment {}, open successful", id);
+    return open_ertr::future<SegmentRef>(
+      open_ertr::ready_future_marker{},
+      SegmentRef(new ZBDSegment(*this, id)));
+  });
+}
+
+/// Release segment `id` by resetting the zone that backs it.
+ZBDSegmentManager::release_ertr::future<> ZBDSegmentManager::release(
+  segment_id_t id)
+{
+  LOG_PREFIX(ZBDSegmentManager::release);
+  DEBUG("Resetting zone/segment {}", id);
+  // do_with keeps the sector range alive until the ioctl has completed.
+  return seastar::do_with(
+    make_range(id, metadata.segment_size, shard_info.first_segment_offset),
+    [this](auto &zone_range) {
+      return blk_zone_op(device, zone_range, zone_op::RESET);
+    }
+  ).safe_then([FNAME] {
+    DEBUG("segment release successful");
+    return release_ertr::now();
+  });
+}
+
+/// Read `len` bytes starting at `addr` into `out`.
+///
+/// @return invarg if the segment id is not below get_num_segments() or if
+///         the read would run past the segment capacity; otherwise the
+///         result of the device read
+SegmentManager::read_ertr::future<> ZBDSegmentManager::read(
+  paddr_t addr,
+  size_t len,
+  ceph::bufferptr &out)
+{
+  LOG_PREFIX(ZBDSegmentManager::read);
+  auto& seg_addr = addr.as_seg_paddr();
+
+  const auto seg_index = seg_addr.get_segment_id().device_segment_id();
+  if (seg_index >= get_num_segments()) {
+    ERROR("invalid segment {}",
+      seg_index);
+    return crimson::ct_error::invarg::make();
+  }
+
+  const bool past_capacity =
+    seg_addr.get_segment_off() + len > metadata.segment_capacity;
+  if (past_capacity) {
+    ERROR("invalid read offset {}, len {}",
+      addr,
+      len);
+    return crimson::ct_error::invarg::make();
+  }
+
+  return do_read(device, get_offset(addr), len, out);
+}
+
+/// Close segment `id` by issuing a zone FINISH on its sector range.
+/// `write_pointer` is accepted but not used here.
+Segment::close_ertr::future<> ZBDSegmentManager::segment_close(
+  segment_id_t id, segment_off_t write_pointer)
+{
+  LOG_PREFIX(ZBDSegmentManager::segment_close);
+  // do_with keeps the sector range alive until the ioctl has completed.
+  return seastar::do_with(
+    make_range(id, metadata.segment_size, shard_info.first_segment_offset),
+    [this](auto &zone_range) {
+      return blk_zone_op(device, zone_range, zone_op::FINISH);
+    }
+  ).safe_then([FNAME] {
+    DEBUG("zone finish successful");
+    return Segment::close_ertr::now();
+  });
+}
+
+/// Write `bl` to the device at the physical offset that `addr` maps to and
+/// account the bytes in the data_write statistics.
+///
+/// `addr` must belong to this device and bl.length() must be a multiple of
+/// the block size (both asserted). `ignore_check` is not used here.
+Segment::write_ertr::future<> ZBDSegmentManager::segment_write(
+  paddr_t addr,
+  ceph::bufferlist bl,
+  bool ignore_check)
+{
+  LOG_PREFIX(ZBDSegmentManager::segment_write);
+  assert(addr.get_device_id() == get_device_id());
+  assert((bl.length() % metadata.block_size) == 0);
+
+  auto& seg_addr = addr.as_seg_paddr();
+  const auto device_offset = get_offset(addr);
+  DEBUG("write to segment {} at offset {}, physical offset {}, len {}",
+    seg_addr.get_segment_id(),
+    seg_addr.get_segment_off(),
+    device_offset,
+    bl.length());
+
+  // Account the write before bl is moved into do_writev.
+  stats.data_write.increment(bl.length());
+  return do_writev(device, device_offset, std::move(bl), metadata.block_size);
+}
+
+/// Device id recorded in the cached metadata.
+device_id_t ZBDSegmentManager::get_device_id() const
+{
+  return metadata.device_id;
+}
+
+/// Mutable access to the secondary device set held in the cached metadata.
+secondary_device_set_t& ZBDSegmentManager::get_secondary_devices()
+{
+  return metadata.secondary_devices;
+}
+
+/// Magic value recorded in the cached metadata.
+magic_t ZBDSegmentManager::get_magic() const
+{
+  return metadata.magic;
+}
+
+/// Writable size of this segment, as reported by the owning manager's
+/// get_segment_size().
+segment_off_t ZBDSegment::get_write_capacity() const
+{
+  const segment_off_t capacity = manager.get_segment_size();
+  return capacity;
+}
+
+/// Close the device if one is open; otherwise complete immediately.
+SegmentManager::close_ertr::future<> ZBDSegmentManager::close()
+{
+  if (!device) {
+    // Nothing was opened, so there is nothing to close.
+    return seastar::now();
+  }
+  return device.close();
+}
+
+/// Close this segment by delegating to the manager's segment_close() with
+/// this segment's id and current write pointer.
+Segment::close_ertr::future<> ZBDSegment::close()
+{
+  auto &owner = manager;
+  return owner.segment_close(id, write_pointer);
+}
+
+/// Append `bl` to the segment at `offset`.
+///
+/// @return invarg if `offset` differs from the current write pointer or is
+///         not a multiple of the block size; enospc if the write would
+///         exceed the segment capacity; otherwise the result of the
+///         manager's segment_write()
+///
+/// The write pointer is advanced before the write is issued.
+Segment::write_ertr::future<> ZBDSegment::write(
+  segment_off_t offset, ceph::bufferlist bl)
+{
+  LOG_PREFIX(ZBDSegment::write);
+  const auto &md = manager.metadata;
+
+  if (offset != write_pointer || offset % md.block_size != 0) {
+    ERROR("Segment offset and zone write pointer mismatch. "
+      "segment {} segment-offset {} write pointer {}",
+      id, offset, write_pointer);
+    return crimson::ct_error::invarg::make();
+  }
+
+  if (offset + bl.length() > md.segment_capacity) {
+    return crimson::ct_error::enospc::make();
+  }
+
+  write_pointer = offset + bl.length();
+  const auto target = paddr_t::make_seg_paddr(id, offset);
+  return manager.segment_write(target, bl);
+}
+
+/// Write `padding_bytes` zero bytes at the current write pointer, in chunks
+/// of at most MAX_PADDING_SIZE bytes each.
+Segment::write_ertr::future<> ZBDSegment::write_padding_bytes(
+  size_t padding_bytes)
+{
+  LOG_PREFIX(ZBDSegment::write_padding_bytes);
+  DEBUG("Writing {} padding bytes to segment {} at wp {}",
+    padding_bytes, id, write_pointer);
+
+  return crimson::repeat([remaining = padding_bytes, this] () mutable {
+    // Cap each chunk so the zero buffer stays bounded.
+    const size_t chunk =
+      (remaining >= MAX_PADDING_SIZE) ? MAX_PADDING_SIZE : remaining;
+    remaining -= chunk;
+
+    bufferptr zeros(ceph::buffer::create_page_aligned(chunk));
+    zeros.zero();
+    bufferlist padd_bl;
+    padd_bl.append(zeros);
+
+    // Stop once this chunk has consumed the last of the padding.
+    return write(write_pointer, padd_bl).safe_then(
+      [finished = (remaining == 0)] {
+        return write_ertr::make_ready_future<seastar::stop_iteration>(
+          finished ? seastar::stop_iteration::yes
+                   : seastar::stop_iteration::no);
+      });
+  });
+}
+
+/// Advance the write pointer to `offset` by writing zero padding.
+///
+/// @return invarg if `offset` lies before the current write pointer; a
+///         ready future if it equals the write pointer; otherwise the
+///         result of writing the padding. The gap must be a multiple of the
+///         block size (asserted).
+Segment::write_ertr::future<> ZBDSegment::advance_wp(
+  segment_off_t offset)
+{
+  LOG_PREFIX(ZBDSegment::advance_wp);
+
+  DEBUG("Advancing write pointer from {} to {}", write_pointer, offset);
+  if (offset < write_pointer) {
+    return crimson::ct_error::invarg::make();
+  }
+
+  const size_t gap = offset - write_pointer;
+  if (gap != 0) {
+    assert(gap % manager.metadata.block_size == 0);
+    return write_padding_bytes(gap);
+  }
+  return write_ertr::now();
+}
+
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <linux/blkzoned.h>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/file.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/reactor.hh>
+
+#include "crimson/common/layout.h"
+
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "include/uuid.h"
+
+namespace crimson::os::seastore::segment_manager::zbd {
+
+ /// Per-shard slice of the device, stored inside zbd_sm_metadata_t.
+ struct zbd_shard_info_t {
+   // Usable bytes owned by the shard.
+   size_t size = 0;
+   // Number of segments owned by the shard.
+   size_t segments = 0;
+   // Byte offset on the device of the shard's first segment.
+   size_t first_segment_offset = 0;
+
+   // Encoding: version 1, fields in declaration order.
+   DENC(zbd_shard_info_t, v, p) {
+     DENC_START(1, 1, p);
+     denc(v.size, p);
+     denc(v.segments, p);
+     denc(v.first_segment_offset, p);
+     DENC_FINISH(p);
+   }
+ };
+
+ /// Segment manager metadata for a zoned block device, persisted in the
+ /// first block of the device.
+ struct zbd_sm_metadata_t {
+   // Number of shards the layout was created for.
+   unsigned int shard_num = 0;
+   // Distance in bytes between the starts of consecutive segments.
+   size_t segment_size = 0;
+   // Writable bytes per segment.
+   size_t segment_capacity = 0;
+   size_t zones_per_segment = 0;
+   size_t zone_capacity = 0;
+   size_t block_size = 0;
+   size_t zone_size = 0;
+
+   // One entry per shard.
+   std::vector<zbd_shard_info_t> shard_infos;
+
+   seastore_meta_t meta;
+
+   bool major_dev = false;
+   magic_t magic = 0;
+   device_type_t dtype = device_type_t::NONE;
+   device_id_t device_id = 0;
+   secondary_device_set_t secondary_devices;
+
+   // NOTE(review): major_dev itself is not part of the encoding, yet it
+   // gates secondary_devices. A decoder starts from the default (false), so
+   // it never reads secondary_devices. Confirm whether major_dev should be
+   // encoded; the layout is left untouched here because changing it would
+   // alter the on-disk format.
+   DENC(zbd_sm_metadata_t, v, p) {
+     DENC_START(1, 1, p);
+     denc(v.shard_num, p);
+     denc(v.segment_size, p);
+     denc(v.segment_capacity, p);
+     denc(v.zones_per_segment, p);
+     denc(v.zone_capacity, p);
+     denc(v.block_size, p);
+     denc(v.zone_size, p);
+     denc(v.shard_infos, p);
+     denc(v.meta, p);
+     denc(v.magic, p);
+     denc(v.dtype, p);
+     denc(v.device_id, p);
+     if (v.major_dev) {
+       denc(v.secondary_devices, p);
+     }
+     DENC_FINISH(p);
+   }
+
+   /// Assert that the metadata is usable by the running process: the shard
+   /// count matches seastar::smp::count, there is one shard entry per
+   /// shard, and all sizes are non-zero and within the addressable limits.
+   void validate() const {
+     ceph_assert_always(shard_num == seastar::smp::count);
+     // Guard the indexing below: decoded metadata could carry fewer
+     // entries than shard_num claims.
+     ceph_assert_always(shard_infos.size() == shard_num);
+     for (unsigned int i = 0; i < seastar::smp::count; i++) {
+       ceph_assert_always(shard_infos[i].size > 0);
+       ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX);
+       ceph_assert_always(shard_infos[i].segments > 0);
+       ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX);
+     }
+     ceph_assert_always(segment_capacity > 0);
+     ceph_assert_always(segment_capacity <= SEGMENT_OFF_MAX);
+   }
+ };
+
+ using write_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
+ using read_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
+
+ /// Zone management operations a caller can request; each one is
+ /// translated into the corresponding block-layer zone ioctl by the
+ /// implementation.
+ enum class zone_op {
+   OPEN,
+   FINISH,
+   CLOSE,
+   RESET,
+ };
+
+ class ZBDSegmentManager;
+
+ /// Segment implementation on top of ZBDSegmentManager. Tracks its own
+ /// write pointer, which starts at 0 for a newly constructed segment.
+ class ZBDSegment final : public Segment {
+ public:
+   ZBDSegment(ZBDSegmentManager &man, segment_id_t i)
+     : manager(man), id(i) {}
+
+   ~ZBDSegment() = default;
+
+   segment_id_t get_segment_id() const final { return id; }
+   segment_off_t get_write_capacity() const final;
+   segment_off_t get_write_ptr() const final { return write_pointer; }
+   close_ertr::future<> close() final;
+   write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+   write_ertr::future<> advance_wp(segment_off_t offset) final;
+
+ private:
+   friend class ZBDSegmentManager;
+
+   /// Write the given number of zero bytes at the current write pointer.
+   write_ertr::future<> write_padding_bytes(size_t padding_bytes);
+
+   // Manager that performs the device I/O for this segment.
+   ZBDSegmentManager &manager;
+   const segment_id_t id;
+   // Offset within the segment at which the next write must start.
+   segment_off_t write_pointer = 0;
+ };
+
+ /// SegmentManager for zoned block devices. The object holds a
+ /// seastar::sharded set of per-shard instances; each instance keeps its own
+ /// device handle and a cached copy of the on-disk metadata.
+ class ZBDSegmentManager final : public SegmentManager{
+ // interfaces used by Device
+ public:
+   /// Create the per-shard instances, each constructed with device_path.
+   seastar::future<> start() {
+     return shard_devices.start(device_path);
+   }
+
+   /// Stop and destroy the per-shard instances.
+   seastar::future<> stop() {
+     return shard_devices.stop();
+   }
+
+   /// Instance belonging to the calling shard.
+   Device& get_sharded_device() final {
+     return shard_devices.local();
+   }
+
+   mount_ret mount() final;
+   mkfs_ret mkfs(device_config_t meta) final;
+
+   ZBDSegmentManager(const std::string &path) : device_path(path) {}
+
+   ~ZBDSegmentManager() final = default;
+
+ //interfaces used by each shard device
+ public:
+   open_ertr::future<SegmentRef> open(segment_id_t id) final;
+   close_ertr::future<> close() final;
+
+   release_ertr::future<> release(segment_id_t id) final;
+
+   read_ertr::future<> read(
+     paddr_t addr,
+     size_t len,
+     ceph::bufferptr &out) final;
+
+   device_type_t get_device_type() const final {
+     return device_type_t::ZBD;
+   }
+
+   /// Usable bytes owned by this shard.
+   size_t get_available_size() const final {
+     return shard_info.size;
+   }
+
+   extent_len_t get_block_size() const final {
+     return metadata.block_size;
+   }
+
+   /// Writable bytes per segment (the segment capacity, not the distance
+   /// between segment starts).
+   segment_off_t get_segment_size() const final {
+     return metadata.segment_capacity;
+   }
+
+   const seastore_meta_t &get_meta() const {
+     return metadata.meta;
+   }
+
+   device_id_t get_device_id() const final;
+
+   secondary_device_set_t& get_secondary_devices() final;
+
+   magic_t get_magic() const final;
+
+   Segment::write_ertr::future<> segment_write(
+     paddr_t addr,
+     ceph::bufferlist bl,
+     bool ignore_check=false);
+
+ private:
+   friend class ZBDSegment;
+   std::string device_path;
+   // This shard's slice of the device, taken from metadata.shard_infos.
+   zbd_shard_info_t shard_info;
+   // Cached copy of the on-disk metadata.
+   zbd_sm_metadata_t metadata;
+   seastar::file device;
+   // Value-initialized so the member never holds an indeterminate value.
+   uint32_t nr_zones = 0;
+   // Counter pair: number of operations and bytes they covered.
+   struct effort_t {
+     uint64_t num = 0;
+     uint64_t bytes = 0;
+
+     void increment(uint64_t read_bytes) {
+       ++num;
+       bytes += read_bytes;
+     }
+   };
+
+   struct zbd_sm_stats {
+     effort_t data_read = {};
+     effort_t data_write = {};
+     effort_t metadata_write = {};
+     uint64_t opened_segments = 0;
+     uint64_t closed_segments = 0;
+     uint64_t closed_segments_unused_bytes = 0;
+     uint64_t released_segments = 0;
+
+     void reset() {
+       *this = zbd_sm_stats{};
+     }
+   } stats;
+
+   void register_metrics();
+   seastar::metrics::metric_group metrics;
+
+   Segment::close_ertr::future<> segment_close(
+     segment_id_t id, segment_off_t write_pointer);
+
+   /// Translate a segment address into a byte offset on the device:
+   /// first_segment_offset + segment index * segment_size + offset
+   /// inside the segment.
+   uint64_t get_offset(paddr_t addr) {
+     auto& seg_addr = addr.as_seg_paddr();
+     return (shard_info.first_segment_offset +
+       (seg_addr.get_segment_id().device_segment_id() *
+        metadata.segment_size)) + seg_addr.get_segment_off();
+   }
+ private:
+   // shard 0 mkfs
+   mkfs_ret primary_mkfs(device_config_t meta);
+   // all shards mkfs
+   mkfs_ret shard_mkfs();
+
+   mount_ret shard_mount();
+
+   seastar::sharded<ZBDSegmentManager> shard_devices;
+ };
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(
+ crimson::os::seastore::segment_manager::zbd::zbd_shard_info_t
+)
+WRITE_CLASS_DENC_BOUNDED(
+ crimson::os::seastore::segment_manager::zbd::zbd_sm_metadata_t
+)
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <sys/mman.h>
-#include <string.h>
-#include <linux/blkzoned.h>
-
-#include <fmt/format.h>
-#include "crimson/os/seastore/segment_manager/zns.h"
-#include "crimson/common/config_proxy.h"
-#include "crimson/os/seastore/logging.h"
-#include "include/buffer.h"
-
-SET_SUBSYS(seastore_device);
-
-#define SECT_SHIFT 9
-#define RESERVED_ZONES 1
-// limit the max padding buf size to 1MB
-#define MAX_PADDING_SIZE 1048576
-
-using z_op = crimson::os::seastore::segment_manager::zns::zone_op;
-template <> struct fmt::formatter<z_op>: fmt::formatter<std::string_view> {
- template <typename FormatContext>
- auto format(z_op s, FormatContext& ctx) {
- std::string_view name = "Unknown";
- switch (s) {
- using enum z_op;
- case OPEN:
- name = "BLKOPENZONE";
- break;
- case FINISH:
- name = "BLKFINISHZONE";
- break;
- case CLOSE:
- name = "BLKCLOSEZONE";
- break;
- case RESET:
- name = "BLKRESETZONE";
- break;
- }
- return formatter<string_view>::format(name, ctx);
- }
-};
-
-namespace crimson::os::seastore::segment_manager::zns {
-
-using open_device_ret = ZNSSegmentManager::access_ertr::future<
- std::pair<seastar::file, seastar::stat_data>>;
-static open_device_ret open_device(
- const std::string &path,
- seastar::open_flags mode)
-{
- LOG_PREFIX(ZNSSegmentManager::open_device);
- return seastar::file_stat(
- path, seastar::follow_symlink::yes
- ).then([FNAME, mode, &path](auto stat) mutable {
- return seastar::open_file_dma(path, mode).then([=](auto file) {
- DEBUG("open of device {} successful, size {}",
- path,
- stat.size);
- return std::make_pair(file, stat);
- });
- }).handle_exception(
- [FNAME](auto e) -> open_device_ret {
- ERROR("got error {}",
- e);
- return crimson::ct_error::input_output_error::make();
- }
- );
-}
-
-static zns_sm_metadata_t make_metadata(
- uint64_t total_size,
- seastore_meta_t meta,
- const seastar::stat_data &data,
- size_t zone_size_sectors,
- size_t zone_capacity_sectors,
- size_t num_zones)
-{
- LOG_PREFIX(ZNSSegmentManager::make_metadata);
-
- // TODO: support Option::size_t seastore_segment_size
- // to allow zones_per_segment > 1 with striping.
- size_t zone_size = zone_size_sectors << SECT_SHIFT;
- size_t zone_capacity = zone_capacity_sectors << SECT_SHIFT;
- size_t segment_size = zone_size;
- size_t zones_per_segment = segment_size / zone_size;
- size_t segments = (num_zones - RESERVED_ZONES) / zones_per_segment;
- size_t per_shard_segments = segments / seastar::smp::count;
- size_t available_size = zone_capacity * segments;
- size_t per_shard_available_size = zone_capacity * per_shard_segments;
- std::vector<zns_shard_info_t> shard_infos(seastar::smp::count);
- for (unsigned int i = 0; i < seastar::smp::count; i++) {
- shard_infos[i].size = per_shard_available_size;
- shard_infos[i].segments = per_shard_segments;
- shard_infos[i].first_segment_offset = zone_size * RESERVED_ZONES
- + i * segment_size* per_shard_segments;
- }
-
- assert(total_size == num_zones * zone_size);
-
- WARN("Ignoring configuration values for device and segment size");
- INFO(
- "device size {}, available_size {}, block_size {}, allocated_size {},"
- " total zones {}, zone_size {}, zone_capacity {},"
- " total segments {}, zones per segment {}, segment size {}",
- total_size,
- available_size,
- data.block_size,
- data.allocated_size,
- num_zones,
- zone_size,
- zone_capacity,
- segments,
- zones_per_segment,
- zone_capacity * zones_per_segment);
-
- zns_sm_metadata_t ret = zns_sm_metadata_t{
- seastar::smp::count,
- segment_size,
- zone_capacity * zones_per_segment,
- zones_per_segment,
- zone_capacity,
- data.block_size,
- zone_size,
- shard_infos,
- meta};
- ret.validate();
- return ret;
-}
-
-struct ZoneReport {
- struct blk_zone_report *hdr;
- ZoneReport(int nr_zones)
- : hdr((blk_zone_report *)malloc(
- sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone))){;}
- ~ZoneReport(){
- free(hdr);
- }
- ZoneReport(const ZoneReport &) = delete;
- ZoneReport(ZoneReport &&rhs) : hdr(rhs.hdr) {
- rhs.hdr = nullptr;
- }
-};
-
-static seastar::future<size_t> get_blk_dev_size(
- seastar::file &device)
-{
- return seastar::do_with(
- (uint64_t)0,
- [&](auto& size_sects) {
- return device.ioctl(
- BLKGETSIZE,
- (void *)&size_sects
- ).then([&](int ret) {
- ceph_assert(size_sects);
- size_t size = size_sects << SECT_SHIFT;
- return seastar::make_ready_future<size_t>(size);
- });
- });
-}
-
-// zone_size should be in 512B sectors
-static seastar::future<> reset_device(
- seastar::file &device,
- uint64_t zone_size_sects,
- uint64_t nr_zones)
-{
- return seastar::do_with(
- blk_zone_range{},
- [&, nr_zones, zone_size_sects](auto &range) {
- range.sector = 0;
- range.nr_sectors = zone_size_sects * nr_zones;
- return device.ioctl(
- BLKRESETZONE,
- &range
- ).then([&](int ret){
- return seastar::now();
- });
- }
- );
-}
-
-static seastar::future<size_t> get_zone_capacity(
- seastar::file &device,
- uint32_t nr_zones)
-{
- return seastar::do_with(
- ZoneReport(nr_zones),
- [&](auto &zr) {
- zr.hdr->sector = 0;
- zr.hdr->nr_zones = nr_zones;
- return device.ioctl(
- BLKREPORTZONE,
- zr.hdr
- ).then([&](int ret) {
- return seastar::make_ready_future<size_t>(zr.hdr->zones[0].capacity);
- });
- }
- );
-}
-
-static write_ertr::future<> do_write(
- seastar::file &device,
- uint64_t offset,
- bufferptr &bptr)
-{
- LOG_PREFIX(ZNSSegmentManager::do_write);
- DEBUG("offset {} len {}",
- offset,
- bptr.length());
- return device.dma_write(
- offset,
- bptr.c_str(),
- bptr.length()
- ).handle_exception(
- [FNAME](auto e) -> write_ertr::future<size_t> {
- ERROR("dma_write got error {}",
- e);
- return crimson::ct_error::input_output_error::make();
- }
- ).then([length = bptr.length()](auto result) -> write_ertr::future<> {
- if (result != length) {
- return crimson::ct_error::input_output_error::make();
- }
- return write_ertr::now();
- });
-}
-
-static write_ertr::future<> do_writev(
- seastar::file &device,
- uint64_t offset,
- bufferlist&& bl,
- size_t block_size)
-{
- LOG_PREFIX(ZNSSegmentManager::do_writev);
- DEBUG("offset {} len {}",
- offset,
- bl.length());
- // writev requires each buffer to be aligned to the disks' block
- // size, we need to rebuild here
- bl.rebuild_aligned(block_size);
-
- std::vector<iovec> iov;
- bl.prepare_iov(&iov);
- return device.dma_write(
- offset,
- std::move(iov)
- ).handle_exception(
- [FNAME](auto e) -> write_ertr::future<size_t> {
- ERROR("dma_write got error {}",
- e);
- return crimson::ct_error::input_output_error::make();
- }
- ).then([bl=std::move(bl)/* hold the buf until the end of io */](size_t written)
- -> write_ertr::future<> {
- if (written != bl.length()) {
- return crimson::ct_error::input_output_error::make();
- }
- return write_ertr::now();
- });
-}
-
-static ZNSSegmentManager::access_ertr::future<>
-write_metadata(seastar::file &device, zns_sm_metadata_t sb)
-{
- assert(ceph::encoded_sizeof_bounded<zns_sm_metadata_t>() <
- sb.block_size);
- return seastar::do_with(
- bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
- [=, &device](auto &bp) {
- LOG_PREFIX(ZNSSegmentManager::write_metadata);
- DEBUG("block_size {}", sb.block_size);
- bufferlist bl;
- encode(sb, bl);
- auto iter = bl.begin();
- assert(bl.length() < sb.block_size);
- DEBUG("buffer length {}", bl.length());
- iter.copy(bl.length(), bp.c_str());
- DEBUG("doing writeout");
- return do_write(device, 0, bp);
- });
-}
-
-static read_ertr::future<> do_read(
- seastar::file &device,
- uint64_t offset,
- size_t len,
- bufferptr &bptr)
-{
- LOG_PREFIX(ZNSSegmentManager::do_read);
- assert(len <= bptr.length());
- DEBUG("offset {} len {}",
- offset,
- len);
- return device.dma_read(
- offset,
- bptr.c_str(),
- len
- ).handle_exception(
- [FNAME](auto e) -> read_ertr::future<size_t> {
- ERROR("dma_read got error {}",
- e);
- return crimson::ct_error::input_output_error::make();
- }
- ).then([len](auto result) -> read_ertr::future<> {
- if (result != len) {
- return crimson::ct_error::input_output_error::make();
- }
- return read_ertr::now();
- });
-}
-
-static
-ZNSSegmentManager::access_ertr::future<zns_sm_metadata_t>
-read_metadata(seastar::file &device, seastar::stat_data sd)
-{
- assert(ceph::encoded_sizeof_bounded<zns_sm_metadata_t>() <
- sd.block_size);
- return seastar::do_with(
- bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
- [=, &device](auto &bp) {
- return do_read(
- device,
- 0,
- bp.length(),
- bp
- ).safe_then([=, &bp] {
- bufferlist bl;
- bl.push_back(bp);
- zns_sm_metadata_t ret;
- auto bliter = bl.cbegin();
- decode(ret, bliter);
- ret.validate();
- return ZNSSegmentManager::access_ertr::future<zns_sm_metadata_t>(
- ZNSSegmentManager::access_ertr::ready_future_marker{},
- ret);
- });
- });
-}
-
-ZNSSegmentManager::mount_ret ZNSSegmentManager::mount()
-{
- return shard_devices.invoke_on_all([](auto &local_device) {
- return local_device.shard_mount(
- ).handle_error(
- crimson::ct_error::assert_all{
- "Invalid error in ZNSSegmentManager::mount"
- });
- });
-}
-
-ZNSSegmentManager::mount_ret ZNSSegmentManager::shard_mount()
-{
- return open_device(
- device_path, seastar::open_flags::rw
- ).safe_then([=, this](auto p) {
- device = std::move(p.first);
- auto sd = p.second;
- return read_metadata(device, sd);
- }).safe_then([=, this](auto meta){
- shard_info = meta.shard_infos[seastar::this_shard_id()];
- metadata = meta;
- return mount_ertr::now();
- });
-}
-
-ZNSSegmentManager::mkfs_ret ZNSSegmentManager::mkfs(
- device_config_t config)
-{
- return shard_devices.local().primary_mkfs(config
- ).safe_then([this] {
- return shard_devices.invoke_on_all([](auto &local_device) {
- return local_device.shard_mkfs(
- ).handle_error(
- crimson::ct_error::assert_all{
- "Invalid error in ZNSSegmentManager::mkfs"
- });
- });
- });
-}
-
-ZNSSegmentManager::mkfs_ret ZNSSegmentManager::primary_mkfs(
- device_config_t config)
-{
- LOG_PREFIX(ZNSSegmentManager::primary_mkfs);
- INFO("starting, device_path {}", device_path);
- return seastar::do_with(
- seastar::file{},
- seastar::stat_data{},
- zns_sm_metadata_t{},
- size_t(),
- size_t(),
- size_t(),
- [=, this](auto &device, auto &stat, auto &sb, auto &zone_size_sects, auto &nr_zones, auto &size) {
- return open_device(
- device_path,
- seastar::open_flags::rw
- ).safe_then([=, this, &device, &stat, &sb, &zone_size_sects, &nr_zones, &size](auto p) {
- device = p.first;
- stat = p.second;
- return device.ioctl(
- BLKGETNRZONES,
- (void *)&nr_zones
- ).then([&](int ret) {
- if (nr_zones == 0) {
- return seastar::make_exception_future<int>(
- std::system_error(std::make_error_code(std::errc::io_error)));
- }
- return device.ioctl(BLKGETZONESZ, (void *)&zone_size_sects);
- }).then([&](int ret) {
- ceph_assert(zone_size_sects);
- return reset_device(device, zone_size_sects, nr_zones);
- }).then([&] {
- return get_blk_dev_size(device);
- }).then([&](auto devsize) {
- size = devsize;
- return get_zone_capacity(device, nr_zones);
- }).then([&, FNAME, config](auto zone_capacity_sects) {
- ceph_assert(zone_capacity_sects);
- DEBUG("zone_size in sectors {}, zone_capacity in sectors {}",
- zone_size_sects, zone_capacity_sects);
- sb = make_metadata(
- size,
- config.meta,
- stat,
- zone_size_sects,
- zone_capacity_sects,
- nr_zones);
- metadata = sb;
- stats.metadata_write.increment(
- ceph::encoded_sizeof_bounded<zns_sm_metadata_t>());
- DEBUG("Wrote to stats.");
- return write_metadata(device, sb);
- }).finally([&, FNAME] {
- DEBUG("Closing device.");
- return device.close();
- }).safe_then([FNAME] {
- DEBUG("Returning from mkfs.");
- return mkfs_ertr::now();
- });
- });
- });
-}
-
-ZNSSegmentManager::mkfs_ret ZNSSegmentManager::shard_mkfs()
-{
- LOG_PREFIX(ZNSSegmentManager::shard_mkfs);
- INFO("starting, device_path {}", device_path);
- return open_device(
- device_path, seastar::open_flags::rw
- ).safe_then([=, this](auto p) {
- device = std::move(p.first);
- auto sd = p.second;
- return read_metadata(device, sd);
- }).safe_then([=, this](auto meta){
- shard_info = meta.shard_infos[seastar::this_shard_id()];
- metadata = meta;
- return device.close();
- }).safe_then([FNAME] {
- DEBUG("Returning from shard_mkfs.");
- return mkfs_ertr::now();
- });
-}
-
-// Return range of sectors to operate on.
-struct blk_zone_range make_range(
- segment_id_t id,
- size_t segment_size,
- size_t first_segment_offset)
-{
- return blk_zone_range{
- (id.device_segment_id() * (segment_size >> SECT_SHIFT)
- + (first_segment_offset >> SECT_SHIFT)),
- (segment_size >> SECT_SHIFT)
- };
-}
-
-using blk_zone_op_ertr = crimson::errorator<
- crimson::ct_error::input_output_error>;
-using blk_zone_op_ret = blk_zone_op_ertr::future<>;
-blk_zone_op_ret blk_zone_op(seastar::file &device,
- blk_zone_range &range,
- zone_op op) {
- LOG_PREFIX(ZNSSegmentManager::blk_zone_op);
-
- unsigned long ioctl_op = 0;
- switch (op) {
- using enum zone_op;
- case OPEN:
- ioctl_op = BLKOPENZONE;
- break;
- case FINISH:
- ioctl_op = BLKFINISHZONE;
- break;
- case RESET:
- ioctl_op = BLKRESETZONE;
- break;
- case CLOSE:
- ioctl_op = BLKCLOSEZONE;
- break;
- default:
- ERROR("Invalid zone operation {}", op);
- ceph_assert(ioctl_op);
- }
-
- return device.ioctl(
- ioctl_op,
- &range
- ).then_wrapped([=](auto f) -> blk_zone_op_ret {
- if (f.failed()) {
- ERROR("{} ioctl failed", op);
- return crimson::ct_error::input_output_error::make();
- } else {
- int ret = f.get();
- if (ret == 0) {
- return seastar::now();
- } else {
- ERROR("{} ioctl failed with return code {}", op, ret);
- return crimson::ct_error::input_output_error::make();
- }
- }
- });
-}
-
-ZNSSegmentManager::open_ertr::future<SegmentRef> ZNSSegmentManager::open(
- segment_id_t id)
-{
- LOG_PREFIX(ZNSSegmentManager::open);
- return seastar::do_with(
- blk_zone_range{},
- [=, this](auto &range) {
- range = make_range(
- id,
- metadata.segment_size,
- shard_info.first_segment_offset);
- return blk_zone_op(
- device,
- range,
- zone_op::OPEN
- );
- }
- ).safe_then([=, this] {
- DEBUG("segment {}, open successful", id);
- return open_ertr::future<SegmentRef>(
- open_ertr::ready_future_marker{},
- SegmentRef(new ZNSSegment(*this, id))
- );
- });
-}
-
-ZNSSegmentManager::release_ertr::future<> ZNSSegmentManager::release(
- segment_id_t id)
-{
- LOG_PREFIX(ZNSSegmentManager::release);
- DEBUG("Resetting zone/segment {}", id);
- return seastar::do_with(
- blk_zone_range{},
- [=, this](auto &range) {
- range = make_range(
- id,
- metadata.segment_size,
- shard_info.first_segment_offset);
- return blk_zone_op(
- device,
- range,
- zone_op::RESET
- );
- }
- ).safe_then([=] {
- DEBUG("segment release successful");
- return release_ertr::now();
- });
-}
-
-SegmentManager::read_ertr::future<> ZNSSegmentManager::read(
- paddr_t addr,
- size_t len,
- ceph::bufferptr &out)
-{
- LOG_PREFIX(ZNSSegmentManager::read);
- auto& seg_addr = addr.as_seg_paddr();
- if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) {
- ERROR("invalid segment {}",
- seg_addr.get_segment_id().device_segment_id());
- return crimson::ct_error::invarg::make();
- }
-
- if (seg_addr.get_segment_off() + len > metadata.segment_capacity) {
- ERROR("invalid read offset {}, len {}",
- addr,
- len);
- return crimson::ct_error::invarg::make();
- }
- return do_read(
- device,
- get_offset(addr),
- len,
- out);
-}
-
-Segment::close_ertr::future<> ZNSSegmentManager::segment_close(
- segment_id_t id, segment_off_t write_pointer)
-{
- LOG_PREFIX(ZNSSegmentManager::segment_close);
- return seastar::do_with(
- blk_zone_range{},
- [=, this](auto &range) {
- range = make_range(
- id,
- metadata.segment_size,
- shard_info.first_segment_offset);
- return blk_zone_op(
- device,
- range,
- zone_op::FINISH
- );
- }
- ).safe_then([=] {
- DEBUG("zone finish successful");
- return Segment::close_ertr::now();
- });
-}
-
-Segment::write_ertr::future<> ZNSSegmentManager::segment_write(
- paddr_t addr,
- ceph::bufferlist bl,
- bool ignore_check)
-{
- LOG_PREFIX(ZNSSegmentManager::segment_write);
- assert(addr.get_device_id() == get_device_id());
- assert((bl.length() % metadata.block_size) == 0);
- auto& seg_addr = addr.as_seg_paddr();
- DEBUG("write to segment {} at offset {}, physical offset {}, len {}",
- seg_addr.get_segment_id(),
- seg_addr.get_segment_off(),
- get_offset(addr),
- bl.length());
- stats.data_write.increment(bl.length());
- return do_writev(
- device,
- get_offset(addr),
- std::move(bl),
- metadata.block_size);
-}
-
-device_id_t ZNSSegmentManager::get_device_id() const
-{
- return metadata.device_id;
-};
-
-secondary_device_set_t& ZNSSegmentManager::get_secondary_devices()
-{
- return metadata.secondary_devices;
-};
-
-magic_t ZNSSegmentManager::get_magic() const
-{
- return metadata.magic;
-};
-
-segment_off_t ZNSSegment::get_write_capacity() const
-{
- return manager.get_segment_size();
-}
-
-SegmentManager::close_ertr::future<> ZNSSegmentManager::close()
-{
- if (device) {
- return device.close();
- }
- return seastar::now();
-}
-
-Segment::close_ertr::future<> ZNSSegment::close()
-{
- return manager.segment_close(id, write_pointer);
-}
-
-Segment::write_ertr::future<> ZNSSegment::write(
- segment_off_t offset, ceph::bufferlist bl)
-{
- LOG_PREFIX(ZNSSegment::write);
- if (offset != write_pointer || offset % manager.metadata.block_size != 0) {
- ERROR("Segment offset and zone write pointer mismatch. "
- "segment {} segment-offset {} write pointer {}",
- id, offset, write_pointer);
- return crimson::ct_error::invarg::make();
- }
- if (offset + bl.length() > manager.metadata.segment_capacity) {
- return crimson::ct_error::enospc::make();
- }
-
- write_pointer = offset + bl.length();
- return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl);
-}
-
-Segment::write_ertr::future<> ZNSSegment::write_padding_bytes(
- size_t padding_bytes)
-{
- LOG_PREFIX(ZNSSegment::write_padding_bytes);
- DEBUG("Writing {} padding bytes to segment {} at wp {}",
- padding_bytes, id, write_pointer);
-
- return crimson::repeat([FNAME, padding_bytes, this] () mutable {
- size_t bufsize = 0;
- if (padding_bytes >= MAX_PADDING_SIZE) {
- bufsize = MAX_PADDING_SIZE;
- } else {
- bufsize = padding_bytes;
- }
-
- padding_bytes -= bufsize;
- bufferptr bp(ceph::buffer::create_page_aligned(bufsize));
- bp.zero();
- bufferlist padd_bl;
- padd_bl.append(bp);
- return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() {
- if (padding_bytes == 0) {
- return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::yes);
- } else {
- return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::no);
- }
- });
- });
-}
-
-// Advance write pointer, to given offset.
-Segment::write_ertr::future<> ZNSSegment::advance_wp(
- segment_off_t offset)
-{
- LOG_PREFIX(ZNSSegment::advance_wp);
-
- DEBUG("Advancing write pointer from {} to {}", write_pointer, offset);
- if (offset < write_pointer) {
- return crimson::ct_error::invarg::make();
- }
-
- size_t padding_bytes = offset - write_pointer;
-
- if (padding_bytes == 0) {
- return write_ertr::now();
- }
-
- assert(padding_bytes % manager.metadata.block_size == 0);
-
- return write_padding_bytes(padding_bytes);
-}
-
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#pragma once
-
-#include <linux/blkzoned.h>
-
-#include <boost/intrusive_ptr.hpp>
-#include <boost/smart_ptr/intrusive_ref_counter.hpp>
-
-#include <seastar/core/file.hh>
-#include <seastar/core/future.hh>
-#include <seastar/core/reactor.hh>
-
-#include "crimson/common/layout.h"
-
-#include "crimson/os/seastore/segment_manager.h"
-
-#include "include/uuid.h"
-
-namespace crimson::os::seastore::segment_manager::zns {
-
- struct zns_shard_info_t {
- size_t size = 0;
- size_t segments = 0;
- size_t first_segment_offset = 0;
-
- DENC(zns_shard_info_t, v, p) {
- DENC_START(1, 1, p);
- denc(v.size, p);
- denc(v.segments, p);
- denc(v.first_segment_offset, p);
- DENC_FINISH(p);
- }
- };
-
- struct zns_sm_metadata_t {
- unsigned int shard_num = 0;
- size_t segment_size = 0;
- size_t segment_capacity = 0;
- size_t zones_per_segment = 0;
- size_t zone_capacity = 0;
- size_t block_size = 0;
- size_t zone_size = 0;
-
- std::vector<zns_shard_info_t> shard_infos;
-
- seastore_meta_t meta;
-
- bool major_dev = false;
- magic_t magic = 0;
- device_type_t dtype = device_type_t::NONE;
- device_id_t device_id = 0;
- secondary_device_set_t secondary_devices;
-
- DENC(zns_sm_metadata_t, v, p) {
- DENC_START(1, 1, p);
- denc(v.shard_num, p);
- denc(v.segment_size, p);
- denc(v.segment_capacity, p);
- denc(v.zones_per_segment, p);
- denc(v.zone_capacity, p);
- denc(v.block_size, p);
- denc(v.zone_size, p);
- denc(v.shard_infos, p);
- denc(v.meta, p);
- denc(v.magic, p);
- denc(v.dtype, p);
- denc(v.device_id, p);
- if (v.major_dev) {
- denc(v.secondary_devices, p);
- }
- DENC_FINISH(p);
- }
-
- void validate() const {
- ceph_assert_always(shard_num == seastar::smp::count);
- for (unsigned int i = 0; i < seastar::smp::count; i++) {
- ceph_assert_always(shard_infos[i].size > 0);
- ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX);
- ceph_assert_always(shard_infos[i].segments > 0);
- ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX);
- }
- ceph_assert_always(segment_capacity > 0);
- ceph_assert_always(segment_capacity <= SEGMENT_OFF_MAX);
- }
- };
-
- using write_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
- using read_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
-
- enum class zone_op {
- OPEN,
- FINISH,
- CLOSE,
- RESET,
- };
-
- class ZNSSegmentManager;
-
- class ZNSSegment final : public Segment {
- public:
- ZNSSegment(ZNSSegmentManager &man, segment_id_t i) : manager(man), id(i){};
-
- segment_id_t get_segment_id() const final { return id; }
- segment_off_t get_write_capacity() const final;
- segment_off_t get_write_ptr() const final { return write_pointer; }
- close_ertr::future<> close() final;
- write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
- write_ertr::future<> advance_wp(segment_off_t offset) final;
-
- ~ZNSSegment() {}
- private:
- friend class ZNSSegmentManager;
- ZNSSegmentManager &manager;
- const segment_id_t id;
- segment_off_t write_pointer = 0;
- write_ertr::future<> write_padding_bytes(size_t padding_bytes);
- };
-
- class ZNSSegmentManager final : public SegmentManager{
- // interfaces used by Device
- public:
- seastar::future<> start() {
- return shard_devices.start(device_path);
- }
-
- seastar::future<> stop() {
- return shard_devices.stop();
- }
-
- Device& get_sharded_device() final {
- return shard_devices.local();
- }
-
- mount_ret mount() final;
- mkfs_ret mkfs(device_config_t meta) final;
-
- ZNSSegmentManager(const std::string &path) : device_path(path) {}
-
- ~ZNSSegmentManager() final = default;
-
- //interfaces used by each shard device
- public:
- open_ertr::future<SegmentRef> open(segment_id_t id) final;
- close_ertr::future<> close() final;
-
- release_ertr::future<> release(segment_id_t id) final;
-
- read_ertr::future<> read(
- paddr_t addr,
- size_t len,
- ceph::bufferptr &out) final;
-
- device_type_t get_device_type() const final {
- return device_type_t::ZNS;
- }
-
- size_t get_available_size() const final {
- return shard_info.size;
- };
-
- extent_len_t get_block_size() const final {
- return metadata.block_size;
- };
-
- segment_off_t get_segment_size() const final {
- return metadata.segment_capacity;
- };
-
- const seastore_meta_t &get_meta() const {
- return metadata.meta;
- };
-
- device_id_t get_device_id() const final;
-
- secondary_device_set_t& get_secondary_devices() final;
-
- magic_t get_magic() const final;
-
- Segment::write_ertr::future<> segment_write(
- paddr_t addr,
- ceph::bufferlist bl,
- bool ignore_check=false);
-
- private:
- friend class ZNSSegment;
- std::string device_path;
- zns_shard_info_t shard_info;
- zns_sm_metadata_t metadata;
- seastar::file device;
- uint32_t nr_zones;
- struct effort_t {
- uint64_t num = 0;
- uint64_t bytes = 0;
-
- void increment(uint64_t read_bytes) {
- ++num;
- bytes += read_bytes;
- }
- };
-
- struct zns_sm_stats {
- effort_t data_read = {};
- effort_t data_write = {};
- effort_t metadata_write = {};
- uint64_t opened_segments = 0;
- uint64_t closed_segments = 0;
- uint64_t closed_segments_unused_bytes = 0;
- uint64_t released_segments = 0;
-
- void reset() {
- *this = zns_sm_stats{};
- }
- } stats;
-
- void register_metrics();
- seastar::metrics::metric_group metrics;
-
- Segment::close_ertr::future<> segment_close(
- segment_id_t id, segment_off_t write_pointer);
-
- uint64_t get_offset(paddr_t addr) {
- auto& seg_addr = addr.as_seg_paddr();
- return (shard_info.first_segment_offset +
- (seg_addr.get_segment_id().device_segment_id() *
- metadata.segment_size)) + seg_addr.get_segment_off();
- }
- private:
- // shard 0 mkfs
- mkfs_ret primary_mkfs(device_config_t meta);
- // all shards mkfs
- mkfs_ret shard_mkfs();
-
- mount_ret shard_mount();
-
- seastar::sharded<ZNSSegmentManager> shard_devices;
- };
-
-}
-
-WRITE_CLASS_DENC_BOUNDED(
- crimson::os::seastore::segment_manager::zns::zns_shard_info_t
-)
-WRITE_CLASS_DENC_BOUNDED(
- crimson::os::seastore::segment_manager::zns::zns_sm_metadata_t
-)