crimson/os/seastore: add ZNSSegmentManager 44197/head
author Joseph Sawaya <jsawaya@redhat.com>
Fri, 3 Dec 2021 17:28:21 +0000 (17:28 +0000)
committer Joseph Sawaya <jsawaya@redhat.com>
Tue, 14 Dec 2021 20:27:59 +0000 (20:27 +0000)
This commit adds the ZNSSegmentManager so that SeaStore can
interface with ZNS (zoned namespace) devices through the Linux
zoned block device interface (linux/blkzoned.h).

Signed-off-by: Joseph Sawaya <jsawaya@redhat.com>
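
The core of the change is device detection: SegmentManager::get_segment_manager()
issues the BLKGETNRZONES ioctl from <linux/blkzoned.h> against <device>/block and,
if the zone count is non-zero, instantiates a ZNSSegmentManager; otherwise it falls
back to the existing BlockSegmentManager. As a minimal, self-contained sketch of that
probe (plain POSIX rather than the Seastar wrappers used in the patch; the device
path is hypothetical):

#include <cstdio>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

int main() {
  const char *path = "/dev/nvme0n1";          // hypothetical ZNS namespace
  int fd = open(path, O_RDONLY);
  if (fd < 0) { std::perror("open"); return 1; }

  unsigned int nr_zones = 0, zone_sectors = 0;
  // A conventional block device reports zero zones (or fails the ioctl outright).
  if (ioctl(fd, BLKGETNRZONES, &nr_zones) == 0 && nr_zones > 0) {
    ioctl(fd, BLKGETZONESZ, &zone_sectors);   // zone size, in 512-byte sectors
    std::printf("zoned device: %u zones of %u sectors\n", nr_zones, zone_sectors);
  } else {
    std::printf("not zoned: BlockSegmentManager would be used\n");
  }
  close(fd);
  return 0;
}
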
CMakeLists.txt
src/crimson/os/futurized_store.cc
src/crimson/os/seastore/CMakeLists.txt
src/crimson/os/seastore/seastore.cc
src/crimson/os/seastore/seastore.h
src/crimson/os/seastore/segment_manager.cc [new file with mode: 0644]
src/crimson/os/seastore/segment_manager.h
src/crimson/os/seastore/segment_manager/zns.cc [new file with mode: 0644]
src/crimson/os/seastore/segment_manager/zns.h [new file with mode: 0644]
src/include/config-h.in.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5406669eda0817990017cca4a898aee4a296589f..8264a083d9c4ed50e30c3d51fc6b5d66826a167a 100644 (file)
@@ -193,6 +193,12 @@ if(WITH_ZFS)
   set(HAVE_LIBZFS ${ZFS_FOUND})
 endif()
 
+option(WITH_ZNS "enable zns support" OFF)
+if (WITH_ZNS)
+  # TODO: add detection, need kernel header >= 5.5
+  set(HAVE_ZNS ON)
+endif()
+
 option(WITH_BLUESTORE "Bluestore OSD backend" ON)
 if(WITH_BLUESTORE)
   if(LINUX)
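
(Note that WITH_ZNS defaults to OFF, so HAVE_ZNS is only defined, and the ZNS
sources added below are only compiled, when a build explicitly enables the option;
per the TODO, detection of the required kernel headers (5.5 or newer) is left for
later.)
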
diff --git a/src/crimson/os/futurized_store.cc b/src/crimson/os/futurized_store.cc
index 864510b631b943f93ece571a506888cac364dde2..e072c0d262bd70154ee5871c943450afef426252 100644 (file)
@@ -16,14 +16,20 @@ FuturizedStore::create(const std::string& type,
                        const ConfigValues& values)
 {
   if (type == "cyanstore") {
-    return seastar::make_ready_future<std::unique_ptr<FuturizedStore>>(std::make_unique<crimson::os::CyanStore>(data));
+    return seastar::make_ready_future<std::unique_ptr<FuturizedStore>>(
+      std::make_unique<crimson::os::CyanStore>(data));
   } else if (type == "seastore") {
-    return seastar::make_ready_future<std::unique_ptr<FuturizedStore>>(crimson::os::seastore::make_seastore(data, values));
+    return crimson::os::seastore::make_seastore(
+      data, values
+    ).then([] (auto seastore) {
+      return seastar::make_ready_future<std::unique_ptr<FuturizedStore>>(
+       seastore.release());
+    });
   } else {
 #ifdef WITH_BLUESTORE
     // use AlienStore as a fallback. It adapts e.g. BlueStore.
-    return seastar::make_ready_future<std::unique_ptr<FuturizedStore>>(std::make_unique<crimson::os::AlienStore>(
-      type, data, values));
+    return seastar::make_ready_future<std::unique_ptr<FuturizedStore>>(
+      std::make_unique<crimson::os::AlienStore>(type, data, values));
 #else
     ceph_abort_msgf("unsupported objectstore type: %s", type.c_str());
     return {};
diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt
index 6a03e8faa873d95ad82b4443c32cff4f0e702d06..e28a1a8a7c573c9a1f00a1c4cba73bd9592d06c9 100644 (file)
@@ -1,6 +1,7 @@
-add_library(crimson-seastore STATIC
+set(crimson_seastore_srcs
   cached_extent.cc
   seastore_types.cc
+  segment_manager.cc
   segment_manager/ephemeral.cc
   segment_manager/block.cc
   transaction_manager.cc
@@ -41,6 +42,15 @@ add_library(crimson-seastore STATIC
   ../../../test/crimson/seastore/test_block.cc
   ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc
        )
+
+if(HAVE_ZNS)
+  list(APPEND crimson_seastore_srcs
+    segment_manager/zns.cc)
+endif()
+
+add_library(crimson-seastore STATIC
+  ${crimson_seastore_srcs})
+
 target_link_libraries(crimson-seastore
   crimson)
 set_target_properties(crimson-seastore PROPERTIES
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
index 600ba4b58f77aa8e12b7377d1bffe4922d36ec59..f4cf1804801361a4b41cbcfe13c403fc2f6ab29b 100644 (file)
@@ -22,6 +22,7 @@
 #include "crimson/os/futurized_collection.h"
 
 #include "crimson/os/seastore/segment_cleaner.h"
+#include "crimson/os/seastore/segment_manager.h"
 #include "crimson/os/seastore/segment_manager/block.h"
 #include "crimson/os/seastore/collection_manager/flat_collection_manager.h"
 #include "crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h"
@@ -30,6 +31,7 @@
 #include "crimson/os/seastore/onode_manager.h"
 #include "crimson/os/seastore/object_data_handler.h"
 
+
 using std::string;
 using crimson::common::local_conf;
 
@@ -1376,45 +1378,45 @@ uuid_d SeaStore::get_fsid() const
   return segment_manager->get_meta().seastore_id;
 }
 
-std::unique_ptr<SeaStore> make_seastore(
+seastar::future<std::unique_ptr<SeaStore>> make_seastore(
   const std::string &device,
   const ConfigValues &config)
 {
-  auto sm = std::make_unique<
-    segment_manager::block::BlockSegmentManager
-    >(device + "/block");
-
-  auto scanner = std::make_unique<ExtentReader>();
-  auto& scanner_ref = *scanner.get();
-  auto segment_cleaner = std::make_unique<SegmentCleaner>(
-    SegmentCleaner::config_t::get_default(),
-    std::move(scanner),
-    false /* detailed */);
-
-  auto journal = std::make_unique<Journal>(*sm, scanner_ref);
-  auto cache = std::make_unique<Cache>(scanner_ref);
-  auto lba_manager = lba_manager::create_lba_manager(*sm, *cache);
-
-  auto epm = std::make_unique<ExtentPlacementManager>(*cache, *lba_manager);
-
-  journal->set_segment_provider(&*segment_cleaner);
-
-  auto tm = std::make_unique<TransactionManager>(
-    *sm,
-    std::move(segment_cleaner),
-    std::move(journal),
-    std::move(cache),
-    std::move(lba_manager),
-    std::move(epm),
-    scanner_ref);
-
-  auto cm = std::make_unique<collection_manager::FlatCollectionManager>(*tm);
-  return std::make_unique<SeaStore>(
-    device,
-    std::move(sm),
-    std::move(tm),
-    std::move(cm),
-    std::make_unique<crimson::os::seastore::onode::FLTreeOnodeManager>(*tm));
+  return SegmentManager::get_segment_manager(
+    device
+  ).then([&device](auto sm) {
+    auto scanner = std::make_unique<ExtentReader>();
+    auto& scanner_ref = *scanner.get();
+    auto segment_cleaner = std::make_unique<SegmentCleaner>(
+      SegmentCleaner::config_t::get_default(),
+      std::move(scanner),
+      false /* detailed */);
+
+    auto journal = std::make_unique<Journal>(*sm, scanner_ref);
+    auto cache = std::make_unique<Cache>(scanner_ref);
+    auto lba_manager = lba_manager::create_lba_manager(*sm, *cache);
+
+    auto epm = std::make_unique<ExtentPlacementManager>(*cache, *lba_manager);
+
+    journal->set_segment_provider(&*segment_cleaner);
+
+    auto tm = std::make_unique<TransactionManager>(
+      *sm,
+      std::move(segment_cleaner),
+      std::move(journal),
+      std::move(cache),
+      std::move(lba_manager),
+      std::move(epm),
+      scanner_ref);
+
+    auto cm = std::make_unique<collection_manager::FlatCollectionManager>(*tm);
+    return std::make_unique<SeaStore>(
+      device,
+      std::move(sm),
+      std::move(tm),
+      std::move(cm),
+      std::make_unique<crimson::os::seastore::onode::FLTreeOnodeManager>(*tm));
+  });
 }
 
 }
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
index b205872141d790083494bc0ed9b788540a803b8f..e533d42e1d746fdcf53ce875651b526002e89b08 100644 (file)
@@ -388,7 +388,7 @@ private:
   seastar::future<> write_fsid(uuid_d new_osd_fsid);
 };
 
-std::unique_ptr<SeaStore> make_seastore(
+seastar::future<std::unique_ptr<SeaStore>> make_seastore(
   const std::string &device,
   const ConfigValues &config);
 }
diff --git a/src/crimson/os/seastore/segment_manager.cc b/src/crimson/os/seastore/segment_manager.cc
new file mode 100644 (file)
index 0000000..012990d
--- /dev/null
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/segment_manager/block.h"
+#include "crimson/common/log.h"
+
+
+#ifdef HAVE_ZNS
+#include "crimson/os/seastore/segment_manager/zns.h"
+#endif
+
+namespace{
+  seastar::logger &logger(){
+    return crimson::get_logger(ceph_subsys_seastore);
+  }
+}
+
+namespace crimson::os::seastore {
+
+seastar::future<crimson::os::seastore::SegmentManagerRef>
+SegmentManager::get_segment_manager(
+  const std::string &device)
+{
+#ifdef HAVE_ZNS
+  return seastar::do_with(
+    static_cast<size_t>(0),
+    [&](auto &nr_zones) {
+      return seastar::open_file_dma(
+       device + "/block",
+       seastar::open_flags::rw
+      ).then([&](auto file) {
+       return seastar::do_with(
+         file,
+         [=, &nr_zones](auto &f) -> seastar::future<int> {
+           ceph_assert(f);
+           return f.ioctl(BLKGETNRZONES, (void *)&nr_zones);
+         });
+      }).then([&](auto ret) -> crimson::os::seastore::SegmentManagerRef {
+       crimson::os::seastore::SegmentManagerRef sm;
+       logger().error("NR_ZONES: {}", nr_zones);
+       if (nr_zones != 0) {
+         return std::make_unique<
+           segment_manager::zns::ZNSSegmentManager
+           >(device + "/block");
+       } else {
+         return std::make_unique<
+           segment_manager::block::BlockSegmentManager
+           >(device + "/block");
+       }
+      });
+    });
+#else
+  return seastar::make_ready_future<crimson::os::seastore::SegmentManagerRef>(
+    std::make_unique<
+      segment_manager::block::BlockSegmentManager
+    >(device + "/block"));
+#endif
+}
+
+}
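
A hedged sketch of how a caller consumes the new asynchronous factory (the function
name probe and the data_dir argument below are illustrative, not from the patch); it
mirrors what make_seastore() now does in seastore.cc:

#include <string>
#include <seastar/core/future.hh>
#include "crimson/os/seastore/segment_manager.h"

// Illustrative only: the returned SegmentManagerRef owns either a ZNSSegmentManager
// or a BlockSegmentManager, depending on what BLKGETNRZONES reported.
seastar::future<> probe(const std::string &data_dir) {
  return crimson::os::seastore::SegmentManager::get_segment_manager(
    data_dir
  ).then([](crimson::os::seastore::SegmentManagerRef sm) {
    // <data_dir>/block has been probed at this point; sm is ready for mkfs()/mount().
    return seastar::make_ready_future<>();
  });
}
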
diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h
index 437aa72ad49f38b6adc29ce8240660ebb6d7967a..4315829618c24b7d5a3bbc77239255cd51328a70 100644 (file)
@@ -13,6 +13,7 @@
 #include "include/ceph_assert.h"
 #include "crimson/os/seastore/seastore_types.h"
 #include "include/buffer_fwd.h"
+#include "crimson/common/config_proxy.h"
 #include "crimson/osd/exceptions.h"
 
 namespace crimson::os::seastore {
@@ -141,6 +142,9 @@ public:
 using SegmentRef = boost::intrusive_ptr<Segment>;
 
 constexpr size_t PADDR_SIZE = sizeof(paddr_t);
+class SegmentManager;
+
+using SegmentManagerRef = std::unique_ptr<SegmentManager>;
 
 class SegmentManager {
 public:
@@ -213,8 +217,9 @@ public:
   virtual magic_t get_magic() const = 0;
 
   virtual ~SegmentManager() {}
+
+  static seastar::future<SegmentManagerRef> get_segment_manager(const std::string &device);
 };
-using SegmentManagerRef = std::unique_ptr<SegmentManager>;
 
 }
 
diff --git a/src/crimson/os/seastore/segment_manager/zns.cc b/src/crimson/os/seastore/segment_manager/zns.cc
new file mode 100644 (file)
index 0000000..bc9752b
--- /dev/null
@@ -0,0 +1,597 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+#include <linux/blkzoned.h>
+
+#include "crimson/os/seastore/segment_manager/zns.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/log.h"
+#include "include/buffer.h"
+
+namespace {
+seastar::logger &logger(){
+  return crimson::get_logger(ceph_subsys_seastore);
+}
+}
+
+namespace crimson::os::seastore::segment_manager::zns {
+
+using open_device_ret = ZNSSegmentManager::access_ertr::future<
+  std::pair<seastar::file, seastar::stat_data>>;
+static open_device_ret open_device(
+  const std::string &path,
+  seastar::open_flags mode)
+{
+  return seastar::file_stat(
+    path, seastar::follow_symlink::yes
+  ).then([mode, &path](auto stat) mutable{
+    return seastar::open_file_dma(path, mode).then([=](auto file){
+      logger().error(
+       "open_device: open successful, size {}",
+       stat.size);
+      return std::make_pair(file, stat);
+    });
+  }).handle_exception(
+    [](auto e) -> open_device_ret {
+      logger().error(
+       "open_device: got error {}",
+       e);
+      return crimson::ct_error::input_output_error::make();
+    }
+  );
+}
+
+static zns_sm_metadata_t make_metadata(
+  seastore_meta_t meta,
+  const seastar::stat_data &data,
+  size_t zone_size,
+  size_t zone_capacity,
+  size_t num_zones)
+{
+  using crimson::common::get_conf;
+  
+  auto config_size = get_conf<Option::size_t>(
+    "seastore_device_size");
+  
+  size_t size = (data.size == 0) ? config_size : data.size;
+  
+  auto config_segment_size = get_conf<Option::size_t>(
+    "seastore_segment_size");
+  logger().error("CONFIG SIZE: {}", config_segment_size);
+  size_t zones_per_segment = config_segment_size / zone_capacity;
+  
+  size_t segments = (num_zones - 1) * zones_per_segment;
+  
+  logger().debug(
+    "{}: size {}, block_size {}, allocated_size {}, configured_size {}, "
+    "segment_size {}",
+    __func__,
+    data.size,
+    data.block_size,
+    data.allocated_size,
+    config_size,
+    config_segment_size);
+  
+  zns_sm_metadata_t ret = zns_sm_metadata_t{
+    size,
+    config_segment_size,
+    zone_capacity * zones_per_segment,
+    zones_per_segment,
+    zone_capacity,
+    data.block_size,
+    segments,
+    zone_size,
+    zone_size,
+    meta};
+  return ret;
+}
+
+struct ZoneReport {
+  struct blk_zone_report *hdr;
+  ZoneReport(int nr_zones) 
+    : hdr((blk_zone_report *)malloc(
+           sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone))){;}
+  ~ZoneReport(){
+    free(hdr);
+  }
+  ZoneReport(const ZoneReport &) = delete;
+  ZoneReport(ZoneReport &&rhs) : hdr(rhs.hdr) {
+    rhs.hdr = nullptr;
+  }
+};
+
+static seastar::future<> reset_device(
+  seastar::file &device, 
+  uint32_t zone_size, 
+  uint32_t nr_zones)
+{
+  return seastar::do_with(
+    blk_zone_range{},
+    ZoneReport(nr_zones),
+    [&, nr_zones] (auto &range, auto &zr){
+      range.sector = 0;
+      range.nr_sectors = zone_size * nr_zones;
+      return device.ioctl(
+       BLKRESETZONE, 
+       &range
+      ).then([&](int ret){
+       return seastar::now();
+      });
+    }
+  );
+}
+
+static seastar::future<size_t> get_zone_capacity(
+  seastar::file &device, 
+  uint32_t zone_size, 
+  uint32_t nr_zones)
+{
+  return seastar::do_with(
+    blk_zone_range{},
+    ZoneReport(nr_zones),
+    [&] (auto &first_zone_range, auto &zr){
+      first_zone_range.sector = 0;
+      first_zone_range.nr_sectors = zone_size;
+      return device.ioctl(
+       BLKOPENZONE, 
+       &first_zone_range
+      ).then([&](int ret){
+       return device.ioctl(BLKREPORTZONE, zr.hdr);
+      }).then([&] (int ret){
+       return device.ioctl(BLKRESETZONE, &first_zone_range);
+      }).then([&](int ret){
+       return seastar::make_ready_future<size_t>(zr.hdr->zones[0].wp);
+      });
+    }
+  );
+}
+
+static write_ertr::future<> do_write(
+  seastar::file &device,
+  uint64_t offset,
+  bufferptr &bptr)
+{
+  logger().debug(
+    "zns: do_write offset {} len {}",
+    offset,
+    bptr.length());
+  return device.dma_write(
+    offset,
+    bptr.c_str(),
+    bptr.length() 
+  ).handle_exception(
+    [](auto e) -> write_ertr::future<size_t> {
+      logger().error(
+        "do_write: dma_write got error {}",
+        e);
+      return crimson::ct_error::input_output_error::make();
+    }
+  ).then([length = bptr.length()](auto result) -> write_ertr::future<> {
+    if (result != length) {
+      return crimson::ct_error::input_output_error::make();
+    }
+    return write_ertr::now();
+  });
+}
+
+static write_ertr::future<> do_writev(
+  seastar::file &device,
+  uint64_t offset,
+  bufferlist&& bl,
+  size_t block_size)
+{
+  logger().error(
+    "block: do_writev offset {} len {}",
+    offset,
+    bl.length());
+  // writev requires each buffer to be aligned to the disks' block
+  // size, we need to rebuild here
+  bl.rebuild_aligned(block_size);
+  
+  std::vector<iovec> iov;
+  bl.prepare_iov(&iov);
+  return device.dma_write(
+    offset,
+    std::move(iov)
+  ).handle_exception(
+    [](auto e) -> write_ertr::future<size_t> {
+      logger().error(
+       "do_writev: dma_write got error {}",
+       e);
+      return crimson::ct_error::input_output_error::make();
+    }
+  ).then([bl=std::move(bl)/* hold the buf until the end of io */](size_t written)
+        -> write_ertr::future<> {
+    if (written != bl.length()) {
+      return crimson::ct_error::input_output_error::make();
+    }
+    return write_ertr::now();
+  });
+}
+
+static ZNSSegmentManager::access_ertr::future<>
+write_metadata(seastar::file &device, zns_sm_metadata_t sb)
+{
+  assert(ceph::encoded_sizeof_bounded<zns_sm_metadata_t>() <
+        sb.block_size);
+  return seastar::do_with(
+    bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
+    [=, &device](auto &bp){
+      logger().error("BLOCK SIZE: {}", sb.block_size);
+      bufferlist bl;
+      encode(sb, bl);
+      auto iter = bl.begin();
+      assert(bl.length() < sb.block_size);
+      logger().error("{}", bl.length());
+      iter.copy(bl.length(), bp.c_str());
+      logger().debug("write_metadata: doing writeout");
+      return do_write(device, 0, bp);
+    });
+}
+
+static read_ertr::future<> do_read(
+  seastar::file &device,
+  uint64_t offset,
+  size_t len,
+  bufferptr &bptr)
+{
+  assert(len <= bptr.length());
+  logger().debug(
+    "block: do_read offset {} len {}",
+    offset,
+    len);
+  return device.dma_read(
+    offset,
+    bptr.c_str(),
+    len
+  ).handle_exception(
+    [](auto e) -> read_ertr::future<size_t> {
+      logger().error(
+        "do_read: dma_read got error {}",
+        e);
+      return crimson::ct_error::input_output_error::make();
+    }
+  ).then([len](auto result) -> read_ertr::future<> {
+    if (result != len) {
+      return crimson::ct_error::input_output_error::make();
+    }
+    return read_ertr::now();
+  });
+}
+
+static
+ZNSSegmentManager::access_ertr::future<zns_sm_metadata_t>
+read_metadata(seastar::file &device, seastar::stat_data sd)
+{
+  assert(ceph::encoded_sizeof_bounded<zns_sm_metadata_t>() <
+        sd.block_size);
+  return seastar::do_with(
+    bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
+    [=, &device](auto &bp) {
+      return do_read(
+       device,
+       0,
+       bp.length(),
+       bp
+      ).safe_then([=, &bp] {
+       bufferlist bl;
+       bl.push_back(bp);
+       zns_sm_metadata_t ret;
+       auto bliter = bl.cbegin();
+       decode(ret, bliter);
+       return ZNSSegmentManager::access_ertr::future<zns_sm_metadata_t>(
+         ZNSSegmentManager::access_ertr::ready_future_marker{},
+         ret);
+      });
+    });
+}
+
+ZNSSegmentManager::mount_ret ZNSSegmentManager::mount() 
+{
+  return open_device(
+    device_path, seastar::open_flags::rw
+  ).safe_then([=](auto p) {
+    device = std::move(p.first);
+    auto sd = p.second;
+    return read_metadata(device, sd);
+  }).safe_then([=](auto meta){
+    metadata = meta;
+    return mount_ertr::now();
+  });
+}
+
+ZNSSegmentManager::mkfs_ret ZNSSegmentManager::mkfs(
+  segment_manager_config_t config)
+{
+  logger().error("ZNSSegmentManager::mkfs: starting");
+  return seastar::do_with(
+    seastar::file{},
+    seastar::stat_data{},
+    zns_sm_metadata_t{},
+    size_t(),
+    size_t(),
+    [=](auto &device, auto &stat, auto &sb, auto &zone_size, auto &nr_zones){
+      logger().error("ZNSSegmentManager::mkfs path {}", device_path);
+      return open_device(
+       device_path, 
+       seastar::open_flags::rw
+      ).safe_then([=, &device, &stat, &sb, &zone_size, &nr_zones](auto p){
+       device = p.first;
+       stat = p.second;
+       return device.ioctl(
+         BLKGETNRZONES, 
+         (void *)&nr_zones
+       ).then([&](int ret){
+         if (nr_zones == 0) {
+           return seastar::make_exception_future<int>(
+             std::system_error(std::make_error_code(std::errc::io_error)));
+         }
+         return device.ioctl(BLKGETZONESZ, (void *)&zone_size);
+       }).then([&] (int ret){
+         return reset_device(device, zone_size, nr_zones);
+       }).then([&] {
+         return get_zone_capacity(device, zone_size, nr_zones); 
+       }).then([&, config] (auto zone_capacity){
+         sb = make_metadata(
+           config.meta, 
+           stat, 
+           zone_size, 
+           zone_capacity, 
+           nr_zones);
+         metadata = sb;
+         stats.metadata_write.increment(
+           ceph::encoded_sizeof_bounded<zns_sm_metadata_t>());
+         logger().error("WROTE TO STATS");
+         return write_metadata(device, sb);
+       }).finally([&] {
+         logger().error("CLOSING DEVICE");
+         return device.close(); 
+       }).safe_then([] {
+         logger().error("RETURNING FROM MKFS");
+         return mkfs_ertr::now();
+       });
+      });
+    });
+}
+
+struct blk_zone_range make_range(
+  segment_id_t id, 
+  size_t segment_size, 
+  size_t block_size, 
+  size_t first_segment_offset)
+{
+  return blk_zone_range{
+    (id.device_segment_id() * segment_size + first_segment_offset),
+    (segment_size)  
+  };
+}
+
+using blk_open_zone_ertr = crimson::errorator<
+  crimson::ct_error::input_output_error>;
+using blk_open_zone_ret = blk_open_zone_ertr::future<>;
+blk_open_zone_ret blk_open_zone(seastar::file &device, blk_zone_range &range){
+  return device.ioctl(
+    BLKOPENZONE, 
+    &range
+  ).then_wrapped([=](auto f) -> blk_open_zone_ret{
+    if (f.failed()) {
+      return crimson::ct_error::input_output_error::make();
+    }
+    else {
+      int ret = f.get();
+      if (ret == 0) {
+       return seastar::now();
+      } else {
+       return crimson::ct_error::input_output_error::make();
+      }
+    }
+  });
+}
+
+ZNSSegmentManager::open_ertr::future<SegmentRef> ZNSSegmentManager::open(
+  segment_id_t id)
+{
+  return seastar::do_with(
+    blk_zone_range{},
+    [=] (auto &range){
+      range = make_range(
+       id, 
+       metadata.zone_size, 
+       metadata.block_size, 
+       metadata.first_segment_offset);
+      return blk_open_zone(
+       device, 
+       range
+      );
+    }
+  ).safe_then([=] {
+    logger().error("open _segment: open successful");
+    return open_ertr::future<SegmentRef>(
+      open_ertr::ready_future_marker{},
+      SegmentRef(new ZNSSegment(*this, id))
+    );
+  });
+}
+
+using blk_close_zone_ertr = crimson::errorator<
+  crimson::ct_error::input_output_error>;
+using blk_close_zone_ret = blk_close_zone_ertr::future<>;
+blk_close_zone_ret blk_close_zone(
+  seastar::file &device, 
+  blk_zone_range &range)
+{
+  return device.ioctl(
+    BLKCLOSEZONE, 
+    &range
+  ).then_wrapped([=](auto f) -> blk_open_zone_ret{
+    if (f.failed()) {
+      return crimson::ct_error::input_output_error::make();
+    }
+    else {
+      int ret = f.get();
+      if (ret == 0) {
+       return seastar::now();
+      } else {
+       return crimson::ct_error::input_output_error::make();
+      }
+    }
+  });
+}
+
+ZNSSegmentManager::release_ertr::future<> ZNSSegmentManager::release(
+  segment_id_t id) 
+{
+  return seastar::do_with(
+    blk_zone_range{},
+    [=] (auto &range){
+      range = make_range(
+       id, 
+       metadata.zone_size, 
+       metadata.block_size, 
+       metadata.first_segment_offset);
+      return blk_close_zone(
+       device, 
+       range
+      );
+    }
+  ).safe_then([=] {
+    logger().error("release _segment: release successful");
+    return release_ertr::now();
+  });
+}
+
+SegmentManager::read_ertr::future<> ZNSSegmentManager::read(
+  paddr_t addr,
+  size_t len,
+  ceph::bufferptr &out)
+{
+  auto& seg_addr = addr.as_seg_paddr();
+  if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) {
+    logger().error(
+      "ZNSSegmentManager::read: invalid segment {}",
+      addr);
+    return crimson::ct_error::invarg::make();
+  }
+  
+  if (seg_addr.get_segment_off() + len > metadata.zone_size) {
+    logger().error(
+      "ZNSSegmentManager::read: invalid offset {}~{}!",
+      addr,
+      len);
+    return crimson::ct_error::invarg::make();
+  }
+  return do_read(
+    device,
+    get_offset(addr),
+    len,
+    out);
+}
+
+Segment::close_ertr::future<> ZNSSegmentManager::segment_close(
+  segment_id_t id, segment_off_t write_pointer)
+{
+  return seastar::do_with(
+    blk_zone_range{},
+    [=] (auto &range){
+      range = make_range(
+       id, 
+       metadata.zone_size, 
+       metadata.block_size, 
+       metadata.first_segment_offset);
+      return blk_close_zone(
+       device, 
+       range
+      );
+    }
+  ).safe_then([=] {
+    logger().error("open _segment: open successful");
+    return Segment::close_ertr::now();
+  });
+}
+
+Segment::write_ertr::future<> ZNSSegmentManager::segment_write(
+  paddr_t addr,
+  ceph::bufferlist bl,
+  bool ignore_check)
+{
+  assert(addr.get_device_id() == get_device_id());
+  assert((bl.length() % metadata.block_size) == 0);
+  auto& seg_addr = addr.as_seg_paddr();
+  logger().debug(
+    "BlockSegmentManager::segment_write: "
+    "segment_write to segment {} at offset {}, physical offset {}, len {}",
+    seg_addr.get_segment_id(),
+    seg_addr.get_segment_off(),
+    get_offset(addr),
+    bl.length());
+  stats.data_write.increment(bl.length());
+  return do_writev(
+    device, 
+    get_offset(addr), 
+    std::move(bl), 
+    metadata.block_size);
+}
+
+device_id_t ZNSSegmentManager::get_device_id() const
+{
+  return metadata.device_id;
+};
+
+secondary_device_set_t& ZNSSegmentManager::get_secondary_devices()
+{
+  return metadata.secondary_devices;
+};
+
+device_spec_t ZNSSegmentManager::get_device_spec() const
+{
+  auto spec = device_spec_t();
+  spec.magic = metadata.magic;
+  spec.dtype = metadata.dtype;
+  spec.id = metadata.device_id;
+  return spec;
+};
+
+magic_t ZNSSegmentManager::get_magic() const
+{
+  return metadata.magic;
+};
+
+segment_off_t ZNSSegment::get_write_capacity() const
+{
+  return manager.get_segment_size();
+}
+
+SegmentManager::close_ertr::future<> ZNSSegmentManager::close()
+{
+  if (device) {
+    return device.close();
+  }
+  return seastar::now();
+}
+
+Segment::close_ertr::future<> ZNSSegment::close()
+{
+  return manager.segment_close(id, write_pointer);
+}
+
+Segment::write_ertr::future<> ZNSSegment::write(
+  segment_off_t offset, ceph::bufferlist bl)
+{
+  if (offset < write_pointer || offset % manager.metadata.block_size != 0) {
+    logger().error(
+      "ZNSSegmentManager::ZNSSegment::write: "
+      "invalid segment write on segment {} to offset {}",
+      id,
+      offset);
+    return crimson::ct_error::invarg::make();
+  }
+  if (offset + bl.length() > manager.metadata.segment_size)
+    return crimson::ct_error::enospc::make();
+  
+  write_pointer = offset + bl.length();
+  return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl);
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/zns.h b/src/crimson/os/seastore/segment_manager/zns.h
new file mode 100644 (file)
index 0000000..0bb8043
--- /dev/null
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <linux/blkzoned.h>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/file.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/reactor.hh>
+
+#include "crimson/common/layout.h"
+
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "include/uuid.h"
+
+namespace crimson::os::seastore::segment_manager::zns {
+
+  struct zns_sm_metadata_t {
+    size_t size = 0;
+    size_t segment_size = 0;
+    size_t segment_capacity = 0;
+    size_t zones_per_segment = 0;
+    size_t zone_capacity = 0;
+    size_t block_size = 0;
+    size_t segments = 0;
+    size_t zone_size = 0;
+    uint64_t first_segment_offset = 0;
+    seastore_meta_t meta;
+    
+    bool major_dev = false;
+    magic_t magic = 0;
+    device_type_t dtype = device_type_t::NONE;
+    device_id_t device_id = 0;
+    secondary_device_set_t secondary_devices;
+
+    DENC(zns_sm_metadata_t, v, p) {
+      DENC_START(1, 1, p);
+      denc(v.size, p);
+      denc(v.segment_size, p);
+      denc(v.zone_capacity, p);
+      denc(v.zones_per_segment, p);
+      denc(v.block_size, p);
+      denc(v.segments, p);
+      denc(v.zone_size, p);
+      denc(v.first_segment_offset, p);
+      denc(v.meta, p);
+      denc(v.magic, p);
+      denc(v.dtype, p);
+      denc(v.device_id, p);
+      if (v.major_dev) {
+       denc(v.secondary_devices, p);
+      }
+      DENC_FINISH(p);
+    }
+  };
+
+  using write_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
+  using read_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
+
+  class ZNSSegmentManager;
+
+  class ZNSSegment final : public Segment {
+  public:
+    ZNSSegment(ZNSSegmentManager &man, segment_id_t i) : manager(man), id(i){};
+
+    segment_id_t get_segment_id() const final { return id; }
+    segment_off_t get_write_capacity() const final;
+    segment_off_t get_write_ptr() const final { return write_pointer; }
+    close_ertr::future<> close() final;
+    write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+
+    ~ZNSSegment() {}
+  private:
+    friend class ZNSSegmentManager;
+    ZNSSegmentManager &manager;
+    const segment_id_t id;
+    segment_off_t write_pointer = 0;
+  };
+
+  class ZNSSegmentManager final : public SegmentManager{
+  public:
+    mount_ret mount() final;
+    mkfs_ret mkfs(segment_manager_config_t meta) final;
+    open_ertr::future<SegmentRef> open(segment_id_t id) final;
+    close_ertr::future<> close() final;
+
+    release_ertr::future<> release(segment_id_t id) final;
+
+    read_ertr::future<> read(
+      paddr_t addr, 
+      size_t len, 
+      ceph::bufferptr &out) final;
+
+    size_t get_size() const final {
+      return metadata.size;
+    };
+
+    segment_off_t get_block_size() const final {
+      return metadata.block_size;
+    };
+
+    segment_off_t get_segment_size() const final {
+      return metadata.segment_size;
+    };
+
+    const seastore_meta_t &get_meta() const {
+      return metadata.meta;
+    };
+
+    device_id_t get_device_id() const final;
+
+    secondary_device_set_t& get_secondary_devices() final;
+
+    device_spec_t get_device_spec() const final;
+
+    magic_t get_magic() const final;
+
+    ZNSSegmentManager(const std::string &path) : device_path(path) {}
+
+    ~ZNSSegmentManager() final = default;
+
+    Segment::write_ertr::future<> segment_write(
+    paddr_t addr,
+    ceph::bufferlist bl,
+    bool ignore_check=false);
+
+  private:
+    friend class ZNSSegment;
+    std::string device_path;
+    zns_sm_metadata_t metadata;
+    seastar::file device;
+    uint32_t nr_zones;
+    struct effort_t {
+      uint64_t num = 0;
+      uint64_t bytes = 0;
+
+      void increment(uint64_t read_bytes) {
+        ++num;
+        bytes += read_bytes;
+      }
+    };
+
+    struct zns_sm_stats {
+      effort_t data_read = {};
+      effort_t data_write = {};
+      effort_t metadata_write = {};
+      uint64_t opened_segments = 0;
+      uint64_t closed_segments = 0;
+      uint64_t closed_segments_unused_bytes = 0;
+      uint64_t released_segments = 0;
+
+      void reset() {
+       *this = zns_sm_stats{};
+      }
+    } stats;
+
+    void register_metrics();
+    seastar::metrics::metric_group metrics;
+
+    Segment::close_ertr::future<> segment_close(
+      segment_id_t id, segment_off_t write_pointer);
+
+    uint64_t get_offset(paddr_t addr) {
+      auto& seg_addr = addr.as_seg_paddr();
+      const auto default_sector_size = 512;
+      return (metadata.first_segment_offset +
+             (seg_addr.get_segment_id().device_segment_id() * 
+              metadata.zone_size)) * default_sector_size + 
+       seg_addr.get_segment_off();
+    }
+  };
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(
+  crimson::os::seastore::segment_manager::zns::zns_sm_metadata_t
+)
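
For reference, the byte offset computed by ZNSSegmentManager::get_offset() above
treats zone_size and first_segment_offset as 512-byte-sector quantities (the unit
BLKGETZONESZ reports). A hedged worked example of the mapping, with geometry assumed
purely for illustration (256 MiB zones, first segment at sector 0):

#include <cassert>
#include <cstdint>

int main() {
  // Assumed geometry; real values come from the device and the on-disk metadata.
  const uint64_t sector_size = 512;           // default_sector_size in get_offset()
  const uint64_t zone_size_sectors = 524288;  // 256 MiB zone = 524288 * 512 B
  const uint64_t first_segment_offset = 0;    // in sectors

  // Segment 3, 4 KiB into the segment, as get_offset() would compute it.
  const uint64_t device_segment_id = 3;
  const uint64_t segment_off = 4096;
  const uint64_t byte_offset =
    (first_segment_offset + device_segment_id * zone_size_sectors) * sector_size
    + segment_off;

  assert(byte_offset == 3ull * 256 * 1024 * 1024 + 4096);  // 805310464 bytes
  return 0;
}
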
diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake
index 01681c494612c5fc3a8dd1bb3f671c3240d2b8c5..3499e64893ddf31151cef40882290b9be384ae2f 100644 (file)
 /* Define to 1 if you have libxfs */
 #cmakedefine HAVE_LIBXFS 1
 
+/* Define to 1 if zns support enabled */
+#cmakedefine HAVE_ZNS
+
 /* SPDK conditional compilation */
 #cmakedefine HAVE_SPDK