git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
seastore: add NormalNBD
authorJinyong Ha <jyha200@gmail.com>
Fri, 16 Apr 2021 08:12:55 +0000 (17:12 +0900)
committermyoungwon oh <ohmyoungwon@gmail.com>
Tue, 15 Jun 2021 02:09:06 +0000 (11:09 +0900)
NormalNBD - NVMe block device accessed through seastar file descriptors
 1. Support the preferred write granularity/alignment (PWG/PWA) reported
 by the NVMe SSD. Users should follow this guidance to improve write
 performance (see the alignment sketch below).
 2. Support multi-stream IO on NVMe SSDs.

Signed-off-by: Jinyong Ha <jy200.ha@samsung.com>
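
For item (1) above, a minimal sketch of how a caller might honor the reported guidance (illustrative only; the helper names are assumptions and PWG/PWA are treated here as byte counts):

  // Illustrative only: expand an IO range so that its start obeys the
  // preferred write alignment (PWA) and its length is a multiple of the
  // preferred write granularity (PWG), both reported by the device.
  #include <cstdint>
  #include <utility>

  uint64_t align_down(uint64_t v, uint64_t a) { return v - (v % a); }
  uint64_t align_up(uint64_t v, uint64_t a)   { return align_down(v + a - 1, a); }

  std::pair<uint64_t, uint64_t> preferred_write_range(
    uint64_t offset, uint64_t len, uint64_t pwa_bytes, uint64_t pwg_bytes) {
    uint64_t start  = align_down(offset, pwa_bytes);              // obey PWA
    uint64_t length = align_up(offset + len - start, pwg_bytes);  // obey PWG
    return {start, length};
  }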
src/crimson/os/seastore/CMakeLists.txt
src/crimson/os/seastore/nvmedevice/memory.cc
src/crimson/os/seastore/nvmedevice/nvmedevice.cc [new file with mode: 0644]
src/crimson/os/seastore/nvmedevice/nvmedevice.h
src/test/crimson/seastore/CMakeLists.txt
src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc [new file with mode: 0644]

index f9ad475b29dadd3c66016460b77621190e3053e4..c697d7e3acbbfae3f940c2095198a2a3402563a6 100644 (file)
@@ -34,6 +34,7 @@ add_library(crimson-seastore STATIC
   seastore.cc
   randomblock_manager.cc
   nvmedevice/memory.cc
+  nvmedevice/nvmedevice.cc
   ../../../test/crimson/seastore/test_block.cc
   ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc
        )
index 66be008d589f849431dcdbf78955c66a3581e755..eb371b490503419277712b5f7168e08833cd697a 100644 (file)
@@ -44,7 +44,8 @@ TestMemory::open(const std::string &in_path, seastar::open_flags mode)
 write_ertr::future<>
 TestMemory::write(
   uint64_t offset,
-  bufferptr &bptr)
+  bufferptr &bptr,
+  uint16_t stream)
 {
   ceph_assert(buf);
   logger().debug(
diff --git a/src/crimson/os/seastore/nvmedevice/nvmedevice.cc b/src/crimson/os/seastore/nvmedevice/nvmedevice.cc
new file mode 100644 (file)
index 0000000..84f7079
--- /dev/null
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <fcntl.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/nvmedevice/nvmedevice.h"
+
+namespace {
+  seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+}
+
+static constexpr uint32_t MAX_EVENTS = 1024;
+
+namespace crimson::os::seastore::nvme_device {
+
+/* Background poller thread that reaps libaio IO completions for all streams */
+void poll_completion(std::vector<::io_context_t>* ctxs, bool* exit) {
+  while (*exit == false) {
+    for (auto& ctx : *ctxs) {
+      io_event events[MAX_EVENTS];
+
+      /*
+       * io_getevents() blocks this thread until at least one completion
+       * (and at most MAX_EVENTS) is available.
+       */
+      int num_events = io_getevents(ctx, 1, MAX_EVENTS, events, NULL);
+
+      for (int i = 0; i < num_events; i++) {
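+        /*
+         * events[i].obj points at the iocb that was submitted; io_context_t
+         * embeds that iocb as its first member, so the cast below recovers
+         * the owning io_context_t.
+         */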
+        io_context_t* io_context = (io_context_t*)events[i].obj;
+        io_context->done = true;
+      }
+    }
+  }
+}
+
+open_ertr::future<>
+NormalNBD::open(const std::string &in_path, seastar::open_flags mode) {
+  /* Open with a POSIX fd so that generic NVMe commands can be passed through via ioctl */
+  fd = seastar::file_desc::open(in_path, (int)mode);
+  identify_controller_data_t controller_data = {0, };
+  return identify_controller(controller_data).safe_then(
+      [this, controller_data, in_path, mode]() {
+      protocol_version = controller_data.version;
+      logger().debug("nvme protocol {}.{} {}",
+          (uint32_t)protocol_version.major_ver,
+          (uint32_t)protocol_version.minor_ver,
+          (uint32_t)protocol_version.tertiary_ver);
+
+      /*
+       * Multi Stream Write
+       *
+       * If the NVMe SSD supports multi-stream functionality, it sets the
+       * corresponding oacs bit in the identify_controller_data structure
+       * (see NVMe Specification 1.4). If that bit is set, NormalNBD opens the
+       * device file multiple times with different stream IDs. When the user
+       * calls write() with a stream argument, NormalNBD looks up the
+       * pre-opened FD for that stream ID and submits the write IO to it.
+       */
+      support_multistream = controller_data.oacs.support_directives;
+      if (support_multistream) {
+        write_life_max = 6;
+      }
+
+      open_for_io(in_path, mode);
+
+      /* PWG and PWA are reported starting from NVMe 1.4 */
+      if (protocol_version.major_ver > 1 ||
+          (protocol_version.major_ver == 1 && protocol_version.minor_ver >= 4)) {
+        identify_namespace_data_t namespace_data = {0, };
+        identify_namespace(namespace_data).safe_then([this, namespace_data]() {
+            /* NPWG and NPWA are 0-based values */
+            write_granularity = namespace_data.npwg + 1;
+            write_alignment = namespace_data.npwa + 1;
+            });
+      }
+      return seastar::now();
+      }).handle_error(
+        /* If the device does not support the NVMe ioctl, just open it without stream support */
+        crimson::ct_error::input_output_error::handle([this, in_path, mode](auto) {
+          open_for_io(in_path, mode);
+          return seastar::now();
+        }));
+}
+
+void
+NormalNBD::open_for_io(const std::string &in_path, seastar::open_flags mode)
+{
+  ctx.resize(write_life_max);
+  for (uint32_t i = 0; i < write_life_max; i++) {
+    stream_fd.push_back(seastar::file_desc::open(in_path, (int)mode));
+    if (i != write_life_not_set) {
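+      /*
+       * Attach a write-lifetime hint to this fd (F_SET_FILE_RW_HINT) so that
+       * writes submitted through stream_fd[i] carry lifetime class i, letting
+       * the kernel/SSD group data with similar lifetimes.
+       */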
+      int posix_fd = stream_fd[i].get();
+      uint64_t hint = i;  /* F_SET_FILE_RW_HINT takes a uint64_t* argument */
+      fcntl(posix_fd, F_SET_FILE_RW_HINT, &hint);
+    }
+
+    io_setup(MAX_EVENTS, &ctx[i]);
+  }
+  completion_poller = std::thread(poll_completion, &ctx, &exit);
+}
+
+write_ertr::future<>
+NormalNBD::write(
+  uint64_t offset,
+  bufferptr &bptr,
+  uint16_t stream) {
+  logger().debug(
+      "block: do_write offset {} len {}",
+      offset,
+      bptr.length());
+  io_context_t io_context = io_context_t();
+  io_prep_pwrite(
+      &io_context.cb,
+      stream_fd[stream].get(),
+      bptr.c_str(),
+      bptr.length(),
+      offset);
+  iocb* cb_ptr[1] = {&io_context.cb};
+  io_submit(ctx[stream], 1, cb_ptr);
+  return seastar::do_with(std::move(io_context), [] (auto& io_context) {
+    /*
+     * libaio needs an additional poller thread (see poll_completion) to reap
+     * IO completions. When the poller observes a completion, it sets the
+     * "done" field of the corresponding io_context.
+     */
+    if (io_context.done) {
+      return seastar::now();
+    }
+    return seastar::later();
+  });
+}
+
+read_ertr::future<>
+NormalNBD::read(
+  uint64_t offset,
+  bufferptr &bptr) {
+  logger().debug(
+      "block: do_read offset {} len {}",
+      offset,
+      bptr.length());
+  io_context_t io_context = io_context_t();
+  io_prep_pread(
+      &io_context.cb,
+      stream_fd[0].get(),
+      bptr.c_str(),
+      bptr.length(),
+      offset);
+  iocb* cb_ptr[1] = {&io_context.cb};
+  io_submit(ctx[0], 1, cb_ptr);
+  return seastar::do_with(std::move(io_context), [] (auto& io_context) {
+    if (io_context.done) {
+      return seastar::now();
+    }
+    return seastar::later();
+  });
+}
+
+seastar::future<>
+NormalNBD::close() {
+  logger().debug(" close ");
+  exit = true;
+  completion_poller.join();
+  fd.close();
+  return seastar::now();
+}
+
+nvme_command_ertr::future<>
+NormalNBD::pass_through_io(NVMePassThroughCommand& command) {
+  logger().debug("block: pass through");
+  int ret = fd.ioctl(NVME_IOCTL_IO_CMD, command);
+  if (ret < 0) {
+    logger().debug("block: pass through failed");
+    return crimson::ct_error::input_output_error::make();
+  }
+  else {
+    return nvme_command_ertr::now();
+  }
+}
+
+nvme_command_ertr::future<>
+NormalNBD::identify_namespace(identify_namespace_data_t& namespace_data) {
+  nvme_admin_command_t command = {0,};
+  command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+  command.identify_cmd.cns = nvme_identify_command_t::CNS_NAMESPACE;
+  command.common_cmd.addr = (uint64_t)&namespace_data;
+  command.common_cmd.data_len = sizeof(identify_namespace_data_t);
+
+  return pass_admin(command);
+}
+
+nvme_command_ertr::future<>
+NormalNBD::identify_controller(identify_controller_data_t& controller_data) {
+  nvme_admin_command_t command = {0,};
+  command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+  command.identify_cmd.cns = nvme_identify_command_t::CNS_CONTROLLER;
+  command.common_cmd.addr = (uint64_t)&controller_data;
+  command.common_cmd.data_len = sizeof(identify_controller_data_t);
+
+  return pass_admin(command);
+}
+
+nvme_command_ertr::future<>
+NormalNBD::pass_admin(nvme_admin_command_t& command) {
+  logger().debug("block: pass admin");
+  try {
+    int ret = fd.ioctl(NVME_IOCTL_ADMIN_CMD, command);
+    if (ret < 0) {
+      logger().debug("block: pass admin failed");
+      return crimson::ct_error::input_output_error::make();
+    }
+    else {
+      return nvme_command_ertr::now();
+    }
+  }
+  catch (...) {
+    logger().debug("block: pass admin failed");
+    return crimson::ct_error::input_output_error::make();
+  }
+}
+
+}
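
For illustration, a minimal caller-side sketch (not part of this commit) of issuing a custom Identify Controller command through pass_admin(), mirroring identify_controller() above; the function name and the NVMeBlockDevice pointer are assumptions:

  // Sketch: build an Identify Controller admin command and submit it through
  // the generic pass_admin() API; "device" is an assumed NVMeBlockDevice*.
  nvme_command_ertr::future<> query_nvme_version(
    NVMeBlockDevice* device, identify_controller_data_t& data) {
    nvme_admin_command_t cmd = {0,};
    cmd.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
    cmd.identify_cmd.cns = nvme_identify_command_t::CNS_CONTROLLER;
    cmd.common_cmd.addr = (uint64_t)&data;
    cmd.common_cmd.data_len = sizeof(identify_controller_data_t);
    return device->pass_admin(cmd).safe_then([&data] {
      // data.version now holds the reported major/minor/tertiary version.
      return nvme_command_ertr::now();
    });
  }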
index e2bc56c9e340139722413f66002c8370100281c5..c31fd0d844ce045bf0cee57927a8b9845db48167 100644 (file)
@@ -24,6 +24,85 @@ namespace ceph {
 
 namespace crimson::os::seastore::nvme_device {
 
+/*
+ * NVMe protocol structures (nvme_XX, identify_XX)
+ *
+ * All NVMe protocol structures below follow NVMe specification v1.4 (the
+ * latest at the time of writing). NVMe is a protocol for fast communication
+ * between the host and an SSD. We adopt only a subset of NVMe features to
+ * keep the implementation simple. NVMeBlockDevice also provides generic
+ * command submission APIs for IO and Admin commands; use pass_through_io()
+ * and pass_admin() for that purpose.
+ *
+ * For more information about the NVMe protocol, refer to
+ * https://nvmexpress.org/
+ */
+
+struct nvme_identify_command_t {
+  uint32_t common_dw[10];
+  uint32_t cns : 8;
+  uint32_t reserved : 8;
+  uint32_t controller_id : 16;
+
+  static const uint8_t CNS_NAMESPACE = 0x00;
+  static const uint8_t CNS_CONTROLLER = 0x01;
+};
+
+struct nvme_admin_command_t {
+  union
+  {
+    nvme_passthru_cmd common_cmd;
+    nvme_identify_command_t identify_cmd;
+  };
+
+  static const uint8_t OPCODE_IDENTIFY = 0x06;
+};
+
+struct nvme_version_t {
+  uint32_t major_ver : 16;
+  uint32_t minor_ver : 8;
+  uint32_t tertiary_ver : 8;
+};
+
+struct admin_command_support_t {
+  uint16_t unused : 5;
+  uint16_t support_directives : 1;
+  uint16_t unused2 : 10;
+};
+
+struct identify_controller_data_t {
+  union
+  {
+    struct
+    {
+      uint8_t raw[1024];
+    };
+    struct
+    {
+      uint8_t unused[80];
+      nvme_version_t version;
+      uint8_t unused2[172];
+      admin_command_support_t oacs;
+    };
+  };
+};
+
+struct identify_namespace_data_t {
+  union
+  {
+    struct
+    {
+      uint8_t raw[4096];
+    };
+    struct
+    {
+      uint8_t unused[64];
+      uint16_t npwg;
+      uint16_t npwa;
+    };
+  };
+};
+
+using NVMePassThroughCommand = nvme_passthru_cmd;
 
 using read_ertr = crimson::errorator<
   crimson::ct_error::input_output_error,
@@ -45,6 +124,11 @@ using open_ertr = crimson::errorator<
 using nvme_command_ertr = crimson::errorator<
   crimson::ct_error::input_output_error>;
 
+struct io_context_t {
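+  /* cb must stay the first member: the completion poller casts the finished
+   * iocb* back to the owning io_context_t. */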
+  iocb cb;
+  bool done = false;
+};
+
 /*
  * Interface between NVMe SSD and its user.
  *
@@ -95,9 +179,17 @@ public:
     uint64_t offset,
     bufferptr &bptr) = 0;
 
+  /*
+   * Multi-stream write
+   *
+   * Gives the device a hint classifying data whose lifetimes are similar to
+   * each other. Data written with the same stream value will be managed
+   * together in the SSD for better write performance.
+   */
   virtual write_ertr::future<> write(
     uint64_t offset,
-    bufferptr &bptr) = 0;
+    bufferptr &bptr,
+    uint16_t stream = 0) = 0;
 
   // TODO
   virtual int discard(uint64_t offset, uint64_t len) { return 0; }
@@ -105,9 +197,104 @@ public:
   virtual open_ertr::future<> open(
       const std::string& path,
       seastar::open_flags mode) = 0;
-
   virtual seastar::future<> close() = 0;
 
+  /*
+   * For passing an NVMe IO or Admin command through to the SSD. The caller
+   * can construct and execute its own NVMe command.
+   */
+  virtual nvme_command_ertr::future<> pass_through_io(
+    NVMePassThroughCommand& command) { return nvme_command_ertr::now(); }
+  virtual nvme_command_ertr::future<> pass_admin(
+    nvme_admin_command_t& command) { return nvme_command_ertr::now(); }
+
+  /*
+   * End-to-End Data Protection
+   *
+   * The NVMe device tracks data integrity in a manner similar to a checksum.
+   * Clients can offload checksumming to the device to reduce CPU utilization.
+   */
+   virtual write_ertr::future<> protected_write(
+    uint64_t offset,
+    bufferptr &bptr,
+    uint16_t stream = 0) { return write_ertr::now(); }
+
+  /*
+   * Data Health
+   *
+   * Returns a list of LBAs whose data is nearly corrupted and will become
+   * unreadable soon. The caller can overwrite, unmap, or refresh the data to
+   * protect it.
+   */
+   virtual nvme_command_ertr::future<> get_data_health(
+     std::list<uint64_t>& fragile_lbas) { return nvme_command_ertr::now(); }
+
+  /*
+   * Recovery Level
+   *
+   * Regulates the magnitude of SSD-internal data recovery. A lower level
+   * gives the caller better read latency.
+   */
+   virtual nvme_command_ertr::future<> set_data_recovery_level(
+     uint32_t level) { return nvme_command_ertr::now(); }
+
+  /*
+   * Predictable Latency
+   *
+   * NVMe devices can guarantee IO latency within a pre-defined time window.
+   * This functionality will be analyzed soon.
+   */
+};
+
+/*
+ * Implementation of NVMeBlockDevice with POSIX APIs
+ *
+ * NormalNBD provides the NVMe SSD interface through POSIX APIs, which are
+ * generally available in most operating environments.
+ */
+class NormalNBD : public NVMeBlockDevice {
+public:
+  NormalNBD() {}
+  ~NormalNBD() override {}
+
+  open_ertr::future<> open(
+    const std::string &in_path,
+    seastar::open_flags mode) override;
+
+  write_ertr::future<> write(
+    uint64_t offset,
+    bufferptr &bptr,
+    uint16_t stream = 0) override;
+
+  read_ertr::future<> read(
+    uint64_t offset,
+    bufferptr &bptr) override;
+
+  nvme_command_ertr::future<> pass_through_io(
+    NVMePassThroughCommand& command) override;
+
+  nvme_command_ertr::future<> pass_admin(
+    nvme_admin_command_t& command) override;
+
+  seastar::future<> close() override;
+
+private:
+  seastar::file_desc fd = seastar::file_desc::from_fd(-1);
+  std::vector<seastar::file_desc> stream_fd;
+  nvme_version_t protocol_version;
+  bool support_multistream = false;
+  std::vector<::io_context_t> ctx;
+  std::thread completion_poller;
+  bool exit = false;
+
+  uint32_t write_life_not_set = 0;
+  uint32_t write_life_max = 1;
+
+  nvme_command_ertr::future<> identify_controller(
+    identify_controller_data_t& controller_data);
+  nvme_command_ertr::future<> identify_namespace(
+    identify_namespace_data_t& namespace_data);
+  void open_for_io(const std::string& in_path, seastar::open_flags mode);
 };
 
 
@@ -122,17 +309,18 @@ public:
     }
   }
 
-  open_ertr::future<> open(const std::string &in_path, seastar::open_flags mode);
+  open_ertr::future<> open(const std::string &in_path, seastar::open_flags mode) override;
 
   write_ertr::future<> write(
     uint64_t offset,
-    bufferptr &bptr);
+    bufferptr &bptr,
+    uint16_t stream = 0) override;
 
   read_ertr::future<> read(
     uint64_t offset,
-    bufferptr &bptr);
+    bufferptr &bptr) override;
 
-  seastar::future<> close();
+  seastar::future<> close() override;
 
   char *buf;
   size_t size;
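
A brief usage sketch of the multi-stream write hint declared above (illustrative only; the stream values and wrapper names are assumptions):

  // Illustrative only: tag writes by expected data lifetime so the SSD can
  // group blocks with similar lifetimes (multi-stream / write hints).
  constexpr uint16_t STREAM_SHORT_LIVED = 1;  // e.g. journal / WAL blocks
  constexpr uint16_t STREAM_LONG_LIVED  = 4;  // e.g. cold object data

  write_ertr::future<> write_journal_block(
    NVMeBlockDevice& dev, uint64_t off, bufferptr& bp) {
    return dev.write(off, bp, STREAM_SHORT_LIVED);
  }

  write_ertr::future<> write_cold_block(
    NVMeBlockDevice& dev, uint64_t off, bufferptr& bp) {
    return dev.write(off, bp, STREAM_LONG_LIVED);
  }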
index 12b5ff65165910080081cdbce02090774a167520..af6c69568c7785794cbd79677e0bb43e586ab51b 100644 (file)
@@ -93,4 +93,14 @@ target_link_libraries(
   ${CMAKE_DL_LIBS}
   crimson-seastore)
 
+add_executable(unittest-seastore-nvmedevice
+  nvmedevice/test_nvmedevice.cc)
+add_ceph_test(unittest-seastore-nvmedevice
+  unittest-seastore-nvmedevice --memory 256M --smp 1)
+target_link_libraries(
+  unittest-seastore-nvmedevice
+  crimson::gtest
+  crimson-seastore
+  aio)
+
 add_subdirectory(onode_tree)
diff --git a/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc b/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc
new file mode 100644 (file)
index 0000000..e43d9d7
--- /dev/null
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/nvmedevice/nvmedevice.h"
+#include "test/crimson/gtest_seastar.h"
+#include "include/stringify.h"
+
+using namespace crimson;
+using namespace crimson::os;
+using namespace crimson::os::seastore;
+using namespace nvme_device;
+
+struct nvdev_test_t : seastar_test_suite_t {
+  NVMeBlockDevice* device;
+  std::string dev_path;
+
+  static const uint64_t DEV_SIZE = 1024 * 1024 * 1024;
+
+  nvdev_test_t() :
+    device(NVMeBlockDevice::create<NormalNBD>()),
+    dev_path("randomblock_manager.test_nvmedevice" + stringify(getpid())) {
+    int fd = ::open(dev_path.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644);
+    ceph_assert(fd >= 0);
+    ::ftruncate(fd, DEV_SIZE);
+    ::close(fd);
+  }
+  ~nvdev_test_t() {
+    ::unlink(dev_path.c_str());
+    delete device;
+  }
+};
+
+static const uint64_t BUF_SIZE = 8192;
+
+struct nvdev_test_block_t {
+  uint8_t data[BUF_SIZE];
+
+  DENC(nvdev_test_block_t, v, p) {
+    DENC_START(1, 1, p);
+    for (uint64_t i = 0 ; i < BUF_SIZE; i++)
+    {
+      denc(v.data[i], p);
+    }
+    DENC_FINISH(p);
+  }
+};
+
+WRITE_CLASS_DENC_BOUNDED(
+  nvdev_test_block_t
+)
+
+TEST_F(nvdev_test_t, write_and_verify_test)
+{
+  run_async([this] {
+    device->open(dev_path, seastar::open_flags::rw).unsafe_get();
+    nvdev_test_block_t original_data;
+    std::minstd_rand0 generator;
+    uint8_t value = generator();
+    memset(original_data.data, value, BUF_SIZE);
+    uint64_t bl_length = 0;
+    {
+      bufferlist bl;
+      encode(original_data, bl);
+      bl_length = bl.length();
+      auto write_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length));
+      bl.begin().copy(bl.length(), write_buf.c_str());
+      device->write(0, write_buf).unsafe_get();
+    }
+
+    nvdev_test_block_t read_data;
+    {
+      auto read_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length));
+      device->read(0, read_buf).unsafe_get();
+      bufferlist bl;
+      bl.push_back(read_buf);
+      auto bliter = bl.cbegin();
+      decode(read_data, bliter);
+    }
+
+    int ret = memcmp(original_data.data, read_data.data, BUF_SIZE);
+    device->close().wait();
+    ASSERT_TRUE(ret == 0);
+  });
+}