git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
seastore: replace POSIX call to seastar::file in NormalNBD
author Jinyong Ha <jy200.ha@samsung.com>
Mon, 24 May 2021 09:33:02 +0000 (18:33 +0900)
committer myoungwon oh <ohmyoungwon@gmail.com>
Tue, 15 Jun 2021 02:09:06 +0000 (11:09 +0900)
Signed-off-by: Jinyong Ha <jy200.ha@samsung.com>
src/crimson/os/seastore/nvmedevice/nvmedevice.cc
src/crimson/os/seastore/nvmedevice/nvmedevice.h
src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc

index 84f707983d21e569058250c6dc6db3472874aa62..f6c6cfeef4d93ed881bc2322c7a9ccdead6e8bb5 100644 (file)
@@ -17,211 +17,78 @@ namespace {
   }
 }
 
-static constexpr uint32_t MAX_EVENTS = 1024;
-
 namespace crimson::os::seastore::nvme_device {
 
-/* background io poller for multi-stream write */
-void poll_completion(std::vector<::io_context_t>* ctxs, bool* exit) {
-  while (*exit == false) {
-    for (auto& ctx : *ctxs) {
-      io_event events[MAX_EVENTS];
-
-      /*
-       * At least a completion should be returned. Otherwise, thread is blocked
-       * until it is possible
-       */
-      int num_events = io_getevents(ctx, 1, MAX_EVENTS, events, NULL);
-
-      for (int i = 0; i < num_events; i++) {
-        io_context_t* io_context = (io_context_t*)events[i].obj;
-        io_context->done = true;
-      }
-    }
-  }
-}
-
-open_ertr::future<>
-NormalNBD::open(const std::string &in_path, seastar::open_flags mode) {
-  /* Open with posix fd for pass generic NVMe commands */
-  fd = seastar::file_desc::open(in_path, (int)mode);
-  identify_controller_data_t controller_data = {0, };
-  return identify_controller(controller_data).safe_then(
-      [this, controller_data, in_path, mode]() {
-      protocol_version = controller_data.version;
-      logger().debug("nvme protocol {}.{} {}",
-          (uint32_t)protocol_version.major_ver,
-          (uint32_t)protocol_version.minor_ver,
-          (uint32_t)protocol_version.tertiary_ver);
-
-      /*
-       * Multi Stream Write
-       *
-       * When NVMe SSD supports multi stream functionality, it marks oacs bit of
-       * identify_controller_data structure (from NVMe Specification 1.4).
-       * If oacs field is true, NormalNBD class opens device file multiple times
-       * with different stream IDs. When user calls write() with stream argument,
-       * NormalNBD finds pre-opened FD with stream ID and submit write IO to the
-       * found FD.
-       */
-      support_multistream = controller_data.oacs.support_directives;
-      if (support_multistream) {
-        write_life_max = 6;
-      }
-
-      open_for_io(in_path, mode);
-
-      /* PWG and PWA are supported from NVMe 1.4 */
-      if (protocol_version.major_ver >= 1 && protocol_version.minor_ver >= 4) {
-        identify_namespace_data_t namespace_data = {0, };
-        identify_namespace(namespace_data).safe_then([this, namespace_data]() {
-            /* Revise 0-based value */
-            write_granularity = namespace_data.npwg + 1;
-            write_alignment = namespace_data.npwa + 1;
-            });
-      }
-      return seastar::now();
-      }).handle_error(
-        /* If device does not support ioctl, just open without stream */
-        crimson::ct_error::input_output_error::handle([this, in_path, mode](auto) {
-          open_for_io(in_path, mode);
-          return seastar::now();
-        }));
-}
-
-void
-NormalNBD::open_for_io(const std::string &in_path, seastar::open_flags mode)
-{
-  ctx.resize(write_life_max);
-  for (uint32_t i = 0; i < write_life_max; i++) {
-    stream_fd.push_back(seastar::file_desc::open(in_path, (int)mode));
-    if (i != write_life_not_set) {
-      int posix_fd = stream_fd[i].get();
-      fcntl(posix_fd, F_SET_FILE_RW_HINT, &i);
-    }
-
-    io_setup(MAX_EVENTS, &ctx[i]);
-  }
-  completion_poller = std::thread(poll_completion, &ctx, &exit);
+open_ertr::future<> NormalNBD::open(
+  const std::string &in_path,
+  seastar::open_flags mode) {
+  return seastar::do_with(in_path, [this, mode](auto& in_path) {
+    return seastar::file_stat(in_path).then([this, mode, in_path](auto stat) {
+      this->block_size = stat.block_size;
+      this->size = stat.size;
+      return seastar::open_file_dma(in_path, mode).then([=](auto file) {
+        this->device = file;
+        logger().debug("open");
+        return seastar::now();
+      });
+    }).handle_exception([](auto e) -> open_ertr::future<> {
+      logger().error("open: got error{}", e);
+      return crimson::ct_error::input_output_error::make();
+    });
+  });
 }
 
-write_ertr::future<>
-NormalNBD::write(
+write_ertr::future<> NormalNBD::write(
   uint64_t offset,
   bufferptr &bptr,
   uint16_t stream) {
   logger().debug(
-      "block: do_write offset {} len {}",
+      "block: write offset {} len {}",
       offset,
       bptr.length());
-  io_context_t io_context = io_context_t();
-  io_prep_pwrite(
-      &io_context.cb,
-      stream_fd[stream].get(),
-      bptr.c_str(),
-      bptr.length(),
-      offset);
-  iocb* cb_ptr[1] = {&io_context.cb};
-  io_submit(ctx[stream], 1, cb_ptr);
-  return seastar::do_with(std::move(io_context), [] (auto& io_context) {
-    /*
-     * libaio needs additional poller thread (see poll_completion) to poll IO
-     * completion. When the poller catches a completion, it marks "done" field
-     * of corresponding io_context.
-     */
-    if (io_context.done) {
-      return seastar::now();
-    }
-      return seastar::later();
+  auto length = bptr.length();
+
+  assert((length % block_size) == 0);
+
+  return device.dma_write(offset, bptr.c_str(), length).handle_exception(
+    [length](auto e) -> write_ertr::future<size_t> {
+      logger().error("write: dma_write got error{}", e);
+      return crimson::ct_error::input_output_error::make();
+    }).then([=](auto result) -> write_ertr::future<> {
+      if (result != length) {
+        logger().error("write: dma_write got error with not proper length");
+        return crimson::ct_error::input_output_error::make();
+      }
+      return write_ertr::now();
     });
 }
 
-read_ertr::future<>
-NormalNBD::read(
+read_ertr::future<> NormalNBD::read(
   uint64_t offset,
   bufferptr &bptr) {
   logger().debug(
-      "block: do_read offset {} len {}",
+      "block: read offset {} len {}",
       offset,
       bptr.length());
-  io_context_t io_context = io_context_t();
-  io_prep_pread(
-      &io_context.cb,
-      stream_fd[0].get(),
-      bptr.c_str(),
-      bptr.length(),
-      offset);
-  iocb* cb_ptr[1] = {&io_context.cb};
-  io_submit(ctx[0], 1, cb_ptr);
-  return seastar::do_with(std::move(io_context), [] (auto& io_context) {
-      if (io_context.done) {
-      return seastar::now();
-      }
-      return seastar::later();
-      });
-}
-
-seastar::future<>
-NormalNBD::close() {
-  logger().debug(" close ");
-  exit = true;
-  completion_poller.join();
-  fd.close();
-  return seastar::now();
-}
-
-nvme_command_ertr::future<>
-NormalNBD::pass_through_io(NVMePassThroughCommand& command) {
-  logger().debug("block: pass through");
-  int ret = fd.ioctl(NVME_IOCTL_IO_CMD, command);
-  if (ret < 0) {
-    logger().debug("block: pass through failed");
-    return crimson::ct_error::input_output_error::make();
-  }
-  else {
-    return nvme_command_ertr::now();
-  }
-}
+  auto length = bptr.length();
 
-nvme_command_ertr::future<>
-NormalNBD::identify_namespace(identify_namespace_data_t& namespace_data) {
-  nvme_admin_command_t command = {0,};
-  command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
-  command.identify_cmd.cns = nvme_identify_command_t::CNS_NAMESPACE;
-  command.common_cmd.addr = (uint64_t)&namespace_data;
-  command.common_cmd.data_len = sizeof(identify_namespace_data_t);
+  assert((length % block_size) == 0);
 
-  return pass_admin(command);
-}
-
-nvme_command_ertr::future<>
-NormalNBD::identify_controller(identify_controller_data_t& controller_data) {
-  nvme_admin_command_t command = {0,};
-  command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
-  command.identify_cmd.cns = nvme_identify_command_t::CNS_CONTROLLER;
-  command.common_cmd.addr = (uint64_t)&controller_data;
-  command.common_cmd.data_len = sizeof(identify_controller_data_t);
-
-  return pass_admin(command);
-}
-
-nvme_command_ertr::future<>
-NormalNBD::pass_admin(nvme_admin_command_t& command) {
-  logger().debug("block: pass admin");
-  try {
-    int ret = fd.ioctl(NVME_IOCTL_ADMIN_CMD, command);
-    if (ret < 0) {
-      logger().debug("block: pass admin failed");
+  return device.dma_read(offset, bptr.c_str(), length).handle_exception(
+    [length](auto e) -> read_ertr::future<size_t> {
+      logger().error("read: dma_read got error{}", e);
       return crimson::ct_error::input_output_error::make();
-    }
-    else {
-      return nvme_command_ertr::now();
-    }
-  }
-  catch (...) {
-    logger().debug("block: pass admin failed");
-    return crimson::ct_error::input_output_error::make();
-  }
+    }).then([=](auto result) -> read_ertr::future<> {
+      if (result != length) {
+        logger().error("read: dma_read got error with not proper length");
+        return crimson::ct_error::input_output_error::make();
+      }
+      return read_ertr::now();
+    });
 }
 
+seastar::future<> NormalNBD::close() {
+  logger().debug(" close ");
+  return device.close();
+}
 }
index c31fd0d844ce045bf0cee57927a8b9845db48167..3bfaab2aeae67a27dc2ac8f2498a85dba8d2582f 100644 (file)
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <memory>
 #include <vector>
 #include <boost/intrusive_ptr.hpp>
 #include <boost/smart_ptr/intrusive_ref_counter.hpp>
@@ -143,7 +144,7 @@ struct io_context_t {
 class NVMeBlockDevice {
 protected:
   uint64_t size = 0;
-  uint64_t block_size = 0;
+  uint64_t block_size = 4096;
 
   uint64_t write_granularity = 4096;
   uint64_t write_alignment = 4096;
@@ -172,6 +173,7 @@ public:
    */
   uint64_t get_size() const { return size; }
   uint64_t get_block_size() const { return block_size; }
+
   uint64_t get_preffered_write_granularity() const { return write_granularity; }
   uint64_t get_preffered_write_alignment() const { return write_alignment; }
 
@@ -255,7 +257,7 @@ public:
 class NormalNBD : public NVMeBlockDevice {
 public:
   NormalNBD() {}
-  ~NormalNBD() override {}
+  ~NormalNBD() = default;
 
   open_ertr::future<> open(
     const std::string &in_path,
@@ -270,31 +272,13 @@ public:
     uint64_t offset,
     bufferptr &bptr) override;
 
-  nvme_command_ertr::future<> pass_through_io(
-    NVMePassThroughCommand& command) override;
-
-  nvme_command_ertr::future<> pass_admin(
-    nvme_admin_command_t& command) override;
-
   seastar::future<> close() override;
 
+  // TODO Servicing NVMe features (multi-stream, protected write etc..) should
+  // be followed by upstreaming ioctl to seastar.
+
 private:
-  seastar::file_desc fd = seastar::file_desc::from_fd(-1);
-  std::vector<seastar::file_desc> stream_fd;
-  nvme_version_t protocol_version;
-  bool support_multistream = false;
-  std::vector<::io_context_t> ctx;
-  std::thread completion_poller;
-  bool exit = false;
-
-  uint32_t write_life_not_set = 0;
-  uint32_t write_life_max = 1;
-
-  nvme_command_ertr::future<> identify_controller(
-    identify_controller_data_t& controller_data);
-  nvme_command_ertr::future<> identify_namespace(
-    identify_namespace_data_t& namespace_data);
-  void open_for_io(const std::string& in_path, seastar::open_flags mode);
+  seastar::file device;
 };
 
 
@@ -309,7 +293,9 @@ public:
     }
   }
 
-  open_ertr::future<> open(const std::string &in_path, seastar::open_flags mode) override;
+  open_ertr::future<> open(
+    const std::string &in_path,
+    seastar::open_flags mode) override;
 
   write_ertr::future<> write(
     uint64_t offset,
index e43d9d7591dacd194d66baa7952df46b10b25463..1d79a01ec0cf3deb08063b3f91911b534d1fbfd7 100644 (file)
@@ -18,7 +18,7 @@ struct nvdev_test_t : seastar_test_suite_t {
   static const uint64_t DEV_SIZE = 1024 * 1024 * 1024;
 
   nvdev_test_t() :
-    device(NVMeBlockDevice::create<NormalNBD>()),
+    device(nullptr),
     dev_path("randomblock_manager.test_nvmedevice" + stringify(getpid())) {
     int fd = ::open(dev_path.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644);
     ceph_assert(fd >= 0);
@@ -27,11 +27,11 @@ struct nvdev_test_t : seastar_test_suite_t {
   }
   ~nvdev_test_t() {
     ::unlink(dev_path.c_str());
-    delete device;
   }
 };
 
-static const uint64_t BUF_SIZE = 8192;
+static const uint64_t BUF_SIZE = 1024;
+static const uint64_t BLK_SIZE = 4096;
 
 struct nvdev_test_block_t {
   uint8_t data[BUF_SIZE];
@@ -53,6 +53,7 @@ WRITE_CLASS_DENC_BOUNDED(
 TEST_F(nvdev_test_t, write_and_verify_test)
 {
   run_async([this] {
+    device = NVMeBlockDevice::create<PosixNVMeDevice>();
     device->open(dev_path, seastar::open_flags::rw).unsafe_get();
     nvdev_test_block_t original_data;
     std::minstd_rand0 generator;
@@ -63,14 +64,14 @@ TEST_F(nvdev_test_t, write_and_verify_test)
       bufferlist bl;
       encode(original_data, bl);
       bl_length = bl.length();
-      auto write_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length));
-      bl.begin().copy(bl.length(), write_buf.c_str());
+      auto write_buf = ceph::bufferptr(buffer::create_page_aligned(BLK_SIZE));
+      bl.begin().copy(bl_length, write_buf.c_str());
       device->write(0, write_buf).unsafe_get();
     }
 
     nvdev_test_block_t read_data;
     {
-      auto read_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length));
+      auto read_buf = ceph::bufferptr(buffer::create_page_aligned(BLK_SIZE));
       device->read(0, read_buf).unsafe_get();
       bufferlist bl;
       bl.push_back(read_buf);
@@ -81,5 +82,7 @@ TEST_F(nvdev_test_t, write_and_verify_test)
     int ret = memcmp(original_data.data, read_data.data, BUF_SIZE);
     device->close().wait();
     ASSERT_TRUE(ret == 0);
+    device.reset(nullptr);
   });
 }
+