crimson/os/seastore: add nvme_read and nvme_write to be used when end to end data...
author     myoungwon oh <ohmyoungwon@gmail.com>
           Tue, 28 May 2024 10:35:45 +0000 (10:35 +0000)
committer  myoungwon oh <ohmyoungwon@gmail.com>
           Mon, 22 Jul 2024 02:02:46 +0000 (02:02 +0000)
Signed-off-by: Myoungwon Oh <myoungwon.oh@samsung.com>
src/crimson/os/seastore/random_block_manager.h
src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
src/crimson/os/seastore/random_block_manager/nvme_block_device.cc
src/crimson/os/seastore/random_block_manager/nvme_block_device.h

index fca76c313966e693d974386a8e832bc73a809164..5e8a4d7c553450ddc8b343bf71e7567a7d105b21 100644 (file)
@@ -51,6 +51,8 @@ struct rbm_superblock_t {
   checksum_t crc = 0;
   device_config_t config;
   unsigned int shard_num = 0;
+  // Must be assigned if the end-to-end data protection feature is enabled
+  uint32_t nvme_block_size = 0;
   std::vector<rbm_shard_info_t> shard_infos;
 
   DENC(rbm_superblock_t, v, p) {
@@ -63,6 +65,7 @@ struct rbm_superblock_t {
     denc(v.crc, p);
     denc(v.config, p);
     denc(v.shard_num, p);
+    denc(v.nvme_block_size, p);
     denc(v.shard_infos, p);
     DENC_FINISH(p);
   }
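
For illustration, nvme_block_size is meant to hold the device's LBA data size once an end-to-end protection capable LBA format is selected (see try_enable_end_to_end_protection() further down). A minimal standalone sketch, assuming the usual power-of-two LBADS encoding from the identify-namespace data; the variable names and values here are illustrative, not taken from the commit:

#include <cstdint>
#include <cstdio>

int main() {
  // lbads encodes the LBA data size as a power-of-two exponent,
  // e.g. lbaf[i].lbads == 12 for a 4 KiB format.
  uint8_t lbads = 12;
  uint32_t nvme_block_size = 1u << lbads;   // 4096 bytes per logical block
  std::printf("nvme_block_size=%u\n", nvme_block_size);
  return 0;
}
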
index afe1128bc925775911cfe949f26a0d320c8a068f..2b303fbc4d9026472f116c86f5cca95103bbcd6b 100644 (file)
@@ -210,7 +210,8 @@ std::ostream &operator<<(std::ostream &out, const rbm_superblock_t &header)
        << ", crc=" << header.crc
        << ", config=" << header.config
        << ", shard_num=" << header.shard_num
-       << ", end_to_end_data_protection=" << header.is_end_to_end_data_protection();
+       << ", end_to_end_data_protection=" << header.is_end_to_end_data_protection()
+       << ", device_block_size=" << header.nvme_block_size;
   for (auto p : header.shard_infos) {
     out << p;
   }
index f0a23cb4077d2843d009e0b05359326ca42d92a8..2576ee7466b9b8adcde9176610b96794d1600ffb 100644 (file)
@@ -133,6 +133,13 @@ write_ertr::future<> NVMeBlockDevice::write(
   if (stream >= stream_id_count) {
     supported_stream = WRITE_LIFE_NOT_SET;
   }
+  if (is_end_to_end_data_protection()) {
+    return seastar::do_with(
+      std::move(bptr),
+      [this, offset] (auto &bptr) {
+      return nvme_write(offset, bptr.length(), bptr.c_str());
+    });
+  }
   return seastar::do_with(
     std::move(bptr),
     [this, offset, length, supported_stream] (auto& bptr) {
@@ -159,9 +166,15 @@ read_ertr::future<> NVMeBlockDevice::read(
       offset,
       bptr.length());
   auto length = bptr.length();
-
+  if (length == 0) {
+    return read_ertr::now();
+  }
   assert((length % super.block_size) == 0);
 
+  if (is_end_to_end_data_protection()) {
+    return nvme_read(offset, length, bptr.c_str());
+  }
+
   return device.dma_read(offset, bptr.c_str(), length).handle_exception(
     [](auto e) -> read_ertr::future<size_t> {
       logger().error("read: dma_read got error{}", e);
@@ -188,6 +201,13 @@ write_ertr::future<> NVMeBlockDevice::writev(
   if (stream >= stream_id_count) {
     supported_stream = WRITE_LIFE_NOT_SET;
   }
+  if (is_end_to_end_data_protection()) {
+    return seastar::do_with(
+      std::move(bl),
+      [this, offset] (auto &bl) {
+      return nvme_write(offset, bl.length(), bl.c_str());
+    });
+  }
   bl.rebuild_aligned(super.block_size);
 
   return seastar::do_with(
@@ -256,6 +276,7 @@ discard_ertr::future<> NVMeBlockDevice::discard(uint64_t offset, uint64_t len) {
 nvme_command_ertr::future<nvme_identify_namespace_data_t>
 NVMeBlockDevice::identify_namespace(seastar::file f) {
   return get_nsid(f).safe_then([this, f](auto nsid) {
+    namespace_id = nsid;
     return seastar::do_with(
       nvme_admin_command_t(),
       nvme_identify_namespace_data_t(),
@@ -314,6 +335,7 @@ nvme_command_ertr::future<> NVMeBlockDevice::try_enable_end_to_end_protection()
       if (id_namespace_data.lbaf[i].ms ==
          nvme_identify_namespace_data_t::METASIZE_FOR_CHECKSUM_OFFLOAD) {
        lba_format_index = i;
+       super.nvme_block_size = (1 << id_namespace_data.lbaf[i].lbads);
        break;
       }
     }
@@ -362,4 +384,60 @@ nvme_command_ertr::future<> NVMeBlockDevice::initialize_nvme_features() {
   return nvme_command_ertr::now();
 }
 
+write_ertr::future<> NVMeBlockDevice::nvme_write(
+  uint64_t offset, size_t len, void *buffer_ptr) {
+  return seastar::do_with(
+    nvme_io_command_t(),
+    [this, offset, len, buffer_ptr] (auto &cmd) {
+    cmd.common.opcode = nvme_io_command_t::OPCODE_WRITE;
+    cmd.common.nsid = namespace_id;
+    cmd.common.data_len = len;
+    // To perform checksum offload, we need to set PRACT to 1 and PRCHK to 4
+    // according to the NVMe spec.
+    cmd.rw.prinfo_pract = nvme_rw_command_t::PROTECT_INFORMATION_ACTION_ENABLE;
+    cmd.rw.prinfo_prchk = nvme_rw_command_t::PROTECT_INFORMATION_CHECK_GUARD;
+    cmd.common.addr = (__u64)(uintptr_t)buffer_ptr;
+    ceph_assert(super.nvme_block_size > 0);
+    auto lba_shift = ffsll(super.nvme_block_size) - 1;
+    cmd.rw.s_lba = offset >> lba_shift;
+    cmd.rw.nlb = (len >> lba_shift) - 1;
+    return pass_through_io(cmd
+    ).safe_then([] (auto ret) {
+      if (ret != 0) {
+       logger().error(
+         "write nvm command with checksum offload failed: {}", ret);
+       ceph_abort();
+      }
+      return nvme_command_ertr::now();
+    });
+  });
+}
+
+read_ertr::future<> NVMeBlockDevice::nvme_read(
+  uint64_t offset, size_t len, void *buffer_ptr) {
+  return seastar::do_with(
+    nvme_io_command_t(),
+    [this, offset, len, buffer_ptr] (auto &cmd) {
+    cmd.common.opcode = nvme_io_command_t::OPCODE_READ;
+    cmd.common.nsid = namespace_id;
+    cmd.common.data_len = len;
+    cmd.rw.prinfo_pract = nvme_rw_command_t::PROTECT_INFORMATION_ACTION_ENABLE;
+    cmd.rw.prinfo_prchk = nvme_rw_command_t::PROTECT_INFORMATION_CHECK_GUARD;
+    cmd.common.addr = (__u64)(uintptr_t)buffer_ptr;
+    ceph_assert(super.nvme_block_size > 0);
+    auto lba_shift = ffsll(super.nvme_block_size) - 1;
+    cmd.rw.s_lba = offset >> lba_shift;
+    cmd.rw.nlb = (len >> lba_shift) - 1;
+    return pass_through_io(cmd
+    ).safe_then([] (auto ret) {
+      if (ret != 0) {
+       logger().error(
+         "read nvm command with checksum offload failed: {}", ret);
+       ceph_abort();
+      }
+      return nvme_command_ertr::now();
+    });
+  });
+}
+
 }
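
A minimal standalone sketch of the LBA arithmetic used in nvme_read()/nvme_write() above, assuming nvme_block_size is a power of two (e.g. 4096); the concrete offset and length are illustrative only:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <strings.h>   // ffsll()

int main() {
  uint32_t nvme_block_size = 4096;             // would come from super.nvme_block_size
  uint64_t offset = 8192;                      // byte offset into the device
  size_t len = 8192;                           // transfer length in bytes

  assert(nvme_block_size > 0);
  int lba_shift = ffsll(nvme_block_size) - 1;  // 4096 -> 12
  uint64_t s_lba = offset >> lba_shift;        // starting LBA: 8192 >> 12 = 2
  uint32_t nlb = (len >> lba_shift) - 1;       // NLB is zero-based: 2 blocks -> 1

  std::printf("lba_shift=%d s_lba=%llu nlb=%u\n",
              lba_shift, (unsigned long long)s_lba, nlb);
  return 0;
}
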
index 1a9d6297efb05c62fe54221ae2faec83f210c977..01755d047a0c0cafc566485ec15039edb4de256d 100644 (file)
@@ -170,6 +170,11 @@ struct nvme_rw_command_t {
   uint32_t dspec : 16;
 
   static const uint32_t DTYPE_STREAM = 1;
+
+  static const uint8_t PROTECT_INFORMATION_ACTION_ENABLE = 1;
+  static const uint8_t PROTECT_INFORMATION_CHECK_GUARD = 4;
+  static const uint8_t PROTECT_INFORMATION_CHECK_APPLICATION_TAG = 2;
+  static const uint8_t PROTECT_INFORMATION_CHECK_LOGICAL_REFERENCE_TAG = 1;
 };
 
 struct nvme_io_command_t {
@@ -178,7 +183,7 @@ struct nvme_io_command_t {
     nvme_rw_command_t rw;
   };
   static const uint8_t OPCODE_WRITE = 0x01;
-  static const uint8_t OPCODE_READ = 0x01;
+  static const uint8_t OPCODE_READ = 0x02;
 };
 
 /*
@@ -224,6 +229,9 @@ public:
     uint64_t offset,
     bufferptr &bptr) final;
 
+  read_ertr::future<> nvme_read(
+    uint64_t offset, size_t len, void *buffer_ptr);
+
   close_ertr::future<> close() override;
 
   discard_ertr::future<> discard(
@@ -241,6 +249,9 @@ public:
     ceph::bufferlist bl,
     uint16_t stream = 0) final;
 
+  write_ertr::future<> nvme_write(
+    uint64_t offset, size_t len, void *buffer_ptr);
+
   stat_device_ret stat_device() final {
     return seastar::file_stat(device_path, seastar::follow_symlink::yes
     ).handle_exception([](auto e) -> stat_device_ret {
@@ -376,6 +387,7 @@ private:
   uint64_t write_alignment = 4096;
   uint32_t atomic_write_unit = 4096;
 
+  int namespace_id; // TODO: support multiple namespaces
   std::string device_path;
   seastar::sharded<NVMeBlockDevice> shard_devices;
 };
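
Regarding the new protection-information constants in nvme_block_device.h: read against the NVMe base specification's PRINFO encoding (an assumption, not stated in the commit), PRACT=1 asks the controller to generate/strip the protection information, while the PRCHK values 4/2/1 select the guard, application tag, and reference tag checks and can be OR-ed together. A small sketch with locally re-declared constants rather than the real nvme_rw_command_t:

#include <cstdint>
#include <cstdio>

int main() {
  // Local stand-ins for the header's constants.
  const uint8_t PRACT_ENABLE  = 1;  // PROTECT_INFORMATION_ACTION_ENABLE
  const uint8_t PRCHK_GUARD   = 4;  // PROTECT_INFORMATION_CHECK_GUARD
  const uint8_t PRCHK_APP_TAG = 2;  // PROTECT_INFORMATION_CHECK_APPLICATION_TAG
  const uint8_t PRCHK_REF_TAG = 1;  // PROTECT_INFORMATION_CHECK_LOGICAL_REFERENCE_TAG

  // The commit sets PRACT=1 and PRCHK=PRCHK_GUARD only (guard CRC check).
  uint8_t prinfo_pract = PRACT_ENABLE;
  uint8_t prinfo_prchk = PRCHK_GUARD;

  // If application/reference tag checks were also wanted, the values
  // could be combined, e.g.:
  uint8_t prchk_all = PRCHK_GUARD | PRCHK_APP_TAG | PRCHK_REF_TAG;  // == 7

  std::printf("pract=%u prchk=%u prchk_all=%u\n",
              prinfo_pract, prinfo_prchk, prchk_all);
  return 0;
}
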