]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore/rbm: turn on end-to-end-data-protection during mkfs if possible
author myoungwon oh <ohmyoungwon@gmail.com>
Mon, 27 May 2024 09:32:56 +0000 (09:32 +0000)
committer myoungwon oh <ohmyoungwon@gmail.com>
Mon, 22 Jul 2024 02:02:33 +0000 (02:02 +0000)
Signed-off-by: Myoungwon Oh <myoungwon.oh@samsung.com>
src/common/options/crimson.yaml.in
src/crimson/os/seastore/device.h
src/crimson/os/seastore/random_block_manager.h
src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
src/crimson/os/seastore/random_block_manager/nvme_block_device.cc
src/crimson/os/seastore/random_block_manager/nvme_block_device.h
src/crimson/os/seastore/random_block_manager/rbm_device.cc
src/crimson/os/seastore/random_block_manager/rbm_device.h

index 73f4fdd6b2d6e1e7676e8c08711f3b47f7af50ce..36b7f8bc1e335bf1034ef6cb60928c8c7283dccc 100644 (file)
@@ -154,3 +154,10 @@ options:
   level: dev
   desc: overwrite the existing data block based on delta if the overwrite size is equal to or less than the value, otherwise do overwrite based on remapping, set to 0 to enforce the remap-based overwrite.
   default: 0
+- name: seastore_disable_end_to_end_data_protection 
+  type: bool
+  level: dev
+  desc: When false, mkfs probes whether the NVMe device supports an internal
+        checksum feature that does not use the server CPU and enables it if
+        available. Set to true to disable the feature unconditionally.
+  default: true
index ceb1ede64531f4e1a12ef5f1b6f13e598ef4c347..56d0c889b7b5297f7752aefc8aae2ae9f65d0fd5 100644 (file)
@@ -137,6 +137,10 @@ public:
 
   virtual secondary_device_set_t& get_secondary_devices() = 0;
 
+  /// Reports whether this device runs with NVMe end-to-end data protection.
+  /// The base implementation always answers false; device types that can
+  /// enable the feature override it.
+  virtual bool is_end_to_end_data_protection() const {
+    return false;
+  }
+
   using close_ertr = crimson::errorator<
     crimson::ct_error::input_output_error>;
   virtual close_ertr::future<> close() = 0;
index 449fdeb4ef13e6b141e387511fbfa2efbcf58580..fca76c313966e693d974386a8e832bc73a809164 100644 (file)
@@ -39,6 +39,10 @@ struct rbm_shard_info_t {
   }
 };
 
+// Feature bits kept in rbm_superblock_t::feature. Each enumerator is a bit
+// mask that is OR-ed into / tested against that field.
+enum class rbm_feature_t : uint64_t {
+  // The device was formatted so that it manages checksums by itself
+  // (NVMe end-to-end data protection).
+  RBM_NVME_END_TO_END_PROTECTION = 1,
+};
+
 struct rbm_superblock_t {
   size_t size = 0;
   size_t block_size = 0;
@@ -80,6 +84,13 @@ struct rbm_superblock_t {
                backend_type_t::RANDOM_BLOCK);
     ceph_assert(config.spec.id <= DEVICE_ID_MAX_VALID);
   }
+
+  /// True when the RBM_NVME_END_TO_END_PROTECTION bit is set in `feature`.
+  bool is_end_to_end_data_protection() const {
+    constexpr auto e2e_bit =
+      static_cast<uint64_t>(rbm_feature_t::RBM_NVME_END_TO_END_PROTECTION);
+    return (feature & e2e_bit) != 0;
+  }
+  /// Turns on the RBM_NVME_END_TO_END_PROTECTION bit in `feature`; all other
+  /// feature bits are left as they are.
+  void set_end_to_end_data_protection() {
+    constexpr auto e2e_bit =
+      static_cast<uint64_t>(rbm_feature_t::RBM_NVME_END_TO_END_PROTECTION);
+    feature |= e2e_bit;
+  }
 };
 
 enum class rbm_extent_state_t {
index 7a94c20fc46abee261d2fbada8607bdb91746c0b..afe1128bc925775911cfe949f26a0d320c8a068f 100644 (file)
@@ -209,7 +209,8 @@ std::ostream &operator<<(std::ostream &out, const rbm_superblock_t &header)
        << ", journal_size=" << header.journal_size
        << ", crc=" << header.crc
        << ", config=" << header.config
-       << ", shard_num=" << header.shard_num;
+       << ", shard_num=" << header.shard_num
+       << ", end_to_end_data_protection=" << header.is_end_to_end_data_protection();
   for (auto p : header.shard_infos) {
     out << p;
   }
index 6437f06a484f8676184d72cf46be070947846524..f0a23cb4077d2843d009e0b05359326ca42d92a8 100644 (file)
@@ -50,8 +50,6 @@ open_ertr::future<> NVMeBlockDevice::open(
           return identify_namespace(device).safe_then([this, in_path, mode] (
             auto id_namespace_data) {
             atomic_write_unit = awupf * super.block_size;
-            data_protection_type = id_namespace_data.dps.protection_type;
-            data_protection_enabled = (data_protection_type > 0);
             if (id_namespace_data.nsfeat.opterf == 1){
               // NPWG and NPWA is 0'based value
               write_granularity = super.block_size * (id_namespace_data.npwg + 1);
@@ -94,8 +92,29 @@ NVMeBlockDevice::mount_ret NVMeBlockDevice::mount()
     return local_device.do_shard_mount(
     ).handle_error(
       crimson::ct_error::assert_all{
-        "Invalid error in RBMDevice::do_mount"
+       "Invalid error in NVMeBlockDevice::do_shard_mount"
     });
+  }).then([this] () {
+    if (is_end_to_end_data_protection()) {
+      return identify_namespace(device
+      ).safe_then([] (auto id_namespace_data) {
+       if (id_namespace_data.dps.protection_type !=
+           nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2) {
+         logger().error("seastore was formated with end-to-end-data-protection \
+           but the device being mounted to use seastore does not support \
+           the functionality. Please check the device.");
+         ceph_abort();
+       }
+       if (id_namespace_data.lbaf[id_namespace_data.flbas.lba_index].ms != 
+           nvme_identify_namespace_data_t::METASIZE_FOR_CHECKSUM_OFFLOAD) {
+         logger().error("seastore was formated with end-to-end-data-protection \
+           but the formatted device meta size is wrong. Please check the device.");
+         ceph_abort();
+       }
+       return mount_ertr::now();
+      });
+    }
+    return mount_ertr::now();
   });
 }
 
@@ -267,7 +286,7 @@ nvme_command_ertr::future<int> NVMeBlockDevice::pass_admin(
   nvme_admin_command_t& admin_cmd, seastar::file f) {
   return f.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd).handle_exception(
     [](auto e)->nvme_command_ertr::future<int> {
-      logger().error("pass_admin: ioctl failed");
+      logger().error("pass_admin: ioctl failed {}", e);
       return crimson::ct_error::input_output_error::make();
     });
 }
@@ -277,4 +296,70 @@ nvme_command_ertr::future<int> NVMeBlockDevice::pass_through_io(
   return device.ioctl(NVME_IOCTL_IO_CMD, &io_cmd);
 }
 
+// Try to reformat the namespace with an LBA format that carries
+// METASIZE_FOR_CHECKSUM_OFFLOAD bytes of metadata and protection information
+// type 2, so that the device generates the crc by itself.
+//
+// - If the namespace reports no such LBA format, nothing is changed and mkfs
+//   simply proceeds without the feature.
+// - If the Format NVM command completes with a non-zero status, the process
+//   aborts.
+// - On success the end-to-end-protection feature bit is set in `super`.
+// - An input_output_error raised anywhere in the chain is logged and
+//   swallowed; every other error is passed further.
+nvme_command_ertr::future<> NVMeBlockDevice::try_enable_end_to_end_protection() {
+  return identify_namespace(device
+  ).safe_then([this] (auto id_namespace_data) -> nvme_command_ertr::future<> {
+    // NLBAF is a 0's based value: N means that lbaf[0..N] are valid, so the
+    // scan has to include index nlbaf itself and nlbaf == 0 still describes
+    // one usable format.
+    // The lbaf field of nvme_format_nvm_command_t is only 4 bits wide, so a
+    // format beyond index 15 cannot be selected by the command built below
+    // and must not be picked here (it would be silently truncated).
+    constexpr int max_selectable_lbaf = 15;
+    const int last_lbaf =
+      id_namespace_data.nlbaf < max_selectable_lbaf ?
+      id_namespace_data.nlbaf : max_selectable_lbaf;
+    int lba_format_index = -1;
+    for (int i = 0; i <= last_lbaf; i++) {
+      // TODO: enable other types of end to end data protection
+      // Note that the nvme device will generate crc if the namespace
+      // is formatted with meta size 8
+      // The nvme device can provide other types of data protections.
+      // But, for now, we only consider the checksum offload in the device side.
+      if (id_namespace_data.lbaf[i].ms ==
+          nvme_identify_namespace_data_t::METASIZE_FOR_CHECKSUM_OFFLOAD) {
+        lba_format_index = i;
+        break;
+      }
+    }
+    if (lba_format_index == -1) {
+      // Adjacent literals instead of a backslash continuation, so that the
+      // source indentation does not end up inside the log message.
+      logger().info("the device does not support end to end data protection, "
+                    "mkfs() will be done without this functionality.");
+      return nvme_command_ertr::now();
+    }
+    return get_nsid(device
+    ).safe_then([this, i=lba_format_index](auto nsid) {
+      return seastar::do_with(
+        nvme_admin_command_t(),
+        [this, nsid=nsid, i=i] (auto &cmd) {
+        cmd.common.opcode = nvme_admin_command_t::OPCODE_FORMAT_NVM;
+        cmd.common.nsid = nsid;
+        // TODO: configure other protect information types (2 or 3) see above
+        cmd.format.pi = nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2;
+        cmd.format.lbaf = i;
+        return pass_admin(cmd, device
+        ).safe_then([this](auto ret) {
+          if (ret != 0) {
+            logger().error(
+              "format nvm command to use end-to-end-protection fails : {}", ret);
+            ceph_abort();
+          }
+          // Re-read the namespace data to make sure the format took effect
+          // before recording the feature in the superblock.
+          return identify_namespace(device
+          ).safe_then([this] (auto id_namespace_data) -> nvme_command_ertr::future<> {
+            ceph_assert(id_namespace_data.dps.protection_type ==
+              nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2);
+            super.set_end_to_end_data_protection();
+            return nvme_command_ertr::now();
+          });
+        });
+      });
+    });
+  }).handle_error(crimson::ct_error::input_output_error::handle([]{
+    logger().info("the device does not support identify namespace command");
+    return nvme_command_ertr::now();
+  }), crimson::ct_error::pass_further_all{});
+}
+
+// Optional NVMe feature setup. End-to-end data protection is only attempted
+// when the seastore_disable_end_to_end_data_protection option is false;
+// otherwise this is a no-op.
+nvme_command_ertr::future<> NVMeBlockDevice::initialize_nvme_features() {
+  const bool e2e_disabled = crimson::common::get_conf<bool>(
+    "seastore_disable_end_to_end_data_protection");
+  if (e2e_disabled) {
+    return nvme_command_ertr::now();
+  }
+  return try_enable_end_to_end_protection();
+}
+
 }
index ed8f99be8dc248fc6a8baafd57f713fdc5098c47..1a9d6297efb05c62fe54221ae2faec83f210c977 100644 (file)
@@ -43,13 +43,26 @@ struct nvme_identify_command_t {
   static const uint8_t CNS_CONTROLLER = 0x01;
 };
 
+// View of an admin command used to fill in the Format NVM specific bits.
+// It is one member of the union in nvme_admin_command_t.
+struct nvme_format_nvm_command_t {
+  // Skips the first 10 dwords (40 bytes) so that the bit-fields below start
+  // at byte offset 40 of the command.
+  // NOTE(review): presumably this lines up with cdw10 of nvme_passthru_cmd,
+  // which shares the union in nvme_admin_command_t -- confirm.
+  uint32_t common_dw[10];
+
+  uint8_t lbaf : 4;  // index of the LBA format to apply
+  uint8_t mset : 1;  // NOTE(review): presumably "metadata settings" -- confirm
+  uint8_t pi : 3;    // protection information type to enable
+  uint8_t pil : 1;   // NOTE(review): presumably "protection information location" -- confirm
+  
+  // Value for `pi` selecting protection information type 2.
+  static const uint8_t PROTECT_INFORMATION_TYPE_2 = 2;
+};
+
 // An NVMe admin command. The union members are different views of the same
 // command storage: `common` for the generic fields, the others for
 // command-specific fields.
 struct nvme_admin_command_t {
   union {
     nvme_passthru_cmd common;
     nvme_identify_command_t identify;
+    // command-specific view for OPCODE_FORMAT_NVM
+    nvme_format_nvm_command_t format;
   };
 
   static const uint8_t OPCODE_IDENTIFY = 0x06;
+  // opcode placed in common.opcode to issue a Format NVM command
+  static const uint8_t OPCODE_FORMAT_NVM = 0x80;
 };
 
 // Optional Admin Command Support (OACS)
@@ -111,22 +124,32 @@ struct lbaf_t {
   uint32_t reserved : 6;
 };
 
+// One-byte field of the identify-namespace data describing the LBA format
+// the namespace is currently formatted with.
+struct flbas_t {
+  // index into nvme_identify_namespace_data_t::lbaf of the format in use
+  uint8_t lba_index : 4;
+  // NOTE(review): presumably tells whether metadata is transferred at the
+  // end of the data LBA -- confirm against the NVMe specification.
+  uint8_t ms_transferred :1;
+  uint8_t reserved : 3;
+};
+
 // Result buffer of the identify-namespace command. The anonymous struct
 // names the fields that are used; the trailing comments give their byte
 // offsets inside the 4096-byte `raw` buffer.
 struct nvme_identify_namespace_data_t {
   union {
     struct {
       uint8_t unused[24];   // [23:0]
       nsfeat_t nsfeat;      // [24]
-      uint8_t unused2[3];   // [27:25]
+      // NOTE(review): the NVMe specification defines NLBAF as a 0's based
+      // count (N means lbaf[0..N] are valid) -- confirm wherever it is used
+      // as a loop bound.
+      uint8_t nlbaf;        // [25]
+      flbas_t flbas;        // [26]
+      uint8_t unused2;      // [27]
       dpc_t dpc;            // [28]
       dps_t dps;            // [29]
       uint8_t unused3[34];  // [63:30]
       uint16_t npwg;        // [65:64]
       uint16_t npwa;        // [67:66]
       uint8_t unused4[60];  // [127:68]
-      lbaf_t lbaf0;         // [131:128]
+      lbaf_t lbaf[64];      // [383:128]
     };
     uint8_t raw[4096];
   };
+  // Metadata size (lbaf_t::ms) an LBA format must have for the device-level
+  // checksum to be used.
+  static const uint8_t METASIZE_FOR_CHECKSUM_OFFLOAD = 8;
 };
 
 struct nvme_rw_command_t {
@@ -209,6 +232,8 @@ public:
 
   mount_ret mount() final;
 
+  nvme_command_ertr::future<> initialize_nvme_features() final;
+
   mkfs_ret mkfs(device_config_t config) final;
 
   write_ertr::future<> writev(
@@ -231,7 +256,7 @@ public:
          ).safe_then([stat] (auto id_namespace_data) mutable {
            // LBA format provides LBA size which is power of 2. LBA is the
            // minimum size of read and write.
-           stat.block_size = (1 << id_namespace_data.lbaf0.lbads);
+           stat.block_size = (1 << id_namespace_data.lbaf[0].lbads);
            if (stat.block_size < RBM_SUPERBLOCK_SIZE) {
              stat.block_size = RBM_SUPERBLOCK_SIZE;
            } 
@@ -286,7 +311,7 @@ public:
    * protection is enabled, checksum is calculated on every write and used to
    * verify data on every read.
    */
-   bool is_data_protection_enabled() const { return data_protection_enabled; }
+  nvme_command_ertr::future<> try_enable_end_to_end_protection();
 
   /*
    * Data Health
@@ -321,7 +346,6 @@ public:
     nvme_io_command_t& io_cmd);
 
   bool support_multistream = false;
-  uint8_t data_protection_type = 0;
 
   /*
    * Predictable Latency
@@ -352,7 +376,6 @@ private:
   uint64_t write_alignment = 4096;
   uint32_t atomic_write_unit = 4096;
 
-  bool data_protection_enabled = false;
   std::string device_path;
   seastar::sharded<NVMeBlockDevice> shard_devices;
 };
index c1fa6f2b2c7e624aaeb426d1805ffd0a3d5db0b8..f31bafcef9fb7636783c13d0eed02c74e0adf658 100644 (file)
@@ -30,7 +30,6 @@ RBMDevice::mkfs_ret RBMDevice::do_primary_mkfs(device_config_t config,
     [this, FNAME, config=std::move(config), shard_num, journal_size](auto st) {
     super.block_size = st.block_size;
     super.size = st.size;
-    super.feature |= RBM_BITMAP_BLOCK_CRC;
     super.config = std::move(config);
     super.journal_size = journal_size;
     ceph_assert_always(super.journal_size > 0);
@@ -59,13 +58,16 @@ RBMDevice::mkfs_ret RBMDevice::do_primary_mkfs(device_config_t config,
       crimson::ct_error::assert_all{
       "Invalid error open in RBMDevice::do_primary_mkfs"}
     ).safe_then([this] {
-      return write_rbm_superblock(
+      return initialize_nvme_features(
       ).safe_then([this] {
-       return close();
-      }).handle_error(
-       mkfs_ertr::pass_further{},
-       crimson::ct_error::assert_all{
-       "Invalid error write_rbm_superblock in RBMDevice::do_primary_mkfs"
+       return write_rbm_superblock(
+       ).safe_then([this] {
+         return close();
+       }).handle_error(
+         mkfs_ertr::pass_further{},
+         crimson::ct_error::assert_all{
+         "Invalid error write_rbm_superblock in RBMDevice::do_primary_mkfs"
+       });
       });
     });
   });
@@ -79,7 +81,7 @@ write_ertr::future<> RBMDevice::write_rbm_superblock()
   // If NVMeDevice supports data protection, CRC for checksum is not required
   // NVMeDevice is expected to generate and store checksum internally.
   // CPU overhead for CRC might be saved.
-  if (is_data_protection_enabled()) {
+  if (is_end_to_end_data_protection()) {
     super.crc = -1;
   } else {
     super.crc = meta_b_header.crc32c(-1);
@@ -127,7 +129,7 @@ read_ertr::future<rbm_superblock_t> RBMDevice::read_rbm_superblock(
          super_block.block_size);
 
       // Do CRC verification only if data protection is not supported.
-      if (is_data_protection_enabled() == false) {
+      if (super_block.is_end_to_end_data_protection() == false) {
        if (meta_b_header.crc32c(-1) != crc) {
          DEBUG("bad crc on super block, expected {} != actual {} ",
                meta_b_header.crc32c(-1), crc);
index 28c20c6e72dbae4d3c15acd43d9db01ff1ecf493..b74e6b1439567d07468d98fd361757f89092636e 100644 (file)
@@ -66,11 +66,6 @@ using discard_ertr = crimson::errorator<
   crimson::ct_error::input_output_error>;
 
 constexpr uint32_t RBM_SUPERBLOCK_SIZE = 4096;
-enum {
-  // TODO: This allows the device to manage crc on a block by itself
-  RBM_NVME_END_TO_END_PROTECTION = 1,
-  RBM_BITMAP_BLOCK_CRC = 2,
-};
 
 class RBMDevice : public Device {
 public:
@@ -149,7 +144,13 @@ public:
     ceph::bufferlist bl,
     uint16_t stream = 0) = 0;
 
-  bool is_data_protection_enabled() const { return false; }
+  // Answers from the feature bits recorded in the superblock (`super`).
+  bool is_end_to_end_data_protection() const final {
+    return super.is_end_to_end_data_protection();
+  }
+
+  // Hook invoked by do_primary_mkfs() before the superblock is written.
+  // The default does nothing and completes immediately; NVMeBlockDevice
+  // overrides it.
+  virtual nvme_command_ertr::future<> initialize_nvme_features() { 
+    return nvme_command_ertr::now(); 
+  }
 
   mkfs_ret do_mkfs(device_config_t);