From 6373b6d253aaa628f47cb74921e34d8cdf6b4473 Mon Sep 17 00:00:00 2001 From: myoungwon oh Date: Mon, 27 May 2024 09:32:56 +0000 Subject: [PATCH] crimson/os/seastore/rbm: turn on end-to-end-data-protection during mkfs if possible Signed-off-by: Myoungwon Oh (cherry picked from commit 71398e2d4fe4fe38bee1ff16db08c8977127273a) --- src/common/options/crimson.yaml.in | 7 ++ src/crimson/os/seastore/device.h | 4 + .../os/seastore/random_block_manager.h | 11 +++ .../random_block_manager/block_rb_manager.cc | 3 +- .../random_block_manager/nvme_block_device.cc | 93 ++++++++++++++++++- .../random_block_manager/nvme_block_device.h | 35 +++++-- .../random_block_manager/rbm_device.cc | 20 ++-- .../random_block_manager/rbm_device.h | 13 +-- 8 files changed, 160 insertions(+), 26 deletions(-) diff --git a/src/common/options/crimson.yaml.in b/src/common/options/crimson.yaml.in index 73f4fdd6b2d..36b7f8bc1e3 100644 --- a/src/common/options/crimson.yaml.in +++ b/src/common/options/crimson.yaml.in @@ -154,3 +154,10 @@ options: level: dev desc: overwrite the existing data block based on delta if the overwrite size is equal to or less than the value, otherwise do overwrite based on remapping, set to 0 to enforce the remap-based overwrite. default: 0 +- name: seastore_disable_end_to_end_data_protection + type: bool + level: dev + desc: When false, upon mkfs, try to discover whether the nvme device supports + internal checksum feature without using server CPU then enable if available, + set to true to disable unconditionally. 
+ default: true diff --git a/src/crimson/os/seastore/device.h b/src/crimson/os/seastore/device.h index ceb1ede6453..56d0c889b7b 100644 --- a/src/crimson/os/seastore/device.h +++ b/src/crimson/os/seastore/device.h @@ -137,6 +137,10 @@ public: virtual secondary_device_set_t& get_secondary_devices() = 0; + virtual bool is_end_to_end_data_protection() const { + return false; + } + using close_ertr = crimson::errorator< crimson::ct_error::input_output_error>; virtual close_ertr::future<> close() = 0; diff --git a/src/crimson/os/seastore/random_block_manager.h b/src/crimson/os/seastore/random_block_manager.h index 449fdeb4ef1..fca76c31396 100644 --- a/src/crimson/os/seastore/random_block_manager.h +++ b/src/crimson/os/seastore/random_block_manager.h @@ -39,6 +39,10 @@ struct rbm_shard_info_t { } }; +enum class rbm_feature_t : uint64_t { + RBM_NVME_END_TO_END_PROTECTION = 1, +}; + struct rbm_superblock_t { size_t size = 0; size_t block_size = 0; @@ -80,6 +84,13 @@ struct rbm_superblock_t { backend_type_t::RANDOM_BLOCK); ceph_assert(config.spec.id <= DEVICE_ID_MAX_VALID); } + + bool is_end_to_end_data_protection() const { + return (feature & (uint64_t)rbm_feature_t::RBM_NVME_END_TO_END_PROTECTION); + } + void set_end_to_end_data_protection() { + feature |= (uint64_t)rbm_feature_t::RBM_NVME_END_TO_END_PROTECTION; + } }; enum class rbm_extent_state_t { diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc index 7a94c20fc46..afe1128bc92 100644 --- a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc +++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc @@ -209,7 +209,8 @@ std::ostream &operator<<(std::ostream &out, const rbm_superblock_t &header) << ", journal_size=" << header.journal_size << ", crc=" << header.crc << ", config=" << header.config - << ", shard_num=" << header.shard_num; + << ", shard_num=" << header.shard_num + << ", 
end_to_end_data_protection=" << header.is_end_to_end_data_protection(); for (auto p : header.shard_infos) { out << p; } diff --git a/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc b/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc index 6437f06a484..f0a23cb4077 100644 --- a/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc +++ b/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc @@ -50,8 +50,6 @@ open_ertr::future<> NVMeBlockDevice::open( return identify_namespace(device).safe_then([this, in_path, mode] ( auto id_namespace_data) { atomic_write_unit = awupf * super.block_size; - data_protection_type = id_namespace_data.dps.protection_type; - data_protection_enabled = (data_protection_type > 0); if (id_namespace_data.nsfeat.opterf == 1){ // NPWG and NPWA is 0'based value write_granularity = super.block_size * (id_namespace_data.npwg + 1); @@ -94,8 +92,29 @@ NVMeBlockDevice::mount_ret NVMeBlockDevice::mount() return local_device.do_shard_mount( ).handle_error( crimson::ct_error::assert_all{ - "Invalid error in RBMDevice::do_mount" + "Invalid error in NVMeBlockDevice::do_shard_mount" }); + }).then([this] () { + if (is_end_to_end_data_protection()) { + return identify_namespace(device + ).safe_then([] (auto id_namespace_data) { + if (id_namespace_data.dps.protection_type != + nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2) { + logger().error("seastore was formated with end-to-end-data-protection \ + but the device being mounted to use seastore does not support \ + the functionality. Please check the device."); + ceph_abort(); + } + if (id_namespace_data.lbaf[id_namespace_data.flbas.lba_index].ms != + nvme_identify_namespace_data_t::METASIZE_FOR_CHECKSUM_OFFLOAD) { + logger().error("seastore was formated with end-to-end-data-protection \ + but the formatted device meta size is wrong. 
Please check the device."); + ceph_abort(); + } + return mount_ertr::now(); + }); + } + return mount_ertr::now(); }); } @@ -267,7 +286,7 @@ nvme_command_ertr::future NVMeBlockDevice::pass_admin( nvme_admin_command_t& admin_cmd, seastar::file f) { return f.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd).handle_exception( [](auto e)->nvme_command_ertr::future { - logger().error("pass_admin: ioctl failed"); + logger().error("pass_admin: ioctl failed {}", e); return crimson::ct_error::input_output_error::make(); }); } @@ -277,4 +296,70 @@ nvme_command_ertr::future NVMeBlockDevice::pass_through_io( return device.ioctl(NVME_IOCTL_IO_CMD, &io_cmd); } +nvme_command_ertr::future<> NVMeBlockDevice::try_enable_end_to_end_protection() { + return identify_namespace(device + ).safe_then([this] (auto id_namespace_data) -> nvme_command_ertr::future<> { + if (!id_namespace_data.nlbaf) { + logger().info("the device does not support end to end data protection,\ + mkfs() will be done without this functionality."); + return nvme_command_ertr::now(); + } + int lba_format_index = -1; + for (int i = 0; i < id_namespace_data.nlbaf; i++) { + // TODO: enable other types of end to end data protection + // Note that the nvme device will generate crc if the namespace + // is formatted with meta size 8 + // The nvme device can provide other types of data protections. + // But, for now, we only consider the checksum offload in the device side. 
+ if (id_namespace_data.lbaf[i].ms == + nvme_identify_namespace_data_t::METASIZE_FOR_CHECKSUM_OFFLOAD) { + lba_format_index = i; + break; + } + } + if (lba_format_index == -1) { + logger().info("the device does not support end to end data protection,\ + mkfs() will be done without this functionality."); + return nvme_command_ertr::now(); + } + return get_nsid(device + ).safe_then([this, i=lba_format_index](auto nsid) { + return seastar::do_with( + nvme_admin_command_t(), + [this, nsid=nsid, i=i] (auto &cmd) { + cmd.common.opcode = nvme_admin_command_t::OPCODE_FORMAT_NVM; + cmd.common.nsid = nsid; + // TODO: configure other protect information types (2 or 3) see above + cmd.format.pi = nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2; + cmd.format.lbaf = i; + return pass_admin(cmd, device + ).safe_then([this](auto ret) { + if (ret != 0) { + logger().error( + "formt nvm command to use end-to-end-protection fails : {}", ret); + ceph_abort(); + } + return identify_namespace(device + ).safe_then([this] (auto id_namespace_data) -> nvme_command_ertr::future<> { + ceph_assert(id_namespace_data.dps.protection_type == + nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2); + super.set_end_to_end_data_protection(); + return nvme_command_ertr::now(); + }); + }); + }); + }); + }).handle_error(crimson::ct_error::input_output_error::handle([]{ + logger().info("the device does not support identify namespace command"); + return nvme_command_ertr::now(); + }), crimson::ct_error::pass_further_all{}); +} + +nvme_command_ertr::future<> NVMeBlockDevice::initialize_nvme_features() { + if (!crimson::common::get_conf("seastore_disable_end_to_end_data_protection")) { + return try_enable_end_to_end_protection(); + } + return nvme_command_ertr::now(); +} + } diff --git a/src/crimson/os/seastore/random_block_manager/nvme_block_device.h b/src/crimson/os/seastore/random_block_manager/nvme_block_device.h index ed8f99be8dc..1a9d6297efb 100644 --- 
a/src/crimson/os/seastore/random_block_manager/nvme_block_device.h +++ b/src/crimson/os/seastore/random_block_manager/nvme_block_device.h @@ -43,13 +43,26 @@ struct nvme_identify_command_t { static const uint8_t CNS_CONTROLLER = 0x01; }; +struct nvme_format_nvm_command_t { + uint32_t common_dw[10]; + + uint8_t lbaf : 4; + uint8_t mset : 1; + uint8_t pi : 3; + uint8_t pil : 1; + + static const uint8_t PROTECT_INFORMATION_TYPE_2 = 2; +}; + struct nvme_admin_command_t { union { nvme_passthru_cmd common; nvme_identify_command_t identify; + nvme_format_nvm_command_t format; }; static const uint8_t OPCODE_IDENTIFY = 0x06; + static const uint8_t OPCODE_FORMAT_NVM = 0x80; }; // Optional Admin Command Support (OACS) @@ -111,22 +124,32 @@ struct lbaf_t { uint32_t reserved : 6; }; +struct flbas_t { + uint8_t lba_index : 4; + uint8_t ms_transferred :1; + uint8_t reserved : 3; +}; + struct nvme_identify_namespace_data_t { union { struct { uint8_t unused[24]; // [23:0] nsfeat_t nsfeat; // [24] - uint8_t unused2[3]; // [27:25] + uint8_t nlbaf; // [25] + flbas_t flbas; // [26] + uint8_t unused2; // [27] dpc_t dpc; // [28] dps_t dps; // [29] uint8_t unused3[34]; // [63:30] uint16_t npwg; // [65:64] uint16_t npwa; // [67:66] uint8_t unused4[60]; // [127:68] - lbaf_t lbaf0; // [131:128] + lbaf_t lbaf[64]; // [383:128] }; uint8_t raw[4096]; }; + // meta size value to use device-level checksum + static const uint8_t METASIZE_FOR_CHECKSUM_OFFLOAD = 8; }; struct nvme_rw_command_t { @@ -209,6 +232,8 @@ public: mount_ret mount() final; + nvme_command_ertr::future<> initialize_nvme_features() final; + mkfs_ret mkfs(device_config_t config) final; write_ertr::future<> writev( @@ -231,7 +256,7 @@ public: ).safe_then([stat] (auto id_namespace_data) mutable { // LBA format provides LBA size which is power of 2. LBA is the // minimum size of read and write. 
- stat.block_size = (1 << id_namespace_data.lbaf0.lbads); + stat.block_size = (1 << id_namespace_data.lbaf[0].lbads); if (stat.block_size < RBM_SUPERBLOCK_SIZE) { stat.block_size = RBM_SUPERBLOCK_SIZE; } @@ -286,7 +311,7 @@ public: * protection is enabled, checksum is calculated on every write and used to * verify data on every read. */ - bool is_data_protection_enabled() const { return data_protection_enabled; } + nvme_command_ertr::future<> try_enable_end_to_end_protection(); /* * Data Health @@ -321,7 +346,6 @@ public: nvme_io_command_t& io_cmd); bool support_multistream = false; - uint8_t data_protection_type = 0; /* * Predictable Latency @@ -352,7 +376,6 @@ private: uint64_t write_alignment = 4096; uint32_t atomic_write_unit = 4096; - bool data_protection_enabled = false; std::string device_path; seastar::sharded shard_devices; }; diff --git a/src/crimson/os/seastore/random_block_manager/rbm_device.cc b/src/crimson/os/seastore/random_block_manager/rbm_device.cc index c1fa6f2b2c7..f31bafcef9f 100644 --- a/src/crimson/os/seastore/random_block_manager/rbm_device.cc +++ b/src/crimson/os/seastore/random_block_manager/rbm_device.cc @@ -30,7 +30,6 @@ RBMDevice::mkfs_ret RBMDevice::do_primary_mkfs(device_config_t config, [this, FNAME, config=std::move(config), shard_num, journal_size](auto st) { super.block_size = st.block_size; super.size = st.size; - super.feature |= RBM_BITMAP_BLOCK_CRC; super.config = std::move(config); super.journal_size = journal_size; ceph_assert_always(super.journal_size > 0); @@ -59,13 +58,16 @@ RBMDevice::mkfs_ret RBMDevice::do_primary_mkfs(device_config_t config, crimson::ct_error::assert_all{ "Invalid error open in RBMDevice::do_primary_mkfs"} ).safe_then([this] { - return write_rbm_superblock( + return initialize_nvme_features( ).safe_then([this] { - return close(); - }).handle_error( - mkfs_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error write_rbm_superblock in RBMDevice::do_primary_mkfs" + return 
write_rbm_superblock( + ).safe_then([this] { + return close(); + }).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error write_rbm_superblock in RBMDevice::do_primary_mkfs" + }); }); }); }); @@ -79,7 +81,7 @@ write_ertr::future<> RBMDevice::write_rbm_superblock() // If NVMeDevice supports data protection, CRC for checksum is not required // NVMeDevice is expected to generate and store checksum internally. // CPU overhead for CRC might be saved. - if (is_data_protection_enabled()) { + if (is_end_to_end_data_protection()) { super.crc = -1; } else { super.crc = meta_b_header.crc32c(-1); @@ -127,7 +129,7 @@ read_ertr::future RBMDevice::read_rbm_superblock( super_block.block_size); // Do CRC verification only if data protection is not supported. - if (is_data_protection_enabled() == false) { + if (super_block.is_end_to_end_data_protection() == false) { if (meta_b_header.crc32c(-1) != crc) { DEBUG("bad crc on super block, expected {} != actual {} ", meta_b_header.crc32c(-1), crc); diff --git a/src/crimson/os/seastore/random_block_manager/rbm_device.h b/src/crimson/os/seastore/random_block_manager/rbm_device.h index 28c20c6e72d..b74e6b14395 100644 --- a/src/crimson/os/seastore/random_block_manager/rbm_device.h +++ b/src/crimson/os/seastore/random_block_manager/rbm_device.h @@ -66,11 +66,6 @@ using discard_ertr = crimson::errorator< crimson::ct_error::input_output_error>; constexpr uint32_t RBM_SUPERBLOCK_SIZE = 4096; -enum { - // TODO: This allows the device to manage crc on a block by itself - RBM_NVME_END_TO_END_PROTECTION = 1, - RBM_BITMAP_BLOCK_CRC = 2, -}; class RBMDevice : public Device { public: @@ -149,7 +144,13 @@ public: ceph::bufferlist bl, uint16_t stream = 0) = 0; - bool is_data_protection_enabled() const { return false; } + bool is_end_to_end_data_protection() const final { + return super.is_end_to_end_data_protection(); + } + + virtual nvme_command_ertr::future<> initialize_nvme_features() { + return 
nvme_command_ertr::now(); + } mkfs_ret do_mkfs(device_config_t); -- 2.39.5