level: dev
desc: Overwrite the existing data block based on delta if the overwrite size is equal to or less than this value; otherwise, do the overwrite based on remapping. Set to 0 to enforce the remap-based overwrite.
default: 0
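As an illustration only (a hypothetical helper, not the seastore implementation), the decision this option describes amounts to:

    #include <cstddef>

    // Sketch: choose the overwrite strategy from the configured threshold.
    // A threshold of 0 forces the remap-based overwrite unconditionally.
    inline bool use_delta_based_overwrite(std::size_t overwrite_size,
                                          std::size_t threshold) {
      return threshold != 0 && overwrite_size <= threshold;
    }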
+- name: seastore_disable_end_to_end_data_protection
+ type: bool
+ level: dev
+ desc: When false, upon mkfs, try to discover whether the NVMe device supports
+ an internal checksum feature that offloads checksumming from the host CPU, and
+ enable it if available. Set to true to disable it unconditionally.
+ default: true
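Whether the protection actually ended up enabled is exposed through the Device::is_end_to_end_data_protection() interface introduced just below; a minimal, hypothetical call site (the function name and signature are illustrative, assuming seastore's Device and ceph::bufferlist are in scope):

    // Sketch: skip the software CRC when the NVMe controller maintains the
    // per-block checksum itself (mirrors the super.crc handling later in
    // this patch).
    uint32_t choose_crc(const Device &device, const ceph::bufferlist &bl) {
      if (device.is_end_to_end_data_protection()) {
        return -1;  // the device generates and verifies the checksum
      }
      return bl.crc32c(-1);
    }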
virtual secondary_device_set_t& get_secondary_devices() = 0;
+ virtual bool is_end_to_end_data_protection() const {
+ return false;
+ }
+
using close_ertr = crimson::errorator<
crimson::ct_error::input_output_error>;
virtual close_ertr::future<> close() = 0;
}
};
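+// Feature bits recorded in rbm_superblock_t::feature.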
+enum class rbm_feature_t : uint64_t {
+ RBM_NVME_END_TO_END_PROTECTION = 1,
+};
+
struct rbm_superblock_t {
size_t size = 0;
size_t block_size = 0;
backend_type_t::RANDOM_BLOCK);
ceph_assert(config.spec.id <= DEVICE_ID_MAX_VALID);
}
+
+ bool is_end_to_end_data_protection() const {
+ return (feature & (uint64_t)rbm_feature_t::RBM_NVME_END_TO_END_PROTECTION);
+ }
+ void set_end_to_end_data_protection() {
+ feature |= (uint64_t)rbm_feature_t::RBM_NVME_END_TO_END_PROTECTION;
+ }
};
enum class rbm_extent_state_t {
<< ", journal_size=" << header.journal_size
<< ", crc=" << header.crc
<< ", config=" << header.config
- << ", shard_num=" << header.shard_num;
+ << ", shard_num=" << header.shard_num
+ << ", end_to_end_data_protection=" << header.is_end_to_end_data_protection();
for (auto p : header.shard_infos) {
out << p;
}
return identify_namespace(device).safe_then([this, in_path, mode] (
auto id_namespace_data) {
atomic_write_unit = awupf * super.block_size;
- data_protection_type = id_namespace_data.dps.protection_type;
- data_protection_enabled = (data_protection_type > 0);
if (id_namespace_data.nsfeat.opterf == 1){
// NPWG and NPWA are 0-based values
write_granularity = super.block_size * (id_namespace_data.npwg + 1);
return local_device.do_shard_mount(
).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in RBMDevice::do_mount"
+ "Invalid error in NVMeBlockDevice::do_shard_mount"
});
+ }).then([this] () {
+ if (is_end_to_end_data_protection()) {
+ return identify_namespace(device
+ ).safe_then([] (auto id_namespace_data) {
+ if (id_namespace_data.dps.protection_type !=
+ nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2) {
+ logger().error("seastore was formated with end-to-end-data-protection \
+ but the device being mounted to use seastore does not support \
+ the functionality. Please check the device.");
+ ceph_abort();
+ }
+ if (id_namespace_data.lbaf[id_namespace_data.flbas.lba_index].ms !=
+ nvme_identify_namespace_data_t::METASIZE_FOR_CHECKSUM_OFFLOAD) {
+ logger().error("seastore was formated with end-to-end-data-protection \
+ but the formatted device meta size is wrong. Please check the device.");
+ ceph_abort();
+ }
+ return mount_ertr::now();
+ });
+ }
+ return mount_ertr::now();
});
}
nvme_admin_command_t& admin_cmd, seastar::file f) {
return f.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd).handle_exception(
[](auto e)->nvme_command_ertr::future<int> {
- logger().error("pass_admin: ioctl failed");
+ logger().error("pass_admin: ioctl failed {}", e);
return crimson::ct_error::input_output_error::make();
});
}
return device.ioctl(NVME_IOCTL_IO_CMD, &io_cmd);
}
+nvme_command_ertr::future<> NVMeBlockDevice::try_enable_end_to_end_protection() {
+ return identify_namespace(device
+ ).safe_then([this] (auto id_namespace_data) -> nvme_command_ertr::future<> {
+ if (!id_namespace_data.nlbaf) {
+ logger().info("the device does not support end to end data protection,\
+ mkfs() will be done without this functionality.");
+ return nvme_command_ertr::now();
+ }
+ int lba_format_index = -1;
+ for (int i = 0; i < id_namespace_data.nlbaf; i++) {
+ // TODO: enable other types of end to end data protection
+ // Note that the nvme device will generate a crc if the namespace
+ // is formatted with a metadata size of 8.
+ // The nvme device can provide other types of data protection,
+ // but for now we only consider checksum offload on the device side.
+ if (id_namespace_data.lbaf[i].ms ==
+ nvme_identify_namespace_data_t::METASIZE_FOR_CHECKSUM_OFFLOAD) {
+ lba_format_index = i;
+ break;
+ }
+ }
+ if (lba_format_index == -1) {
+ logger().info("the device does not support end to end data protection,\
+ mkfs() will be done without this functionality.");
+ return nvme_command_ertr::now();
+ }
+ return get_nsid(device
+ ).safe_then([this, i=lba_format_index](auto nsid) {
+ return seastar::do_with(
+ nvme_admin_command_t(),
+ [this, nsid=nsid, i=i] (auto &cmd) {
+ cmd.common.opcode = nvme_admin_command_t::OPCODE_FORMAT_NVM;
+ cmd.common.nsid = nsid;
+ // TODO: configure other protect information types (2 or 3) see above
+ cmd.format.pi = nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2;
+ cmd.format.lbaf = i;
+ return pass_admin(cmd, device
+ ).safe_then([this](auto ret) {
+ if (ret != 0) {
+ logger().error(
+ "formt nvm command to use end-to-end-protection fails : {}", ret);
+ ceph_abort();
+ }
+ return identify_namespace(device
+ ).safe_then([this] (auto id_namespace_data) -> nvme_command_ertr::future<> {
+ ceph_assert(id_namespace_data.dps.protection_type ==
+ nvme_format_nvm_command_t::PROTECT_INFORMATION_TYPE_2);
+ super.set_end_to_end_data_protection();
+ return nvme_command_ertr::now();
+ });
+ });
+ });
+ });
+ }).handle_error(crimson::ct_error::input_output_error::handle([]{
+ logger().info("the device does not support identify namespace command");
+ return nvme_command_ertr::now();
+ }), crimson::ct_error::pass_further_all{});
+}
+
+nvme_command_ertr::future<> NVMeBlockDevice::initialize_nvme_features() {
+ if (!crimson::common::get_conf<bool>("seastore_disable_end_to_end_data_protection")) {
+ return try_enable_end_to_end_protection();
+ }
+ return nvme_command_ertr::now();
+}
+
}
static const uint8_t CNS_CONTROLLER = 0x01;
};
+struct nvme_format_nvm_command_t {
+ uint32_t common_dw[10];
+
+ uint8_t lbaf : 4;
+ uint8_t mset : 1;
+ uint8_t pi : 3;
+ uint8_t pil : 1;
+
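+ // End-to-end Protection Information Type 2 (NVMe Format NVM, PI field);
+ // used so the controller maintains the per-block guard CRC.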
+ static const uint8_t PROTECT_INFORMATION_TYPE_2 = 2;
+};
+
struct nvme_admin_command_t {
union {
nvme_passthru_cmd common;
nvme_identify_command_t identify;
+ nvme_format_nvm_command_t format;
};
static const uint8_t OPCODE_IDENTIFY = 0x06;
+ static const uint8_t OPCODE_FORMAT_NVM = 0x80;
};
// Optional Admin Command Support (OACS)
uint32_t reserved : 6;
};
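+// Formatted LBA Size (FLBAS) field of Identify Namespace: lba_index selects
+// the in-use entry of the lbaf[] array; ms_transferred indicates metadata is
+// transferred at the end of the data LBA.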
+struct flbas_t {
+ uint8_t lba_index : 4;
+ uint8_t ms_transferred : 1;
+ uint8_t reserved : 3;
+};
+
struct nvme_identify_namespace_data_t {
union {
struct {
uint8_t unused[24]; // [23:0]
nsfeat_t nsfeat; // [24]
- uint8_t unused2[3]; // [27:25]
+ uint8_t nlbaf; // [25]
+ flbas_t flbas; // [26]
+ uint8_t unused2; // [27]
dpc_t dpc; // [28]
dps_t dps; // [29]
uint8_t unused3[34]; // [63:30]
uint16_t npwg; // [65:64]
uint16_t npwa; // [67:66]
uint8_t unused4[60]; // [127:68]
- lbaf_t lbaf0; // [131:128]
+ lbaf_t lbaf[64]; // [383:128]
};
uint8_t raw[4096];
};
+ // meta size value to use device-level checksum
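+ // (an 8-byte metadata region holds the protection information field:
+ // 16-bit guard CRC, 16-bit application tag, 32-bit reference tag)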
+ static const uint8_t METASIZE_FOR_CHECKSUM_OFFLOAD = 8;
};
struct nvme_rw_command_t {
mount_ret mount() final;
+ nvme_command_ertr::future<> initialize_nvme_features() final;
+
mkfs_ret mkfs(device_config_t config) final;
write_ertr::future<> writev(
).safe_then([stat] (auto id_namespace_data) mutable {
// LBA format provides LBA size which is power of 2. LBA is the
// minimum size of read and write.
- stat.block_size = (1 << id_namespace_data.lbaf0.lbads);
+ stat.block_size = (1 << id_namespace_data.lbaf[0].lbads);
if (stat.block_size < RBM_SUPERBLOCK_SIZE) {
stat.block_size = RBM_SUPERBLOCK_SIZE;
}
* protection is enabled, checksum is calculated on every write and used to
* verify data on every read.
*/
- bool is_data_protection_enabled() const { return data_protection_enabled; }
+ nvme_command_ertr::future<> try_enable_end_to_end_protection();
/*
* Data Health
nvme_io_command_t& io_cmd);
bool support_multistream = false;
- uint8_t data_protection_type = 0;
/*
* Predictable Latency
uint64_t write_alignment = 4096;
uint32_t atomic_write_unit = 4096;
- bool data_protection_enabled = false;
std::string device_path;
seastar::sharded<NVMeBlockDevice> shard_devices;
};
[this, FNAME, config=std::move(config), shard_num, journal_size](auto st) {
super.block_size = st.block_size;
super.size = st.size;
- super.feature |= RBM_BITMAP_BLOCK_CRC;
super.config = std::move(config);
super.journal_size = journal_size;
ceph_assert_always(super.journal_size > 0);
crimson::ct_error::assert_all{
"Invalid error open in RBMDevice::do_primary_mkfs"}
).safe_then([this] {
- return write_rbm_superblock(
+ return initialize_nvme_features(
).safe_then([this] {
- return close();
- }).handle_error(
- mkfs_ertr::pass_further{},
- crimson::ct_error::assert_all{
- "Invalid error write_rbm_superblock in RBMDevice::do_primary_mkfs"
+ return write_rbm_superblock(
+ ).safe_then([this] {
+ return close();
+ }).handle_error(
+ mkfs_ertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error write_rbm_superblock in RBMDevice::do_primary_mkfs"
+ });
});
});
});
// If NVMeDevice supports data protection, CRC for checksum is not required
// NVMeDevice is expected to generate and store checksum internally.
// CPU overhead for CRC might be saved.
- if (is_data_protection_enabled()) {
+ if (is_end_to_end_data_protection()) {
super.crc = -1;
} else {
super.crc = meta_b_header.crc32c(-1);
super_block.block_size);
// Do CRC verification only if data protection is not supported.
- if (is_data_protection_enabled() == false) {
+ if (super_block.is_end_to_end_data_protection() == false) {
if (meta_b_header.crc32c(-1) != crc) {
DEBUG("bad crc on super block, expected {} != actual {} ",
meta_b_header.crc32c(-1), crc);
crimson::ct_error::input_output_error>;
constexpr uint32_t RBM_SUPERBLOCK_SIZE = 4096;
-enum {
- // TODO: This allows the device to manage crc on a block by itself
- RBM_NVME_END_TO_END_PROTECTION = 1,
- RBM_BITMAP_BLOCK_CRC = 2,
-};
class RBMDevice : public Device {
public:
ceph::bufferlist bl,
uint16_t stream = 0) = 0;
- bool is_data_protection_enabled() const { return false; }
+ bool is_end_to_end_data_protection() const final {
+ return super.is_end_to_end_data_protection();
+ }
+
+ virtual nvme_command_ertr::future<> initialize_nvme_features() {
+ return nvme_command_ertr::now();
+ }
mkfs_ret do_mkfs(device_config_t);