From: Jinyong Ha Date: Fri, 8 Oct 2021 05:30:35 +0000 (+0900) Subject: seastore : implement multi-stream write of NVMeDevice class X-Git-Tag: v17.1.0~359^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ca6bf0947589281b6f9fc846429433b1005590e4;p=ceph.git seastore : implement multi-stream write of NVMeDevice class To check whether an SSD supports multi-stream, add identifying interfaces. The identify_controller/identify_namespace commands report which features the SSD provides. If the feature is not supported, write() ignores the stream ID. The NVMe specification provides no interface for learning the range of valid stream IDs, so for a stream ID that is outside the supported range, the SSD regards the write as non-streamed I/O. Signed-off-by: Jinyong Ha --- diff --git a/src/crimson/os/seastore/random_block_manager/nvmedevice.cc b/src/crimson/os/seastore/random_block_manager/nvmedevice.cc index 5cf85e9406b..1949aca7248 100644 --- a/src/crimson/os/seastore/random_block_manager/nvmedevice.cc +++ b/src/crimson/os/seastore/random_block_manager/nvmedevice.cc @@ -24,16 +24,22 @@ open_ertr::future<> PosixNVMeDevice::open( seastar::open_flags mode) { return seastar::do_with(in_path, [this, mode](auto& in_path) { return seastar::file_stat(in_path).then([this, mode, in_path](auto stat) { - this->block_size = stat.block_size; - this->size = stat.size; + size = stat.size; return seastar::open_file_dma(in_path, mode).then([=](auto file) { - this->device = file; + device = file; logger().debug("open"); // Get SSD's features from identify_controller and namespace command. // Do identify_controller first, and then identify_namespace. 
- return identify_controller().safe_then([this](auto id_controller_data) { + return identify_controller().safe_then([this, in_path, mode]( + auto id_controller_data) { support_multistream = id_controller_data.oacs.support_directives; - return identify_namespace().safe_then([this] (auto id_namespace_data) { + if (support_multistream) { + stream_id_count = WRITE_LIFE_MAX; + } + return identify_namespace().safe_then([this, in_path, mode] ( + auto id_namespace_data) { + // LBA format provides LBA size which is power of 2. LBA is the + // minimum size of read and write. block_size = (1 << id_namespace_data.lbaf0.lbads); data_protection_type = id_namespace_data.dps.protection_type; data_protection_enabled = (data_protection_type > 0); @@ -42,13 +48,31 @@ open_ertr::future<> PosixNVMeDevice::open( write_granularity = block_size * (id_namespace_data.npwg + 1); write_alignment = block_size * (id_namespace_data.npwa + 1); } - return seastar::now(); + return open_for_io(in_path, mode); }); - }); + }).handle_error(crimson::ct_error::input_output_error::handle([this, in_path, mode]{ + logger().error("open: id ctrlr failed. 
open without ioctl"); + return open_for_io(in_path, mode); + }), crimson::ct_error::pass_further_all{}); + }); + }); + }); +} + +open_ertr::future<> PosixNVMeDevice::open_for_io( + const std::string& in_path, + seastar::open_flags mode) { + io_device.resize(stream_id_count); + return seastar::do_for_each(io_device, [=](auto &target_device) { + return seastar::open_file_dma(in_path, mode).then([this]( + auto file) { + io_device[stream_index_to_open] = file; + return io_device[stream_index_to_open].fcntl( + F_SET_FILE_RW_HINT, + (uintptr_t)&stream_index_to_open).then([this](auto ret) { + stream_index_to_open++; + return seastar::now(); }); - }).handle_exception([](auto e) -> open_ertr::future<> { - logger().error("open: got error{}", e); - return crimson::ct_error::input_output_error::make(); }); }); } @@ -64,17 +88,22 @@ write_ertr::future<> PosixNVMeDevice::write( auto length = bptr.length(); assert((length % block_size) == 0); - return device.dma_write(offset, bptr.c_str(), length).handle_exception( + uint16_t supported_stream = stream; + if (stream >= stream_id_count) { + supported_stream = WRITE_LIFE_NOT_SET; + } + return io_device[supported_stream].dma_write( + offset, bptr.c_str(), length).handle_exception( [](auto e) -> write_ertr::future { - logger().error("write: dma_write got error{}", e); - return crimson::ct_error::input_output_error::make(); - }).then([length](auto result) -> write_ertr::future<> { - if (result != length) { - logger().error("write: dma_write got error with not proper length"); - return crimson::ct_error::input_output_error::make(); - } - return write_ertr::now(); - }); + logger().error("write: dma_write got error{}", e); + return crimson::ct_error::input_output_error::make(); + }).then([length](auto result) -> write_ertr::future<> { + if (result != length) { + logger().error("write: dma_write got error with not proper length"); + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); + }); } 
read_ertr::future<> PosixNVMeDevice::read( @@ -103,7 +132,11 @@ read_ertr::future<> PosixNVMeDevice::read( seastar::future<> PosixNVMeDevice::close() { logger().debug(" close "); - return device.close(); + return device.close().then([this]() { + return seastar::do_for_each(io_device, [](auto target_device) { + return target_device.close(); + }); + }); } nvme_command_ertr::future @@ -155,7 +188,11 @@ nvme_command_ertr::future PosixNVMeDevice::get_nsid() { nvme_command_ertr::future PosixNVMeDevice::pass_admin( nvme_admin_command_t& admin_cmd) { - return device.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd); + return device.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd).handle_exception( + [](auto e)->nvme_command_ertr::future { + logger().error("pass_admin: ioctl failed"); + return crimson::ct_error::input_output_error::make(); + }); } nvme_command_ertr::future PosixNVMeDevice::pass_through_io( diff --git a/src/crimson/os/seastore/random_block_manager/nvmedevice.h b/src/crimson/os/seastore/random_block_manager/nvmedevice.h index b602cdc0415..2b3e99f53da 100644 --- a/src/crimson/os/seastore/random_block_manager/nvmedevice.h +++ b/src/crimson/os/seastore/random_block_manager/nvmedevice.h @@ -25,6 +25,33 @@ namespace ceph { namespace crimson::os::seastore::nvme_device { +// from blk/BlockDevice.h +#if defined(__linux__) +#if !defined(F_SET_FILE_RW_HINT) +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) +#endif +// These values match Linux definition +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 +#define WRITE_LIFE_NOT_SET 0 // No hint information set +#define WRITE_LIFE_NONE 1 // No hints about write life time +#define WRITE_LIFE_SHORT 2 // Data written has a short life time +#define WRITE_LIFE_MEDIUM 3 // Data written has a medium life time +#define WRITE_LIFE_LONG 4 // Data written has a long life time +#define WRITE_LIFE_EXTREME 5 // Data written has an extremely long life time 
+#define WRITE_LIFE_MAX 6 +#else +// On systems don't have WRITE_LIFE_* only use one FD +// And all files are created equal +#define WRITE_LIFE_NOT_SET 0 // No hint information set +#define WRITE_LIFE_NONE 0 // No hints about write life time +#define WRITE_LIFE_SHORT 0 // Data written has a short life time +#define WRITE_LIFE_MEDIUM 0 // Data written has a medium life time +#define WRITE_LIFE_LONG 0 // Data written has a long life time +#define WRITE_LIFE_EXTREME 0 // Data written has an extremely long life time +#define WRITE_LIFE_MAX 1 +#endif + /* * NVMe protocol structures (nvme_XX, identify_XX) * @@ -37,22 +64,21 @@ namespace crimson::os::seastore::nvme_device { * * For more information about NVMe protocol, refer https://nvmexpress.org/ */ - struct nvme_identify_command_t { uint32_t common_dw[10]; + uint32_t cns : 8; uint32_t reserved : 8; - uint32_t cntroller_id : 16; + uint32_t cnt_id : 16; static const uint8_t CNS_NAMESPACE = 0x00; static const uint8_t CNS_CONTROLLER = 0x01; }; struct nvme_admin_command_t { - union - { - nvme_passthru_cmd common_cmd; - nvme_identify_command_t identify_cmd; + union { + nvme_passthru_cmd common; + nvme_identify_command_t identify; }; static const uint8_t OPCODE_IDENTIFY = 0x06; @@ -204,6 +230,8 @@ struct io_context_t { class NVMeBlockDevice { protected: uint64_t size = 0; + + // LBA Size uint64_t block_size = 4096; uint64_t write_granularity = 4096; @@ -268,10 +296,10 @@ public: * For passsing through nvme IO or Admin command to SSD * Caller can construct and execute its own nvme command */ - virtual nvme_command_ertr::future<> pass_through_io( - const NVMePassThroughCommand& command) { return nvme_command_ertr::now(); } - virtual nvme_command_ertr::future<> pass_admin( - const nvme_admin_command_t& command) { return nvme_command_ertr::now(); } + virtual nvme_command_ertr::future pass_through_io( + nvme_io_command_t& command) { return seastar::make_ready_future(0); } + virtual nvme_command_ertr::future pass_admin( + 
nvme_admin_command_t& command) { return seastar::make_ready_future(0); } /* * End-to-End Data Protection @@ -355,11 +383,18 @@ public: private: // identify_controller/namespace are used to get SSD internal information such - // as supported features, NPWG and NPWA; + // as supported features, NPWG and NPWA nvme_command_ertr::future identify_controller(); nvme_command_ertr::future identify_namespace(); nvme_command_ertr::future get_nsid(); + open_ertr::future<> open_for_io( + const std::string& in_path, + seastar::open_flags mode); + seastar::file device; + std::vector io_device; + uint32_t stream_index_to_open = WRITE_LIFE_NOT_SET; + uint32_t stream_id_count = 1; // stream is disabled, defaultly. };