From 70bced5b17a04938570a21aaf489684d45e96c1e Mon Sep 17 00:00:00 2001
From: Jinyong Ha
Date: Fri, 16 Apr 2021 17:12:55 +0900
Subject: [PATCH] seastore: add NormalNBD

NormalNBD - NVMe block device backed by a seastar file

1. Support the preferred write granularity/alignment reported by the NVMe
   SSD. Users should follow this write guidance to improve write performance.
2. Support multi-stream IO on NVMe SSDs.

Signed-off-by: Jinyong Ha
---
 src/crimson/os/seastore/CMakeLists.txt        |   1 +
 src/crimson/os/seastore/nvmedevice/memory.cc  |   3 +-
 .../os/seastore/nvmedevice/nvmedevice.cc      | 227 ++++++++++++++++++
 .../os/seastore/nvmedevice/nvmedevice.h       | 200 ++++++++++++++-
 src/test/crimson/seastore/CMakeLists.txt      |  10 +
 .../seastore/nvmedevice/test_nvmedevice.cc    |  85 +++++++
 6 files changed, 519 insertions(+), 7 deletions(-)
 create mode 100644 src/crimson/os/seastore/nvmedevice/nvmedevice.cc
 create mode 100644 src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc

diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt
index f9ad475b29dad..c697d7e3acbbf 100644
--- a/src/crimson/os/seastore/CMakeLists.txt
+++ b/src/crimson/os/seastore/CMakeLists.txt
@@ -34,6 +34,7 @@ add_library(crimson-seastore STATIC
   seastore.cc
   randomblock_manager.cc
   nvmedevice/memory.cc
+  nvmedevice/nvmedevice.cc
   ../../../test/crimson/seastore/test_block.cc
   ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc
 )
diff --git a/src/crimson/os/seastore/nvmedevice/memory.cc b/src/crimson/os/seastore/nvmedevice/memory.cc
index 66be008d589f8..eb371b4905034 100644
--- a/src/crimson/os/seastore/nvmedevice/memory.cc
+++ b/src/crimson/os/seastore/nvmedevice/memory.cc
@@ -44,7 +44,8 @@ TestMemory::open(const std::string &in_path, seastar::open_flags mode)
 write_ertr::future<>
 TestMemory::write(
   uint64_t offset,
-  bufferptr &bptr)
+  bufferptr &bptr,
+  uint16_t stream)
 {
   ceph_assert(buf);
   logger().debug(
diff --git a/src/crimson/os/seastore/nvmedevice/nvmedevice.cc b/src/crimson/os/seastore/nvmedevice/nvmedevice.cc
new file mode 100644
index 0000000000000..84f707983d21e
--- /dev/null
+++ b/src/crimson/os/seastore/nvmedevice/nvmedevice.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include
+#include
+
+#include
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/nvmedevice/nvmedevice.h"
+
+namespace {
+  seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+}
+
+static constexpr uint32_t MAX_EVENTS = 1024;
+
+namespace crimson::os::seastore::nvme_device {
+
+/* background io poller for multi-stream write */
+void poll_completion(std::vector<::io_context_t>* ctxs, bool* exit) {
+  while (*exit == false) {
+    for (auto& ctx : *ctxs) {
+      io_event events[MAX_EVENTS];
+
+      /*
+       * At least one completion should be returned here. Otherwise, the
+       * polling thread blocks until a completion becomes available.
+       */
+      int num_events = io_getevents(ctx, 1, MAX_EVENTS, events, NULL);
+
+      for (int i = 0; i < num_events; i++) {
+        io_context_t* io_context = (io_context_t*)events[i].obj;
+        io_context->done = true;
+      }
+    }
+  }
+}
+
+open_ertr::future<>
+NormalNBD::open(const std::string &in_path, seastar::open_flags mode) {
+  /* Open with a POSIX fd so that generic NVMe commands can be passed through */
+  fd = seastar::file_desc::open(in_path, (int)mode);
+  identify_controller_data_t controller_data = {0, };
+  return identify_controller(controller_data).safe_then(
+    [this, controller_data, in_path, mode]() {
+    protocol_version = controller_data.version;
+    logger().debug("nvme protocol {}.{} {}",
+      (uint32_t)protocol_version.major_ver,
+      (uint32_t)protocol_version.minor_ver,
+      (uint32_t)protocol_version.tertiary_ver);
+
+    /*
+     * Multi Stream Write
+     *
+     * When an NVMe SSD supports the multi-stream functionality, it sets the
+     * corresponding bit in the oacs field of the identify_controller_data
+     * structure (NVMe Specification 1.4). If that bit is set, NormalNBD opens
+     * the device file multiple times with different stream IDs. When the user
+     * calls write() with a stream argument, NormalNBD looks up the pre-opened
+     * FD for that stream ID and submits the write IO to it.
+     */
+    support_multistream = controller_data.oacs.support_directives;
+    if (support_multistream) {
+      write_life_max = 6;
+    }
+
+    open_for_io(in_path, mode);
+
+    /* PWG and PWA are supported from NVMe 1.4 */
+    if (protocol_version.major_ver >= 1 && protocol_version.minor_ver >= 4) {
+      identify_namespace_data_t namespace_data = {0, };
+      identify_namespace(namespace_data).safe_then([this, namespace_data]() {
+        /* Convert the 0-based values reported by the device */
+        write_granularity = namespace_data.npwg + 1;
+        write_alignment = namespace_data.npwa + 1;
+      });
+    }
+    return seastar::now();
+  }).handle_error(
+    /* If the device does not support the ioctl, just open without streams */
+    crimson::ct_error::input_output_error::handle([this, in_path, mode](auto) {
+      open_for_io(in_path, mode);
+      return seastar::now();
+    }));
+}
+
+void
+NormalNBD::open_for_io(const std::string &in_path, seastar::open_flags mode)
+{
+  ctx.resize(write_life_max);
+  for (uint32_t i = 0; i < write_life_max; i++) {
+    stream_fd.push_back(seastar::file_desc::open(in_path, (int)mode));
+    if (i != write_life_not_set) {
+      int posix_fd = stream_fd[i].get();
+      fcntl(posix_fd, F_SET_FILE_RW_HINT, &i);
+    }
+
+    io_setup(MAX_EVENTS, &ctx[i]);
+  }
+  completion_poller = std::thread(poll_completion, &ctx, &exit);
+}
+
+write_ertr::future<>
+NormalNBD::write(
+  uint64_t offset,
+  bufferptr &bptr,
+  uint16_t stream) {
+  logger().debug(
+    "block: do_write offset {} len {}",
+    offset,
+    bptr.length());
+  io_context_t io_context = io_context_t();
+  io_prep_pwrite(
+    &io_context.cb,
+    stream_fd[stream].get(),
+    bptr.c_str(),
+    bptr.length(),
+    offset);
+  iocb* cb_ptr[1] = {&io_context.cb};
+  io_submit(ctx[stream], 1, cb_ptr);
+  return seastar::do_with(std::move(io_context), [] (auto& io_context) {
+    /*
+     * libaio needs an additional poller thread (see poll_completion) to poll
+     * for IO completions. When the poller catches a completion, it sets the
+     * "done" field of the corresponding io_context.
+     */
+    if (io_context.done) {
+      return seastar::now();
+    }
+    return seastar::later();
+  });
+}
+
+read_ertr::future<>
+NormalNBD::read(
+  uint64_t offset,
+  bufferptr &bptr) {
+  logger().debug(
+    "block: do_read offset {} len {}",
+    offset,
+    bptr.length());
+  io_context_t io_context = io_context_t();
+  io_prep_pread(
+    &io_context.cb,
+    stream_fd[0].get(),
+    bptr.c_str(),
+    bptr.length(),
+    offset);
+  iocb* cb_ptr[1] = {&io_context.cb};
+  io_submit(ctx[0], 1, cb_ptr);
+  return seastar::do_with(std::move(io_context), [] (auto& io_context) {
+    if (io_context.done) {
+      return seastar::now();
+    }
+    return seastar::later();
+  });
+}
+
+seastar::future<>
+NormalNBD::close() {
+  logger().debug(" close ");
+  exit = true;
+  completion_poller.join();
+  fd.close();
+  return seastar::now();
+}
+
+nvme_command_ertr::future<>
+NormalNBD::pass_through_io(NVMePassThroughCommand& command) {
+  logger().debug("block: pass through");
+  int ret = fd.ioctl(NVME_IOCTL_IO_CMD, command);
+  if (ret < 0) {
+    logger().debug("block: pass through failed");
+    return crimson::ct_error::input_output_error::make();
+  }
+  else {
+    return nvme_command_ertr::now();
+  }
+}
+
+nvme_command_ertr::future<>
+NormalNBD::identify_namespace(identify_namespace_data_t& namespace_data) {
+  nvme_admin_command_t command = {0,};
+  command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+  command.identify_cmd.cns = nvme_identify_command_t::CNS_NAMESPACE;
+  command.common_cmd.addr = (uint64_t)&namespace_data;
+  command.common_cmd.data_len = sizeof(identify_namespace_data_t);
+
+  return pass_admin(command);
+}
+
+nvme_command_ertr::future<>
+NormalNBD::identify_controller(identify_controller_data_t& controller_data) {
+  nvme_admin_command_t command = {0,};
+  command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+  command.identify_cmd.cns = nvme_identify_command_t::CNS_CONTROLLER;
+  command.common_cmd.addr = (uint64_t)&controller_data;
+  command.common_cmd.data_len = sizeof(identify_controller_data_t);
+
+  return pass_admin(command);
+}
+
+nvme_command_ertr::future<>
+NormalNBD::pass_admin(nvme_admin_command_t& command) {
+  logger().debug("block: pass admin");
+  try {
+    int ret = fd.ioctl(NVME_IOCTL_ADMIN_CMD, command);
+    if (ret < 0) {
+      logger().debug("block: pass admin failed");
+      return crimson::ct_error::input_output_error::make();
+    }
+    else {
+      return nvme_command_ertr::now();
+    }
+  }
+  catch (...) {
+    logger().debug("block: pass admin failed");
+    return crimson::ct_error::input_output_error::make();
+  }
+}
+
+}
diff --git a/src/crimson/os/seastore/nvmedevice/nvmedevice.h b/src/crimson/os/seastore/nvmedevice/nvmedevice.h
index e2bc56c9e3401..c31fd0d844ce0 100644
--- a/src/crimson/os/seastore/nvmedevice/nvmedevice.h
+++ b/src/crimson/os/seastore/nvmedevice/nvmedevice.h
@@ -24,6 +24,85 @@ namespace ceph {
 namespace crimson::os::seastore::nvme_device {
 
+/*
+ * NVMe protocol structures (nvme_XX, identify_XX)
+ *
+ * All structures related to the NVMe protocol follow NVMe specification v1.4
+ * (the latest at the time of writing). NVMe is a protocol for fast
+ * communication between the host and an SSD. Only a subset of the available
+ * NVMe features is adopted here to keep the implementation simple. In addition,
+ * NVMeBlockDevice provides generic command submission APIs for IO and Admin
+ * commands; use pass_through_io() and pass_admin() for that purpose.
+ *
+ * For more information about the NVMe protocol, refer to https://nvmexpress.org/
+ */
+
+struct nvme_identify_command_t {
+  uint32_t common_dw[10];
+  uint32_t cns : 8;
+  uint32_t reserved : 8;
+  uint32_t cntroller_id : 16;
+
+  static const uint8_t CNS_NAMESPACE = 0x00;
+  static const uint8_t CNS_CONTROLLER = 0x01;
+};
+
+struct nvme_admin_command_t {
+  union
+  {
+    nvme_passthru_cmd common_cmd;
+    nvme_identify_command_t identify_cmd;
+  };
+
+  static const uint8_t OPCODE_IDENTIFY = 0x06;
+};
+
+struct nvme_version_t {
+  uint32_t major_ver : 16;
+  uint32_t minor_ver : 8;
+  uint32_t tertiary_ver : 8;
+};
+
+struct admin_command_support_t {
+  uint16_t unused : 5;
+  uint16_t support_directives : 1;
+  uint16_t unused2 : 10;
+};
+
+struct identify_controller_data_t {
+  union
+  {
+    struct
+    {
+      uint8_t raw[1024];
+    };
+    struct
+    {
+      uint8_t unused[80];
+      nvme_version_t version;
+      uint8_t unused2[172];
+      admin_command_support_t oacs;
+    };
+  };
+};
+
+struct identify_namespace_data_t {
+  union
+  {
+    struct
+    {
+      uint8_t raw[4096];
+    };
+    struct
+    {
+      uint8_t unused[64];
+      uint16_t npwg;
+      uint16_t npwa;
+    };
+  };
+};
+
+using NVMePassThroughCommand = nvme_passthru_cmd;
 
 using read_ertr = crimson::errorator<
   crimson::ct_error::input_output_error,
@@ -45,6 +124,11 @@ using open_ertr = crimson::errorator<
 using nvme_command_ertr = crimson::errorator<
   crimson::ct_error::input_output_error>;
 
+struct io_context_t {
+  iocb cb;
+  bool done = false;
+};
+
 /*
  * Interface between NVMe SSD and its user.
  *
@@ -95,9 +179,17 @@ public:
     uint64_t offset,
     bufferptr &bptr) = 0;
 
+  /*
+   * Multi-stream write
+   *
+   * Gives the device a hint that classifies data with similar lifetimes.
+   * Data written with the same stream value is managed together in the SSD,
+   * which improves write performance.
+   */
   virtual write_ertr::future<> write(
     uint64_t offset,
-    bufferptr &bptr) = 0;
+    bufferptr &bptr,
+    uint16_t stream = 0) = 0;
 
   // TODO
   virtual int discard(uint64_t offset, uint64_t len) { return 0; }
@@ -105,9 +197,104 @@ public:
   virtual open_ertr::future<> open(
     const std::string& path,
     seastar::open_flags mode) = 0;
-  virtual seastar::future<> close() = 0;
 
+  /*
+   * For passing NVMe IO or Admin commands through to the SSD.
+   * The caller can construct and execute its own NVMe command.
+   */
+  virtual nvme_command_ertr::future<> pass_through_io(
+    NVMePassThroughCommand& command) { return nvme_command_ertr::now(); }
+  virtual nvme_command_ertr::future<> pass_admin(
+    nvme_admin_command_t& command) { return nvme_command_ertr::now(); }
+
+  /*
+   * End-to-End Data Protection
+   *
+   * The NVMe device tracks data integrity in a manner similar to a checksum.
+   * Clients can offload checksumming to the device to reduce CPU utilization.
+   */
+  virtual write_ertr::future<> protected_write(
+    uint64_t offset,
+    bufferptr &bptr,
+    uint16_t stream = 0) { return write_ertr::now(); }
+
+  /*
+   * Data Health
+   *
+   * Returns a list of LBAs whose data is nearly corrupted and will become
+   * unreadable soon. The caller can overwrite, unmap or refresh the data at
+   * those LBAs to protect it.
+   */
+  virtual nvme_command_ertr::future<> get_data_health(
+    std::list<uint64_t>& fragile_lbas) { return nvme_command_ertr::now(); }
+
+  /*
+   * Recovery Level
+   *
+   * Regulates the magnitude of SSD-internal data recovery. A lower magnitude
+   * gives the caller better read latency.
+   */
+  virtual nvme_command_ertr::future<> set_data_recovery_level(
+    uint32_t level) { return nvme_command_ertr::now(); }
+
+  /*
+   * Predictable Latency
+   *
+   * The NVMe device can guarantee IO latency within a pre-defined time window.
+   * Support for this functionality will be investigated later.
+   */
+};
+
+/*
+ * Implementation of NVMeBlockDevice with POSIX APIs
+ *
+ * NormalNBD provides access to NVMe SSDs through POSIX APIs, which are
+ * generally available in most operating environments.
+ */
+class NormalNBD : public NVMeBlockDevice {
+public:
+  NormalNBD() {}
+  ~NormalNBD() override {}
+
+  open_ertr::future<> open(
+    const std::string &in_path,
+    seastar::open_flags mode) override;
+
+  write_ertr::future<> write(
+    uint64_t offset,
+    bufferptr &bptr,
+    uint16_t stream = 0) override;
+
+  read_ertr::future<> read(
+    uint64_t offset,
+    bufferptr &bptr) override;
+
+  nvme_command_ertr::future<> pass_through_io(
+    NVMePassThroughCommand& command) override;
+
+  nvme_command_ertr::future<> pass_admin(
+    nvme_admin_command_t& command) override;
+
+  seastar::future<> close() override;
+
+private:
+  seastar::file_desc fd = seastar::file_desc::from_fd(-1);
+  std::vector<seastar::file_desc> stream_fd;
+  nvme_version_t protocol_version;
+  bool support_multistream = false;
+  std::vector<::io_context_t> ctx;
+  std::thread completion_poller;
+  bool exit = false;
+
+  uint32_t write_life_not_set = 0;
+  uint32_t write_life_max = 1;
+
+  nvme_command_ertr::future<> identify_controller(
+    identify_controller_data_t& controller_data);
+  nvme_command_ertr::future<> identify_namespace(
+    identify_namespace_data_t& namespace_data);
+
+  void open_for_io(const std::string& in_path, seastar::open_flags mode);
 };
 
@@ -122,17 +309,18 @@ public:
   }
   }
 
-  open_ertr::future<> open(const std::string &in_path, seastar::open_flags mode);
+  open_ertr::future<> open(const std::string &in_path, seastar::open_flags mode) override;
 
   write_ertr::future<> write(
     uint64_t offset,
-    bufferptr &bptr);
+    bufferptr &bptr,
+    uint16_t stream = 0) override;
 
   read_ertr::future<> read(
     uint64_t offset,
-    bufferptr &bptr);
+    bufferptr &bptr) override;
 
-  seastar::future<> close();
+  seastar::future<> close() override;
 
   char *buf;
   size_t size;
diff --git a/src/test/crimson/seastore/CMakeLists.txt b/src/test/crimson/seastore/CMakeLists.txt
index 12b5ff6516591..af6c69568c778 100644
--- a/src/test/crimson/seastore/CMakeLists.txt
+++ b/src/test/crimson/seastore/CMakeLists.txt
@@ -93,4 +93,14 @@ target_link_libraries(
   ${CMAKE_DL_LIBS}
   crimson-seastore)
 
+add_executable(unittest-seastore-nvmedevice
+  nvmedevice/test_nvmedevice.cc)
+add_ceph_test(unittest-seastore-nvmedevice
+  unittest-seastore-nvmedevice --memory 256M --smp 1)
+target_link_libraries(
+  unittest-seastore-nvmedevice
+  crimson::gtest
+  crimson-seastore
+  aio)
+
 add_subdirectory(onode_tree)
diff --git a/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc b/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc
new file mode 100644
index 0000000000000..e43d9d7591dac
--- /dev/null
+++ b/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc
@@ -0,0 +1,85 @@
+//-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/nvmedevice/nvmedevice.h"
+#include "test/crimson/gtest_seastar.h"
+#include "include/stringify.h"
+
+using namespace crimson;
+using namespace crimson::os;
+using namespace crimson::os::seastore;
+using namespace nvme_device;
+
+struct nvdev_test_t : seastar_test_suite_t {
+  NVMeBlockDevice* device;
+  std::string dev_path;
+
+  static const uint64_t DEV_SIZE = 1024 * 1024 * 1024;
+
+  nvdev_test_t() :
+    device(NVMeBlockDevice::create()),
+    dev_path("randomblock_manager.test_nvmedevice" + stringify(getpid())) {
+    int fd = ::open(dev_path.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644);
+    ceph_assert(fd >= 0);
+    ::ftruncate(fd, DEV_SIZE);
+    ::close(fd);
+  }
+  ~nvdev_test_t() {
+    ::unlink(dev_path.c_str());
+    delete device;
+  }
+};
+
+static const uint64_t BUF_SIZE = 8192;
+
+struct nvdev_test_block_t {
+  uint8_t data[BUF_SIZE];
+
+  DENC(nvdev_test_block_t, v, p) {
+    DENC_START(1, 1, p);
+    for (uint64_t i = 0 ; i < BUF_SIZE; i++)
+    {
+      denc(v.data[i], p);
+    }
+    DENC_FINISH(p);
+  }
+};
+
+WRITE_CLASS_DENC_BOUNDED(
+  nvdev_test_block_t
+)
+
+TEST_F(nvdev_test_t, write_and_verify_test)
+{
+  run_async([this] {
+    device->open(dev_path, seastar::open_flags::rw).unsafe_get();
+    nvdev_test_block_t original_data;
+    std::minstd_rand0 generator;
+    uint8_t value = generator();
+    memset(original_data.data, value, BUF_SIZE);
+    uint64_t bl_length = 0;
+    {
+      bufferlist bl;
+      encode(original_data, bl);
+      bl_length = bl.length();
+      auto write_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length));
+      bl.begin().copy(bl.length(), write_buf.c_str());
+      device->write(0, write_buf).unsafe_get();
+    }
+
+    nvdev_test_block_t read_data;
+    {
+      auto read_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length));
+      device->read(0, read_buf).unsafe_get();
+      bufferlist bl;
+      bl.push_back(read_buf);
+      auto bliter = bl.cbegin();
+      decode(read_data, bliter);
+    }
+
+    int ret = memcmp(original_data.data, read_data.data, BUF_SIZE);
+    device->close().wait();
+    ASSERT_TRUE(ret == 0);
+  });
+}
-- 
2.39.5
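
Note for reviewers (not part of the patch): below is a minimal usage sketch of the multi-stream write API that NormalNBD adds. It mirrors the unit test above and assumes it runs inside a seastar thread (as set up by run_async in test_nvmedevice.cc); the device path, offsets, buffer size, and stream IDs are illustrative only.

// Hypothetical usage sketch -- not part of the patch. Assumes a seastar
// thread context (e.g. via run_async() as in test_nvmedevice.cc above).
#include "include/buffer.h"
#include "crimson/os/seastore/nvmedevice/nvmedevice.h"

using crimson::os::seastore::nvme_device::NVMeBlockDevice;

void demo_multistream(NVMeBlockDevice& device, const std::string& path)
{
  // NormalNBD::open() probes the controller; if the oacs directives bit is
  // set it opens one fd per write-life stream and starts the libaio poller.
  device.open(path, seastar::open_flags::rw).unsafe_get();

  // Page-aligned buffers; real callers should size and align writes according
  // to the npwg/npwa values the device reports (4 KiB here is illustrative).
  auto hot  = ceph::bufferptr(ceph::buffer::create_page_aligned(4096));
  auto cold = ceph::bufferptr(ceph::buffer::create_page_aligned(4096));
  hot.zero();
  cold.zero();

  // Data with a similar lifetime goes to the same stream so the SSD can place
  // it together; stream 0 (the default) means "no placement hint".
  device.write(0, hot, 1).unsafe_get();      // short-lived data -> stream 1
  device.write(4096, cold, 2).unsafe_get();  // long-lived data  -> stream 2

  // Read back through the same device, then stop the completion poller.
  auto verify = ceph::bufferptr(ceph::buffer::create_page_aligned(4096));
  device.read(0, verify).unsafe_get();
  device.close().wait();
}

The stream argument only takes effect when the controller reports directive support; otherwise NormalNBD falls back to opening a single fd, as handled by the error path in NormalNBD::open().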