From 6c5c8416862bdc88ee6a003e919d18d12f7cdf47 Mon Sep 17 00:00:00 2001 From: Jinyong Ha Date: Mon, 24 May 2021 18:33:02 +0900 Subject: [PATCH] seastore: replace POSIX call to seastar::file in NormalNBD Signed-off-by: Jinyong Ha --- .../os/seastore/nvmedevice/nvmedevice.cc | 235 ++++-------------- .../os/seastore/nvmedevice/nvmedevice.h | 36 +-- .../seastore/nvmedevice/test_nvmedevice.cc | 15 +- 3 files changed, 71 insertions(+), 215 deletions(-) diff --git a/src/crimson/os/seastore/nvmedevice/nvmedevice.cc b/src/crimson/os/seastore/nvmedevice/nvmedevice.cc index 84f707983d21e..f6c6cfeef4d93 100644 --- a/src/crimson/os/seastore/nvmedevice/nvmedevice.cc +++ b/src/crimson/os/seastore/nvmedevice/nvmedevice.cc @@ -17,211 +17,78 @@ namespace { } } -static constexpr uint32_t MAX_EVENTS = 1024; - namespace crimson::os::seastore::nvme_device { -/* background io poller for multi-stream write */ -void poll_completion(std::vector<::io_context_t>* ctxs, bool* exit) { - while (*exit == false) { - for (auto& ctx : *ctxs) { - io_event events[MAX_EVENTS]; - - /* - * At least a completion should be returned. Otherwise, thread is blocked - * until it is possible - */ - int num_events = io_getevents(ctx, 1, MAX_EVENTS, events, NULL); - - for (int i = 0; i < num_events; i++) { - io_context_t* io_context = (io_context_t*)events[i].obj; - io_context->done = true; - } - } - } -} - -open_ertr::future<> -NormalNBD::open(const std::string &in_path, seastar::open_flags mode) { - /* Open with posix fd for pass generic NVMe commands */ - fd = seastar::file_desc::open(in_path, (int)mode); - identify_controller_data_t controller_data = {0, }; - return identify_controller(controller_data).safe_then( - [this, controller_data, in_path, mode]() { - protocol_version = controller_data.version; - logger().debug("nvme protocol {}.{} {}", - (uint32_t)protocol_version.major_ver, - (uint32_t)protocol_version.minor_ver, - (uint32_t)protocol_version.tertiary_ver); - - /* - * Multi Stream Write - * - * When NVMe SSD supports multi stream functionality, it marks oacs bit of - * identify_controller_data structure (from NVMe Specification 1.4). - * If oacs field is true, NormalNBD class opens device file multiple times - * with different stream IDs. When user calls write() with stream argument, - * NormalNBD finds pre-opened FD with stream ID and submit write IO to the - * found FD. - */ - support_multistream = controller_data.oacs.support_directives; - if (support_multistream) { - write_life_max = 6; - } - - open_for_io(in_path, mode); - - /* PWG and PWA are supported from NVMe 1.4 */ - if (protocol_version.major_ver >= 1 && protocol_version.minor_ver >= 4) { - identify_namespace_data_t namespace_data = {0, }; - identify_namespace(namespace_data).safe_then([this, namespace_data]() { - /* Revise 0-based value */ - write_granularity = namespace_data.npwg + 1; - write_alignment = namespace_data.npwa + 1; - }); - } - return seastar::now(); - }).handle_error( - /* If device does not support ioctl, just open without stream */ - crimson::ct_error::input_output_error::handle([this, in_path, mode](auto) { - open_for_io(in_path, mode); - return seastar::now(); - })); -} - -void -NormalNBD::open_for_io(const std::string &in_path, seastar::open_flags mode) -{ - ctx.resize(write_life_max); - for (uint32_t i = 0; i < write_life_max; i++) { - stream_fd.push_back(seastar::file_desc::open(in_path, (int)mode)); - if (i != write_life_not_set) { - int posix_fd = stream_fd[i].get(); - fcntl(posix_fd, F_SET_FILE_RW_HINT, &i); - } - - io_setup(MAX_EVENTS, &ctx[i]); - } - completion_poller = std::thread(poll_completion, &ctx, &exit); +open_ertr::future<> NormalNBD::open( + const std::string &in_path, + seastar::open_flags mode) { + return seastar::do_with(in_path, [this, mode](auto& in_path) { + return seastar::file_stat(in_path).then([this, mode, in_path](auto stat) { + this->block_size = stat.block_size; + this->size = stat.size; + return seastar::open_file_dma(in_path, mode).then([=](auto file) { + this->device = file; + logger().debug("open"); + return seastar::now(); + }); + }).handle_exception([](auto e) -> open_ertr::future<> { + logger().error("open: got error{}", e); + return crimson::ct_error::input_output_error::make(); + }); + }); } -write_ertr::future<> -NormalNBD::write( +write_ertr::future<> NormalNBD::write( uint64_t offset, bufferptr &bptr, uint16_t stream) { logger().debug( - "block: do_write offset {} len {}", + "block: write offset {} len {}", offset, bptr.length()); - io_context_t io_context = io_context_t(); - io_prep_pwrite( - &io_context.cb, - stream_fd[stream].get(), - bptr.c_str(), - bptr.length(), - offset); - iocb* cb_ptr[1] = {&io_context.cb}; - io_submit(ctx[stream], 1, cb_ptr); - return seastar::do_with(std::move(io_context), [] (auto& io_context) { - /* - * libaio needs additional poller thread (see poll_completion) to poll IO - * completion. When the poller catches a completion, it marks "done" field - * of corresponding io_context. - */ - if (io_context.done) { - return seastar::now(); - } - return seastar::later(); + auto length = bptr.length(); + + assert((length % block_size) == 0); + + return device.dma_write(offset, bptr.c_str(), length).handle_exception( + [length](auto e) -> write_ertr::future { + logger().error("write: dma_write got error{}", e); + return crimson::ct_error::input_output_error::make(); + }).then([=](auto result) -> write_ertr::future<> { + if (result != length) { + logger().error("write: dma_write got error with not proper length"); + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); }); } -read_ertr::future<> -NormalNBD::read( +read_ertr::future<> NormalNBD::read( uint64_t offset, bufferptr &bptr) { logger().debug( - "block: do_read offset {} len {}", + "block: read offset {} len {}", offset, bptr.length()); - io_context_t io_context = io_context_t(); - io_prep_pread( - &io_context.cb, - stream_fd[0].get(), - bptr.c_str(), - bptr.length(), - offset); - iocb* cb_ptr[1] = {&io_context.cb}; - io_submit(ctx[0], 1, cb_ptr); - return seastar::do_with(std::move(io_context), [] (auto& io_context) { - if (io_context.done) { - return seastar::now(); - } - return seastar::later(); - }); -} - -seastar::future<> -NormalNBD::close() { - logger().debug(" close "); - exit = true; - completion_poller.join(); - fd.close(); - return seastar::now(); -} - -nvme_command_ertr::future<> -NormalNBD::pass_through_io(NVMePassThroughCommand& command) { - logger().debug("block: pass through"); - int ret = fd.ioctl(NVME_IOCTL_IO_CMD, command); - if (ret < 0) { - logger().debug("block: pass through failed"); - return crimson::ct_error::input_output_error::make(); - } - else { - return nvme_command_ertr::now(); - } -} + auto length = bptr.length(); -nvme_command_ertr::future<> -NormalNBD::identify_namespace(identify_namespace_data_t& namespace_data) { - nvme_admin_command_t command = {0,}; - command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY; - command.identify_cmd.cns = nvme_identify_command_t::CNS_NAMESPACE; - command.common_cmd.addr = (uint64_t)&namespace_data; - command.common_cmd.data_len = sizeof(identify_namespace_data_t); + assert((length % block_size) == 0); - return pass_admin(command); -} - -nvme_command_ertr::future<> -NormalNBD::identify_controller(identify_controller_data_t& controller_data) { - nvme_admin_command_t command = {0,}; - command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY; - command.identify_cmd.cns = nvme_identify_command_t::CNS_CONTROLLER; - command.common_cmd.addr = (uint64_t)&controller_data; - command.common_cmd.data_len = sizeof(identify_controller_data_t); - - return pass_admin(command); -} - -nvme_command_ertr::future<> -NormalNBD::pass_admin(nvme_admin_command_t& command) { - logger().debug("block: pass admin"); - try { - int ret = fd.ioctl(NVME_IOCTL_ADMIN_CMD, command); - if (ret < 0) { - logger().debug("block: pass admin failed"); + return device.dma_read(offset, bptr.c_str(), length).handle_exception( + [length](auto e) -> read_ertr::future { + logger().error("read: dma_read got error{}", e); return crimson::ct_error::input_output_error::make(); - } - else { - return nvme_command_ertr::now(); - } - } - catch (...) { - logger().debug("block: pass admin failed"); - return crimson::ct_error::input_output_error::make(); - } + }).then([=](auto result) -> read_ertr::future<> { + if (result != length) { + logger().error("read: dma_read got error with not proper length"); + return crimson::ct_error::input_output_error::make(); + } + return read_ertr::now(); + }); } +seastar::future<> NormalNBD::close() { + logger().debug(" close "); + return device.close(); +} } diff --git a/src/crimson/os/seastore/nvmedevice/nvmedevice.h b/src/crimson/os/seastore/nvmedevice/nvmedevice.h index c31fd0d844ce0..3bfaab2aeae67 100644 --- a/src/crimson/os/seastore/nvmedevice/nvmedevice.h +++ b/src/crimson/os/seastore/nvmedevice/nvmedevice.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -143,7 +144,7 @@ struct io_context_t { class NVMeBlockDevice { protected: uint64_t size = 0; - uint64_t block_size = 0; + uint64_t block_size = 4096; uint64_t write_granularity = 4096; uint64_t write_alignment = 4096; @@ -172,6 +173,7 @@ public: */ uint64_t get_size() const { return size; } uint64_t get_block_size() const { return block_size; } + uint64_t get_preffered_write_granularity() const { return write_granularity; } uint64_t get_preffered_write_alignment() const { return write_alignment; } @@ -255,7 +257,7 @@ public: class NormalNBD : public NVMeBlockDevice { public: NormalNBD() {} - ~NormalNBD() override {} + ~NormalNBD() = default; open_ertr::future<> open( const std::string &in_path, @@ -270,31 +272,13 @@ public: uint64_t offset, bufferptr &bptr) override; - nvme_command_ertr::future<> pass_through_io( - NVMePassThroughCommand& command) override; - - nvme_command_ertr::future<> pass_admin( - nvme_admin_command_t& command) override; - seastar::future<> close() override; + // TODO Servicing NVMe features (multi-stream, protected write etc..) should + // be followed by upstreaming ioctl to seastar. + private: - seastar::file_desc fd = seastar::file_desc::from_fd(-1); - std::vector stream_fd; - nvme_version_t protocol_version; - bool support_multistream = false; - std::vector<::io_context_t> ctx; - std::thread completion_poller; - bool exit = false; - - uint32_t write_life_not_set = 0; - uint32_t write_life_max = 1; - - nvme_command_ertr::future<> identify_controller( - identify_controller_data_t& controller_data); - nvme_command_ertr::future<> identify_namespace( - identify_namespace_data_t& namespace_data); - void open_for_io(const std::string& in_path, seastar::open_flags mode); + seastar::file device; }; @@ -309,7 +293,9 @@ public: } } - open_ertr::future<> open(const std::string &in_path, seastar::open_flags mode) override; + open_ertr::future<> open( + const std::string &in_path, + seastar::open_flags mode) override; write_ertr::future<> write( uint64_t offset, diff --git a/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc b/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc index e43d9d7591dac..1d79a01ec0cf3 100644 --- a/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc +++ b/src/test/crimson/seastore/nvmedevice/test_nvmedevice.cc @@ -18,7 +18,7 @@ struct nvdev_test_t : seastar_test_suite_t { static const uint64_t DEV_SIZE = 1024 * 1024 * 1024; nvdev_test_t() : - device(NVMeBlockDevice::create()), + device(nullptr), dev_path("randomblock_manager.test_nvmedevice" + stringify(getpid())) { int fd = ::open(dev_path.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644); ceph_assert(fd >= 0); @@ -27,11 +27,11 @@ struct nvdev_test_t : seastar_test_suite_t { } ~nvdev_test_t() { ::unlink(dev_path.c_str()); - delete device; } }; -static const uint64_t BUF_SIZE = 8192; +static const uint64_t BUF_SIZE = 1024; +static const uint64_t BLK_SIZE = 4096; struct nvdev_test_block_t { uint8_t data[BUF_SIZE]; @@ -53,6 +53,7 @@ WRITE_CLASS_DENC_BOUNDED( TEST_F(nvdev_test_t, write_and_verify_test) { run_async([this] { + device = NVMeBlockDevice::create(); device->open(dev_path, seastar::open_flags::rw).unsafe_get(); nvdev_test_block_t original_data; std::minstd_rand0 generator; @@ -63,14 +64,14 @@ TEST_F(nvdev_test_t, write_and_verify_test) bufferlist bl; encode(original_data, bl); bl_length = bl.length(); - auto write_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length)); - bl.begin().copy(bl.length(), write_buf.c_str()); + auto write_buf = ceph::bufferptr(buffer::create_page_aligned(BLK_SIZE)); + bl.begin().copy(bl_length, write_buf.c_str()); device->write(0, write_buf).unsafe_get(); } nvdev_test_block_t read_data; { - auto read_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length)); + auto read_buf = ceph::bufferptr(buffer::create_page_aligned(BLK_SIZE)); device->read(0, read_buf).unsafe_get(); bufferlist bl; bl.push_back(read_buf); @@ -81,5 +82,7 @@ TEST_F(nvdev_test_t, write_and_verify_test) int ret = memcmp(original_data.data, read_data.data, BUF_SIZE); device->close().wait(); ASSERT_TRUE(ret == 0); + device.reset(nullptr); }); } + -- 2.39.5