seastore.cc
randomblock_manager.cc
nvmedevice/memory.cc
+ nvmedevice/nvmedevice.cc
../../../test/crimson/seastore/test_block.cc
${PROJECT_SOURCE_DIR}/src/os/Transaction.cc
)
write_ertr::future<>
TestMemory::write(
uint64_t offset,
- bufferptr &bptr)
+ bufferptr &bptr,
+ uint16_t stream)
{
ceph_assert(buf);
logger().debug(
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <fcntl.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/nvmedevice/nvmedevice.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+static constexpr uint32_t MAX_EVENTS = 1024;
+
+namespace crimson::os::seastore::nvme_device {
+
+/* background io poller for multi-stream write */
+void poll_completion(std::vector<::io_context_t>* ctxs, bool* exit) {
+ while (*exit == false) {
+ for (auto& ctx : *ctxs) {
+ io_event events[MAX_EVENTS];
+
+      /*
+       * io_getevents() blocks until at least one completion is available,
+       * then returns up to MAX_EVENTS completions at once.
+       */
+ int num_events = io_getevents(ctx, 1, MAX_EVENTS, events, NULL);
+
+ for (int i = 0; i < num_events; i++) {
+        /*
+         * events[i].obj points at the submitted iocb, which is the first
+         * member of io_context_t, so the cast recovers the whole context.
+         */
+        io_context_t* io_context = (io_context_t*)events[i].obj;
+ io_context->done = true;
+ }
+ }
+ }
+}
+
+open_ertr::future<>
+NormalNBD::open(const std::string &in_path, seastar::open_flags mode) {
+  /* Open a POSIX fd so that generic NVMe commands can be passed through via ioctl */
+ fd = seastar::file_desc::open(in_path, (int)mode);
+ identify_controller_data_t controller_data = {0, };
+ return identify_controller(controller_data).safe_then(
+ [this, controller_data, in_path, mode]() {
+ protocol_version = controller_data.version;
+ logger().debug("nvme protocol {}.{} {}",
+ (uint32_t)protocol_version.major_ver,
+ (uint32_t)protocol_version.minor_ver,
+ (uint32_t)protocol_version.tertiary_ver);
+
+    /*
+     * Multi Stream Write
+     *
+     * When an NVMe SSD supports the multi-stream (directives) functionality,
+     * it sets the corresponding bit in the OACS field of the Identify
+     * Controller data structure (NVMe Specification 1.4). If that bit is set,
+     * NormalNBD opens the device file multiple times, once per stream ID.
+     * When the user calls write() with a stream argument, NormalNBD looks up
+     * the pre-opened FD for that stream ID and submits the write IO to it.
+     */
+ support_multistream = controller_data.oacs.support_directives;
+ if (support_multistream) {
+      /* streams 0-5 map to the six kernel write-life hints (0 = not set) */
+      write_life_max = 6;
+ }
+
+ open_for_io(in_path, mode);
+
+    /* NPWG and NPWA are supported from NVMe 1.4 onwards */
+    if (protocol_version.major_ver > 1 ||
+        (protocol_version.major_ver == 1 && protocol_version.minor_ver >= 4)) {
+ identify_namespace_data_t namespace_data = {0, };
+ identify_namespace(namespace_data).safe_then([this, namespace_data]() {
+        /* NPWG and NPWA are 0-based values */
+ write_granularity = namespace_data.npwg + 1;
+ write_alignment = namespace_data.npwa + 1;
+ });
+ }
+ return seastar::now();
+ }).handle_error(
+    /* If the identify ioctl fails (e.g. not an NVMe device), just open without stream support */
+ crimson::ct_error::input_output_error::handle([this, in_path, mode](auto) {
+ open_for_io(in_path, mode);
+ return seastar::now();
+ }));
+}
+
+void
+NormalNBD::open_for_io(const std::string &in_path, seastar::open_flags mode)
+{
+ ctx.resize(write_life_max);
+ for (uint32_t i = 0; i < write_life_max; i++) {
+ stream_fd.push_back(seastar::file_desc::open(in_path, (int)mode));
+ if (i != write_life_not_set) {
+      int posix_fd = stream_fd[i].get();
+      /* F_SET_FILE_RW_HINT expects a pointer to a 64-bit hint value */
+      uint64_t hint = i;
+      fcntl(posix_fd, F_SET_FILE_RW_HINT, &hint);
+ }
+
+ io_setup(MAX_EVENTS, &ctx[i]);
+ }
+ completion_poller = std::thread(poll_completion, &ctx, &exit);
+}
+
+write_ertr::future<>
+NormalNBD::write(
+ uint64_t offset,
+ bufferptr &bptr,
+ uint16_t stream) {
+ logger().debug(
+ "block: do_write offset {} len {}",
+ offset,
+ bptr.length());
+  /*
+   * The io_context must live at a stable address until the poller thread
+   * (see poll_completion) reports the completion, so allocate it via do_with
+   * before preparing and submitting the iocb.
+   */
+  return seastar::do_with(io_context_t(),
+    [this, offset, &bptr, stream] (auto& io_context) {
+    io_prep_pwrite(
+      &io_context.cb,
+      stream_fd[stream].get(),
+      bptr.c_str(),
+      bptr.length(),
+      offset);
+    iocb* cb_ptr[1] = {&io_context.cb};
+    io_submit(ctx[stream], 1, cb_ptr);
+    /*
+     * libaio needs an additional poller thread (see poll_completion) to reap
+     * IO completions. When the poller catches a completion, it marks the
+     * "done" field of the corresponding io_context; keep yielding to the
+     * reactor until that happens.
+     */
+    return seastar::repeat([&io_context] {
+      return seastar::make_ready_future<seastar::stop_iteration>(
+        io_context.done ? seastar::stop_iteration::yes
+                        : seastar::stop_iteration::no);
+    });
+  });
+}
+
+read_ertr::future<>
+NormalNBD::read(
+ uint64_t offset,
+ bufferptr &bptr) {
+ logger().debug(
+ "block: do_read offset {} len {}",
+ offset,
+ bptr.length());
+  /* Same completion scheme as write(): see the comments there */
+  return seastar::do_with(io_context_t(),
+    [this, offset, &bptr] (auto& io_context) {
+    io_prep_pread(
+      &io_context.cb,
+      stream_fd[0].get(),
+      bptr.c_str(),
+      bptr.length(),
+      offset);
+    iocb* cb_ptr[1] = {&io_context.cb};
+    io_submit(ctx[0], 1, cb_ptr);
+    return seastar::repeat([&io_context] {
+      return seastar::make_ready_future<seastar::stop_iteration>(
+        io_context.done ? seastar::stop_iteration::yes
+                        : seastar::stop_iteration::no);
+    });
+  });
+}
+
+seastar::future<>
+NormalNBD::close() {
+  logger().debug("block: close");
+ exit = true;
+ completion_poller.join();
+ fd.close();
+ return seastar::now();
+}
+
+nvme_command_ertr::future<>
+NormalNBD::pass_through_io(NVMePassThroughCommand& command) {
+ logger().debug("block: pass through");
+  try {
+    int ret = fd.ioctl(NVME_IOCTL_IO_CMD, command);
+    if (ret < 0) {
+      logger().debug("block: pass through failed");
+      return crimson::ct_error::input_output_error::make();
+    }
+    return nvme_command_ertr::now();
+  }
+  catch (...) {
+    logger().debug("block: pass through failed");
+    return crimson::ct_error::input_output_error::make();
+  }
+}
+
+nvme_command_ertr::future<>
+NormalNBD::identify_namespace(identify_namespace_data_t& namespace_data) {
+ nvme_admin_command_t command = {0,};
+ command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+ command.identify_cmd.cns = nvme_identify_command_t::CNS_NAMESPACE;
+ command.common_cmd.addr = (uint64_t)&namespace_data;
+ command.common_cmd.data_len = sizeof(identify_namespace_data_t);
+
+ return pass_admin(command);
+}
+
+nvme_command_ertr::future<>
+NormalNBD::identify_controller(identify_controller_data_t& controller_data) {
+ nvme_admin_command_t command = {0,};
+ command.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+ command.identify_cmd.cns = nvme_identify_command_t::CNS_CONTROLLER;
+ command.common_cmd.addr = (uint64_t)&controller_data;
+ command.common_cmd.data_len = sizeof(identify_controller_data_t);
+
+ return pass_admin(command);
+}
+
+nvme_command_ertr::future<>
+NormalNBD::pass_admin(nvme_admin_command_t& command) {
+ logger().debug("block: pass admin");
+ try {
+ int ret = fd.ioctl(NVME_IOCTL_ADMIN_CMD, command);
+ if (ret < 0) {
+ logger().debug("block: pass admin failed");
+ return crimson::ct_error::input_output_error::make();
+ }
+ else {
+ return nvme_command_ertr::now();
+ }
+ }
+ catch (...) {
+ logger().debug("block: pass admin failed");
+ return crimson::ct_error::input_output_error::make();
+ }
+}
+
+}
namespace crimson::os::seastore::nvme_device {
+/*
+ * NVMe protocol structures (nvme_XX, identify_XX)
+ *
+ * All NVMe protocol structures below follow NVMe Specification v1.4. NVMe is
+ * a protocol for fast communication between the host and an SSD. Only the
+ * features needed here are modelled, to keep the implementation simple.
+ * NVMeBlockDevice also provides generic command submission APIs for IO and
+ * Admin commands; use pass_through_io() and pass_admin() for that.
+ *
+ * For more information about the NVMe protocol, see https://nvmexpress.org/
+ */
+
+struct nvme_identify_command_t {
+ uint32_t common_dw[10];
+ uint32_t cns : 8;
+ uint32_t reserved : 8;
+  uint32_t controller_id : 16;
+
+ static const uint8_t CNS_NAMESPACE = 0x00;
+ static const uint8_t CNS_CONTROLLER = 0x01;
+};
+
+struct nvme_admin_command_t {
+ union
+ {
+ nvme_passthru_cmd common_cmd;
+ nvme_identify_command_t identify_cmd;
+ };
+
+ static const uint8_t OPCODE_IDENTIFY = 0x06;
+};
+
+struct nvme_version_t {
+ uint32_t major_ver : 16;
+ uint32_t minor_ver : 8;
+ uint32_t tertiary_ver : 8;
+};
+
+struct admin_command_support_t {
+ uint16_t unused : 5;
+ uint16_t support_directives : 1;
+ uint16_t unused2 : 10;
+};
+
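+/*
+ * Byte layout mirrors the Identify Controller data structure of the spec:
+ * the VER field lives at byte offset 80 and OACS at byte offset 256, hence
+ * the padding members below.
+ */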
+struct identify_controller_data_t {
+ union
+ {
+ struct
+ {
+ uint8_t raw[1024];
+ };
+ struct
+ {
+ uint8_t unused[80];
+ nvme_version_t version;
+ uint8_t unused2[172];
+ admin_command_support_t oacs;
+ };
+ };
+};
+
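+/*
+ * Byte layout mirrors the Identify Namespace data structure of the spec:
+ * NPWG lives at byte offset 64 and NPWA at byte offset 66, hence the
+ * padding member below.
+ */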
+struct identify_namespace_data_t {
+ union
+ {
+ struct
+ {
+ uint8_t raw[4096];
+ };
+ struct
+ {
+ uint8_t unused[64];
+ uint16_t npwg;
+ uint16_t npwa;
+ };
+ };
+};
+
+using NVMePassThroughCommand = nvme_passthru_cmd;
using read_ertr = crimson::errorator<
crimson::ct_error::input_output_error,
using nvme_command_ertr = crimson::errorator<
crimson::ct_error::input_output_error>;
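+/*
+ * One io_context_t accompanies each submitted iocb. The background poller
+ * (poll_completion() in nvmedevice.cc) recovers it from io_event::obj, which
+ * points at the embedded iocb (the first member), and then sets done.
+ */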
+struct io_context_t {
+ iocb cb;
+ bool done = false;
+};
+
/*
* Interface between NVMe SSD and its user.
*
uint64_t offset,
bufferptr &bptr) = 0;
+  /*
+   * Multi-stream write
+   *
+   * Gives the device a hint that groups data with similar lifetimes. Data
+   * written with the same stream value is placed together inside the SSD,
+   * which improves write performance (see the illustrative sketch after
+   * this declaration).
+   */
virtual write_ertr::future<> write(
uint64_t offset,
- bufferptr &bptr) = 0;
+ bufferptr &bptr,
+ uint16_t stream = 0) = 0;
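+  /*
+   * Illustrative sketch only (names are hypothetical): a caller separating
+   * short-lived journal writes from long-lived data writes might issue
+   *
+   *   device.write(journal_off, journal_bp, 1);  // hot, short-lived data
+   *   device.write(data_off, data_bp, 2);        // cold, long-lived data
+   *
+   * so the SSD can place the two streams on different media regions.
+   */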
// TODO
virtual int discard(uint64_t offset, uint64_t len) { return 0; }
virtual open_ertr::future<> open(
const std::string& path,
seastar::open_flags mode) = 0;
-
virtual seastar::future<> close() = 0;
+  /*
+   * For passing an NVMe IO or Admin command through to the SSD. The caller
+   * can construct and execute its own NVMe command (see the sketch after
+   * these declarations).
+   */
+ virtual nvme_command_ertr::future<> pass_through_io(
+ NVMePassThroughCommand& command) { return nvme_command_ertr::now(); }
+ virtual nvme_command_ertr::future<> pass_admin(
+ nvme_admin_command_t& command) { return nvme_command_ertr::now(); }
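+  /*
+   * Illustrative sketch of a pass-through admin command (it mirrors what
+   * NormalNBD::identify_controller() does); `device` stands for any
+   * NVMeBlockDevice instance:
+   *
+   *   identify_controller_data_t data = {0, };
+   *   nvme_admin_command_t cmd = {0, };
+   *   cmd.common_cmd.opcode = nvme_admin_command_t::OPCODE_IDENTIFY;
+   *   cmd.identify_cmd.cns = nvme_identify_command_t::CNS_CONTROLLER;
+   *   cmd.common_cmd.addr = (uint64_t)&data;
+   *   cmd.common_cmd.data_len = sizeof(data);
+   *   auto fut = device.pass_admin(cmd);
+   */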
+
+  /*
+   * End-to-End Data Protection
+   *
+   * The NVMe device tracks data integrity with checksum-like protection
+   * information. Clients can offload checksumming to the device to reduce
+   * their CPU utilization.
+   */
+ virtual write_ertr::future<> protected_write(
+ uint64_t offset,
+ bufferptr &bptr,
+ uint16_t stream = 0) { return write_ertr::now(); }
+
+  /*
+   * Data Health
+   *
+   * Returns the list of LBAs whose data is close to becoming corrupted. The
+   * caller can overwrite, unmap or refresh the data on those LBAs to protect
+   * it.
+   */
+ virtual nvme_command_ertr::future<> get_data_health(
+ std::list<uint64_t>& fragile_lbas) { return nvme_command_ertr::now(); }
+
+  /*
+   * Recovery Level
+   *
+   * Regulates how much effort the SSD spends on internal data recovery. A
+   * lower level yields better read latency.
+   */
+ virtual nvme_command_ertr::future<> set_data_recovery_level(
+ uint32_t level) { return nvme_command_ertr::now(); }
+
+  /*
+   * Predictable Latency
+   *
+   * An NVMe device can guarantee IO latency within a predefined time window.
+   * Support for this functionality is still being investigated.
+   */
+};
+
+/*
+ * Implementation of NVMeBlockDevice with POSIX APIs
+ *
+ * NormalNBD provides the NVMe SSD interfaces through POSIX APIs, which are
+ * generally available in most operating environments.
+ */
+class NormalNBD : public NVMeBlockDevice {
+public:
+ NormalNBD() {}
+ ~NormalNBD() override {}
+
+ open_ertr::future<> open(
+ const std::string &in_path,
+ seastar::open_flags mode) override;
+
+ write_ertr::future<> write(
+ uint64_t offset,
+ bufferptr &bptr,
+ uint16_t stream = 0) override;
+
+ read_ertr::future<> read(
+ uint64_t offset,
+ bufferptr &bptr) override;
+
+ nvme_command_ertr::future<> pass_through_io(
+ NVMePassThroughCommand& command) override;
+
+ nvme_command_ertr::future<> pass_admin(
+ nvme_admin_command_t& command) override;
+
+ seastar::future<> close() override;
+
+private:
+ seastar::file_desc fd = seastar::file_desc::from_fd(-1);
+ std::vector<seastar::file_desc> stream_fd;
+ nvme_version_t protocol_version;
+ bool support_multistream = false;
+ std::vector<::io_context_t> ctx;
+ std::thread completion_poller;
+ bool exit = false;
+
+ uint32_t write_life_not_set = 0;
+ uint32_t write_life_max = 1;
+
+ nvme_command_ertr::future<> identify_controller(
+ identify_controller_data_t& controller_data);
+ nvme_command_ertr::future<> identify_namespace(
+ identify_namespace_data_t& namespace_data);
+ void open_for_io(const std::string& in_path, seastar::open_flags mode);
};
}
}
- open_ertr::future<> open(const std::string &in_path, seastar::open_flags mode);
+ open_ertr::future<> open(const std::string &in_path, seastar::open_flags mode) override;
write_ertr::future<> write(
uint64_t offset,
- bufferptr &bptr);
+ bufferptr &bptr,
+ uint16_t stream = 0) override;
read_ertr::future<> read(
uint64_t offset,
- bufferptr &bptr);
+ bufferptr &bptr) override;
- seastar::future<> close();
+ seastar::future<> close() override;
char *buf;
size_t size;
${CMAKE_DL_LIBS}
crimson-seastore)
+add_executable(unittest-seastore-nvmedevice
+ nvmedevice/test_nvmedevice.cc)
+add_ceph_test(unittest-seastore-nvmedevice
+ unittest-seastore-nvmedevice --memory 256M --smp 1)
+target_link_libraries(
+ unittest-seastore-nvmedevice
+ crimson::gtest
+ crimson-seastore
+ aio)
+
add_subdirectory(onode_tree)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/nvmedevice/nvmedevice.h"
+#include "test/crimson/gtest_seastar.h"
+#include "include/stringify.h"
+
+using namespace crimson;
+using namespace crimson::os;
+using namespace crimson::os::seastore;
+using namespace nvme_device;
+
+struct nvdev_test_t : seastar_test_suite_t {
+ NVMeBlockDevice* device;
+ std::string dev_path;
+
+ static const uint64_t DEV_SIZE = 1024 * 1024 * 1024;
+
+ nvdev_test_t() :
+ device(NVMeBlockDevice::create<NormalNBD>()),
+ dev_path("randomblock_manager.test_nvmedevice" + stringify(getpid())) {
+ int fd = ::open(dev_path.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644);
+ ceph_assert(fd >= 0);
+ ::ftruncate(fd, DEV_SIZE);
+ ::close(fd);
+ }
+ ~nvdev_test_t() {
+ ::unlink(dev_path.c_str());
+ delete device;
+ }
+};
+
+static const uint64_t BUF_SIZE = 8192;
+
+struct nvdev_test_block_t {
+ uint8_t data[BUF_SIZE];
+
+ DENC(nvdev_test_block_t, v, p) {
+ DENC_START(1, 1, p);
+ for (uint64_t i = 0 ; i < BUF_SIZE; i++)
+ {
+ denc(v.data[i], p);
+ }
+ DENC_FINISH(p);
+ }
+};
+
+WRITE_CLASS_DENC_BOUNDED(
+ nvdev_test_block_t
+)
+
+TEST_F(nvdev_test_t, write_and_verify_test)
+{
+ run_async([this] {
+ device->open(dev_path, seastar::open_flags::rw).unsafe_get();
+ nvdev_test_block_t original_data;
+ std::minstd_rand0 generator;
+ uint8_t value = generator();
+ memset(original_data.data, value, BUF_SIZE);
+ uint64_t bl_length = 0;
+ {
+ bufferlist bl;
+ encode(original_data, bl);
+ bl_length = bl.length();
+ auto write_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length));
+ bl.begin().copy(bl.length(), write_buf.c_str());
+ device->write(0, write_buf).unsafe_get();
+ }
+
+ nvdev_test_block_t read_data;
+ {
+ auto read_buf = ceph::bufferptr(buffer::create_page_aligned(bl_length));
+ device->read(0, read_buf).unsafe_get();
+ bufferlist bl;
+ bl.push_back(read_buf);
+ auto bliter = bl.cbegin();
+ decode(read_data, bliter);
+ }
+
+ int ret = memcmp(original_data.data, read_data.data, BUF_SIZE);
+ device->close().wait();
+ ASSERT_TRUE(ret == 0);
+ });
+}