seastar::open_flags mode) {
return seastar::do_with(in_path, [this, mode](auto& in_path) {
return seastar::file_stat(in_path).then([this, mode, in_path](auto stat) {
- this->block_size = stat.block_size;
- this->size = stat.size;
+ size = stat.size;
return seastar::open_file_dma(in_path, mode).then([=](auto file) {
- this->device = file;
+ device = file;
logger().debug("open");
// Get SSD's features from identify_controller and namespace command.
// Do identify_controller first, and then identify_namespace.
- return identify_controller().safe_then([this](auto id_controller_data) {
+ return identify_controller().safe_then([this, in_path, mode](
+ auto id_controller_data) {
support_multistream = id_controller_data.oacs.support_directives;
- return identify_namespace().safe_then([this] (auto id_namespace_data) {
+ if (support_multistream) {
+ stream_id_count = WRITE_LIFE_MAX;
+ }
+ return identify_namespace().safe_then([this, in_path, mode] (
+ auto id_namespace_data) {
+ // LBA format provides LBA size which is power of 2. LBA is the
+ // minimum size of read and write.
block_size = (1 << id_namespace_data.lbaf0.lbads);
data_protection_type = id_namespace_data.dps.protection_type;
data_protection_enabled = (data_protection_type > 0);
write_granularity = block_size * (id_namespace_data.npwg + 1);
write_alignment = block_size * (id_namespace_data.npwa + 1);
}
- return seastar::now();
+ return open_for_io(in_path, mode);
});
- });
+ }).handle_error(crimson::ct_error::input_output_error::handle([this, in_path, mode]{
+ logger().error("open: id ctrlr failed. open without ioctl");
+ return open_for_io(in_path, mode);
+ }), crimson::ct_error::pass_further_all{});
+ });
+ });
+ });
+}
+
+// Opens one DMA file handle per write stream and tags each handle with a
+// write-lifetime hint.
+//
+// io_device is resized to stream_id_count; slot i is opened on in_path with
+// mode and receives hint value i through fcntl(F_SET_FILE_RW_HINT).
+// Resolves once every slot has been opened and hinted.
+open_ertr::future<> PosixNVMeDevice::open_for_io(
+  const std::string& in_path,
+  seastar::open_flags mode) {
+  io_device.resize(stream_id_count);
+  // Always start from the first slot so that a repeated open never indexes
+  // past the end of io_device.
+  stream_index_to_open = WRITE_LIFE_NOT_SET;
+  return seastar::do_for_each(io_device, [this, in_path, mode](auto &target_device) {
+    return seastar::open_file_dma(in_path, mode).then([this](
+      auto file) {
+      io_device[stream_index_to_open] = file;
+      // F_SET_FILE_RW_HINT reads a 64-bit value through the pointer, so hand
+      // it a uint64_t copy of the index (the member itself is only 32 bits
+      // wide). do_with keeps the copy alive until the fcntl future resolves.
+      return seastar::do_with(
+        static_cast<uint64_t>(stream_index_to_open),
+        [this](auto &hint) {
+          return io_device[stream_index_to_open].fcntl(
+            F_SET_FILE_RW_HINT,
+            reinterpret_cast<uintptr_t>(&hint));
+        }).then([this](auto ret) {
+        stream_index_to_open++;
+        return seastar::now();
});
- }).handle_exception([](auto e) -> open_ertr::future<> {
- logger().error("open: got error{}", e);
- return crimson::ct_error::input_output_error::make();
});
});
}
auto length = bptr.length();
assert((length % block_size) == 0);
- return device.dma_write(offset, bptr.c_str(), length).handle_exception(
+ uint16_t supported_stream = stream;
+ if (stream >= stream_id_count) {
+ supported_stream = WRITE_LIFE_NOT_SET;
+ }
+ return io_device[supported_stream].dma_write(
+ offset, bptr.c_str(), length).handle_exception(
[](auto e) -> write_ertr::future<size_t> {
- logger().error("write: dma_write got error{}", e);
- return crimson::ct_error::input_output_error::make();
- }).then([length](auto result) -> write_ertr::future<> {
- if (result != length) {
- logger().error("write: dma_write got error with not proper length");
- return crimson::ct_error::input_output_error::make();
- }
- return write_ertr::now();
- });
+ logger().error("write: dma_write got error{}", e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([length](auto result) -> write_ertr::future<> {
+ if (result != length) {
+ logger().error("write: dma_write got error with not proper length");
+ return crimson::ct_error::input_output_error::make();
+ }
+ return write_ertr::now();
+ });
}
read_ertr::future<> PosixNVMeDevice::read(
seastar::future<> PosixNVMeDevice::close() { // closes `device`, then every handle in `io_device`
logger().debug(" close ");
- return device.close();
+ return device.close().then([this]() { // NOTE(review): skipped if device.close() fails, leaving io_device handles open — confirm intended
+ return seastar::do_for_each(io_device, [](auto target_device) { // each seastar::file is taken by value here
+ return target_device.close();
+ });
+ });
}
nvme_command_ertr::future<nvme_identify_controller_data_t>
nvme_command_ertr::future<int> PosixNVMeDevice::pass_admin(
nvme_admin_command_t& admin_cmd) {
- return device.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd);
+ return device.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd).handle_exception(
+ [](auto e)->nvme_command_ertr::future<int> {
+ logger().error("pass_admin: ioctl failed");
+ return crimson::ct_error::input_output_error::make();
+ });
}
nvme_command_ertr::future<int> PosixNVMeDevice::pass_through_io(
namespace crimson::os::seastore::nvme_device {
+// Write-lifetime hint definitions, taken from blk/BlockDevice.h.
+#if defined(__linux__)
+#if !defined(F_SET_FILE_RW_HINT)
+#define F_LINUX_SPECIFIC_BASE 1024
+#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
+#endif
+// These values match the Linux definitions; see
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+#define WRITE_LIFE_NOT_SET 0 // No hint information set
+#define WRITE_LIFE_NONE 1 // No hints about write life time
+#define WRITE_LIFE_SHORT 2 // Data written has a short life time
+#define WRITE_LIFE_MEDIUM 3 // Data written has a medium life time
+#define WRITE_LIFE_LONG 4 // Data written has a long life time
+#define WRITE_LIFE_EXTREME 5 // Data written has an extremely long life time
+#define WRITE_LIFE_MAX 6 // One past the largest hint value, i.e. the number of hint values
+#else
+// On systems that do not have WRITE_LIFE_* hints, only one FD is used,
+// so every hint collapses to the same value.
+#define WRITE_LIFE_NOT_SET 0 // No hint information set
+#define WRITE_LIFE_NONE 0 // No hints about write life time
+#define WRITE_LIFE_SHORT 0 // Data written has a short life time
+#define WRITE_LIFE_MEDIUM 0 // Data written has a medium life time
+#define WRITE_LIFE_LONG 0 // Data written has a long life time
+#define WRITE_LIFE_EXTREME 0 // Data written has an extremely long life time
+#define WRITE_LIFE_MAX 1
+#endif
+
/*
* NVMe protocol structures (nvme_XX, identify_XX)
*
*
* For more information about the NVMe protocol, refer to https://nvmexpress.org/
*/
-
struct nvme_identify_command_t { // Identify-specific view of an admin command; unioned with nvme_passthru_cmd in nvme_admin_command_t
uint32_t common_dw[10]; // presumably the leading dwords shared with nvme_passthru_cmd — confirm against that struct
+
uint32_t cns : 8; // selector field; the CNS_* constants below are values for it
uint32_t reserved : 8;
- uint32_t cntroller_id : 16;
+ uint32_t cnt_id : 16; // 16-bit controller identifier field
static const uint8_t CNS_NAMESPACE = 0x00; // presumably selects the identify-namespace data — confirm against the NVMe spec
static const uint8_t CNS_CONTROLLER = 0x01; // presumably selects the identify-controller data — confirm against the NVMe spec
};
struct nvme_admin_command_t {
- union
- {
- nvme_passthru_cmd common_cmd;
- nvme_identify_command_t identify_cmd;
+ union {
+ nvme_passthru_cmd common;
+ nvme_identify_command_t identify;
};
static const uint8_t OPCODE_IDENTIFY = 0x06;
class NVMeBlockDevice {
protected:
uint64_t size = 0;
+
+ // LBA Size
uint64_t block_size = 4096;
uint64_t write_granularity = 4096;
* For passing an NVMe IO or Admin command through to the SSD.
* The caller can construct and execute its own NVMe command.
*/
- virtual nvme_command_ertr::future<> pass_through_io(
- const NVMePassThroughCommand& command) { return nvme_command_ertr::now(); }
- virtual nvme_command_ertr::future<> pass_admin(
- const nvme_admin_command_t& command) { return nvme_command_ertr::now(); }
+ virtual nvme_command_ertr::future<int> pass_through_io(
+ nvme_io_command_t& command) { return seastar::make_ready_future<int>(0); } // base-class default: ignores `command` and resolves to 0
+ virtual nvme_command_ertr::future<int> pass_admin(
+ nvme_admin_command_t& command) { return seastar::make_ready_future<int>(0); } // base-class default: ignores `command` and resolves to 0
/*
* End-to-End Data Protection
private:
// identify_controller/namespace are used to get SSD internal information such
- // as supported features, NPWG and NPWA;
+ // as supported features, NPWG and NPWA
nvme_command_ertr::future<nvme_identify_controller_data_t> identify_controller();
nvme_command_ertr::future<nvme_identify_namespace_data_t> identify_namespace();
nvme_command_ertr::future<int> get_nsid();
+ open_ertr::future<> open_for_io(
+ const std::string& in_path,
+ seastar::open_flags mode);
+
seastar::file device;
+ std::vector<seastar::file> io_device;
+ uint32_t stream_index_to_open = WRITE_LIFE_NOT_SET;
+ uint32_t stream_id_count = 1; // stream is disabled, defaultly.
};