From 27b7ccacd4d45689880a35081a3d13f027008c2d Mon Sep 17 00:00:00 2001 From: Ziye Yang Date: Mon, 18 Apr 2022 07:40:24 +0800 Subject: [PATCH] blk/spdk: Add the support to use nvme device provided by NVMe-of Target This patch adds support for using an NVMe device provided by an NVMe-oF target. Signed-off-by: Ziye Yang --- PendingReleaseNotes | 2 + .../configuration/bluestore-config-ref.rst | 9 +- src/blk/spdk/NVMEDevice.cc | 83 +++++++++++++------ src/os/bluestore/BlueStore.cc | 7 +- 4 files changed, 69 insertions(+), 32 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index f648a0e4868f0..7f4381b34455c 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -3,3 +3,5 @@ * RGW's default backend for `rgw_enable_ops_log` changed from RADOS to file. The default value of `rgw_ops_log_rados` is now false, and `rgw_ops_log_file_path` defaults to "/var/log/ceph/ops-log-$cluster-$name.log". +* The SPDK backend for BlueStore is now able to connect to an NVMeoF target. + Please note that this is not an officially supported feature. diff --git a/doc/rados/configuration/bluestore-config-ref.rst b/doc/rados/configuration/bluestore-config-ref.rst index d3f5ba994abe4..493e529f76b01 100644 --- a/doc/rados/configuration/bluestore-config-ref.rst +++ b/doc/rados/configuration/bluestore-config-ref.rst @@ -357,14 +357,19 @@ The device selector always has the form of ``DDDD:BB:DD.FF`` or ``DDDD.BB.DD.FF` and then set:: - bluestore_block_path = spdk:0000:01:00.0 + bluestore_block_path = "spdk:trtype:PCIe traddr:0000:01:00.0" Where ``0000:01:00.0`` is the device selector found in the output of ``lspci`` command above. 
+You may also specify a remote NVMeoF target over the TCP transport as in the +following example:: + + bluestore_block_path = "spdk:trtype:TCP traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1" + To run multiple SPDK instances per node, you must specify the amount of dpdk memory in MB that each instance will use, to make sure each -instance uses its own dpdk memory +instance uses its own DPDK memory. In most cases, a single device can be used for data, DB, and WAL. We describe this strategy as *colocating* these components. Be sure to enter the below diff --git a/src/blk/spdk/NVMEDevice.cc b/src/blk/spdk/NVMEDevice.cc index 80377cd9b976d..4461f6a072d08 100644 --- a/src/blk/spdk/NVMEDevice.cc +++ b/src/blk/spdk/NVMEDevice.cc @@ -55,6 +55,9 @@ static constexpr uint32_t data_buffer_size = 8192; static constexpr uint16_t inline_segment_num = 32; +/* Default to 10 seconds for the keep alive value. This value is arbitrary. */ +static constexpr uint32_t nvme_ctrlr_keep_alive_timeout_in_ms = 10000; + static void io_complete(void *t, const struct spdk_nvme_cpl *completion); struct IORequest { @@ -78,6 +81,7 @@ class SharedDriverData { spdk_nvme_ns *ns; uint32_t block_size = 0; uint64_t size = 0; + std::thread admin_thread; public: std::vector registered_devices; @@ -90,12 +94,30 @@ class SharedDriverData { ns(ns_) { block_size = spdk_nvme_ns_get_extended_sector_size(ns); size = spdk_nvme_ns_get_size(ns); + if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + return; + } + + // For Non-PCIe transport, we need to send keep-alive periodically. 
+ admin_thread = std::thread( + [this]() { + int rc; + while (true) { + rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr); + ceph_assert(rc >= 0); + sleep(1); + } + } + ); } bool is_equal(const spdk_nvme_transport_id& trid2) const { return spdk_nvme_transport_id_compare(&trid, &trid2) == 0; } ~SharedDriverData() { + if (admin_thread.joinable()) { + admin_thread.join(); + } } void register_device(NVMEDevice *device) { @@ -146,7 +168,7 @@ class SharedDriverQueueData { struct spdk_nvme_io_qpair_opts opts = {}; spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); opts.qprio = SPDK_NVME_QPRIO_URGENT; - // usable queue depth should minus 1 to aovid overflow. + // usable queue depth should minus 1 to avoid overflow. max_queue_depth = opts.io_queue_size - 1; qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); ceph_assert(qpair != NULL); @@ -478,23 +500,31 @@ static NVMEManager manager; static bool probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr_opts *opts) { NVMEManager::ProbeContext *ctx = static_cast(cb_ctx); + bool do_attach = false; - if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { - dout(0) << __func__ << " only probe local nvme device" << dendl; - return false; + if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + do_attach = spdk_nvme_transport_id_compare(&ctx->trid, trid) == 0; + if (!do_attach) { + dout(0) << __func__ << " device traddr (" << ctx->trid.traddr + << ") not match " << trid->traddr << dendl; + } + } else { + // for non-pcie devices, should always match the specified trid + assert(!spdk_nvme_transport_id_compare(&ctx->trid, trid)); + do_attach = true; } - dout(0) << __func__ << " found device at: " - << "trtype=" << spdk_nvme_transport_id_trtype_str(trid->trtype) << ", " - << "traddr=" << trid->traddr << dendl; - if (spdk_nvme_transport_id_compare(&ctx->trid, trid)) { - dout(0) << __func__ << " device traddr (" << ctx->trid.traddr << ") not match " << trid->traddr << dendl; - 
return false; - } + if (do_attach) { + dout(0) << __func__ << " found device at: " + << "trtype=" << spdk_nvme_transport_id_trtype_str(trid->trtype) << ", " + << "traddr=" << trid->traddr << dendl; - opts->io_queue_size = UINT16_MAX; + opts->io_queue_size = UINT16_MAX; + opts->io_queue_requests = UINT16_MAX; + opts->keep_alive_timeout_ms = nvme_ctrlr_keep_alive_timeout_in_ms; + } - return true; + return do_attach; } static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, @@ -543,12 +573,6 @@ int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData ** } } - struct spdk_pci_addr pci_addr; - int rc = spdk_pci_addr_parse(&pci_addr, trid.traddr); - if (rc < 0) { - derr << __func__ << " invalid transport address: " << trid.traddr << dendl; - return -ENOENT; - } auto coremask_arg = g_conf().get_val("bluestore_spdk_coremask"); int m_core_arg = find_first_bitset(coremask_arg); // at least one core is needed for using spdk @@ -563,18 +587,24 @@ int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData ** if (!dpdk_thread.joinable()) { dpdk_thread = std::thread( - [this, coremask_arg, m_core_arg, mem_size_arg, pci_addr]() { + [this, coremask_arg, m_core_arg, mem_size_arg, trid]() { struct spdk_env_opts opts; - struct spdk_pci_addr addr = pci_addr; + struct spdk_pci_addr addr; int r; - spdk_env_opts_init(&opts); + bool local_pci_device = false; + int rc = spdk_pci_addr_parse(&addr, trid.traddr); + if (!rc) { + local_pci_device = true; + opts.pci_whitelist = &addr; + opts.num_pci_addr = 1; + } + + spdk_env_opts_init(&opts); opts.name = "nvme-device-manager"; opts.core_mask = coremask_arg.c_str(); opts.master_core = m_core_arg; opts.mem_size = mem_size_arg; - opts.pci_whitelist = &addr; - opts.num_pci_addr = 1; spdk_env_init(&opts); spdk_unaffinitize_thread(); @@ -583,7 +613,7 @@ int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData ** if (!probe_queue.empty()) { ProbeContext* ctxt = 
probe_queue.front(); probe_queue.pop_front(); - r = spdk_nvme_probe(NULL, ctxt, probe_cb, attach_cb, NULL); + r = spdk_nvme_probe(local_pci_device ? NULL : &trid, ctxt, probe_cb, attach_cb, NULL); if (r < 0) { ceph_assert(!ctxt->driver); derr << __func__ << " device probe nvme failed" << dendl; @@ -714,7 +744,8 @@ int NVMEDevice::open(const string& p) return r; } if (int r = manager.try_get(trid, &driver); r < 0) { - derr << __func__ << " failed to get nvme device with transport address " << trid.traddr << dendl; + derr << __func__ << " failed to get nvme device with transport address " + << trid.traddr << " type " << trid.trtype << dendl; return r; } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 90046936983fb..b4225f5a06ca0 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6870,12 +6870,11 @@ int BlueStore::_setup_block_symlink_or_file( return r; } // write the Transport ID of the NVMe device - // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0" + // a transport id for PCIe looks like: "trtype:PCIe traddr:0000:02:00.0" // where "0000:02:00.0" is the selector of a PCI device, see // the first column of "lspci -mm -n -D" - string trid{"trtype:PCIe "}; - trid += "traddr:"; - trid += epath.substr(strlen(SPDK_PREFIX)); + // a transport id for TCP looks like: "trtype:TCP adrfam:IPv4 traddr:172.31.89.152 trsvcid:4420" + string trid = epath.substr(strlen(SPDK_PREFIX)); r = ::write(fd, trid.c_str(), trid.size()); ceph_assert(r == static_cast(trid.size())); dout(1) << __func__ << " created " << name << " symlink to " -- 2.39.5