From: Haomai Wang
Date: Mon, 4 Jan 2016 09:20:18 +0000 (+0800)
Subject: bluestore: add NVMEDevice backend
X-Git-Tag: v10.0.4~81^2~54
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e76f716ac2e4d7906f2e78dd0380174528f4f35d;p=ceph.git

bluestore: add NVMEDevice backend

Signed-off-by: Haomai Wang
---

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 9107b220e4bf..25c5a3372cdf 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -852,6 +852,16 @@ OPTION(bdev_aio, OPT_BOOL, true)
 OPTION(bdev_aio_poll_ms, OPT_INT, 250)  // milliseconds
 OPTION(bdev_aio_max_queue_depth, OPT_INT, 32)
 
+// Define the whitelist of NVMe controllers to initialize.
+// Users can run 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
+// get the serial number of Intel(R) Fultondale NVMe controllers.
+OPTION(bdev_nvme_serial_number, OPT_STR, "")
+// If true, the OSD will unbind all NVMe devices from the kernel driver and
+// bind them to the uio_pci_generic driver, to prevent the kernel NVMe driver
+// from claiming the device while the OSD is running.
+OPTION(bdev_nvme_unbind_from_kernel, OPT_BOOL, false)
+OPTION(bdev_nvme_retry_count, OPT_INT, -1) // -1 means use the driver's default (4)
+
 OPTION(bluefs_alloc_size, OPT_U64, 1048576)
 OPTION(bluefs_max_prefetch, OPT_U64, 1048576)
 OPTION(bluefs_min_log_runway, OPT_U64, 1048576)  // alloc when we get this low
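For reference, the new options would be consumed from ceph.conf roughly as in
the sketch below (the values are illustrative, not defaults; the serial number
is whatever 'lspci -vvv -d 8086:0953' reports for the controller to claim):

    [osd]
    bdev_nvme_serial_number = CVFT4324002T400BGN
    bdev_nvme_unbind_from_kernel = true
    bdev_nvme_retry_count = 4
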
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include "NVMEDevice.h" +#include "include/types.h" +#include "include/compat.h" +#include "common/errno.h" +#include "common/debug.h" +#include "common/blkdev.h" + +#define dout_subsys ceph_subsys_bdev +#undef dout_prefix +#define dout_prefix *_dout << "bdev " + +void IOContext::aio_wait() +{ + Mutex::Locker l(lock); + // see _aio_thread for waker logic + num_waiting.inc(); + while (num_running.read() > 0 || num_reading.read() > 0) { + dout(10) << __func__ << " " << this + << " waiting for " << num_running.read() << " aios and/or " + << num_reading.read() << " readers to complete" << dendl; + cond.Wait(lock); + } + num_waiting.dec(); + dout(20) << __func__ << " " << this << " done" << dendl; +} + +static void io_complete(void *ctx, const struct nvme_completion *completion) { + if (nvme_completion_is_error(completion)) { + assert(0); + } + + IOContext *ioc; + if (ioc->priv) { + aio_callback(aio_callback_priv, ioc->priv); + } +} + +// ---------------- +#undef dout_prefix +#define dout_prefix *_dout << "bdev(" << path << ") " + +NVMEDevice::NVMEDevice(aio_callback_t cb, void *cbpriv) + : aio_queue(g_conf->bdev_aio_max_queue_depth), + aio_callback(cb), + aio_callback_priv(cbpriv), + aio_stop(false), + aio_thread(this) +{ + zeros = buffer::create_page_aligned(1048576); + zeros.zero(); +} + +int NVMEDevice::_lock() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fd_direct, F_SETLK, &l); + if (r < 0) + return -errno; + return 0; +} + +struct nvme_whitelist { + const char *sn; + int id; +}; + +int NVMEDevice::open(string p) +{ + int r = 0; + dout(1) << __func__ << " path " << path << dendl; + + config_section *sp; + nvme_device *dev; + pci_device *pci_dev; + const char *sn_tag, *id_tag, *val; + int id_tag_i, id = -1; + int i, num_whitelist_controllers = 0; + int controllers_remaining; + nvme_whitelist whitelist[NVME_MAX_CONTROLLERS]; + + pci_system_init(); + + // Search for matching devices + pci_id_match match; + match.vendor_id = PCI_MATCH_ANY; + match.subvendor_id = PCI_MATCH_ANY; + match.subdevice_id = PCI_MATCH_ANY; + match.device_id = PCI_MATCH_ANY; + match.device_class = NVME_CLASS_CODE; + match.device_class_mask = 0xFFFFFF; + + pci_device_iterator *iter = pci_id_match_iterator_create(&match); + + nvme_retry_count = g_conf->bdev_nvme_retry_count; + if (nvme_retry_count < 0) + nvme_retry_count = NVME_DEFAULT_RETRY_COUNT; + + /* Init the whitelist */ + string sn_tag = g_conf->bdev_nvme_serial_number; + if (sn_tag.empty()) { + int r = -ENOENT; + derr << __func__ << " empty serial number: " << cpp_strerror(r) << dendl; + return r; + } + + unbindfromkernel = g_conf->bdev_nvme_unbind_from_kernel; + + char serial_number[128]; + while ((pci_dev = pci_device_next(iter)) != NULL) { + dout(10) << __func__ << " found device at "<< pci_dev->bus << ":" << pci_dev->dev << ":" + << pci_dev->func << " vendor:0x" << pci_dev->vendor_id << " device:0x" << pci_dev->device_id + << " name:" << pci_device_get_device_name(pci_dev) << dendl; + r = pci_device_get_serial_number(pci_dev, serial_number, 128); + if (r < 0) { + dout(10) << __func__ << " failed to get serial number from " << pci_device_get_device_name(pci_dev) << dendl; + continue; + } + + if (sn_tag.compare(sn_tag, serial_number, 16)) { + dout(10) << __func__ << " device serial number not match " << serial_number << dendl; + continue; + } + + if (pci_device_has_kernel_driver(pci_dev)) 
+    if (pci_device_has_kernel_driver(pci_dev)) {
+      if (!pci_device_has_uio_driver(pci_dev)) {
+        // the kernel nvme driver owns the device
+        if (unbindfromkernel) {
+          r = pci_device_switch_to_uio_driver(pci_dev);
+          if (r < 0) {
+            derr << __func__ << " device " << pci_device_get_device_name(pci_dev) << " " << pci_dev->bus
+                 << ":" << pci_dev->dev << ":" << pci_dev->func
+                 << " switch to uio driver failed" << dendl;
+            return r;
+          }
+        } else {
+          derr << __func__ << " device has kernel nvme driver attached, exiting..." << dendl;
+          r = -EBUSY;
+          return r;
+        }
+      }
+    } else {
+      r = pci_device_bind_uio_driver(pci_dev, PCI_UIO_DRIVER);
+      if (r < 0) {
+        derr << __func__ << " device " << pci_device_get_device_name(pci_dev) << " " << pci_dev->bus
+             << ":" << pci_dev->dev << ":" << pci_dev->func
+             << " bind to uio driver failed" << dendl;
+        return r;
+      }
+    }
+
+    /* Claim the device in case of conflict with another process */
+    r = pci_device_claim(pci_dev);
+    if (r < 0) {
+      derr << __func__ << " device " << pci_device_get_device_name(pci_dev) << " " << pci_dev->bus
+           << ":" << pci_dev->dev << ":" << pci_dev->func
+           << " claim failed" << dendl;
+      return r;
+    }
+
+    pci_device_probe(pci_dev);
+
+    ctrlr = nvme_attach(pci_dev);
+    if (!ctrlr) {
+      derr << __func__ << " device attach nvme failed" << dendl;
+      r = -1;
+      return r;
+    }
+
+    int num_ns = nvme_ctrlr_get_num_ns(ctrlr);
+    assert(num_ns >= 1);
+    if (num_ns > 1) {
+      dout(0) << __func__ << " namespace count larger than 1, currently only use the first namespace" << dendl;
+    }
+    // namespace ids are 1-based
+    ns = nvme_ctrlr_get_ns(ctrlr, 1);
+    block_size = nvme_ns_get_sector_size(ns);
+    size = block_size * nvme_ns_get_num_sectors(ns);
+    dout(1) << __func__ << " successfully attached nvme device at " << pci_device_get_device_name(pci_dev)
+            << " " << pci_dev->bus << ":" << pci_dev->dev << ":" << pci_dev->func << dendl;
+    break;
+  }
+
+  pci_iterator_destroy(iter);
+
+  if (!ctrlr) {
+    derr << __func__ << " failed to find an nvme controller with serial number " << sn_tag << dendl;
+    return -ENOENT;
+  }
+
+  dout(1) << __func__ << " size " << size << " (" << pretty_si_t(size) << "B)"
+          << " block_size " << block_size << " (" << pretty_si_t(block_size)
+          << "B)" << dendl;
+
+  r = nvme_register_io_thread();
+  assert(r == 0);
+  r = _aio_start();
+  assert(r == 0);
+  return 0;
+}
+
+void NVMEDevice::close()
+{
+  dout(1) << __func__ << dendl;
+  // stop the polling thread before dropping the io thread registration
+  _aio_stop();
+  nvme_unregister_io_thread();
+  path.clear();
+}
+
+int NVMEDevice::flush()
+{
+  dout(10) << __func__ << " start" << dendl;
+  // no volatile write cache handling yet
+  return 0;
+}
+
+int NVMEDevice::_aio_start()
+{
+  dout(10) << __func__ << dendl;
+  aio_thread.create();
+  return 0;
+}
+
+void NVMEDevice::_aio_stop()
+{
+  dout(10) << __func__ << dendl;
+  aio_stop = true;
+  aio_thread.join();
+  aio_stop = false;
+}
+
+void NVMEDevice::_aio_thread()
+{
+  dout(10) << __func__ << " start" << dendl;
+  while (!aio_stop) {
+    dout(40) << __func__ << " polling" << dendl;
+    // reap everything that is ready; 0 means no completion limit
+    nvme_ctrlr_process_io_completions(ctrlr, 0);
+  }
+  dout(10) << __func__ << " end" << dendl;
+}
+
+int NVMEDevice::aio_write(
+  uint64_t off,
+  bufferlist &bl,
+  IOContext *ioc,
+  bool buffered)
+{
+  uint64_t len = bl.length();
+  dout(20) << __func__ << " " << off << "~" << len << dendl;
+  assert(off % block_size == 0);
+  assert(len % block_size == 0);
+  assert(len > 0);
+  assert(off < size);
+  assert(off + len <= size);
+
+  if (!bl.is_n_page_sized() || !bl.is_page_aligned()) {
+    dout(20) << __func__ << " rebuilding buffer to be page-aligned" << dendl;
+    bl.rebuild();
+  }
+
+  ioc->backend = this;
+  ioc->num_running.inc();
+  // the driver expects the offset and length in sectors, not bytes
+  int rc = nvme_ns_cmd_write(ns, bl.c_str(), off / block_size,
+                             len / block_size, io_complete, ioc);
+  if (rc < 0) {
+    derr << __func__ << " failed to do write command" << dendl;
+    ioc->num_running.dec();
+    return rc;
+  }
+
<< "~" << len << dendl; + + return 0; +} + +int NVMEDevice::aio_zero( + uint64_t off, + uint64_t len, + IOContext *ioc) +{ + dout(5) << __func__ << " " << off << "~" << len << dendl; + assert(off % block_size == 0); + assert(len % block_size == 0); + assert(len > 0); + assert(off < size); + assert(off + len <= size); + + bufferlist bl; + while (len > 0) { + bufferlist t; + t.append(zeros, 0, MIN(zeros.length(), len)); + len -= t.length(); + bl.claim_append(t); + } + bufferlist foo; + // note: this works with aio only becaues the actual buffer is + // this->zeros, which is page-aligned and never freed. + return aio_write(off, bl, ioc, false); +} + +int NVMEDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc, + bool buffered) +{ + dout(5) << __func__ << " " << off << "~" << len << dendl; + assert(off % block_size == 0); + assert(len % block_size == 0); + assert(len > 0); + assert(off < size); + assert(off + len <= size); + + bufferptr p = buffer::create_page_aligned(len); + int r = nvme_ns_cmd_read(ns, p.c_str(), off, len / block_size, io_complete, ioc->priv); + if (r < 0) { + r = -errno; + derr << __func__ << " failed to read" << dendl; + return r; + } + pbl->clear(); + pbl->push_back(p); + + return r < 0 ? r : 0; +} + +int NVMEDevice::invalidate_cache(uint64_t off, uint64_t len) +{ + dout(5) << __func__ << " " << off << "~" << len << dendl; + return r; +} diff --git a/src/os/bluestore/NVMEDevice.h b/src/os/bluestore/NVMEDevice.h new file mode 100644 index 000000000000..30cda26b389d --- /dev/null +++ b/src/os/bluestore/NVMEDevice.h @@ -0,0 +1,117 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky + * + * Author: Haomai Wang + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OS_BLUESTORE_NVMEDEVICE +#define CEPH_OS_BLUESTORE_NVMEDEVICE + +#include "include/interval_set.h" + +/// track in-flight io +struct IOContext { + void *priv; + + Mutex lock; + Cond cond; + //interval_set blocks; ///< blocks with aio in flight + + list pending_aios; ///< not yet submitted + list running_aios; ///< submitting or submitted + atomic_t num_pending; + atomic_t num_running; + atomic_t num_reading; + atomic_t num_waiting; + + IOContext(void *p) + : priv(p), + lock("IOContext::lock") + {} + + // no copying + IOContext(const IOContext& other); + IOContext &operator=(const IOContext& other); + + bool has_aios() { + Mutex::Locker l(lock); + return num_pending.read() + num_running.read(); + } + + void aio_wait(); +}; + +typedef void (*blockdev_completion_cb)(void *ref, int status); + +class NVMEDevice { + private: + /** + * points to pinned, physically contiguous memory region; + * contains 4KB IDENTIFY structure for controller which is + * target for CONTROLLER IDENTIFY command during initialization + */ + nvme_controller *ctrlr; + nvme_namespace *ns; + uint64_t blocklen; + int unbindfromkernel = 0; + + uint64_t size; + uint64_t block_size; + + aio_callback_t aio_callback; + void *aio_callback_priv; + bool aio_stop; + + struct AioCompletionThread : public Thread { + NVMEDevice *bdev; + AioCompletionThread(NVMEDevice *b) : bdev(b) {} + void *entry() { + bdev->_aio_thread(); + return NULL; + } + } aio_thread; + + void _aio_thread(); + int _aio_start(); + void _aio_stop(); + + public: + NVMEDevice(aio_callback_t cb, void *cbpriv); + + void aio_submit(IOContext *ioc) {} + + uint64_t get_size() const { + return size; + } + uint64_t get_block_size() const { + return block_size; + } + + int read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc, + bool buffered); + + int aio_write(uint64_t off, bufferlist& bl, + IOContext *ioc, + bool buffered); + int aio_zero(uint64_t off, uint64_t len, + IOContext *ioc); + int flush(); + + // for managing buffered readers/writers + int invalidate_cache(uint64_t off, uint64_t len); + int open(string path); + void close(); +}; + +#endif