From: Alan Somers Date: Mon, 25 Sep 2017 20:53:14 +0000 (-0600) Subject: os/bluestore: support for FreeBSD X-Git-Tag: v14.1.0~559^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9ae94e48be813d1b5cd5c3c8eaeb62a880c0ccc2;p=ceph.git os/bluestore: support for FreeBSD Signed-off-by: Alan Somers --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 257f864dcc04..d2935a36f8d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,8 +258,13 @@ endif() option(WITH_BLUESTORE "Bluestore OSD backend" ON) if(WITH_BLUESTORE) - find_package(aio) - set(HAVE_LIBAIO ${AIO_FOUND}) + if(LINUX) + find_package(aio) + set(HAVE_LIBAIO ${AIO_FOUND}) + elseif(FREEBSD) + # POSIX AIO is integrated into FreeBSD kernel, and exposed by libc. + set(HAVE_POSIXAIO ON) + endif() endif() if(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|amd64|x86_64|AMD64|aarch64") @@ -285,7 +290,7 @@ if(WITH_PMEM) endif() if(WITH_BLUESTORE) - if(NOT AIO_FOUND AND NOT WITH_SPDK AND NOT WITH_PMEM) + if(NOT AIO_FOUND AND NOT HAVE_POSIXAIO AND NOT WITH_SPDK AND NOT WITH_PMEM) message(SEND_ERROR "WITH_BLUESTORE is ON, " "but none of the bluestore backends is enabled. 
" "Please install libaio, or enable WITH_SPDK or WITH_PMEM (experimental)") diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake index 749409d76960..cfbc80240cd6 100644 --- a/src/include/config-h.in.cmake +++ b/src/include/config-h.in.cmake @@ -93,6 +93,9 @@ /* Defined if you have libaio */ #cmakedefine HAVE_LIBAIO +/* Defined if you have POSIX AIO */ +#cmakedefine HAVE_POSIXAIO + /* Defined if OpenLDAP enabled */ #cmakedefine HAVE_OPENLDAP diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt index 9e30c2ad4de3..8f5d6b70791a 100644 --- a/src/os/CMakeLists.txt +++ b/src/os/CMakeLists.txt @@ -35,7 +35,7 @@ if(WITH_BLUESTORE) ) endif(WITH_BLUESTORE) -if(HAVE_LIBAIO) +if(HAVE_LIBAIO OR HAVE_POSIXAIO) list(APPEND libos_srcs bluestore/KernelDevice.cc bluestore/aio.cc) diff --git a/src/os/bluestore/BlockDevice.cc b/src/os/bluestore/BlockDevice.cc index ac2df83b7353..bba9a96cd680 100644 --- a/src/os/bluestore/BlockDevice.cc +++ b/src/os/bluestore/BlockDevice.cc @@ -19,7 +19,7 @@ #include "BlockDevice.h" -#if defined(HAVE_LIBAIO) +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) #include "KernelDevice.h" #endif @@ -63,7 +63,7 @@ uint64_t IOContext::get_num_ios() const // a configurable (with different hdd and ssd defaults), and add // that to the bytes value. uint64_t ios = 0; -#ifdef HAVE_LIBAIO +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) for (auto& p : pending_aios) { ios += p.iov.size(); } @@ -77,7 +77,7 @@ uint64_t IOContext::get_num_ios() const void IOContext::release_running_aios() { ceph_assert(!num_running); -#ifdef HAVE_LIBAIO +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) // release aio contexts (including pinned buffers). 
running_aios.clear(); #endif @@ -120,7 +120,7 @@ BlockDevice *BlockDevice::create(CephContext* cct, const string& path, return new PMEMDevice(cct, cb, cbpriv); } #endif -#if defined(HAVE_LIBAIO) +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) if (type == "kernel") { return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv); } diff --git a/src/os/bluestore/BlockDevice.h b/src/os/bluestore/BlockDevice.h index 92c1443546b4..da62e88bb3c0 100644 --- a/src/os/bluestore/BlockDevice.h +++ b/src/os/bluestore/BlockDevice.h @@ -29,7 +29,7 @@ #include "acconfig.h" #include "common/ceph_mutex.h" -#ifdef HAVE_LIBAIO +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) #include "ceph_aio.h" #endif #include "include/ceph_assert.h" @@ -70,7 +70,7 @@ public: std::atomic_int total_nseg = {0}; #endif -#ifdef HAVE_LIBAIO +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) std::list<aio_t> pending_aios; ///< not yet submitted std::list<aio_t> running_aios; ///< submitting or submitted #endif diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc index 0cb62627e51a..2fd3a489a3d2 100644 --- a/src/os/bluestore/KernelDevice.cc +++ b/src/os/bluestore/KernelDevice.cc @@ -24,6 +24,9 @@ #include "include/stringify.h" #include "common/blkdev.h" #include "common/errno.h" +#if defined(__FreeBSD__) +#include "bsm/audit_errno.h" +#endif #include "common/debug.h" #include "common/align.h" @@ -451,9 +454,14 @@ static bool is_expected_ioerr(const int r) { // https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135 return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC || - r == -ENOLINK || r == -EREMOTEIO || r == -EBADE || + r == -ENOLINK || r == -EREMOTEIO || r == -EAGAIN || r == -EIO || r == -ENODATA || r == -EILSEQ || r == -ENOMEM || - r == -EAGAIN || r == -EREMCHG || r == -EIO); +#if defined(__linux__) + r == -EREMCHG || r == -EBADE +#elif defined(__FreeBSD__) + r == - BSM_ERRNO_EREMCHG || r == -BSM_ERRNO_EBADE +#endif + ); } void KernelDevice::_aio_thread() @@ 
-729,6 +737,7 @@ int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int w derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl; return r; } +#ifdef HAVE_SYNC_FILE_RANGE if (buffered) { // initiate IO (but do not wait) r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE); @@ -738,6 +747,7 @@ int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int w return r; } } +#endif io_since_flush.store(true); diff --git a/src/os/bluestore/aio.cc b/src/os/bluestore/aio.cc index 46856b85b61d..379bd9c9efa6 100644 --- a/src/os/bluestore/aio.cc +++ b/src/os/bluestore/aio.cc @@ -22,20 +22,37 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end, // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds int attempts = 16; int delay = 125; + int r; aio_iter cur = begin; - struct iocb *piocb[aios_size]; + struct aio_t *piocb[aios_size]; int left = 0; while (cur != end) { cur->priv = priv; - *(piocb+left) = &cur->iocb; + *(piocb+left) = &(*cur); ++left; ++cur; } ceph_assert(aios_size >= left); int done = 0; while (left > 0) { - int r = io_submit(ctx, std::min(left, max_iodepth), piocb + done); +#if defined(HAVE_LIBAIO) + r = io_submit(ctx, std::min(left, max_iodepth), (struct iocb**)(piocb + done)); +#elif defined(HAVE_POSIXAIO) + if (piocb[done]->n_aiocb == 1) { + // TODO: consider batching multiple reads together with lio_listio + piocb[done]->aio.aiocb.aio_sigevent.sigev_notify = SIGEV_KEVENT; + piocb[done]->aio.aiocb.aio_sigevent.sigev_notify_kqueue = ctx; + piocb[done]->aio.aiocb.aio_sigevent.sigev_value.sival_ptr = piocb[done]; + r = aio_read(&piocb[done]->aio.aiocb); + } else { + struct sigevent sev; + sev.sigev_notify = SIGEV_KEVENT; + sev.sigev_notify_kqueue = ctx; + sev.sigev_value.sival_ptr = piocb[done]; + r = lio_listio(LIO_NOWAIT, &piocb[done]->aio.aiocbp, piocb[done]->n_aiocb, &sev); + } +#endif if (r < 0) { if (r == -EAGAIN && attempts-- > 0) { usleep(delay); @@ -56,7 
+73,11 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end, int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max) { +#if defined(HAVE_LIBAIO) io_event event[max]; +#elif defined(HAVE_POSIXAIO) + struct kevent events[max]; +#endif struct timespec t = { timeout_ms / 1000, (timeout_ms % 1000) * 1000 * 1000 @@ -64,12 +85,40 @@ int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max) int r = 0; do { +#if defined(HAVE_LIBAIO) r = io_getevents(ctx, 1, max, event, &t); +#elif defined(HAVE_POSIXAIO) + r = kevent(ctx, NULL, 0, events, max, &t); + if (r < 0) + r = -errno; +#endif } while (r == -EINTR); for (int i=0; i<r; ++i) { +#if defined(HAVE_LIBAIO) paio[i] = (aio_t *)event[i].obj; paio[i]->rval = event[i].res; +#else + paio[i] = (aio_t*)events[i].udata; + if (paio[i]->n_aiocb == 1) { + paio[i]->rval = aio_return(&paio[i]->aio.aiocb); + } else { + // Emulate the return value of pwritev. I can't find any documentation + // for what the value of io_event.res is supposed to be. I'm going to + // assume that it's just like pwritev/preadv/pwrite/pread. 
+ paio[i]->rval = 0; + for (int j = 0; j < paio[i]->n_aiocb; j++) { + int res = aio_return(&paio[i]->aio.aiocbp[j]); + if (res < 0) { + paio[i]->rval = res; + break; + } else { + paio[i]->rval += res; + } + } + free(paio[i]->aio.aiocbp); + } +#endif } return r; } diff --git a/src/os/bluestore/ceph_aio.h b/src/os/bluestore/ceph_aio.h index bc6acb7ec5f3..ab03388631b8 100644 --- a/src/os/bluestore/ceph_aio.h +++ b/src/os/bluestore/ceph_aio.h @@ -2,7 +2,15 @@ // vim: ts=8 sw=2 smarttab #pragma once -# include <libaio.h> + +#include "acconfig.h" + +#if defined(HAVE_LIBAIO) +#include <libaio.h> +#elif defined(HAVE_POSIXAIO) +#include <aio.h> +#include <sys/event.h> +#endif #include <boost/intrusive/list.hpp> #include <boost/container/small_vector.hpp> @@ -11,7 +19,16 @@ #include "include/types.h" struct aio_t { +#if defined(HAVE_LIBAIO) struct iocb iocb{}; // must be first element; see shenanigans in aio_queue_t +#elif defined(HAVE_POSIXAIO) + // static long aio_listio_max = -1; + union { + struct aiocb aiocb; + struct aiocb *aiocbp; + } aio; + int n_aiocb; +#endif void *priv; int fd; boost::container::small_vector<iovec,4> iov; @@ -27,13 +44,34 @@ struct aio_t { void pwritev(uint64_t _offset, uint64_t len) { offset = _offset; length = len; +#if defined(HAVE_LIBAIO) io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset); +#elif defined(HAVE_POSIXAIO) + n_aiocb = iov.size(); + aio.aiocbp = (struct aiocb*)calloc(iov.size(), sizeof(struct aiocb)); + for (int i = 0; i < iov.size(); i++) { + aio.aiocbp[i].aio_fildes = fd; + aio.aiocbp[i].aio_offset = offset; + aio.aiocbp[i].aio_buf = iov[i].iov_base; + aio.aiocbp[i].aio_nbytes = iov[i].iov_len; + aio.aiocbp[i].aio_lio_opcode = LIO_WRITE; + offset += iov[i].iov_len; + } +#endif } void pread(uint64_t _offset, uint64_t len) { offset = _offset; length = len; bufferptr p = buffer::create_small_page_aligned(length); +#if defined(HAVE_LIBAIO) io_prep_pread(&iocb, fd, p.c_str(), length, offset); +#elif defined(HAVE_POSIXAIO) + n_aiocb = 1; + aio.aiocb.aio_fildes = fd; + aio.aiocb.aio_buf = p.c_str(); + aio.aiocb.aio_nbytes = length; + 
aio.aiocb.aio_offset = offset; +#endif bl.append(std::move(p)); } @@ -53,7 +91,11 @@ typedef boost::intrusive::list< struct aio_queue_t { int max_iodepth; +#if defined(HAVE_LIBAIO) io_context_t ctx; +#elif defined(HAVE_POSIXAIO) + int ctx; +#endif typedef list<aio_t>::iterator aio_iter; @@ -67,6 +109,7 @@ struct aio_queue_t { int init() { ceph_assert(ctx == 0); +#if defined(HAVE_LIBAIO) int r = io_setup(max_iodepth, &ctx); if (r < 0) { if (ctx) { @@ -75,10 +118,21 @@ struct aio_queue_t { } } return r; +#elif defined(HAVE_POSIXAIO) + ctx = kqueue(); + if (ctx < 0) + return -errno; + else + return 0; +#endif } void shutdown() { if (ctx) { +#if defined(HAVE_LIBAIO) int r = io_destroy(ctx); +#elif defined(HAVE_POSIXAIO) + int r = close(ctx); +#endif ceph_assert(r == 0); ctx = 0; }