option(WITH_BLUESTORE "Bluestore OSD backend" ON)
if(WITH_BLUESTORE)
- find_package(aio)
- set(HAVE_LIBAIO ${AIO_FOUND})
+ if(LINUX)
+ find_package(aio)
+ set(HAVE_LIBAIO ${AIO_FOUND})
+ elseif(FREEBSD)
+ # POSIX AIO is built into the FreeBSD kernel and exposed through libc, so no find_package() probe is needed here.
+ set(HAVE_POSIXAIO ON)
+ endif()
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|amd64|x86_64|AMD64|aarch64")
endif()
if(WITH_BLUESTORE)
- if(NOT AIO_FOUND AND NOT WITH_SPDK AND NOT WITH_PMEM)
+ if(NOT AIO_FOUND AND NOT HAVE_POSIXAIO AND NOT WITH_SPDK AND NOT WITH_PMEM)
message(SEND_ERROR "WITH_BLUESTORE is ON, "
"but none of the bluestore backends is enabled. "
"Please install libaio, or enable WITH_SPDK or WITH_PMEM (experimental)")
/* Defined if you have libaio */
#cmakedefine HAVE_LIBAIO
+/* Defined if you have POSIX AIO */
+#cmakedefine HAVE_POSIXAIO
+
/* Defined if OpenLDAP enabled */
#cmakedefine HAVE_OPENLDAP
)
endif(WITH_BLUESTORE)
-if(HAVE_LIBAIO)
+if(HAVE_LIBAIO OR HAVE_POSIXAIO)
list(APPEND libos_srcs
bluestore/KernelDevice.cc
bluestore/aio.cc)
#include "BlockDevice.h"
-#if defined(HAVE_LIBAIO)
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
#include "KernelDevice.h"
#endif
// a configurable (with different hdd and ssd defaults), and add
// that to the bytes value.
uint64_t ios = 0;
-#ifdef HAVE_LIBAIO
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
for (auto& p : pending_aios) {
ios += p.iov.size();
}
void IOContext::release_running_aios()
{
ceph_assert(!num_running);
-#ifdef HAVE_LIBAIO
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
// release aio contexts (including pinned buffers).
running_aios.clear();
#endif
return new PMEMDevice(cct, cb, cbpriv);
}
#endif
-#if defined(HAVE_LIBAIO)
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
if (type == "kernel") {
return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv);
}
#include "acconfig.h"
#include "common/ceph_mutex.h"
-#ifdef HAVE_LIBAIO
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
#include "ceph_aio.h"
#endif
#include "include/ceph_assert.h"
std::atomic_int total_nseg = {0};
#endif
-#ifdef HAVE_LIBAIO
+#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
std::list<aio_t> pending_aios; ///< not yet submitted
std::list<aio_t> running_aios; ///< submitting or submitted
#endif
#include "include/stringify.h"
#include "common/blkdev.h"
#include "common/errno.h"
+#if defined(__FreeBSD__)
+#include "bsm/audit_errno.h"
+#endif
#include "common/debug.h"
#include "common/align.h"
{
// https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
+ // True when r is one of the negative errno values listed below.
+ // The platform-specific codes are appended together with their own "||"
+ // so the expression stays well-formed on platforms that are neither Linux
+ // nor FreeBSD (previously those were left with a dangling "||").
return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
- r == -ENOLINK || r == -EREMOTEIO || r == -EBADE ||
- r == -ENODATA || r == -EILSEQ || r == -ENOMEM ||
- r == -EAGAIN || r == -EREMCHG || r == -EIO);
+ r == -ENOLINK || r == -EREMOTEIO || r == -EAGAIN || r == -EIO ||
+ r == -ENODATA || r == -EILSEQ || r == -ENOMEM
+#if defined(__linux__)
+ || r == -EREMCHG || r == -EBADE
+#elif defined(__FreeBSD__)
+ // NOTE(review): BSM_ERRNO_* come from bsm/audit_errno.h; confirm these
+ // values are what the I/O path actually reports on FreeBSD.
+ || r == -BSM_ERRNO_EREMCHG || r == -BSM_ERRNO_EBADE
+#endif
+ );
}
void KernelDevice::_aio_thread()
derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
return r;
}
+#ifdef HAVE_SYNC_FILE_RANGE
if (buffered) {
// initiate IO (but do not wait)
r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE);
return r;
}
}
+#endif
io_since_flush.store(true);
// 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
int attempts = 16;
int delay = 125;
+ int r;
aio_iter cur = begin;
- struct iocb *piocb[aios_size];
+ struct aio_t *piocb[aios_size];
int left = 0;
while (cur != end) {
cur->priv = priv;
- *(piocb+left) = &cur->iocb;
+ *(piocb+left) = &(*cur);
++left;
++cur;
}
ceph_assert(aios_size >= left);
int done = 0;
while (left > 0) {
- int r = io_submit(ctx, std::min(left, max_iodepth), piocb + done);
+#if defined(HAVE_LIBAIO)
+ r = io_submit(ctx, std::min(left, max_iodepth), (struct iocb**)(piocb + done));
+#elif defined(HAVE_POSIXAIO)
+ // POSIX AIO path: one aio_t is submitted per loop iteration. Completion is
+ // delivered to the kqueue `ctx`, with the aio_t pointer as the event's
+ // user data.
+ if (piocb[done]->n_aiocb == 1) {
+   // NOTE(review): n_aiocb == 1 is handled as the single-buffer read set up
+   // by aio_t::pread(). aio_t::pwritev() with a single iovec also produces
+   // n_aiocb == 1 but stores a pointer in the union -- confirm that case.
+   // TODO: consider batching multiple reads together with lio_listio
+   piocb[done]->aio.aiocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
+   piocb[done]->aio.aiocb.aio_sigevent.sigev_notify_kqueue = ctx;
+   piocb[done]->aio.aiocb.aio_sigevent.sigev_value.sival_ptr = piocb[done];
+   r = aio_read(&piocb[done]->aio.aiocb);
+ } else {
+   // Zero-initialise so fields that are not set below are not indeterminate.
+   struct sigevent sev = {};
+   sev.sigev_notify = SIGEV_KEVENT;
+   sev.sigev_notify_kqueue = ctx;
+   sev.sigev_value.sival_ptr = piocb[done];
+   // lio_listio() takes an array of pointers to control blocks, whereas
+   // aio.aiocbp is a contiguous array of the control blocks themselves.
+   const int n_cb = piocb[done]->n_aiocb;
+   struct aiocb *cb_list[n_cb];
+   for (int j = 0; j < n_cb; j++) {
+     cb_list[j] = &piocb[done]->aio.aiocbp[j];
+   }
+   r = lio_listio(LIO_NOWAIT, cb_list, n_cb, &sev);
+ }
+ // aio_read()/lio_listio() return -1 and set errno. Translate to the
+ // io_submit() convention used by the shared handling below: negative errno
+ // on failure, number of submitted requests (here always one aio_t) on
+ // success.
+ // NOTE(review): on a partial lio_listio() failure some control blocks may
+ // already be queued; confirm the retry path tolerates that.
+ r = (r < 0) ? -errno : 1;
+#endif
if (r < 0) {
if (r == -EAGAIN && attempts-- > 0) {
usleep(delay);
+// Wait up to timeout_ms milliseconds for completed requests and store up to
+// `max` finished aio_t pointers in paio, filling in each one's rval.
+// Returns the number of entries stored, or the negative value reported by
+// io_getevents() / -errno from kevent(); an interrupted wait (EINTR) is retried.
int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
{
+#if defined(HAVE_LIBAIO)
io_event event[max];
+#elif defined(HAVE_POSIXAIO)
+ struct kevent events[max];
+#endif
struct timespec t = {
timeout_ms / 1000,
(timeout_ms % 1000) * 1000 * 1000
int r = 0;
do {
+#if defined(HAVE_LIBAIO)
r = io_getevents(ctx, 1, max, event, &t);
+#elif defined(HAVE_POSIXAIO)
+ r = kevent(ctx, NULL, 0, events, max, &t);
+ if (r < 0)
+ r = -errno;
+#endif
} while (r == -EINTR);
for (int i=0; i<r; ++i) {
+#if defined(HAVE_LIBAIO)
paio[i] = (aio_t *)event[i].obj;
paio[i]->rval = event[i].res;
+#elif defined(HAVE_POSIXAIO)
+ // The submit path stores the aio_t pointer in sigev_value.sival_ptr; the
+ // kevent hands it back as udata.
+ paio[i] = (aio_t*)events[i].udata;
+ if (paio[i]->n_aiocb == 1) {
+   struct aiocb *cb = &paio[i]->aio.aiocb;
+   // aio_return() reports failure as -1; the errno value comes from
+   // aio_error(). Store it negated, matching the libaio branch's res.
+   int err = aio_error(cb);
+   if (err < 0)
+     err = errno;
+   ssize_t res = aio_return(cb);
+   paio[i]->rval = (err != 0) ? -(ssize_t)err : res;
+ } else {
+   // Emulate the result of a vectored write: total bytes transferred on
+   // success, or the negated errno of the first failed control block.
+   ssize_t total = 0;
+   int first_err = 0;
+   for (int j = 0; j < paio[i]->n_aiocb; j++) {
+     struct aiocb *cb = &paio[i]->aio.aiocbp[j];
+     int err = aio_error(cb);
+     if (err < 0)
+       err = errno;
+     // Collect every control block's status, even after an earlier
+     // failure, so no completed request is left unreaped.
+     ssize_t res = aio_return(cb);
+     if (first_err == 0) {
+       if (err != 0)
+         first_err = err;
+       else
+         total += res;
+     }
+   }
+   paio[i]->rval = first_err ? -(ssize_t)first_err : total;
+   // The control block array was allocated with calloc() in aio_t::pwritev().
+   free(paio[i]->aio.aiocbp);
+ }
+#endif
}
return r;
}
// vim: ts=8 sw=2 smarttab
#pragma once
-# include <libaio.h>
+
+#include "acconfig.h"
+
+#if defined(HAVE_LIBAIO)
+#include <libaio.h>
+#elif defined(HAVE_POSIXAIO)
+#include <aio.h>
+#include <sys/event.h>
+#endif
#include <boost/intrusive/list.hpp>
#include <boost/container/small_vector.hpp>
#include "include/types.h"
struct aio_t {
+#if defined(HAVE_LIBAIO)
struct iocb iocb{}; // must be first element; see shenanigans in aio_queue_t
+#elif defined(HAVE_POSIXAIO)
+ // static long aio_listio_max = -1;
+ union {
+ struct aiocb aiocb;
+ struct aiocb *aiocbp;
+ } aio;
+ int n_aiocb;
+#endif
void *priv;
int fd;
boost::container::small_vector<iovec,4> iov;
+// Prepare this aio_t as a vectored write of the buffers in `iov`, starting
+// at file offset _offset; `len` is recorded as the request length.
void pwritev(uint64_t _offset, uint64_t len) {
offset = _offset;
length = len;
+#if defined(HAVE_LIBAIO)
io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset);
+#elif defined(HAVE_POSIXAIO)
+ // One control block is built per iovec entry, each targeting the file
+ // offset immediately after the previous entry.
+ // NOTE(review): with a single iovec n_aiocb becomes 1, which the submit
+ // path treats as the single-aiocb (read) case -- confirm.
+ n_aiocb = iov.size();
+ aio.aiocbp = (struct aiocb*)calloc(iov.size(), sizeof(struct aiocb));
+ // Advance a local cursor rather than the `offset` member, so the member
+ // keeps the request's starting offset after preparation.
+ uint64_t next_off = _offset;
+ for (size_t i = 0; i < iov.size(); i++) {
+   aio.aiocbp[i].aio_fildes = fd;
+   aio.aiocbp[i].aio_offset = next_off;
+   aio.aiocbp[i].aio_buf = iov[i].iov_base;
+   aio.aiocbp[i].aio_nbytes = iov[i].iov_len;
+   aio.aiocbp[i].aio_lio_opcode = LIO_WRITE;
+   next_off += iov[i].iov_len;
+ }
+#endif
}
+// Prepare this aio_t as a read of `len` bytes at file offset _offset into a
+// newly created page-aligned buffer, which is then appended to `bl`.
void pread(uint64_t _offset, uint64_t len) {
offset = _offset;
length = len;
bufferptr p = buffer::create_small_page_aligned(length);
+#if defined(HAVE_LIBAIO)
io_prep_pread(&iocb, fd, p.c_str(), length, offset);
+#elif defined(HAVE_POSIXAIO)
+ n_aiocb = 1;
+ // Unlike the libaio `iocb{}` member, the union has no initializer. Clear
+ // the control block first so fields not set below are not indeterminate.
+ aio.aiocb = {};
+ aio.aiocb.aio_fildes = fd;
+ aio.aiocb.aio_buf = p.c_str();
+ aio.aiocb.aio_nbytes = length;
+ aio.aiocb.aio_offset = offset;
+ aio.aiocb.aio_lio_opcode = LIO_READ;
+#endif
bl.append(std::move(p));
}
struct aio_queue_t {
int max_iodepth;
+#if defined(HAVE_LIBAIO)
io_context_t ctx;
+#elif defined(HAVE_POSIXAIO)
+ int ctx;
+#endif
typedef list<aio_t>::iterator aio_iter;
int init() {
ceph_assert(ctx == 0);
+#if defined(HAVE_LIBAIO)
int r = io_setup(max_iodepth, &ctx);
if (r < 0) {
if (ctx) {
}
}
return r;
+#elif defined(HAVE_POSIXAIO)
+ ctx = kqueue();
+ if (ctx < 0)
+ return -errno;
+ else
+ return 0;
+#endif
}
void shutdown() {
if (ctx) {
+#if defined(HAVE_LIBAIO)
int r = io_destroy(ctx);
+#elif defined(HAVE_POSIXAIO)
+ int r = close(ctx);
+#endif
ceph_assert(r == 0);
ctx = 0;
}