From: kungf Date: Tue, 16 Apr 2019 11:21:41 +0000 (+0800) Subject: os/bluestore: fix aio pwritev lost data problem. X-Git-Tag: v15.1.0~2772^2~1 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=4d33114a40d5ae0d541c36175977ca22789a3b88;p=ceph-ci.git os/bluestore: fix aio pwritev lost data problem. On Linux, write() (and similar system calls) will transfer at most 0x7ffff000 (2,147,479,552) bytes, so an aio pwritev of more than 0x7ffff000 bytes is capped and the remaining data is lost; we therefore have to split the data across multiple aio submissions. Signed-off-by: kungf --- diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc index 711d5e72d7e..ec163c7f1c0 100644 --- a/src/os/bluestore/KernelDevice.cc +++ b/src/os/bluestore/KernelDevice.cc @@ -845,9 +845,6 @@ int KernelDevice::aio_write( #ifdef HAVE_LIBAIO if (aio && dio && !buffered) { - ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); - ++ioc->num_pending; - aio_t& aio = ioc->pending_aios.back(); if (cct->_conf->bdev_inject_crash && rand() % cct->_conf->bdev_inject_crash == 0) { derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex @@ -855,16 +852,48 @@ int KernelDevice::aio_write( << dendl; // generate a real io so that aio_wait behaves properly, but make it // a read instead of write, and toss the result. 
+ ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); aio.pread(off, len); ++injecting_crash; } else { - bl.prepare_iov(&aio.iov); - dout(30) << aio << dendl; - aio.bl.claim_append(bl); - aio.pwritev(off, len); + if (bl.length() <= RW_IO_MAX) { + // fast path (non-huge write) + ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); + bl.prepare_iov(&aio.iov); + aio.bl.claim_append(bl); + aio.pwritev(off, len); + dout(30) << aio << dendl; + dout(5) << __func__ << " 0x" << std::hex << off << "~" << len + << std::dec << " aio " << &aio << dendl; + } else { + // write in RW_IO_MAX-sized chunks + uint64_t prev_len = 0; + while (prev_len < bl.length()) { + bufferlist tmp; + if (prev_len + RW_IO_MAX < bl.length()) { + tmp.substr_of(bl, prev_len, RW_IO_MAX); + } else { + tmp.substr_of(bl, prev_len, bl.length() - prev_len); + } + auto len = tmp.length(); + ioc->pending_aios.push_back(aio_t(ioc, choose_fd(false, write_hint))); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); + tmp.prepare_iov(&aio.iov); + aio.bl.claim_append(tmp); + aio.pwritev(off + prev_len, len); + dout(30) << aio << dendl; + dout(5) << __func__ << " 0x" << std::hex << off + prev_len + << "~" << len + << std::dec << " aio " << &aio << " (piece)" << dendl; + prev_len += len; + } + } } - dout(5) << __func__ << " 0x" << std::hex << off << "~" << len - << std::dec << " aio " << &aio << dendl; } else #endif { diff --git a/src/os/bluestore/KernelDevice.h b/src/os/bluestore/KernelDevice.h index e24b45b3ac0..ec8cf2f8dc4 100644 --- a/src/os/bluestore/KernelDevice.h +++ b/src/os/bluestore/KernelDevice.h @@ -25,6 +25,11 @@ #include "ceph_aio.h" #include "BlockDevice.h" +#ifndef RW_IO_MAX +#define RW_IO_MAX 0x7FFFF000 +#endif + + class KernelDevice : public BlockDevice { std::vector fd_directs, fd_buffereds; bool enable_wrt = true;