From: kungf Date: Tue, 16 Apr 2019 11:21:41 +0000 (+0800) Subject: os/bluestore: fix aio pwritev lost data problem. X-Git-Tag: v12.2.13~193^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c0454abcc5bfceaf3dc76e690ccdce0e42162b3c;p=ceph.git os/bluestore: fix aio pwritev lost data problem. On Linux, write() (and similar system calls) will transfer at most 0x7ffff000 (2,147,479,552) bytes, so an aio pwritev of more than 0x7ffff000 bytes is silently truncated; we therefore have to split the data across multiple aio submissions. Signed-off-by: kungf (cherry picked from commit 4d33114a40d5ae0d541c36175977ca22789a3b88) - conflict due to fd_direct instead of choose_fd() --- diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc index 6d34f0935ae..4bd1d4cde17 100644 --- a/src/os/bluestore/KernelDevice.cc +++ b/src/os/bluestore/KernelDevice.cc @@ -641,9 +641,6 @@ int KernelDevice::aio_write( #ifdef HAVE_LIBAIO if (aio && dio && !buffered) { - ioc->pending_aios.push_back(aio_t(ioc, fd_direct)); - ++ioc->num_pending; - aio_t& aio = ioc->pending_aios.back(); if (cct->_conf->bdev_inject_crash && rand() % cct->_conf->bdev_inject_crash == 0) { derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex @@ -651,16 +648,48 @@ int KernelDevice::aio_write( << dendl; // generate a real io so that aio_wait behaves properly, but make it // a read instead of write, and toss the result. 
+ ioc->pending_aios.push_back(aio_t(ioc, fd_direct)); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); aio.pread(off, len); ++injecting_crash; } else { - bl.prepare_iov(&aio.iov); - dout(30) << aio << dendl; - aio.bl.claim_append(bl); - aio.pwritev(off, len); + if (bl.length() <= RW_IO_MAX) { + // fast path (non-huge write) + ioc->pending_aios.push_back(aio_t(ioc, fd_direct)); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); + bl.prepare_iov(&aio.iov); + aio.bl.claim_append(bl); + aio.pwritev(off, len); + dout(30) << aio << dendl; + dout(5) << __func__ << " 0x" << std::hex << off << "~" << len + << std::dec << " aio " << &aio << dendl; + } else { + // write in RW_IO_MAX-sized chunks + uint64_t prev_len = 0; + while (prev_len < bl.length()) { + bufferlist tmp; + if (prev_len + RW_IO_MAX < bl.length()) { + tmp.substr_of(bl, prev_len, RW_IO_MAX); + } else { + tmp.substr_of(bl, prev_len, bl.length() - prev_len); + } + auto len = tmp.length(); + ioc->pending_aios.push_back(aio_t(ioc, fd_direct)); + ++ioc->num_pending; + auto& aio = ioc->pending_aios.back(); + tmp.prepare_iov(&aio.iov); + aio.bl.claim_append(tmp); + aio.pwritev(off + prev_len, len); + dout(30) << aio << dendl; + dout(5) << __func__ << " 0x" << std::hex << off + prev_len + << "~" << len + << std::dec << " aio " << &aio << " (piece)" << dendl; + prev_len += len; + } + } } - dout(5) << __func__ << " 0x" << std::hex << off << "~" << len - << std::dec << " aio " << &aio << dendl; } else #endif { diff --git a/src/os/bluestore/KernelDevice.h b/src/os/bluestore/KernelDevice.h index f04b7f972af..bea69a936fe 100644 --- a/src/os/bluestore/KernelDevice.h +++ b/src/os/bluestore/KernelDevice.h @@ -23,6 +23,11 @@ #include "aio.h" #include "BlockDevice.h" +#ifndef RW_IO_MAX +#define RW_IO_MAX 0x7FFFF000 +#endif + + class KernelDevice : public BlockDevice { int fd_direct, fd_buffered; uint64_t size;