]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: fix aio pwritev lost data problem.
authorkungf <yang.wang@easystack.cn>
Tue, 16 Apr 2019 11:21:41 +0000 (19:21 +0800)
committerSage Weil <sage@redhat.com>
Wed, 10 Jul 2019 15:26:08 +0000 (10:26 -0500)
On Linux, write() (and similar system calls) will transfer at most
0x7ffff000 (2,147,479,552) bytes, so an aio pwritev of more than 0x7ffff000
bytes is silently truncated. We therefore have to split the data across multiple aio submissions.

Signed-off-by: kungf <yang.wang@easystack.cn>
(cherry picked from commit 4d33114a40d5ae0d541c36175977ca22789a3b88)

- conflict due to fd_direct instead of choose_fd()

src/os/bluestore/KernelDevice.cc
src/os/bluestore/KernelDevice.h

index 6d34f0935ae7b35b972b51691aeee41766d28328..4bd1d4cde17be20bf3f28e84e6b9bff5461bb50d 100644 (file)
@@ -641,9 +641,6 @@ int KernelDevice::aio_write(
 
 #ifdef HAVE_LIBAIO
   if (aio && dio && !buffered) {
-    ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
-    ++ioc->num_pending;
-    aio_t& aio = ioc->pending_aios.back();
     if (cct->_conf->bdev_inject_crash &&
        rand() % cct->_conf->bdev_inject_crash == 0) {
       derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
@@ -651,16 +648,48 @@ int KernelDevice::aio_write(
           << dendl;
       // generate a real io so that aio_wait behaves properly, but make it
       // a read instead of write, and toss the result.
+      ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
+      ++ioc->num_pending;
+      auto& aio = ioc->pending_aios.back();
       aio.pread(off, len);
       ++injecting_crash;
     } else {
-      bl.prepare_iov(&aio.iov);
-      dout(30) << aio << dendl;
-      aio.bl.claim_append(bl);
-      aio.pwritev(off, len);
+      if (bl.length() <= RW_IO_MAX) {
+       // fast path (non-huge write)
+       ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
+       ++ioc->num_pending;
+       auto& aio = ioc->pending_aios.back();
+       bl.prepare_iov(&aio.iov);
+       aio.bl.claim_append(bl);
+       aio.pwritev(off, len);
+       dout(30) << aio << dendl;
+       dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
+               << std::dec << " aio " << &aio << dendl;
+      } else {
+       // write in RW_IO_MAX-sized chunks
+       uint64_t prev_len = 0;
+       while (prev_len < bl.length()) {
+         bufferlist tmp;
+         if (prev_len + RW_IO_MAX < bl.length()) {
+           tmp.substr_of(bl, prev_len, RW_IO_MAX);
+         } else {
+           tmp.substr_of(bl, prev_len, bl.length() - prev_len);
+         }
+         auto len = tmp.length();
+         ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
+         ++ioc->num_pending;
+         auto& aio = ioc->pending_aios.back();
+         tmp.prepare_iov(&aio.iov);
+         aio.bl.claim_append(tmp);
+         aio.pwritev(off + prev_len, len);
+         dout(30) << aio << dendl;
+         dout(5) << __func__ << " 0x" << std::hex << off + prev_len
+                 << "~" << len
+                 << std::dec << " aio " << &aio << " (piece)" << dendl;
+         prev_len += len;
+       }
+      }
     }
-    dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
-           << std::dec << " aio " << &aio << dendl;
   } else
 #endif
   {
index f04b7f972af3a8c46e6e86e569e945c1571d3f94..bea69a936fe3e383fd0307fd940e655c9306611d 100644 (file)
 #include "aio.h"
 #include "BlockDevice.h"
 
+#ifndef RW_IO_MAX
+#define RW_IO_MAX 0x7FFFF000
+#endif
+
+
 class KernelDevice : public BlockDevice {
   int fd_direct, fd_buffered;
   uint64_t size;