blk: bring MAP_HUGETLB-based buffer pool to KernelDevice.
author    Radoslaw Zarzynski <rzarzyns@redhat.com>
          Thu, 4 Nov 2021 20:50:17 +0000 (20:50 +0000)
committer Radoslaw Zarzynski <rzarzyns@redhat.com>
          Wed, 12 Jan 2022 20:35:50 +0000 (20:35 +0000)
The idea here is to introduce a pool of `mmap`-allocated,
fixed-size buffers which takes precedence over the
2 MB-aligned, THP-based mechanism. On the first attempt
to acquire a 4 MB buffer, KernelDevice mmaps
`bdev_read_preallocated_huge_buffer_num` (default: 128)
memory regions using the MAP_HUGETLB option. If this
fails, the entire process is aborted. When a buffer's
lifetime ends, its region is recycled through a
lock-free queue shared across the entire process.
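
For illustration, here is a minimal, self-contained
sketch of the `mmap` call such a pool is built around
(a sketch only; the committed implementation is
`ExplicitHugePagePool` in the diff below):

```cpp
// Sketch of hugepage-backed allocation; not the committed Ceph code.
#include <sys/mman.h>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

constexpr std::size_t kBufferSize = 4 * 1024 * 1024; // one 4 MB buffer

int main() {
  // MAP_HUGETLB requests explicit huge pages; it fails with ENOMEM
  // unless /proc/sys/vm/nr_hugepages was raised beforehand.
  void* region = ::mmap(nullptr, kBufferSize,
                        PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE |
                        MAP_HUGETLB,
                        -1, 0);
  if (region == MAP_FAILED) {
    std::perror("mmap(MAP_HUGETLB)");
    return EXIT_FAILURE;
  }
  // A real pool would hand out `region` and, on release, push it back
  // onto a queue instead of unmapping, keeping the pages reserved.
  ::munmap(region, kBufferSize);
  return EXIT_SUCCESS;
}
```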

Remember to allocate the appropriate number of huge
pages in the system beforehand! For instance:

```
echo 256 | sudo tee /proc/sys/vm/nr_hugepages
```
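
To verify the reservation (each 4 MB buffer spans two
2 MB huge pages, so the default of 128 buffers needs
the 256 pages reserved above):

```
grep HugePages /proc/meminfo
```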

This commit is based on / cherry-picks, with changes,
commit 897a4932bee5cba3641c18619cccd0ee945bfcf8.

Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
src/blk/kernel/KernelDevice.cc
src/blk/kernel/KernelDevice.h
src/common/options/global.yaml.in

index d5251411dcde130a0197d8835d89eef6a0ba9473..837015d45fee1066d664ca3587e27df8f4a7d23a 100644 (file)
 #include <fcntl.h>
 #include <sys/file.h>
 
+#include <boost/lockfree/queue.hpp>
+
 #include "KernelDevice.h"
+#include "include/buffer_raw.h"
 #include "include/intarith.h"
 #include "include/types.h"
 #include "include/compat.h"
@@ -1041,20 +1044,111 @@ int KernelDevice::discard(uint64_t offset, uint64_t len)
   return r;
 }
 
+template <std::size_t BufferSizeV>
+struct ExplicitHugePagePool {
+  using region_queue_t = boost::lockfree::queue<void*>;
+
+  struct mmaped_buffer_raw : public buffer::raw {
+    region_queue_t& region_q; // for recycling
+
+    mmaped_buffer_raw(void* mmaped_region, region_queue_t& region_q)
+      : raw(static_cast<char*>(mmaped_region), BufferSizeV),
+       region_q(region_q) {
+      // the `mmaped_region` has been passed to `raw` as the buffer's `data`
+    }
+    ~mmaped_buffer_raw() override {
+      // neither delete nor munmap; recycle the region instead
+      region_q.push(data);
+    }
+    raw* clone_empty() override {
+      // the entire cloning facility is used solely by the dev-only MemDB.
+      // see: https://github.com/ceph/ceph/pull/36282
+      ceph_abort_msg("this should never be called on this path!");
+    }
+  };
+
+  ExplicitHugePagePool(size_t pool_size)
+    : region_q(pool_size) {
+    while (pool_size--) {
+      void* const mmaped_region = ::mmap(
+        nullptr,
+        BufferSizeV,
+        PROT_READ | PROT_WRITE,
+        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB,
+        -1,
+        0);
+      if (mmaped_region == MAP_FAILED) {
+        ceph_abort("can't allocate huge buffer;"
+                   " /proc/sys/vm/nr_hugepages misconfigured?");
+      } else {
+        region_q.push(mmaped_region);
+      }
+    }
+  }
+  ~ExplicitHugePagePool() {
+    void* mmaped_region;
+    while (region_q.pop(mmaped_region)) {
+      ::munmap(mmaped_region, BufferSizeV);
+    }
+  }
+
+  ceph::unique_leakable_ptr<buffer::raw> try_create() {
+    if (void* mmaped_region; region_q.pop(mmaped_region)) {
+      return ceph::unique_leakable_ptr<buffer::raw> {
+       new mmaped_buffer_raw(mmaped_region, region_q)
+      };
+    } else {
+      // oops, empty queue.
+      return nullptr;
+    }
+  }
+
+  bool empty_estimation() const {
+    return region_q.empty();
+  }
+
+private:
+  region_queue_t region_q;
+};
+
+
+#define LUCKY_BUFFER_SIZE (4 * 1024 * 1024)
 
 // create a buffer based on the user-configurable alignment. it's
 // intended to make our buffers THP-able.
-static ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(
-  CephContext* const cct,
-  const size_t len)
+ceph::unique_leakable_ptr<buffer::raw> KernelDevice::create_custom_aligned(
+  const size_t len) const
 {
   // just to preserve the logic of create_small_page_aligned().
   if (len < CEPH_PAGE_SIZE) {
     return ceph::buffer::create_small_page_aligned(len);
-  } else {
-    const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
-    return ceph::buffer::create_aligned(len, custom_alignment);
+  } else if (len == LUCKY_BUFFER_SIZE) {
+    static ExplicitHugePagePool<LUCKY_BUFFER_SIZE> hp_pool{
+      cct->_conf->bdev_read_preallocated_huge_buffer_num
+    };
+    if (auto lucky_raw = hp_pool.try_create(); lucky_raw) {
+      dout(20) << __func__ << " allocated from huge pool"
+              << " lucky_raw.data=" << (void*)lucky_raw->get_data()
+              << " bdev_read_preallocated_huge_buffer_num="
+              << cct->_conf->bdev_read_preallocated_huge_buffer_num
+              << dendl;
+      return lucky_raw;
+    } else {
+      // fall through due to an empty buffer pool. this can also happen
+      // when the configurable was explicitly set to 0.
+      dout(20) << __func__ << " cannot allocate from huge pool"
+              << " hp_pool.empty_estimation=" << hp_pool.empty_estimation()
+              << " bdev_read_preallocated_huge_buffer_num="
+              << cct->_conf->bdev_read_preallocated_huge_buffer_num
+              << dendl;
+    }
   }
+  const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
+  dout(20) << __func__ << " with the custom alignment;"
+           << " len=" << len
+           << " custom_alignment=" << custom_alignment
+           << dendl;
+  return ceph::buffer::create_aligned(len, custom_alignment);
 }
 
 int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
@@ -1070,7 +1164,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
 
   auto start1 = mono_clock::now();
 
-  auto p = ceph::buffer::ptr_node::create(create_custom_aligned(cct, len));
+  auto p = ceph::buffer::ptr_node::create(create_custom_aligned(len));
   int r = ::pread(choose_fd(buffered,  WRITE_LIFE_NOT_SET),
                  p->c_str(), len, off);
   auto age = cct->_conf->bdev_debug_aio_log_age;
@@ -1122,7 +1216,7 @@ int KernelDevice::aio_read(
     ++ioc->num_pending;
     aio_t& aio = ioc->pending_aios.back();
     aio.bl.push_back(
-      ceph::buffer::ptr_node::create(create_custom_aligned(cct, len)));
+      ceph::buffer::ptr_node::create(create_custom_aligned(len)));
     aio.bl.prepare_iov(&aio.iov);
     aio.preadv(off, len);
     dout(30) << aio << dendl;
index 050a8f879029ae37712bea72fbb7e6a8ca7d1ed9..14d9a470117d16adeb926ac0fb9d43749efaa633 100644 (file)
@@ -112,6 +112,8 @@ private:
   void _detect_vdo();
   int choose_fd(bool buffered, int write_hint) const;
 
+  ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(size_t len) const;
+
 public:
   KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
 
index 39866897d21a05898c83ff6919fbebf16b1fb3cb..6bc97f20185fa4eb45b17249d2c3a1ee3306ae2e 100644 (file)
@@ -3920,6 +3920,11 @@ options:
   level: advanced
   default: 4_K
   with_legacy: true
+- name: bdev_read_preallocated_huge_buffer_num
+  type: size
+  level: advanced
+  default: 128
+  with_legacy: true
 - name: bdev_debug_aio
   type: bool
   level: dev
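
For completeness, a sketch of tuning the new option in
ceph.conf (the `[osd]` placement is an assumption here;
per the fallthrough comment in the diff, 0 disables the
pool):

```
[osd]
# 0 disables the pool; 4 MB reads then fall back to the
# bdev_read_buffer_alignment path
bdev_read_preallocated_huge_buffer_num = 128
```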