From 64aae2b9554a467050fa22540d5556813287e3c1 Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Thu, 4 Nov 2021 20:50:17 +0000 Subject: [PATCH] blk: bring MAP_HUGETLB-based buffer pool to KernelDevice. The idea here is to introduce a pool of `mmap`-allocated, fixed-size buffers which take precedence over the 2 MB-aligned, THP-based mechanism. On the first attempt to acquire a 4 MB buffer, KernelDevice mmaps `bdev_read_preallocated_huge_buffer_num` (default 128) memory regions using the MAP_HUGETLB option. If this fails, the entire process is aborted. Buffers, once their lifetimes end, are recycled through a lock-free queue shared across the entire process. Remember to allocate the appropriate number of huge pages in the system! For instance, the default 128 buffers of 4 MB each need 256 huge pages, assuming a 2 MB huge page size: ``` echo 256 | sudo tee /proc/sys/vm/nr_hugepages ``` This commit is based on / cherry-picks, with changes, 897a4932bee5cba3641c18619cccd0ee945bfcf8. Signed-off-by: Radoslaw Zarzynski --- src/blk/kernel/KernelDevice.cc | 110 +++++++++++++++++++++++++++--- src/blk/kernel/KernelDevice.h | 2 + src/common/options/global.yaml.in | 5 ++ 3 files changed, 109 insertions(+), 8 deletions(-) diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc index d5251411dcde1..837015d45fee1 100644 --- a/src/blk/kernel/KernelDevice.cc +++ b/src/blk/kernel/KernelDevice.cc @@ -20,7 +20,10 @@ #include #include +#include + #include "KernelDevice.h" +#include "include/buffer_raw.h" #include "include/intarith.h" #include "include/types.h" #include "include/compat.h" @@ -1041,20 +1044,111 @@ int KernelDevice::discard(uint64_t offset, uint64_t len) return r; } +template +struct ExplicitHugePagePool { + using region_queue_t = boost::lockfree::queue; + + struct mmaped_buffer_raw : public buffer::raw { + region_queue_t& region_q; // for recycling + + mmaped_buffer_raw(void* mmaped_region, region_queue_t& region_q) + : raw(static_cast(mmaped_region), BufferSizeV), + region_q(region_q) { + // the `mmaped_region` has been passed to 
`raw` as the buffer's `data` + } + ~mmaped_buffer_raw() override { + // don't delete nor unmmap; recycle the region instead + region_q.push(data); + } + raw* clone_empty() override { + // the entire cloning facility is used solely by the dev-only MemDB. + // see: https://github.com/ceph/ceph/pull/36282 + ceph_abort_msg("this should be never called on this path!"); + } + }; + + ExplicitHugePagePool(size_t pool_size) + : region_q(pool_size) { + while (pool_size--) { + void* const mmaped_region = ::mmap( + nullptr, + BufferSizeV, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, + -1, + 0); + if (mmaped_region == MAP_FAILED) { + ceph_abort("can't allocate huge buffer;" + " /proc/sys/vm/nr_hugepages misconfigured?"); + } else { + region_q.push(mmaped_region); + } + } + } + ~ExplicitHugePagePool() { + void* mmaped_region; + while (region_q.pop(mmaped_region)) { + ::munmap(mmaped_region, BufferSizeV); + } + } + + ceph::unique_leakable_ptr try_create() { + if (void* mmaped_region; region_q.pop(mmaped_region)) { + return ceph::unique_leakable_ptr { + new mmaped_buffer_raw(mmaped_region, region_q) + }; + } else { + // oops, empty queue. + return nullptr; + } + } + + bool empty_estimation() const { + return region_q.empty(); + } + +private: + region_queue_t region_q; +}; + + +#define LUCKY_BUFFER_SIZE 4 * 1024 * 1024 // create a buffer basing on user-configurable. it's intended to make // our buffers THP-able. -static ceph::unique_leakable_ptr create_custom_aligned( - CephContext* const cct, - const size_t len) +ceph::unique_leakable_ptr KernelDevice::create_custom_aligned( + const size_t len) const { // just to preserve the logic of create_small_page_aligned(). 
if (len < CEPH_PAGE_SIZE) { return ceph::buffer::create_small_page_aligned(len); - } else { - const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment; - return ceph::buffer::create_aligned(len, custom_alignment); + } else if (len == LUCKY_BUFFER_SIZE) { + static ExplicitHugePagePool hp_pool{ + cct->_conf->bdev_read_preallocated_huge_buffer_num + }; + if (auto lucky_raw = hp_pool.try_create(); lucky_raw) { + dout(20) << __func__ << " allocated from huge pool" + << " lucky_raw.data=" << (void*)lucky_raw->get_data() + << " bdev_read_preallocated_huge_buffer_num=" + << cct->_conf->bdev_read_preallocated_huge_buffer_num + << dendl; + return lucky_raw; + } else { + // fallthrough due to empty buffer pool. this can happen also + // when the configurable was explicitly set to 0. + dout(20) << __func__ << " cannot allocate from huge pool" + << " hp_pool.empty_estimation=" << hp_pool.empty_estimation() + << " bdev_read_preallocated_huge_buffer_num=" + << cct->_conf->bdev_read_preallocated_huge_buffer_num + << dendl; + } } + const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment; + dout(20) << __func__ << " with the custom alignment;" + << " len=" << len + << " custom_alignment=" << custom_alignment + << dendl; + return ceph::buffer::create_aligned(len, custom_alignment); } int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, @@ -1070,7 +1164,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, auto start1 = mono_clock::now(); - auto p = ceph::buffer::ptr_node::create(create_custom_aligned(cct, len)); + auto p = ceph::buffer::ptr_node::create(create_custom_aligned(len)); int r = ::pread(choose_fd(buffered, WRITE_LIFE_NOT_SET), p->c_str(), len, off); auto age = cct->_conf->bdev_debug_aio_log_age; @@ -1122,7 +1216,7 @@ int KernelDevice::aio_read( ++ioc->num_pending; aio_t& aio = ioc->pending_aios.back(); aio.bl.push_back( - ceph::buffer::ptr_node::create(create_custom_aligned(cct, len))); + 
ceph::buffer::ptr_node::create(create_custom_aligned(len))); aio.bl.prepare_iov(&aio.iov); aio.preadv(off, len); dout(30) << aio << dendl; diff --git a/src/blk/kernel/KernelDevice.h b/src/blk/kernel/KernelDevice.h index 050a8f879029a..14d9a470117d1 100644 --- a/src/blk/kernel/KernelDevice.h +++ b/src/blk/kernel/KernelDevice.h @@ -112,6 +112,8 @@ private: void _detect_vdo(); int choose_fd(bool buffered, int write_hint) const; + ceph::unique_leakable_ptr create_custom_aligned(size_t len) const; + public: KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 39866897d21a0..6bc97f20185fa 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -3920,6 +3920,11 @@ options: level: advanced default: 4_K with_legacy: true +- name: bdev_read_preallocated_huge_buffer_num + type: size + level: advanced + default: 128 + with_legacy: true - name: bdev_debug_aio type: bool level: dev -- 2.39.5