#include <fcntl.h>
#include <sys/file.h>
+#include <boost/lockfree/queue.hpp>
+
#include "KernelDevice.h"
+#include "include/buffer_raw.h"
#include "include/intarith.h"
#include "include/types.h"
#include "include/compat.h"
return r;
}
+// A fixed-capacity pool of explicitly mmap()ed huge-page regions, each
+// BufferSizeV bytes long. Regions are wrapped into buffer::raw instances by
+// try_create() and recycled back into the pool (never munmap()ed until the
+// pool itself dies) when such a buffer is destroyed. All hand-out/recycle
+// traffic goes through a boost::lockfree queue.
+template <std::size_t BufferSizeV>
+struct ExplicitHugePagePool {
+ using region_queue_t = boost::lockfree::queue<void*>;
+
+ // buffer::raw implementation that, on destruction, returns its region to
+ // the owning pool's queue instead of freeing the memory.
+ struct mmaped_buffer_raw : public buffer::raw {
+ region_queue_t& region_q; // for recycling
+
+ mmaped_buffer_raw(void* mmaped_region, region_queue_t& region_q)
+ : raw(static_cast<char*>(mmaped_region), BufferSizeV),
+ region_q(region_q) {
+ // the `mmaped_region` has been passed to `raw` as the buffer's `data`
+ }
+ ~mmaped_buffer_raw() override {
+ // don't delete nor unmmap; recycle the region instead
+ region_q.push(data);
+ }
+ raw* clone_empty() override {
+ // the entire cloning facility is used solely by the dev-only MemDB.
+ // see: https://github.com/ceph/ceph/pull/36282
+ ceph_abort_msg("this should be never called on this path!");
+ }
+ };
+
+ // Preallocates `pool_size` huge-page regions up-front and aborts when the
+ // system cannot satisfy the request. `explicit` to avoid accidental
+ // size_t -> pool conversions.
+ explicit ExplicitHugePagePool(size_t pool_size)
+ : region_q(pool_size) {
+ while (pool_size--) {
+ void* const mmaped_region = ::mmap(
+ nullptr,
+ BufferSizeV,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB,
+ -1,
+ 0);
+ if (mmaped_region == MAP_FAILED) {
+ // NB: ceph_abort() takes no message argument; ceph_abort_msg() is
+ // the message-taking variant (consistent with clone_empty() above).
+ ceph_abort_msg("can't allocate huge buffer;"
+ " /proc/sys/vm/nr_hugepages misconfigured?");
+ } else {
+ region_q.push(mmaped_region);
+ }
+ }
+ }
+ ~ExplicitHugePagePool() {
+ // drain the queue and give the regions back to the kernel.
+ void* mmaped_region;
+ while (region_q.pop(mmaped_region)) {
+ ::munmap(mmaped_region, BufferSizeV);
+ }
+ }
+
+ // Hands out one pooled region wrapped in a buffer::raw, or nullptr when
+ // the pool is (momentarily) exhausted.
+ ceph::unique_leakable_ptr<buffer::raw> try_create() {
+ if (void* mmaped_region; region_q.pop(mmaped_region)) {
+ return ceph::unique_leakable_ptr<buffer::raw> {
+ new mmaped_buffer_raw(mmaped_region, region_q)
+ };
+ } else {
+ // oops, empty queue.
+ return nullptr;
+ }
+ }
+
+ // NOTE: boost::lockfree::queue::empty() is only an estimation under
+ // concurrent access -- hence the name.
+ bool empty_estimation() const {
+ return region_q.empty();
+ }
+
+private:
+ region_queue_t region_q;
+};
+
+
+// Size (in bytes) of read buffers eligible for the explicit huge-page pool.
+// Parenthesized so the macro stays safe inside any surrounding expression;
+// still usable as a non-type template argument.
+#define LUCKY_BUFFER_SIZE (4 * 1024 * 1024)
// create a buffer basing on user-configurable. it's intended to make
// our buffers THP-able.
-static ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(
- CephContext* const cct,
- const size_t len)
+ceph::unique_leakable_ptr<buffer::raw> KernelDevice::create_custom_aligned(
+ const size_t len) const
{
// just to preserve the logic of create_small_page_aligned().
if (len < CEPH_PAGE_SIZE) {
return ceph::buffer::create_small_page_aligned(len);
-} else {
- const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
- return ceph::buffer::create_aligned(len, custom_alignment);
+} else if (len == LUCKY_BUFFER_SIZE) {
+ // lucky case: the requested size matches the huge-page pool's region
+ // size, so try to serve the buffer from the preallocated pool first.
+ // function-local static: the pool is created once, on first use.
+ static ExplicitHugePagePool<LUCKY_BUFFER_SIZE> hp_pool{
+ cct->_conf->bdev_read_preallocated_huge_buffer_num
+ };
+ if (auto lucky_raw = hp_pool.try_create(); lucky_raw) {
+ dout(20) << __func__ << " allocated from huge pool"
+ << " lucky_raw.data=" << (void*)lucky_raw->get_data()
+ << " bdev_read_preallocated_huge_buffer_num="
+ << cct->_conf->bdev_read_preallocated_huge_buffer_num
+ << dendl;
+ return lucky_raw;
+ } else {
+ // fallthrough due to empty buffer pool. this can happen also
+ // when the configurable was explicitly set to 0.
+ dout(20) << __func__ << " cannot allocate from huge pool"
+ << " hp_pool.empty_estimation=" << hp_pool.empty_estimation()
+ << " bdev_read_preallocated_huge_buffer_num="
+ << cct->_conf->bdev_read_preallocated_huge_buffer_num
+ << dendl;
+ }
}
+ // fallback: plain aligned allocation honoring the configured alignment.
+ const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
+ dout(20) << __func__ << " with the custom alignment;"
+ << " len=" << len
+ << " custom_alignment=" << custom_alignment
+ << dendl;
+ return ceph::buffer::create_aligned(len, custom_alignment);
}
int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
auto start1 = mono_clock::now();
- auto p = ceph::buffer::ptr_node::create(create_custom_aligned(cct, len));
+ // now a member call (reaches cct itself); may serve the buffer from the
+ // preallocated huge-page pool.
+ auto p = ceph::buffer::ptr_node::create(create_custom_aligned(len));
int r = ::pread(choose_fd(buffered, WRITE_LIFE_NOT_SET),
p->c_str(), len, off);
auto age = cct->_conf->bdev_debug_aio_log_age;
++ioc->num_pending;
aio_t& aio = ioc->pending_aios.back();
aio.bl.push_back(
+ // same huge-page-pool-capable allocation for the aio read path.
- ceph::buffer::ptr_node::create(create_custom_aligned(cct, len)));
+ ceph::buffer::ptr_node::create(create_custom_aligned(len)));
aio.bl.prepare_iov(&aio.iov);
aio.preadv(off, len);
dout(30) << aio << dendl;