From: Peter Dillinger Date: Mon, 28 Nov 2022 17:34:16 +0000 (-0800) Subject: Revert "Improve / refactor anonymous mmap capabilities (#10810)" X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=62f3e58936d0cd4abdd0911f6aaccefe0b5dfa2a;p=rocksdb.git Revert "Improve / refactor anonymous mmap capabilities (#10810)" This reverts commit 8367f0d2d76de0f7d096cc65f5f9ebfb907d551a. --- diff --git a/.circleci/config.yml b/.circleci/config.yml index bba65c8a..d9386aab 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -76,7 +76,7 @@ commands: name: "Test RocksDB" shell: powershell.exe command: | - build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 + build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 pre-steps-macos: steps: - pre-steps diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c716874..dbef0590 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -796,7 +796,6 @@ set(SOURCES options/options.cc options/options_helper.cc options/options_parser.cc - port/mmap.cc port/stack_trace.cc table/adaptive/adaptive_table_factory.cc table/block_based/binary_search_index_reader.cc diff --git a/TARGETS b/TARGETS index 762afa35..40f0717b 100644 --- a/TARGETS +++ b/TARGETS @@ -163,7 +163,6 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "options/options.cc", "options/options_helper.cc", "options/options_parser.cc", - "port/mmap.cc", "port/port_posix.cc", "port/stack_trace.cc", "port/win/env_default.cc", @@ -503,7 +502,6 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ "options/options.cc", "options/options_helper.cc", "options/options_parser.cc", - "port/mmap.cc", "port/port_posix.cc", "port/stack_trace.cc", "port/win/env_default.cc", diff --git a/db/db_test_util.h b/db/db_test_util.h index c6f740c9..29d5cd9d 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -49,9 +49,6 @@ #include "util/string_util.h" #include "utilities/merge_operators.h" -// In case defined by Windows headers -#undef small - namespace ROCKSDB_NAMESPACE { class MockEnv; diff --git a/db/memtable.cc b/db/memtable.cc index 98b86446..45b139e8 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -76,7 +76,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, : comparator_(cmp), moptions_(ioptions, mutable_cf_options), refs_(0), - kArenaBlockSize(Arena::OptimizeBlockSize(moptions_.arena_block_size)), + kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), mem_tracker_(write_buffer_manager), arena_(moptions_.arena_block_size, (write_buffer_manager != nullptr && diff --git a/memory/arena.cc b/memory/arena.cc index d42868ed..10b8969b 100644 --- a/memory/arena.cc +++ b/memory/arena.cc @@ -8,7 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "memory/arena.h" - +#ifndef OS_WIN +#include +#endif #include #include "logging/logging.h" @@ -20,7 +22,16 @@ namespace ROCKSDB_NAMESPACE { -size_t Arena::OptimizeBlockSize(size_t block_size) { +// MSVC complains that it is already defined since it is static in the header. +#ifndef _MSC_VER +const size_t Arena::kInlineSize; +#endif + +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2u << 30; +static const int kAlignUnit = alignof(max_align_t); + +size_t OptimizeBlockSize(size_t block_size) { // Make sure block_size is in optimal range block_size = std::max(Arena::kMinBlockSize, block_size); block_size = std::min(Arena::kMaxBlockSize, block_size); @@ -42,12 +53,14 @@ Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size) blocks_memory_ += alloc_bytes_remaining_; aligned_alloc_ptr_ = inline_block_; unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_; - if (MemMapping::kHugePageSupported) { - hugetlb_size_ = huge_page_size; - if (hugetlb_size_ && kBlockSize > hugetlb_size_) { - hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_; - } +#ifdef MAP_HUGETLB + hugetlb_size_ = huge_page_size; + if (hugetlb_size_ && kBlockSize > hugetlb_size_) { + hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_; } +#else + (void)huge_page_size; +#endif if (tracker_ != nullptr) { tracker_->Allocate(kInlineSize); } @@ -58,6 +71,21 @@ Arena::~Arena() { assert(tracker_->is_freed()); tracker_->FreeMem(); } + for (const auto& block : blocks_) { + delete[] block; + } + +#ifdef MAP_HUGETLB + for (const auto& mmap_info : huge_blocks_) { + if (mmap_info.addr_ == nullptr) { + continue; + } + auto ret = munmap(mmap_info.addr_, mmap_info.length_); + if (ret != 0) { + // TODO(sdong): Better handling + } + } +#endif } char* Arena::AllocateFallback(size_t bytes, bool aligned) { @@ -71,10 +99,12 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) { // We waste the remaining space in the current block. size_t size = 0; char* block_head = nullptr; - if (MemMapping::kHugePageSupported && hugetlb_size_ > 0) { +#ifdef MAP_HUGETLB + if (hugetlb_size_) { size = hugetlb_size_; block_head = AllocateFromHugePage(size); } +#endif if (!block_head) { size = kBlockSize; block_head = AllocateNewBlock(size); @@ -93,22 +123,45 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) { } char* Arena::AllocateFromHugePage(size_t bytes) { - MemMapping mm = MemMapping::AllocateHuge(bytes); - auto addr = static_cast(mm.Get()); - if (addr) { - huge_blocks_.push_back(std::move(mm)); - blocks_memory_ += bytes; - if (tracker_ != nullptr) { - tracker_->Allocate(bytes); - } +#ifdef MAP_HUGETLB + if (hugetlb_size_ == 0) { + return nullptr; + } + // Reserve space in `huge_blocks_` before calling `mmap`. + // Use `emplace_back()` instead of `reserve()` to let std::vector manage its + // own memory and do fewer reallocations. + // + // - If `emplace_back` throws, no memory leaks because we haven't called + // `mmap` yet. + // - If `mmap` throws, no memory leaks because the vector will be cleaned up + // via RAII. + huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */); + + void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE), + (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0); + + if (addr == MAP_FAILED) { + return nullptr; + } + huge_blocks_.back() = MmapInfo(addr, bytes); + blocks_memory_ += bytes; + if (tracker_ != nullptr) { + tracker_->Allocate(bytes); } - return addr; + return reinterpret_cast(addr); +#else + (void)bytes; + return nullptr; +#endif } char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, Logger* logger) { - if (MemMapping::kHugePageSupported && hugetlb_size_ > 0 && - huge_page_size > 0 && bytes > 0) { + assert((kAlignUnit & (kAlignUnit - 1)) == + 0); // Pointer size should be a power of 2 + +#ifdef MAP_HUGETLB + if (huge_page_size > 0 && bytes > 0) { // Allocate from a huge page TLB table. size_t reserved_size = ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; @@ -124,6 +177,10 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, return addr; } } +#else + (void)huge_page_size; + (void)logger; +#endif size_t current_mod = reinterpret_cast(aligned_alloc_ptr_) & (kAlignUnit - 1); @@ -143,10 +200,17 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, } char* Arena::AllocateNewBlock(size_t block_bytes) { - auto uniq = std::make_unique(block_bytes); - char* block = uniq.get(); - blocks_.push_back(std::move(uniq)); - + // Reserve space in `blocks_` before allocating memory via new. + // Use `emplace_back()` instead of `reserve()` to let std::vector manage its + // own memory and do fewer reallocations. + // + // - If `emplace_back` throws, no memory leaks because we haven't called `new` + // yet. + // - If `new` throws, no memory leaks because the vector will be cleaned up + // via RAII. + blocks_.emplace_back(nullptr); + + char* block = new char[block_bytes]; size_t allocated_size; #ifdef ROCKSDB_MALLOC_USABLE_SIZE allocated_size = malloc_usable_size(block); @@ -163,6 +227,7 @@ char* Arena::AllocateNewBlock(size_t block_bytes) { if (tracker_ != nullptr) { tracker_->Allocate(allocated_size); } + blocks_.back() = block; return block; } diff --git a/memory/arena.h b/memory/arena.h index 39399aa7..1de04c47 100644 --- a/memory/arena.h +++ b/memory/arena.h @@ -12,13 +12,16 @@ // size, it uses malloc to directly get the requested size. #pragma once - +#ifndef OS_WIN +#include +#endif +#include +#include +#include #include -#include - +#include #include "memory/allocator.h" -#include "port/mmap.h" -#include "rocksdb/env.h" +#include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -28,13 +31,9 @@ class Arena : public Allocator { Arena(const Arena&) = delete; void operator=(const Arena&) = delete; - static constexpr size_t kInlineSize = 2048; - static constexpr size_t kMinBlockSize = 4096; - static constexpr size_t kMaxBlockSize = 2u << 30; - - static constexpr unsigned kAlignUnit = alignof(std::max_align_t); - static_assert((kAlignUnit & (kAlignUnit - 1)) == 0, - "Pointer size should be power of 2"); + static const size_t kInlineSize = 2048; + static const size_t kMinBlockSize; + static const size_t kMaxBlockSize; // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the // supported hugepage size of the system), block allocation will try huge @@ -64,7 +63,7 @@ class Arena : public Allocator { // by the arena (exclude the space allocated but not yet used for future // allocations). size_t ApproximateMemoryUsage() const { - return blocks_memory_ + blocks_.size() * sizeof(char*) - + return blocks_memory_ + blocks_.capacity() * sizeof(char*) - alloc_bytes_remaining_; } @@ -82,19 +81,21 @@ class Arena : public Allocator { return blocks_.empty() && huge_blocks_.empty(); } - // check and adjust the block_size so that the return value is - // 1. in the range of [kMinBlockSize, kMaxBlockSize]. - // 2. the multiple of align unit. - static size_t OptimizeBlockSize(size_t block_size); - private: - alignas(std::max_align_t) char inline_block_[kInlineSize]; + char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t)))); // Number of bytes allocated in one block const size_t kBlockSize; - // Allocated memory blocks - std::deque> blocks_; - // Huge page allocations - std::deque huge_blocks_; + // Array of new[] allocated memory blocks + using Blocks = std::vector; + Blocks blocks_; + + struct MmapInfo { + void* addr_; + size_t length_; + + MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {} + }; + std::vector huge_blocks_; size_t irregular_block_num = 0; // Stats for current active block. @@ -107,15 +108,15 @@ class Arena : public Allocator { // How many bytes left in currently active block? size_t alloc_bytes_remaining_ = 0; +#ifdef MAP_HUGETLB size_t hugetlb_size_ = 0; - +#endif // MAP_HUGETLB char* AllocateFromHugePage(size_t bytes); char* AllocateFallback(size_t bytes, bool aligned); char* AllocateNewBlock(size_t block_bytes); // Bytes of memory in blocks allocated so far size_t blocks_memory_ = 0; - // Non-owned AllocTracker* tracker_; }; @@ -132,4 +133,9 @@ inline char* Arena::Allocate(size_t bytes) { return AllocateFallback(bytes, false /* unaligned */); } +// check and adjust the block_size so that the return value is +// 1. in the range of [kMinBlockSize, kMaxBlockSize]. +// 2. the multiple of align unit. +extern size_t OptimizeBlockSize(size_t block_size); + } // namespace ROCKSDB_NAMESPACE diff --git a/memory/arena_test.cc b/memory/arena_test.cc index 30887c23..96e69a93 100644 --- a/memory/arena_test.cc +++ b/memory/arena_test.cc @@ -8,11 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "memory/arena.h" - -#ifndef OS_WIN -#include -#endif -#include "port/port.h" #include "test_util/testharness.h" #include "util/random.h" @@ -201,61 +196,6 @@ TEST_F(ArenaTest, Simple) { SimpleTest(0); SimpleTest(kHugePageSize); } - -// Number of minor page faults since last call -size_t PopMinorPageFaultCount() { -#ifdef RUSAGE_SELF - static long prev = 0; - struct rusage usage; - EXPECT_EQ(getrusage(RUSAGE_SELF, &usage), 0); - size_t rv = usage.ru_minflt - prev; - prev = usage.ru_minflt; - return rv; -#else - // Conservative - return SIZE_MAX; -#endif // RUSAGE_SELF -} - -TEST(MmapTest, AllocateLazyZeroed) { - // Doesn't have to be page aligned - constexpr size_t len = 1234567; - MemMapping m = MemMapping::AllocateLazyZeroed(len); - auto arr = static_cast(m.Get()); - - // Should generally work - ASSERT_NE(arr, nullptr); - - // Start counting page faults - PopMinorPageFaultCount(); - - // Access half of the allocation - size_t i = 0; - for (; i < len / 2; ++i) { - ASSERT_EQ(arr[i], 0); - arr[i] = static_cast(i & 255); - } - - // Appropriate page faults (maybe more) - size_t faults = PopMinorPageFaultCount(); - ASSERT_GE(faults, len / 2 / port::kPageSize); - - // Access rest of the allocation - for (; i < len; ++i) { - ASSERT_EQ(arr[i], 0); - arr[i] = static_cast(i & 255); - } - - // Appropriate page faults (maybe more) - faults = PopMinorPageFaultCount(); - ASSERT_GE(faults, len / 2 / port::kPageSize); - - // Verify data - for (i = 0; i < len; ++i) { - ASSERT_EQ(arr[i], static_cast(i & 255)); - } -} - } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/port/mmap.cc b/port/mmap.cc deleted file mode 100644 index 36e8f326..00000000 --- a/port/mmap.cc +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#include "port/mmap.h" - -#include -#include -#include -#include -#include - -#include "util/hash.h" - -namespace ROCKSDB_NAMESPACE { - -MemMapping::~MemMapping() { -#ifdef OS_WIN - if (addr_ != nullptr) { - (void)::UnmapViewOfFile(addr_); - } - if (page_file_handle_ != NULL) { - (void)::CloseHandle(page_file_handle_); - } -#else // OS_WIN -> !OS_WIN - if (addr_ != nullptr) { - auto status = munmap(addr_, length_); - assert(status == 0); - if (status != 0) { - // TODO: handle error? - } - } -#endif // OS_WIN -} - -MemMapping::MemMapping(MemMapping&& other) noexcept { - *this = std::move(other); -} - -MemMapping& MemMapping::operator=(MemMapping&& other) noexcept { - if (&other == this) { - return *this; - } - this->~MemMapping(); - std::memcpy(this, &other, sizeof(*this)); - new (&other) MemMapping(); - return *this; -} - -MemMapping MemMapping::AllocateAnonymous(size_t length, bool huge) { - MemMapping mm; - mm.length_ = length; - assert(mm.addr_ == nullptr); - if (length == 0) { - // OK to leave addr as nullptr - return mm; - } - int huge_flag = 0; -#ifdef OS_WIN - if (huge) { -#ifdef FILE_MAP_LARGE_PAGES - huge_flag = FILE_MAP_LARGE_PAGES; -#endif // FILE_MAP_LARGE_PAGES - } - mm.page_file_handle_ = ::CreateFileMapping( - INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE | SEC_COMMIT, - Upper32of64(length), Lower32of64(length), nullptr); - if (mm.page_file_handle_ == NULL) { - // Failure - return mm; - } - mm.addr_ = ::MapViewOfFile(mm.page_file_handle_, FILE_MAP_WRITE | huge_flag, - 0, 0, length); -#else // OS_WIN -> !OS_WIN - if (huge) { -#ifdef MAP_HUGETLB - huge_flag = MAP_HUGETLB; -#endif // MAP_HUGE_TLB - } - mm.addr_ = mmap(nullptr, length, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | huge_flag, -1, 0); - if (mm.addr_ == MAP_FAILED) { - mm.addr_ = nullptr; - } -#endif // OS_WIN - return mm; -} - -MemMapping MemMapping::AllocateHuge(size_t length) { - return AllocateAnonymous(length, /*huge*/ true); -} - -MemMapping MemMapping::AllocateLazyZeroed(size_t length) { - return AllocateAnonymous(length, /*huge*/ false); -} - -} // namespace ROCKSDB_NAMESPACE diff --git a/port/mmap.h b/port/mmap.h deleted file mode 100644 index 720b6708..00000000 --- a/port/mmap.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#ifdef OS_WIN -#include -// ^^^ Must come first -#include -#else -#include -#endif // OS_WIN - -#include - -#include "rocksdb/rocksdb_namespace.h" - -namespace ROCKSDB_NAMESPACE { - -// An RAII wrapper for mmaped memory -class MemMapping { - public: - static constexpr bool kHugePageSupported = -#if defined(MAP_HUGETLB) || defined(FILE_MAP_LARGE_PAGES) - true; -#else - false; -#endif - - // Allocate memory requesting to be backed by huge pages - static MemMapping AllocateHuge(size_t length); - - // Allocate memory that is only lazily mapped to resident memory and - // guaranteed to be zero-initialized. Note that some platforms like - // Linux allow memory over-commit, where only the used portion of memory - // matters, while other platforms require enough swap space (page file) to - // back the full mapping. - static MemMapping AllocateLazyZeroed(size_t length); - - // No copies - MemMapping(const MemMapping&) = delete; - MemMapping& operator=(const MemMapping&) = delete; - // Move - MemMapping(MemMapping&&) noexcept; - MemMapping& operator=(MemMapping&&) noexcept; - - // Releases the mapping - ~MemMapping(); - - inline void* Get() const { return addr_; } - inline size_t Length() const { return length_; } - - private: - MemMapping() {} - - // The mapped memory, or nullptr on failure / not supported - void* addr_ = nullptr; - // The known usable number of bytes starting at that address - size_t length_ = 0; - -#ifdef OS_WIN - HANDLE page_file_handle_ = NULL; -#endif // OS_WIN - - static MemMapping AllocateAnonymous(size_t length, bool huge); -}; - -} // namespace ROCKSDB_NAMESPACE diff --git a/src.mk b/src.mk index 0269fe1d..f955efc6 100644 --- a/src.mk +++ b/src.mk @@ -154,7 +154,6 @@ LIB_SOURCES = \ options/options.cc \ options/options_helper.cc \ options/options_parser.cc \ - port/mmap.cc \ port/port_posix.cc \ port/win/env_default.cc \ port/win/env_win.cc \