]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Revert "Improve / refactor anonymous mmap capabilities (#10810)"
authorPeter Dillinger <peterd@fb.com>
Mon, 28 Nov 2022 17:34:16 +0000 (09:34 -0800)
committerPeter Dillinger <peterd@fb.com>
Thu, 1 Dec 2022 01:14:01 +0000 (17:14 -0800)
This reverts commit 8367f0d2d76de0f7d096cc65f5f9ebfb907d551a.

.circleci/config.yml
CMakeLists.txt
TARGETS
db/db_test_util.h
db/memtable.cc
memory/arena.cc
memory/arena.h
memory/arena_test.cc
port/mmap.cc [deleted file]
port/mmap.h [deleted file]
src.mk

index bba65c8aa202224b626385098e15698c2c8ce457..d9386aab1df8d9fc1249887d6f61d7f016932d1f 100644 (file)
@@ -76,7 +76,7 @@ commands:
           name: "Test RocksDB"
           shell: powershell.exe
           command: |
-            build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16
+            build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16
   pre-steps-macos:
       steps:
         - pre-steps
index 1c716874a396ccbc6526dae7ad818d8711eb3bdb..dbef059028aa795458454b2b6189aa4cc18c856e 100644 (file)
@@ -796,7 +796,6 @@ set(SOURCES
         options/options.cc
         options/options_helper.cc
         options/options_parser.cc
-        port/mmap.cc
         port/stack_trace.cc
         table/adaptive/adaptive_table_factory.cc
         table/block_based/binary_search_index_reader.cc
diff --git a/TARGETS b/TARGETS
index 762afa35165db58c58d5fc8d317193bc1c189b09..40f0717b3a16df895a622aec7e62258933250198 100644 (file)
--- a/TARGETS
+++ b/TARGETS
@@ -163,7 +163,6 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "options/options.cc",
         "options/options_helper.cc",
         "options/options_parser.cc",
-        "port/mmap.cc",
         "port/port_posix.cc",
         "port/stack_trace.cc",
         "port/win/env_default.cc",
@@ -503,7 +502,6 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
         "options/options.cc",
         "options/options_helper.cc",
         "options/options_parser.cc",
-        "port/mmap.cc",
         "port/port_posix.cc",
         "port/stack_trace.cc",
         "port/win/env_default.cc",
index c6f740c92c1299f58c014ed41218d2dea415d73b..29d5cd9d743cd8546be5cb09482a1a7a31dab581 100644 (file)
@@ -49,9 +49,6 @@
 #include "util/string_util.h"
 #include "utilities/merge_operators.h"
 
-// In case defined by Windows headers
-#undef small
-
 namespace ROCKSDB_NAMESPACE {
 class MockEnv;
 
index 98b86446882c09a97f6826b5cb315f148fa9dd05..45b139e80989258ece7a506c97ebd775830f5fca 100644 (file)
@@ -76,7 +76,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
     : comparator_(cmp),
       moptions_(ioptions, mutable_cf_options),
       refs_(0),
-      kArenaBlockSize(Arena::OptimizeBlockSize(moptions_.arena_block_size)),
+      kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
       mem_tracker_(write_buffer_manager),
       arena_(moptions_.arena_block_size,
              (write_buffer_manager != nullptr &&
index d42868edf04514d0d563656b53494990ebfb33f9..10b8969b4d71c70556a0f119db9031e05c8c1a00 100644 (file)
@@ -8,7 +8,9 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "memory/arena.h"
-
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
 #include <algorithm>
 
 #include "logging/logging.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-size_t Arena::OptimizeBlockSize(size_t block_size) {
+// MSVC complains that it is already defined since it is static in the header.
+#ifndef _MSC_VER
+const size_t Arena::kInlineSize;
+#endif
+
+const size_t Arena::kMinBlockSize = 4096;
+const size_t Arena::kMaxBlockSize = 2u << 30;
+static const int kAlignUnit = alignof(max_align_t);
+
+size_t OptimizeBlockSize(size_t block_size) {
   // Make sure block_size is in optimal range
   block_size = std::max(Arena::kMinBlockSize, block_size);
   block_size = std::min(Arena::kMaxBlockSize, block_size);
@@ -42,12 +53,14 @@ Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size)
   blocks_memory_ += alloc_bytes_remaining_;
   aligned_alloc_ptr_ = inline_block_;
   unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_;
-  if (MemMapping::kHugePageSupported) {
-    hugetlb_size_ = huge_page_size;
-    if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
-      hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
-    }
+#ifdef MAP_HUGETLB
+  hugetlb_size_ = huge_page_size;
+  if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
+    hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
   }
+#else
+  (void)huge_page_size;
+#endif
   if (tracker_ != nullptr) {
     tracker_->Allocate(kInlineSize);
   }
@@ -58,6 +71,21 @@ Arena::~Arena() {
     assert(tracker_->is_freed());
     tracker_->FreeMem();
   }
+  for (const auto& block : blocks_) {
+    delete[] block;
+  }
+
+#ifdef MAP_HUGETLB
+  for (const auto& mmap_info : huge_blocks_) {
+    if (mmap_info.addr_ == nullptr) {
+      continue;
+    }
+    auto ret = munmap(mmap_info.addr_, mmap_info.length_);
+    if (ret != 0) {
+      // TODO(sdong): Better handling
+    }
+  }
+#endif
 }
 
 char* Arena::AllocateFallback(size_t bytes, bool aligned) {
@@ -71,10 +99,12 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
   // We waste the remaining space in the current block.
   size_t size = 0;
   char* block_head = nullptr;
-  if (MemMapping::kHugePageSupported && hugetlb_size_ > 0) {
+#ifdef MAP_HUGETLB
+  if (hugetlb_size_) {
     size = hugetlb_size_;
     block_head = AllocateFromHugePage(size);
   }
+#endif
   if (!block_head) {
     size = kBlockSize;
     block_head = AllocateNewBlock(size);
@@ -93,22 +123,45 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
 }
 
 char* Arena::AllocateFromHugePage(size_t bytes) {
-  MemMapping mm = MemMapping::AllocateHuge(bytes);
-  auto addr = static_cast<char*>(mm.Get());
-  if (addr) {
-    huge_blocks_.push_back(std::move(mm));
-    blocks_memory_ += bytes;
-    if (tracker_ != nullptr) {
-      tracker_->Allocate(bytes);
-    }
+#ifdef MAP_HUGETLB
+  if (hugetlb_size_ == 0) {
+    return nullptr;
+  }
+  // Reserve space in `huge_blocks_` before calling `mmap`.
+  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
+  // own memory and do fewer reallocations.
+  //
+  // - If `emplace_back` throws, no memory leaks because we haven't called
+  //   `mmap` yet.
+  // - If `mmap` throws, no memory leaks because the vector will be cleaned up
+  //   via RAII.
+  huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */);
+
+  void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
+                    (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0);
+
+  if (addr == MAP_FAILED) {
+    return nullptr;
+  }
+  huge_blocks_.back() = MmapInfo(addr, bytes);
+  blocks_memory_ += bytes;
+  if (tracker_ != nullptr) {
+    tracker_->Allocate(bytes);
   }
-  return addr;
+  return reinterpret_cast<char*>(addr);
+#else
+  (void)bytes;
+  return nullptr;
+#endif
 }
 
 char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
                              Logger* logger) {
-  if (MemMapping::kHugePageSupported && hugetlb_size_ > 0 &&
-      huge_page_size > 0 && bytes > 0) {
+  assert((kAlignUnit & (kAlignUnit - 1)) ==
+         0);  // Pointer size should be a power of 2
+
+#ifdef MAP_HUGETLB
+  if (huge_page_size > 0 && bytes > 0) {
     // Allocate from a huge page TLB table.
     size_t reserved_size =
         ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
@@ -124,6 +177,10 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
       return addr;
     }
   }
+#else
+  (void)huge_page_size;
+  (void)logger;
+#endif
 
   size_t current_mod =
       reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
@@ -143,10 +200,17 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
 }
 
 char* Arena::AllocateNewBlock(size_t block_bytes) {
-  auto uniq = std::make_unique<char[]>(block_bytes);
-  char* block = uniq.get();
-  blocks_.push_back(std::move(uniq));
-
+  // Reserve space in `blocks_` before allocating memory via new.
+  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
+  // own memory and do fewer reallocations.
+  //
+  // - If `emplace_back` throws, no memory leaks because we haven't called `new`
+  //   yet.
+  // - If `new` throws, no memory leaks because the vector will be cleaned up
+  //   via RAII.
+  blocks_.emplace_back(nullptr);
+
+  char* block = new char[block_bytes];
   size_t allocated_size;
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
   allocated_size = malloc_usable_size(block);
@@ -163,6 +227,7 @@ char* Arena::AllocateNewBlock(size_t block_bytes) {
   if (tracker_ != nullptr) {
     tracker_->Allocate(allocated_size);
   }
+  blocks_.back() = block;
   return block;
 }
 
index 39399aa71b4f5ffb850c82453b0d824768ffa5b1..1de04c4770ed0440164bb9dafa334dd18ef19fc8 100644 (file)
 // size, it uses malloc to directly get the requested size.
 
 #pragma once
-
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
+#include <assert.h>
+#include <stdint.h>
+#include <cerrno>
 #include <cstddef>
-#include <deque>
-
+#include <vector>
 #include "memory/allocator.h"
-#include "port/mmap.h"
-#include "rocksdb/env.h"
+#include "util/mutexlock.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -28,13 +31,9 @@ class Arena : public Allocator {
   Arena(const Arena&) = delete;
   void operator=(const Arena&) = delete;
 
-  static constexpr size_t kInlineSize = 2048;
-  static constexpr size_t kMinBlockSize = 4096;
-  static constexpr size_t kMaxBlockSize = 2u << 30;
-
-  static constexpr unsigned kAlignUnit = alignof(std::max_align_t);
-  static_assert((kAlignUnit & (kAlignUnit - 1)) == 0,
-                "Pointer size should be power of 2");
+  static const size_t kInlineSize = 2048;
+  static const size_t kMinBlockSize;
+  static const size_t kMaxBlockSize;
 
   // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the
   // supported hugepage size of the system), block allocation will try huge
@@ -64,7 +63,7 @@ class Arena : public Allocator {
   // by the arena (exclude the space allocated but not yet used for future
   // allocations).
   size_t ApproximateMemoryUsage() const {
-    return blocks_memory_ + blocks_.size() * sizeof(char*) -
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
            alloc_bytes_remaining_;
   }
 
@@ -82,19 +81,21 @@ class Arena : public Allocator {
     return blocks_.empty() && huge_blocks_.empty();
   }
 
-  // check and adjust the block_size so that the return value is
-  //  1. in the range of [kMinBlockSize, kMaxBlockSize].
-  //  2. the multiple of align unit.
-  static size_t OptimizeBlockSize(size_t block_size);
-
  private:
-  alignas(std::max_align_t) char inline_block_[kInlineSize];
+  char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t))));
   // Number of bytes allocated in one block
   const size_t kBlockSize;
-  // Allocated memory blocks
-  std::deque<std::unique_ptr<char[]>> blocks_;
-  // Huge page allocations
-  std::deque<MemMapping> huge_blocks_;
+  // Array of new[] allocated memory blocks
+  using Blocks = std::vector<char*>;
+  Blocks blocks_;
+
+  struct MmapInfo {
+    void* addr_;
+    size_t length_;
+
+    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
+  };
+  std::vector<MmapInfo> huge_blocks_;
   size_t irregular_block_num = 0;
 
   // Stats for current active block.
@@ -107,15 +108,15 @@ class Arena : public Allocator {
   // How many bytes left in currently active block?
   size_t alloc_bytes_remaining_ = 0;
 
+#ifdef MAP_HUGETLB
   size_t hugetlb_size_ = 0;
-
+#endif  // MAP_HUGETLB
   char* AllocateFromHugePage(size_t bytes);
   char* AllocateFallback(size_t bytes, bool aligned);
   char* AllocateNewBlock(size_t block_bytes);
 
   // Bytes of memory in blocks allocated so far
   size_t blocks_memory_ = 0;
-  // Non-owned
   AllocTracker* tracker_;
 };
 
@@ -132,4 +133,9 @@ inline char* Arena::Allocate(size_t bytes) {
   return AllocateFallback(bytes, false /* unaligned */);
 }
 
+// check and adjust the block_size so that the return value is
+//  1. in the range of [kMinBlockSize, kMaxBlockSize].
+//  2. the multiple of align unit.
+extern size_t OptimizeBlockSize(size_t block_size);
+
 }  // namespace ROCKSDB_NAMESPACE
index 30887c23a06f92a51c86f3459ff4ed6636514509..96e69a932b6e9ae60e2cf43f646995bbeefbbd78 100644 (file)
@@ -8,11 +8,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "memory/arena.h"
-
-#ifndef OS_WIN
-#include <sys/resource.h>
-#endif
-#include "port/port.h"
 #include "test_util/testharness.h"
 #include "util/random.h"
 
@@ -201,61 +196,6 @@ TEST_F(ArenaTest, Simple) {
   SimpleTest(0);
   SimpleTest(kHugePageSize);
 }
-
-// Number of minor page faults since last call
-size_t PopMinorPageFaultCount() {
-#ifdef RUSAGE_SELF
-  static long prev = 0;
-  struct rusage usage;
-  EXPECT_EQ(getrusage(RUSAGE_SELF, &usage), 0);
-  size_t rv = usage.ru_minflt - prev;
-  prev = usage.ru_minflt;
-  return rv;
-#else
-  // Conservative
-  return SIZE_MAX;
-#endif  // RUSAGE_SELF
-}
-
-TEST(MmapTest, AllocateLazyZeroed) {
-  // Doesn't have to be page aligned
-  constexpr size_t len = 1234567;
-  MemMapping m = MemMapping::AllocateLazyZeroed(len);
-  auto arr = static_cast<char*>(m.Get());
-
-  // Should generally work
-  ASSERT_NE(arr, nullptr);
-
-  // Start counting page faults
-  PopMinorPageFaultCount();
-
-  // Access half of the allocation
-  size_t i = 0;
-  for (; i < len / 2; ++i) {
-    ASSERT_EQ(arr[i], 0);
-    arr[i] = static_cast<char>(i & 255);
-  }
-
-  // Appropriate page faults (maybe more)
-  size_t faults = PopMinorPageFaultCount();
-  ASSERT_GE(faults, len / 2 / port::kPageSize);
-
-  // Access rest of the allocation
-  for (; i < len; ++i) {
-    ASSERT_EQ(arr[i], 0);
-    arr[i] = static_cast<char>(i & 255);
-  }
-
-  // Appropriate page faults (maybe more)
-  faults = PopMinorPageFaultCount();
-  ASSERT_GE(faults, len / 2 / port::kPageSize);
-
-  // Verify data
-  for (i = 0; i < len; ++i) {
-    ASSERT_EQ(arr[i], static_cast<char>(i & 255));
-  }
-}
-
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/port/mmap.cc b/port/mmap.cc
deleted file mode 100644 (file)
index 36e8f32..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#include "port/mmap.h"
-
-#include <cassert>
-#include <cstdio>
-#include <cstring>
-#include <new>
-#include <utility>
-
-#include "util/hash.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-MemMapping::~MemMapping() {
-#ifdef OS_WIN
-  if (addr_ != nullptr) {
-    (void)::UnmapViewOfFile(addr_);
-  }
-  if (page_file_handle_ != NULL) {
-    (void)::CloseHandle(page_file_handle_);
-  }
-#else   // OS_WIN -> !OS_WIN
-  if (addr_ != nullptr) {
-    auto status = munmap(addr_, length_);
-    assert(status == 0);
-    if (status != 0) {
-      // TODO: handle error?
-    }
-  }
-#endif  // OS_WIN
-}
-
-MemMapping::MemMapping(MemMapping&& other) noexcept {
-  *this = std::move(other);
-}
-
-MemMapping& MemMapping::operator=(MemMapping&& other) noexcept {
-  if (&other == this) {
-    return *this;
-  }
-  this->~MemMapping();
-  std::memcpy(this, &other, sizeof(*this));
-  new (&other) MemMapping();
-  return *this;
-}
-
-MemMapping MemMapping::AllocateAnonymous(size_t length, bool huge) {
-  MemMapping mm;
-  mm.length_ = length;
-  assert(mm.addr_ == nullptr);
-  if (length == 0) {
-    // OK to leave addr as nullptr
-    return mm;
-  }
-  int huge_flag = 0;
-#ifdef OS_WIN
-  if (huge) {
-#ifdef FILE_MAP_LARGE_PAGES
-    huge_flag = FILE_MAP_LARGE_PAGES;
-#endif  // FILE_MAP_LARGE_PAGES
-  }
-  mm.page_file_handle_ = ::CreateFileMapping(
-      INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE | SEC_COMMIT,
-      Upper32of64(length), Lower32of64(length), nullptr);
-  if (mm.page_file_handle_ == NULL) {
-    // Failure
-    return mm;
-  }
-  mm.addr_ = ::MapViewOfFile(mm.page_file_handle_, FILE_MAP_WRITE | huge_flag,
-                             0, 0, length);
-#else  // OS_WIN -> !OS_WIN
-  if (huge) {
-#ifdef MAP_HUGETLB
-    huge_flag = MAP_HUGETLB;
-#endif  // MAP_HUGE_TLB
-  }
-  mm.addr_ = mmap(nullptr, length, PROT_READ | PROT_WRITE,
-                  MAP_PRIVATE | MAP_ANONYMOUS | huge_flag, -1, 0);
-  if (mm.addr_ == MAP_FAILED) {
-    mm.addr_ = nullptr;
-  }
-#endif  // OS_WIN
-  return mm;
-}
-
-MemMapping MemMapping::AllocateHuge(size_t length) {
-  return AllocateAnonymous(length, /*huge*/ true);
-}
-
-MemMapping MemMapping::AllocateLazyZeroed(size_t length) {
-  return AllocateAnonymous(length, /*huge*/ false);
-}
-
-}  // namespace ROCKSDB_NAMESPACE
diff --git a/port/mmap.h b/port/mmap.h
deleted file mode 100644 (file)
index 720b670..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#pragma once
-
-#ifdef OS_WIN
-#include <windows.h>
-// ^^^ Must come first
-#include <memoryapi.h>
-#else
-#include <sys/mman.h>
-#endif  // OS_WIN
-
-#include <cstdint>
-
-#include "rocksdb/rocksdb_namespace.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-// An RAII wrapper for mmaped memory
-class MemMapping {
- public:
-  static constexpr bool kHugePageSupported =
-#if defined(MAP_HUGETLB) || defined(FILE_MAP_LARGE_PAGES)
-      true;
-#else
-      false;
-#endif
-
-  // Allocate memory requesting to be backed by huge pages
-  static MemMapping AllocateHuge(size_t length);
-
-  // Allocate memory that is only lazily mapped to resident memory and
-  // guaranteed to be zero-initialized. Note that some platforms like
-  // Linux allow memory over-commit, where only the used portion of memory
-  // matters, while other platforms require enough swap space (page file) to
-  // back the full mapping.
-  static MemMapping AllocateLazyZeroed(size_t length);
-
-  // No copies
-  MemMapping(const MemMapping&) = delete;
-  MemMapping& operator=(const MemMapping&) = delete;
-  // Move
-  MemMapping(MemMapping&&) noexcept;
-  MemMapping& operator=(MemMapping&&) noexcept;
-
-  // Releases the mapping
-  ~MemMapping();
-
-  inline void* Get() const { return addr_; }
-  inline size_t Length() const { return length_; }
-
- private:
-  MemMapping() {}
-
-  // The mapped memory, or nullptr on failure / not supported
-  void* addr_ = nullptr;
-  // The known usable number of bytes starting at that address
-  size_t length_ = 0;
-
-#ifdef OS_WIN
-  HANDLE page_file_handle_ = NULL;
-#endif  // OS_WIN
-
-  static MemMapping AllocateAnonymous(size_t length, bool huge);
-};
-
-}  // namespace ROCKSDB_NAMESPACE
diff --git a/src.mk b/src.mk
index 0269fe1d33236e3acea4a02a66df9eb7f2d4fca2..f955efc6014320c5e73ff21ed3dd080083e7a741 100644 (file)
--- a/src.mk
+++ b/src.mk
@@ -154,7 +154,6 @@ LIB_SOURCES =                                                   \
   options/options.cc                                            \
   options/options_helper.cc                                     \
   options/options_parser.cc                                     \
-  port/mmap.cc                                                  \
   port/port_posix.cc                                            \
   port/win/env_default.cc                                       \
   port/win/env_win.cc                                           \