]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Allow allocating dynamic bloom, plain table indexes and hash linked list from huge page TLB
author sdong <siying.d@fb.com>
Fri, 25 Apr 2014 22:45:37 +0000 (15:45 -0700)
committer sdong <siying.d@fb.com>
Wed, 30 Apr 2014 21:57:41 +0000 (14:57 -0700)
Summary: Add an option to allocate a piece of memory from the huge page TLB. Add options to trigger it in dynamic bloom, plain table indexes and the hash linked list hash table.

Test Plan: make all check

Reviewers: haobo, ljin

Reviewed By: haobo

CC: nkg-, dhruba, leveldb, igor, yhchiang
Differential Revision: https://reviews.facebook.net/D18357

Conflicts:
db/plain_table_db_test.cc
util/options.cc

18 files changed:
db/db_test.cc
db/memtable.cc
db/plain_table_db_test.cc
db/prefix_test.cc
include/rocksdb/memtablerep.h
include/rocksdb/options.h
include/rocksdb/table.h
table/plain_table_factory.cc
table/plain_table_factory.h
table/plain_table_reader.cc
table/plain_table_reader.h
util/arena.cc
util/arena.h
util/dynamic_bloom.cc
util/dynamic_bloom.h
util/hash_linklist_rep.cc
util/hash_linklist_rep.h
util/options.cc

index 4a14be7cf66da307afb50294a330a6cabe075e00..5e843dfa813367b31bda0a7cec9e3e5d2b60b990 100644 (file)
@@ -438,7 +438,7 @@ class DBTest {
         break;
       case kHashLinkList:
         options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        options.memtable_factory.reset(NewHashLinkListRepFactory(4));
+        options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
         break;
       case kUniversalCompaction:
         options.compaction_style = kCompactionStyleUniversal;
index f3503fb6576d814252d9a0504ff525dbc4eb4aea..07be2ae1fa27640146e9c6722ffc281bb7d8bba0 100644 (file)
@@ -52,9 +52,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
   // gone wrong already.
   assert(!should_flush_);
   if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
-    prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits,
-                                         options.bloom_locality,
-                                         options.memtable_prefix_bloom_probes));
+    prefix_bloom_.reset(new DynamicBloom(
+        options.memtable_prefix_bloom_bits, options.bloom_locality,
+        options.memtable_prefix_bloom_probes, nullptr,
+        options.memtable_prefix_bloom_huge_page_tlb_size));
   }
 }
 
index b28abdc1dd9bab06813dc11f4075b2886868bc4a..3ec5da64b708585aa6950af08295c5737a90475b 100644 (file)
@@ -188,7 +188,7 @@ class TestPlainTableReader : public PlainTableReader {
                        const Options& options, bool* expect_bloom_not_match)
       : PlainTableReader(options, std::move(file), storage_options, icomparator,
                          file_size, bloom_bits_per_key, hash_table_ratio,
-                         index_sparseness, table_properties),
+                         index_sparseness, table_properties, 2 * 1024 * 1024),
         expect_bloom_not_match_(expect_bloom_not_match) {
     Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
     ASSERT_TRUE(s.ok());
@@ -209,13 +209,12 @@ extern const uint64_t kPlainTableMagicNumber;
 class TestPlainTableFactory : public PlainTableFactory {
  public:
   explicit TestPlainTableFactory(bool* expect_bloom_not_match,
-                                 uint32_t user_key_len =
-                                     kPlainTableVariableLength,
-                                 int bloom_bits_per_key = 0,
-                                 double hash_table_ratio = 0.75,
-                                 size_t index_sparseness = 16)
+                                 uint32_t user_key_len, int bloom_bits_per_key,
+                                 double hash_table_ratio,
+                                 size_t index_sparseness,
+                                 size_t huge_page_tlb_size)
       : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
-                          hash_table_ratio),
+                          index_sparseness, huge_page_tlb_size),
         bloom_bits_per_key_(bloom_bits_per_key),
         hash_table_ratio_(hash_table_ratio),
         index_sparseness_(index_sparseness),
@@ -247,197 +246,208 @@ class TestPlainTableFactory : public PlainTableFactory {
 };
 
 TEST(PlainTableDBTest, Flush) {
-  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
-    for (int total_order = 0; total_order <= 1; total_order++) {
-      Options options = CurrentOptions();
-      options.create_if_missing = true;
-      // Set only one bucket to force bucket conflict.
-      // Test index interval for the same prefix to be 1, 2 and 4
-      if (total_order) {
-        options.table_factory.reset(
-            NewTotalOrderPlainTableFactory(16, bloom_bits, 2));
-      } else {
-        options.table_factory.reset(NewPlainTableFactory(16, bloom_bits));
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
+        if (total_order) {
+          options.table_factory.reset(NewTotalOrderPlainTableFactory(
+              16, bloom_bits, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(NewPlainTableFactory(
+              16, bloom_bits, 0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+
+        ASSERT_OK(Put("1000000000000foo", "v1"));
+        ASSERT_OK(Put("0000000000000bar", "v2"));
+        ASSERT_OK(Put("1000000000000foo", "v3"));
+        dbfull()->TEST_FlushMemTable();
+
+        TablePropertiesCollection ptc;
+        reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+        ASSERT_EQ(1U, ptc.size());
+        auto row = ptc.begin();
+        auto tp = row->second;
+        ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
+                                                "plain_table_hash_table_size"));
+        ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
+                                               "plain_table_sub_index_size"));
+
+        ASSERT_EQ("v3", Get("1000000000000foo"));
+        ASSERT_EQ("v2", Get("0000000000000bar"));
       }
-      DestroyAndReopen(&options);
-
-      ASSERT_OK(Put("1000000000000foo", "v1"));
-      ASSERT_OK(Put("0000000000000bar", "v2"));
-      ASSERT_OK(Put("1000000000000foo", "v3"));
-      dbfull()->TEST_FlushMemTable();
-
-      TablePropertiesCollection ptc;
-      reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
-      ASSERT_EQ(1U, ptc.size());
-      auto row = ptc.begin();
-      auto tp = row->second;
-      ASSERT_EQ(
-          total_order ? "4" : "12",
-          (tp->user_collected_properties).at("plain_table_hash_table_size"));
-      ASSERT_EQ(
-          total_order ? "9" : "0",
-          (tp->user_collected_properties).at("plain_table_sub_index_size"));
-
-      ASSERT_EQ("v3", Get("1000000000000foo"));
-      ASSERT_EQ("v2", Get("0000000000000bar"));
     }
   }
 }
 
 TEST(PlainTableDBTest, Flush2) {
-  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
-    for (int total_order = 0; total_order <= 1; total_order++) {
-      bool expect_bloom_not_match = false;
-      Options options = CurrentOptions();
-      options.create_if_missing = true;
-      // Set only one bucket to force bucket conflict.
-      // Test index interval for the same prefix to be 1, 2 and 4
-      if (total_order) {
-        options.prefix_extractor = nullptr;
-        options.table_factory.reset(new TestPlainTableFactory(
-            &expect_bloom_not_match, 16, bloom_bits, 0, 2));
-      } else {
-        options.table_factory.reset(
-            new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
-      }
-      DestroyAndReopen(&options);
-      ASSERT_OK(Put("0000000000000bar", "b"));
-      ASSERT_OK(Put("1000000000000foo", "v1"));
-      dbfull()->TEST_FlushMemTable();
-
-      ASSERT_OK(Put("1000000000000foo", "v2"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("v2", Get("1000000000000foo"));
-
-      ASSERT_OK(Put("0000000000000eee", "v3"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("v3", Get("0000000000000eee"));
-
-      ASSERT_OK(Delete("0000000000000bar"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
-
-      ASSERT_OK(Put("0000000000000eee", "v5"));
-      ASSERT_OK(Put("9000000000000eee", "v5"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("v5", Get("0000000000000eee"));
-
-      // Test Bloom Filter
-      if (bloom_bits > 0) {
-        // Neither key nor value should exist.
-        expect_bloom_not_match = true;
-        ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
-
-        // Key doesn't exist any more but prefix exists.
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        bool expect_bloom_not_match = false;
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
         if (total_order) {
-          ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
-          ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+          options.prefix_extractor = nullptr;
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+        ASSERT_OK(Put("0000000000000bar", "b"));
+        ASSERT_OK(Put("1000000000000foo", "v1"));
+        dbfull()->TEST_FlushMemTable();
+
+        ASSERT_OK(Put("1000000000000foo", "v2"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v2", Get("1000000000000foo"));
+
+        ASSERT_OK(Put("0000000000000eee", "v3"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v3", Get("0000000000000eee"));
+
+        ASSERT_OK(Delete("0000000000000bar"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+        ASSERT_OK(Put("0000000000000eee", "v5"));
+        ASSERT_OK(Put("9000000000000eee", "v5"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v5", Get("0000000000000eee"));
+
+        // Test Bloom Filter
+        if (bloom_bits > 0) {
+          // Neither key nor value should exist.
+          expect_bloom_not_match = true;
+          ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+
+          // Key doesn't exist any more but prefix exists.
+          if (total_order) {
+            ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+            ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+          }
+          expect_bloom_not_match = false;
         }
-        expect_bloom_not_match = false;
       }
     }
   }
 }
 
 TEST(PlainTableDBTest, Iterator) {
-  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
-    for (int total_order = 0; total_order <= 1; total_order++) {
-      bool expect_bloom_not_match = false;
-      Options options = CurrentOptions();
-      options.create_if_missing = true;
-      // Set only one bucket to force bucket conflict.
-      // Test index interval for the same prefix to be 1, 2 and 4
-      if (total_order) {
-        options.prefix_extractor = nullptr;
-        options.table_factory.reset(new TestPlainTableFactory(
-            &expect_bloom_not_match, 16, bloom_bits, 0, 2));
-      } else {
-        options.table_factory.reset(
-            new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
-      }
-      DestroyAndReopen(&options);
-
-      ASSERT_OK(Put("1000000000foo002", "v_2"));
-      ASSERT_OK(Put("0000000000000bar", "random"));
-      ASSERT_OK(Put("1000000000foo001", "v1"));
-      ASSERT_OK(Put("3000000000000bar", "bar_v"));
-      ASSERT_OK(Put("1000000000foo003", "v__3"));
-      ASSERT_OK(Put("1000000000foo004", "v__4"));
-      ASSERT_OK(Put("1000000000foo005", "v__5"));
-      ASSERT_OK(Put("1000000000foo007", "v__7"));
-      ASSERT_OK(Put("1000000000foo008", "v__8"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("v1", Get("1000000000foo001"));
-      ASSERT_EQ("v__3", Get("1000000000foo003"));
-      Iterator* iter = dbfull()->NewIterator(ro_);
-      iter->Seek("1000000000foo000");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo001", iter->key().ToString());
-      ASSERT_EQ("v1", iter->value().ToString());
-
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo002", iter->key().ToString());
-      ASSERT_EQ("v_2", iter->value().ToString());
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        bool expect_bloom_not_match = false;
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
+        if (total_order) {
+          options.prefix_extractor = nullptr;
+          options.table_factory.reset(new TestPlainTableFactory(
+              &expect_bloom_not_match, 16, bloom_bits, 0, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+
+        ASSERT_OK(Put("1000000000foo002", "v_2"));
+        ASSERT_OK(Put("0000000000000bar", "random"));
+        ASSERT_OK(Put("1000000000foo001", "v1"));
+        ASSERT_OK(Put("3000000000000bar", "bar_v"));
+        ASSERT_OK(Put("1000000000foo003", "v__3"));
+        ASSERT_OK(Put("1000000000foo004", "v__4"));
+        ASSERT_OK(Put("1000000000foo005", "v__5"));
+        ASSERT_OK(Put("1000000000foo007", "v__7"));
+        ASSERT_OK(Put("1000000000foo008", "v__8"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v1", Get("1000000000foo001"));
+        ASSERT_EQ("v__3", Get("1000000000foo003"));
+        Iterator* iter = dbfull()->NewIterator(ro_);
+        iter->Seek("1000000000foo000");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo001", iter->key().ToString());
+        ASSERT_EQ("v1", iter->value().ToString());
 
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo003", iter->key().ToString());
-      ASSERT_EQ("v__3", iter->value().ToString());
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo002", iter->key().ToString());
+        ASSERT_EQ("v_2", iter->value().ToString());
 
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo004", iter->key().ToString());
-      ASSERT_EQ("v__4", iter->value().ToString());
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo003", iter->key().ToString());
+        ASSERT_EQ("v__3", iter->value().ToString());
 
-      iter->Seek("3000000000000bar");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("3000000000000bar", iter->key().ToString());
-      ASSERT_EQ("bar_v", iter->value().ToString());
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo004", iter->key().ToString());
+        ASSERT_EQ("v__4", iter->value().ToString());
 
-      iter->Seek("1000000000foo000");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo001", iter->key().ToString());
-      ASSERT_EQ("v1", iter->value().ToString());
+        iter->Seek("3000000000000bar");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("3000000000000bar", iter->key().ToString());
+        ASSERT_EQ("bar_v", iter->value().ToString());
 
-      iter->Seek("1000000000foo005");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo005", iter->key().ToString());
-      ASSERT_EQ("v__5", iter->value().ToString());
+        iter->Seek("1000000000foo000");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo001", iter->key().ToString());
+        ASSERT_EQ("v1", iter->value().ToString());
 
-      iter->Seek("1000000000foo006");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo007", iter->key().ToString());
-      ASSERT_EQ("v__7", iter->value().ToString());
+        iter->Seek("1000000000foo005");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo005", iter->key().ToString());
+        ASSERT_EQ("v__5", iter->value().ToString());
 
-      iter->Seek("1000000000foo008");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo008", iter->key().ToString());
-      ASSERT_EQ("v__8", iter->value().ToString());
+        iter->Seek("1000000000foo006");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo007", iter->key().ToString());
+        ASSERT_EQ("v__7", iter->value().ToString());
 
-      if (total_order == 0) {
-        iter->Seek("1000000000foo009");
+        iter->Seek("1000000000foo008");
         ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ("3000000000000bar", iter->key().ToString());
-      }
+        ASSERT_EQ("1000000000foo008", iter->key().ToString());
+        ASSERT_EQ("v__8", iter->value().ToString());
 
-      // Test Bloom Filter
-      if (bloom_bits > 0) {
-        if (!total_order) {
-          // Neither key nor value should exist.
-          expect_bloom_not_match = true;
-          iter->Seek("2not000000000bar");
-          ASSERT_TRUE(!iter->Valid());
-          ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
-          expect_bloom_not_match = false;
-        } else {
-          expect_bloom_not_match = true;
-          ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
-          expect_bloom_not_match = false;
+        if (total_order == 0) {
+          iter->Seek("1000000000foo009");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("3000000000000bar", iter->key().ToString());
         }
-      }
 
-      delete iter;
+        // Test Bloom Filter
+        if (bloom_bits > 0) {
+          if (!total_order) {
+            // Neither key nor value should exist.
+            expect_bloom_not_match = true;
+            iter->Seek("2not000000000bar");
+            ASSERT_TRUE(!iter->Valid());
+            ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+            expect_bloom_not_match = false;
+          } else {
+            expect_bloom_not_match = true;
+            ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+            expect_bloom_not_match = false;
+          }
+        }
+
+        delete iter;
+      }
     }
   }
 }
@@ -582,165 +592,173 @@ TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
 }
 
 TEST(PlainTableDBTest, HashBucketConflict) {
-  for (unsigned char i = 1; i <= 3; i++) {
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    // Set only one bucket to force bucket conflict.
-    // Test index interval for the same prefix to be 1, 2 and 4
-    options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
-    DestroyAndReopen(&options);
-    ASSERT_OK(Put("5000000000000fo0", "v1"));
-    ASSERT_OK(Put("5000000000000fo1", "v2"));
-    ASSERT_OK(Put("5000000000000fo2", "v"));
-    ASSERT_OK(Put("2000000000000fo0", "v3"));
-    ASSERT_OK(Put("2000000000000fo1", "v4"));
-    ASSERT_OK(Put("2000000000000fo2", "v"));
-    ASSERT_OK(Put("2000000000000fo3", "v"));
-
-    dbfull()->TEST_FlushMemTable();
-
-    ASSERT_EQ("v1", Get("5000000000000fo0"));
-    ASSERT_EQ("v2", Get("5000000000000fo1"));
-    ASSERT_EQ("v3", Get("2000000000000fo0"));
-    ASSERT_EQ("v4", Get("2000000000000fo1"));
-
-    ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
-    ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
-    ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
-    ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
-
-    ReadOptions ro;
-    Iterator* iter = dbfull()->NewIterator(ro);
-
-    iter->Seek("5000000000000fo0");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      options.table_factory.reset(
+          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
 
-    iter->Seek("5000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      dbfull()->TEST_FlushMemTable();
 
-    iter->Seek("2000000000000fo0");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
 
-    iter->Seek("2000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
 
-    iter->Seek("2000000000000bar");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
 
-    iter->Seek("5000000000000bar");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+      iter->Seek("5000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
 
-    iter->Seek("2000000000000fo8");
-    ASSERT_TRUE(!iter->Valid() ||
-                options.comparator->Compare(iter->key(), "20000001") > 0);
+      iter->Seek("2000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
 
-    iter->Seek("5000000000000fo8");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("2000000000000fo8");
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.comparator->Compare(iter->key(), "20000001") > 0);
 
-    iter->Seek("1000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("5000000000000fo8");
+      ASSERT_TRUE(!iter->Valid());
 
-    iter->Seek("3000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
 
-    iter->Seek("8000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
 
-    delete iter;
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      delete iter;
+    }
   }
 }
 
 TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
-  for (unsigned char i = 1; i <= 3; i++) {
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    SimpleSuffixReverseComparator comp;
-    options.comparator = &comp;
-    // Set only one bucket to force bucket conflict.
-    // Test index interval for the same prefix to be 1, 2 and 4
-    options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
-    DestroyAndReopen(&options);
-    ASSERT_OK(Put("5000000000000fo0", "v1"));
-    ASSERT_OK(Put("5000000000000fo1", "v2"));
-    ASSERT_OK(Put("5000000000000fo2", "v"));
-    ASSERT_OK(Put("2000000000000fo0", "v3"));
-    ASSERT_OK(Put("2000000000000fo1", "v4"));
-    ASSERT_OK(Put("2000000000000fo2", "v"));
-    ASSERT_OK(Put("2000000000000fo3", "v"));
-
-    dbfull()->TEST_FlushMemTable();
-
-    ASSERT_EQ("v1", Get("5000000000000fo0"));
-    ASSERT_EQ("v2", Get("5000000000000fo1"));
-    ASSERT_EQ("v3", Get("2000000000000fo0"));
-    ASSERT_EQ("v4", Get("2000000000000fo1"));
-
-    ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
-    ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
-    ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
-    ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
-
-    ReadOptions ro;
-    Iterator* iter = dbfull()->NewIterator(ro);
-
-    iter->Seek("5000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      SimpleSuffixReverseComparator comp;
+      options.comparator = &comp;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      options.table_factory.reset(
+          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
 
-    iter->Seek("5000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      dbfull()->TEST_FlushMemTable();
 
-    iter->Seek("2000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
 
-    iter->Seek("2000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
 
-    iter->Seek("2000000000000var");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
 
-    iter->Seek("5000000000000var");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
 
-    std::string seek_key = "2000000000000bar";
-    iter->Seek(seek_key);
-    ASSERT_TRUE(!iter->Valid() ||
-                options.prefix_extractor->Transform(iter->key()) !=
-                    options.prefix_extractor->Transform(seek_key));
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
 
-    iter->Seek("1000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("2000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+      iter->Seek("5000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo2", iter->key().ToString());
 
-    iter->Seek("3000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      std::string seek_key = "2000000000000bar";
+      iter->Seek(seek_key);
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.prefix_extractor->Transform(iter->key()) !=
+                      options.prefix_extractor->Transform(seek_key));
 
-    iter->Seek("8000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
 
-    delete iter;
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      delete iter;
+    }
   }
 }
 
index 89e31b60c7c4897762c0f136818f2efa8ea3fc77..17f830db06d337592d1dc27862a162b37fb5b901 100644 (file)
@@ -31,6 +31,7 @@ DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
 DEFINE_int32(skiplist_height, 4, "");
 DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
 DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
+DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
 DEFINE_int32(value_size, 40, "");
 
 // Path to the database on file system
@@ -147,6 +148,8 @@ class PrefixTest {
 
     options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
     options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
+    options.memtable_prefix_bloom_huge_page_tlb_size =
+        FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
 
     Status s = DB::Open(options, kDbName,  &db);
     ASSERT_OK(s);
@@ -171,6 +174,10 @@ class PrefixTest {
           options.memtable_factory.reset(
               NewHashLinkListRepFactory(bucket_count));
           return true;
+        case kHashLinkListHugePageTlb:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
+          return true;
         default:
           return false;
       }
@@ -189,6 +196,7 @@ class PrefixTest {
     kBegin,
     kHashSkipList,
     kHashLinkList,
+    kHashLinkListHugePageTlb,
     kEnd
   };
   int option_config_;
index 05f1aebca3c8372e2f340e06be74b2f1b59db564..ea3a0fe886b19f0564887f0b8111cae5274e347a 100644 (file)
@@ -223,8 +223,13 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
 // The factory is to create memtables with a hashed linked list:
 // it contains a fixed array of buckets, each pointing to a sorted single
 // linked list (null if the bucket is empty).
-// bucket_count: number of fixed array buckets
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
 extern MemTableRepFactory* NewHashLinkListRepFactory(
-    size_t bucket_count = 50000);
+    size_t bucket_count = 50000, size_t huge_page_tlb_size = 2 * 1024 * 1024);
 
 }  // namespace rocksdb
index 54b4ef38fd5c3dfa96698811d8d44f4ed7bfe1ec..715a418a355002a55541fdb3088204a4527c3a70 100644 (file)
@@ -719,6 +719,14 @@ struct Options {
   // number of hash probes per key
   uint32_t memtable_prefix_bloom_probes;
 
+  // Page size for huge page TLB for bloom in memtable. If 0, allocate from
+  // malloc instead of from huge page TLB.
+  // Need to reserve huge pages for it to be allocated. For example:
+  //      sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt
+
+  size_t memtable_prefix_bloom_huge_page_tlb_size;
+
   // Control locality of bloom filter probes to improve cache miss rate.
   // This option only applies to memtable prefix bloom and plaintable
   // prefix bloom. It essentially limits the max number of cache lines each
index 1016bcf14735d5d27d6ec3bc3b765f73fd62b8c6..365efa327e2608e565480eb81e1bfea1d39814fc 100644 (file)
@@ -97,12 +97,19 @@ extern TableFactory* NewBlockBasedTableFactory(
 //                    in the hash table
 // @index_sparseness: inside each prefix, need to build one index record for how
 //                    many keys for binary search inside each hash bucket.
+// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
+
 const uint32_t kPlainTableVariableLength = 0;
 extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
                                               kPlainTableVariableLength,
                                           int bloom_bits_per_prefix = 10,
                                           double hash_table_ratio = 0.75,
-                                          size_t index_sparseness = 16);
+                                          size_t index_sparseness = 16,
+                                          size_t huge_page_tlb_size = 0);
 
 // -- Plain Table
 // This factory of plain table ignores Options.prefix_extractor and assumes no
@@ -116,9 +123,15 @@ extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
 //                  disable it by passing a zero.
 // @index_sparseness: need to build one index record for how many keys for
 //                    binary search.
+// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
 extern TableFactory* NewTotalOrderPlainTableFactory(
     uint32_t user_key_len = kPlainTableVariableLength,
-    int bloom_bits_per_key = 0, size_t index_sparseness = 16);
+    int bloom_bits_per_key = 0, size_t index_sparseness = 16,
+    size_t huge_page_tlb_size = 0);
 
 // A base class for table factories.
 class TableFactory {
index 16ee24eb4afb0c39cbf2ba8ba6f4db2db92f567a..4ccbaba0f30fd30f54288e45b4dfceeed2bf86d9 100644 (file)
@@ -21,7 +21,8 @@ Status PlainTableFactory::NewTableReader(const Options& options,
                                          unique_ptr<TableReader>* table) const {
   return PlainTableReader::Open(options, soptions, icomp, std::move(file),
                                 file_size, table, bloom_bits_per_key_,
-                                hash_table_ratio_, index_sparseness_);
+                                hash_table_ratio_, index_sparseness_,
+                                huge_page_tlb_size_);
 }
 
 TableBuilder* PlainTableFactory::NewTableBuilder(
@@ -33,16 +34,19 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
 extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
                                           int bloom_bits_per_key,
                                           double hash_table_ratio,
-                                          size_t index_sparseness) {
+                                          size_t index_sparseness,
+                                          size_t huge_page_tlb_size) {
   return new PlainTableFactory(user_key_len, bloom_bits_per_key,
-                               hash_table_ratio, index_sparseness);
+                               hash_table_ratio, index_sparseness,
+                               huge_page_tlb_size);
 }
 
 extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
                                                     int bloom_bits_per_key,
-                                                    size_t index_sparseness) {
+                                                    size_t index_sparseness,
+                                                    size_t huge_page_tlb_size) {
   return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
-                               index_sparseness);
+                               index_sparseness, huge_page_tlb_size);
 }
 
 }  // namespace rocksdb
index a0a7fbe6f29f0e807a1d3cb2ee2ec62bf25abdc9..d44b26a166a0d6c0713584fe9306c7a87c8d591c 100644 (file)
@@ -54,14 +54,19 @@ class PlainTableFactory : public TableFactory {
   // inside the same prefix. It will be the maximum number of linear search
   // required after hash and binary search.
   // index_sparseness = 0 means index for every key.
+  // huge_page_tlb_size: 0 (the default) allocates hash indexes without huge
+  // pages; a non-zero value is the huge page size to allocate them from.
+  // See comments of Arena::AllocateAligned() for details.
   explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
                              int bloom_bits_per_key = 0,
                              double hash_table_ratio = 0.75,
-                             size_t index_sparseness = 16)
+                             size_t index_sparseness = 16,
+                             size_t huge_page_tlb_size = 0)
       : user_key_len_(user_key_len),
         bloom_bits_per_key_(bloom_bits_per_key),
         hash_table_ratio_(hash_table_ratio),
-        index_sparseness_(index_sparseness) {}
+        index_sparseness_(index_sparseness),
+        huge_page_tlb_size_(huge_page_tlb_size) {}
   const char* Name() const override { return "PlainTable"; }
   Status NewTableReader(const Options& options, const EnvOptions& soptions,
                         const InternalKeyComparator& internal_comparator,
@@ -80,6 +85,7 @@ class PlainTableFactory : public TableFactory {
   int bloom_bits_per_key_;
   double hash_table_ratio_;
   size_t index_sparseness_;
+  size_t huge_page_tlb_size_;
 };
 
 }  // namespace rocksdb
index 02e8f12d41185770a6521226131f9554f76b6359..a62d537ce19609bd59a0af65fe44f671f566a1ed 100644 (file)
@@ -23,6 +23,7 @@
 #include "table/two_level_iterator.h"
 #include "table/plain_table_factory.h"
 
+#include "util/arena.h"
 #include "util/coding.h"
 #include "util/dynamic_bloom.h"
 #include "util/hash.h"
@@ -94,7 +95,8 @@ PlainTableReader::PlainTableReader(
     const Options& options, unique_ptr<RandomAccessFile>&& file,
     const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
     uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
-    size_t index_sparseness, const TableProperties* table_properties)
+    size_t index_sparseness, const TableProperties* table_properties,
+    size_t huge_page_tlb_size)
     : options_(options),
       soptions_(storage_options),
       file_(std::move(file)),
@@ -105,19 +107,23 @@ PlainTableReader::PlainTableReader(
       kIndexIntervalForSamePrefixKeys(index_sparseness),
       table_properties_(nullptr),
       data_end_offset_(table_properties->data_size),
-      user_key_len_(table_properties->fixed_key_len) {
+      user_key_len_(table_properties->fixed_key_len),
+      huge_page_tlb_size_(huge_page_tlb_size) {
   assert(kHashTableRatio >= 0.0);
 }
 
 PlainTableReader::~PlainTableReader() {
 }
 
-Status PlainTableReader::Open(
-    const Options& options, const EnvOptions& soptions,
-    const InternalKeyComparator& internal_comparator,
-    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-    unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
-    double hash_table_ratio, size_t index_sparseness) {
+Status PlainTableReader::Open(const Options& options,
+                              const EnvOptions& soptions,
+                              const InternalKeyComparator& internal_comparator,
+                              unique_ptr<RandomAccessFile>&& file,
+                              uint64_t file_size,
+                              unique_ptr<TableReader>* table_reader,
+                              const int bloom_bits_per_key,
+                              double hash_table_ratio, size_t index_sparseness,
+                              size_t huge_page_tlb_size) {
   assert(options.allow_mmap_reads);
 
   if (file_size > kMaxFileSize) {
@@ -133,7 +139,8 @@ Status PlainTableReader::Open(
 
   std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
       options, std::move(file), soptions, internal_comparator, file_size,
-      bloom_bits_per_key, hash_table_ratio, index_sparseness, props));
+      bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
+      huge_page_tlb_size));
 
   // -- Populate Index
   s = new_reader->PopulateIndex(props);
@@ -264,12 +271,11 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
 }
 
 void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
-  index_.reset();
-
   if (options_.prefix_extractor.get() != nullptr) {
     uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
     if (bloom_total_bits > 0) {
-      bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality));
+      bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
+                                    6, nullptr, huge_page_tlb_size_));
     }
   }
 
@@ -281,7 +287,6 @@ void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
     double hash_table_size_multipier = 1.0 / kHashTableRatio;
     index_size_ = num_prefixes * hash_table_size_multipier + 1;
   }
-  index_.reset(new uint32_t[index_size_]);
 }
 
 size_t PlainTableReader::BucketizeIndexesAndFillBloom(
@@ -325,7 +330,12 @@ void PlainTableReader::FillIndexes(
     const std::vector<uint32_t>& entries_per_bucket) {
   Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
       kSubIndexSize);
-  sub_index_.reset(new char[kSubIndexSize]);
+  auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
+  char* allocated =
+      arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_);
+  index_ = reinterpret_cast<uint32_t*>(allocated);
+  sub_index_ = allocated + sizeof(uint32_t) * index_size_;
+
   size_t sub_index_offset = 0;
   for (int i = 0; i < index_size_; i++) {
     uint32_t num_keys_for_bucket = entries_per_bucket[i];
@@ -390,7 +400,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
   if (IsTotalOrderMode()) {
     uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
     if (num_bloom_bits > 0) {
-      bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality));
+      bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
+                                    nullptr, huge_page_tlb_size_));
     }
   }
 
index 7f0c3b537fea2366440f45003a8758ce32b05f69..c88a597bb5a6a4b61f2915ea44908c098572e83b 100644 (file)
@@ -17,6 +17,7 @@
 #include "rocksdb/table_properties.h"
 #include "table/table_reader.h"
 #include "table/plain_table_factory.h"
+#include "util/arena.h"
 
 namespace rocksdb {
 
@@ -50,7 +51,7 @@ class PlainTableReader: public TableReader {
                      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                      unique_ptr<TableReader>* table,
                      const int bloom_bits_per_key, double hash_table_ratio,
-                     size_t index_sparseness);
+                     size_t index_sparseness, size_t huge_page_tlb_size);
 
   bool PrefixMayMatch(const Slice& internal_prefix);
 
@@ -74,7 +75,8 @@ class PlainTableReader: public TableReader {
                    const InternalKeyComparator& internal_comparator,
                    uint64_t file_size, int bloom_num_bits,
                    double hash_table_ratio, size_t index_sparseness,
-                   const TableProperties* table_properties);
+                   const TableProperties* table_properties,
+                   size_t huge_page_tlb_size);
   virtual ~PlainTableReader();
 
  protected:
@@ -136,9 +138,9 @@ class PlainTableReader: public TableReader {
   // For more details about the in-memory index, please refer to:
   // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
   // #wiki-in-memory-index-format
-  std::unique_ptr<uint32_t[]> index_;
+  uint32_t* index_ = nullptr;  // allocated from arena_ in FillIndexes()
   int index_size_ = 0;
-  std::unique_ptr<char[]> sub_index_;
+  char* sub_index_ = nullptr;  // follows index_ in the same arena_ block
 
   Options options_;
   const EnvOptions& soptions_;
@@ -159,6 +161,7 @@ class PlainTableReader: public TableReader {
   const size_t kIndexIntervalForSamePrefixKeys = 16;
   // Bloom filter is used to rule out non-existent key
   unique_ptr<DynamicBloom> bloom_;
+  Arena arena_;
 
   std::shared_ptr<const TableProperties> table_properties_;
   // data_start_offset_ and data_end_offset_ defines the range of the
@@ -166,6 +169,7 @@ class PlainTableReader: public TableReader {
   const uint32_t data_start_offset_ = 0;
   const uint32_t data_end_offset_;
   const size_t user_key_len_;
+  const size_t huge_page_tlb_size_;
 
   static const size_t kNumInternalBytes = 8;
   static const uint32_t kSubIndexMask = 0x80000000;
index 9b2cb82d1a00376319233b6e34f1acc3c9cfc7cf..3575f2d9065cc7d159e3e70e878e5d643b931825 100644 (file)
@@ -8,6 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "util/arena.h"
+#include <sys/mman.h>
 #include <algorithm>
 
 namespace rocksdb {
@@ -38,6 +39,13 @@ Arena::~Arena() {
   for (const auto& block : blocks_) {
     delete[] block;
   }
+  for (const auto& mmap_info : huge_blocks_) {
+    auto ret = munmap(mmap_info.addr_, mmap_info.length_);
+    if (ret != 0) {
+      // TODO(sdong): Better handling
+      perror("munmap");
+    }
+  }
 }
 
 char* Arena::AllocateFallback(size_t bytes, bool aligned) {
@@ -63,9 +71,29 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
   }
 }
 
-char* Arena::AllocateAligned(size_t bytes) {
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size) {
   assert((kAlignUnit & (kAlignUnit - 1)) ==
          0);  // Pointer size should be a power of 2
+
+#ifdef OS_LINUX
+  if (huge_page_tlb_size > 0 && bytes > 0) {
+    // Allocate from a huge page TLB table.
+    size_t reserved_size =
+        ((bytes - 1U) / huge_page_tlb_size + 1U) * huge_page_tlb_size;
+    assert(reserved_size >= bytes);
+    void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE),
+                      (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
+    if (addr == MAP_FAILED) {
+      perror("mmap");
+      // fall back to malloc
+    } else {
+      blocks_memory_ += reserved_size;
+      huge_blocks_.push_back(MmapInfo(addr, reserved_size));
+      return reinterpret_cast<char*>(addr);
+    }
+  }
+#endif
+
   size_t current_mod =
       reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
   size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
index 6ce5a438da5f2be5f1ed05cb0d63f06dbd64e311..a4dff495b893290ac56e2dfdb74fd8d2a27c509b 100644 (file)
@@ -34,7 +34,14 @@ class Arena {
 
   char* Allocate(size_t bytes);
 
-  char* AllocateAligned(size_t bytes);
+  // huge_page_tlb_size: if >0, it is the huge page size, and the allocation
+  // is served from huge pages: bytes is rounded up to a multiple of
+  // huge_page_tlb_size and mapped through anonymous mmap with MAP_HUGETLB.
+  // The extra space allocated is wasted. To enable it, huge pages need to be
+  // reserved beforehand, like:
+  //     sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt for details.
+  char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0);
 
   // Returns an estimate of the total memory usage of data allocated
   // by the arena (exclude the space allocated but not yet used for future
@@ -60,6 +67,14 @@ class Arena {
   // Array of new[] allocated memory blocks
   typedef std::vector<char*> Blocks;
   Blocks blocks_;
+
+  struct MmapInfo {
+    void* addr_;
+    size_t length_;
+
+    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
+  };
+  std::vector<MmapInfo> huge_blocks_;
   size_t irregular_block_num = 0;
 
   // Stats for current active block.
index a4c8e11cb12d94913927d29731b26bb39df9d96b..bc48b9fd3712b168d2f934ddc5f9bfd703835371 100644 (file)
@@ -19,18 +19,19 @@ static uint32_t BloomHash(const Slice& key) {
 }
 }
 
-DynamicBloom::DynamicBloom(uint32_t total_bits,
-                           uint32_t cl_per_block,
+DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
                            uint32_t num_probes,
-                           uint32_t (*hash_func)(const Slice& key))
-  : kBlocked(cl_per_block > 0),
-    kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
-    kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock
-                              * kBitsPerBlock :
-                           total_bits + 7) / 8 * 8),
-    kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
-    kNumProbes(num_probes),
-    hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
+                           uint32_t (*hash_func)(const Slice& key),
+                           size_t huge_page_tlb_size)
+    : kBlocked(cl_per_block > 0),
+      kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
+      kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock *
+                                 kBitsPerBlock
+                           : total_bits + 7) /
+                 8 * 8),
+      kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
+      kNumProbes(num_probes),
+      hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
   assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
   assert(kNumProbes > 0);
 
@@ -38,7 +39,9 @@ DynamicBloom::DynamicBloom(uint32_t total_bits,
   if (kBlocked) {
     sz += CACHE_LINE_SIZE - 1;
   }
-  raw_ = new unsigned char[sz]();
+  raw_ = reinterpret_cast<unsigned char*>(
+      arena_.AllocateAligned(sz, huge_page_tlb_size));
+  memset(raw_, 0, sz);
   if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
     data_ = raw_ + CACHE_LINE_SIZE -
       reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
index efc461cf98656e772508b6f55f04e0aaef2dd3c3..f91bb8f917d04f711ed14dca3db565bfb5976c4c 100644 (file)
@@ -8,6 +8,8 @@
 #include <atomic>
 #include <memory>
 
+#include <util/arena.h>
+
 namespace rocksdb {
 
 class Slice;
@@ -19,13 +21,17 @@ class DynamicBloom {
   // cl_per_block: block size in cache lines. When this is non-zero, a
   //               query/set is done within a block to improve cache locality.
   // hash_func:  customized hash function
+  // huge_page_tlb_size:  if >0, try to allocate bloom bytes from huge page TLB
+  //                      with this page size. Need to reserve huge pages for
+  //                      it to be allocated, like:
+  //                         sysctl -w vm.nr_hugepages=20
+  //                     See linux doc Documentation/vm/hugetlbpage.txt
   explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
-      uint32_t num_probes = 6,
-      uint32_t (*hash_func)(const Slice& key) = nullptr);
+                        uint32_t num_probes = 6,
+                        uint32_t (*hash_func)(const Slice& key) = nullptr,
+                        size_t huge_page_tlb_size = 0);
 
-  ~DynamicBloom() {
-    delete[] raw_;
-  }
+  ~DynamicBloom() {}
 
   // Assuming single threaded access to this function.
   void Add(const Slice& key);
@@ -49,6 +55,8 @@ class DynamicBloom {
   uint32_t (*hash_func_)(const Slice& key);
   unsigned char* data_;
   unsigned char* raw_;
+
+  Arena arena_;
 };
 
 inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
index 441f5c9939566d63f8df52171231b691f7b495f0..4ca2a1b870afb6d6a2f5eea7896f06a237733fa7 100644 (file)
@@ -52,7 +52,8 @@ struct Node {
 class HashLinkListRep : public MemTableRep {
  public:
   HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
-                  const SliceTransform* transform, size_t bucket_size);
+                  const SliceTransform* transform, size_t bucket_size,
+                  size_t huge_page_tlb_size);
 
   virtual KeyHandle Allocate(const size_t len, char** buf) override;
 
@@ -308,13 +309,13 @@ class HashLinkListRep : public MemTableRep {
 
 HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
                                  Arena* arena, const SliceTransform* transform,
-                                 size_t bucket_size)
-  : MemTableRep(arena),
-    bucket_size_(bucket_size),
-    transform_(transform),
-    compare_(compare) {
-  char* mem = arena_->AllocateAligned(
-      sizeof(port::AtomicPointer) * bucket_size);
+                                 size_t bucket_size, size_t huge_page_tlb_size)
+    : MemTableRep(arena),
+      bucket_size_(bucket_size),
+      transform_(transform),
+      compare_(compare) {
+  char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size,
+                                      huge_page_tlb_size);
 
   buckets_ = new (mem) port::AtomicPointer[bucket_size];
 
@@ -476,11 +477,13 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
 MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
     const MemTableRep::KeyComparator& compare, Arena* arena,
     const SliceTransform* transform) {
-  return new HashLinkListRep(compare, arena, transform, bucket_count_);
+  return new HashLinkListRep(compare, arena, transform, bucket_count_,
+                             huge_page_tlb_size_);
 }
 
-MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count) {
-  return new HashLinkListRepFactory(bucket_count);
+MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count,
+                                              size_t huge_page_tlb_size) {
+  return new HashLinkListRepFactory(bucket_count, huge_page_tlb_size);
 }
 
 } // namespace rocksdb
index 11fb7467f6b615a5c1ac1e0c3e323e86236e4b7c..25035f5f26ba6c82b8baabf1d4dd7dfffa61c938 100644 (file)
@@ -14,8 +14,9 @@ namespace rocksdb {
 
 class HashLinkListRepFactory : public MemTableRepFactory {
  public:
-  explicit HashLinkListRepFactory(size_t bucket_count)
-      : bucket_count_(bucket_count) { }
+  explicit HashLinkListRepFactory(size_t bucket_count,
+                                  size_t huge_page_tlb_size)
+      : bucket_count_(bucket_count), huge_page_tlb_size_(huge_page_tlb_size) {}
 
   virtual ~HashLinkListRepFactory() {}
 
@@ -29,6 +30,7 @@ class HashLinkListRepFactory : public MemTableRepFactory {
 
  private:
   const size_t bucket_count_;
+  const size_t huge_page_tlb_size_;
 };
 
 }
index 5c5bab5579144a2d1fcaa79d1fde1464398aa3c1..602182b2abdd107aa82e034f655557640fce18fa 100644 (file)
@@ -317,6 +317,10 @@ Options::Dump(Logger* log) const
         memtable_prefix_bloom_bits);
     Log(log, "            Options.memtable_prefix_bloom_probes: %d",
         memtable_prefix_bloom_probes);
+    Log(log, "  Options.memtable_prefix_bloom_huge_page_tlb_size: %zu",
+        memtable_prefix_bloom_huge_page_tlb_size);
+    Log(log, "                          Options.bloom_locality: %d",
+        bloom_locality);
     Log(log, "                   Options.max_successive_merges: %zd",
         max_successive_merges);
 }   // Options::Dump