git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Revert "Revert "Allow allocating dynamic bloom, plain table indexes and hash linked...
author sdong <siying.d@fb.com>
Sun, 4 May 2014 20:55:53 +0000 (13:55 -0700)
committer sdong <siying.d@fb.com>
Sun, 4 May 2014 20:56:29 +0000 (13:56 -0700)
Also make the default huge_page_tlb_size 0 for the hash linked list memtable.

This reverts commit d69dc64be78a8da3ce661454655966d11ff61bb6.

18 files changed:
db/db_test.cc
db/memtable.cc
db/plain_table_db_test.cc
db/prefix_test.cc
include/rocksdb/memtablerep.h
include/rocksdb/options.h
include/rocksdb/table.h
table/plain_table_factory.cc
table/plain_table_factory.h
table/plain_table_reader.cc
table/plain_table_reader.h
util/arena.cc
util/arena.h
util/dynamic_bloom.cc
util/dynamic_bloom.h
util/hash_linklist_rep.cc
util/hash_linklist_rep.h
util/options.cc

index 5162cec9932c4d1ebc58f0d363b3a06183fb1230..350160af67d0b28e5f4ef6c878ec4f273d58c46c 100644 (file)
@@ -481,7 +481,7 @@ class DBTest {
         break;
       case kHashLinkList:
         options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        options.memtable_factory.reset(NewHashLinkListRepFactory(4));
+        options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
         break;
       case kHashCuckoo:
         options.memtable_factory.reset(
index 424efe84598d35d397b761e9bc5cc65695eb3e41..f95ad3c98840297cf45a4861f2b40ef90be8a227 100644 (file)
@@ -52,9 +52,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
   // gone wrong already.
   assert(!should_flush_);
   if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
-    prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits,
-                                         options.bloom_locality,
-                                         options.memtable_prefix_bloom_probes));
+    prefix_bloom_.reset(new DynamicBloom(
+        options.memtable_prefix_bloom_bits, options.bloom_locality,
+        options.memtable_prefix_bloom_probes, nullptr,
+        options.memtable_prefix_bloom_huge_page_tlb_size));
   }
 }
 
index 517ef0a9488372b19f63e169b8186acc9c959523..17e3e61d8993299791e8462b20f6d0863167d452 100644 (file)
@@ -185,7 +185,7 @@ class TestPlainTableReader : public PlainTableReader {
                        const Options& options, bool* expect_bloom_not_match)
       : PlainTableReader(options, std::move(file), storage_options, icomparator,
                          file_size, bloom_bits_per_key, hash_table_ratio,
-                         index_sparseness, table_properties),
+                         index_sparseness, table_properties, 2 * 1024 * 1024),
         expect_bloom_not_match_(expect_bloom_not_match) {
     Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
     ASSERT_TRUE(s.ok());
@@ -206,13 +206,12 @@ extern const uint64_t kPlainTableMagicNumber;
 class TestPlainTableFactory : public PlainTableFactory {
  public:
   explicit TestPlainTableFactory(bool* expect_bloom_not_match,
-                                 uint32_t user_key_len =
-                                     kPlainTableVariableLength,
-                                 int bloom_bits_per_key = 0,
-                                 double hash_table_ratio = 0.75,
-                                 size_t index_sparseness = 16)
+                                 uint32_t user_key_len, int bloom_bits_per_key,
+                                 double hash_table_ratio,
+                                 size_t index_sparseness,
+                                 size_t huge_page_tlb_size)
       : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
-                          hash_table_ratio),
+                          index_sparseness, huge_page_tlb_size),
         bloom_bits_per_key_(bloom_bits_per_key),
         hash_table_ratio_(hash_table_ratio),
         index_sparseness_(index_sparseness),
@@ -244,197 +243,209 @@ class TestPlainTableFactory : public PlainTableFactory {
 };
 
 TEST(PlainTableDBTest, Flush) {
-  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
-    for (int total_order = 0; total_order <= 1; total_order++) {
-      Options options = CurrentOptions();
-      options.create_if_missing = true;
-      // Set only one bucket to force bucket conflict.
-      // Test index interval for the same prefix to be 1, 2 and 4
-      if (total_order) {
-        options.table_factory.reset(
-            NewTotalOrderPlainTableFactory(16, bloom_bits, 2));
-      } else {
-        options.table_factory.reset(NewPlainTableFactory(16, bloom_bits));
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
+        if (total_order) {
+          options.table_factory.reset(NewTotalOrderPlainTableFactory(
+              16, bloom_bits, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(NewPlainTableFactory(
+              16, bloom_bits, 0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+
+        ASSERT_OK(Put("1000000000000foo", "v1"));
+        ASSERT_OK(Put("0000000000000bar", "v2"));
+        ASSERT_OK(Put("1000000000000foo", "v3"));
+        dbfull()->TEST_FlushMemTable();
+
+        TablePropertiesCollection ptc;
+        reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+        ASSERT_EQ(1U, ptc.size());
+        auto row = ptc.begin();
+        auto tp = row->second;
+        ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
+                                                "plain_table_hash_table_size"));
+        ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
+                                               "plain_table_sub_index_size"));
+
+        ASSERT_EQ("v3", Get("1000000000000foo"));
+        ASSERT_EQ("v2", Get("0000000000000bar"));
       }
-      DestroyAndReopen(&options);
-
-      ASSERT_OK(Put("1000000000000foo", "v1"));
-      ASSERT_OK(Put("0000000000000bar", "v2"));
-      ASSERT_OK(Put("1000000000000foo", "v3"));
-      dbfull()->TEST_FlushMemTable();
-
-      TablePropertiesCollection ptc;
-      reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
-      ASSERT_EQ(1U, ptc.size());
-      auto row = ptc.begin();
-      auto tp = row->second;
-      ASSERT_EQ(
-          total_order ? "4" : "12",
-          (tp->user_collected_properties).at("plain_table_hash_table_size"));
-      ASSERT_EQ(
-          total_order ? "9" : "0",
-          (tp->user_collected_properties).at("plain_table_sub_index_size"));
-
-      ASSERT_EQ("v3", Get("1000000000000foo"));
-      ASSERT_EQ("v2", Get("0000000000000bar"));
     }
   }
 }
 
 TEST(PlainTableDBTest, Flush2) {
-  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
-    for (int total_order = 0; total_order <= 1; total_order++) {
-      bool expect_bloom_not_match = false;
-      Options options = CurrentOptions();
-      options.create_if_missing = true;
-      // Set only one bucket to force bucket conflict.
-      // Test index interval for the same prefix to be 1, 2 and 4
-      if (total_order) {
-        options.prefix_extractor = nullptr;
-        options.table_factory.reset(new TestPlainTableFactory(
-            &expect_bloom_not_match, 16, bloom_bits, 0, 2));
-      } else {
-        options.table_factory.reset(
-            new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
-      }
-      DestroyAndReopen(&options);
-      ASSERT_OK(Put("0000000000000bar", "b"));
-      ASSERT_OK(Put("1000000000000foo", "v1"));
-      dbfull()->TEST_FlushMemTable();
-
-      ASSERT_OK(Put("1000000000000foo", "v2"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("v2", Get("1000000000000foo"));
-
-      ASSERT_OK(Put("0000000000000eee", "v3"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("v3", Get("0000000000000eee"));
-
-      ASSERT_OK(Delete("0000000000000bar"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
-
-      ASSERT_OK(Put("0000000000000eee", "v5"));
-      ASSERT_OK(Put("9000000000000eee", "v5"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("v5", Get("0000000000000eee"));
-
-      // Test Bloom Filter
-      if (bloom_bits > 0) {
-        // Neither key nor value should exist.
-        expect_bloom_not_match = true;
-        ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
-
-        // Key doesn't exist any more but prefix exists.
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        bool expect_bloom_not_match = false;
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
         if (total_order) {
-          ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
-          ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+          options.prefix_extractor = nullptr;
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+        ASSERT_OK(Put("0000000000000bar", "b"));
+        ASSERT_OK(Put("1000000000000foo", "v1"));
+        dbfull()->TEST_FlushMemTable();
+
+        ASSERT_OK(Put("1000000000000foo", "v2"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v2", Get("1000000000000foo"));
+
+        ASSERT_OK(Put("0000000000000eee", "v3"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v3", Get("0000000000000eee"));
+
+        ASSERT_OK(Delete("0000000000000bar"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+        ASSERT_OK(Put("0000000000000eee", "v5"));
+        ASSERT_OK(Put("9000000000000eee", "v5"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v5", Get("0000000000000eee"));
+
+        // Test Bloom Filter
+        if (bloom_bits > 0) {
+          // Neither key nor value should exist.
+          expect_bloom_not_match = true;
+          ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+
+          // Key doesn't exist any more but prefix exists.
+          if (total_order) {
+            ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+            ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+          }
+          expect_bloom_not_match = false;
         }
-        expect_bloom_not_match = false;
       }
     }
   }
 }
 
 TEST(PlainTableDBTest, Iterator) {
-  for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
-    for (int total_order = 0; total_order <= 1; total_order++) {
-      bool expect_bloom_not_match = false;
-      Options options = CurrentOptions();
-      options.create_if_missing = true;
-      // Set only one bucket to force bucket conflict.
-      // Test index interval for the same prefix to be 1, 2 and 4
-      if (total_order) {
-        options.prefix_extractor = nullptr;
-        options.table_factory.reset(new TestPlainTableFactory(
-            &expect_bloom_not_match, 16, bloom_bits, 0, 2));
-      } else {
-        options.table_factory.reset(
-            new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits));
-      }
-      DestroyAndReopen(&options);
-
-      ASSERT_OK(Put("1000000000foo002", "v_2"));
-      ASSERT_OK(Put("0000000000000bar", "random"));
-      ASSERT_OK(Put("1000000000foo001", "v1"));
-      ASSERT_OK(Put("3000000000000bar", "bar_v"));
-      ASSERT_OK(Put("1000000000foo003", "v__3"));
-      ASSERT_OK(Put("1000000000foo004", "v__4"));
-      ASSERT_OK(Put("1000000000foo005", "v__5"));
-      ASSERT_OK(Put("1000000000foo007", "v__7"));
-      ASSERT_OK(Put("1000000000foo008", "v__8"));
-      dbfull()->TEST_FlushMemTable();
-      ASSERT_EQ("v1", Get("1000000000foo001"));
-      ASSERT_EQ("v__3", Get("1000000000foo003"));
-      Iterator* iter = dbfull()->NewIterator(ReadOptions());
-      iter->Seek("1000000000foo000");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo001", iter->key().ToString());
-      ASSERT_EQ("v1", iter->value().ToString());
-
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo002", iter->key().ToString());
-      ASSERT_EQ("v_2", iter->value().ToString());
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        bool expect_bloom_not_match = false;
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
+        if (total_order) {
+          options.prefix_extractor = nullptr;
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+
+        ASSERT_OK(Put("1000000000foo002", "v_2"));
+        ASSERT_OK(Put("0000000000000bar", "random"));
+        ASSERT_OK(Put("1000000000foo001", "v1"));
+        ASSERT_OK(Put("3000000000000bar", "bar_v"));
+        ASSERT_OK(Put("1000000000foo003", "v__3"));
+        ASSERT_OK(Put("1000000000foo004", "v__4"));
+        ASSERT_OK(Put("1000000000foo005", "v__5"));
+        ASSERT_OK(Put("1000000000foo007", "v__7"));
+        ASSERT_OK(Put("1000000000foo008", "v__8"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v1", Get("1000000000foo001"));
+        ASSERT_EQ("v__3", Get("1000000000foo003"));
+        Iterator* iter = dbfull()->NewIterator(ReadOptions());
+        iter->Seek("1000000000foo000");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo001", iter->key().ToString());
+        ASSERT_EQ("v1", iter->value().ToString());
 
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo003", iter->key().ToString());
-      ASSERT_EQ("v__3", iter->value().ToString());
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo002", iter->key().ToString());
+        ASSERT_EQ("v_2", iter->value().ToString());
 
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo004", iter->key().ToString());
-      ASSERT_EQ("v__4", iter->value().ToString());
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo003", iter->key().ToString());
+        ASSERT_EQ("v__3", iter->value().ToString());
 
-      iter->Seek("3000000000000bar");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("3000000000000bar", iter->key().ToString());
-      ASSERT_EQ("bar_v", iter->value().ToString());
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo004", iter->key().ToString());
+        ASSERT_EQ("v__4", iter->value().ToString());
 
-      iter->Seek("1000000000foo000");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo001", iter->key().ToString());
-      ASSERT_EQ("v1", iter->value().ToString());
+        iter->Seek("3000000000000bar");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("3000000000000bar", iter->key().ToString());
+        ASSERT_EQ("bar_v", iter->value().ToString());
 
-      iter->Seek("1000000000foo005");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo005", iter->key().ToString());
-      ASSERT_EQ("v__5", iter->value().ToString());
+        iter->Seek("1000000000foo000");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo001", iter->key().ToString());
+        ASSERT_EQ("v1", iter->value().ToString());
 
-      iter->Seek("1000000000foo006");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo007", iter->key().ToString());
-      ASSERT_EQ("v__7", iter->value().ToString());
+        iter->Seek("1000000000foo005");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo005", iter->key().ToString());
+        ASSERT_EQ("v__5", iter->value().ToString());
 
-      iter->Seek("1000000000foo008");
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_EQ("1000000000foo008", iter->key().ToString());
-      ASSERT_EQ("v__8", iter->value().ToString());
+        iter->Seek("1000000000foo006");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo007", iter->key().ToString());
+        ASSERT_EQ("v__7", iter->value().ToString());
 
-      if (total_order == 0) {
-        iter->Seek("1000000000foo009");
+        iter->Seek("1000000000foo008");
         ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ("3000000000000bar", iter->key().ToString());
-      }
+        ASSERT_EQ("1000000000foo008", iter->key().ToString());
+        ASSERT_EQ("v__8", iter->value().ToString());
 
-      // Test Bloom Filter
-      if (bloom_bits > 0) {
-        if (!total_order) {
-          // Neither key nor value should exist.
-          expect_bloom_not_match = true;
-          iter->Seek("2not000000000bar");
-          ASSERT_TRUE(!iter->Valid());
-          ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
-          expect_bloom_not_match = false;
-        } else {
-          expect_bloom_not_match = true;
-          ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
-          expect_bloom_not_match = false;
+        if (total_order == 0) {
+          iter->Seek("1000000000foo009");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("3000000000000bar", iter->key().ToString());
         }
-      }
 
-      delete iter;
+        // Test Bloom Filter
+        if (bloom_bits > 0) {
+          if (!total_order) {
+            // Neither key nor value should exist.
+            expect_bloom_not_match = true;
+            iter->Seek("2not000000000bar");
+            ASSERT_TRUE(!iter->Valid());
+            ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+            expect_bloom_not_match = false;
+          } else {
+            expect_bloom_not_match = true;
+            ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+            expect_bloom_not_match = false;
+          }
+        }
+
+        delete iter;
+      }
     }
   }
 }
@@ -581,165 +592,173 @@ TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
 }
 
 TEST(PlainTableDBTest, HashBucketConflict) {
-  for (unsigned char i = 1; i <= 3; i++) {
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    // Set only one bucket to force bucket conflict.
-    // Test index interval for the same prefix to be 1, 2 and 4
-    options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
-    DestroyAndReopen(&options);
-    ASSERT_OK(Put("5000000000000fo0", "v1"));
-    ASSERT_OK(Put("5000000000000fo1", "v2"));
-    ASSERT_OK(Put("5000000000000fo2", "v"));
-    ASSERT_OK(Put("2000000000000fo0", "v3"));
-    ASSERT_OK(Put("2000000000000fo1", "v4"));
-    ASSERT_OK(Put("2000000000000fo2", "v"));
-    ASSERT_OK(Put("2000000000000fo3", "v"));
-
-    dbfull()->TEST_FlushMemTable();
-
-    ASSERT_EQ("v1", Get("5000000000000fo0"));
-    ASSERT_EQ("v2", Get("5000000000000fo1"));
-    ASSERT_EQ("v3", Get("2000000000000fo0"));
-    ASSERT_EQ("v4", Get("2000000000000fo1"));
-
-    ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
-    ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
-    ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
-    ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
-
-    ReadOptions ro;
-    Iterator* iter = dbfull()->NewIterator(ro);
-
-    iter->Seek("5000000000000fo0");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      options.table_factory.reset(
+          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
 
-    iter->Seek("5000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      dbfull()->TEST_FlushMemTable();
 
-    iter->Seek("2000000000000fo0");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
 
-    iter->Seek("2000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
 
-    iter->Seek("2000000000000bar");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
 
-    iter->Seek("5000000000000bar");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+      iter->Seek("5000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
 
-    iter->Seek("2000000000000fo8");
-    ASSERT_TRUE(!iter->Valid() ||
-                options.comparator->Compare(iter->key(), "20000001") > 0);
+      iter->Seek("2000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
 
-    iter->Seek("5000000000000fo8");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("2000000000000fo8");
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.comparator->Compare(iter->key(), "20000001") > 0);
 
-    iter->Seek("1000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("5000000000000fo8");
+      ASSERT_TRUE(!iter->Valid());
 
-    iter->Seek("3000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
 
-    iter->Seek("8000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
 
-    delete iter;
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      delete iter;
+    }
   }
 }
 
 TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
-  for (unsigned char i = 1; i <= 3; i++) {
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    SimpleSuffixReverseComparator comp;
-    options.comparator = &comp;
-    // Set only one bucket to force bucket conflict.
-    // Test index interval for the same prefix to be 1, 2 and 4
-    options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i));
-    DestroyAndReopen(&options);
-    ASSERT_OK(Put("5000000000000fo0", "v1"));
-    ASSERT_OK(Put("5000000000000fo1", "v2"));
-    ASSERT_OK(Put("5000000000000fo2", "v"));
-    ASSERT_OK(Put("2000000000000fo0", "v3"));
-    ASSERT_OK(Put("2000000000000fo1", "v4"));
-    ASSERT_OK(Put("2000000000000fo2", "v"));
-    ASSERT_OK(Put("2000000000000fo3", "v"));
-
-    dbfull()->TEST_FlushMemTable();
-
-    ASSERT_EQ("v1", Get("5000000000000fo0"));
-    ASSERT_EQ("v2", Get("5000000000000fo1"));
-    ASSERT_EQ("v3", Get("2000000000000fo0"));
-    ASSERT_EQ("v4", Get("2000000000000fo1"));
-
-    ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
-    ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
-    ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
-    ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
-
-    ReadOptions ro;
-    Iterator* iter = dbfull()->NewIterator(ro);
-
-    iter->Seek("5000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      SimpleSuffixReverseComparator comp;
+      options.comparator = &comp;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      options.table_factory.reset(
+          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
 
-    iter->Seek("5000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      dbfull()->TEST_FlushMemTable();
 
-    iter->Seek("2000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
 
-    iter->Seek("2000000000000fo1");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
 
-    iter->Seek("2000000000000var");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
 
-    iter->Seek("5000000000000var");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
 
-    std::string seek_key = "2000000000000bar";
-    iter->Seek(seek_key);
-    ASSERT_TRUE(!iter->Valid() ||
-                options.prefix_extractor->Transform(iter->key()) !=
-                    options.prefix_extractor->Transform(seek_key));
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
 
-    iter->Seek("1000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("2000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+      iter->Seek("5000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo2", iter->key().ToString());
 
-    iter->Seek("3000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      std::string seek_key = "2000000000000bar";
+      iter->Seek(seek_key);
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.prefix_extractor->Transform(iter->key()) !=
+                      options.prefix_extractor->Transform(seek_key));
 
-    iter->Seek("8000000000000fo2");
-    ASSERT_TRUE(!iter->Valid());
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
 
-    delete iter;
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      delete iter;
+    }
   }
 }
 
index 18036bb930079c8a87b5168ce30c1491b1fe5005..3a88fc8ce86ce77e87362f6cbbfc6f981f3535d6 100644 (file)
@@ -30,6 +30,7 @@ DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
 DEFINE_int32(skiplist_height, 4, "");
 DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
 DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
+DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
 DEFINE_int32(value_size, 40, "");
 
 // Path to the database on file system
@@ -148,6 +149,8 @@ class PrefixTest {
 
     options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
     options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
+    options.memtable_prefix_bloom_huge_page_tlb_size =
+        FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
 
     Status s = DB::Open(options, kDbName,  &db);
     ASSERT_OK(s);
@@ -172,6 +175,10 @@ class PrefixTest {
           options.memtable_factory.reset(
               NewHashLinkListRepFactory(bucket_count));
           return true;
+        case kHashLinkListHugePageTlb:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
+          return true;
         default:
           return false;
       }
@@ -190,6 +197,7 @@ class PrefixTest {
     kBegin,
     kHashSkipList,
     kHashLinkList,
+    kHashLinkListHugePageTlb,
     kEnd
   };
   int option_config_;
index 445edccace1c0515a12e953a840bf8427fc3727e..be15a608cb5497c85301e0b32b1aca69e3c17e69 100644 (file)
@@ -223,9 +223,14 @@ extern MemTableRepFactory* NewHashSkipListRepFactory(
 // The factory is to create memtables with a hashed linked list:
 // it contains a fixed array of buckets, each pointing to a sorted single
 // linked list (null if the bucket is empty).
-// bucket_count: number of fixed array buckets
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
 extern MemTableRepFactory* NewHashLinkListRepFactory(
-    size_t bucket_count = 50000);
+    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0);
 
 // This factory creates a cuckoo-hashing based mem-table representation.
 // Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
index c283a5e5395f559d8cac90c69d22d47530cbb67d..93dbf0d8824c988f9fd6869ffaa32023d12b9f9e 100644 (file)
@@ -498,6 +498,14 @@ struct ColumnFamilyOptions {
   // number of hash probes per key
   uint32_t memtable_prefix_bloom_probes;
 
+  // Page size for huge page TLB for bloom in memtable. If <=0, do not
+  // allocate from huge page TLB but from malloc.
+  // Need to reserve huge pages for it to be allocated. For example:
+  //      sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt
+
+  size_t memtable_prefix_bloom_huge_page_tlb_size;
+
   // Control locality of bloom filter probes to improve cache miss rate.
   // This option only applies to memtable prefix bloom and plaintable
   // prefix bloom. It essentially limits the max number of cache lines each
index 14a505a6f989ab357e656b69817552d36936a880..11adfec8cfd6d8a1e14564be5e16d73248462203 100644 (file)
@@ -107,12 +107,19 @@ extern TableFactory* NewBlockBasedTableFactory(
 //                    in the hash table
 // @index_sparseness: inside each prefix, need to build one index record for how
 //                    many keys for binary search inside each hash bucket.
+// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
+
 const uint32_t kPlainTableVariableLength = 0;
 extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
                                               kPlainTableVariableLength,
                                           int bloom_bits_per_prefix = 10,
                                           double hash_table_ratio = 0.75,
-                                          size_t index_sparseness = 16);
+                                          size_t index_sparseness = 16,
+                                          size_t huge_page_tlb_size = 0);
 
 // -- Plain Table
 // This factory of plain table ignores Options.prefix_extractor and assumes no
@@ -126,9 +133,15 @@ extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
 //                  disable it by passing a zero.
 // @index_sparseness: need to build one index record for how many keys for
 //                    binary search.
+// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
 extern TableFactory* NewTotalOrderPlainTableFactory(
     uint32_t user_key_len = kPlainTableVariableLength,
-    int bloom_bits_per_key = 0, size_t index_sparseness = 16);
+    int bloom_bits_per_key = 0, size_t index_sparseness = 16,
+    size_t huge_page_tlb_size = 0);
 
 #endif  // ROCKSDB_LITE
 
index 4e844687dbd1b72da578e038ebdb1ad53d715962..f9d88e9ef0b2b3d331ababe42e5b0b76cd99c276 100644 (file)
@@ -22,7 +22,8 @@ Status PlainTableFactory::NewTableReader(const Options& options,
                                          unique_ptr<TableReader>* table) const {
   return PlainTableReader::Open(options, soptions, icomp, std::move(file),
                                 file_size, table, bloom_bits_per_key_,
-                                hash_table_ratio_, index_sparseness_);
+                                hash_table_ratio_, index_sparseness_,
+                                huge_page_tlb_size_);
 }
 
 TableBuilder* PlainTableFactory::NewTableBuilder(
@@ -34,16 +35,19 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
 extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
                                           int bloom_bits_per_key,
                                           double hash_table_ratio,
-                                          size_t index_sparseness) {
+                                          size_t index_sparseness,
+                                          size_t huge_page_tlb_size) {
   return new PlainTableFactory(user_key_len, bloom_bits_per_key,
-                               hash_table_ratio, index_sparseness);
+                               hash_table_ratio, index_sparseness,
+                               huge_page_tlb_size);
 }
 
 extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
                                                     int bloom_bits_per_key,
-                                                    size_t index_sparseness) {
+                                                    size_t index_sparseness,
+                                                    size_t huge_page_tlb_size) {
   return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
-                               index_sparseness);
+                               index_sparseness, huge_page_tlb_size);
 }
 
 }  // namespace rocksdb
index 84af22fb9a5a9bea78faa4015d2186c497e29724..77d24f7116b9fc87c0abe218b69385f480c8ff64 100644 (file)
@@ -56,14 +56,19 @@ class PlainTableFactory : public TableFactory {
   // inside the same prefix. It will be the maximum number of linear search
   // required after hash and binary search.
   // index_sparseness = 0 means index for every key.
+  // huge_page_tlb_size determines whether to allocate hash indexes from huge
+  // page TLB and the page size if allocating from there. 0 (the default) means
+  // allocating from malloc. See comments of Arena::AllocateAligned().
   explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
                              int bloom_bits_per_key = 0,
                              double hash_table_ratio = 0.75,
-                             size_t index_sparseness = 16)
+                             size_t index_sparseness = 16,
+                             size_t huge_page_tlb_size = 0)
       : user_key_len_(user_key_len),
         bloom_bits_per_key_(bloom_bits_per_key),
         hash_table_ratio_(hash_table_ratio),
-        index_sparseness_(index_sparseness) {}
+        index_sparseness_(index_sparseness),
+        huge_page_tlb_size_(huge_page_tlb_size) {}
   const char* Name() const override { return "PlainTable"; }
   Status NewTableReader(const Options& options, const EnvOptions& soptions,
                         const InternalKeyComparator& internal_comparator,
@@ -82,6 +87,7 @@ class PlainTableFactory : public TableFactory {
   int bloom_bits_per_key_;
   double hash_table_ratio_;
   size_t index_sparseness_;
+  size_t huge_page_tlb_size_;
 };
 
 }  // namespace rocksdb
index 196201730bdb5ed57663a732c5682f7d81eb4a4c..f1cb3db476c7b70141299060cfcc020f096511b1 100644 (file)
@@ -24,6 +24,7 @@
 #include "table/two_level_iterator.h"
 #include "table/plain_table_factory.h"
 
+#include "util/arena.h"
 #include "util/coding.h"
 #include "util/dynamic_bloom.h"
 #include "util/hash.h"
@@ -95,7 +96,8 @@ PlainTableReader::PlainTableReader(
     const Options& options, unique_ptr<RandomAccessFile>&& file,
     const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
     uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
-    size_t index_sparseness, const TableProperties* table_properties)
+    size_t index_sparseness, const TableProperties* table_properties,
+    size_t huge_page_tlb_size)
     : options_(options),
       soptions_(storage_options),
       file_(std::move(file)),
@@ -106,19 +108,23 @@ PlainTableReader::PlainTableReader(
       kIndexIntervalForSamePrefixKeys(index_sparseness),
       table_properties_(nullptr),
       data_end_offset_(table_properties->data_size),
-      user_key_len_(table_properties->fixed_key_len) {
+      user_key_len_(table_properties->fixed_key_len),
+      huge_page_tlb_size_(huge_page_tlb_size) {
   assert(kHashTableRatio >= 0.0);
 }
 
 PlainTableReader::~PlainTableReader() {
 }
 
-Status PlainTableReader::Open(
-    const Options& options, const EnvOptions& soptions,
-    const InternalKeyComparator& internal_comparator,
-    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-    unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
-    double hash_table_ratio, size_t index_sparseness) {
+Status PlainTableReader::Open(const Options& options,
+                              const EnvOptions& soptions,
+                              const InternalKeyComparator& internal_comparator,
+                              unique_ptr<RandomAccessFile>&& file,
+                              uint64_t file_size,
+                              unique_ptr<TableReader>* table_reader,
+                              const int bloom_bits_per_key,
+                              double hash_table_ratio, size_t index_sparseness,
+                              size_t huge_page_tlb_size) {
   assert(options.allow_mmap_reads);
 
   if (file_size > kMaxFileSize) {
@@ -134,7 +140,8 @@ Status PlainTableReader::Open(
 
   std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
       options, std::move(file), soptions, internal_comparator, file_size,
-      bloom_bits_per_key, hash_table_ratio, index_sparseness, props));
+      bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
+      huge_page_tlb_size));
 
   // -- Populate Index
   s = new_reader->PopulateIndex(props);
@@ -261,12 +268,11 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
 }
 
 void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
-  index_.reset();
-
   if (options_.prefix_extractor.get() != nullptr) {
     uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
     if (bloom_total_bits > 0) {
-      bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality));
+      bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
+                                    6, nullptr, huge_page_tlb_size_));
     }
   }
 
@@ -278,7 +284,6 @@ void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
     double hash_table_size_multipier = 1.0 / kHashTableRatio;
     index_size_ = num_prefixes * hash_table_size_multipier + 1;
   }
-  index_.reset(new uint32_t[index_size_]);
 }
 
 size_t PlainTableReader::BucketizeIndexesAndFillBloom(
@@ -322,7 +327,12 @@ void PlainTableReader::FillIndexes(
     const std::vector<uint32_t>& entries_per_bucket) {
   Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
       kSubIndexSize);
-  sub_index_.reset(new char[kSubIndexSize]);
+  auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
+  char* allocated =
+      arena_.AllocateAligned(total_allocate_size, huge_page_tlb_size_);
+  index_ = reinterpret_cast<uint32_t*>(allocated);
+  sub_index_ = allocated + sizeof(uint32_t) * index_size_;
+
   size_t sub_index_offset = 0;
   for (int i = 0; i < index_size_; i++) {
     uint32_t num_keys_for_bucket = entries_per_bucket[i];
@@ -387,7 +397,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
   if (IsTotalOrderMode()) {
     uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
     if (num_bloom_bits > 0) {
-      bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality));
+      bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
+                                    nullptr, huge_page_tlb_size_));
     }
   }
 
index 756439b5cd1280b26eccfafbe45d8fa708c8f60f..e6373dc827907077202849a525cc977d64f89622 100644 (file)
@@ -19,6 +19,7 @@
 #include "rocksdb/table_properties.h"
 #include "table/table_reader.h"
 #include "table/plain_table_factory.h"
+#include "util/arena.h"
 
 namespace rocksdb {
 
@@ -52,7 +53,7 @@ class PlainTableReader: public TableReader {
                      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                      unique_ptr<TableReader>* table,
                      const int bloom_bits_per_key, double hash_table_ratio,
-                     size_t index_sparseness);
+                     size_t index_sparseness, size_t huge_page_tlb_size);
 
   Iterator* NewIterator(const ReadOptions&);
 
@@ -74,7 +75,8 @@ class PlainTableReader: public TableReader {
                    const InternalKeyComparator& internal_comparator,
                    uint64_t file_size, int bloom_num_bits,
                    double hash_table_ratio, size_t index_sparseness,
-                   const TableProperties* table_properties);
+                   const TableProperties* table_properties,
+                   size_t huge_page_tlb_size);
   virtual ~PlainTableReader();
 
  protected:
@@ -136,9 +138,9 @@ class PlainTableReader: public TableReader {
   // For more details about the in-memory index, please refer to:
   // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
   // #wiki-in-memory-index-format
-  std::unique_ptr<uint32_t[]> index_;
+  uint32_t* index_;
   int index_size_ = 0;
-  std::unique_ptr<char[]> sub_index_;
+  char* sub_index_;
 
   Options options_;
   const EnvOptions& soptions_;
@@ -159,6 +161,7 @@ class PlainTableReader: public TableReader {
   const size_t kIndexIntervalForSamePrefixKeys = 16;
   // Bloom filter is used to rule out non-existent key
   unique_ptr<DynamicBloom> bloom_;
+  Arena arena_;
 
   std::shared_ptr<const TableProperties> table_properties_;
   // data_start_offset_ and data_end_offset_ defines the range of the
@@ -166,6 +169,7 @@ class PlainTableReader: public TableReader {
   const uint32_t data_start_offset_ = 0;
   const uint32_t data_end_offset_;
   const size_t user_key_len_;
+  const size_t huge_page_tlb_size_;
 
   static const size_t kNumInternalBytes = 8;
   static const uint32_t kSubIndexMask = 0x80000000;
index 9b2cb82d1a00376319233b6e34f1acc3c9cfc7cf..3575f2d9065cc7d159e3e70e878e5d643b931825 100644 (file)
@@ -8,6 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "util/arena.h"
+#include <sys/mman.h>
 #include <algorithm>
 
 namespace rocksdb {
@@ -38,6 +39,13 @@ Arena::~Arena() {
   for (const auto& block : blocks_) {
     delete[] block;
   }
+  for (const auto& mmap_info : huge_blocks_) {
+    auto ret = munmap(mmap_info.addr_, mmap_info.length_);
+    if (ret != 0) {
+      // TODO(sdong): Better handling
+      perror("munmap");
+    }
+  }
 }
 
 char* Arena::AllocateFallback(size_t bytes, bool aligned) {
@@ -63,9 +71,29 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
   }
 }
 
-char* Arena::AllocateAligned(size_t bytes) {
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_tlb_size) {
   assert((kAlignUnit & (kAlignUnit - 1)) ==
          0);  // Pointer size should be a power of 2
+
+#ifdef OS_LINUX
+  if (huge_page_tlb_size > 0 && bytes > 0) {
+    // Allocate from huge page TLB; fd is -1 since the mapping is anonymous.
+    size_t reserved_size =
+        ((bytes - 1U) / huge_page_tlb_size + 1U) * huge_page_tlb_size;
+    assert(reserved_size >= bytes);
+    void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE),
+                      (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0);
+    if (addr == MAP_FAILED) {
+      perror("mmap");
+      // fall through to the regular (non-huge-page) allocation below
+    } else {
+      blocks_memory_ += reserved_size;
+      huge_blocks_.push_back(MmapInfo(addr, reserved_size));
+      return reinterpret_cast<char*>(addr);
+    }
+  }
+#endif
+
   size_t current_mod =
       reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
   size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
index 6ce5a438da5f2be5f1ed05cb0d63f06dbd64e311..a4dff495b893290ac56e2dfdb74fd8d2a27c509b 100644 (file)
@@ -34,7 +34,14 @@ class Arena {
 
   char* Allocate(size_t bytes);
 
-  char* AllocateAligned(size_t bytes);
+  // huge_page_tlb_size: if >0, it is the huge page size to use, and the bytes
+  // are allocated from huge page TLB: the size is rounded up to a multiple of
+  // huge_page_tlb_size and mapped through anonymous mmap with MAP_HUGETLB.
+  // The extra space allocated will be wasted. To enable it, huge pages need
+  // to be reserved beforehand, like:
+  //     sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt for details.
+  char* AllocateAligned(size_t bytes, size_t huge_page_tlb_size = 0);
 
   // Returns an estimate of the total memory usage of data allocated
   // by the arena (exclude the space allocated but not yet used for future
@@ -60,6 +67,14 @@ class Arena {
   // Array of new[] allocated memory blocks
   typedef std::vector<char*> Blocks;
   Blocks blocks_;
+
+  struct MmapInfo {
+    void* addr_;
+    size_t length_;
+
+    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
+  };
+  std::vector<MmapInfo> huge_blocks_;
   size_t irregular_block_num = 0;
 
   // Stats for current active block.
index a4c8e11cb12d94913927d29731b26bb39df9d96b..bc48b9fd3712b168d2f934ddc5f9bfd703835371 100644 (file)
@@ -19,18 +19,19 @@ static uint32_t BloomHash(const Slice& key) {
 }
 }
 
-DynamicBloom::DynamicBloom(uint32_t total_bits,
-                           uint32_t cl_per_block,
+DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
                            uint32_t num_probes,
-                           uint32_t (*hash_func)(const Slice& key))
-  : kBlocked(cl_per_block > 0),
-    kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
-    kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock
-                              * kBitsPerBlock :
-                           total_bits + 7) / 8 * 8),
-    kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
-    kNumProbes(num_probes),
-    hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
+                           uint32_t (*hash_func)(const Slice& key),
+                           size_t huge_page_tlb_size)
+    : kBlocked(cl_per_block > 0),
+      kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
+      kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock *
+                                 kBitsPerBlock
+                           : total_bits + 7) /
+                 8 * 8),
+      kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
+      kNumProbes(num_probes),
+      hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
   assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
   assert(kNumProbes > 0);
 
@@ -38,7 +39,9 @@ DynamicBloom::DynamicBloom(uint32_t total_bits,
   if (kBlocked) {
     sz += CACHE_LINE_SIZE - 1;
   }
-  raw_ = new unsigned char[sz]();
+  raw_ = reinterpret_cast<unsigned char*>(
+      arena_.AllocateAligned(sz, huge_page_tlb_size));
+  memset(raw_, 0, sz);
   if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
     data_ = raw_ + CACHE_LINE_SIZE -
       reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
index efc461cf98656e772508b6f55f04e0aaef2dd3c3..f91bb8f917d04f711ed14dca3db565bfb5976c4c 100644 (file)
@@ -8,6 +8,8 @@
 #include <atomic>
 #include <memory>
 
+#include <util/arena.h>
+
 namespace rocksdb {
 
 class Slice;
@@ -19,13 +21,17 @@ class DynamicBloom {
   // cl_per_block: block size in cache lines. When this is non-zero, a
   //               query/set is done within a block to improve cache locality.
   // hash_func:  customized hash function
+  // huge_page_tlb_size:  if >0, try to allocate bloom bytes from huge page TLB
+  //                      with this page size. Need to reserve huge pages for
+  //                      it to be allocated, like:
+  //                         sysctl -w vm.nr_hugepages=20
+  //                      See linux doc Documentation/vm/hugetlbpage.txt
   explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
-      uint32_t num_probes = 6,
-      uint32_t (*hash_func)(const Slice& key) = nullptr);
+                        uint32_t num_probes = 6,
+                        uint32_t (*hash_func)(const Slice& key) = nullptr,
+                        size_t huge_page_tlb_size = 0);
 
-  ~DynamicBloom() {
-    delete[] raw_;
-  }
+  ~DynamicBloom() {}
 
   // Assuming single threaded access to this function.
   void Add(const Slice& key);
@@ -49,6 +55,8 @@ class DynamicBloom {
   uint32_t (*hash_func_)(const Slice& key);
   unsigned char* data_;
   unsigned char* raw_;
+
+  Arena arena_;
 };
 
 inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
index 64aa2d9e8154ce21d35d4f77d726f287abd0ab84..acd78c5bba5a2cf55ac323164d2b3196778cc0c0 100644 (file)
@@ -53,7 +53,8 @@ struct Node {
 class HashLinkListRep : public MemTableRep {
  public:
   HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
-                  const SliceTransform* transform, size_t bucket_size);
+                  const SliceTransform* transform, size_t bucket_size,
+                  size_t huge_page_tlb_size);
 
   virtual KeyHandle Allocate(const size_t len, char** buf) override;
 
@@ -306,13 +307,13 @@ class HashLinkListRep : public MemTableRep {
 
 HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
                                  Arena* arena, const SliceTransform* transform,
-                                 size_t bucket_size)
-  : MemTableRep(arena),
-    bucket_size_(bucket_size),
-    transform_(transform),
-    compare_(compare) {
-  char* mem = arena_->AllocateAligned(
-      sizeof(port::AtomicPointer) * bucket_size);
+                                 size_t bucket_size, size_t huge_page_tlb_size)
+    : MemTableRep(arena),
+      bucket_size_(bucket_size),
+      transform_(transform),
+      compare_(compare) {
+  char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size,
+                                      huge_page_tlb_size);
 
   buckets_ = new (mem) port::AtomicPointer[bucket_size];
 
@@ -469,11 +470,13 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
 MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
     const MemTableRep::KeyComparator& compare, Arena* arena,
     const SliceTransform* transform) {
-  return new HashLinkListRep(compare, arena, transform, bucket_count_);
+  return new HashLinkListRep(compare, arena, transform, bucket_count_,
+                             huge_page_tlb_size_);
 }
 
-MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count) {
-  return new HashLinkListRepFactory(bucket_count);
+MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count,
+                                              size_t huge_page_tlb_size) {
+  return new HashLinkListRepFactory(bucket_count, huge_page_tlb_size);
 }
 
 } // namespace rocksdb
index f1ab5d560f6e5cb3491dfe027db985c4603b8409..4a9fd0009c2adcd099b1fbf6271841cd3a83f9c3 100644 (file)
@@ -15,8 +15,9 @@ namespace rocksdb {
 
 class HashLinkListRepFactory : public MemTableRepFactory {
  public:
-  explicit HashLinkListRepFactory(size_t bucket_count)
-      : bucket_count_(bucket_count) { }
+  explicit HashLinkListRepFactory(size_t bucket_count,
+                                  size_t huge_page_tlb_size)
+      : bucket_count_(bucket_count), huge_page_tlb_size_(huge_page_tlb_size) {}
 
   virtual ~HashLinkListRepFactory() {}
 
@@ -30,6 +31,7 @@ class HashLinkListRepFactory : public MemTableRepFactory {
 
  private:
   const size_t bucket_count_;
+  const size_t huge_page_tlb_size_;
 };
 
 }
index e33d44ebea0f0a2b4db969cf6458a584f93ea54d..c8d1e3889872bcf56ba397e3c77df6f703666f94 100644 (file)
@@ -34,8 +34,7 @@ ColumnFamilyOptions::ColumnFamilyOptions()
       compaction_filter(nullptr),
       compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
           new DefaultCompactionFilterFactory())),
-      compaction_filter_factory_v2(
-          new DefaultCompactionFilterFactoryV2()),
+      compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()),
       write_buffer_size(4 << 20),
       max_write_buffer_number(2),
       min_write_buffer_number_to_merge(1),
@@ -81,6 +80,7 @@ ColumnFamilyOptions::ColumnFamilyOptions()
       inplace_callback(nullptr),
       memtable_prefix_bloom_bits(0),
       memtable_prefix_bloom_probes(6),
+      memtable_prefix_bloom_huge_page_tlb_size(0),
       bloom_locality(0),
       max_successive_merges(0),
       min_partial_merge_operands(2) {
@@ -146,6 +146,8 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
       inplace_callback(options.inplace_callback),
       memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
       memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
+      memtable_prefix_bloom_huge_page_tlb_size(
+          options.memtable_prefix_bloom_huge_page_tlb_size),
       bloom_locality(options.bloom_locality),
       max_successive_merges(options.max_successive_merges),
       min_partial_merge_operands(options.min_partial_merge_operands) {
@@ -428,6 +430,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
         memtable_prefix_bloom_bits);
     Log(log, "            Options.memtable_prefix_bloom_probes: %d",
         memtable_prefix_bloom_probes);
+    Log(log, "  Options.memtable_prefix_bloom_huge_page_tlb_size: %zu",
+        memtable_prefix_bloom_huge_page_tlb_size);
     Log(log, "                          Options.bloom_locality: %d",
         bloom_locality);
     Log(log, "                   Options.max_successive_merges: %zd",