git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Stable cache keys using DB session ids in SSTs (#8659)
author: Peter Dillinger <peterd@fb.com>
Tue, 17 Aug 2021 03:36:19 +0000 (20:36 -0700)
committer: Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Tue, 17 Aug 2021 03:37:20 +0000 (20:37 -0700)
Summary:
Use DB session ids in SST table properties to make cache keys
stable across DB re-open and copy / move / restore / etc.

These new cache keys are currently only enabled when FileSystem does not
provide GetUniqueId. For now, they are typically larger, so slightly
less efficient.

Relevant to https://github.com/facebook/rocksdb/issues/7405

This change has a minor regression in PersistentCache functionality:
metaindex blocks are no longer cached in PersistentCache. Table properties
blocks already were not but ideally should be. I didn't spend effort to
fix & test these issues because we don't believe PersistentCache is used much
if at all and expect SecondaryCache to replace it. (Though PRs are welcome.)

FIXME: there is more to be fixed for stable cache keys on external SST files

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8659

Test Plan:
new unit test added, which fails when disabling new
functionality

Reviewed By: zhichao-cao

Differential Revision: D30297705

Pulled By: pdillinger

fbshipit-source-id: e8539a5c8802a79340405629870f2e3fb3822d3a

db/db_basic_test.cc
db/db_block_cache_test.cc
table/block_based/block_based_table_builder.cc
table/block_based/block_based_table_reader.cc
table/block_based/block_based_table_reader.h
table/meta_blocks.cc

index 13644025b209d10be85ca4e3e7074ab0e088472b..b22010528dcb2eda2369e0d950ad2963c42e1d3a 100644 (file)
@@ -2756,6 +2756,11 @@ class DBBasicTestMultiGet : public DBTestBase {
         EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
       }
     }
+    // Clear compressed cache, which is always pre-populated
+    if (compressed_cache_) {
+      compressed_cache_->SetCapacity(0);
+      compressed_cache_->SetCapacity(1048576);
+    }
   }
 
   bool CheckValue(int i, const std::string& value) {
index 0b4389fd17fb5fd739f4c2a29c659cb787d78cf9..c7c100f6341a38ca5b94e435b271f7e916f41f05 100644 (file)
@@ -7,6 +7,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include <cstdlib>
+#include <functional>
 #include <memory>
 
 #include "cache/cache_entry_roles.h"
 #include "db/column_family.h"
 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
+#include "rocksdb/statistics.h"
 #include "rocksdb/table.h"
 #include "util/compression.h"
 #include "util/random.h"
+#include "utilities/fault_injection_fs.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -1298,6 +1301,102 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
 
 #endif  // ROCKSDB_LITE
 
+// Disable LinkFile so that we can physically copy a DB using Checkpoint.
+// Disable file GetUniqueId to enable stable cache keys.
+class StableCacheKeyTestFS : public FaultInjectionTestFS {
+ public:
+  explicit StableCacheKeyTestFS(const std::shared_ptr<FileSystem>& base)
+      : FaultInjectionTestFS(base) {
+    SetFailGetUniqueId(true);
+  }
+
+  virtual ~StableCacheKeyTestFS() {}
+
+  IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+                    IODebugContext*) override {
+    return IOStatus::NotSupported("Disabled");
+  }
+};
+
+TEST_F(DBBlockCacheTest, StableCacheKeys) {
+  std::shared_ptr<StableCacheKeyTestFS> test_fs{
+      new StableCacheKeyTestFS(env_->GetFileSystem())};
+  std::unique_ptr<CompositeEnvWrapper> test_env{
+      new CompositeEnvWrapper(env_, test_fs)};
+
+  for (bool compressed : {false, true}) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.env = test_env.get();
+
+    BlockBasedTableOptions table_options;
+
+    std::function<void()> verify_stats;
+    if (compressed) {
+      if (!Snappy_Supported()) {
+        fprintf(stderr, "skipping compressed test, snappy unavailable\n");
+        continue;
+      }
+      options.compression = CompressionType::kSnappyCompression;
+      table_options.no_block_cache = true;
+      table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false);
+      verify_stats = [&options] {
+        ASSERT_EQ(
+            1, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD));
+      };
+    } else {
+      table_options.cache_index_and_filter_blocks = true;
+      table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+      verify_stats = [&options] {
+        ASSERT_EQ(1, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+        ASSERT_EQ(1, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+        ASSERT_EQ(1,
+                  options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+      };
+    }
+
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("key1", "abc"));
+    std::string something_compressible(500U, 'x');
+    ASSERT_OK(Put("key2", something_compressible));
+    ASSERT_OK(Flush());
+
+    ASSERT_EQ(Get("key1"), std::string("abc"));
+    verify_stats();
+
+    // Make sure we can cache hit after re-open
+    Reopen(options);
+
+    ASSERT_EQ(Get("key1"), std::string("abc"));
+    verify_stats();
+
+    // Make sure we can cache hit even on a full copy of the DB. Using
+    // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link.
+    // (Checkpoint is not available in LITE mode to test this.)
+#ifndef ROCKSDB_LITE
+    auto db_copy_name = dbname_ + "-copy";
+    Checkpoint* checkpoint;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name));
+    delete checkpoint;
+
+    Close();
+    Destroy(options);
+    dbname_ = db_copy_name;
+    Reopen(options);
+
+    ASSERT_EQ(Get("key1"), std::string("abc"));
+    verify_stats();
+#endif  // !ROCKSDB_LITE
+
+    Close();
+  }
+}
+
 class DBBlockCachePinningTest
     : public DBTestBase,
       public testing::WithParamInterface<
index 07c545656d6fec882161e485dc85e549b2bd2043..2b20cdc46876cfb309a4795d74bcf5f9af54bb01 100644 (file)
@@ -1398,6 +1398,7 @@ void DeleteEntryCached(const Slice& /*key*/, void* value) {
 // Helper function to setup the cache key's prefix for the Table.
 void BlockBasedTableBuilder::SetupCacheKeyPrefix(
     const TableBuilderOptions& tbo) {
+  // FIXME: Unify with BlockBasedTable::SetupCacheKeyPrefix
   if (rep_->table_options.block_cache.get() != nullptr) {
     BlockBasedTable::GenerateCachePrefix<Cache, FSWritableFile>(
         rep_->table_options.block_cache.get(), rep_->file->writable_file(),
index f6eca75e3e3406d7fe63c608f533efe4f5de5330..5c020084d62c835258fdece2f72af505177eb30b 100644 (file)
@@ -57,6 +57,7 @@
 #include "table/meta_blocks.h"
 #include "table/multiget_context.h"
 #include "table/persistent_cache_helper.h"
+#include "table/persistent_cache_options.h"
 #include "table/sst_file_writer_collectors.h"
 #include "table/two_level_iterator.h"
 #include "test_util/sync_point.h"
@@ -371,7 +372,7 @@ Cache::Handle* BlockBasedTable::GetEntryFromCache(
 // Helper function to setup the cache key's prefix for the Table.
 void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep,
                                           const std::string& db_session_id,
-                                          uint64_t cur_file_num) {
+                                          uint64_t file_num) {
   assert(kMaxCacheKeyPrefixSize >= 10);
   rep->cache_key_prefix_size = 0;
   rep->compressed_cache_key_prefix_size = 0;
@@ -379,19 +380,28 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep,
     GenerateCachePrefix<Cache, FSRandomAccessFile>(
         rep->table_options.block_cache.get(), rep->file->file(),
         &rep->cache_key_prefix[0], &rep->cache_key_prefix_size, db_session_id,
-        cur_file_num);
-  }
-  if (rep->table_options.persistent_cache != nullptr) {
-    GenerateCachePrefix<PersistentCache, FSRandomAccessFile>(
-        rep->table_options.persistent_cache.get(), rep->file->file(),
-        &rep->persistent_cache_key_prefix[0],
-        &rep->persistent_cache_key_prefix_size, "", cur_file_num);
+        file_num);
   }
   if (rep->table_options.block_cache_compressed != nullptr) {
     GenerateCachePrefix<Cache, FSRandomAccessFile>(
         rep->table_options.block_cache_compressed.get(), rep->file->file(),
         &rep->compressed_cache_key_prefix[0],
-        &rep->compressed_cache_key_prefix_size, "", cur_file_num);
+        &rep->compressed_cache_key_prefix_size, db_session_id, file_num);
+  }
+  if (rep->table_options.persistent_cache != nullptr) {
+    char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
+    size_t persistent_cache_key_prefix_size = 0;
+
+    GenerateCachePrefix<PersistentCache, FSRandomAccessFile>(
+        rep->table_options.persistent_cache.get(), rep->file->file(),
+        &persistent_cache_key_prefix[0], &persistent_cache_key_prefix_size,
+        db_session_id, file_num);
+
+    rep->persistent_cache_options =
+        PersistentCacheOptions(rep->table_options.persistent_cache,
+                               std::string(persistent_cache_key_prefix,
+                                           persistent_cache_key_prefix_size),
+                               rep->ioptions.stats);
   }
 }
 
@@ -513,7 +523,7 @@ Status BlockBasedTable::Open(
     const SequenceNumber largest_seqno, const bool force_direct_prefetch,
     TailPrefetchStats* tail_prefetch_stats,
     BlockCacheTracer* const block_cache_tracer,
-    size_t max_file_size_for_l0_meta_pin, const std::string& db_session_id,
+    size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id,
     uint64_t cur_file_num) {
   table_reader->reset();
 
@@ -588,16 +598,11 @@ Status BlockBasedTable::Open(
     rep->internal_prefix_transform.reset(
         new InternalKeySliceTransform(prefix_extractor));
   }
-  SetupCacheKeyPrefix(rep, db_session_id, cur_file_num);
-  std::unique_ptr<BlockBasedTable> new_table(
-      new BlockBasedTable(rep, block_cache_tracer));
 
-  // page cache options
-  rep->persistent_cache_options =
-      PersistentCacheOptions(rep->table_options.persistent_cache,
-                             std::string(rep->persistent_cache_key_prefix,
-                                         rep->persistent_cache_key_prefix_size),
-                             rep->ioptions.stats);
+  // For fully portable/stable cache keys, we need to read the properties
+  // block before setting up cache keys. TODO: consider setting up a bootstrap
+  // cache key for PersistentCache to use for metaindex and properties blocks.
+  rep->persistent_cache_options = PersistentCacheOptions();
 
   // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
   // handle to null, otherwise it may be seen as uninitialized during the below
@@ -605,6 +610,8 @@ Status BlockBasedTable::Open(
   rep->compression_dict_handle = BlockHandle::NullBlockHandle();
 
   // Read metaindex
+  std::unique_ptr<BlockBasedTable> new_table(
+      new BlockBasedTable(rep, block_cache_tracer));
   std::unique_ptr<Block> metaindex;
   std::unique_ptr<InternalIterator> metaindex_iter;
   s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex,
@@ -620,6 +627,36 @@ Status BlockBasedTable::Open(
   if (!s.ok()) {
     return s;
   }
+
+  // With properties loaded, we can set up portable/stable cache keys if
+  // necessary info is available
+  std::string db_session_id = cur_db_session_id;
+  uint64_t file_num = cur_file_num;
+  if (rep->table_properties && !rep->table_properties->db_session_id.empty()) {
+    const auto& uprops = rep->table_properties->user_collected_properties;
+    auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
+    if (version_iter == uprops.end()) {
+      // Normal (non-external) SST file - can only use embedded db_session_id
+      // with current file number (which should be original file number)
+      if (file_num > 0) {
+        db_session_id = rep->table_properties->db_session_id;
+      }
+    } else {
+      // External (ingested) SST file - should not use current file number
+      // (which is changed from original), so that same file ingested into
+      // different DBs can share block cache entries. Although they can modify
+      // the embedded global_seqno, that information is not currently cached
+      // under these portable/stable keys.
+      // Note: For now, each external SST file gets its own unique session id,
+      // so we can use a fixed file number under that session id.
+      // ... except FIXME (peterd): sst_file_writer currently uses wrong
+      // format for db_session_ids so this approach doesn't work yet.
+      db_session_id = rep->table_properties->db_session_id;
+      file_num = 1;
+    }
+  }
+  SetupCacheKeyPrefix(rep, db_session_id, file_num);
+
   s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(),
                                    metaindex_iter.get(), internal_comparator,
                                    &lookup_context);
index 43b56a68ca3ae65330241dbbe0ffa7f7bfc3044e..e64d09d209edad501fbcf6d12ca215b74f737d85 100644 (file)
@@ -9,6 +9,8 @@
 
 #pragma once
 
+#include <cstdint>
+
 #include "db/range_tombstone_fragmenter.h"
 #include "file/filename.h"
 #include "table/block_based/block_based_table_factory.h"
@@ -19,7 +21,6 @@
 #include "table/table_properties_internal.h"
 #include "table/table_reader.h"
 #include "table/two_level_iterator.h"
-
 #include "trace_replay/block_cache_tracer.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -100,7 +101,7 @@ class BlockBasedTable : public TableReader {
                      TailPrefetchStats* tail_prefetch_stats = nullptr,
                      BlockCacheTracer* const block_cache_tracer = nullptr,
                      size_t max_file_size_for_l0_meta_pin = 0,
-                     const std::string& db_session_id = "",
+                     const std::string& cur_db_session_id = "",
                      uint64_t cur_file_num = 0);
 
   bool PrefixMayMatch(const Slice& internal_key,
@@ -555,11 +556,11 @@ struct BlockBasedTable::Rep {
   Status status;
   std::unique_ptr<RandomAccessFileReader> file;
   char cache_key_prefix[kMaxCacheKeyPrefixSize];
-  size_t cache_key_prefix_size = 0;
-  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
-  size_t persistent_cache_key_prefix_size = 0;
+  // SIZE_MAX -> assert not used without re-assignment
+  size_t cache_key_prefix_size = SIZE_MAX;
   char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
-  size_t compressed_cache_key_prefix_size = 0;
+  // SIZE_MAX -> assert not used without re-assignment
+  size_t compressed_cache_key_prefix_size = SIZE_MAX;
   PersistentCacheOptions persistent_cache_options;
 
   // Footer contains the fixed table information
index 52e56be81bf3113ab0c0940477e120e797a76bad..fff1397c8b0124dd1659f979ecfcb94fae18e11f 100644 (file)
@@ -228,6 +228,8 @@ Status ReadProperties(const ReadOptions& read_options,
 
   BlockContents block_contents;
   Status s;
+  // FIXME: should be a parameter for reading table properties to use persistent
+  // cache
   PersistentCacheOptions cache_options;
   ReadOptions ro = read_options;
   ro.verify_checksums = verify_checksum;